Skip to content

Commit 1d4e257

Browse files
authored
Multivariate prompter pipeline (#47)
* add interval parsing * change primitive name * fix lint * multivariate pipeline * multivariate pipeline * fix lint * add dim tests * fix lint * update tutorial * update tutorial output * update docstrings on `get * remove trailing whitespace
1 parent f4c431e commit 1d4e257

9 files changed

+1328
-7
lines changed
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
{
2+
"primitives": [
3+
"mlstars.custom.timeseries_preprocessing.time_segments_aggregate",
4+
"sklearn.impute.SimpleImputer",
5+
"sigllm.primitives.transformation.Float2Scalar",
6+
"sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences",
7+
"sigllm.primitives.transformation.format_as_string",
8+
"sigllm.primitives.prompting.huggingface.HF",
9+
"sigllm.primitives.prompting.anomalies.parse_anomaly_response",
10+
"sigllm.primitives.prompting.anomalies.find_anomalies_in_windows",
11+
"sigllm.primitives.prompting.anomalies.merge_anomalous_sequences",
12+
"sigllm.primitives.prompting.anomalies.format_anomalies"
13+
],
14+
"init_params": {
15+
"mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
16+
"time_column": "timestamp",
17+
"interval": 21600,
18+
"method": "mean"
19+
},
20+
"sigllm.primitives.transformation.Float2Scalar#1": {
21+
"decimal": 2,
22+
"rescale": true
23+
},
24+
"sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences#1": {
25+
"window_size": 100,
26+
"step_size": 40
27+
},
28+
"sigllm.primitives.transformation.format_as_string#1": {
29+
"space": false
30+
},
31+
"sigllm.primitives.prompting.huggingface.HF#1": {
32+
"name": "mistralai/Mistral-7B-Instruct-v0.2",
33+
"samples": 1,
34+
"temp": 0.01
35+
},
36+
"sigllm.primitives.prompting.anomalies.parse_anomaly_response#1": {
37+
"interval": true
38+
},
39+
"sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": {
40+
"alpha": 0.4
41+
},
42+
"sigllm.primitives.prompting.anomalies.merge_anomalous_sequences#1": {
43+
"beta": 0.5
44+
}
45+
},
46+
"input_names": {
47+
"sigllm.primitives.prompting.huggingface.HF#1": {
48+
"X": "X_str"
49+
},
50+
"sigllm.primitives.prompting.anomalies.parse_anomaly_response#1": {
51+
"X": "y_hat"
52+
}
53+
},
54+
"output_names": {
55+
"mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
56+
"index": "timestamp"
57+
},
58+
"sigllm.primitives.transformation.format_as_string#1": {
59+
"X": "X_str"
60+
},
61+
"sigllm.primitives.prompting.huggingface.HF#1": {
62+
"y": "y_hat"
63+
},
64+
"sigllm.primitives.prompting.anomalies.parse_anomaly_response#1": {
65+
"X": "y"
66+
}
67+
}
68+
}

sigllm/primitives/jsons/sigllm.primitives.prompting.huggingface.HF.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@
1717
"name": "X",
1818
"type": "ndarray"
1919
},
20+
{
21+
"name": "dim",
22+
"type": "int",
23+
"default": 1
24+
},
2025
{
2126
"name": "normal",
2227
"type": "ndarray",

sigllm/primitives/jsons/sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@
3636
{
3737
"name": "step_size",
3838
"type": "int"
39+
},
40+
{
41+
"name": "dim",
42+
"type": "int"
3943
}
4044
]
4145
},

sigllm/primitives/prompting/anomalies.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,9 @@ def find_anomalies_in_windows(y, alpha=0.5):
131131
ndarray:
132132
A 2-dimensional array containing final anomalous indices of each windows.
133133
"""
134+
if isinstance(y, list):
135+
y = np.array(y, dtype=object)
136+
134137
idx_list = []
135138
for samples in y:
136139
min_vote = np.ceil(alpha * len(samples))
@@ -203,6 +206,7 @@ def format_anomalies(y, timestamp, padding_size=50):
203206
if len(y) == 0:
204207
return []
205208

209+
y = y[y < len(timestamp)].astype(int)
206210
y = timestamp[y] # Convert list of indices into list of timestamps
207211
start, end = timestamp[0], timestamp[-1]
208212
interval = timestamp[1] - timestamp[0]

sigllm/primitives/prompting/huggingface.py

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# -*- coding: utf-8 -*-
2-
32
import json
43
import logging
54
import os
5+
import re
66

77
import torch
88
from tqdm import tqdm
@@ -24,6 +24,23 @@
2424
DEFAULT_MODEL = 'mistralai/Mistral-7B-Instruct-v0.2'
2525

2626

27+
def get_examples(text, k=3):
    """Extract the content within the first k sets of parentheses in a string.

    The string is scanned lazily: the search stops as soon as ``k`` groups
    have been collected, so the remainder of a long input is never examined.

    Args:
        text (str):
            Input string.
        k (int):
            Maximum number of examples to return. Values of zero or below
            yield an empty list. Default to 3.

    Returns:
        list:
            A list containing the content within the first k sets of
            parentheses, in order of appearance and without the surrounding
            parentheses. The list is shorter than ``k`` when ``text`` holds
            fewer groups.
    """
    # A non-positive count asks for no examples; return early instead of
    # letting a negative value act as a slice offset from the end.
    if k <= 0:
        return []

    examples = []
    # finditer yields matches one at a time, which lets us stop early rather
    # than materializing every parenthesized group in the input.
    for match in re.finditer(r'\(([^)]*)\)', text):
        examples.append(match.group(1))
        if len(examples) == k:
            break

    return examples
42+
43+
2744
class HF:
2845
"""Prompt Pretrained models on HuggingFace to detect anomalies in a time series.
2946
@@ -72,7 +89,6 @@ def __init__(
7289
self.samples = samples
7390
self.padding = padding
7491
self.restrict_tokens = restrict_tokens
75-
7692
self.tokenizer = AutoTokenizer.from_pretrained(self.name, use_fast=False)
7793

7894
# special tokens
@@ -111,12 +127,14 @@ def __init__(
111127

112128
self.model.eval()
113129

114-
def detect(self, X, normal=None, **kwargs):
130+
def detect(self, X, dim=1, normal=None, **kwargs):
115131
"""Use HF to detect anomalies of a signal.
116132
117133
Args:
118134
X (ndarray):
119-
Input sequences of strings containing signal values
135+
Input sequences of strings containing signal values.
136+
dim (int, optional):
137+
Number of dimensions of the time series. Default to 1.
120138
normal (str, optional):
121139
A normal reference sequence for one-shot prompting. If None,
122140
zero-shot prompting is used. Default to None.
@@ -140,7 +158,13 @@ def detect(self, X, normal=None, **kwargs):
140158
if self.restrict_tokens:
141159
user_message = PROMPTS['user_message']
142160
else:
143-
user_message = PROMPTS['user_message_2']
161+
user_message = PROMPTS['user_message_unrestricted']
162+
163+
if dim > 1:
164+
examples = get_examples(text)
165+
examples = ' '.join([f'({ex})' for ex in examples])
166+
167+
user_message = PROMPTS['user_message_multivariate'].format(dim, examples)
144168

145169
# Combine messages with one-shot example if provided
146170
message = ' '.join([
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
{
22
"system_message": "You are an expert in time series analysis. Your task is to detect anomalies in time series data.",
33
"user_message": "Below is a [SEQUENCE], please return the anomalies in that sequence in [RESPONSE]. Only return the numbers. [SEQUENCE]",
4-
"user_message_2": "Below is a [SEQUENCE], analyze the following time series and identify any anomalies. If you find anomalies, provide their values in the format [first_anomaly, ..., last_anomaly]. If no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not perform any calculations, just give your answers as told.: [SEQUENCE]",
4+
"user_message_unrestricted": "Below is a [SEQUENCE], analyze the following time series and identify any anomalies. If you find anomalies, provide their values in the format [first_anomaly, ..., last_anomaly]. If no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not perform any calculations, just give your answers as told.: [SEQUENCE]",
5+
"user_message_multivariate": "Below is a multivariate time series of {} dimensions with the following format \"{}\".\nYour job is to detect ranges of anomalies in this time series, in terms of the x-axis coordinates. List one by one in a list.\nFor example, if ranges (including two endpoints) [5, 12], [71, 74], and [105, 131], are anomalies, then output \"[[5,12], [71, 74], [105, 131]]\".\nIf no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not perform any calculations, just give your answers as told.\n[SEQUENCE]\n",
56
"one_shot_prefix": "Here is a normal reference of the time series: [NORMAL]"
67
}

sigllm/primitives/prompting/timeseries_preprocessing.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ def rolling_window_sequences(X, window_size=500, step_size=100):
2626
* rolling window sequences.
2727
* first index value of each input sequence.
2828
"""
29+
if X.ndim == 1:
30+
dim = 1
31+
else:
32+
dim = X.shape[1]
33+
2934
index = range(len(X))
3035
out_X = list()
3136
X_index = list()
@@ -37,4 +42,4 @@ def rolling_window_sequences(X, window_size=500, step_size=100):
3742
out_X.append(X[start:end])
3843
X_index.append(index[start])
3944
start = start + step_size
40-
return np.asarray(out_X), np.asarray(X_index), window_size, step_size
45+
return np.asarray(out_X), np.asarray(X_index), window_size, step_size, dim

tests/primitives/prompting/test_timeseries_preprocessing.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,30 @@ def test_rolling_window_sequences(values, window_size, step_size):
3131
np.array([0, 1, 2, 3, 4]),
3232
3,
3333
1,
34+
1,
35+
)
36+
37+
result = rolling_window_sequences(values, window_size, step_size)
38+
39+
if len(result) != len(expected):
40+
raise AssertionError('Tuples has different length')
41+
42+
for arr1, arr2 in zip(result, expected):
43+
np.testing.assert_equal(arr1, arr2)
44+
45+
46+
def test_rolling_window_sequences_multivariate(window_size, step_size):
47+
values = np.array([[0.555, 2.345], [1.501, 5.903], [9.116, 3.068], [7.432, 4.532]])
48+
49+
expected = (
50+
np.array([
51+
[[0.555, 2.345], [1.501, 5.903], [9.116, 3.068]],
52+
[[1.501, 5.903], [9.116, 3.068], [7.432, 4.532]],
53+
]),
54+
np.array([0, 1]),
55+
3,
56+
1,
57+
2,
3458
)
3559

3660
result = rolling_window_sequences(values, window_size, step_size)

tutorials/pipelines/multivariate-mistral-prompter-pipeline.ipynb

Lines changed: 1186 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)