Multivariate prompter pipeline (#47)

sarahmish · web-flow · commit 1d4e25720f6a · 2025-09-02T15:33:35.000-04:00
* add interval parsing

* change primitive name

* fix lint

* multivariate pipeline

* multivariate pipeline

* fix lint

* add dim tests

* fix lint

* update tutorial

* update tutorial output

* update docstrings on `get_examples`

* remove trailing whitespace
diff --git a/sigllm/pipelines/prompter/mistral_prompter_multivariate_0shot.json b/sigllm/pipelines/prompter/mistral_prompter_multivariate_0shot.json
@@ -0,0 +1,68 @@
+{
+    "primitives": [
+        "mlstars.custom.timeseries_preprocessing.time_segments_aggregate",
+        "sklearn.impute.SimpleImputer",
+        "sigllm.primitives.transformation.Float2Scalar",
+        "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences",
+        "sigllm.primitives.transformation.format_as_string",
+        "sigllm.primitives.prompting.huggingface.HF",
+        "sigllm.primitives.prompting.anomalies.parse_anomaly_response",
+        "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows",
+        "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences",
+        "sigllm.primitives.prompting.anomalies.format_anomalies"
+    ],
+    "init_params": {
+        "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
+            "time_column": "timestamp",
+            "interval": 21600,
+            "method": "mean"
+        },
+        "sigllm.primitives.transformation.Float2Scalar#1": {
+            "decimal": 2,
+            "rescale": true
+        },
+        "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences#1": {
+            "window_size": 100,
+            "step_size": 40
+        },
+        "sigllm.primitives.transformation.format_as_string#1": {
+            "space": false
+        },
+        "sigllm.primitives.prompting.huggingface.HF#1": {
+            "name": "mistralai/Mistral-7B-Instruct-v0.2",
+            "samples": 1,
+            "temp": 0.01
+        },
+        "sigllm.primitives.prompting.anomalies.parse_anomaly_response#1": {
+            "interval": true
+        },
+        "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": {
+            "alpha": 0.4
+        },
+        "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences#1": {
+            "beta": 0.5
+        }
+    },
+    "input_names": {
+        "sigllm.primitives.prompting.huggingface.HF#1": {
+            "X": "X_str"
+        },
+        "sigllm.primitives.prompting.anomalies.parse_anomaly_response#1": {
+            "X": "y_hat"
+        }
+    },
+    "output_names": {
+        "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
+            "index": "timestamp"
+        },
+        "sigllm.primitives.transformation.format_as_string#1": {
+            "X": "X_str"
+        },
+        "sigllm.primitives.prompting.huggingface.HF#1": {
+            "y": "y_hat"
+        },
+        "sigllm.primitives.prompting.anomalies.parse_anomaly_response#1": {
+            "X": "y"
+        }
+    }
+}
diff --git a/sigllm/primitives/jsons/sigllm.primitives.prompting.huggingface.HF.json b/sigllm/primitives/jsons/sigllm.primitives.prompting.huggingface.HF.json
@@ -17,6 +17,11 @@
                 "name": "X",
                 "type": "ndarray"
             },
+            {
+                "name": "dim",
+                "type": "int",
+                "default": 1
+            },
             {
                 "name": "normal",
                 "type": "ndarray",
diff --git a/sigllm/primitives/jsons/sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences.json b/sigllm/primitives/jsons/sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences.json
@@ -36,6 +36,10 @@
             {
                 "name": "step_size", 
                 "type": "int"
+            },
+            {
+                "name": "dim",
+                "type": "int"
             }
         ]
     }, 
diff --git a/sigllm/primitives/prompting/anomalies.py b/sigllm/primitives/prompting/anomalies.py
@@ -131,6 +131,9 @@ def find_anomalies_in_windows(y, alpha=0.5):
         ndarray:
             A 2-dimensional array containing final anomalous indices of each windows.
     """
+    if isinstance(y, list):
+        y = np.array(y, dtype=object)
+
     idx_list = []
     for samples in y:
         min_vote = np.ceil(alpha * len(samples))
@@ -203,6 +206,7 @@ def format_anomalies(y, timestamp, padding_size=50):
     if len(y) == 0:
         return []
 
+    y = y[y < len(timestamp)].astype(int)
     y = timestamp[y]  # Convert list of indices into list of timestamps
     start, end = timestamp[0], timestamp[-1]
     interval = timestamp[1] - timestamp[0]
diff --git a/sigllm/primitives/prompting/huggingface.py b/sigllm/primitives/prompting/huggingface.py
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
-
 import json
 import logging
 import os
+import re
 
 import torch
 from tqdm import tqdm
@@ -24,6 +24,23 @@
 DEFAULT_MODEL = 'mistralai/Mistral-7B-Instruct-v0.2'
 
 
+def get_examples(text, k=3):
+    """Extracts the content within the first k sets of parentheses in a string.
+
+    Args:
+        text (str):
+            Input string.
+        k (int):
+            Number of examples to return.
+
+    Returns:
+        list:
+            A list containing the content within the first k sets of parentheses.
+    """
+    matches = re.findall(r'\(([^)]*)\)', text)
+    return matches[:k]
+
+
 class HF:
     """Prompt Pretrained models on HuggingFace to detect anomalies in a time series.
 
@@ -72,7 +89,6 @@ def __init__(
         self.samples = samples
         self.padding = padding
         self.restrict_tokens = restrict_tokens
-
         self.tokenizer = AutoTokenizer.from_pretrained(self.name, use_fast=False)
 
         # special tokens
@@ -111,12 +127,14 @@ def __init__(
 
         self.model.eval()
 
-    def detect(self, X, normal=None, **kwargs):
+    def detect(self, X, dim=1, normal=None, **kwargs):
         """Use HF to detect anomalies of a signal.
 
         Args:
             X (ndarray):
-                Input sequences of strings containing signal values
+                Input sequences of strings containing signal values.
+            dim (int, optional):
+                Number of dimensions of the time series. Default to 1.
             normal (str, optional):
                 A normal reference sequence for one-shot prompting. If None,
                 zero-shot prompting is used. Default to None.
@@ -140,7 +158,13 @@ def detect(self, X, normal=None, **kwargs):
             if self.restrict_tokens:
                 user_message = PROMPTS['user_message']
             else:
-                user_message = PROMPTS['user_message_2']
+                user_message = PROMPTS['user_message_unrestricted']
+
+            if dim > 1:
+                examples = get_examples(text)
+                examples = ' '.join([f'({ex})' for ex in examples])
+
+                user_message = PROMPTS['user_message_multivariate'].format(dim, examples)
 
             # Combine messages with one-shot example if provided
             message = ' '.join([
diff --git a/sigllm/primitives/prompting/huggingface_messages.json b/sigllm/primitives/prompting/huggingface_messages.json
@@ -1,6 +1,7 @@
 {
     "system_message": "You are an expert in time series analysis. Your task is to detect anomalies in time series data.",
     "user_message": "Below is a [SEQUENCE], please return the anomalies in that sequence in [RESPONSE]. Only return the numbers. [SEQUENCE]",
-    "user_message_2": "Below is a [SEQUENCE], analyze the following time series and identify any anomalies. If you find anomalies, provide their values in the format [first_anomaly, ..., last_anomaly]. If no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not perform any calculations, just give your answers as told.: [SEQUENCE]",
+    "user_message_unrestricted": "Below is a [SEQUENCE], analyze the following time series and identify any anomalies. If you find anomalies, provide their values in the format [first_anomaly, ..., last_anomaly]. If no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not perform any calculations, just give your answers as told.: [SEQUENCE]",
+    "user_message_multivariate": "Below is a multivariate time series of {} dimensions with the following format \"{}\".\nYour job is to detect ranges of anomalies in this time series, in terms of the x-axis coordinates. List one by one in a list.\nFor example, if ranges (including two endpoints) [5, 12], [71, 74], and [105, 131], are anomalies, then output \"[[5,12], [71, 74], [105, 131]]\".\nIf no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not perform any calculations, just give your answers as told.\n[SEQUENCE]\n",
     "one_shot_prefix": "Here is a normal reference of the time series: [NORMAL]"
 }
diff --git a/sigllm/primitives/prompting/timeseries_preprocessing.py b/sigllm/primitives/prompting/timeseries_preprocessing.py
@@ -26,6 +26,11 @@ def rolling_window_sequences(X, window_size=500, step_size=100):
             * rolling window sequences.
             * first index value of each input sequence.
     """
+    if X.ndim == 1:
+        dim = 1
+    else:
+        dim = X.shape[1]
+
     index = range(len(X))
     out_X = list()
     X_index = list()
@@ -37,4 +42,4 @@ def rolling_window_sequences(X, window_size=500, step_size=100):
         out_X.append(X[start:end])
         X_index.append(index[start])
         start = start + step_size
-    return np.asarray(out_X), np.asarray(X_index), window_size, step_size
+    return np.asarray(out_X), np.asarray(X_index), window_size, step_size, dim
diff --git a/tests/primitives/prompting/test_timeseries_preprocessing.py b/tests/primitives/prompting/test_timeseries_preprocessing.py
@@ -31,6 +31,30 @@ def test_rolling_window_sequences(values, window_size, step_size):
         np.array([0, 1, 2, 3, 4]),
         3,
         1,
+        1,
+    )
+
+    result = rolling_window_sequences(values, window_size, step_size)
+
+    if len(result) != len(expected):
+        raise AssertionError('Tuples has different length')
+
+    for arr1, arr2 in zip(result, expected):
+        np.testing.assert_equal(arr1, arr2)
+
+
+def test_rolling_window_sequences_multivariate(window_size, step_size):
+    values = np.array([[0.555, 2.345], [1.501, 5.903], [9.116, 3.068], [7.432, 4.532]])
+
+    expected = (
+        np.array([
+            [[0.555, 2.345], [1.501, 5.903], [9.116, 3.068]],
+            [[1.501, 5.903], [9.116, 3.068], [7.432, 4.532]],
+        ]),
+        np.array([0, 1]),
+        3,
+        1,
+        2,
     )
 
     result = rolling_window_sequences(values, window_size, step_size)
diff --git a/tutorials/pipelines/multivariate-mistral-prompter-pipeline.ipynb b/tutorials/pipelines/multivariate-mistral-prompter-pipeline.ipynb

Original file line number	Diff line number	Diff line change
`@@ -36,6 +36,10 @@`
`36`	`36`	`{`
`37`	`37`	`"name": "step_size",`
`38`	`38`	`"type": "int"`
	`39`	`+ },`
	`40`	`+ {`
	`41`	`+ "name": "dim",`
	`42`	`+ "type": "int"`
`39`	`43`	`}`
`40`	`44`	`]`
`41`	`45`	`},`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,7 @@`
`1`	`1`	`{`
`2`	`2`	`"system_message": "You are an expert in time series analysis. Your task is to detect anomalies in time series data.",`
`3`	`3`	`"user_message": "Below is a [SEQUENCE], please return the anomalies in that sequence in [RESPONSE]. Only return the numbers. [SEQUENCE]",`
`4`		`- "user_message_2": "Below is a [SEQUENCE], analyze the following time series and identify any anomalies. If you find anomalies, provide their values in the format [first_anomaly, ..., last_anomaly]. If no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not perform any calculations, just give your answers as told.: [SEQUENCE]",`
	`4`	`+ "user_message_unrestricted": "Below is a [SEQUENCE], analyze the following time series and identify any anomalies. If you find anomalies, provide their values in the format [first_anomaly, ..., last_anomaly]. If no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not perform any calculations, just give your answers as told.: [SEQUENCE]",`
	`5`	+ "user_message_multivariate": "Below is a multivariate time series of {} dimensions with the following format \"{}\".\nYour job is to detect ranges of anomalies in this time series, in terms of the x-axis coordinates. List one by one in a list.\nFor example, if ranges (including two endpoints) [5, 12], [71, 74], and [105, 131], are anomalies, then output \"[[5,12], [71, 74], [105, 131]]\".\nIf no anomalies are found, respond with 'no anomalies'. Be concise, do not write code, do not perform any calculations, just give your answers as told.\n[SEQUENCE]\n",
`5`	`6`	`"one_shot_prefix": "Here is a normal reference of the time series: [NORMAL]"`
`6`	`7`	`}`