stratosphereips
diff --git a/‎llm-unittest/providers/providers.yaml‎
Lines changed: 3 additions & 0 deletions b/‎llm-unittest/providers/providers.yaml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎llm-unittest/providers/providers_fmt_openai.yaml‎
Lines changed: 4 additions & 0 deletions b/‎llm-unittest/providers/providers_fmt_openai.yaml‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎llm-unittest/results/cols_export_bitnet.json‎
Lines changed: 92 additions & 0 deletions b/‎llm-unittest/results/cols_export_bitnet.json‎
Lines changed: 92 additions & 0 deletions
diff --git a/‎llm-unittest/results/cols_export_openai_8b.json‎
Lines changed: 1132 additions & 0 deletions b/‎llm-unittest/results/cols_export_openai_8b.json‎
Lines changed: 1132 additions & 0 deletions
diff --git a/‎llm-unittest/results/evals_bitnet.csv‎
Lines changed: 10 additions & 0 deletions b/‎llm-unittest/results/evals_bitnet.csv‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎llm-unittest/results/evals_openai_8b.csv‎
Lines changed: 10 additions & 0 deletions b/‎llm-unittest/results/evals_openai_8b.csv‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎llm-unittest/results/evals_openai_8b.json‎
Lines changed: 992 additions & 0 deletions b/‎llm-unittest/results/evals_openai_8b.json‎
Lines changed: 992 additions & 0 deletions
diff --git a/‎llm-unittest/results/evals_openai_8b_smol.csv‎
Lines changed: 10 additions & 0 deletions b/‎llm-unittest/results/evals_openai_8b_smol.csv‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎llm-unittest/results/models_heatmap.png‎
-11.2 KB b/‎llm-unittest/results/models_heatmap.png‎
-11.2 KB
diff --git a/‎llm-unittest/scripts/eval_analysis.rmd‎
Lines changed: 34 additions & 2 deletions b/‎llm-unittest/scripts/eval_analysis.rmd‎
Lines changed: 34 additions & 2 deletions
@@ -19,6 +19,9 @@
 - id: openai:chat:qwen2.5:3b
   config:
     num_predict: 2048
+- id: openai:chat:qwen2.5:3b-instruct-q8_0
+  config:
+    num_predict: 2048
 - id: openai:chat:granite3.1-dense:2b
   config:
     num_predict: 2048
 
@@ -26,6 +26,10 @@
   config:
     num_predict: 2048
     response_format: {'type':'json_object'}
+- id: openai:chat:qwen2.5:3b-instruct-q8_0
+  config:
+    num_predict: 2048
+    response_format: {'type':'json_object'}
 - id: openai:chat:granite3.1-dense:2b
   config:
     num_predict: 2048
 
@@ -0,0 +1,10 @@
+ID,Created,Description,Pass Rate,# Tests
+eval-rL4-2025-04-30T20:50:05,"4/30/2025, 5:50:05 PM",Generate Structured Networking JSON (API),60,5
+eval-vtx-2025-04-30T20:48:41,"4/30/2025, 5:48:41 PM",Generate Function Call from Prompt,50,2
+eval-8uC-2025-04-30T20:44:25,"4/30/2025, 5:44:25 PM",Summarize Zeek Logs and Make Classifications,50,2
+eval-jcs-2025-04-30T20:42:42,"4/30/2025, 5:42:42 PM",Generate Valid Zeek Log Line,33.333,3
+eval-9MV-2025-04-30T20:40:06,"4/30/2025, 5:40:06 PM",Interpret Zeek Log Entries,100,2
+eval-cuP-2025-04-30T20:39:11,"4/30/2025, 5:39:11 PM",Test the generation of valid JSON actions with correct structure.,60,5
+eval-ce4-2025-04-30T20:38:16,"4/30/2025, 5:38:16 PM",Generate Structured Networking JSON,80,5
+eval-CzS-2025-04-30T20:35:18,"4/30/2025, 5:35:18 PM",Summarize Networking Actions from JSON,80,5
+eval-KZv-2025-04-30T20:34:29,"4/30/2025, 5:34:29 PM",Field Extraction from Networking JSON,60,5
@@ -0,0 +1,10 @@
+ID,Created,Description,Pass Rate,# Tests
+eval-ilS-2025-04-17T14:28:42,"4/17/2025, 11:28:42 AM",Generate Structured Networking JSON (API),80,5
+eval-5eW-2025-04-17T14:28:33,"4/17/2025, 11:28:33 AM",Generate Function Call from Prompt,25,2
+eval-u4z-2025-04-17T14:28:21,"4/17/2025, 11:28:21 AM",Summarize Zeek Logs and Make Classifications,50,2
+eval-tjS-2025-04-17T14:28:10,"4/17/2025, 11:28:10 AM",Generate Valid Zeek Log Line,30.556,3
+eval-qFB-2025-04-17T14:28:00,"4/17/2025, 11:28:00 AM",Interpret Zeek Log Entries,45.833,2
+eval-Hut-2025-04-17T14:27:54,"4/17/2025, 11:27:54 AM",Test the generation of valid JSON actions with correct structure.,45,5
+eval-Z6N-2025-04-17T14:27:45,"4/17/2025, 11:27:45 AM",Generate Structured Networking JSON,50,5
+eval-9aW-2025-04-17T14:27:34,"4/17/2025, 11:27:34 AM",Summarize Networking Actions from JSON,63.333,5
+eval-FlW-2025-04-17T14:27:23,"4/17/2025, 11:27:23 AM",Field Extraction from Networking JSON,83.333,5
@@ -0,0 +1,10 @@
+ID,Created,Description,Pass Rate,# Tests
+eval-nel-2025-04-17T13:35:28,"4/17/2025, 10:35:28 AM",Generate Structured Networking JSON (API),83.636,5
+eval-nD5-2025-04-17T13:35:14,"4/17/2025, 10:35:14 AM",Generate Function Call from Prompt,27.273,2
+eval-nTF-2025-04-17T13:34:58,"4/17/2025, 10:34:58 AM",Summarize Zeek Logs and Make Classifications,54.545,2
+eval-BFL-2025-04-17T13:34:46,"4/17/2025, 10:34:46 AM",Generate Valid Zeek Log Line,33.333,3
+eval-4oN-2025-04-17T13:34:33,"4/17/2025, 10:34:33 AM",Interpret Zeek Log Entries,50,2
+eval-oyI-2025-04-17T13:34:27,"4/17/2025, 10:34:27 AM",Test the generation of valid JSON actions with correct structure.,49.091,5
+eval-Plj-2025-04-17T13:34:19,"4/17/2025, 10:34:19 AM",Generate Structured Networking JSON,54.545,5
+eval-jdz-2025-04-17T13:34:07,"4/17/2025, 10:34:07 AM",Summarize Networking Actions from JSON,67.273,5
+eval-3tl-2025-04-17T13:33:56,"4/17/2025, 10:33:56 AM",Field Extraction from Networking JSON,89.091,5
@@ -5,17 +5,43 @@ output: html_notebook
 ```{r}
 library(tidyverse)
 library(jsonlite)
+```
+```{r}
+# Locations of the promptfoo exports analysed in this notebook.
+# The results directory defaults to the original author's checkout and can
+# be overridden with the LLM_UNITTEST_RESULTS environment variable, so the
+# notebook is not tied to a single machine.
+results_dir <- Sys.getenv(
+  "LLM_UNITTEST_RESULTS",
+  unset = "/home/harpo/Dropbox/ongoing-work/git-repos/slips-tools/llm-unittest/results"
+)
+# Main run: evaluation summary (CSV) and per-test export (JSON).
+evals_file <- file.path(results_dir, "evals_openai_8b.csv")
+cols_file <- file.path(results_dir, "cols_export_openai_8b.json")
+# BitNet run: same two exports.
+bitnet_evals_file <- file.path(results_dir, "evals_bitnet.csv")
+bitnet_cols_file <- file.path(results_dir, "cols_export_bitnet.json")
 ```
 
 
 Here we have the information about the description of the test
 ```{r}
-promptfoo_data_test<-read_csv("/home/harpo/Dropbox/ongoing-work/git-repos/slips-tools/llm-unittest/results/evals_openai.csv")
+promptfoo_data_test <- read_csv(evals_file)
+```
+```{r}
+# Append the BitNet evaluation summaries. bind_rows() matches columns by
+# name and fills missing ones with NA; rbind() errors if the sets differ.
+promptfoo_data_test_bitnet <- read_csv(bitnet_evals_file)
+promptfoo_data_test <- bind_rows(promptfoo_data_test, promptfoo_data_test_bitnet)
+```
+```{r}
 promptfoo_data_test <- promptfoo_data_test %>% rename(evalId="ID")
 ```
 Here we have the actual tests
 ```{r}
-promptfoo_data<-fromJSON("/home/harpo/Dropbox/ongoing-work/git-repos/slips-tools/llm-unittest/results/cols_export_openai.json",flatten = TRUE)
+# Per-test results for both runs, stacked into one table.
+# flatten = TRUE turns nested JSON fields into columns, so the two exports
+# may not share an identical column set; bind_rows() matches columns by name
+# and fills the gaps with NA, where rbind() would stop with an error.
+promptfoo_data <- fromJSON(cols_file, flatten = TRUE)
+promptfoo_data_bitnet <- fromJSON(bitnet_cols_file, flatten = TRUE)
+promptfoo_data <- bind_rows(promptfoo_data, promptfoo_data_bitnet)
 #promptfoo_data %>% select(description) %>% unique()
 ```
 
@@ -42,6 +68,9 @@ pivot_scores <- promptfoo_data %>%
     values_from = pass_rate
   )
 
+# Keep every provider except the three listed here.
+pivot_scores <- filter(
+  pivot_scores,
+  !(provider %in% c(
+    "openai:chat:qwen2.5:3b-instruct-q8_0",
+    "openai:chat:smollm:1.7b-instruct-v0.2-q5_K_M",
+    "openai:chat:smollm:1.7b-instruct-v0.2-q8_0"
+  ))
+)
 ```
 ```{r fig.height=8, fig.width=8}
 library(ggplot2)
@@ -51,6 +80,9 @@ library(tidyr)
 long_data <- pivot_scores %>%
   pivot_longer(-provider, names_to = "Description", values_to = "pass_rate")
 
+# Remove the "openai:chat:" text from provider names; provider is the
+# y axis of the heatmap below, so this shortens the tick labels.
+long_data[["provider"]] <- gsub(
+  pattern = "openai:chat:",
+  replacement = "",
+  x = long_data[["provider"]]
+)
+
+
 ggplot(long_data, aes(x = Description, y = provider, fill = pass_rate)) +
   geom_tile(color = "white") +
   scale_fill_gradient2(low = "red", mid = "yellow", high = "darkgreen", midpoint = 50, na.value = "grey90") +