Skip to content

Commit f3fa070

Browse files
authored
Merge pull request #4 from stratosphereips/harpo-add-8b-models
Harpo add 8b and 1b models into the evaluation
2 parents dcc50fa + 4c06ebb commit f3fa070

File tree

10 files changed

+2287
-2
lines changed

10 files changed

+2287
-2
lines changed

llm-unittest/providers/providers.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
- id: openai:chat:qwen2.5:3b
2020
config:
2121
num_predict: 2048
22+
- id: openai:chat:qwen2.5:3b-instruct-q8_0
23+
config:
24+
num_predict: 2048
2225
- id: openai:chat:granite3.1-dense:2b
2326
config:
2427
num_predict: 2048

llm-unittest/providers/providers_fmt_openai.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
config:
2727
num_predict: 2048
2828
response_format: {'type':'json_object'}
29+
- id: openai:chat:qwen2.5:3b-instruct-q8_0
30+
config:
31+
num_predict: 2048
32+
response_format: {'type':'json_object'}
2933
- id: openai:chat:granite3.1-dense:2b
3034
config:
3135
num_predict: 2048

llm-unittest/results/cols_export_bitnet.json

Lines changed: 92 additions & 0 deletions
Large diffs are not rendered by default.

llm-unittest/results/cols_export_openai_8b.json

Lines changed: 1132 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
ID,Created,Description,Pass Rate,# Tests
2+
eval-rL4-2025-04-30T20:50:05,"4/30/2025, 5:50:05 PM",Generate Structured Networking JSON (API),60,5
3+
eval-vtx-2025-04-30T20:48:41,"4/30/2025, 5:48:41 PM",Generate Function Call from Prompt,50,2
4+
eval-8uC-2025-04-30T20:44:25,"4/30/2025, 5:44:25 PM",Summarize Zeek Logs and Make Classifications,50,2
5+
eval-jcs-2025-04-30T20:42:42,"4/30/2025, 5:42:42 PM",Generate Valid Zeek Log Line,33.333,3
6+
eval-9MV-2025-04-30T20:40:06,"4/30/2025, 5:40:06 PM",Interpret Zeek Log Entries,100,2
7+
eval-cuP-2025-04-30T20:39:11,"4/30/2025, 5:39:11 PM",Test the generation of valid JSON actions with correct structure.,60,5
8+
eval-ce4-2025-04-30T20:38:16,"4/30/2025, 5:38:16 PM",Generate Structured Networking JSON,80,5
9+
eval-CzS-2025-04-30T20:35:18,"4/30/2025, 5:35:18 PM",Summarize Networking Actions from JSON,80,5
10+
eval-KZv-2025-04-30T20:34:29,"4/30/2025, 5:34:29 PM",Field Extraction from Networking JSON,60,5
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
ID,Created,Description,Pass Rate,# Tests
2+
eval-ilS-2025-04-17T14:28:42,"4/17/2025, 11:28:42 AM",Generate Structured Networking JSON (API),80,5
3+
eval-5eW-2025-04-17T14:28:33,"4/17/2025, 11:28:33 AM",Generate Function Call from Prompt,25,2
4+
eval-u4z-2025-04-17T14:28:21,"4/17/2025, 11:28:21 AM",Summarize Zeek Logs and Make Classifications,50,2
5+
eval-tjS-2025-04-17T14:28:10,"4/17/2025, 11:28:10 AM",Generate Valid Zeek Log Line,30.556,3
6+
eval-qFB-2025-04-17T14:28:00,"4/17/2025, 11:28:00 AM",Interpret Zeek Log Entries,45.833,2
7+
eval-Hut-2025-04-17T14:27:54,"4/17/2025, 11:27:54 AM",Test the generation of valid JSON actions with correct structure.,45,5
8+
eval-Z6N-2025-04-17T14:27:45,"4/17/2025, 11:27:45 AM",Generate Structured Networking JSON,50,5
9+
eval-9aW-2025-04-17T14:27:34,"4/17/2025, 11:27:34 AM",Summarize Networking Actions from JSON,63.333,5
10+
eval-FlW-2025-04-17T14:27:23,"4/17/2025, 11:27:23 AM",Field Extraction from Networking JSON,83.333,5

llm-unittest/results/evals_openai_8b.json

Lines changed: 992 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
ID,Created,Description,Pass Rate,# Tests
2+
eval-nel-2025-04-17T13:35:28,"4/17/2025, 10:35:28 AM",Generate Structured Networking JSON (API),83.636,5
3+
eval-nD5-2025-04-17T13:35:14,"4/17/2025, 10:35:14 AM",Generate Function Call from Prompt,27.273,2
4+
eval-nTF-2025-04-17T13:34:58,"4/17/2025, 10:34:58 AM",Summarize Zeek Logs and Make Classifications,54.545,2
5+
eval-BFL-2025-04-17T13:34:46,"4/17/2025, 10:34:46 AM",Generate Valid Zeek Log Line,33.333,3
6+
eval-4oN-2025-04-17T13:34:33,"4/17/2025, 10:34:33 AM",Interpret Zeek Log Entries,50,2
7+
eval-oyI-2025-04-17T13:34:27,"4/17/2025, 10:34:27 AM",Test the generation of valid JSON actions with correct structure.,49.091,5
8+
eval-Plj-2025-04-17T13:34:19,"4/17/2025, 10:34:19 AM",Generate Structured Networking JSON,54.545,5
9+
eval-jdz-2025-04-17T13:34:07,"4/17/2025, 10:34:07 AM",Summarize Networking Actions from JSON,67.273,5
10+
eval-3tl-2025-04-17T13:33:56,"4/17/2025, 10:33:56 AM",Field Extraction from Networking JSON,89.091,5
-11.2 KB
Loading

llm-unittest/scripts/eval_analysis.rmd

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,43 @@ output: html_notebook
55
```{r}
66
library(tidyverse)
77
library(jsonlite)
8+
```
9+
```{r}
10+
evals_file<-"/home/harpo/Dropbox/ongoing-work/git-repos/slips-tools/llm-unittest/results/evals_openai_8b.csv"
11+
cols_file<-"/home/harpo/Dropbox/ongoing-work/git-repos/slips-tools/llm-unittest/results/cols_export_openai_8b.json"
12+
```
13+
14+
15+
```{r}
16+
bitnet_evals_file<-"/home/harpo/Dropbox/ongoing-work/git-repos/slips-tools/llm-unittest/results/evals_bitnet.csv"
17+
bitnet_cols_file<-"/home/harpo/Dropbox/ongoing-work/git-repos/slips-tools/llm-unittest/results/cols_export_bitnet.json"
18+
19+
```
20+
21+
```{r}
22+
23+
824
```
925

1026

1127
Here we have the information about the description of the test
1228
```{r}
13-
promptfoo_data_test<-read_csv("/home/harpo/Dropbox/ongoing-work/git-repos/slips-tools/llm-unittest/results/evals_openai.csv")
29+
promptfoo_data_test<-read_csv(evals_file)
30+
```
31+
```{r}
32+
promptfoo_data_test_bitnet<-read_csv(bitnet_evals_file)
33+
promptfoo_data_test<- rbind(promptfoo_data_test,promptfoo_data_test_bitnet)
34+
```
35+
36+
37+
```{r}
1438
promptfoo_data_test <- promptfoo_data_test %>% rename(evalId="ID")
1539
```
1640
Here we have the actual tests
1741
```{r}
18-
promptfoo_data<-fromJSON("/home/harpo/Dropbox/ongoing-work/git-repos/slips-tools/llm-unittest/results/cols_export_openai.json",flatten = TRUE)
42+
promptfoo_data<-fromJSON(cols_file,flatten = TRUE)
43+
promptfoo_data_bitnet<-fromJSON(bitnet_cols_file,flatten = TRUE)
44+
promptfoo_data <- rbind(promptfoo_data,promptfoo_data_bitnet)
1945
#promptfoo_data %>% select(description) %>% unique()
2046
```
2147

@@ -42,6 +68,9 @@ pivot_scores <- promptfoo_data %>%
4268
values_from = pass_rate
4369
)
4470
71+
pivot_scores<-pivot_scores %>% filter(! provider %in% c("openai:chat:qwen2.5:3b-instruct-q8_0",
72+
"openai:chat:smollm:1.7b-instruct-v0.2-q5_K_M",
73+
"openai:chat:smollm:1.7b-instruct-v0.2-q8_0"))
4574
```
4675
```{r fig.height=8, fig.width=8}
4776
library(ggplot2)
@@ -51,6 +80,9 @@ library(tidyr)
5180
long_data <- pivot_scores %>%
5281
pivot_longer(-provider, names_to = "Description", values_to = "pass_rate")
5382
83+
long_data$provider <- gsub("openai:chat:", "", long_data$provider)
84+
85+
5486
ggplot(long_data, aes(x = Description, y = provider, fill = pass_rate)) +
5587
geom_tile(color = "white") +
5688
scale_fill_gradient2(low = "red", mid = "yellow", high = "darkgreen", midpoint = 50, na.value = "grey90") +

0 commit comments

Comments
 (0)