
Commit aeb8ffb

Merge pull request #6 from stratosphereips/harpo_llm_rpi5_mem_cpu_bench
add scripts for testing mem/tokens/s for rpi5 llm models
2 parents eb3c200 + 24e0f05

File tree: 4 files changed, +230 -0 lines changed

benchmark_models/README.md

Lines changed: 79 additions & 0 deletions

# Ollama Model Benchmark Script

This directory contains a Bash script that automates benchmarking of the models served by a remote Ollama instance. It fetches the list of available models, runs a specified prompt against each model using a local Python script, and measures each model's performance. For each model, it collects:

* Quantization level
* Disk size (from `/api/tags`)
* RAM usage while loaded (from `/api/ps`)
* Tokens per second (TPS) from the Python output

The data is printed in a formatted table and also saved to a CSV file (`results.csv`) for further analysis.

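The disk, quantization, and RAM figures come from the two Ollama endpoints listed above. A minimal sketch of those lookups, using the same `jq` paths the script relies on (host and port are placeholders taken from the script's defaults):

```bash
# Installed models: name, quantization level, and size on disk (bytes)
curl -s "http://10.147.20.102:11434/api/tags" \
  | jq -r '.models[] | "\(.name)\t\(.details.quantization_level)\t\(.size)"'

# Models currently loaded in memory: name and loaded size (bytes)
curl -s "http://10.147.20.102:11434/api/ps" \
  | jq -r '.models[] | "\(.name)\t\(.size)"'
```
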
## Requirements

* Bash
* `curl`
* [`jq`](https://stedolan.github.io/jq/) for JSON parsing
* Python script: `stream_query_llm.py` that supports:
  * `--model`
  * `--prompt`
  * `--base_url`
  * `--stats_only` flag

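The benchmark script invokes `stream_query_llm.py` with exactly these flags; a standalone run against the same Ollama endpoint might look like this (host, model, and prompt are illustrative, taken from the defaults in this repository):

```bash
python3 stream_query_llm.py \
  --base_url "http://10.147.20.102:11434/v1" \
  --model "qwen2.5:1.5b" \
  --prompt "generate a zeek script for detecting Suspicious HTTP User-Agents. Be concise." \
  --stats_only
```
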
## Files

* `benchmark_ollama_models.sh` — Main benchmarking script
* `results.csv` — Output file containing the benchmark results
* `stream_query_llm.py` — Your local script that streams responses and prints usage stats

## Collected Metrics

The script gathers and logs the following for each model:

| Metric | Source | Description |
| ----------------- | ------------- | --------------------------------------------- |
| Model name | `/api/tags` | Name of the model (e.g., `llama3:8b`) |
| Quantization | `/api/tags` | Quantization level (e.g., `Q4_K_M`) |
| Disk size (MB) | `/api/tags` | Size of the GGUF model on disk |
| RAM size (MB) | `/api/ps` | Actual loaded model size in memory |
| Tokens per second | Python script | Measured performance (completion tokens/time) |

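The last row is parsed from the stats block that `stream_query_llm.py` prints when called with `--stats_only`; the benchmark script greps the `Tokens per second` line from output shaped like this (numbers are illustrative):

```
🧠 Stats:
 Prompt tokens: 28
 Completion tokens: 256
 Total tokens: 284
 Time taken: 25.93 sec
 Tokens per second: 9.87 TPS
```
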
## Usage

1. Make the script executable:

   ```bash
   chmod +x benchmark_ollama_models.sh
   ```

2. Run it:

   ```bash
   ./benchmark_ollama_models.sh
   ```

This will:

* Call each available model
* Run a prompt
* Log and print performance data
* Save results to `results.csv`

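Each line of `results.csv` follows the header the script writes; the copy committed with this change begins:

```
model,quantization,disk_size_mb,ram_size_mb,tokens_per_second
granite3.1-dense:2b,Q4_K_M,1497.0,2615.0,5.96
llama3.2:3b,Q4_K_M,1925.8,3331.3,4.82
```
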
## Configuration

Inside the script, you can customize:

* `OLLAMA_HOST`: IP or hostname of your Ollama server
* `PORT`: Ollama server port (default is `11434`)
* `PROMPT`: Prompt string to be used for benchmarking

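These map directly to the variables defined at the top of `benchmark_ollama_models.sh`; the defaults committed here are:

```bash
OLLAMA_HOST="10.147.20.102"
PORT="11434"
PROMPT="generate a zeek script for detecting Suspicious HTTP User-Agents. Be concise."
```
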
## Notes

* Only models that were successfully loaded and used will report a RAM size
* If TPS extraction fails, the script will mark the entry as `ERROR`
* The script assumes Ollama's REST API is accessible remotely

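The `ERROR` value comes from the TPS extraction step: the script pulls the number out of the Python output with the `grep`/`awk` pipeline below and falls back to `ERROR` when nothing matches:

```bash
# Take the numeric field just before "TPS" in the stats line
tps=$(echo "$output" | grep "Tokens per second" | awk '{print $(NF-1)}')
display_tps="${tps:-ERROR}"
```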

benchmark_models/benchmark_ollama_models.sh

Lines changed: 75 additions & 0 deletions

#!/bin/bash

# --- Configuration ---
OLLAMA_HOST="10.147.20.102"
PORT="11434"
BASE_URL="http://$OLLAMA_HOST:$PORT/v1"
API_TAGS_URL="http://$OLLAMA_HOST:$PORT/api/tags"
API_PS_URL="http://$OLLAMA_HOST:$PORT/api/ps"
PYTHON_SCRIPT="stream_query_llm.py"
PROMPT="generate a zeek script for detecting Suspicious HTTP User-Agents. Be concise."
CSV_FILE="results.csv"

# --- Check prerequisites ---
if [[ ! -f "$PYTHON_SCRIPT" ]]; then
  echo "❌ Python script '$PYTHON_SCRIPT' not found!"
  exit 1
fi

if ! command -v jq &>/dev/null; then
  echo "❌ 'jq' is required but not installed."
  exit 1
fi

# --- Fetch model list ---
tags_response=$(curl -s "$API_TAGS_URL")
if [[ -z "$tags_response" ]]; then
  echo "❌ Failed to fetch models from $API_TAGS_URL"
  exit 1
fi

# --- CSV Output File ---
echo "model,quantization,disk_size_mb,ram_size_mb,tokens_per_second" > "$CSV_FILE"

# --- Table Header ---
echo -e "\n📊 Remote Model Benchmark Summary:"
printf "%-25s %-12s %14s %14s %10s\n" "Model" "Quantization" "Disk Size (MB)" "RAM Size (MB)" "TPS"
printf "%-25s %-12s %14s %14s %10s\n" "-----" "------------" "--------------" "-------------" "----"

# --- Loop through models ---
models=$(echo "$tags_response" | jq -c '.models[]')

for model_json in $models; do
  model=$(echo "$model_json" | jq -r '.name')
  quant=$(echo "$model_json" | jq -r '.details.quantization_level // "N/A"')
  disk_bytes=$(echo "$model_json" | jq -r '.size // 0')
  disk_mb=$(awk "BEGIN {printf \"%.1f\", $disk_bytes/1024/1024}")

  # Run benchmark
  output=$(python3 "$PYTHON_SCRIPT" \
    --prompt "$PROMPT" \
    --base_url "$BASE_URL" \
    --model "$model" \
    --stats_only 2>/dev/null)

  # Extract TPS
  tps=$(echo "$output" | grep "Tokens per second" | awk '{print $(NF-1)}')
  display_tps="${tps:-ERROR}"
  csv_tps="${tps:-}"

  # Query /api/ps for live RAM usage
  ps_response=$(curl -s "$API_PS_URL")
  ps_entry=$(echo "$ps_response" | jq -c --arg name "$model" '.models[] | select(.name == $name)')

  ram_bytes=$(echo "$ps_entry" | jq -r '.size // 0')
  ram_mb=$(awk "BEGIN {printf \"%.1f\", $ram_bytes/1024/1024}")

  # Print to terminal
  printf "%-25s %-12s %14s %14s %10s\n" "$model" "$quant" "$disk_mb" "$ram_mb" "$display_tps"

  # Append to CSV
  echo "$model,$quant,$disk_mb,$ram_mb,$csv_tps" >> "$CSV_FILE"
done

echo -e "\n✅ Results saved to: $CSV_FILE"

benchmark_models/results.csv

Lines changed: 11 additions & 0 deletions

model,quantization,disk_size_mb,ram_size_mb,tokens_per_second
granite3.1-dense:2b,Q4_K_M,1497.0,2615.0,5.96
llama3.2:3b,Q4_K_M,1925.8,3331.3,4.82
smollm2:1.7b,Q8_0,1736.1,3946.4,5.13
qwen2.5:1.5b,Q4_K_M,940.4,1495.0,9.87
phi4-mini:latest,Q4_K_M,2376.4,3998.7,3.98
gemma3:4b,Q4_K_M,3184.1,5231.4,3.90
qwen2.5:3b,Q4_K_M,1840.5,2478.6,5.11
gemma3:1b,Q4_K_M,777.5,1361.8,11.49
deepseek-r1:1.5b,Q4_K_M,1065.6,1495.0,9.69
llama3.2:1b,Q8_0,1259.9,2130.1,7.45

benchmark_models/stream_query_llm.py

Lines changed: 65 additions & 0 deletions

import openai
import time
import argparse

def stream_chat_with_usage(prompt, base_url, model, stats_only):
    # Custom OpenAI-compatible API endpoint
    client = openai.OpenAI(
        api_key="ollama",  # Leave blank or use a value if your API requires it
        base_url=base_url
    )

    messages = [{"role": "user", "content": prompt}]
    #model = "gpt-4"  # Change this if your local model uses a different name

    if not stats_only:
        print("AI:", end=" ", flush=True)
    full_reply = ""
    usage_info = None

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        stream=True,
        stream_options={"include_usage": True}
    )

    start_time = time.time()
    for chunk in response:
        if chunk.choices and chunk.choices[0].delta.content:
            part = chunk.choices[0].delta.content
            if not stats_only:
                print(part, end="", flush=True)
            full_reply += part
        if hasattr(chunk, "usage") and chunk.usage:
            usage_info = chunk.usage

    end_time = time.time()
    duration = end_time - start_time

    print("\n\n🧠 Stats:")
    if usage_info:
        prompt_tokens = usage_info.prompt_tokens
        completion_tokens = usage_info.completion_tokens
        total_tokens = usage_info.total_tokens
        tps = completion_tokens / duration if duration > 0 else 0

        print(f" Prompt tokens: {prompt_tokens}")
        print(f" Completion tokens: {completion_tokens}")
        print(f" Total tokens: {total_tokens}")
        print(f" Time taken: {duration:.2f} sec")
        print(f" Tokens per second: {tps:.2f} TPS")
    else:
        print(" Usage information not available.")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Stream chat completions with usage metrics.")
    parser.add_argument("--base_url", default="http://rpi5:18080/v1", help="Base URL of the OpenAI-compatible API")
    parser.add_argument("--prompt", required=True, help="Prompt to send to the model")
    parser.add_argument("--model", default="qwen2.5:1.5b", help="The model to use")
    parser.add_argument("--stats_only", action="store_true", help="Print only the stats, no prompt nor completion")

    args = parser.parse_args()

    stream_chat_with_usage(args.prompt, args.base_url, args.model, args.stats_only)
