wandb
diff --git a/‎configs/base_config.yaml‎
Lines changed: 52 additions & 13 deletions b/‎configs/base_config.yaml‎
Lines changed: 52 additions & 13 deletions
diff --git a/‎configs/gpt-4o-2024-11-20.yaml‎
Lines changed: 17 additions & 3 deletions b/‎configs/gpt-4o-2024-11-20.yaml‎
Lines changed: 17 additions & 3 deletions
diff --git a/‎configs/gpt-5-2025-08-07.yaml‎
Lines changed: 31 additions & 0 deletions b/‎configs/gpt-5-2025-08-07.yaml‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎configs/gpt-5-mini-2025-08-07.yaml‎
Lines changed: 27 additions & 6 deletions b/‎configs/gpt-5-mini-2025-08-07.yaml‎
Lines changed: 27 additions & 6 deletions
diff --git a/‎configs/vllm_test/qwen-qwen3-4b-litellm-client.yaml‎
Lines changed: 19 additions & 0 deletions b/‎configs/vllm_test/qwen-qwen3-4b-litellm-client.yaml‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎configs/vllm_test/qwen-qwen3-4b-offline-inference.yaml‎
Lines changed: 8 additions & 0 deletions b/‎configs/vllm_test/qwen-qwen3-4b-offline-inference.yaml‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎configs/vllm_test/qwen-qwen3-4b-openai-client.yaml‎
Lines changed: 19 additions & 0 deletions b/‎configs/vllm_test/qwen-qwen3-4b-openai-client.yaml‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎examples/swebench_config.yaml‎
Lines changed: 52 additions & 0 deletions b/‎examples/swebench_config.yaml‎
Lines changed: 52 additions & 0 deletions
@@ -4,10 +4,14 @@ wandb:
     entity: horangi
     project: horangi4-dev
     project_dataset: horangi4-dataset
+  run_name: swebench-o4mini-2
 
 # testmode configurations
 testmode: false
 
+# Interval between LLM calls, in seconds. A default is provided here so that a missing key does not cause an error.
+inference_interval: 0.0
+
 mt_bench:
   split: test
   subset: ["roleplay", "humanities", "writing", "reasoning", "coding"]
@@ -23,18 +27,7 @@ mt_bench:
       temperature: 0.0
 # dataset configurations
 # SWE-bench Verified 설정
-swe_bench_verified:
-  split: test
-  subset: official_80
-  limit: 80
-  evaluation:
-    method: "swebench_verified"
-  # 원격 SWE-bench API 서버 설정 (있을 경우 실제 실행)
-  server:
-    url: ""  # 예: https://swebench-api.my-domain.com
-    token: ""  # 필요 시 토큰
-    poll_interval_sec: 10
-    max_wait_sec: 36000
+
 
 komoral:
   split: test
@@ -203,9 +196,55 @@ bfcl:
   evaluation:
     model: gpt-4o-2024-11-20
 
+# ================= SWE-bench (integrated) =================
+# SWE-bench Verified dataset for evaluating code patch generation
+swebench:
+  # Dataset configuration
+  split: test
+  subset: default
+  limit: 80  # Maximum number of samples to evaluate
+
+  # Dataset parameters (passed to Dataset constructor)
+  params:
+    artifacts_path: horangi/horangi4-dataset/swebench_verified_official_80:v4
+    dataset_dir: .
+
+  # Model parameters override for this dataset
+  # These will override the model config from <model_name>.yaml
+  model_params:
+    max_tokens: 32768  # Model limit (will be automatically capped per model)
+    temperature: 0.0
+
+  # Evaluation method
+  evaluation:
+    method: swebench
+    params:
+      # API server for running tests in Docker
+      api_endpoint: https://api.nejumi-swebench.org/
+      # api_endpoint: http://localhost:8000  # For local development
+      timeout_sec: 1800  # 30 minutes per test
+      concurrency: 2  # Number of parallel jobs
+      namespace: swebench  # Docker image namespace
+      tag: latest  # Docker image tag
+
+  # Legacy settings (kept for backward compatibility with external/swe_bench).
+  max_samples: 80  # Same value as limit above; keep the two in sync
+  max_workers: 2
+  background_eval: false
+  fc_enabled: true
+  prebuild_images: false
+  images:  # Same namespace/tag values as evaluation.params above
+    namespace: swebench
+    tag: latest
+  api_server:  # Same endpoint/timeout/concurrency values as evaluation.params above
+    enabled: true
+    endpoint: https://api.nejumi-swebench.org/
+    timeout_sec: 1800
+    concurrency: 2
+
 # arc_agi2:
 #   split: evaluation
 #   subset: default
 #   limit: 1000
 #   evaluation:
-#     method: "grid_match"
+#     method: "grid_match"
@@ -3,9 +3,23 @@ model:
   params:
     model_name: gpt-4o-2024-11-20
     provider: openai
-    batch_size: 32
-    max_tokens: 1024
+    max_tokens: 16384
     temperature: 0.0
+    batch_size: 32
   release_date: "2024-11-20"
   model_size: None
-  size_category: large
+  size_category: large
+
+# SWE-bench test configuration to verify Weave automatic tracking
+swebench:
+  limit: 5  # Test with 5 samples to verify Weave auto-tracking at scale
+  params:
+    artifacts_path: horangi/horangi4-dataset/swebench_verified_official_80:v4
+    dataset_dir: .
+  model_params:
+    max_tokens: 16384  # Model limit for GPT-4o
+    timeout: 300.0  # Longer timeout for SWE-bench
+  evaluation:
+    params:
+      concurrency: 1  # One job at a time; base_config.yaml sets 2
+      timeout_sec: 1800  # 30 minutes per test, same value as base_config.yaml
@@ -0,0 +1,31 @@
+model:
+  name: openai_responses  # Using OpenAI Responses API for reasoning models
+  params:
+    model_name: gpt-5-2025-08-07
+    api_base: https://api.openai.com/v1
+    batch_size: 8  # Lower for full reasoning model
+    max_tokens: 65536  # Model limit for GPT-5 (will be automatically capped)
+    temperature: 1.0  # Required for reasoning models
+    timeout: 900.0  # 15 minutes for reasoning chains
+    reasoning:
+      effort: "high"  # "minimal", "low", "medium", or "high"
+      summary: "auto"  # "auto", "concise", or "detailed"
+  release_date: "2025-08-07"
+  model_size: None  # NOTE(review): YAML reads this as the string "None", not null; the other model configs do the same - confirm the loader expects it
+  size_category: large
+
+# Override dataset settings from base_config.yaml for this model
+swebench:
+  limit: 80  # Full dataset evaluation
+  params:
+    artifacts_path: horangi/horangi4-dataset/swebench_verified_official_80:v4
+    dataset_dir: .
+  model_params:
+    max_tokens: 65536  # Model limit for GPT-5
+    timeout: 900.0  # 15 minutes for complex reasoning
+    reasoning:
+      effort: "high"  # Max reasoning effort for best results
+  evaluation:
+    params:
+      concurrency: 4  # Parallel processing (4 concurrent jobs; base_config.yaml sets 2)
+      timeout_sec: 2400  # 40 minutes per test for complex patches (base_config.yaml sets 1800)
@@ -1,11 +1,32 @@
 model:
-  name: litellm
+  name: openai_responses  # Using OpenAI Responses API for reasoning models
   params:
     model_name: gpt-5-mini-2025-08-07
-    provider: openai
-    batch_size: 32
-    max_completion_tokens: 1024
-    temperature: 1
+    api_base: https://api.openai.com/v1
+    batch_size: 16  # Lower for reasoning models
+    max_tokens: 32768  # Model limit for GPT-5-mini (will be automatically capped)
+    temperature: 1.0  # Required for reasoning models
+    timeout: 600.0  # 10 minutes for reasoning chains
+    reasoning:
+      effort: "high"  # "minimal", "low", "medium", or "high"
+      summary: "auto"  # "auto", "concise", or "detailed"
   release_date: "2025-08-07"
   model_size: None
-  size_category: mini
+  size_category: mini
+
+# Override dataset settings from base_config.yaml for this model
+# Testing with 1 sample (see limit below)
+swebench:
+  limit: 1  # Test with 1 sample to verify token tracking
+  params:
+    artifacts_path: horangi/horangi4-dataset/swebench_verified_official_80:v4
+    dataset_dir: .
+  model_params:
+    max_tokens: 32768  # Model limit for GPT-5-mini
+    timeout: 900.0  # 15 minutes for complex reasoning (model.params sets 600.0)
+    reasoning:
+      effort: "high"  # Max reasoning effort for best results
+  evaluation:
+    params:
+      concurrency: 1  # Sequential processing for reasoning models (base_config.yaml sets 2)
+      timeout_sec: 2400  # 40 minutes per test for complex patches (base_config.yaml sets 1800)
@@ -0,0 +1,19 @@
+model:
+  name: litellm
+  params:
+    model_name: hosted_vllm/Qwen/Qwen3-4B
+    provider: hosted_vllm
+    api_base: http://localhost:8000/v1  # Port matches vllm_params.port below
+    max_tokens: 3072  # NOTE(review): with max_model_len 4096 this presumably leaves about 1024 tokens for the prompt - confirm
+    temperature: 0.1
+    batch_size: 8  # Number of requests the client sends concurrently
+  vllm_params:
+    batch_size: 16  # Number of requests the vLLM server can process concurrently (set larger than the client's)
+    dtype: "auto"
+    download_dir: "/home/data_storage/huggingface"  # Absolute local path; adjust for your environment
+    max_model_len: 4096
+    num_gpus: 1
+    port: 8000
+    pretrained_model_name_or_path: "Qwen/Qwen3-4B"
+    tensor_parallel_size: 1
+    trust_remote_code: true
@@ -0,0 +1,8 @@
+model:
+  name: vllm  # No api_base or vllm_params block here, unlike the two client configs for the same model
+  params:
+    model_name: Qwen/Qwen3-4B
+    model_name_or_path: Qwen/Qwen3-4B  # Same value as model_name
+    max_tokens: 1024
+    temperature: 0.1
+    tensor_parallel_size: 1
@@ -0,0 +1,19 @@
+model:
+  name: openai
+  params:
+    provider: hosted_vllm
+    model_name: Qwen/Qwen3-4B  # No hosted_vllm/ prefix here, unlike the litellm client config
+    api_base: http://localhost:8000/v1  # Port matches vllm_params.port below
+    batch_size: 1  # NOTE(review): the litellm client config sets the server batch_size larger than the client's; here both are 1 - confirm intended
+    max_tokens: 1024
+    temperature: 0.1
+  vllm_params:
+    batch_size: 1
+    dtype: "auto"
+    download_dir: "/home/data_storage/huggingface"  # Absolute local path; adjust for your environment
+    max_model_len: 4096
+    num_gpus: 1
+    port: 8000
+    pretrained_model_name_or_path: "Qwen/Qwen3-4B"
+    tensor_parallel_size: 1
+    trust_remote_code: true
@@ -0,0 +1,52 @@
+# SWE-bench Evaluation Configuration Example
+#
+# This configuration file demonstrates how to set up SWE-bench evaluation
+# using the HRET framework.
+
+# Dataset Configuration
+dataset:
+  name: swebench
+  split: test
+  params:
+    # W&B artifact path containing the SWE-bench dataset
+    artifacts_path: "horangi/horangi4-dataset/swebench_verified_official_80:v4"
+    # Directory within the artifact (usually ".")
+    dataset_dir: "."
+    # Maximum number of samples to evaluate (useful for testing)
+    max_samples: 10  # NOTE(review): base_config.yaml sets limit/max_samples directly under swebench, not under params - confirm this key is read here
+
+# Model Configuration
+model:
+  name: litellm  # Can be: litellm, openai, huggingface, etc.
+  params:
+    model_name_or_path: "gpt-4o-2024-11-20"  # NOTE(review): the litellm client config in configs/vllm_test uses model_name + provider instead - confirm which key is read
+    temperature: 0.0
+    max_tokens: 16000
+
+# Evaluation Method Configuration
+evaluation:
+  method: swebench
+  params:  # Values below match swebench.evaluation.params in base_config.yaml
+    # API server endpoint for running tests
+    api_endpoint: "https://api.nejumi-swebench.org/"
+    # Optional API key (can also use SWE_API_KEY env var)
+    # api_key: "your-api-key-here"
+    # Docker image namespace
+    namespace: "swebench"
+    # Docker image tag
+    tag: "latest"
+    # Timeout for each test execution (seconds)
+    timeout_sec: 1800
+    # Number of parallel jobs to submit to the API server
+    concurrency: 2
+
+# W&B Configuration (optional)
+wandb:
+  params:
+    entity: "your-entity"  # Placeholder; replace with your W&B entity
+    project: "swebench-eval"
+  run_name: "swebench-gpt4o-test"
+
+# General Settings
+testmode: false
+inference_interval: 0.0  # Interval between LLM calls, in seconds