Skip to content

Commit da7c240

Browse files
committed
Merge branch 'horangi4-dev' into horangi-4-dev/korean_sat
2 parents eb0a20d + c02947d commit da7c240

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+4357
-1277
lines changed

configs/base_config.yaml

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,14 @@ wandb:
44
entity: horangi
55
project: horangi4-dev
66
project_dataset: horangi4-dataset
7+
run_name: swebench-o4mini-2
78

89
# testmode configurations
910
testmode: false
1011

12+
# Interval between LLM calls, in seconds. A default is provided so that a missing value does not cause an error.
13+
inference_interval: 0.0
14+
1115
mt_bench:
1216
split: test
1317
subset: ["roleplay", "humanities", "writing", "reasoning", "coding"]
@@ -23,18 +27,7 @@ mt_bench:
2327
temperature: 0.0
2428
# dataset configurations
2529
# SWE-bench Verified 설정
26-
swe_bench_verified:
27-
split: test
28-
subset: official_80
29-
limit: 80
30-
evaluation:
31-
method: "swebench_verified"
32-
# 원격 SWE-bench API 서버 설정 (있을 경우 실제 실행)
33-
server:
34-
url: "" # 예: https://swebench-api.my-domain.com
35-
token: "" # 필요 시 토큰
36-
poll_interval_sec: 10
37-
max_wait_sec: 36000
30+
3831

3932
komoral:
4033
split: test
@@ -203,9 +196,55 @@ bfcl:
203196
evaluation:
204197
model: gpt-4o-2024-11-20
205198

199+
# ================= SWE-bench (integrated) =================
200+
# SWE-bench Verified dataset for evaluating code patch generation
201+
swebench:
202+
# Dataset configuration
203+
split: test
204+
subset: default
205+
limit: 80 # Maximum number of samples to evaluate
206+
207+
# Dataset parameters (passed to Dataset constructor)
208+
params:
209+
artifacts_path: horangi/horangi4-dataset/swebench_verified_official_80:v4
210+
dataset_dir: .
211+
212+
# Model parameters override for this dataset
213+
# These will override the model config from <model_name>.yaml
214+
model_params:
215+
max_tokens: 32768 # Model limit (will be automatically capped per model)
216+
temperature: 0.0
217+
218+
# Evaluation method
219+
evaluation:
220+
method: swebench
221+
params:
222+
# API server for running tests in Docker
223+
api_endpoint: https://api.nejumi-swebench.org/
224+
# api_endpoint: http://localhost:8000 # For local development
225+
timeout_sec: 1800 # 30 minutes per test
226+
concurrency: 2 # Number of parallel jobs
227+
namespace: swebench # Docker image namespace
228+
tag: latest # Docker image tag
229+
230+
# Legacy settings (kept for backward compatibility with external/swe_bench)
231+
max_samples: 80
232+
max_workers: 2
233+
background_eval: false
234+
fc_enabled: true
235+
prebuild_images: false
236+
images:
237+
namespace: swebench
238+
tag: latest
239+
api_server:
240+
enabled: true
241+
endpoint: https://api.nejumi-swebench.org/
242+
timeout_sec: 1800
243+
concurrency: 2
244+
206245
# arc_agi2:
207246
# split: evaluation
208247
# subset: default
209248
# limit: 1000
210249
# evaluation:
211-
# method: "grid_match"
250+
# method: "grid_match"

configs/gpt-4o-2024-11-20.yaml

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,23 @@ model:
33
params:
44
model_name: gpt-4o-2024-11-20
55
provider: openai
6-
batch_size: 32
7-
max_tokens: 1024
6+
max_tokens: 16384
87
temperature: 0.0
8+
batch_size: 32
99
release_date: "2024-11-20"
1010
model_size: None
11-
size_category: large
11+
size_category: large
12+
13+
# SWE-bench test configuration to verify Weave automatic tracking
14+
swebench:
15+
limit: 5 # Test with 5 samples to verify Weave auto-tracking at scale
16+
params:
17+
artifacts_path: horangi/horangi4-dataset/swebench_verified_official_80:v4
18+
dataset_dir: .
19+
model_params:
20+
max_tokens: 16384 # Model limit for GPT-4o
21+
timeout: 300.0 # Longer timeout for SWE-bench
22+
evaluation:
23+
params:
24+
concurrency: 1
25+
timeout_sec: 1800

configs/gpt-5-2025-08-07.yaml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
model:
2+
name: openai_responses # Using OpenAI Responses API for reasoning models
3+
params:
4+
model_name: gpt-5-2025-08-07
5+
api_base: https://api.openai.com/v1
6+
batch_size: 8 # Lower for full reasoning model
7+
max_tokens: 65536 # Model limit for GPT-5 (will be automatically capped)
8+
temperature: 1.0 # Required for reasoning models
9+
timeout: 900.0 # 15 minutes for reasoning chains
10+
reasoning:
11+
effort: "high" # "low", "medium", or "high"
12+
summary: "auto" # "auto", "full", or "none"
13+
release_date: "2025-08-07"
14+
model_size: None
15+
size_category: large
16+
17+
# Override dataset settings from base_config.yaml for this model
18+
swebench:
19+
limit: 80 # Full dataset evaluation
20+
params:
21+
artifacts_path: horangi/horangi4-dataset/swebench_verified_official_80:v4
22+
dataset_dir: .
23+
model_params:
24+
max_tokens: 65536 # Model limit for GPT-5
25+
timeout: 900.0 # 15 minutes for complex reasoning
26+
reasoning:
27+
effort: "high" # Max reasoning effort for best results
28+
evaluation:
29+
params:
30+
concurrency: 4 # Parallel processing (4 concurrent jobs)
31+
timeout_sec: 2400 # 40 minutes per test for complex patches

configs/gpt-5-mini-2025-08-07.yaml

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,32 @@
11
model:
2-
name: litellm
2+
name: openai_responses # Using OpenAI Responses API for reasoning models
33
params:
44
model_name: gpt-5-mini-2025-08-07
5-
provider: openai
6-
batch_size: 32
7-
max_completion_tokens: 1024
8-
temperature: 1
5+
api_base: https://api.openai.com/v1
6+
batch_size: 16 # Lower for reasoning models
7+
max_tokens: 32768 # Model limit for GPT-5-mini (will be automatically capped)
8+
temperature: 1.0 # Required for reasoning models
9+
timeout: 600.0 # 10 minutes for reasoning chains
10+
reasoning:
11+
effort: "high" # "low", "medium", or "high"
12+
summary: "auto" # "auto", "full", or "none"
913
release_date: "2025-08-07"
1014
model_size: None
11-
size_category: mini
15+
size_category: mini
16+
17+
# Override dataset settings from base_config.yaml for this model
18+
# Testing with 1 sample
19+
swebench:
20+
limit: 1 # Test with 1 sample to verify token tracking
21+
params:
22+
artifacts_path: horangi/horangi4-dataset/swebench_verified_official_80:v4
23+
dataset_dir: .
24+
model_params:
25+
max_tokens: 32768 # Model limit for GPT-5-mini
26+
timeout: 900.0 # 15 minutes for complex reasoning
27+
reasoning:
28+
effort: "high" # Max reasoning effort for best results
29+
evaluation:
30+
params:
31+
concurrency: 1 # Sequential processing for reasoning models
32+
timeout_sec: 2400 # 40 minutes per test for complex patches
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
model:
2+
name: litellm
3+
params:
4+
model_name: hosted_vllm/Qwen/Qwen3-4B
5+
provider: hosted_vllm
6+
api_base: http://localhost:8000/v1
7+
max_tokens: 3072
8+
temperature: 0.1
9+
batch_size: 8 # Number of requests the client sends concurrently
10+
vllm_params:
11+
batch_size: 16 # Number of requests the vLLM server can process concurrently (set higher than the client's batch_size)
12+
dtype: "auto"
13+
download_dir: "/home/data_storage/huggingface"
14+
max_model_len: 4096
15+
num_gpus: 1
16+
port: 8000
17+
pretrained_model_name_or_path: "Qwen/Qwen3-4B"
18+
tensor_parallel_size: 1
19+
trust_remote_code: true
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
model:
2+
name: vllm
3+
params:
4+
model_name: Qwen/Qwen3-4B
5+
model_name_or_path: Qwen/Qwen3-4B
6+
max_tokens: 1024
7+
temperature: 0.1
8+
tensor_parallel_size: 1
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
model:
2+
name: openai
3+
params:
4+
provider: hosted_vllm
5+
model_name: Qwen/Qwen3-4B
6+
api_base: http://localhost:8000/v1
7+
batch_size: 1
8+
max_tokens: 1024
9+
temperature: 0.1
10+
vllm_params:
11+
batch_size: 1
12+
dtype: "auto"
13+
download_dir: "/home/data_storage/huggingface"
14+
max_model_len: 4096
15+
num_gpus: 1
16+
port: 8000
17+
pretrained_model_name_or_path: "Qwen/Qwen3-4B"
18+
tensor_parallel_size: 1
19+
trust_remote_code: true

examples/swebench_config.yaml

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# SWE-bench Evaluation Configuration Example
2+
#
3+
# This configuration file demonstrates how to set up SWE-bench evaluation
4+
# using the HRET framework.
5+
6+
# Dataset Configuration
7+
dataset:
8+
name: swebench
9+
split: test
10+
params:
11+
# W&B artifact path containing the SWE-bench dataset
12+
artifacts_path: "horangi/horangi4-dataset/swebench_verified_official_80:v4"
13+
# Directory within the artifact (usually ".")
14+
dataset_dir: "."
15+
# Maximum number of samples to evaluate (useful for testing)
16+
max_samples: 10
17+
18+
# Model Configuration
19+
model:
20+
name: litellm # Can be: litellm, openai, huggingface, etc.
21+
params:
22+
model_name_or_path: "gpt-4o-2024-11-20"
23+
temperature: 0.0
24+
max_tokens: 16000
25+
26+
# Evaluation Method Configuration
27+
evaluation:
28+
method: swebench
29+
params:
30+
# API server endpoint for running tests
31+
api_endpoint: "https://api.nejumi-swebench.org/"
32+
# Optional API key (can also use SWE_API_KEY env var)
33+
# api_key: "your-api-key-here"
34+
# Docker image namespace
35+
namespace: "swebench"
36+
# Docker image tag
37+
tag: "latest"
38+
# Timeout for each test execution (seconds)
39+
timeout_sec: 1800
40+
# Number of parallel jobs to submit to the API server
41+
concurrency: 2
42+
43+
# W&B Configuration (optional)
44+
wandb:
45+
params:
46+
entity: "your-entity"
47+
project: "swebench-eval"
48+
run_name: "swebench-gpt4o-test"
49+
50+
# General Settings
51+
testmode: false
52+
inference_interval: 0.0

0 commit comments

Comments
 (0)