44 entity : horangi
55 project : horangi4-dev
66 project_dataset : horangi4-dataset
7+ run_name : swebench-o4mini-2
78
89# testmode configurations
910testmode : false
1011
12+ # Interval between LLM calls, in seconds. An explicit default is set here so that a missing key does not cause an error.
13+ inference_interval : 0.0
14+
1115mt_bench :
1216 split : test
1317 subset : ["roleplay", "humanities", "writing", "reasoning", "coding"]
@@ -23,18 +27,7 @@ mt_bench:
2327 temperature : 0.0
2428# dataset configurations
2529# SWE-bench Verified settings moved to the integrated `swebench` section later in this file
26- swe_bench_verified :
27- split : test
28- subset : official_80
29- limit : 80
30- evaluation :
31- method : " swebench_verified"
32- # 원격 SWE-bench API 서버 설정 (있을 경우 실제 실행)
33- server :
34- url : " " # 예: https://swebench-api.my-domain.com
35- token : " " # 필요 시 토큰
36- poll_interval_sec : 10
37- max_wait_sec : 36000
30+
3831
3932komoral :
4033 split : test
@@ -203,9 +196,55 @@ bfcl:
203196 evaluation :
204197 model : gpt-4o-2024-11-20
205198
199+ # ================= SWE-bench (integrated) =================
200+ # SWE-bench Verified dataset for evaluating code patch generation
201+ swebench :
202+ # Dataset configuration
203+ split : test
204+ subset : default
205+ limit : 80 # Maximum number of samples to evaluate
206+
207+ # Dataset parameters (passed to Dataset constructor)
208+ params :
209+ artifacts_path : horangi/horangi4-dataset/swebench_verified_official_80:v4
210+ dataset_dir : .
211+
212+ # Model parameters override for this dataset
213+ # These will override the model config from <model_name>.yaml
214+ model_params :
215+ max_tokens : 32768 # Model limit (will be automatically capped per model)
216+ temperature : 0.0
217+
218+ # Evaluation method
219+ evaluation :
220+ method : swebench
221+ params :
222+ # API server for running tests in Docker
223+ api_endpoint : https://api.nejumi-swebench.org/
224+ # api_endpoint: http://localhost:8000 # For local development
225+ timeout_sec : 1800 # 30 minutes per test
226+ concurrency : 2 # Number of parallel jobs
227+ namespace : swebench # Docker image namespace
228+ tag : latest # Docker image tag
229+
230+ # Legacy settings (kept for backward compatibility with external/swe_bench); they repeat `limit` and `evaluation.params` above, so keep both sets in sync
231+ max_samples : 80
232+ max_workers : 2
233+ background_eval : false
234+ fc_enabled : true
235+ prebuild_images : false
236+ images :
237+ namespace : swebench
238+ tag : latest
239+ api_server :
240+ enabled : true
241+ endpoint : https://api.nejumi-swebench.org/
242+ timeout_sec : 1800
243+ concurrency : 2
244+
206245# arc_agi2:
207246# split: evaluation
208247# subset: default
209248# limit: 1000
210249# evaluation:
211- # method: "grid_match"
250+ # method: "grid_match"
0 commit comments