47
47
)
48
48
from swebench .harness .grading import get_eval_report
49
49
from swebench .harness .test_spec import make_test_spec , TestSpec
50
- from swebench .harness .utils import load_swebench_dataset , str2bool
50
+ from swebench .harness .utils import load_swebench_dataset , str2bool , EvaluationError
51
+ from swebench .harness .run_evaluation_modal import run_instances_modal
51
52
52
53
53
- class EvaluationError (Exception ):
54
- def __init__ (self , instance_id , message , logger ):
55
- super ().__init__ (message )
56
- self .super_str = super ().__str__ ()
57
- self .instance_id = instance_id
58
- self .log_file = logger .log_file
59
- self .logger = logger
60
-
61
- def __str__ (self ):
62
- return (
63
- f"Evaluation error for { self .instance_id } : { self .super_str } \n "
64
- f"Check ({ self .log_file } ) for more information."
65
- )
66
-
67
54
68
55
def run_instance (
69
56
test_spec : TestSpec ,
@@ -229,7 +216,6 @@ def run_instance(
229
216
close_logger (logger )
230
217
return
231
218
232
-
233
219
def run_instances (
234
220
predictions : dict ,
235
221
instances : list ,
@@ -299,7 +285,6 @@ def run_instances(
299
285
continue
300
286
print ("All instances run." )
301
287
302
-
303
288
def get_dataset_from_preds (
304
289
dataset_name : str ,
305
290
split : str ,
@@ -495,6 +480,31 @@ def get_gold_predictions(dataset_name: str, split: str):
495
480
} for datum in dataset
496
481
]
497
482
483
def get_predictions_from_file(predictions_path: str, dataset_name: str, split: str):
    """
    Load predictions either from the gold patches or from a file on disk.

    Args:
        predictions_path (str): The literal string "gold", or a path ending in
            ".json" or ".jsonl".
        dataset_name (str): Forwarded to get_gold_predictions when
            predictions_path is "gold"; unused otherwise.
        split (str): Forwarded to get_gold_predictions when predictions_path
            is "gold"; unused otherwise.
    Returns:
        For "gold", whatever get_gold_predictions returns. For a ".json" file,
        the parsed JSON document. For a ".jsonl" file, a list with one parsed
        object per non-blank line.
    Raises:
        ValueError: If predictions_path is not "gold" and does not end in
            ".json" or ".jsonl".
    """
    if predictions_path == "gold":
        print("Using gold predictions - ignoring predictions_path")
        return get_gold_predictions(dataset_name, split)
    if predictions_path.endswith(".json"):
        # Explicit UTF-8 so parsing does not depend on the platform's locale encoding.
        with open(predictions_path, "r", encoding="utf-8") as f:
            return json.load(f)
    if predictions_path.endswith(".jsonl"):
        with open(predictions_path, "r", encoding="utf-8") as f:
            # Skip blank lines (e.g. a trailing newline at end of file), which
            # json.loads would otherwise reject with a JSONDecodeError.
            return [json.loads(line) for line in f if line.strip()]
    raise ValueError("Predictions path must be .json or .jsonl")
495
+
496
def validate_modal_credentials():
    """
    Validate that Modal credentials are configured.

    Credentials are considered configured when either:
      * both the MODAL_TOKEN_ID and MODAL_TOKEN_SECRET environment variables
        are set to non-empty values, or
      * the ~/.modal.toml file exists.

    Raises:
        RuntimeError: If neither source of credentials is present.
    """
    # Local import so this function does not rely on a module-level `os` import.
    import os

    # Modal accepts token credentials from the environment (typical for CI and
    # containers), in which case no ~/.modal.toml is ever written.
    if os.environ.get("MODAL_TOKEN_ID") and os.environ.get("MODAL_TOKEN_SECRET"):
        return

    modal_config_path = Path.home() / ".modal.toml"
    if not modal_config_path.exists():
        raise RuntimeError(
            "~/.modal.toml not found - it looks like you haven't configured credentials for Modal.\n "
            "Run 'modal token new' in your terminal to configure credentials."
        )
507
+
498
508
499
509
def main (
500
510
dataset_name : str ,
@@ -508,34 +518,36 @@ def main(
508
518
open_file_limit : int ,
509
519
run_id : str ,
510
520
timeout : int ,
521
+ modal : bool ,
511
522
):
512
523
"""
513
524
Run evaluation harness for the given dataset and predictions.
514
525
"""
515
526
# set open file limit
516
527
assert len (run_id ) > 0 , "Run ID must be provided"
517
- if platform .system () == 'Linux' :
518
- resource .setrlimit (resource .RLIMIT_NOFILE , (open_file_limit , open_file_limit ))
519
- client = docker .from_env ()
520
528
521
529
# load predictions as map of instance_id to prediction
522
- if predictions_path == 'gold' :
523
- print ("Using gold predictions - ignoring predictions_path" )
524
- predictions = get_gold_predictions (dataset_name , split )
525
- else :
526
- if predictions_path .endswith (".json" ):
527
- with open (predictions_path , "r" ) as f :
528
- predictions = json .load (f )
529
- elif predictions_path .endswith (".jsonl" ):
530
- with open (predictions_path , "r" ) as f :
531
- predictions = [json .loads (line ) for line in f ]
532
- else :
533
- raise ValueError ("Predictions path must be \" gold\" , .json, or .jsonl" )
530
+ predictions = get_predictions_from_file (predictions_path , dataset_name , split )
534
531
predictions = {pred [KEY_INSTANCE_ID ]: pred for pred in predictions }
535
532
536
533
# get dataset from predictions
537
534
dataset = get_dataset_from_preds (dataset_name , split , instance_ids , predictions , run_id )
538
535
full_dataset = load_swebench_dataset (dataset_name , split , instance_ids )
536
+
537
+ if modal :
538
+ # run instances on Modal
539
+ if not dataset :
540
+ print ("No instances to run." )
541
+ else :
542
+ validate_modal_credentials ()
543
+ run_instances_modal (predictions , dataset , full_dataset , run_id , timeout )
544
+ return
545
+
546
+ # run instances locally
547
+ if platform .system () == 'Linux' :
548
+ resource .setrlimit (resource .RLIMIT_NOFILE , (open_file_limit , open_file_limit ))
549
+ client = docker .from_env ()
550
+
539
551
existing_images = list_images (client )
540
552
print (f"Running { len (dataset )} unevaluated instances..." )
541
553
if not dataset :
@@ -549,18 +561,21 @@ def main(
549
561
clean_images (client , existing_images , cache_level , clean )
550
562
make_run_report (predictions , full_dataset , client , run_id )
551
563
552
-
553
564
if __name__ == "__main__" :
554
565
parser = ArgumentParser ()
566
+
567
+ # Common args
555
568
parser .add_argument ("--dataset_name" , default = "princeton-nlp/SWE-bench_Lite" , type = str , help = "Name of dataset or path to JSON file." )
556
569
parser .add_argument ("--split" , type = str , default = "test" , help = "Split of the dataset" )
557
570
parser .add_argument ("--instance_ids" , nargs = "+" , type = str , help = "Instance IDs to run (space separated)" )
558
571
parser .add_argument ("--predictions_path" , type = str , help = "Path to predictions file - if 'gold', uses gold predictions" , required = True )
572
+
573
+ # Local execution args
559
574
parser .add_argument ("--max_workers" , type = int , default = 4 , help = "Maximum number of workers (should be <= 75%% of CPU cores)" )
560
575
parser .add_argument ("--open_file_limit" , type = int , default = 4096 , help = "Open file limit" )
561
576
parser .add_argument (
562
577
"--timeout" , type = int , default = 1_800 , help = "Timeout (in seconds) for running tests for each instance"
563
- )
578
+ )
564
579
parser .add_argument (
565
580
"--force_rebuild" , type = str2bool , default = False , help = "Force rebuild of all images"
566
581
)
@@ -577,6 +592,9 @@ def main(
577
592
"--clean" , type = str2bool , default = False , help = "Clean images above cache level"
578
593
)
579
594
parser .add_argument ("--run_id" , type = str , required = True , help = "Run ID - identifies the run" )
580
- args = parser .parse_args ()
581
595
596
+ # Modal execution args
597
+ parser .add_argument ("--modal" , action = "store_true" , default = False , help = "Run on Modal" )
598
+
599
+ args = parser .parse_args ()
582
600
main (** vars (args ))
0 commit comments