Skip to content

Commit 25f5620

Browse files
Merge pull request #237 from azliu0/main
Add support for Modal
2 parents 5f5a7df + 75c7faf commit 25f5620

File tree

6 files changed

+736
-36
lines changed

6 files changed

+736
-36
lines changed

setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939
'requests',
4040
'rich',
4141
'unidiff',
42+
'tenacity',
43+
'modal',
4244
'tqdm',
4345
],
4446
extras_require={

swebench/harness/docker_build.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,13 @@ def __str__(self):
4646
)
4747

4848

49-
def setup_logger(instance_id: str, log_file: Path, mode="w"):
49+
def setup_logger(instance_id: str, log_file: Path, mode="w", add_stdout: bool = False):
5050
"""
5151
This logger is used for logging the build process of images and containers.
5252
It writes logs to the log file.
53+
54+
If `add_stdout` is True, logs will also be sent to stdout, which can be used for
55+
streaming ephemeral output from Modal containers.
5356
"""
5457
log_file.parent.mkdir(parents=True, exist_ok=True)
5558
logger = logging.getLogger(f"{instance_id}.{log_file.name}")
@@ -60,6 +63,12 @@ def setup_logger(instance_id: str, log_file: Path, mode="w"):
6063
logger.setLevel(logging.INFO)
6164
logger.propagate = False
6265
setattr(logger, "log_file", log_file)
66+
if add_stdout:
67+
import sys
68+
handler = logging.StreamHandler(sys.stdout)
69+
formatter = logging.Formatter(f"%(asctime)s - {instance_id} - %(levelname)s - %(message)s")
70+
handler.setFormatter(formatter)
71+
logger.addHandler(handler)
6372
return logger
6473

6574

swebench/harness/run_evaluation.py

Lines changed: 53 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -47,23 +47,10 @@
4747
)
4848
from swebench.harness.grading import get_eval_report
4949
from swebench.harness.test_spec import make_test_spec, TestSpec
50-
from swebench.harness.utils import load_swebench_dataset, str2bool
50+
from swebench.harness.utils import load_swebench_dataset, str2bool, EvaluationError
51+
from swebench.harness.run_evaluation_modal import run_instances_modal
5152

5253

53-
class EvaluationError(Exception):
54-
def __init__(self, instance_id, message, logger):
55-
super().__init__(message)
56-
self.super_str = super().__str__()
57-
self.instance_id = instance_id
58-
self.log_file = logger.log_file
59-
self.logger = logger
60-
61-
def __str__(self):
62-
return (
63-
f"Evaluation error for {self.instance_id}: {self.super_str}\n"
64-
f"Check ({self.log_file}) for more information."
65-
)
66-
6754

6855
def run_instance(
6956
test_spec: TestSpec,
@@ -229,7 +216,6 @@ def run_instance(
229216
close_logger(logger)
230217
return
231218

232-
233219
def run_instances(
234220
predictions: dict,
235221
instances: list,
@@ -299,7 +285,6 @@ def run_instances(
299285
continue
300286
print("All instances run.")
301287

302-
303288
def get_dataset_from_preds(
304289
dataset_name: str,
305290
split: str,
@@ -495,6 +480,31 @@ def get_gold_predictions(dataset_name: str, split: str):
495480
} for datum in dataset
496481
]
497482

483+
def get_predictions_from_file(predictions_path: str, dataset_name: str, split: str):
484+
if predictions_path == "gold":
485+
print("Using gold predictions - ignoring predictions_path")
486+
return get_gold_predictions(dataset_name, split)
487+
if predictions_path.endswith(".json"):
488+
with open(predictions_path, "r") as f:
489+
return json.load(f)
490+
elif predictions_path.endswith(".jsonl"):
491+
with open(predictions_path, "r") as f:
492+
return [json.loads(line) for line in f]
493+
else:
494+
raise ValueError("Predictions path must be .json or .jsonl")
495+
496+
def validate_modal_credentials():
497+
"""
498+
Validate that Modal credentials exist by checking for ~/.modal.toml file.
499+
Raises an exception if credentials are not configured.
500+
"""
501+
modal_config_path = Path.home() / ".modal.toml"
502+
if not modal_config_path.exists():
503+
raise RuntimeError(
504+
"~/.modal.toml not found - it looks like you haven't configured credentials for Modal.\n"
505+
"Run 'modal token new' in your terminal to configure credentials."
506+
)
507+
498508

499509
def main(
500510
dataset_name: str,
@@ -508,34 +518,36 @@ def main(
508518
open_file_limit: int,
509519
run_id: str,
510520
timeout: int,
521+
modal: bool,
511522
):
512523
"""
513524
Run evaluation harness for the given dataset and predictions.
514525
"""
515526
# set open file limit
516527
assert len(run_id) > 0, "Run ID must be provided"
517-
if platform.system() == 'Linux':
518-
resource.setrlimit(resource.RLIMIT_NOFILE, (open_file_limit, open_file_limit))
519-
client = docker.from_env()
520528

521529
# load predictions as map of instance_id to prediction
522-
if predictions_path == 'gold':
523-
print("Using gold predictions - ignoring predictions_path")
524-
predictions = get_gold_predictions(dataset_name, split)
525-
else:
526-
if predictions_path.endswith(".json"):
527-
with open(predictions_path, "r") as f:
528-
predictions = json.load(f)
529-
elif predictions_path.endswith(".jsonl"):
530-
with open(predictions_path, "r") as f:
531-
predictions = [json.loads(line) for line in f]
532-
else:
533-
raise ValueError("Predictions path must be \"gold\", .json, or .jsonl")
530+
predictions = get_predictions_from_file(predictions_path, dataset_name, split)
534531
predictions = {pred[KEY_INSTANCE_ID]: pred for pred in predictions}
535532

536533
# get dataset from predictions
537534
dataset = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id)
538535
full_dataset = load_swebench_dataset(dataset_name, split, instance_ids)
536+
537+
if modal:
538+
# run instances on Modal
539+
if not dataset:
540+
print("No instances to run.")
541+
else:
542+
validate_modal_credentials()
543+
run_instances_modal(predictions, dataset, full_dataset, run_id, timeout)
544+
return
545+
546+
# run instances locally
547+
if platform.system() == 'Linux':
548+
resource.setrlimit(resource.RLIMIT_NOFILE, (open_file_limit, open_file_limit))
549+
client = docker.from_env()
550+
539551
existing_images = list_images(client)
540552
print(f"Running {len(dataset)} unevaluated instances...")
541553
if not dataset:
@@ -549,18 +561,21 @@ def main(
549561
clean_images(client, existing_images, cache_level, clean)
550562
make_run_report(predictions, full_dataset, client, run_id)
551563

552-
553564
if __name__ == "__main__":
554565
parser = ArgumentParser()
566+
567+
# Common args
555568
parser.add_argument("--dataset_name", default="princeton-nlp/SWE-bench_Lite", type=str, help="Name of dataset or path to JSON file.")
556569
parser.add_argument("--split", type=str, default="test", help="Split of the dataset")
557570
parser.add_argument("--instance_ids", nargs="+", type=str, help="Instance IDs to run (space separated)")
558571
parser.add_argument("--predictions_path", type=str, help="Path to predictions file - if 'gold', uses gold predictions", required=True)
572+
573+
# Local execution args
559574
parser.add_argument("--max_workers", type=int, default=4, help="Maximum number of workers (should be <= 75%% of CPU cores)")
560575
parser.add_argument("--open_file_limit", type=int, default=4096, help="Open file limit")
561576
parser.add_argument(
562577
"--timeout", type=int, default=1_800, help="Timeout (in seconds) for running tests for each instance"
563-
)
578+
)
564579
parser.add_argument(
565580
"--force_rebuild", type=str2bool, default=False, help="Force rebuild of all images"
566581
)
@@ -577,6 +592,9 @@ def main(
577592
"--clean", type=str2bool, default=False, help="Clean images above cache level"
578593
)
579594
parser.add_argument("--run_id", type=str, required=True, help="Run ID - identifies the run")
580-
args = parser.parse_args()
581595

596+
# Modal execution args
597+
parser.add_argument("--modal", action="store_true", default=False, help="Run on Modal")
598+
599+
args = parser.parse_args()
582600
main(**vars(args))

0 commit comments

Comments (0)