SWE-bench
diff --git a/setup.py b/setup.py
Lines changed: 2 additions & 0 deletions
diff --git a/swebench/harness/docker_build.py b/swebench/harness/docker_build.py
Lines changed: 10 additions & 1 deletion
diff --git a/swebench/harness/run_evaluation.py b/swebench/harness/run_evaluation.py
Lines changed: 53 additions & 35 deletions
@@ -39,6 +39,8 @@
         'requests',
         'rich',
         'unidiff',
+        'tenacity',
+        'modal',
         'tqdm',
     ],
     extras_require={
 
@@ -46,10 +46,13 @@ def __str__(self):
         )
 
 
-def setup_logger(instance_id: str, log_file: Path, mode="w"):
+def setup_logger(instance_id: str, log_file: Path, mode="w", add_stdout: bool = False):
     """
     This logger is used for logging the build process of images and containers.
     It writes logs to the log file.
+
+    If `add_stdout` is True, logs will also be sent to stdout, which can be used for
+    streaming ephemeral output from Modal containers.
     """
     log_file.parent.mkdir(parents=True, exist_ok=True)
     logger = logging.getLogger(f"{instance_id}.{log_file.name}")
@@ -60,6 +63,12 @@ def setup_logger(instance_id: str, log_file: Path, mode="w"):
     logger.setLevel(logging.INFO)
     logger.propagate = False
     setattr(logger, "log_file", log_file)
+    if add_stdout:
+        import sys
+        handler = logging.StreamHandler(sys.stdout)
+        formatter = logging.Formatter(f"%(asctime)s - {instance_id} - %(levelname)s - %(message)s")
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
     return logger
 
 
 
@@ -47,23 +47,10 @@
 )
 from swebench.harness.grading import get_eval_report
 from swebench.harness.test_spec import make_test_spec, TestSpec
-from swebench.harness.utils import load_swebench_dataset, str2bool
+from swebench.harness.utils import load_swebench_dataset, str2bool, EvaluationError
+from swebench.harness.run_evaluation_modal import run_instances_modal
 
 
-class EvaluationError(Exception):
-    def __init__(self, instance_id, message, logger):
-        super().__init__(message)
-        self.super_str = super().__str__()
-        self.instance_id = instance_id
-        self.log_file = logger.log_file
-        self.logger = logger
-
-    def __str__(self):
-        return (
-            f"Evaluation error for {self.instance_id}: {self.super_str}\n"
-            f"Check ({self.log_file}) for more information."
-        )
-
 
 def run_instance(
         test_spec: TestSpec,
@@ -229,7 +216,6 @@ def run_instance(
         close_logger(logger)
     return
 
-
 def run_instances(
         predictions: dict,
         instances: list,
@@ -299,7 +285,6 @@ def run_instances(
                     continue
     print("All instances run.")
 
-
 def get_dataset_from_preds(
         dataset_name: str,
         split: str,
@@ -495,6 +480,31 @@ def get_gold_predictions(dataset_name: str, split: str):
         } for datum in dataset
     ]
 
+def get_predictions_from_file(predictions_path: str, dataset_name: str, split: str):
+    """
+    Load the predictions to evaluate.
+
+    Args:
+        predictions_path: Either the literal string "gold" (use the gold
+            predictions for the dataset) or a path ending in .json / .jsonl.
+        dataset_name: Passed to `get_gold_predictions` when `predictions_path`
+            is "gold"; unused otherwise.
+        split: Passed to `get_gold_predictions` when `predictions_path` is
+            "gold"; unused otherwise.
+
+    Returns:
+        The gold predictions, the parsed content of the .json file, or a list
+        with one parsed object per non-blank line of the .jsonl file.
+
+    Raises:
+        ValueError: If `predictions_path` is neither "gold" nor a .json/.jsonl path.
+    """
+    if predictions_path == "gold":
+        print("Using gold predictions - ignoring predictions_path")
+        return get_gold_predictions(dataset_name, split)
+    if predictions_path.endswith(".json"):
+        # JSON is UTF-8 by specification; do not depend on the platform default.
+        with open(predictions_path, "r", encoding="utf-8") as f:
+            return json.load(f)
+    if predictions_path.endswith(".jsonl"):
+        with open(predictions_path, "r", encoding="utf-8") as f:
+            # Skip blank lines (e.g. a trailing newline at end of file), which
+            # would otherwise make json.loads raise JSONDecodeError.
+            return [json.loads(line) for line in f if line.strip()]
+    raise ValueError("Predictions path must be .json or .jsonl")
+
+def validate_modal_credentials():
+    """
+    Validate that Modal credentials are configured.
+
+    Credentials are considered configured when either:
+      * both the MODAL_TOKEN_ID and MODAL_TOKEN_SECRET environment variables
+        are set (the usual setup in CI, where no config file exists), or
+      * the Modal config file exists. This is ~/.modal.toml unless the
+        MODAL_CONFIG_PATH environment variable points somewhere else.
+
+    Raises:
+        RuntimeError: If no credentials are found.
+    """
+    import os
+
+    if os.environ.get("MODAL_TOKEN_ID") and os.environ.get("MODAL_TOKEN_SECRET"):
+        return
+    modal_config_path = Path(
+        os.environ.get("MODAL_CONFIG_PATH") or Path.home() / ".modal.toml"
+    )
+    if not modal_config_path.exists():
+        raise RuntimeError(
+            "~/.modal.toml not found - it looks like you haven't configured credentials for Modal.\n"
+            "Run 'modal token new' in your terminal to configure credentials."
+        )
+
 
 def main(
         dataset_name: str,
@@ -508,34 +518,36 @@ def main(
         open_file_limit: int,
         run_id: str,
         timeout: int,
+        modal: bool,
     ):
     """
     Run evaluation harness for the given dataset and predictions.
     """
     # set open file limit
     assert len(run_id) > 0, "Run ID must be provided"
-    if platform.system() == 'Linux':
-        resource.setrlimit(resource.RLIMIT_NOFILE, (open_file_limit, open_file_limit))
-    client = docker.from_env()
 
     # load predictions as map of instance_id to prediction
-    if predictions_path == 'gold':
-        print("Using gold predictions - ignoring predictions_path")
-        predictions = get_gold_predictions(dataset_name, split)
-    else:
-        if predictions_path.endswith(".json"):
-            with open(predictions_path, "r") as f:
-                predictions = json.load(f)
-        elif predictions_path.endswith(".jsonl"):
-            with open(predictions_path, "r") as f:
-                predictions = [json.loads(line) for line in f]
-        else:
-            raise ValueError("Predictions path must be \"gold\", .json, or .jsonl")
+    predictions = get_predictions_from_file(predictions_path, dataset_name, split)
     predictions = {pred[KEY_INSTANCE_ID]: pred for pred in predictions}
 
     # get dataset from predictions
     dataset = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id)
     full_dataset = load_swebench_dataset(dataset_name, split, instance_ids)
+
+    if modal:
+        # run instances on Modal
+        if not dataset:
+            print("No instances to run.")
+        else:
+            validate_modal_credentials()
+            run_instances_modal(predictions, dataset, full_dataset, run_id, timeout)
+        return
+
+    # run instances locally
+    if platform.system() == 'Linux':
+        resource.setrlimit(resource.RLIMIT_NOFILE, (open_file_limit, open_file_limit))
+    client = docker.from_env()
+
     existing_images = list_images(client)
     print(f"Running {len(dataset)} unevaluated instances...")
     if not dataset:
@@ -549,18 +561,21 @@ def main(
     clean_images(client, existing_images, cache_level, clean)
     make_run_report(predictions, full_dataset, client, run_id)
 
-
 if __name__ == "__main__":
     parser = ArgumentParser()
+
+    # Common args
     parser.add_argument("--dataset_name", default="princeton-nlp/SWE-bench_Lite", type=str, help="Name of dataset or path to JSON file.")
     parser.add_argument("--split", type=str, default="test", help="Split of the dataset")
     parser.add_argument("--instance_ids", nargs="+", type=str, help="Instance IDs to run (space separated)")
     parser.add_argument("--predictions_path", type=str, help="Path to predictions file - if 'gold', uses gold predictions", required=True)
+
+    # Local execution args
     parser.add_argument("--max_workers", type=int, default=4, help="Maximum number of workers (should be <= 75%% of CPU cores)")
     parser.add_argument("--open_file_limit", type=int, default=4096, help="Open file limit")
     parser.add_argument(
         "--timeout", type=int, default=1_800, help="Timeout (in seconds) for running tests for each instance"
-        )
+    )
     parser.add_argument(
         "--force_rebuild", type=str2bool, default=False, help="Force rebuild of all images"
     )
@@ -577,6 +592,9 @@ def main(
         "--clean", type=str2bool, default=False, help="Clean images above cache level"
     )
     parser.add_argument("--run_id", type=str, required=True, help="Run ID - identifies the run")
-    args = parser.parse_args()
 
+    # Modal execution args
+    parser.add_argument("--modal", action="store_true", default=False, help="Run on Modal")
+
+    args = parser.parse_args()
     main(**vars(args))