
Move make_run_report to swebench/harness/reporting for run_evaluation and run_evaluation_modal
carlosejimenez committed Jan 17, 2025
1 parent a941896 commit 265f9ff
Showing 5 changed files with 145 additions and 229 deletions.
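For orientation, a minimal sketch of what the move means for callers, assuming only the import path changes; the path below is the one added in this diff:

# run_evaluation_modal.py previously carried its own copy of make_run_report;
# after this commit both run_evaluation and run_evaluation_modal import the
# shared implementation from the new module.
from swebench.harness.reporting import make_run_report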
2 changes: 1 addition & 1 deletion swebench/harness/grading.py
@@ -204,7 +204,7 @@ def get_resolution_status(report: dict[str, dict[str, Any]]) -> str:
        return ResolvedStatus.PARTIAL.value
    else:
        return ResolvedStatus.NO.value


def get_eval_report(
    test_spec: TestSpec,
104 changes: 4 additions & 100 deletions swebench/harness/modal_eval/run_evaluation_modal.py
@@ -13,8 +13,8 @@

from dataclasses import dataclass
from pathlib import Path
from swebench.harness.constants import KEY_INSTANCE_ID
from swebench.harness.docker_build import setup_logger
from swebench.harness.reporting import make_run_report
from swebench.harness.utils import EvaluationError
from typing import cast

@@ -195,110 +195,13 @@ def get_instance_image(test_spec: TestSpec) -> modal.Image:
        )
        .workdir("/testbed/")
    )

def make_run_report(
    predictions: dict,
    full_dataset: list,
    run_id: str
) -> Path:
    """
    Make a final evaluation and run report of the instances that have been run.

    Args:
        predictions (dict): Predictions dict generated by the model
        full_dataset (list): List of all instances
        run_id (str): Run ID
    Returns:
        Path to report file
    """
    # Sets to store IDs of different outcomes
    completed_ids = set()
    resolved_ids = set()
    error_ids = set()
    unresolved_ids = set()
    incomplete_ids = set()
    empty_patch_ids = set()

    for instance in full_dataset:
        instance_id = instance[KEY_INSTANCE_ID]

        # Instances that were not submitted
        if instance_id not in predictions:
            incomplete_ids.add(instance_id)
            continue

        # Instances with empty patches
        prediction = predictions[instance_id]
        if prediction.get("model_patch", None) in ["", None]:
            empty_patch_ids.add(instance_id)
            continue

        # Instances that errored
        log_dir = get_log_dir(predictions[instance_id], run_id, instance_id)
        report_file = log_dir / "report.json"
        if not report_file.exists():
            error_ids.add(instance_id)
            continue

        # Instance completed successfully
        completed_ids.add(instance_id)
        try:
            report = json.loads(report_file.read_text())
            if report[instance_id]["resolved"]:
                resolved_ids.add(instance_id)
            else:
                unresolved_ids.add(instance_id)
        except Exception as e:
            print(f"{instance_id}: error loading report.json: {e}")
            error_ids.add(instance_id)

    # Print final report
    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
    print(f"Total instances: {len(full_dataset)}")
    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
    print(f"Instances completed: {len(completed_ids)}")
    print(f"Instances incomplete: {len(incomplete_ids)}")
    print(f"Instances resolved: {len(resolved_ids)}")
    print(f"Instances unresolved: {len(unresolved_ids)}")
    print(f"Instances with empty patches: {len(empty_patch_ids)}")
    print(f"Instances with errors: {len(error_ids)}")

    # Write report to file
    report = {
        "total_instances": len(full_dataset),
        "submitted_instances": len(predictions),
        "completed_instances": len(completed_ids),
        "resolved_instances": len(resolved_ids),
        "unresolved_instances": len(unresolved_ids),
        "empty_patch_instances": len(empty_patch_ids),
        "error_instances": len(error_ids),
        "completed_ids": list(sorted(completed_ids)),
        "incomplete_ids": list(sorted(incomplete_ids)),
        "empty_patch_ids": list(sorted(empty_patch_ids)),
        "submitted_ids": list(sorted(predictions.keys())),
        "resolved_ids": list(sorted(resolved_ids)),
        "unresolved_ids": list(sorted(unresolved_ids)),
        "error_ids": list(sorted(error_ids)),
        "schema_version": 2,
    }

    report_file = Path(
        list(predictions.values())[0]["model_name_or_path"].replace("/", "__")
        + f".{run_id}"
        + ".json"
    )

    with open(report_file, "w") as f:
        print(json.dumps(report, indent=4), file=f)

    print(f"Report written to {report_file}")
    return report_file

def get_log_dir(pred: dict, run_id: str, instance_id: str) -> Path:
    model_name_or_path = cast(str, pred.get("model_name_or_path", "None").replace("/", "__"))
    return RUN_EVALUATION_LOG_DIR / run_id / model_name_or_path / instance_id


@app.function(
    image=swebench_image.add_local_file(
        LOCAL_SANDBOX_ENTRYPOINT_PATH,
@@ -415,7 +318,7 @@ def run_instance_modal(
        report = get_eval_report(
            test_spec=test_spec,
            prediction=pred,
            log_path=test_output_path,
            test_log_path=test_output_path,
            include_tests_status=True,
        )
        logger.info(
@@ -465,6 +368,7 @@ def run_instance_modal(
            errored=True,
        )


def run_instances_modal(
    predictions: dict,
    instances: list,
137 changes: 137 additions & 0 deletions swebench/harness/reporting.py
@@ -0,0 +1,137 @@
import docker
import json
from pathlib import Path
from typing import Optional

from swebench.harness.constants import (
    KEY_INSTANCE_ID,
    KEY_MODEL,
    KEY_PREDICTION,
    RUN_EVALUATION_LOG_DIR,
    LOG_REPORT,
)
from swebench.harness.docker_utils import list_images
from swebench.harness.test_spec.test_spec import make_test_spec


def make_run_report(
    predictions: dict,
    full_dataset: list,
    run_id: str,
    client: Optional[docker.DockerClient] = None,
) -> Path:
    """
    Make a final evaluation and run report of the instances that have been run.
    Also reports on images and containers that may still be running if a client is provided.
    Args:
        predictions (dict): Predictions dict generated by the model
        full_dataset (list): List of all instances
        run_id (str): Run ID
        client (docker.DockerClient): Docker client (optional)
    Returns:
        Path to report file
    """
    # instantiate sets to store IDs of different outcomes
    completed_ids = set()
    resolved_ids = set()
    error_ids = set()
    unstopped_containers = set()
    unremoved_images = set()
    unresolved_ids = set()
    incomplete_ids = set()
    # get instances with empty patches
    empty_patch_ids = set()

    # iterate through dataset and check if the instance has been run
    for instance in full_dataset:
        instance_id = instance[KEY_INSTANCE_ID]
        if instance_id not in predictions:
            # skip instances without predictions
            incomplete_ids.add(instance_id)
            continue
        prediction = predictions[instance_id]
        if prediction.get(KEY_PREDICTION, None) in ["", None]:
            empty_patch_ids.add(instance_id)
            continue
        report_file = (
            RUN_EVALUATION_LOG_DIR
            / run_id
            / prediction[KEY_MODEL].replace("/", "__")
            / prediction[KEY_INSTANCE_ID]
            / LOG_REPORT
        )
        if report_file.exists():
            # If report file exists, then the instance has been run
            completed_ids.add(instance_id)
            report = json.loads(report_file.read_text())
            if report[instance_id]["resolved"]:
                # Record if the instance was resolved
                resolved_ids.add(instance_id)
            else:
                unresolved_ids.add(instance_id)
        else:
            # Otherwise, the instance was not run successfully
            error_ids.add(instance_id)

    if client:
        # get remaining images and containers
        images = list_images(client)
        test_specs = list(map(make_test_spec, full_dataset))
        for spec in test_specs:
            image_name = spec.instance_image_key
            if image_name in images:
                unremoved_images.add(image_name)
        containers = client.containers.list(all=True)
        for container in containers:
            if run_id in container.name:
                unstopped_containers.add(container.name)

    # print final report
    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
    print(f"Total instances: {len(full_dataset)}")
    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
    print(f"Instances completed: {len(completed_ids)}")
    print(f"Instances incomplete: {len(incomplete_ids)}")
    print(f"Instances resolved: {len(resolved_ids)}")
    print(f"Instances unresolved: {len(unresolved_ids)}")
    print(f"Instances with empty patches: {len(empty_patch_ids)}")
    print(f"Instances with errors: {len(error_ids)}")
    if client:
        print(f"Unstopped containers: {len(unstopped_containers)}")
        print(f"Unremoved images: {len(unremoved_images)}")

    # write report to file
    report = {
        "total_instances": len(full_dataset),
        "submitted_instances": len(predictions),
        "completed_instances": len(completed_ids),
        "resolved_instances": len(resolved_ids),
        "unresolved_instances": len(unresolved_ids),
        "empty_patch_instances": len(empty_patch_ids),
        "error_instances": len(error_ids),
        "completed_ids": list(sorted(completed_ids)),
        "incomplete_ids": list(sorted(incomplete_ids)),
        "empty_patch_ids": list(sorted(empty_patch_ids)),
        "submitted_ids": list(sorted(predictions.keys())),
        "resolved_ids": list(sorted(resolved_ids)),
        "unresolved_ids": list(sorted(unresolved_ids)),
        "error_ids": list(sorted(error_ids)),
        "schema_version": 2,
    }
    if client:
        report.update({
            "unstopped_instances": len(unstopped_containers),
            "unstopped_containers": list(sorted(unstopped_containers)),
            "unremoved_images": list(sorted(unremoved_images)),
        })
    report_file = Path(
        list(predictions.values())[0][KEY_MODEL].replace("/", "__")
        + f".{run_id}"
        + ".json"
    )
    with open(report_file, "w") as f:
        print(json.dumps(report, indent=4), file=f)
    print(f"Report written to {report_file}")
    return report_file
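
For reference, a hedged usage sketch of the relocated helper under the signature above. The instance IDs, model name, and patch are illustrative placeholders, and the dictionary keys assume KEY_INSTANCE_ID, KEY_MODEL, and KEY_PREDICTION resolve to "instance_id", "model_name_or_path", and "model_patch" as elsewhere in the harness; passing a Docker client additionally requires complete SWE-bench instance records, since make_test_spec is applied to every dataset entry:

import docker  # optional; only needed when passing a client

from swebench.harness.reporting import make_run_report

# Illustrative inputs only. No per-instance report files exist for these IDs,
# so they are counted as errored/incomplete, but a summary report is still written.
predictions = {
    "example__repo-1": {
        "instance_id": "example__repo-1",
        "model_name_or_path": "my-model",
        "model_patch": "diff --git a/src/app.py b/src/app.py\n...",
    },
}
full_dataset = [{"instance_id": "example__repo-1"}, {"instance_id": "example__repo-2"}]

# Without a client, the unstopped-container / unremoved-image checks are skipped.
report_path = make_run_report(predictions, full_dataset, run_id="demo-run")
print(report_path)  # my-model.demo-run.json

# With client=docker.from_env(), containers and instance images left over from
# the run are also listed (requires full instance records in full_dataset).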