diff --git a/swebench/harness/reporting.py b/swebench/harness/reporting.py
new file mode 100644
index 00000000..3920e8b8
--- /dev/null
+++ b/swebench/harness/reporting.py
@@ -0,0 +1,136 @@
+import docker
+import json
+from pathlib import Path
+
+from swebench.harness.constants import (
+    KEY_INSTANCE_ID,
+    KEY_MODEL,
+    KEY_PREDICTION,
+    RUN_EVALUATION_LOG_DIR,
+    LOG_REPORT,
+)
+from swebench.harness.docker_utils import list_images
+from swebench.harness.test_spec import make_test_spec
+
+
+def make_run_report(
+        predictions: dict,
+        full_dataset: list,
+        run_id: str,
+        client: docker.DockerClient | None = None,
+    ) -> Path:
+    """
+    Make a final evaluation and run report of the instances that have been run.
+    Also reports on images and containers that may still be running if a client is provided.
+
+    Args:
+        predictions (dict): Predictions dict generated by the model
+        full_dataset (list): List of all instances
+        run_id (str): Run ID
+        client (docker.DockerClient): Docker client (optional)
+
+    Returns:
+        Path to report file
+    """
+    # instantiate sets to store IDs of different outcomes
+    completed_ids = set()
+    resolved_ids = set()
+    error_ids = set()
+    unstopped_containers = set()
+    unremoved_images = set()
+    unresolved_ids = set()
+    incomplete_ids = set()
+    # get instances with empty patches
+    empty_patch_ids = set()
+
+    # iterate through dataset and check if the instance has been run
+    for instance in full_dataset:
+        instance_id = instance[KEY_INSTANCE_ID]
+        if instance_id not in predictions:
+            # skip instances without predictions
+            incomplete_ids.add(instance_id)
+            continue
+        prediction = predictions[instance_id]
+        if prediction.get(KEY_PREDICTION, None) in ["", None]:
+            empty_patch_ids.add(instance_id)
+            continue
+        report_file = (
+            RUN_EVALUATION_LOG_DIR
+            / run_id
+            / prediction[KEY_MODEL].replace("/", "__")
+            / prediction[KEY_INSTANCE_ID]
+            / LOG_REPORT
+        )
+        if report_file.exists():
+            # If report file exists, then the instance has been run
+            completed_ids.add(instance_id)
+            report = json.loads(report_file.read_text())
+            if report[instance_id]["resolved"]:
+                # Record if the instance was resolved
+                resolved_ids.add(instance_id)
+            else:
+                unresolved_ids.add(instance_id)
+        else:
+            # Otherwise, the instance was not run successfully
+            error_ids.add(instance_id)
+
+    if client:
+        # get remaining images and containers
+        images = list_images(client)
+        test_specs = list(map(make_test_spec, full_dataset))
+        for spec in test_specs:
+            image_name = spec.instance_image_key
+            if image_name in images:
+                unremoved_images.add(image_name)
+        containers = client.containers.list(all=True)
+        for container in containers:
+            if run_id in container.name:
+                unstopped_containers.add(container.name)
+
+    # print final report
+    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
+    print(f"Total instances: {len(full_dataset)}")
+    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
+    print(f"Instances completed: {len(completed_ids)}")
+    print(f"Instances incomplete: {len(incomplete_ids)}")
+    print(f"Instances resolved: {len(resolved_ids)}")
+    print(f"Instances unresolved: {len(unresolved_ids)}")
+    print(f"Instances with empty patches: {len(empty_patch_ids)}")
+    print(f"Instances with errors: {len(error_ids)}")
+    if client:
+        print(f"Unstopped containers: {len(unstopped_containers)}")
+        print(f"Unremoved images: {len(unremoved_images)}")
+
+    # write report to file
+    report = {
+        "total_instances": len(full_dataset),
+        "submitted_instances": len(predictions),
+        "completed_instances": len(completed_ids),
+        "resolved_instances": len(resolved_ids),
+        "unresolved_instances": len(unresolved_ids),
+        "empty_patch_instances": len(empty_patch_ids),
+        "error_instances": len(error_ids),
+        "completed_ids": list(sorted(completed_ids)),
+        "incomplete_ids": list(sorted(incomplete_ids)),
+        "empty_patch_ids": list(sorted(empty_patch_ids)),
+        "submitted_ids": list(sorted(predictions.keys())),
+        "resolved_ids": list(sorted(resolved_ids)),
+        "unresolved_ids": list(sorted(unresolved_ids)),
+        "error_ids": list(sorted(error_ids)),
+        "schema_version": 2,
+    }
+    if client:
+        report.update({
+            "unstopped_instances": len(unstopped_containers),
+            "unstopped_containers": list(sorted(unstopped_containers)),
+            "unremoved_images": list(sorted(unremoved_images)),
+        })
+    report_file = Path(
+        list(predictions.values())[0][KEY_MODEL].replace("/", "__")
+        + f".{run_id}"
+        + ".json"
+    )
+    with open(report_file, "w") as f:
+        print(json.dumps(report, indent=4), file=f)
+    print(f"Report written to {report_file}")
+    return report_file
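Not part of the patch: a minimal usage sketch of the relocated helper. It assumes KEY_INSTANCE_ID, KEY_MODEL, and KEY_PREDICTION resolve to "instance_id", "model_name_or_path", and "model_patch" (the literal keys the Modal code below reads); the dataset name, model name, and run ID are illustrative.

    import docker
    from swebench.harness.reporting import make_run_report
    from swebench.harness.utils import load_swebench_dataset

    # Illustrative inputs: one prediction keyed by its instance_id.
    full_dataset = load_swebench_dataset("princeton-nlp/SWE-bench_Lite", "test")
    predictions = {
        "astropy__astropy-12907": {
            "instance_id": "astropy__astropy-12907",
            "model_name_or_path": "my-model",
            "model_patch": "<unified diff produced by the model>",
        },
    }

    # Docker-based runs pass a client so leftover containers/images are reported.
    make_run_report(predictions, full_dataset, "my_run", docker.from_env())
    # Callers without Docker (e.g. the Modal path) can now omit the client.
    make_run_report(predictions, full_dataset, "my_run")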
diff --git a/swebench/harness/run_evaluation.py b/swebench/harness/run_evaluation.py
index 61d2885d..b3ab7e0c 100644
--- a/swebench/harness/run_evaluation.py
+++ b/swebench/harness/run_evaluation.py
@@ -48,10 +48,10 @@
 from swebench.harness.grading import get_eval_report
 from swebench.harness.test_spec import make_test_spec, TestSpec
 from swebench.harness.utils import load_swebench_dataset, str2bool, EvaluationError
+from swebench.harness.reporting import make_run_report
 from swebench.harness.run_evaluation_modal import run_instances_modal
 
 
-
 def run_instance(
     test_spec: TestSpec,
     pred: dict,
@@ -349,124 +349,6 @@ def get_dataset_from_preds(
     return dataset
 
 
-def make_run_report(
-        predictions: dict,
-        full_dataset: list,
-        client: docker.DockerClient,
-        run_id: str
-    ) -> Path:
-    """
-    Make a final evaluation and run report of the instances that have been run.
-    Also reports on images and containers that may still running!
-
-    Args:
-        predictions (dict): Predictions dict generated by the model
-        full_dataset (list): List of all instances
-        client (docker.DockerClient): Docker client
-        run_id (str): Run ID
-
-    Returns:
-        Path to report file
-    """
-    # instantiate sets to store IDs of different outcomes
-    completed_ids = set()
-    resolved_ids = set()
-    error_ids = set()
-    unstopped_containers = set()
-    unremoved_images = set()
-    unresolved_ids = set()
-    incomplete_ids = set()
-    # get instances with empty patches
-    empty_patch_ids = set()
-
-    # iterate through dataset and check if the instance has been run
-    for instance in full_dataset:
-        instance_id = instance[KEY_INSTANCE_ID]
-        if instance_id not in predictions:
-            # skip instances without
-            incomplete_ids.add(instance_id)
-            continue
-        prediction = predictions[instance_id]
-        if prediction.get(KEY_PREDICTION, None) in ["", None]:
-            empty_patch_ids.add(instance_id)
-            continue
-        report_file = (
-            RUN_EVALUATION_LOG_DIR
-            / run_id
-            / prediction[KEY_MODEL].replace("/", "__")
-            / prediction[KEY_INSTANCE_ID]
-            / LOG_REPORT
-        )
-        if report_file.exists():
-            # If report file exists, then the instance has been run
-            completed_ids.add(instance_id)
-            report = json.loads(report_file.read_text())
-            if report[instance_id]["resolved"]:
-                # Record if the instance was resolved
-                resolved_ids.add(instance_id)
-            else:
-                unresolved_ids.add(instance_id)
-        else:
-            # Otherwise, the instance was not run successfully
-            error_ids.add(instance_id)
-
-    # get remaining images and containers
-    images = list_images(client)
-    test_specs = list(map(make_test_spec, full_dataset))
-    for spec in test_specs:
-        image_name = spec.instance_image_key
-        if image_name in images:
-            unremoved_images.add(image_name)
-    containers = client.containers.list(all=True)
-    for container in containers:
-        if run_id in container.name:
-            unstopped_containers.add(container.name)
-
-    # print final report
-    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
-    print(f"Total instances: {len(full_dataset)}")
-    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
-    print(f"Instances completed: {len(completed_ids)}")
-    print(f"Instances incomplete: {len(incomplete_ids)}")
-    print(f"Instances resolved: {len(resolved_ids)}")
-    print(f"Instances unresolved: {len(unresolved_ids)}")
-    print(f"Instances with empty patches: {len(empty_patch_ids)}")
-    print(f"Instances with errors: {len(error_ids)}")
-    print(f"Unstopped containers: {len(unstopped_containers)}")
-    print(f"Unremoved images: {len(unremoved_images)}")
-
-    # write report to file
-    report = {
-        "total_instances": len(full_dataset),
-        "submitted_instances": len(predictions),
-        "completed_instances": len(completed_ids),
-        "resolved_instances": len(resolved_ids),
-        "unresolved_instances": len(unresolved_ids),
-        "empty_patch_instances": len(empty_patch_ids),
-        "error_instances": len(error_ids),
-        "unstopped_instances": len(unstopped_containers),
-        "completed_ids": list(sorted(completed_ids)),
-        "incomplete_ids": list(sorted(incomplete_ids)),
-        "empty_patch_ids": list(sorted(empty_patch_ids)),
-        "submitted_ids": list(sorted(predictions.keys())),
-        "resolved_ids": list(sorted(resolved_ids)),
-        "unresolved_ids": list(sorted(unresolved_ids)),
-        "error_ids": list(sorted(error_ids)),
-        "unstopped_containers": list(sorted(unstopped_containers)),
-        "unremoved_images": list(sorted(unremoved_images)),
-        "schema_version": 2,
-    }
-    report_file = Path(
-        list(predictions.values())[0][KEY_MODEL].replace("/", "__")
-        + f".{run_id}"
-        + ".json"
-    )
-    with open(report_file, "w") as f:
-        print(json.dumps(report, indent=4), file=f)
-    print(f"Report written to {report_file}")
-    return report_file
-
-
 def get_gold_predictions(dataset_name: str, split: str):
     """
     Get gold predictions for the given dataset and split.
@@ -559,7 +441,7 @@ def main(
         # clean images + make final report
         clean_images(client, existing_images, cache_level, clean)
-        make_run_report(predictions, full_dataset, client, run_id)
+        make_run_report(predictions, full_dataset, run_id, client)
 
 
 if __name__ == "__main__":
     parser = ArgumentParser()
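Note (not part of the patch): the helper's argument order also changes, with run_id now third and the Docker client optional and last, so a hypothetical positional caller outside this repository would need the same reordering as main() above and the test update below.

    # before: make_run_report(predictions, full_dataset, client, run_id)
    # after:  the client is optional and comes last
    make_run_report(predictions, full_dataset, run_id, client)
    make_run_report(predictions, full_dataset, run_id)  # e.g. when no Docker client is available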
diff --git a/swebench/harness/run_evaluation_modal.py b/swebench/harness/run_evaluation_modal.py
index 787f3e91..5976f1ef 100644
--- a/swebench/harness/run_evaluation_modal.py
+++ b/swebench/harness/run_evaluation_modal.py
@@ -16,11 +16,10 @@
 from typing import cast
 
-from logging import Logger
-
 from swebench.harness.docker_build import setup_logger
-from swebench.harness.constants import KEY_INSTANCE_ID
 from swebench.harness.utils import EvaluationError
+from swebench.harness.reporting import make_run_report
+
 
 SANDBOX_ENTRYPOINT = "run_evaluation_modal_entrypoint"
 LOCAL_SANDBOX_ENTRYPOINT_PATH = (Path(__file__).parent / f"{SANDBOX_ENTRYPOINT}.py").resolve()
 
@@ -205,110 +204,13 @@ def get_instance_image(test_spec: TestSpec) -> modal.Image:
         )
         .workdir("/testbed/")
     )
-
-def make_run_report(
-        predictions: dict,
-        full_dataset: list,
-        run_id: str
-    ) -> Path:
-    """
-    Make a final evaluation and run report of the instances that have been run.
-    Args:
-        predictions (dict): Predictions dict generated by the model
-        full_dataset (list): List of all instances
-        run_id (str): Run ID
-
-    Returns:
-        Path to report file
-    """
-    # Sets to store IDs of different outcomes
-    completed_ids = set()
-    resolved_ids = set()
-    error_ids = set()
-    unresolved_ids = set()
-    incomplete_ids = set()
-    empty_patch_ids = set()
-
-    for instance in full_dataset:
-        instance_id = instance[KEY_INSTANCE_ID]
-
-        # Instances that were not submitted
-        if instance_id not in predictions:
-            incomplete_ids.add(instance_id)
-            continue
-
-        # Instances with empty patches
-        prediction = predictions[instance_id]
-        if prediction.get("model_patch", None) in ["", None]:
-            empty_patch_ids.add(instance_id)
-            continue
-
-        # Instances that errored
-        log_dir = get_log_dir(predictions[instance_id], run_id, instance_id)
-        report_file = log_dir / "report.json"
-        if not report_file.exists():
-            error_ids.add(instance_id)
-            continue
-
-        # Instance completed successfully
-        completed_ids.add(instance_id)
-        try:
-            report = json.loads(report_file.read_text())
-            if report[instance_id]["resolved"]:
-                resolved_ids.add(instance_id)
-            else:
-                unresolved_ids.add(instance_id)
-        except Exception as e:
-            print(f"{instance_id}: error loading report.json: {e}")
-            error_ids.add(instance_id)
-
-    # Print final report
-    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
-    print(f"Total instances: {len(full_dataset)}")
-    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
-    print(f"Instances completed: {len(completed_ids)}")
-    print(f"Instances incomplete: {len(incomplete_ids)}")
-    print(f"Instances resolved: {len(resolved_ids)}")
-    print(f"Instances unresolved: {len(unresolved_ids)}")
-    print(f"Instances with empty patches: {len(empty_patch_ids)}")
-    print(f"Instances with errors: {len(error_ids)}")
-
-    # Write report to file
-    report = {
-        "total_instances": len(full_dataset),
-        "submitted_instances": len(predictions),
-        "completed_instances": len(completed_ids),
-        "resolved_instances": len(resolved_ids),
-        "unresolved_instances": len(unresolved_ids),
-        "empty_patch_instances": len(empty_patch_ids),
-        "error_instances": len(error_ids),
-        "completed_ids": list(sorted(completed_ids)),
-        "incomplete_ids": list(sorted(incomplete_ids)),
-        "empty_patch_ids": list(sorted(empty_patch_ids)),
-        "submitted_ids": list(sorted(predictions.keys())),
-        "resolved_ids": list(sorted(resolved_ids)),
-        "unresolved_ids": list(sorted(unresolved_ids)),
-        "error_ids": list(sorted(error_ids)),
-        "schema_version": 2,
-    }
-
-    report_file = Path(
-        list(predictions.values())[0]["model_name_or_path"].replace("/", "__")
-        + f".{run_id}"
-        + ".json"
-    )
-
-    with open(report_file, "w") as f:
-        print(json.dumps(report, indent=4), file=f)
-
-    print(f"Report written to {report_file}")
-    return report_file
 
 
 def get_log_dir(pred: dict, run_id: str, instance_id: str) -> Path:
     model_name_or_path = cast(str, pred.get("model_name_or_path", "None").replace("/", "__"))
     return RUN_EVALUATION_LOG_DIR / run_id / model_name_or_path / instance_id
 
+
 @app.function(
     image=swebench_image,
     mounts=[
@@ -478,6 +380,7 @@ def run_instance_modal(
             errored=True,
         )
 
+
 def run_instances_modal(
     predictions: dict,
     instances: list,
diff --git a/swebench/harness/utils.py b/swebench/harness/utils.py
index 3188b686..6c47ccd2 100644
--- a/swebench/harness/utils.py
+++ b/swebench/harness/utils.py
@@ -36,6 +36,7 @@ def __str__(self):
             f"Check ({self.log_file}) for more information."
         )
 
+
 HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
 
 
diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py
index 7dff357e..b00cffa6 100644
--- a/tests/test_evaluation.py
+++ b/tests/test_evaluation.py
@@ -27,8 +27,8 @@ def test_make_run_report(tmpdir) -> None:
             }
         },
         [TEST_INSTANCE],
+        "test",
         client,
-        "test"
     )
     assert output_path.is_file()
     report = json.loads(output_path.read_text())
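Not part of the patch: a small sketch of reading the report that make_run_report writes. The file name below is illustrative; the helper derives it from the prediction's model_name_or_path (with "/" replaced by "__") plus the run_id, and the keys come from the report dict above.

    import json
    from pathlib import Path

    report = json.loads(Path("my-model.my_run.json").read_text())
    print(f'{report["resolved_instances"]}/{report["total_instances"]} resolved')
    print("unresolved:", report["unresolved_ids"])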