-
Notifications
You must be signed in to change notification settings - Fork 403
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Move make_run_report to swebench/harness/reporting for run_evaluation…
… and run_evaluation_modal
- Loading branch information
1 parent
a941896
commit 265f9ff
Showing
5 changed files
with
145 additions
and
229 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
import docker | ||
import json | ||
from pathlib import Path | ||
from typing import Optional | ||
|
||
from swebench.harness.constants import ( | ||
KEY_INSTANCE_ID, | ||
KEY_MODEL, | ||
KEY_PREDICTION, | ||
RUN_EVALUATION_LOG_DIR, | ||
LOG_REPORT, | ||
) | ||
from swebench.harness.docker_utils import list_images | ||
from swebench.harness.test_spec.test_spec import make_test_spec | ||
|
||
|
||
def make_run_report(
    predictions: dict,
    full_dataset: list,
    run_id: str,
    client: Optional[docker.DockerClient] = None,
) -> Path:
    """
    Make a final evaluation and run report of the instances that have been run.
    Also reports on images and containers that may still be running, if a
    Docker client is provided.

    Args:
        predictions (dict): Predictions dict generated by the model,
            keyed by instance ID.
        full_dataset (list): List of all instances.
        run_id (str): Run ID.
        client (docker.DockerClient): Docker client (optional).
    Returns:
        Path to the report file that was written.
    """
    # Instantiate sets to store IDs of different outcomes.
    completed_ids = set()
    resolved_ids = set()
    error_ids = set()
    unstopped_containers = set()
    unremoved_images = set()
    unresolved_ids = set()
    incomplete_ids = set()
    # Instances whose prediction contains no patch.
    empty_patch_ids = set()

    # Iterate through the dataset and check whether each instance has been run.
    for instance in full_dataset:
        instance_id = instance[KEY_INSTANCE_ID]
        if instance_id not in predictions:
            # Skip instances without predictions.
            incomplete_ids.add(instance_id)
            continue
        prediction = predictions[instance_id]
        if prediction.get(KEY_PREDICTION, None) in ["", None]:
            empty_patch_ids.add(instance_id)
            continue
        report_file = (
            RUN_EVALUATION_LOG_DIR
            / run_id
            / prediction[KEY_MODEL].replace("/", "__")
            / prediction[KEY_INSTANCE_ID]
            / LOG_REPORT
        )
        if report_file.exists():
            # If the report file exists, then the instance has been run.
            completed_ids.add(instance_id)
            report = json.loads(report_file.read_text())
            if report[instance_id]["resolved"]:
                # Record whether the instance was resolved.
                resolved_ids.add(instance_id)
            else:
                unresolved_ids.add(instance_id)
        else:
            # Otherwise, the instance was not run successfully.
            error_ids.add(instance_id)

    if client:
        # Get remaining images and containers left over from the run.
        images = list_images(client)
        test_specs = list(map(make_test_spec, full_dataset))
        for spec in test_specs:
            image_name = spec.instance_image_key
            if image_name in images:
                unremoved_images.add(image_name)
        containers = client.containers.list(all=True)
        for container in containers:
            if run_id in container.name:
                unstopped_containers.add(container.name)

    # Print the final report.
    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
    print(f"Total instances: {len(full_dataset)}")
    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
    print(f"Instances completed: {len(completed_ids)}")
    print(f"Instances incomplete: {len(incomplete_ids)}")
    print(f"Instances resolved: {len(resolved_ids)}")
    print(f"Instances unresolved: {len(unresolved_ids)}")
    print(f"Instances with empty patches: {len(empty_patch_ids)}")
    print(f"Instances with errors: {len(error_ids)}")
    if client:
        print(f"Unstopped containers: {len(unstopped_containers)}")
        print(f"Unremoved images: {len(unremoved_images)}")

    # Assemble the report payload.
    report = {
        "total_instances": len(full_dataset),
        "submitted_instances": len(predictions),
        "completed_instances": len(completed_ids),
        "resolved_instances": len(resolved_ids),
        "unresolved_instances": len(unresolved_ids),
        "empty_patch_instances": len(empty_patch_ids),
        "error_instances": len(error_ids),
        "completed_ids": list(sorted(completed_ids)),
        "incomplete_ids": list(sorted(incomplete_ids)),
        "empty_patch_ids": list(sorted(empty_patch_ids)),
        "submitted_ids": list(sorted(predictions.keys())),
        "resolved_ids": list(sorted(resolved_ids)),
        "unresolved_ids": list(sorted(unresolved_ids)),
        "error_ids": list(sorted(error_ids)),
        "schema_version": 2,
    }
    # BUG FIX: the container/image fields were previously added under
    # `if not client:` — i.e. exactly when they are guaranteed empty,
    # and omitted when a client was provided and they could hold data.
    # They are only populated above when `client` is truthy.
    if client:
        report.update({
            "unstopped_instances": len(unstopped_containers),
            "unstopped_containers": list(sorted(unstopped_containers)),
            "unremoved_images": list(sorted(unremoved_images)),
        })
    # Name the report after the model of the first prediction; fall back to a
    # generic name when there are no predictions (previously an IndexError).
    first_prediction = next(iter(predictions.values()), None)
    model_name = (
        first_prediction[KEY_MODEL].replace("/", "__")
        if first_prediction is not None
        else "no_predictions"
    )
    report_file = Path(f"{model_name}.{run_id}.json")
    # Match the previous `print(..., file=f)` behavior: trailing newline.
    report_file.write_text(json.dumps(report, indent=4) + "\n")
    print(f"Report written to {report_file}")
    return report_file
Oops, something went wrong.