
Move make_run_report to swebench/harness/reporting for run_evaluation and run_evaluation_modal
carlosejimenez committed Jan 17, 2025
1 parent a941896 commit 265f9ff
Showing 5 changed files with 145 additions and 229 deletions.
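For orientation, a minimal sketch of what the move means for callers, assuming only the import path changes; the path below is the one added in this diff:

# run_evaluation_modal.py previously carried its own copy of make_run_report;
# after this commit both run_evaluation and run_evaluation_modal import the
# shared implementation from the new module.
from swebench.harness.reporting import make_run_report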
2 changes: 1 addition & 1 deletion swebench/harness/grading.py
@@ -204,7 +204,7 @@ def get_resolution_status(report: dict[str, dict[str, Any]]) -> str:
        return ResolvedStatus.PARTIAL.value
    else:
        return ResolvedStatus.NO.value


def get_eval_report(
    test_spec: TestSpec,
104 changes: 4 additions & 100 deletions swebench/harness/modal_eval/run_evaluation_modal.py
@@ -13,8 +13,8 @@

from dataclasses import dataclass
from pathlib import Path
from swebench.harness.constants import KEY_INSTANCE_ID
from swebench.harness.docker_build import setup_logger
from swebench.harness.reporting import make_run_report
from swebench.harness.utils import EvaluationError
from typing import cast

@@ -195,110 +195,13 @@ def get_instance_image(test_spec: TestSpec) -> modal.Image:
        )
        .workdir("/testbed/")
    )

def make_run_report(
    predictions: dict,
    full_dataset: list,
    run_id: str
) -> Path:
    """
    Make a final evaluation and run report of the instances that have been run.

    Args:
        predictions (dict): Predictions dict generated by the model
        full_dataset (list): List of all instances
        run_id (str): Run ID
    Returns:
        Path to report file
    """
    # Sets to store IDs of different outcomes
    completed_ids = set()
    resolved_ids = set()
    error_ids = set()
    unresolved_ids = set()
    incomplete_ids = set()
    empty_patch_ids = set()

    for instance in full_dataset:
        instance_id = instance[KEY_INSTANCE_ID]

        # Instances that were not submitted
        if instance_id not in predictions:
            incomplete_ids.add(instance_id)
            continue

        # Instances with empty patches
        prediction = predictions[instance_id]
        if prediction.get("model_patch", None) in ["", None]:
            empty_patch_ids.add(instance_id)
            continue

        # Instances that errored
        log_dir = get_log_dir(predictions[instance_id], run_id, instance_id)
        report_file = log_dir / "report.json"
        if not report_file.exists():
            error_ids.add(instance_id)
            continue

        # Instance completed successfully
        completed_ids.add(instance_id)
        try:
            report = json.loads(report_file.read_text())
            if report[instance_id]["resolved"]:
                resolved_ids.add(instance_id)
            else:
                unresolved_ids.add(instance_id)
        except Exception as e:
            print(f"{instance_id}: error loading report.json: {e}")
            error_ids.add(instance_id)

    # Print final report
    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
    print(f"Total instances: {len(full_dataset)}")
    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
    print(f"Instances completed: {len(completed_ids)}")
    print(f"Instances incomplete: {len(incomplete_ids)}")
    print(f"Instances resolved: {len(resolved_ids)}")
    print(f"Instances unresolved: {len(unresolved_ids)}")
    print(f"Instances with empty patches: {len(empty_patch_ids)}")
    print(f"Instances with errors: {len(error_ids)}")

    # Write report to file
    report = {
        "total_instances": len(full_dataset),
        "submitted_instances": len(predictions),
        "completed_instances": len(completed_ids),
        "resolved_instances": len(resolved_ids),
        "unresolved_instances": len(unresolved_ids),
        "empty_patch_instances": len(empty_patch_ids),
        "error_instances": len(error_ids),
        "completed_ids": list(sorted(completed_ids)),
        "incomplete_ids": list(sorted(incomplete_ids)),
        "empty_patch_ids": list(sorted(empty_patch_ids)),
        "submitted_ids": list(sorted(predictions.keys())),
        "resolved_ids": list(sorted(resolved_ids)),
        "unresolved_ids": list(sorted(unresolved_ids)),
        "error_ids": list(sorted(error_ids)),
        "schema_version": 2,
    }

    report_file = Path(
        list(predictions.values())[0]["model_name_or_path"].replace("/", "__")
        + f".{run_id}"
        + ".json"
    )

    with open(report_file, "w") as f:
        print(json.dumps(report, indent=4), file=f)

    print(f"Report written to {report_file}")
    return report_file

def get_log_dir(pred: dict, run_id: str, instance_id: str) -> Path:
    model_name_or_path = cast(str, pred.get("model_name_or_path", "None").replace("/", "__"))
    return RUN_EVALUATION_LOG_DIR / run_id / model_name_or_path / instance_id


@app.function(
    image=swebench_image.add_local_file(
        LOCAL_SANDBOX_ENTRYPOINT_PATH,
@@ -415,7 +318,7 @@ def run_instance_modal(
        report = get_eval_report(
            test_spec=test_spec,
            prediction=pred,
            log_path=test_output_path,
            test_log_path=test_output_path,
            include_tests_status=True,
        )
        logger.info(
@@ -465,6 +368,7 @@ def run_instance_modal(
            errored=True,
        )


def run_instances_modal(
    predictions: dict,
    instances: list,
137 changes: 137 additions & 0 deletions swebench/harness/reporting.py
@@ -0,0 +1,137 @@
import docker
import json
from pathlib import Path
from typing import Optional

from swebench.harness.constants import (
    KEY_INSTANCE_ID,
    KEY_MODEL,
    KEY_PREDICTION,
    RUN_EVALUATION_LOG_DIR,
    LOG_REPORT,
)
from swebench.harness.docker_utils import list_images
from swebench.harness.test_spec.test_spec import make_test_spec


def make_run_report(
    predictions: dict,
    full_dataset: list,
    run_id: str,
    client: Optional[docker.DockerClient] = None,
) -> Path:
    """
    Make a final evaluation and run report of the instances that have been run.
    Also reports on images and containers that may still be running if a client is provided.
    Args:
        predictions (dict): Predictions dict generated by the model
        full_dataset (list): List of all instances
        run_id (str): Run ID
        client (docker.DockerClient): Docker client (optional)
    Returns:
        Path to report file
    """
    # instantiate sets to store IDs of different outcomes
    completed_ids = set()
    resolved_ids = set()
    error_ids = set()
    unstopped_containers = set()
    unremoved_images = set()
    unresolved_ids = set()
    incomplete_ids = set()
    # get instances with empty patches
    empty_patch_ids = set()

    # iterate through dataset and check if the instance has been run
    for instance in full_dataset:
        instance_id = instance[KEY_INSTANCE_ID]
        if instance_id not in predictions:
            # skip instances without predictions
            incomplete_ids.add(instance_id)
            continue
        prediction = predictions[instance_id]
        if prediction.get(KEY_PREDICTION, None) in ["", None]:
            empty_patch_ids.add(instance_id)
            continue
        report_file = (
            RUN_EVALUATION_LOG_DIR
            / run_id
            / prediction[KEY_MODEL].replace("/", "__")
            / prediction[KEY_INSTANCE_ID]
            / LOG_REPORT
        )
        if report_file.exists():
            # If report file exists, then the instance has been run
            completed_ids.add(instance_id)
            report = json.loads(report_file.read_text())
            if report[instance_id]["resolved"]:
                # Record if the instance was resolved
                resolved_ids.add(instance_id)
            else:
                unresolved_ids.add(instance_id)
        else:
            # Otherwise, the instance was not run successfully
            error_ids.add(instance_id)

    if client:
        # get remaining images and containers
        images = list_images(client)
        test_specs = list(map(make_test_spec, full_dataset))
        for spec in test_specs:
            image_name = spec.instance_image_key
            if image_name in images:
                unremoved_images.add(image_name)
        containers = client.containers.list(all=True)
        for container in containers:
            if run_id in container.name:
                unstopped_containers.add(container.name)

    # print final report
    dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
    print(f"Total instances: {len(full_dataset)}")
    print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
    print(f"Instances completed: {len(completed_ids)}")
    print(f"Instances incomplete: {len(incomplete_ids)}")
    print(f"Instances resolved: {len(resolved_ids)}")
    print(f"Instances unresolved: {len(unresolved_ids)}")
    print(f"Instances with empty patches: {len(empty_patch_ids)}")
    print(f"Instances with errors: {len(error_ids)}")
    if client:
        print(f"Unstopped containers: {len(unstopped_containers)}")
        print(f"Unremoved images: {len(unremoved_images)}")

    # write report to file
    report = {
        "total_instances": len(full_dataset),
        "submitted_instances": len(predictions),
        "completed_instances": len(completed_ids),
        "resolved_instances": len(resolved_ids),
        "unresolved_instances": len(unresolved_ids),
        "empty_patch_instances": len(empty_patch_ids),
        "error_instances": len(error_ids),
        "completed_ids": list(sorted(completed_ids)),
        "incomplete_ids": list(sorted(incomplete_ids)),
        "empty_patch_ids": list(sorted(empty_patch_ids)),
        "submitted_ids": list(sorted(predictions.keys())),
        "resolved_ids": list(sorted(resolved_ids)),
        "unresolved_ids": list(sorted(unresolved_ids)),
        "error_ids": list(sorted(error_ids)),
        "schema_version": 2,
    }
    if client:
        report.update({
            "unstopped_instances": len(unstopped_containers),
            "unstopped_containers": list(sorted(unstopped_containers)),
            "unremoved_images": list(sorted(unremoved_images)),
        })
    report_file = Path(
        list(predictions.values())[0][KEY_MODEL].replace("/", "__")
        + f".{run_id}"
        + ".json"
    )
    with open(report_file, "w") as f:
        print(json.dumps(report, indent=4), file=f)
    print(f"Report written to {report_file}")
    return report_file
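
For reference, a hedged usage sketch of the relocated helper under the signature above. The instance IDs, model name, and patch are illustrative placeholders, and the dictionary keys assume KEY_INSTANCE_ID, KEY_MODEL, and KEY_PREDICTION resolve to "instance_id", "model_name_or_path", and "model_patch" as elsewhere in the harness; passing a Docker client additionally requires complete SWE-bench instance records, since make_test_spec is applied to every dataset entry:

import docker  # optional; only needed when passing a client

from swebench.harness.reporting import make_run_report

# Illustrative inputs only. No per-instance report files exist for these IDs,
# so they are counted as errored/incomplete, but a summary report is still written.
predictions = {
    "example__repo-1": {
        "instance_id": "example__repo-1",
        "model_name_or_path": "my-model",
        "model_patch": "diff --git a/src/app.py b/src/app.py\n...",
    },
}
full_dataset = [{"instance_id": "example__repo-1"}, {"instance_id": "example__repo-2"}]

# Without a client, the unstopped-container / unremoved-image checks are skipped.
report_path = make_run_report(predictions, full_dataset, run_id="demo-run")
print(report_path)  # my-model.demo-run.json

# With client=docker.from_env(), containers and instance images left over from
# the run are also listed (requires full instance records in full_dataset).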