-
Notifications
You must be signed in to change notification settings - Fork 12
wip: add swe-fixer verifier #79
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Draft
rasdani
wants to merge
7
commits into
PrimeIntellect-ai:main
Choose a base branch
from
rasdani:swe-fixer
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from 1 commit
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
48219c0
wip: add swe-fixer verifier
rasdani 0e0d81f
better patch and verify
rasdani 26eb5c3
fix: typo in file name
rasdani cd02496
fix: typo in registry
rasdani 2f120d8
add test
rasdani 2c982ad
run pre-commit
rasdani 23facd2
add `cydifflib` dependency
rasdani File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,192 @@ | ||
| ### adapted from https://github.com/InternLM/SWE-Fixer/blob/main/evaluation/code_edit.py | ||
| import json | ||
| import os | ||
| import re | ||
| import ast | ||
| import argparse | ||
|
|
||
| from genesys.schemas import Response | ||
| from genesys.verifiers.base_verifier import BaseVerifier | ||
|
|
||
|
|
||
def parse_json_codeblock_from_reasoning_model_output(markdown_str):
    """Extract the JSON payload from a reasoning model's raw response.

    Everything up to and including the last ``</think>`` tag is discarded.
    If the remaining text contains a fenced ```json block, the contents of
    the first such block are returned; otherwise the remaining text itself
    is returned.

    Args:
        markdown_str: The full model response as a string.

    Returns:
        str: The whitespace-stripped JSON candidate. It is not validated here.
    """
    # rpartition never raises: with no tag the whole response is treated as
    # the answer, and with several tags the text after the last one is used.
    _, _, answer_str = markdown_str.rpartition("</think>")
    answer_str = answer_str.strip()

    # Extract everything between ```json and ``` markers
    match = re.search(r"```json\s*(.*?)\s*```", answer_str, re.DOTALL)
    if match:
        return match.group(1).strip()
    return answer_str
|
|
||
|
|
||
def remove_line_numbers(content):
    """Strip a leading line number from every line of ``content``.

    A line number is a run of digits at the start of a line that is followed
    either by one whitespace character (which is removed with it) or by the
    end of the line. Indentation after that single separator is preserved.

    Args:
        content: Text whose lines may be prefixed with line numbers.

    Returns:
        str: ``content`` with the numeric prefixes removed; the number of
        lines is unchanged.
    """
    # The separator must not be a newline: a bare "\s" would swallow the
    # line break after a number-only line and merge it with the next line.
    return re.sub(r"^\d+(?:[^\S\n]|$)", "", content, flags=re.MULTILINE)
|
|
||
def remove_empty_lines(code):
    """Return ``code`` with every blank or whitespace-only line dropped.

    The surviving lines are joined with a single ``"\\n"`` and keep their
    original leading and trailing whitespace.
    """
    return "\n".join(row for row in code.splitlines() if row.strip())
|
|
||
|
|
||
def check_syntax(code):
    """Report whether ``code`` is non-empty, parseable Python source.

    Args:
        code: Python source text.

    Returns:
        bool: ``False`` for empty or whitespace-only input and for input that
        ``ast.parse`` rejects, ``True`` otherwise.
    """
    if not code.strip():
        return False
    try:
        ast.parse(code)
    except (SyntaxError, ValueError, RecursionError):
        # Besides SyntaxError, ast.parse raises ValueError for NUL bytes on
        # Python < 3.12 and may hit the recursion limit on deeply nested
        # input; all of these mean the code is not usable.
        return False
    return True
|
|
||
|
|
||
def check_code_differ_by_just_empty_lines(code, prev_code):
    """Return True when both sources are equal once blank lines are removed."""
    return remove_empty_lines(code) == remove_empty_lines(prev_code)
|
|
||
|
|
||
class SweFixerVerifier(BaseVerifier):
    """
    Verifier for the SWE-Fixer dataset.
    https://github.com/InternLM/SWE-Fixer
    """

    def _patch_files_with_golden_patches(self, task_input):
        """
        Patch files with golden patches from the task input.
        This creates patched files using the ground truth patches for reference.

        Several golden patches that target the same file are applied one
        after another, each on top of the result of the previous one.

        Args:
            task_input: Dictionary containing task input data with modification instructions

        Returns:
            dict: Dictionary with file paths as keys and patched contents as values.
            Files for which no patch applied are absent. An empty dict is
            returned when the input does not have the expected structure.
        """
        try:
            # Extract files to be modified from task input
            files_to_modify = task_input["metadata"]["input"]["files to be modified"]
            # The golden patches do not depend on the file being processed,
            # so they are looked up once.
            golden_patches = task_input.get("verification_info", {}).get("golden_patches", [])

            patched_files = {}
            for file_info in files_to_modify:
                file_path = file_info["file"]
                original_content = remove_line_numbers(file_info["file content"])

                for patch in golden_patches:
                    if patch["file"] != file_path:
                        continue

                    snippet = remove_line_numbers(patch["code snippet to be modified"]).rstrip()
                    replacement = remove_line_numbers(patch["correct code snippet"]).rstrip()

                    # Start from the already patched content so that earlier
                    # patches to this file are not thrown away.
                    current = patched_files.get(file_path, original_content)
                    if snippet and snippet in current:
                        patched_files[file_path] = current.replace(snippet, replacement)
                    elif original_content == "":  # Handle new file case
                        patched_files[file_path] = replacement

            return patched_files

        except (KeyError, TypeError, AttributeError) as e:
            # Malformed task input (missing keys, wrong container types).
            print(f"Error in patching files with golden patches: {e}")
            return {}

    def evaluate_task_code_editing(self, task_input, json_output):
        """
        Score the edits proposed by the model.

        Every edit must (a) target a snippet that occurs in the referenced
        file, or target a file with no known content (treated as a new file),
        (b) produce syntactically valid Python once applied, and (c) change
        more than blank lines. Each edit is applied to the original file
        content independently of the other edits.

        Args:
            task_input: Dictionary read as
                ``task_input["metadata"]["input"]["files to be modified"]``,
                a list of dicts with the keys ``"file"`` and ``"file content"``.
            json_output: JSON text encoding a list of edits, each a dict with
                the keys ``"file"``, ``"code snippet to be modified"`` and
                ``"edited code snippet"``.

        Returns:
            dict: ``{"score": 1, "verification_result_info": {}}`` when all
            edits pass, otherwise a score of 0 with a ``"failure_reason"``
            entry in ``verification_result_info``. This method does not raise.
        """
        try:
            edits = json.loads(json_output)
        except (ValueError, TypeError) as e:
            return dict(
                score=0,
                verification_result_info={"failure_reason": f"Model output is not valid JSON: {e}"},
            )

        if not isinstance(edits, list) or not edits:
            return dict(
                score=0,
                verification_result_info={"failure_reason": "Model output is not a non-empty JSON list of edits"},
            )

        try:
            files_to_modify = task_input["metadata"]["input"]["files to be modified"]

            for edit in edits:
                file_path = edit["file"]
                snippet = remove_line_numbers(edit["code snippet to be modified"]).rstrip()
                edited_snippet = edit["edited code snippet"]

                # Content of the first matching file; "" marks an unknown
                # (i.e. new) file.
                file_content = next(
                    (
                        remove_line_numbers(f["file content"])
                        for f in files_to_modify
                        if f["file"] == file_path
                    ),
                    "",
                )

                if file_content:
                    if not snippet or snippet not in file_content:
                        return dict(
                            score=0,
                            verification_result_info={
                                "failure_reason": f"Code snippet to be modified not found in {file_path}"
                            },
                        )
                    new_content = file_content.replace(snippet, edited_snippet)
                else:  # new file
                    new_content = edited_snippet

                if not check_syntax(new_content):
                    return dict(
                        score=0,
                        verification_result_info={
                            "failure_reason": f"Edited content of {file_path} is not valid Python"
                        },
                    )

                if check_code_differ_by_just_empty_lines(new_content, file_content):
                    return dict(
                        score=0,
                        verification_result_info={
                            "failure_reason": f"Edit to {file_path} only changes empty lines"
                        },
                    )

            return dict(score=1, verification_result_info={})

        except Exception as e:
            # The edits come from untrusted model output; any structural
            # problem must yield a zero score instead of aborting verification.
            return dict(
                score=0,
                verification_result_info={"failure_reason": f"Error in evaluating task code editing: {e}"},
            )

    def verify(self, result: Response):
        """
        Evaluates the code patches produced by the model.

        The JSON answer is extracted from ``result["llm_response"]`` and
        scored with ``evaluate_task_code_editing``.

        The score is either 0 or 1, representing whether the patches are correct.
        """
        # NOTE(review): no comparison against the golden patches happens
        # here yet; only applicability and syntax of the edits are checked.
        # NOTE(review): the scorer reads
        # ``task_input["metadata"]["input"]["files to be modified"]``, so
        # ``verification_info["input"]`` must carry that nesting -- confirm
        # against the dataset schema.
        verification_info = result["verification_info"]

        try:
            json_output = parse_json_codeblock_from_reasoning_model_output(result["llm_response"])
        except ValueError as e:
            # Guard against a ValueError from the parser (e.g. a response it
            # cannot split into reasoning and answer) so that a malformed
            # response scores 0 instead of aborting the run.
            return dict(
                score=0,
                verification_result_info={"failure_reason": f"Could not extract an answer from the response: {e}"},
            )

        return self.evaluate_task_code_editing(verification_info["input"], json_output)
|
|
||
|
|
||
# Command-line entry point: verify every record of a JSON-lines file and
# print one result per record.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Verify SWE-Fixer patches')
    parser.add_argument('--file', type=str, required=True, help='Path to the input file containing patches to verify')
    args = parser.parse_args()

    to_verify = []
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(args.file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            d = json.loads(line)
            # These two fields are stored as Python-literal strings.
            d["verification_info"] = ast.literal_eval(d["verification_info"])
            d["metadata"] = ast.literal_eval(d["metadata"])
            to_verify.append(d)

    verifier = SweFixerVerifier()
    for item in to_verify:
        result = verifier.verify(item)
        print(result)
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.