Skip to content

Commit fab4d13

Browse files
committed
x
Signed-off-by: SumanthRH <[email protected]>
1 parent 834dbf7 commit fab4d13

File tree

4 files changed

+67
-10
lines changed

4 files changed

+67
-10
lines changed

skythought/evals/cli.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,14 @@ def score(
530530
case_sensitive=False,
531531
),
532532
],
533+
idx: Annotated[
534+
str,
535+
typer.Option(
536+
...,
537+
help="Unique index of the sample in the results JSON to re-score. "
538+
"If provided, only the scores for this sample are computed/re-computed.",
539+
),
540+
] = None,
533541
):
534542
if not os.path.exists(run_dir):
535543
raise ValueError(f"Run directory {run_dir} does not exist.")
@@ -556,7 +564,7 @@ def score(
556564

557565
run_summary = SummaryResults(**run_summary)
558566

559-
score_results(handler, run_dir, run_summary)
567+
score_results(handler, run_dir, run_summary, idx)
560568

561569

562570
def main():

skythought/evals/inference_and_check.py

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,7 @@ def generate_responses_for_dataset(
294294
def score_responses(
295295
handler: TaskHandler,
296296
id_to_results: Dict[str, Dict[str, Any]],
297+
*,
297298
max_workers: int = 32,
298299
) -> Tuple[float, Dict[str, List[int]], int]:
299300
"""Computes correctness for model responses for the given task
@@ -341,7 +342,7 @@ def score_responses(
341342
# TODO (sumanthrh): this can be improved
342343
if unique_id not in id_to_scores:
343344
id_to_scores[unique_id] = [0 for _ in range(N)]
344-
id_to_scores[unique_id][i] = new_response_entry["correctness"]
345+
id_to_scores[unique_id][i] = int(new_response_entry["correctness"])
345346

346347
total_correct += new_response_entry["correctness"]
347348
total_finish += 1
@@ -350,6 +351,40 @@ def score_responses(
350351
return accuracy, id_to_scores, total_finish
351352

352353

354+
def score_responses_for_idx(
355+
handler: TaskHandler,
356+
id_to_results: Dict[str, Dict[str, Any]],
357+
*,
358+
idx: str,
359+
) -> List[int]:
360+
"""Computes correctness for model responses for the given task for the unique index `idx`.
361+
362+
The 'id_to_results' dictionary is assumed to be a mapping from problem ID -> { responses: [...], ... }.
363+
This is updated in-place.
364+
365+
Returns:
366+
- list of scores
367+
"""
368+
if not id_to_results:
369+
return []
370+
371+
# Figure out how many generations per problem
372+
N = len(next(iter(id_to_results.values()))["responses"])
373+
record = id_to_results[idx]
374+
scores = []
375+
for i in range(N):
376+
content = record["responses"][i]["content"]
377+
response_entry = handler.update_results(record, content)
378+
379+
# Update correctness and reason in the original results dict
380+
id_to_results[idx]["responses"][i]["correctness"] = response_entry[
381+
"correctness"
382+
]
383+
id_to_results[idx]["responses"][i]["reason"] = response_entry["reason"]
384+
scores.append(response_entry["correctness"])
385+
return scores
386+
387+
353388
def generate_and_score(
354389
handler: TaskHandler,
355390
model_config: ModelConfig,
@@ -480,17 +515,29 @@ def generate_and_save(
480515

481516

482517
def score_results(
483-
handler: TaskHandler, run_dir: Path, run_summary: SummaryResults
518+
handler: TaskHandler,
519+
run_dir: Path,
520+
run_summary: SummaryResults,
521+
idx: Optional[str] = None,
484522
) -> None:
485523
# load existing results
486524
result_file = run_dir / RESULTS_FILENAME
487525
summary_file = run_dir / SUMMARY_FILENAME
488526
id_to_results = load_existing_results(result_file)
489527
logger.info(f"Loaded {len(id_to_results)} existing results for scoring.")
490528

491-
accuracy, id_to_scores, total_finish = score_responses(handler, id_to_results)
492-
493-
logger.info(f"Accuracy: {accuracy}")
529+
if not idx:
530+
accuracy, id_to_scores, total_finish = score_responses(handler, id_to_results)
531+
else:
532+
N = len(next(iter(id_to_results.values()))["responses"])
533+
score_responses_for_idx(handler, id_to_results, idx=idx)
534+
id_to_scores = {
535+
index: [
536+
id_to_results[index]["responses"][i]["correctness"] for i in range(N)
537+
]
538+
for index in id_to_results
539+
}
540+
accuracy = sum(map(sum, id_to_scores.values())) / (len(id_to_scores) * N)
494541

495542
sample_count = 0
496543
if id_to_results:
@@ -501,7 +548,9 @@ def score_results(
501548

502549
run_summary.accuracy = accuracy
503550
run_summary.pass_at_k = pass_at_k_metrics
551+
552+
logger.info(f"Accuracy: {accuracy}")
504553
save_summary(summary_file, run_summary)
505554

506555
save_results(result_file, id_to_results)
507-
logger.info(f"Re-scored results saved to {result_file}")
556+
logger.info(f"Scored results saved to {result_file}")

skythought/evals/tasks/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def check_correctness(
5858
pass
5959

6060
@abstractmethod
61-
def update_results(self, problem: Dict[str, Any], response: str):
61+
def update_results(self, problem: Dict[str, Any], response: str) -> Dict[str, Any]:
6262
pass
6363

6464
def make_conversations(

skythought/evals/util/metrics.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22
import math
33
from collections import defaultdict
4-
from typing import Any, Dict
4+
from typing import Dict, List
55

66
import numpy as np
77

@@ -17,7 +17,7 @@ def _pass_at_k(n, c, k):
1717
return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))
1818

1919

20-
def pass_at_k(N: int, id_to_scores: Dict[str, Dict[str, Any]]):
20+
def pass_at_k(N: int, id_to_scores: Dict[str, List[int]]):
2121
final_passk_scores = {}
2222
k_to_passk_scores = defaultdict(list) # k -> list of scores
2323
for _, sample_scores in id_to_scores.items():

0 commit comments

Comments
 (0)