Skip to content

Commit cb56101

Browse files
daiyip and langfun authors
authored and committed
lf.eval.v2.EvaluationState to release processed examples right after evaluating them.
This allows memory-intensive benchmarks to free each processed example once it has been evaluated. PiperOrigin-RevId: 736348489
1 parent 07fd8fc commit cb56101

File tree

5 files changed

+111
-38
lines changed

5 files changed

+111
-38
lines changed

langfun/core/eval/v2/checkpointing.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,14 +53,14 @@ def on_experiment_start(
5353
self._load_experiment(runner, experiment)
5454

5555
example_ids_to_evaluate = current_run.examples_to_evaluate(experiment)
56-
if experiment.state.evaluated_examples:
56+
if experiment.state.ckpt_examples:
5757
loaded_example_ids = list(
58-
sorted(experiment.state.evaluated_examples.keys())
58+
sorted(experiment.state.ckpt_examples.keys())
5959
)
6060
example_ids_to_evaluate -= set(loaded_example_ids)
6161
example_ids_to_evaluate = list(sorted(example_ids_to_evaluate))
6262
experiment.info(
63-
f'{len(experiment.state.evaluated_examples)} examples '
63+
f'{len(experiment.state.ckpt_examples)} examples '
6464
'loaded from checkpoint files. Their outputs will be used '
6565
f'for recomputing metrics. Example IDs: {loaded_example_ids}.'
6666
)
@@ -316,7 +316,7 @@ def on_experiment_complete(
316316
writer = self._sequence_writer.pop(experiment.id)
317317
writer.close()
318318
experiment.info(
319-
f'{len(experiment.state.evaluated_examples)} examples are '
319+
f'{len(experiment.state.evaluation_status)} examples are '
320320
f'checkpointed to {writer.path}.'
321321
)
322322

langfun/core/eval/v2/checkpointing_test.py

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from langfun.core.eval.v2 import checkpointing
1919
from langfun.core.eval.v2 import eval_test_helper
2020
from langfun.core.eval.v2 import example as example_lib
21+
from langfun.core.eval.v2 import experiment as experiment_lib
2122
from langfun.core.eval.v2 import runners as runners_lib # pylint: disable=unused-import
2223
import pyglove as pg
2324

@@ -52,6 +53,26 @@ def f():
5253
self.assertEqual(len(list(iter(f))), 1)
5354

5455

56+
class ExampleCollector(experiment_lib.Plugin):
57+
"""Collects all examples."""
58+
59+
def _on_bound(self):
60+
super()._on_bound()
61+
self._examples = {}
62+
63+
@property
64+
def examples(self) -> dict[int, example_lib.Example]:
65+
return self._examples
66+
67+
def on_example_complete(
68+
self, runner: runners_lib.Runner,
69+
experiment: experiment_lib.Experiment,
70+
example: example_lib.Example,
71+
):
72+
assert experiment.is_leaf, None
73+
self._examples[example.id] = example
74+
75+
5576
class CheckpointerTest(unittest.TestCase):
5677

5778
def assert_found_in_log(self, experiment, message):
@@ -70,13 +91,15 @@ def test_checkpointing(self):
7091
experiment = eval_test_helper.test_experiment()
7192
checkpoint_filename = 'checkpoint.jsonl'
7293
checkpointer = checkpointing.PerExampleCheckpointer(checkpoint_filename)
94+
collector = ExampleCollector()
7395
run = experiment.run(
74-
root_dir, 'new', runner='sequential', plugins=[checkpointer]
96+
root_dir, 'new', runner='sequential', plugins=[checkpointer, collector]
7597
)
7698
num_processed = {}
7799
for leaf in experiment.leaf_nodes:
78100
for i in range(leaf.num_examples):
79-
example = leaf.state.get(i + 1)
101+
self.assertIn(i + 1, collector.examples)
102+
example = collector.examples[i + 1]
80103
ckpt = run.output_path_for(leaf, f'checkpoint_{example.id}.jsonl')
81104
if example.has_error:
82105
self.assertFalse(pg.io.path_exists(ckpt))
@@ -134,12 +157,15 @@ def test_loading_corrupted_checkpoint(self):
134157
experiment = eval_test_helper.TestEvaluation()
135158
checkpoint_filename = 'checkpoint.jsonl'
136159
checkpointer = checkpointing.PerExampleCheckpointer(checkpoint_filename)
160+
collector = ExampleCollector()
161+
137162
run = experiment.run(
138-
root_dir, 'new', runner='sequential', plugins=[checkpointer]
163+
root_dir, 'new', runner='sequential', plugins=[checkpointer, collector]
139164
)
140165
num_processed = {}
141166
for i in range(experiment.num_examples):
142-
example = experiment.state.get(i + 1)
167+
self.assertIn(i + 1, collector.examples)
168+
example = collector.examples[i + 1]
143169
ckpt = run.output_path_for(experiment, f'checkpoint_{example.id}.jsonl')
144170
if not example.has_error:
145171
self.assertTrue(pg.io.path_exists(ckpt))

langfun/core/eval/v2/evaluation.py

Lines changed: 61 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -166,25 +166,24 @@ def evaluate(
166166
if pg.MISSING_VALUE == example.input:
167167
example.input = self.example_input_by_id(example.id)
168168

169-
cached = self._state.get(example.id)
170-
169+
checkpointed = self._state.ckpt_example(example.id)
171170
with pg.timeit('evaluate') as timeit, lf.track_usages() as usage_summary:
172-
if cached is None or cached.has_error:
171+
if checkpointed is None or checkpointed.has_error:
173172
example.start_time = time.time()
174173
self._process(example, raise_if_has_error=raise_if_has_error)
175174
else:
176-
example.start_time = cached.start_time
175+
example.start_time = checkpointed.start_time
177176

178-
# Use cached output and metadata obtained from the previous processing.
179-
example.output = cached.output
180-
example.metadata = cached.metadata
177+
# Use the output and metadata obtained from the previous processing.
178+
example.output = checkpointed.output
179+
example.metadata = checkpointed.metadata
181180
example.newly_processed = False
182181

183182
# For previously processed examples, we merge previous usages as
184183
# cached, so the usage summary will account previous usages, but as
185184
# cached.
186-
assert cached.usage_summary is not None
187-
usage_summary.merge(cached.usage_summary, as_cached=True)
185+
assert checkpointed.usage_summary is not None
186+
usage_summary.merge(checkpointed.usage_summary, as_cached=True)
188187

189188
# Recompute the metrics and metadata for the example even its processed
190189
# output and metadata were from the cache.
@@ -691,9 +690,29 @@ def _html_tree_view_css_styles(self) -> list[str]:
691690
class EvaluationState:
692691
"""Evaluation state."""
693692

693+
class ExampleStatus(pg.Object):
694+
"""Example state."""
695+
evaluated: Annotated[
696+
bool,
697+
'Whether the example is evaluated.'
698+
] = False
699+
700+
newly_processed: Annotated[
701+
bool,
702+
'Whether the example is newly processed.'
703+
] = False
704+
705+
has_error: Annotated[
706+
bool,
707+
'Whether the example has error.'
708+
] = False
709+
694710
def __init__(self):
695711
super().__init__()
696-
self._evaluated_examples: dict[int, example_lib.Example] = {}
712+
self._ckpt_examples: dict[int, example_lib.Example] = {}
713+
self._evaluation_status: dict[
714+
int, EvaluationState.ExampleStatus
715+
] = {}
697716

698717
def load(
699718
self,
@@ -715,17 +734,41 @@ def load(
715734
assert isinstance(example, example_lib.Example), example
716735
if filter is not None and not filter(example):
717736
continue
718-
self._evaluated_examples[example.id] = example
737+
example.newly_processed = False
738+
self._ckpt_examples[example.id] = example
719739

720740
@property
721-
def evaluated_examples(self) -> dict[int, example_lib.Example]:
722-
"""Returns the examples in the state."""
723-
return self._evaluated_examples
741+
def evaluation_status(self) -> dict[int, ExampleStatus]:
742+
"""Returns the evaluation status of the examples."""
743+
return self._evaluation_status
724744

725-
def get(self, example_id: int) -> example_lib.Example | None:
726-
"""Returns the example with the given ID."""
727-
return self._evaluated_examples.get(example_id)
745+
@property
746+
def ckpt_examples(self) -> dict[int, example_lib.Example]:
747+
"""Returns the unevaluated examples from checkpoints."""
748+
return self._ckpt_examples
749+
750+
def ckpt_example(self, example_id: int) -> example_lib.Example | None:
751+
"""Returns the unevaluated example from checkpoints for a given ID."""
752+
return self._ckpt_examples.get(example_id)
753+
754+
def get_status(self, example_id: int) -> ExampleStatus:
755+
"""Returns the evaluation status of the example."""
756+
return self._evaluation_status.get(
757+
example_id, EvaluationState.ExampleStatus()
758+
)
728759

729760
def update(self, example: example_lib.Example) -> None:
730761
"""Updates the state with the given example."""
731-
self._evaluated_examples[example.id] = example
762+
self._update_status(example)
763+
# Processed examples will be removed once it's done.
764+
self._ckpt_examples.pop(example.id, None)
765+
766+
def _update_status(self, example: example_lib.Example) -> None:
767+
"""Updates the evaluation status of the example."""
768+
self._evaluation_status[example.id] = (
769+
EvaluationState.ExampleStatus(
770+
evaluated=example.output != pg.MISSING_VALUE,
771+
newly_processed=example.newly_processed,
772+
has_error=example.has_error,
773+
)
774+
)

langfun/core/eval/v2/evaluation_test.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,9 @@ def my_inputs():
7878
def test_evaluate(self):
7979
exp = eval_test_helper.TestEvaluation()
8080
example = exp.evaluate(Example(id=3))
81-
self.assertIs(exp.state.get(3), example)
81+
self.assertTrue(exp.state.get_status(3).evaluated)
82+
self.assertTrue(exp.state.get_status(3).newly_processed)
83+
self.assertFalse(exp.state.get_status(3).has_error)
8284
self.assertTrue(example.newly_processed)
8385
self.assertEqual(example.input, pg.Dict(x=2, y=4, groundtruth=6))
8486
self.assertEqual(example.output, 6)
@@ -111,7 +113,7 @@ def test_evaluate(self):
111113
self.assertEqual(example.metadata, {})
112114
self.assertEqual(example.metric_metadata, dict(error='ValueError'))
113115

114-
def test_evaluate_with_state(self):
116+
def test_evaluate_withstate(self):
115117
eval_dir = os.path.join(tempfile.gettempdir(), 'test_eval')
116118
pg.io.mkdirs(eval_dir, exist_ok=True)
117119
state_file = os.path.join(eval_dir, 'state.jsonl')
@@ -121,13 +123,14 @@ def test_evaluate_with_state(self):
121123
self.assertTrue(example.newly_processed)
122124
self.assertEqual(example.input, pg.Dict(x=2, y=4, groundtruth=6))
123125
self.assertEqual(example.output, 6)
124-
self.assertEqual(len(exp._state.evaluated_examples), 1)
126+
self.assertEqual(len(exp.state.evaluation_status), 1)
125127
f.add(pg.to_json_str(example))
126128

127129
exp.reset()
128-
self.assertEqual(len(exp._state.evaluated_examples), 0)
130+
self.assertEqual(len(exp.state.ckpt_examples), 0)
129131
exp.load_state(state_file)
130-
self.assertEqual(len(exp._state.evaluated_examples), 1)
132+
self.assertEqual(len(exp.state.ckpt_examples), 1)
133+
self.assertEqual(len(exp.state.evaluation_status), 0)
131134
example = exp.evaluate(3)
132135
self.assertFalse(example.newly_processed)
133136
self.assertEqual(example.input, pg.Dict(x=2, y=4, groundtruth=6))
@@ -140,14 +143,14 @@ def test_evaluate_with_state(self):
140143

141144
# Test load_state with filter.
142145
exp.reset()
143-
self.assertEqual(len(exp._state.evaluated_examples), 0)
146+
self.assertEqual(len(exp.state.ckpt_examples), 0)
144147
exp.load_state(state_file, filter=lambda x: x.id == 3)
145-
self.assertEqual(len(exp._state.evaluated_examples), 1)
148+
self.assertEqual(len(exp.state.ckpt_examples), 1)
146149

147150
exp.reset()
148-
self.assertEqual(len(exp._state.evaluated_examples), 0)
151+
self.assertEqual(len(exp.state.ckpt_examples), 0)
149152
exp.load_state(state_file, filter=lambda x: x.id == 1)
150-
self.assertEqual(len(exp._state.evaluated_examples), 0)
153+
self.assertEqual(len(exp.state.ckpt_examples), 0)
151154

152155
def test_html_view(self):
153156
exp = eval_test_helper.TestEvaluation()

langfun/core/eval/v2/runners.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,8 @@ def _log_experiment_completion(self, experiment: Experiment):
181181
)
182182
num_from_checkpoint, num_processed = 0, 0
183183
for example_id in example_ids:
184-
example = experiment.state.get(example_id)
185-
if example.newly_processed:
184+
status = experiment.state.get_status(example_id)
185+
if status.newly_processed:
186186
num_processed += 1
187187
else:
188188
num_from_checkpoint += 1
@@ -358,7 +358,8 @@ def evaluate_item(
358358
"""Runs the evaluation example."""
359359
self.on_example_start(evaluation, item)
360360
item = evaluation.evaluate(
361-
item, raise_if_has_error=self.current_run.raise_if_has_error
361+
item,
362+
raise_if_has_error=self.current_run.raise_if_has_error,
362363
)
363364
self.on_example_complete(evaluation, item)
364365
return item

0 commit comments

Comments
 (0)