
Commit 615ffc8

Switch to {group, detector} for all provenance datasets.
Unfortunately we still need two dataset types, not just one, so the preprocessing provenance and main-pipeline provenance can coexist within the same RUN collection. But this is still a significant simplification.
1 parent: a6a4467
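
As a sketch of what the new scheme looks like in butler terms: both dataset types carry the same {group, detector} dimensions and differ only by name, and a RUN collection holds at most one dataset per {dataset type, data ID} pair, so the distinct names are what let the preprocessing and main-pipeline provenance graphs coexist in one run. The repository path below is hypothetical; the type names, dimensions, and storage class are taken from this commit.

from lsst.daf.butler import Butler, DatasetType

butler = Butler("/repo/example", writeable=True)  # hypothetical repository
dimensions = butler.dimensions.conform(["group", "detector"])

# Same dimensions and storage class; only the names differ, so a single
# {group, detector} data ID can carry both kinds of provenance in one run.
for name in ("prompt_preprocessing_provenance", "prompt_main_provenance"):
    butler.registry.registerDatasetType(
        DatasetType(name, dimensions, "ProvenanceQuantumGraph")
    )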

3 files changed: +42, -59 lines


python/activator/exception.py

Lines changed: 7 additions & 1 deletion
@@ -23,7 +23,7 @@
 __all__ = ["NonRetriableError", "RetriableError", "GracefulShutdownInterrupt", "TimeoutInterrupt",
            "InvalidVisitError", "IgnorableVisit",
            "InvalidPipelineError", "NoGoodPipelinesError",
-           "PipelinePreExecutionError", "PipelineExecutionError",
+           "PipelinePreExecutionError", "PipelineExecutionError", "ProvenanceDimensionsError",
            ]
 
 
@@ -144,3 +144,9 @@ class PipelineExecutionError(RuntimeError):
 
     Usually chained to an internal exception.
     """
+
+
+class ProvenanceDimensionsError(RuntimeError):
+    """Exception raised if the 'where' expression used to constrain a pipeline
+    does not yield a viable data ID for provenance.
+    """

python/activator/middleware_interface.py

Lines changed: 33 additions & 51 deletions
@@ -62,7 +62,8 @@
 from shared.visit import FannedOutVisit
 from .caching import DatasetCache
 from .exception import GracefulShutdownInterrupt, TimeoutInterrupt, NonRetriableError, RetriableError, \
-    InvalidPipelineError, NoGoodPipelinesError, PipelinePreExecutionError, PipelineExecutionError
+    InvalidPipelineError, NoGoodPipelinesError, PipelinePreExecutionError, PipelineExecutionError, \
+    ProvenanceDimensionsError
 from .timer import enforce_schema, time_this_to_bundle
 
 _log = logging.getLogger("lsst." + __name__)
@@ -441,24 +442,18 @@ def _init_governor_datasets(self, timestamp, skymap):
                                        collections=self._collection_skymap)
 
     def _init_provenance_dataset_types(self):
-        self._group_provenance_dataset_type = DatasetType(
-            "prompt_group_provenance",
+        self._preprocessing_provenance_dataset_type = DatasetType(
+            "prompt_preprocessing_provenance",
             self.butler.dimensions.conform(["group", "detector"]),
             "ProvenanceQuantumGraph",
         )
-        self.butler.registry.registerDatasetType(self._group_provenance_dataset_type)
-        self._visit_provenance_dataset_type = DatasetType(
-            "prompt_visit_provenance",
-            self.butler.dimensions.conform(["visit", "detector"]),
-            "ProvenanceQuantumGraph",
-        )
-        self.butler.registry.registerDatasetType(self._visit_provenance_dataset_type)
-        self._exposure_provenance_dataset_type = DatasetType(
-            "prompt_exposure_provenance",
-            self.butler.dimensions.conform(["exposure", "detector"]),
+        self.butler.registry.registerDatasetType(self._preprocessing_provenance_dataset_type)
+        self._main_provenance_dataset_type = DatasetType(
+            "prompt_main_provenance",
+            self.butler.dimensions.conform(["group", "detector"]),
             "ProvenanceQuantumGraph",
         )
-        self.butler.registry.registerDatasetType(self._exposure_provenance_dataset_type)
+        self.butler.registry.registerDatasetType(self._main_provenance_dataset_type)
 
     def _define_dimensions(self):
         """Define any dimensions that must be computed from this object's visit.
@@ -1395,9 +1390,10 @@ def _try_pipelines(self, pipelines, in_collections, data_ids, *, label, provenan
                 # Diagnostic logs are the responsibility of GraphBuilder.
                 _log.error(f"Empty quantum graph for {pipeline_file}; see previous logs for details.")
                 continue
-            provenance_ref = self._make_provenance_ref(provenance_dataset_type, qgraph, pipeline_file)
-            if provenance_ref is None:
-                # An error log is always emitted if None is returned.
+            try:
+                provenance_ref = self._make_provenance_ref(provenance_dataset_type, data_ids, output_run)
+            except ProvenanceDimensionsError:
+                _log.exception(f"Failed to determine data ID for provenance for {pipeline_file}.")
                 continue
             # Past this point, partial execution creates datasets.
             # Don't retry -- either fail (raise) or break.
@@ -1424,41 +1420,32 @@ def _try_pipelines(self, pipelines, in_collections, data_ids, *, label, provenan
         else:
             raise NoGoodPipelinesError(f"No {label} pipeline graph could be built.")
 
-    def _make_provenance_ref(self, dataset_type, qg, pipeline_file):
+    def _make_provenance_ref(self, dataset_type, where, output_run):
         """Make the provenance DatasetRef for a quantum graph.
 
         Parameters
         ----------
         dataset_type : `lsst.daf.butler.DatasetType`
             Provenance dataset type for this pipeline.
-        qg : `lsst.pipe.base.quantum_graph.PredictedQuantumGraph`
-            Quantum graph that predicts execution.
-        pipeline_file : `str`
-            Name of the pipeline (for log messages).
+        where : `str`
+            Butler query expression that can be related to a single
+            ``{group, detector}`` data ID.
+        output_run : `str`
+            Output RUN collection.
 
         Returns
         -------
-        ref : `lsst.daf.butler.DatasetRef` or `None`
-            A reference to a to-be-written provenance dataset, or `None` if the
-            quantum graph and the provenance dataset type are incompatible.
-            Error logs are always emitted when `None` is returned.
+        ref : `lsst.daf.butler.DatasetRef`
+            A reference to a to-be-written provenance dataset.
         """
-        for task_node in qg.pipeline_graph.tasks.values():
-            if task_node.dimensions == dataset_type.dimensions:
-                data_ids = qg.quanta_by_task[task_node.label].keys()
-                if len(data_ids) == 1:
-                    return DatasetRef(dataset_type, next(iter(data_ids)), run=qg.header.output_run)
-                else:
-                    _log.error(
-                        f"Task {task_node.label} in pipeline {pipeline_file} has multiple quanta for the "
-                        f"dimensions {dataset_type.dimensions} of the provenance dataset."
-                    )
-                    return None
-        _log.error(
-            f"Pipeline {pipeline_file} has has no tasks with the "
-            f"dimensions {dataset_type.dimensions} of the provenance dataset."
-        )
-        return None
+        query_results = self.butler.query_data_ids(dataset_type.dimensions, where=where, explain=False)
+        try:
+            (data_id,) = query_results
+        except ValueError:
+            raise ProvenanceDimensionsError(
+                f"Expected exactly one data ID for {dataset_type}; got {query_results}."
+            ) from None
+        return DatasetRef(dataset_type, data_id, run=output_run)
 
     def _run_preprocessing(self) -> None:
         """Preprocess a visit ahead of incoming image(s).
@@ -1493,7 +1480,7 @@ def _run_preprocessing(self) -> None:
             in_collections=[preload_run],
             data_ids=where,
             label="preprocessing",
-            provenance_dataset_type=self._group_provenance_dataset_type,
+            provenance_dataset_type=self._preprocessing_provenance_dataset_type,
         )
 
     def _check_permanent_changes(self, where: str) -> bool:
@@ -1578,17 +1565,12 @@ def run_pipeline(self, exposure_ids: set[int]) -> None:
         # faked raw file and appropriate SSO data during prep (and then
         # cleanup when ingesting the real data).
         try:
-            visits_defined = self.define_visits.run({"instrument": self.instrument.getName(),
-                                                     "exposure": exp} for exp in exposure_ids)
+            self.define_visits.run({"instrument": self.instrument.getName(),
+                                    "exposure": exp} for exp in exposure_ids)
         except lsst.daf.butler.registry.DataIdError as e:
             # TODO: a good place for a custom exception?
             raise RuntimeError("No data to process.") from e
 
-        if visits_defined.n_visits:
-            provenance_dataset_type = self._visit_provenance_dataset_type
-        else:
-            provenance_dataset_type = self._exposure_provenance_dataset_type
-
         # Inefficient, but most graph builders can't take equality constraints
         where = (
             f"instrument='{self.visit.instrument}' and detector={self.visit.detector}"
@@ -1603,7 +1585,7 @@ def run_pipeline(self, exposure_ids: set[int]) -> None:
             in_collections=pre_runs + [preload_run],
             data_ids=where,
             label="main",
-            provenance_dataset_type=provenance_dataset_type,
+            provenance_dataset_type=self._main_provenance_dataset_type,
         )
         # Catch Exception just in case there's a surprise -- raising
         # NonRetriableError on *all* irrevocable changes is important.

tests/test_middleware_interface.py

Lines changed: 2 additions & 7 deletions
@@ -748,17 +748,12 @@ def _check_run_pipeline_fallback(self, callable, pipe_files, graphs, final_label
             # dataset types need to have the same dimensions.
             unittest.mock.patch.object(
                 self.interface,
-                "_group_provenance_dataset_type",
+                "_preprocessing_provenance_dataset_type",
                 test_provenance_dataset_type
             ),
             unittest.mock.patch.object(
                 self.interface,
-                "_visit_provenance_dataset_type",
-                test_provenance_dataset_type
-            ),
-            unittest.mock.patch.object(
-                self.interface,
-                "_exposure_provenance_dataset_type",
+                "_main_provenance_dataset_type",
                 test_provenance_dataset_type
             ),
             self.assertLogs(self.logger_name, level="INFO") as logs,
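
For reference, a toy sketch of the stacked patch.object style this test relies on, with a SimpleNamespace standing in for the real interface object (all names here are illustrative):

import unittest.mock
from types import SimpleNamespace

interface = SimpleNamespace(
    _preprocessing_provenance_dataset_type="real_preprocessing_type",
    _main_provenance_dataset_type="real_main_type",
)
test_provenance_dataset_type = "shared_test_type"

# Both attributes temporarily point at one test dataset type, so the
# pipeline-fallback path sees provenance types with matching dimensions.
with (
    unittest.mock.patch.object(
        interface, "_preprocessing_provenance_dataset_type",
        test_provenance_dataset_type,
    ),
    unittest.mock.patch.object(
        interface, "_main_provenance_dataset_type",
        test_provenance_dataset_type,
    ),
):
    assert interface._preprocessing_provenance_dataset_type == "shared_test_type"

# On exit, the original attributes are restored.
assert interface._main_provenance_dataset_type == "real_main_type"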
