Skip to content

Commit 8705bec

Browse files
committed
Enable provenance recording.
1 parent 279cf2a commit 8705bec

File tree

2 files changed

+103
-6
lines changed

2 files changed

+103
-6
lines changed

python/activator/middleware_interface.py

Lines changed: 78 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,7 @@ def __init__(self, read_butler: Butler, butler_writer: ButlerWriter, image_bucke
363363
self._define_dimensions()
364364
self._init_ingester()
365365
self._init_visit_definer()
366+
self._init_provenance_dataset_types()
366367

367368
# How much to pad the spatial region we will copy over.
368369
self.padding = padding*lsst.geom.arcseconds
@@ -439,6 +440,26 @@ def _init_governor_datasets(self, timestamp, skymap):
439440
self.skymap = self.read_central_butler.get("skyMap", skymap=self.skymap_name,
440441
collections=self._collection_skymap)
441442

443+
def _init_provenance_dataset_types(self):
444+
self._group_provenance_dataset_type = DatasetType(
445+
"prompt_group_provenance",
446+
self.butler.dimensions.conform(["group", "detector"]),
447+
"ProvenanceQuantumGraph",
448+
)
449+
self.butler.registry.registerDatasetType(self._group_provenance_dataset_type)
450+
self._visit_provenance_dataset_type = DatasetType(
451+
"prompt_visit_provenance",
452+
self.butler.dimensions.conform(["visit", "detector"]),
453+
"ProvenanceQuantumGraph",
454+
)
455+
self.butler.registry.registerDatasetType(self._visit_provenance_dataset_type)
456+
self._exposure_provenance_dataset_type = DatasetType(
457+
"prompt_exposure_provenance",
458+
self.butler.dimensions.conform(["exposure", "detector"]),
459+
"ProvenanceQuantumGraph",
460+
)
461+
self.butler.registry.registerDatasetType(self._exposure_provenance_dataset_type)
462+
442463
def _define_dimensions(self):
443464
"""Define any dimensions that must be computed from this object's visit.
444465
@@ -1292,14 +1313,14 @@ def _get_graph_executor(self, butler, factory):
12921313
)
12931314
graph_executor = MPGraphExecutor(
12941315
# TODO: re-enable parallel execution once we can log as desired with CliLog or a successor
1295-
# (see issues linked from DM-42063)
1316+
# (see issues linked from DM-42063) AND once provenance is supported with multiprocessing.
12961317
num_proc=1, # Avoid spawning processes, because they bypass our logger
12971318
timeout=2_592_000.0, # In practice, timeout is never helpful; set to 30 days.
12981319
quantum_executor=quantum_executor,
12991320
)
13001321
return graph_executor
13011322

1302-
def _try_pipelines(self, pipelines, in_collections, data_ids, *, label):
1323+
def _try_pipelines(self, pipelines, in_collections, data_ids, *, label, provenance_dataset_type):
13031324
"""Attempt to run pipelines from a prioritized list.
13041325
13051326
On success, exactly one of the pipelines is run, with outputs going to
@@ -1320,6 +1341,10 @@ def _try_pipelines(self, pipelines, in_collections, data_ids, *, label):
13201341
label : `str`
13211342
A unique name to disambiguate this pipeline run for logging
13221343
purposes.
1344+
provenance_dataset_type : `lsst.daf.butler.DatasetType`
1345+
The butler dataset type used to store provenance information. Must have
1346+
dimensions that match the tasks of the pipeline and use the
1347+
"ProvenanceQuantumGraph" storage class.
13231348
13241349
Returns
13251350
-------
@@ -1370,6 +1395,10 @@ def _try_pipelines(self, pipelines, in_collections, data_ids, *, label):
13701395
# Diagnostic logs are the responsibility of GraphBuilder.
13711396
_log.error(f"Empty quantum graph for {pipeline_file}; see previous logs for details.")
13721397
continue
1398+
provenance_ref = self._make_provenance_ref(provenance_dataset_type, qg, pipeline_file)
1399+
if provenance_ref is None:
1400+
# An error log is always emitted if None is returned.
1401+
continue
13731402
# Past this point, partial execution creates datasets.
13741403
# Don't retry -- either fail (raise) or break.
13751404

@@ -1381,7 +1410,8 @@ def _try_pipelines(self, pipelines, in_collections, data_ids, *, label):
13811410
_log, msg=f"executor.run_pipeline ({label})", level=logging.DEBUG):
13821411
executor.run_pipeline(
13831412
qg,
1384-
graph_executor=self._get_graph_executor(exec_butler, factory)
1413+
graph_executor=self._get_graph_executor(exec_butler, factory),
1414+
provenance_dataset_ref=provenance_ref,
13851415
)
13861416
_log.info(f"{label.capitalize()} pipeline successfully run.")
13871417
return output_run
@@ -1394,6 +1424,42 @@ def _try_pipelines(self, pipelines, in_collections, data_ids, *, label):
13941424
else:
13951425
raise NoGoodPipelinesError(f"No {label} pipeline graph could be built.")
13961426

1427+
def _make_provenance_ref(self, dataset_type, qg, pipeline_file):
1428+
"""Make the provenance DatasetRef for a quantum graph.
1429+
1430+
Parameters
1431+
----------
1432+
dataset_type : `lsst.daf.butler.DatasetType`
1433+
Provenance dataset type for this pipeline.
1434+
qg : `lsst.pipe.base.quantum_graph.PredictedQuantumGraph`
1435+
Quantum graph that predicts execution.
1436+
pipeline_file : `str`
1437+
Name of the pipeline (for log messages).
1438+
1439+
Returns
1440+
-------
1441+
ref : `lsst.daf.butler.DatasetRef` or `None`
1442+
A reference to a to-be-written provenance dataset, or `None` if the
1443+
quantum graph and the provenance dataset type are incompatible.
1444+
Error logs are always emitted when `None` is returned.
1445+
"""
1446+
for task_node in qg.pipeline_graph.tasks.values():
1447+
if task_node.dimensions == dataset_type.dimensions:
1448+
data_ids = qg.quanta_by_task[task_node.label].keys()
1449+
if len(data_ids) == 1:
1450+
return DatasetRef(dataset_type, next(iter(data_ids)), run=qg.header.output_run)
1451+
else:
1452+
_log.error(
1453+
f"Task {task_node.label} in pipeline {pipeline_file} has multiple quanta for the "
1454+
f"dimensions {dataset_type.dimensions} of the provenance dataset."
1455+
)
1456+
return None
1457+
_log.error(
1458+
f"Pipeline {pipeline_file} has no tasks with the "
1459+
f"dimensions {dataset_type.dimensions} of the provenance dataset."
1460+
)
1461+
return None
1462+
13971463
def _run_preprocessing(self) -> None:
13981464
"""Preprocess a visit ahead of incoming image(s).
13991465
@@ -1427,6 +1493,7 @@ def _run_preprocessing(self) -> None:
14271493
in_collections=[preload_run],
14281494
data_ids=where,
14291495
label="preprocessing",
1496+
provenance_dataset_type=self._group_provenance_dataset_type,
14301497
)
14311498

14321499
def _check_permanent_changes(self, where: str) -> bool:
@@ -1511,12 +1578,17 @@ def run_pipeline(self, exposure_ids: set[int]) -> None:
15111578
# faked raw file and appropriate SSO data during prep (and then
15121579
# cleanup when ingesting the real data).
15131580
try:
1514-
self.define_visits.run({"instrument": self.instrument.getName(),
1515-
"exposure": exp} for exp in exposure_ids)
1581+
visits_defined = self.define_visits.run({"instrument": self.instrument.getName(),
1582+
"exposure": exp} for exp in exposure_ids)
15161583
except lsst.daf.butler.registry.DataIdError as e:
15171584
# TODO: a good place for a custom exception?
15181585
raise RuntimeError("No data to process.") from e
15191586

1587+
if visits_defined.n_visits:
1588+
provenance_dataset_type = self._visit_provenance_dataset_type
1589+
else:
1590+
provenance_dataset_type = self._exposure_provenance_dataset_type
1591+
15201592
# Inefficient, but most graph builders can't take equality constraints
15211593
where = (
15221594
f"instrument='{self.visit.instrument}' and detector={self.visit.detector}"
@@ -1531,6 +1603,7 @@ def run_pipeline(self, exposure_ids: set[int]) -> None:
15311603
in_collections=pre_runs + [preload_run],
15321604
data_ids=where,
15331605
label="main",
1606+
provenance_dataset_type=provenance_dataset_type,
15341607
)
15351608
# Catch Exception just in case there's a surprise -- raising
15361609
# NonRetriableError on *all* irrevocable changes is important.

tests/test_middleware_interface.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,9 @@
4141
import lsst.afw.image
4242
import lsst.afw.table
4343
from lsst.dax.apdb import ApdbSql
44-
from lsst.daf.butler import Butler, CollectionType, DataCoordinate, DimensionUniverse, EmptyQueryResultError
44+
from lsst.daf.butler import (
45+
Butler, CollectionType, DataCoordinate, DatasetType, DimensionUniverse, EmptyQueryResultError
46+
)
4547
import lsst.daf.butler.tests as butler_tests
4648
from lsst.obs.base.formatters.fitsExposure import FitsImageFormatter
4749
from lsst.obs.base.ingest import RawFileDatasetInfo, RawFileData
@@ -718,6 +720,11 @@ def _check_run_pipeline_fallback(self, callable, pipe_files, graphs, final_label
718720
The description of the pipeline that should be run, given
719721
``pipe_files`` and ``graphs``.
720722
"""
723+
test_provenance_dataset_type = DatasetType(
724+
"test_provenance",
725+
self.interface.butler.dimensions.conform(["detector"]),
726+
"ProvenanceQuantumGraph"
727+
)
721728
with (
722729
unittest.mock.patch(
723730
"activator.middleware_interface.MiddlewareInterface.get_pre_pipeline_files",
@@ -737,6 +744,23 @@ def _check_run_pipeline_fallback(self, callable, pipe_files, graphs, final_label
737744
unittest.mock.patch(
738745
"activator.middleware_interface.SeparablePipelineExecutor.run_pipeline"
739746
) as mock_run,
747+
# Mocked QGs do not have realistic dimensions, and provenance
748+
# dataset types need to have the same dimensions.
749+
unittest.mock.patch.object(
750+
self.interface,
751+
"_group_provenance_dataset_type",
752+
test_provenance_dataset_type
753+
),
754+
unittest.mock.patch.object(
755+
self.interface,
756+
"_visit_provenance_dataset_type",
757+
test_provenance_dataset_type
758+
),
759+
unittest.mock.patch.object(
760+
self.interface,
761+
"_exposure_provenance_dataset_type",
762+
test_provenance_dataset_type
763+
),
740764
self.assertLogs(self.logger_name, level="INFO") as logs
741765
):
742766
callable()

0 commit comments

Comments
 (0)