
Commit ce90c1e

Hapixrtmiz authored
Add workflow provenance tracking with yProv4WFS integration (#98)
Co-authored-by: Hapix <[email protected]>
Co-authored-by: Gerald Walter Irsiegler <[email protected]>
Co-authored-by: Gerald Walter Irsiegler <[email protected]>
1 parent: b6f367c · commit: ce90c1e

File tree: 3 files changed, +216 −5 lines

openeo_pg_parser_networkx/graph.py
pyproject.toml
tests/test_pg_provenance.py

openeo_pg_parser_networkx/graph.py

Lines changed: 146 additions & 4 deletions
@@ -4,17 +4,27 @@
 
 sys.setrecursionlimit(16385)  # Necessary when parsing really big graphs
 import functools
+
+## For yprov4wfs
 import json
 import logging
+import os
 import random
+import uuid
 from collections import namedtuple
 from dataclasses import dataclass, field
-from functools import partial
+from datetime import datetime
+from functools import partial, wraps
 from pathlib import Path
 from typing import Callable, Optional, Union
 from uuid import UUID
 
+import dask.array as da
 import networkx as nx
+import xarray as xr
+from yprov4wfs.datamodel.data import Data
+from yprov4wfs.datamodel.task import Task
+from yprov4wfs.datamodel.workflow import Workflow
 
 from openeo_pg_parser_networkx.pg_schema import (
     PGEdgeType,
@@ -70,6 +80,10 @@ def __repr__(self):
 
 class OpenEOProcessGraph:
     def __init__(self, pg_data: dict):
+        # Make a workflow object
+        self.workflow = Workflow('openeo_workflow', 'OpenEO Workflow')
+        self.workflow._engineWMS = "Openeo-Workflow"
+        self.workflow._level = "0"
         self.G = nx.DiGraph()
 
         # Save pg_data for resolving later on
@@ -377,7 +391,7 @@ def node_callable(*args, parent_callables, named_parameters=None, **kwargs):
             # The node needs to first call all its parents, so that results are prepopulated in the results_cache
             for func in parent_callables:
                 func(*args, named_parameters=named_parameters, **kwargs)
-
+            cache_users = {}
             try:
                 # If this node has already been computed once, just grab that result from the results_cache instead of recomputing it.
                 # This cannot be done for aggregated data as the wrapped function has to be called multiple times with different values.
@@ -411,13 +425,108 @@ def node_callable(*args, parent_callables, named_parameters=None, **kwargs):
                             kwargs[arg_sub.arg_name] = self.G.nodes(data=True)[node][
                                 "resolved_kwargs"
                             ].__getitem__(arg_sub.arg_name)
-
-                result = prebaked_process_impl(
+                            # Make a dictionary from the nodes that uses the outputs of the other nodes
+                            if source_node not in cache_users:
+                                cache_users[source_node] = []
+                            cache_users[source_node].append(node)
+                # Make the tasks
+                task = Task(node, node_with_data['process_id'])
+                # result = prebaked_process_impl(
+                #     *args, named_parameters=named_parameters, **kwargs
+                # )
+                result, execution_data = self.profile_function(prebaked_process_impl)(
                     *args, named_parameters=named_parameters, **kwargs
                 )
 
+                if isinstance(result, xr.DataArray):
+                    processed_result = {
+                        "entity_type": "xarray.DataArray",
+                        "info": {
+                            "shape": result.shape,
+                            "dimensions": list(result.dims),
+                            # "attributes": result.attrs,
+                            "dtype": str(result.dtype),
+                        },
+                    }
+
+                elif isinstance(result, da.Array):
+                    processed_result = {
+                        "entity_type": "dask.Array",
+                        "info": {
+                            "shape": result.shape,
+                            "dtype": str(result.dtype),
+                            "chunk_size": result.chunksize,
+                            "chunk_type": type(result._meta).__name__,
+                        },
+                    }
+                else:
+                    processed_result = {}
+                    processed_result['info'] = result
+                    processed_result['entity_type'] = type(result).__name__
+                if result is not None:
+                    results_cache_node = Data(
+                        str(uuid.uuid4()), processed_result['entity_type']
+                    )
+                    results_cache_node._info = processed_result['info']
+                    task.add_output(results_cache_node)
+                    self.workflow.add_data(results_cache_node)
                 results_cache[node] = result
 
+                # Loading data info
+                process_id = node_with_data.get("process_id")
+                resolved_kwargs = node_with_data.get("resolved_kwargs", {})
+
+                if process_id in ("load_stac", "load_collection"):
+                    key = "url" if process_id == "load_stac" else "id"
+                    raw_source = resolved_kwargs.get(key, "")
+                    data_source = raw_source.split("\\")[-1]
+
+                    data_src = Data(str(uuid.uuid4()), data_source)
+                    # Extract extra information
+                    if process_id == "load_stac":
+                        data_src._info = resolved_kwargs
+
+                task._start_time = execution_data['start_time']
+                task._end_time = execution_data['end_time']
+                task._status = execution_data['task_status']
+                task._level = "1"
+
+                # This is just for load stac ( for the temporary usage)
+                if node_with_data['process_id'] in ["load_stac", "load_collection"]:
+                    task.add_input(data_src)
+
+                self.workflow.add_task(task)
+
+                if cache_users:
+                    for source_node, target_node in cache_users.items():
+                        output_data_from_source = (
+                            self.workflow.get_task_by_id(source_node)._outputs[0]._id
+                        )
+                        for target in target_node:
+                            self.workflow.get_task_by_id(target).add_input(
+                                self.workflow.get_data_by_id(output_data_from_source)
+                            )
+
+                edges = [
+                    {"source": source, "target": target, "type": data["reference_type"]}
+                    for source, target, data in self.G.edges(node, data=True)
+                ]
+
+                for edge in edges:
+                    self.workflow.get_task_by_id(edge['source']).set_next(
+                        self.workflow.get_task_by_id(edge['target'])
+                    )
+
+                if node == self.result_node:
+                    self.workflow._status = "Ok"
+
+                # To save the provenance
+                # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+                # save_path = os.path.join(os.getcwd(), f"run_{timestamp}")
+                # print(f"Provenance file saved to: {save_path}")
+                # os.makedirs(save_path, exist_ok=True)
+                # self.workflow.prov_to_json(directory_path=save_path)
+
             return result
 
         return partial(node_callable, parent_callables=parent_callables)
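The commented-out block at the end of this hunk sketches how the collected provenance is meant to be persisted. A minimal end-to-end sketch along those lines, combining it with the dummy registry used in the new test below (the graph path is a placeholder, not part of this commit):

import json
import os
from datetime import datetime

from openeo_pg_parser_networkx import OpenEOProcessGraph
from openeo_pg_parser_networkx.process_registry import Process, ProcessRegistry

# Placeholder path: any flat openEO process graph JSON.
with open("flat_process_graph.json") as f:
    flat_pg = json.load(f)

pg = OpenEOProcessGraph(flat_pg)

# Echo registry as used in tests/test_pg_provenance.py: every process returns its first argument.
registry = ProcessRegistry(wrap_funcs=[])
for process_id in pg.required_processes:
    registry[process_id] = Process(
        spec={},
        implementation=lambda *args, **kwargs: args[0] if args else None,
        namespace="predefined",
    )

result = pg.to_callable(registry)()

# pg.workflow is populated as a side effect of executing the callable;
# saving it mirrors the commented-out block above.
save_path = os.path.join(os.getcwd(), f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
os.makedirs(save_path, exist_ok=True)
pg.workflow.prov_to_json(directory_path=save_path)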
@@ -516,3 +625,36 @@ def plot(self, reverse=False):
 
         if reverse:
             self.G = self.G.reverse()
+
+    @staticmethod
+    def profile_function(func):
+        """Decorator to track execution performance and return both result and profiling data.
+        In the case in the future there will be some more metrics of intrest (like cpu and memory
+        usage) to extract."""
+
+        @wraps(func)
+        def wrapper(*args, named_parameters, **kwargs):
+            start_dt = datetime.now()
+            start_timestamp = start_dt.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+
+            try:
+                result = func(*args, named_parameters, **kwargs)
+                status = "Ok"
+            except Exception as e:
+                result = str(e)
+                status = f"Error: {result[:70]}"
+
+            end_dt = datetime.now()
+            end_timestamp = end_dt.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+            execution_time = (end_dt - start_dt).total_seconds()
+            execution_data = {
+                # "function": func.__name__,
+                "task_status": status,
+                "start_time": start_timestamp,
+                "end_time": end_timestamp,
+                "execution_time_sec": round(execution_time, 4),
+            }
+            # Return both the result and profiling data
+            return result, execution_data
+
+        return wrapper
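Since profile_function is a @staticmethod with no dependence on graph state, its (result, execution_data) contract can be exercised on its own. A toy sketch (the add function is illustrative, not part of the commit); note that the wrapper takes named_parameters as keyword-only but forwards it positionally to the wrapped function:

from openeo_pg_parser_networkx import OpenEOProcessGraph

def add(x, y, named_parameters=None):
    return x + y

result, execution_data = OpenEOProcessGraph.profile_function(add)(2, 3, named_parameters={})
print(result)                                # 5
print(execution_data["task_status"])         # "Ok"
print(execution_data["start_time"])          # e.g. "2025-05-01 12:00:00.000"
print(execution_data["execution_time_sec"])  # small float, e.g. 0.0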

pyproject.toml

Lines changed: 4 additions & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "openeo-pg-parser-networkx"
-version = "2024.10.1"
+version = "2025.5.1"
 
 description = "Parse OpenEO process graphs from JSON to traversible Python objects."
 authors = ["Lukas Weidenholzer <[email protected]>", "Sean Hoyal <[email protected]>", "Valentina Hutter <[email protected]>", "Gerald Irsiegler <[email protected]>"]
@@ -33,6 +33,9 @@ numpy = "^1.20.3"
 pendulum = "^2.1.2"
 matplotlib = { version = "^3.7.1", optional = true }
 traitlets = "<=5.9.0"
+yprov4wfs = ">=0.0.8"
+xarray = ">=2022.11.0,<=2024.3.0"
+dask = ">=2023.4.0,<2025.2.0"
 
 [tool.poetry.group.dev.dependencies]
 matplotlib = "^3.7.1"
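A quick way to confirm that the three new runtime dependencies resolve in an existing environment (a small optional check, not part of the commit):

from importlib.metadata import version

for pkg in ("yprov4wfs", "xarray", "dask"):
    print(pkg, version(pkg))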

tests/test_pg_provenance.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+import json
+
+import pytest
+from yprov4wfs.datamodel.data import Data
+from yprov4wfs.datamodel.task import Task
+from yprov4wfs.datamodel.workflow import Workflow
+
+from openeo_pg_parser_networkx import OpenEOProcessGraph
+from openeo_pg_parser_networkx.process_registry import Process, ProcessRegistry
+
+
+def test_execute_returns_result_and_workflow(process_graph_path):
+    """
+    Test that OpenEOProcessGraph returns result and workflow correctly
+    for all sample graphs, using a mock registry based on required processes.
+    """
+
+    with open(process_graph_path) as f:
+        flat_pg = json.load(f)
+
+    pg = OpenEOProcessGraph(flat_pg)
+
+    mock_registry = ProcessRegistry(wrap_funcs=[])
+    for process_id in pg.required_processes:
+        mock_registry[process_id] = Process(
+            spec={},
+            implementation=lambda *args, **kwargs: args[0] if args else None,
+            namespace="predefined",
+        )
+
+    # Create callable and execute
+    result = pg.to_callable(mock_registry)()
+    workflow = pg.workflow
+
+    # Assertions
+    assert result is not None, "Result should not be None"
+    assert workflow is not None, "Workflow should not be None"
+    assert isinstance(
+        workflow, Workflow
+    ), "Workflow should be a yprov4wfs.Workflow instance"
+    assert len(workflow._tasks) > 0, "Workflow should have at least one task"
+    assert workflow._status in ["Ok", "Error"], "Workflow status should be Ok or Error"
+
+    # Test the tasks
+    assert isinstance(workflow._tasks, list), "Workflow._tasks should be a list"
+    for task in workflow._tasks:
+        # Each task should be a Task instance
+        assert isinstance(
+            task, Task
+        ), f"Each task should be a Task instance but got {type(task)}"
+        assert hasattr(task, "_id"), "Task must have an _id"
+        assert hasattr(task, "_name"), "Task must have a _name"
+        assert hasattr(task, "_start_time"), "Task must have a start_time"
+        assert hasattr(task, "_end_time"), "Task must have an end_time"
+        assert hasattr(task, "_status"), "Task must have a status"
+        assert hasattr(task, "_inputs"), "Task must have _inputs"
+        assert hasattr(task, "_outputs"), "Task must have _outputs"
+
+    # Test the data
+    assert isinstance(workflow._data, list), "Workflow._data should be a list"
+    for data in workflow._data:
+        assert isinstance(
+            data, Data
+        ), f"Each data node should be a Data instance but got {type(data)}"
+        assert hasattr(data, "_id"), "Data must have an _id"
+        assert hasattr(data, "_name"), "Data must have a _name"
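The process_graph_path argument is a pytest fixture that this commit does not add; it is presumably defined in the tests' conftest and parametrized over the repository's sample flat process graphs. A hypothetical sketch of such a fixture, with placeholder paths:

from pathlib import Path

import pytest

# Hypothetical location of the sample flat process graphs used by the test suite.
GRAPHS_DIR = Path(__file__).parent / "data" / "graphs"


@pytest.fixture(params=sorted(GRAPHS_DIR.glob("*.json")), ids=lambda p: p.name)
def process_graph_path(request):
    """Yield one sample process-graph path per parametrized run."""
    return request.param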
