 
 sys.setrecursionlimit(16385)  # Necessary when parsing really big graphs
 import functools
+
+## For yprov4wfs
 import json
 import logging
+import os
 import random
+import uuid
 from collections import namedtuple
 from dataclasses import dataclass, field
-from functools import partial
+from datetime import datetime
+from functools import partial, wraps
 from pathlib import Path
 from typing import Callable, Optional, Union
 from uuid import UUID
 
+import dask.array as da
 import networkx as nx
+import xarray as xr
+from yprov4wfs.datamodel.data import Data
+from yprov4wfs.datamodel.task import Task
+from yprov4wfs.datamodel.workflow import Workflow
 
 from openeo_pg_parser_networkx.pg_schema import (
     PGEdgeType,
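
Note: dask.array and xarray are imported only so that intermediate results can be summarised for provenance rather than stored wholesale. For an xarray result, the node_callable changes further down record metadata roughly of this shape (the values here are illustrative, not from the diff):

    {
        "entity_type": "xarray.DataArray",
        "info": {"shape": (3, 256, 256), "dimensions": ["bands", "y", "x"], "dtype": "float32"},
    }
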
@@ -70,6 +80,10 @@ def __repr__(self):
 
 class OpenEOProcessGraph:
     def __init__(self, pg_data: dict):
+        # Make a workflow object
+        self.workflow = Workflow('openeo_workflow', 'OpenEO Workflow')
+        self.workflow._engineWMS = "Openeo-Workflow"
+        self.workflow._level = "0"
         self.G = nx.DiGraph()
 
         # Save pg_data for resolving later on
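
Note: with this change every OpenEOProcessGraph carries a yprov4wfs Workflow that is populated while the graph executes. A minimal usage sketch follows; the process-graph file, the registry, and the to_callable() entry point are assumptions for illustration and not part of this diff, and the prov_to_json() call mirrors the commented-out save block further down:

    import json
    from pathlib import Path

    pg_data = json.loads(Path("process_graph.json").read_text())  # hypothetical graph
    parsed_graph = OpenEOProcessGraph(pg_data=pg_data)

    # Assumed entry point: the parser's usual to_callable() plus a process registry.
    pg_callable = parsed_graph.to_callable(process_registry=process_registry)
    result = pg_callable()

    # One Task per executed node has now been recorded on the workflow.
    parsed_graph.workflow.prov_to_json(directory_path="provenance_run")
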
@@ -377,7 +391,7 @@ def node_callable(*args, parent_callables, named_parameters=None, **kwargs):
             # The node needs to first call all its parents, so that results are prepopulated in the results_cache
             for func in parent_callables:
                 func(*args, named_parameters=named_parameters, **kwargs)
-
+            cache_users = {}
             try:
                 # If this node has already been computed once, just grab that result from the results_cache instead of recomputing it.
                 # This cannot be done for aggregated data as the wrapped function has to be called multiple times with different values.
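
Note: cache_users, initialised per call above, simply maps a producing node to the nodes that later read its cached result, e.g. (node ids are hypothetical):

    cache_users = {
        "loadcollection1": ["reducedimension1"],
        "reducedimension1": ["saveresult1"],
    }

It is consulted near the end of node_callable to attach each producer's output Data entity as an input of its consumer Tasks.
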
@@ -411,13 +425,108 @@ def node_callable(*args, parent_callables, named_parameters=None, **kwargs):
                         kwargs[arg_sub.arg_name] = self.G.nodes(data=True)[node][
                             "resolved_kwargs"
                         ].__getitem__(arg_sub.arg_name)
-
-            result = prebaked_process_impl(
+                        # Record that this node consumes the cached output of source_node
+                        if source_node not in cache_users:
+                            cache_users[source_node] = []
+                        cache_users[source_node].append(node)
+            # Create the provenance task for this node
+            task = Task(node, node_with_data['process_id'])
+            # result = prebaked_process_impl(
+            #     *args, named_parameters=named_parameters, **kwargs
+            # )
+            result, execution_data = self.profile_function(prebaked_process_impl)(
                 *args, named_parameters=named_parameters, **kwargs
             )
 
+            if isinstance(result, xr.DataArray):
+                processed_result = {
+                    "entity_type": "xarray.DataArray",
+                    "info": {
+                        "shape": result.shape,
+                        "dimensions": list(result.dims),
+                        # "attributes": result.attrs,
+                        "dtype": str(result.dtype),
+                    },
+                }
+
+            elif isinstance(result, da.Array):
+                processed_result = {
+                    "entity_type": "dask.Array",
+                    "info": {
+                        "shape": result.shape,
+                        "dtype": str(result.dtype),
+                        "chunk_size": result.chunksize,
+                        "chunk_type": type(result._meta).__name__,
+                    },
+                }
+            else:
+                processed_result = {}
+                processed_result['info'] = result
+                processed_result['entity_type'] = type(result).__name__
+            if result is not None:
+                results_cache_node = Data(
+                    str(uuid.uuid4()), processed_result['entity_type']
+                )
+                results_cache_node._info = processed_result['info']
+                task.add_output(results_cache_node)
+                self.workflow.add_data(results_cache_node)
             results_cache[node] = result
 
+            # Loading data info
+            process_id = node_with_data.get("process_id")
+            resolved_kwargs = node_with_data.get("resolved_kwargs", {})
+
+            if process_id in ("load_stac", "load_collection"):
+                key = "url" if process_id == "load_stac" else "id"
+                raw_source = resolved_kwargs.get(key, "")
+                data_source = raw_source.split("\\")[-1]
+
+                data_src = Data(str(uuid.uuid4()), data_source)
+                # Extract extra information
+                if process_id == "load_stac":
+                    data_src._info = resolved_kwargs
+
+            task._start_time = execution_data['start_time']
+            task._end_time = execution_data['end_time']
+            task._status = execution_data['task_status']
+            task._level = "1"
+
+            # For now, an input Data entity is only attached for load_stac / load_collection (temporary)
+            if node_with_data['process_id'] in ["load_stac", "load_collection"]:
+                task.add_input(data_src)
+
+            self.workflow.add_task(task)
+
+            if cache_users:
+                for source_node, target_node in cache_users.items():
+                    output_data_from_source = (
+                        self.workflow.get_task_by_id(source_node)._outputs[0]._id
+                    )
+                    for target in target_node:
+                        self.workflow.get_task_by_id(target).add_input(
+                            self.workflow.get_data_by_id(output_data_from_source)
+                        )
+
+            edges = [
+                {"source": source, "target": target, "type": data["reference_type"]}
+                for source, target, data in self.G.edges(node, data=True)
+            ]
+
+            for edge in edges:
+                self.workflow.get_task_by_id(edge['source']).set_next(
+                    self.workflow.get_task_by_id(edge['target'])
+                )
+
+            if node == self.result_node:
+                self.workflow._status = "Ok"
+
+            # To save the provenance
+            # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            # save_path = os.path.join(os.getcwd(), f"run_{timestamp}")
+            # print(f"Provenance file saved to: {save_path}")
+            # os.makedirs(save_path, exist_ok=True)
+            # self.workflow.prov_to_json(directory_path=save_path)
+
             return result
 
         return partial(node_callable, parent_callables=parent_callables)
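
Note: the hunk above builds, for every executed node, a Task carrying the timing data from profile_function, a Data entity describing the result, and next-links between tasks. The same wiring in isolation, using only the yprov4wfs calls that appear in this diff (ids and names are hypothetical):

    from uuid import uuid4

    from yprov4wfs.datamodel.data import Data
    from yprov4wfs.datamodel.task import Task
    from yprov4wfs.datamodel.workflow import Workflow

    wf = Workflow('openeo_workflow', 'OpenEO Workflow')

    load_task = Task('loadcollection1', 'load_collection')    # one Task per graph node
    reduce_task = Task('reducedimension1', 'reduce_dimension')

    cube = Data(str(uuid4()), 'xarray.DataArray')              # describes load_task's result
    load_task.add_output(cube)
    wf.add_data(cube)

    reduce_task.add_input(cube)      # consumer receives the producer's output entity
    load_task.set_next(reduce_task)  # mirrors the edge-based set_next wiring above

    wf.add_task(load_task)
    wf.add_task(reduce_task)
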
@@ -516,3 +625,36 @@ def plot(self, reverse=False):
 
         if reverse:
             self.G = self.G.reverse()
+
+    @staticmethod
+    def profile_function(func):
+        """Decorator that tracks execution performance and returns both the wrapped
+        function's result and its profiling data, so that further metrics of interest
+        (like CPU and memory usage) can be added later."""
+
+        @wraps(func)
+        def wrapper(*args, named_parameters, **kwargs):
+            start_dt = datetime.now()
+            start_timestamp = start_dt.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+
+            try:
+                result = func(*args, named_parameters=named_parameters, **kwargs)
+                status = "Ok"
+            except Exception as e:
+                result = str(e)
+                status = f"Error: {result[:70]}"
+
+            end_dt = datetime.now()
+            end_timestamp = end_dt.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+            execution_time = (end_dt - start_dt).total_seconds()
+            execution_data = {
+                # "function": func.__name__,
+                "task_status": status,
+                "start_time": start_timestamp,
+                "end_time": end_timestamp,
+                "execution_time_sec": round(execution_time, 4),
+            }
+            # Return both the result and profiling data
+            return result, execution_data
+
+        return wrapper
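
Note: for reference, the decorator can be exercised on its own; this sketch assumes a toy function that follows the same keyword-only named_parameters convention as the process implementations:

    def add(x, y, named_parameters=None):
        return x + y

    profiled_add = OpenEOProcessGraph.profile_function(add)
    result, execution_data = profiled_add(1, 2, named_parameters={})
    # result == 3
    # execution_data == {"task_status": "Ok", "start_time": ..., "end_time": ..., "execution_time_sec": ...}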