-
-
Notifications
You must be signed in to change notification settings - Fork 3
F.A.Q
Jasper edited this page Aug 12, 2024
·
9 revisions
When a step was run in a Docker image, you can retrieve which image was used with the following query:
# Look up the container image recorded for a process run.
# Change the IRI in VALUES (or drop that line) to inspect other runs.
PREFIX wfprov:  <http://purl.org/wf4ever/wfprov#>
PREFIX nsprov:  <http://www.w3.org/ns/prov#>
PREFIX cwlprov: <https://w3id.org/cwl/prov#>

SELECT ?process ?image
WHERE {
    VALUES ?process { <urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9> }

    # The image is attached to whatever the run was associated with.
    ?process a wfprov:ProcessRun ;
             nsprov:wasAssociatedWith ?agent .
    ?agent cwlprov:image ?image .
}
Row | ?process | ?image |
---|---|---|
1 | urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9 | quay.io/biocontainers/filtlong:0.2.1--hd03093a_1 |
For each step involved, obtain the runtime information:
# List every process run with its label and the difference between its
# recorded end time and start time.
# NOTE(review): subtracting two time values with "-" appears to rely on
# engine support beyond the core SPARQL 1.1 operators - confirm on your store.
PREFIX rdfs:   <http://www.w3.org/2000/01/rdf-schema#>
PREFIX nsprov: <http://www.w3.org/ns/prov#>
PREFIX wfprov: <http://purl.org/wf4ever/wfprov#>

SELECT DISTINCT ?run ?label ?executionTime
WHERE {
    ?run a wfprov:ProcessRun ;
         rdfs:label ?label ;
         nsprov:qualifiedStart/nsprov:atTime ?start ;
         nsprov:qualifiedEnd/nsprov:atTime ?end .

    BIND(?end - ?start AS ?executionTime)
}
Row | ?run | ?label | ?executionTime |
---|---|---|---|
1 | urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9 | Run of workflow/packed.cwl#main/filtlong | "P0Y0M0DT0H3M34.646S"^^http://www.w3.org/2001/XMLSchema#duration |
2 | urn:uuid:bf5bcb2d-fc5b-42a5-adf1-9a1dd093d8df | Run of workflow/packed.cwl#main/nanoplot_longreads_unfiltered | "P0Y0M0DT0H1M11.160S"^^http://www.w3.org/2001/XMLSchema#duration |
3 | urn:uuid:144d2a22-5ed2-4d18-aa4e-5beed9426cbd | Run of workflow/packed.cwl#main/sapp_interproscan | "P0Y0M0DT0H0M56.680S"^^http://www.w3.org/2001/XMLSchema#duration |
4 | urn:uuid:1fb22c0c-6cab-4528-af8b-9f1552579147 | Run of workflow/packed.cwl#main/compress_bakta_35 | "P0Y0M0DT0H0M1.010S"^^http://www.w3.org/2001/XMLSchema#duration |
5 | urn:uuid:85c67126-1ee2-48fa-9c62-fd0d235c565d | Run of workflow/packed.cwl#main/busco | "P0Y0M0DT0H4M52.022S"^^http://www.w3.org/2001/XMLSchema#duration |
6 | urn:uuid:b4480820-0d80-4ecf-bc88-5ea4a307e372 | Run of workflow/packed.cwl#main | "P0Y0M0DT0H15M46.961S"^^http://www.w3.org/2001/XMLSchema#duration |
7 | urn:uuid:b4480820-0d80-4ecf-bc88-5ea4a307e372 | Run of workflow/packed.cwl#main | "P0Y0M0DT0H4M35.425S"^^http://www.w3.org/2001/XMLSchema#duration |
8 | urn:uuid:b4480820-0d80-4ecf-bc88-5ea4a307e372 | Run of workflow/packed.cwl#main/illumina_kraken2_unfiltered | "P0Y0M0DT0H15M46.961S"^^http://www.w3.org/2001/XMLSchema#duration |
9 | urn:uuid:b4480820-0d80-4ecf-bc88-5ea4a307e372 | Run of workflow/packed.cwl#main/illumina_kraken2_unfiltered | "P0Y0M0DT0H4M35.425S"^^http://www.w3.org/2001/XMLSchema#duration |
For a selected step (chosen through the VALUES clause), obtain the input variables of the process. Please note that you need to change the IRI in VALUES ?process (or remove the line) when testing the query on your own dataset.
# Show the input variables (name and value) used by a process run.
# Change the IRI in VALUES (or drop that line) to inspect other runs.
PREFIX wfprov: <http://purl.org/wf4ever/wfprov#>
PREFIX nsprov: <http://www.w3.org/ns/prov#>

SELECT ?process ?var ?value
WHERE {
    VALUES ?process { <urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9> }

    ?process a wfprov:ProcessRun ;
             nsprov:qualifiedUsage ?input .
    ?input a nsprov:Usage ;
           nsprov:hadRole ?role ;
           nsprov:entity ?entity .
    ?entity nsprov:value ?value .

    # Keep only the part of the role IRI after the last "/" as the variable name.
    BIND(REPLACE(STR(?role), ".*/","") AS ?var)
}
Row | ?process | ?var | ?value |
---|---|---|---|
1 | urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9 | length_weight | 1.0E1 |
2 | urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9 | minimum_length | 1000 |
3 | urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9 | keep_percent | 9.0E1 |
4 | urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9 | output_filename | BMOCK12_repeat_5_run_3_medaka_Nanopore_filtered |
For a given step, the following query retrieves the input and output files:
# For one activity, list the basenames of the files it generated together
# with the basenames of the artifacts it used. Every input is paired with
# every output, so the result has (inputs x outputs) rows.
PREFIX rdfs:    <http://www.w3.org/2000/01/rdf-schema#>
PREFIX nsprov:  <http://www.w3.org/ns/prov#>
PREFIX cwlprov: <https://w3id.org/cwl/prov#>
PREFIX wf4ever: <http://purl.org/wf4ever/wf4ever#>
PREFIX wfprov:  <http://purl.org/wf4ever/wfprov#>

SELECT ?input ?output ?label
WHERE {
    VALUES ?workflow { <urn:uuid:1d964db6-d1be-4234-85e3-8c4289385786> }

    # Output side: files whose generation points at the selected activity.
    ?generated_file a wf4ever:File ;
                    cwlprov:basename ?output ;
                    nsprov:qualifiedGeneration ?generation .
    ?generation a nsprov:Generation ;
                nsprov:activity ?workflow .

    # Input side: artifacts referenced by the activity's usages.
    ?workflow rdfs:label ?label ;
              nsprov:qualifiedUsage ?usage .
    ?usage nsprov:entity ?entity .
    ?entity a wfprov:Artifact ;
            cwlprov:basename ?input .
}
Row | ?input | ?output | ?label |
---|---|---|---|
1 | BMOCK12_repeat_5_run_3_medaka_SemiBin_3.fa | BMOCK12_repeat_5_run_3_medaka_SemiBin_3.ffn | Run of workflow/packed.cwl#main/bakta_5 |
2 | bakta_db_v5.1-light | BMOCK12_repeat_5_run_3_medaka_SemiBin_3.ffn | Run of workflow/packed.cwl#main/bakta_5 |
3 | BMOCK12_repeat_5_run_3_medaka_SemiBin_3.fa | BMOCK12_repeat_5_run_3_medaka_SemiBin_3.txt | Run of workflow/packed.cwl#main/bakta_5 |
4 | bakta_db_v5.1-light | BMOCK12_repeat_5_run_3_medaka_SemiBin_3.txt | Run of workflow/packed.cwl#main/bakta_5 |
5 | BMOCK12_repeat_5_run_3_medaka_SemiBin_3.fa | BMOCK12_repeat_5_run_3_medaka_SemiBin_3.svg | Run of workflow/packed.cwl#main/bakta_5 |
6 | bakta_db_v5.1-light | BMOCK12_repeat_5_run_3_medaka_SemiBin_3.svg | Run of workflow/packed.cwl#main/bakta_5 |
.... |
To visualize all inputs and outputs, and the workflow steps they pass through, the following snippet can be used.
# Network reconstruction from the provenance graph
#
# Queries a SPARQL endpoint for (input, step, output) combinations, builds a
# directed graph "input file -> step -> output file" and writes it to
# provenance.gexf and provenance.graphml in the current directory.
import re

from SPARQLWrapper import SPARQLWrapper, JSON
import matplotlib.pyplot as plt
import networkx as nx

# Endpoint of the repository holding the provenance; adjust to your own setup.
sparql = SPARQLWrapper("http://localhost:7200/repositories/bmock12")
sparql.setQuery("""
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX nsprov: <http://www.w3.org/ns/prov#>
PREFIX cwlprov: <https://w3id.org/cwl/prov#>
PREFIX wf4ever: <http://purl.org/wf4ever/wf4ever#>
PREFIX wfprov: <http://purl.org/wf4ever/wfprov#>
select ?input ?output ?workflow ?workflow_label ?generation ?entity
where {
?generated_file cwlprov:basename ?output .
?generated_file nsprov:qualifiedGeneration ?generation .
?generation a nsprov:Generation .
?generation nsprov:activity ?workflow .
?workflow rdfs:label ?workflow_label .
?workflow nsprov:qualifiedUsage ?usage .
?usage nsprov:entity ?entity .
?entity a wfprov:Artifact .
?entity cwlprov:basename ?input .
}""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

G = nx.DiGraph()

for result in results["results"]["bindings"]:
    # Use the step name (the part of the label after "#main/") as node id and
    # strip every "_<digits>" part so that numbered instances of the same step
    # (e.g. "bakta_5") collapse into a single node.
    # Nodes are keyed by readable names; the ?workflow / ?entity / ?generation
    # values returned by the query could be used as node ids instead.
    workflow_name = result["workflow_label"]["value"].split("#main/")[-1]
    workflow_name = re.sub(r"_[0-9]+", "", workflow_name)
    input_name = result["input"]["value"]
    output_name = result["output"]["value"]

    G.add_node(workflow_name, description=workflow_name, color="red")
    # color_input / color_output only mark the role(s) a file plays; the final
    # "color" attribute is derived from them once all rows are processed.
    G.add_node(input_name, description=input_name, color_input="blue")
    G.add_node(output_name, description=output_name, color_output="green")

    # Set the edges: input -> step -> output
    G.add_edge(input_name, workflow_name, name="input")
    G.add_edge(workflow_name, output_name)

# For each node check the color_input and color_output markers:
# yellow = used as input and produced as output, blue = input only,
# green = output only. Step nodes carry neither marker and stay red.
for node in G.nodes:
    attributes = G.nodes[node]
    if "color_input" in attributes and "color_output" in attributes:
        attributes["color"] = "yellow"
    elif "color_input" in attributes:
        attributes["color"] = "blue"
    elif "color_output" in attributes:
        attributes["color"] = "green"

nx.write_gexf(G, "provenance.gexf")
nx.graphml.write_graphml(G, "provenance.graphml")
# Visualize the graph
#
# Reads provenance.gexf back from disk and draws every connected component
# in its own figure.
G = nx.read_gexf("provenance.gexf")
G = G.to_undirected()

# Split the nodes into each connected component
for component in nx.connected_components(G):
    # Plot the connected component
    subgraph = G.subgraph(component)

    # Get labels and colors. The color list is built in node iteration order
    # so it lines up with the nodes being drawn; nodes without a "color"
    # attribute fall back to gray instead of shifting the remaining colors.
    label_dict = nx.get_node_attributes(subgraph, "description")
    color_list = [subgraph.nodes[node].get("color", "gray") for node in subgraph]

    plt.figure(figsize=(20, 20))

    # Plot the graph
    pos = nx.spring_layout(subgraph)
    nx.draw(
        subgraph,
        pos,
        with_labels=True,
        labels=label_dict,
        node_color=color_list,
        node_size=100,
        font_size=5,
        font_weight="bold",
        font_color="black",
        edge_color="gray",
        linewidths=1,
        alpha=0.7,
        arrowsize=20,
        arrows=True,
    )
    plt.axis("off")
    plt.show()
    # If you want to save the graph, disable the plt.show() and enable the plt.savefig()
    # plt.savefig("provenance.svg")
This gives a result like: