Skip to content
Jasper edited this page Aug 12, 2024 · 9 revisions

Frequently Asked Queries

Image information

When a step runs inside a Docker image, you can retrieve the image that was used with the following query:

# Look up the container image recorded for a single process run.
PREFIX cwlprov: <https://w3id.org/cwl/prov#>
PREFIX nsprov:  <http://www.w3.org/ns/prov#>
PREFIX wfprov:  <http://purl.org/wf4ever/wfprov#>

SELECT ?process ?image
WHERE {
    # Pin the query to one run; swap in a UUID from your own dataset.
    VALUES ?process { <urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9> }

    # The image is attached to whatever the run was associated with.
    ?process a wfprov:ProcessRun ;
             nsprov:wasAssociatedWith ?agent .
    ?agent cwlprov:image ?image .
}
Row ?process ?image
1 urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9 quay.io/biocontainers/filtlong:0.2.1--hd03093a_1

Runtime information

For each step involved, obtain the runtime information:

# Report the execution time (end minus start) of every process run.
PREFIX nsprov: <http://www.w3.org/ns/prov#>
PREFIX rdfs:   <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wfprov: <http://purl.org/wf4ever/wfprov#>

SELECT DISTINCT ?run ?label ?executionTime
WHERE {
    # Start and end timestamps are reached through the qualified start/end nodes.
    ?run a wfprov:ProcessRun ;
         rdfs:label ?label ;
         nsprov:qualifiedStart/nsprov:atTime ?start ;
         nsprov:qualifiedEnd/nsprov:atTime ?end .

    # NOTE(review): subtracting two timestamps is not defined by core SPARQL 1.1;
    # this relies on the triple store supporting it as an extension - confirm for your store.
    BIND(?end - ?start AS ?executionTime)
}
Row ?run ?label ?executionTime
1 urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9 Run of workflow/packed.cwl#main/filtlong "P0Y0M0DT0H3M34.646S"^^http://www.w3.org/2001/XMLSchema#duration
2 urn:uuid:bf5bcb2d-fc5b-42a5-adf1-9a1dd093d8df Run of workflow/packed.cwl#main/nanoplot_longreads_unfiltered "P0Y0M0DT0H1M11.160S"^^http://www.w3.org/2001/XMLSchema#duration
3 urn:uuid:144d2a22-5ed2-4d18-aa4e-5beed9426cbd Run of workflow/packed.cwl#main/sapp_interproscan "P0Y0M0DT0H0M56.680S"^^http://www.w3.org/2001/XMLSchema#duration
4 urn:uuid:1fb22c0c-6cab-4528-af8b-9f1552579147 Run of workflow/packed.cwl#main/compress_bakta_35 "P0Y0M0DT0H0M1.010S"^^http://www.w3.org/2001/XMLSchema#duration
5 urn:uuid:85c67126-1ee2-48fa-9c62-fd0d235c565d Run of workflow/packed.cwl#main/busco "P0Y0M0DT0H4M52.022S"^^http://www.w3.org/2001/XMLSchema#duration
6 urn:uuid:b4480820-0d80-4ecf-bc88-5ea4a307e372 Run of workflow/packed.cwl#main "P0Y0M0DT0H15M46.961S"^^http://www.w3.org/2001/XMLSchema#duration
7 urn:uuid:b4480820-0d80-4ecf-bc88-5ea4a307e372 Run of workflow/packed.cwl#main "P0Y0M0DT0H4M35.425S"^^http://www.w3.org/2001/XMLSchema#duration
8 urn:uuid:b4480820-0d80-4ecf-bc88-5ea4a307e372 Run of workflow/packed.cwl#main/illumina_kraken2_unfiltered "P0Y0M0DT0H15M46.961S"^^http://www.w3.org/2001/XMLSchema#duration
9 urn:uuid:b4480820-0d80-4ecf-bc88-5ea4a307e372 Run of workflow/packed.cwl#main/illumina_kraken2_unfiltered "P0Y0M0DT0H4M35.425S"^^http://www.w3.org/2001/XMLSchema#duration

Step input

For a selected step (chosen with the VALUES clause), obtain the input variables of a process. Please note that you need to change the VALUES ?process line (or remove it) when testing the query on your own dataset.

# List the input variables (name and value) that a single process run used.
PREFIX nsprov: <http://www.w3.org/ns/prov#>
PREFIX wfprov: <http://purl.org/wf4ever/wfprov#>

SELECT ?process ?var ?value
WHERE {
    # Pin the query to one run; change the UUID or remove this line for your own data.
    VALUES ?process { <urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9> }

    ?process a wfprov:ProcessRun ;
             nsprov:qualifiedUsage ?usage .

    # Each usage node ties a role (the variable) to the entity holding its value.
    ?usage a nsprov:Usage ;
           nsprov:hadRole ?role ;
           nsprov:entity ?entity .
    ?entity nsprov:value ?value .

    # Keep only the part of the role IRI after the last "/" as the variable name.
    BIND(REPLACE(STR(?role), ".*/","") AS ?var)
}
Row ?process ?var ?value
1 urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9 length_weight 1.0E1
2 urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9 minimum_length 1000
3 urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9 keep_percent 9.0E1
4 urn:uuid:9aa6075f-ddc4-48b6-bf71-7932ee0ccdc9 output_filename BMOCK12_repeat_5_run_3_medaka_Nanopore_filtered

Input & output files

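For a given step, the following query retrieves its input and output files. Each result row pairs one input with one output, so a step with several inputs and outputs yields every input/output combination.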

# List the input and output file names of a single step, together with its label.
PREFIX cwlprov: <https://w3id.org/cwl/prov#>
PREFIX nsprov:  <http://www.w3.org/ns/prov#>
PREFIX rdfs:    <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wf4ever: <http://purl.org/wf4ever/wf4ever#>
PREFIX wfprov:  <http://purl.org/wf4ever/wfprov#>

SELECT ?input ?output ?label
WHERE {
    # Pin the query to one step; swap in a UUID from your own dataset.
    VALUES ?step { <urn:uuid:1d964db6-d1be-4234-85e3-8c4289385786> }

    # Output side: files whose generation activity is the selected step.
    ?produced a wf4ever:File ;
              cwlprov:basename ?output ;
              nsprov:qualifiedGeneration ?generation .
    ?generation a nsprov:Generation ;
                nsprov:activity ?step .

    # Input side: artifacts the same step used.
    ?step rdfs:label ?label ;
          nsprov:qualifiedUsage ?usage .
    ?usage nsprov:entity ?consumed .
    ?consumed a wfprov:Artifact ;
              cwlprov:basename ?input .
}
Row ?input ?output ?label
1 BMOCK12_repeat_5_run_3_medaka_SemiBin_3.fa BMOCK12_repeat_5_run_3_medaka_SemiBin_3.ffn Run of workflow/packed.cwl#main/bakta_5
2 bakta_db_v5.1-light BMOCK12_repeat_5_run_3_medaka_SemiBin_3.ffn Run of workflow/packed.cwl#main/bakta_5
3 BMOCK12_repeat_5_run_3_medaka_SemiBin_3.fa BMOCK12_repeat_5_run_3_medaka_SemiBin_3.txt Run of workflow/packed.cwl#main/bakta_5
4 bakta_db_v5.1-light BMOCK12_repeat_5_run_3_medaka_SemiBin_3.txt Run of workflow/packed.cwl#main/bakta_5
5 BMOCK12_repeat_5_run_3_medaka_SemiBin_3.fa BMOCK12_repeat_5_run_3_medaka_SemiBin_3.svg Run of workflow/packed.cwl#main/bakta_5
6 bakta_db_v5.1-light BMOCK12_repeat_5_run_3_medaka_SemiBin_3.svg Run of workflow/packed.cwl#main/bakta_5
....

Network visualization

To visualize all inputs and outputs and the workflow steps they pass through, the following snippet can be used.

# Network reconstruction from the provenance graph

from SPARQLWrapper import SPARQLWrapper, JSON
import matplotlib.pyplot as plt
import networkx as nx


# Address of the SPARQL endpoint that is queried for the provenance graph.
ENDPOINT_URL = "http://localhost:7200/repositories/bmock12"

# Selects, for every activity that generated something, the basenames of what
# it used (?input) and what it generated (?output), plus the activity's label
# and the IRIs of the activity, generation and used entity.
PROVENANCE_QUERY = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX nsprov: <http://www.w3.org/ns/prov#>
PREFIX cwlprov: <https://w3id.org/cwl/prov#>
PREFIX wf4ever: <http://purl.org/wf4ever/wf4ever#>
PREFIX wfprov: <http://purl.org/wf4ever/wfprov#>
select ?input ?output ?workflow ?workflow_label ?generation ?entity 
where {
    ?generated_file cwlprov:basename ?output .
    ?generated_file nsprov:qualifiedGeneration ?generation .
    ?generation a nsprov:Generation .
    ?generation nsprov:activity ?workflow .
    ?workflow rdfs:label ?workflow_label .
    ?workflow nsprov:qualifiedUsage ?usage .
    ?usage nsprov:entity ?entity .
    ?entity a wfprov:Artifact .
    ?entity cwlprov:basename ?input .
}"""

# Run the query and convert the JSON response into Python objects.
sparql = SPARQLWrapper(ENDPOINT_URL)
sparql.setReturnFormat(JSON)
sparql.setQuery(PROVENANCE_QUERY)
results = sparql.query().convert()

import re

# Directed provenance network: input file -> workflow step -> output file.
G = nx.DiGraph()

# Nodes are keyed by file basename / step name (not by IRI), so query rows
# that share a name collapse onto the same node.
for result in results["results"]["bindings"]:
    # Keep only the part of the run label after "#main/" and drop every
    # "_<digits>" fragment, so numbered runs of one step share a node.
    step_name = result['workflow_label']['value'].split("#main/")[-1]
    step_name = re.sub("_[0-9]+", "", step_name)
    input_name = result['input']['value']
    output_name = result['output']['value']

    # Steps are red; files only record the role(s) they were seen in here.
    # Their final color is resolved once all rows have been processed.
    G.add_node(step_name, description=step_name, color="red")
    G.add_node(input_name, description=input_name, color_input="blue")
    G.add_node(output_name, description=output_name, color_output="green")

    # Label both edge kinds so the exported files carry the role on every edge.
    G.add_edge(input_name, step_name, name="input")
    G.add_edge(step_name, output_name, name="output")

# Resolve the file colors: yellow = used as input and produced as output,
# blue = input only, green = output only. Step nodes keep their red color
# unless the same name also occurs as a file.
for node, attributes in G.nodes(data=True):
    is_input = "color_input" in attributes
    is_output = "color_output" in attributes
    if is_input and is_output:
        attributes["color"] = "yellow"
    elif is_input:
        attributes["color"] = "blue"
    elif is_output:
        attributes["color"] = "green"

# Export the network in two formats.
nx.write_gexf(G, "provenance.gexf")
nx.write_graphml(G, "provenance.graphml")

# Visualize the graph, reloading it from the exported GEXF file.
G = nx.read_gexf("provenance.gexf")

# Draw every weakly connected component in its own figure. The graph is kept
# directed so that the arrows show the direction input -> step -> output.
for index, component in enumerate(nx.weakly_connected_components(G)):
    subgraph = G.subgraph(component)
    # Labels come from the "description" attribute of each node.
    label_dict = nx.get_node_attributes(subgraph, "description")
    # One color per node, in node order; gray marks a node without a color.
    color_list = [subgraph.nodes[node].get("color", "gray") for node in subgraph.nodes]
    plt.figure(figsize=(20, 20))
    # Plot the component
    pos = nx.spring_layout(subgraph)
    nx.draw(subgraph, pos, with_labels=True, node_size=100, font_size=5, font_weight="bold", font_color="black", edge_color="gray", linewidths=1, alpha=0.7, arrowsize=20, labels=label_dict, node_color=color_list, arrows=True)
    plt.axis("off")
    # If you want to save the figures instead of displaying them, replace the
    # plt.show() below with a plt.savefig() that uses one file per component:
    # plt.savefig(f"provenance_{index}.svg")
    plt.show()

This gives a result like:

provenance