Skip to content

Commit 103cbcd

Browse files
authored
Merge pull request #698 from biolink/issue-pipeline-387
Issue pipeline 387 - add GAF header information to aggregated GPAD produced by megamake step (aka. ontobio/bin/validate.py)
2 parents 4fb0db6 + 04794fb commit 103cbcd

File tree

4 files changed

+84
-32
lines changed

4 files changed

+84
-32
lines changed

bin/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@ alternatively, you can run the following commands to test the validate.produce c
1616
Note: `snapshot` in the URLs below can be changed to any pipeline branch; it's listed here for ease of copy/paste.
1717
```bash
1818
poetry install
19-
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/snapshot/" --only-dataset mgi MGI --gpad-gpi-output-version 2.0
20-
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/snapshot/" --only-dataset goa_chicken goa --gpad-gpi-output-version 2.0
21-
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://skyhook.berkeleybop.org/snapshot/" --only-dataset zfin ZFIN --gpad-gpi-output-version 2.0
19+
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://snapshot.geneontology.org/" --only-dataset mgi MGI --gpad-gpi-output-version 2.0
20+
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://snapshot.geneontology.org/" --only-dataset goa_chicken goa --gpad-gpi-output-version 2.0
21+
poetry run validate produce -m ../go-site/metadata --gpad -t . -o go-basic.json --base-download-url "http://snapshot.geneontology.org/" --only-dataset zfin ZFIN --gpad-gpi-output-version 2.0
2222
```
2323

2424
To test whether a GAF file is valid (passes all the GORules):

bin/validate.py

Lines changed: 60 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
import urllib
99
import shutil
1010
import logging
11-
import traceback
11+
12+
from ontobio.model.association import GoAssociation
1213
from ontobio.model.association import Curie, ExtensionUnit
1314
from ontobio.io.entityparser import GpiParser
1415
from ontobio.ontol_factory import OntologyFactory
@@ -26,7 +27,7 @@
2627
from ontobio.validation import tools
2728
from ontobio.validation import rules
2829

29-
from typing import Dict, Set
30+
from typing import Dict, Set, List
3031

3132
# logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.WARNING)
3233

@@ -342,7 +343,7 @@ def make_ttls(dataset, gaf_path, products, ontology_graph):
342343

343344
@tools.gzips
344345
def make_gpads(dataset, gaf_path, products, ontology_graph,
345-
noctua_gpad_file, paint_gaf_src, gpi, gpad_gpi_output_version):
346+
noctua_gpad_file, paint_gaf_src, gpi, gpad_gpi_output_version) -> (List[GoAssociation], List[str]):
346347
"""
347348
Using the gaf files and the noctua gpad file, produce a gpad file that contains both kinds of annotations
348349
without any loss.
@@ -355,74 +356,103 @@ def make_gpads(dataset, gaf_path, products, ontology_graph,
355356
:param paint_gaf_src: The source of the paint gaf file
356357
:param gpi: The path to the gpi file -- needed to convert isoform annotations from Noctua files
357358
to gene annotations in GAF outputs.
358-
:return: The path to the gpad file
359+
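:return: A list holding the path to the gpad file (an empty list when no gpad product is requested). The headers from all the files that contributed to the final GPAD file are written at the top of that file.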
359360
360361
"""
361362
gpad_file_path = os.path.join(os.path.split(gaf_path)[0], f"{dataset}.gpad")
362363

363364
if not products["gpad"]:
364365
return []
366+
noctua_header = None
367+
all_gaf_headers = None
368+
noctua_associations = []
369+
all_gaf_associations = []
365370

366371
# Open the file once and keep it open for all operations within this block
367372
with open(gpad_file_path, "w") as outfile:
368373
gpadwriter = GpadWriter(file=outfile, version=gpad_gpi_output_version)
369-
370-
# If there's a noctua gpad file, process it
374+
headers = []
375+
# If there's a noctua gpad file, process it, return the parsing Report so we can get its headers for
376+
# the final file provenance
371377
if noctua_gpad_file:
372-
click.echo("Making noctua gpad products...{}".format(noctua_gpad_file))
378+
click.echo("Making noctua gpad products...")
373379
# Process noctua gpad file
374-
process_noctua_gpad_file(noctua_gpad_file, gpadwriter, ontology_graph, gpi)
375-
376-
# Process the GAF file
377-
process_gaf_file(gaf_path, gpadwriter, ontology_graph, paint_gaf_src)
380+
(noctua_associations, noctua_header) = process_noctua_gpad_file(noctua_gpad_file, ontology_graph)
381+
headers.append(noctua_header)
382+
# Process the GAF file, store the report object so we can get its headers for the final file provenance
383+
(all_gaf_associations, all_gaf_headers) = process_gaf_file(gaf_path, ontology_graph, paint_gaf_src)
384+
385+
if noctua_header:
386+
for header in noctua_header:
387+
gpadwriter._write("!Header from source noctua GPAD file\n")
388+
gpadwriter._write("!=================================\n")
389+
gpadwriter._write(header)
390+
if all_gaf_headers:
391+
for header in all_gaf_headers:
392+
gpadwriter._write("!Header from source GAF file(s)\n")
393+
gpadwriter._write("!=================================\n")
394+
for header_line in header:
395+
gpadwriter._write(header_line+"\n")
396+
397+
click.echo("Wrote all headers for GPAD, now writing associations...")
398+
if noctua_associations:
399+
for assoc in noctua_associations:
400+
gpadwriter.write_assoc(assoc)
401+
if all_gaf_associations:
402+
for assoc in all_gaf_associations:
403+
gpadwriter.write_assoc(assoc)
378404

379405
# The file will be automatically closed here, after exiting the 'with' block
380406
return [gpad_file_path]
381407

382-
383-
def process_noctua_gpad_file(noctua_gpad_file, gpadwriter, ontology_graph, gpi):
408+
def process_noctua_gpad_file(noctua_gpad_file, ontology_graph) -> (List[GoAssociation], List[str]):
384409
"""
385410
Process a noctua gpad file and return its parsed associations together with the header collected by the parser.
386411
387412
:param noctua_gpad_file: The path to the noctua gpad file
388-
:param gpadwriter: The gpad writer to write the associations to
389413
:param ontology_graph: The ontology graph to use for parsing the associations
390-
:param gpi: The path to the gpi file -- needed to convert isoform annotations from Noctua files
391414
"""
392415

416+
processed_associations = []
393417
with open(noctua_gpad_file) as nf:
394418
lines = sum(1 for line in nf)
395419
nf.seek(0) # Reset file pointer to the beginning after counting lines
396420
gpadparser = GpadParser(config=assocparser.AssocParserConfig(ontology=ontology_graph,
397421
paint=False,
398422
rule_set="all"))
423+
399424
click.echo("Making noctua gpad products...")
400425
with click.progressbar(iterable=gpadparser.association_generator(file=nf), length=lines) as associations:
401426
for association in associations:
402427
# If the association is an isoform annotation, convert it to a gene annotation
403-
gpadwriter.write_assoc(association)
428+
processed_associations.append(association)
429+
430+
return processed_associations, gpadparser.report.header
404431

405432

406-
def process_gaf_file(gaf_path, gpadwriter, ontology_graph, paint_gaf_src):
433+
def process_gaf_file(gaf_path, ontology_graph, paint_gaf_src) -> (List[GoAssociation], List[str]):
407434
"""
408435
Process a gaf file (and the paint gaf file, when one is given) and return the parsed associations and the file headers.
409436
410437
:param gaf_path: The path to the gaf file
411-
:param gpadwriter: The gpad writer to write the associations to
412438
:param ontology_graph: The ontology graph to use for parsing the associations
413439
:param paint_gaf_src: The source of the paint gaf file
414440
441+
:return: (the associations parsed from the gaf files, a list containing the header of each gaf file that was parsed)
415442
"""
443+
headers = []
444+
associations = []
416445
with open(gaf_path) as gf:
417446
lines = sum(1 for line in gf)
418447
gf.seek(0) # Reset file pointer to the beginning after counting lines
419448
gafparser = GafParser(config=assocparser.AssocParserConfig(ontology=ontology_graph,
420449
paint=True,
421450
rule_set="all"))
422451
click.echo("Merging in source gaf to gpad product...")
423-
with click.progressbar(iterable=gafparser.association_generator(file=gf), length=lines) as associations:
424-
for association in associations:
425-
gpadwriter.write_assoc(association)
452+
with click.progressbar(iterable=gafparser.association_generator(file=gf), length=lines) as gaf_assocs:
453+
for association in gaf_assocs:
454+
associations.append(association)
455+
headers.append(gafparser.report.header)
426456

427457
if paint_gaf_src is not None:
428458
with open(paint_gaf_src) as pgf:
@@ -432,10 +462,12 @@ def process_gaf_file(gaf_path, gpadwriter, ontology_graph, paint_gaf_src):
432462
paint=True,
433463
rule_set="all"))
434464
click.echo("Merging in paint gaf to gpad product...")
435-
with click.progressbar(iterable=gafparser.association_generator(file=pgf), length=lines) as associations:
436-
for association in associations:
437-
gpadwriter.write_assoc(association)
465+
with click.progressbar(iterable=gafparser.association_generator(file=pgf), length=lines) as paint_assocs:
466+
for association in paint_assocs:
467+
associations.append(association)
468+
headers.append(gafparser.report.header)
438469

470+
return associations, headers
439471

440472
@tools.gzips
441473
def produce_gpi(dataset, target_dir, gaf_path, ontology_graph, gpad_gpi_output_version):
@@ -626,7 +658,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
626658
:param metadata_dir: The directory containing the metadata files
627659
:param gpad: Produce GPAD files
628660
:param gpad_gpi_output_version: The version of the GPAD and GPI files to produce
629-
:param ttl: Produce TTL files
661+
:param ttl: Produce TTL files
630662
:param target: The directory to put the files in
631663
:param ontology: The ontology to use for validation
632664
:param exclude: Datasets to exclude
@@ -662,7 +694,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
662694
replace_existing_files=not skip_existing_files,
663695
only_dataset=only_dataset)
664696

665-
click.echo("Downloaded GAF sources: {}".format(downloaded_gaf_sources))
697+
click.echo("Downloaded GAF sources")
666698
# extract the titles for the go rules, this is a dictionary comprehension
667699
rule_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "rules"))
668700
goref_metadata = metadata.yamldown_lookup(os.path.join(absolute_metadata, "gorefs"))
@@ -755,6 +787,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
755787
ontology_graph, noctua_gpad_src, paint_gaf_src,
756788
gpi, gpad_gpi_output_version)
757789

790+
758791
end_gaf = mixin_a_dataset(valid_gaf, [noctua_metadata, paint_metadata],
759792
group_metadata["id"], dataset, absolute_target,
760793
ontology_graph, gpipaths=gpi_list, base_download_url=base_download_url,
@@ -766,8 +799,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
766799
click.echo("Executing the isoform fixing step in validate.produce...")
767800
# run the resulting gaf through one last parse and replace, to handle the isoforms
768801
# see: https://github.com/geneontology/go-site/issues/2291
769-
click.echo("path to end gaf _temp.gaf: {}".format(end_gaf))
770-
click.echo(os.path)
802+
click.echo("path to end gaf _temp.gaf")
771803

772804
click.echo(os.path.split(end_gaf)[0])
773805
temp_output_gaf_path = os.path.join(os.path.split(end_gaf)[0], "{}_temp.gaf".format(dataset))

tests/test_gafparser.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import io
1818
import json
1919
import re
20-
20+
import yaml
2121
ecomap = EcoMap()
2222
ecomap.mappings()
2323

@@ -90,6 +90,16 @@ def test_parse_gpad():
9090
parse_with(POMBASE_GPAD, GpadParser())
9191

9292

93+
def test_gaf_association_generator_header_report():
94+
p = GpadParser(config=assocparser.AssocParserConfig(group_metadata=yaml.load(open("tests/resources/mgi.dataset.yaml"),
95+
Loader=yaml.FullLoader)))
96+
test_gaf_file = "tests/resources/test-qualifiers-2.2.gaf"
97+
assert len(p.report.header) == 0
98+
for a in p.association_generator(open(test_gaf_file, "r")):
99+
continue
100+
assert len(p.report.header) > 0
101+
print(p.report.header)
102+
93103
def parse_with(f, p):
94104
p.config.ecomap = EcoMap()
95105
is_gaf = f == POMBASE

tests/test_gpad_parser.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,16 @@ def test_parse():
103103
print(p.report.to_markdown())
104104

105105

106+
def test_gpad_association_generator_header_report():
107+
p = GpadParser(config=assocparser.AssocParserConfig(group_metadata=yaml.load(open("tests/resources/mgi.dataset.yaml"),
108+
Loader=yaml.FullLoader)))
109+
test_gpad_file = "tests/resources/mgi.test.gpad"
110+
assert len(p.report.header) == 0
111+
for a in p.association_generator(open(test_gpad_file, "r")):
112+
continue
113+
assert len(p.report.header) > 0
114+
115+
106116
def test_parse_1_2():
107117
report = assocparser.Report(group="unknown", dataset="unknown")
108118
vals = [

0 commit comments

Comments
 (0)