88import urllib
99import shutil
1010import logging
11- import traceback
11+
12+ from ontobio .model .association import GoAssociation
1213from ontobio .model .association import Curie , ExtensionUnit
1314from ontobio .io .entityparser import GpiParser
1415from ontobio .ontol_factory import OntologyFactory
2627from ontobio .validation import tools
2728from ontobio .validation import rules
2829
29- from typing import Dict , Set
30+ from typing import Dict , Set , List
3031
3132# logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.WARNING)
3233
@@ -342,7 +343,7 @@ def make_ttls(dataset, gaf_path, products, ontology_graph):
342343
343344@tools .gzips
344345def make_gpads (dataset , gaf_path , products , ontology_graph ,
345- noctua_gpad_file , paint_gaf_src , gpi , gpad_gpi_output_version ):
346+ noctua_gpad_file , paint_gaf_src , gpi , gpad_gpi_output_version ) -> ( List [ GoAssociation ], List [ str ]) :
346347 """
347348 Using the gaf files and the noctua gpad file, produce a gpad file that contains both kinds of annotations
348349 without any loss.
@@ -355,74 +356,103 @@ def make_gpads(dataset, gaf_path, products, ontology_graph,
355356 :param paint_gaf_src: The source of the paint gaf file
356357 :param gpi: The path to the gpi file -- needed to convert isoform annotations from Noctua files
357358 to gene annotations in GAF outputs.
358- :return: The path to the gpad file
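359+ :return: A one-element list holding the path to the gpad file (an empty list when no gpad product is requested); the headers of the source files that contributed to the final GPAD file are written into that file rather than returned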
359360
360361 """
361362 gpad_file_path = os .path .join (os .path .split (gaf_path )[0 ], f"{ dataset } .gpad" )
362363
363364 if not products ["gpad" ]:
364365 return []
366+ noctua_header = None
367+ all_gaf_headers = None
368+ noctua_associations = []
369+ all_gaf_associations = []
365370
366371 # Open the file once and keep it open for all operations within this block
367372 with open (gpad_file_path , "w" ) as outfile :
368373 gpadwriter = GpadWriter (file = outfile , version = gpad_gpi_output_version )
369-
370- # If there's a noctua gpad file, process it
374+ headers = []
375+ # If there's a noctua gpad file, parse it and keep both its associations and its header lines, so the
376+ # headers can be written to the final file as provenance
371377 if noctua_gpad_file :
372- click .echo ("Making noctua gpad products...{}" . format ( noctua_gpad_file ) )
378+ click .echo ("Making noctua gpad products..." )
373379 # Process noctua gpad file
374- process_noctua_gpad_file (noctua_gpad_file , gpadwriter , ontology_graph , gpi )
375-
376- # Process the GAF file
377- process_gaf_file (gaf_path , gpadwriter , ontology_graph , paint_gaf_src )
380+ (noctua_associations , noctua_header ) = process_noctua_gpad_file (noctua_gpad_file , ontology_graph )
381+ headers .append (noctua_header )
382+ # Process the GAF file(s), keeping the associations and each file's header lines for the final file provenance
383+ (all_gaf_associations , all_gaf_headers ) = process_gaf_file (gaf_path , ontology_graph , paint_gaf_src )
384+
385+ if noctua_header :
386+ for header in noctua_header :
387+ gpadwriter ._write ("!Header from source noctua GPAD file\n " )
388+ gpadwriter ._write ("!=================================\n " )
389+ gpadwriter ._write (header )
390+ if all_gaf_headers :
391+ for header in all_gaf_headers :
392+ gpadwriter ._write ("!Header from source GAF file(s)\n " )
393+ gpadwriter ._write ("!=================================\n " )
394+ for header_line in header :
395+ gpadwriter ._write (header_line + "\n " )
396+
397+ click .echo ("Wrote all headers for GPAD, now writing associations..." )
398+ if noctua_associations :
399+ for assoc in noctua_associations :
400+ gpadwriter .write_assoc (assoc )
401+ if all_gaf_associations :
402+ for assoc in all_gaf_associations :
403+ gpadwriter .write_assoc (assoc )
378404
379405 # The file will be automatically closed here, after exiting the 'with' block
380406 return [gpad_file_path ]
381407
382-
383- def process_noctua_gpad_file (noctua_gpad_file , gpadwriter , ontology_graph , gpi ):
408+ def process_noctua_gpad_file (noctua_gpad_file , ontology_graph ) -> (List [GoAssociation ], List [str ]):
384409 """
385410 Process a noctua gpad file and write the associations to the gpad writer.
386411
387412 :param noctua_gpad_file: The path to the noctua gpad file
388- :param gpadwriter: The gpad writer to write the associations to
389413 :param ontology_graph: The ontology graph to use for parsing the associations
390- :param gpi: The path to the gpi file -- needed to convert isoform annotations from Noctua files
391414 """
392415
416+ processed_associations = []
393417 with open (noctua_gpad_file ) as nf :
394418 lines = sum (1 for line in nf )
395419 nf .seek (0 ) # Reset file pointer to the beginning after counting lines
396420 gpadparser = GpadParser (config = assocparser .AssocParserConfig (ontology = ontology_graph ,
397421 paint = False ,
398422 rule_set = "all" ))
423+
399424 click .echo ("Making noctua gpad products..." )
400425 with click .progressbar (iterable = gpadparser .association_generator (file = nf ), length = lines ) as associations :
401426 for association in associations :
402427 # If the association is an isoform annotation, convert it to a gene annotation
403- gpadwriter .write_assoc (association )
428+ processed_associations .append (association )
429+
430+ return processed_associations , gpadparser .report .header
404431
405432
406- def process_gaf_file (gaf_path , gpadwriter , ontology_graph , paint_gaf_src ):
433+ def process_gaf_file (gaf_path , ontology_graph , paint_gaf_src ) -> ( List [ GoAssociation ], List [ str ] ):
407434 """
408435 Process a gaf file and write the associations to the gpad writer.
409436
410437 :param gaf_path: The path to the gaf file
411- :param gpadwriter: The gpad writer to write the associations to
412438 :param ontology_graph: The ontology graph to use for parsing the associations
413439 :param paint_gaf_src: The source of the paint gaf file
414440
441+ :return: A tuple of (all parsed associations, a list holding the report header of each GAF file that was read)
415442 """
443+ headers = []
444+ associations = []
416445 with open (gaf_path ) as gf :
417446 lines = sum (1 for line in gf )
418447 gf .seek (0 ) # Reset file pointer to the beginning after counting lines
419448 gafparser = GafParser (config = assocparser .AssocParserConfig (ontology = ontology_graph ,
420449 paint = True ,
421450 rule_set = "all" ))
422451 click .echo ("Merging in source gaf to gpad product..." )
423- with click .progressbar (iterable = gafparser .association_generator (file = gf ), length = lines ) as associations :
424- for association in associations :
425- gpadwriter .write_assoc (association )
452+ with click .progressbar (iterable = gafparser .association_generator (file = gf ), length = lines ) as gaf_assocs :
453+ for association in gaf_assocs :
454+ associations .append (association )
455+ headers .append (gafparser .report .header )
426456
427457 if paint_gaf_src is not None :
428458 with open (paint_gaf_src ) as pgf :
@@ -432,10 +462,12 @@ def process_gaf_file(gaf_path, gpadwriter, ontology_graph, paint_gaf_src):
432462 paint = True ,
433463 rule_set = "all" ))
434464 click .echo ("Merging in paint gaf to gpad product..." )
435- with click .progressbar (iterable = gafparser .association_generator (file = pgf ), length = lines ) as associations :
436- for association in associations :
437- gpadwriter .write_assoc (association )
465+ with click .progressbar (iterable = gafparser .association_generator (file = pgf ), length = lines ) as paint_assocs :
466+ for association in paint_assocs :
467+ associations .append (association )
468+ headers .append (gafparser .report .header )
438469
470+ return associations , headers
439471
440472@tools .gzips
441473def produce_gpi (dataset , target_dir , gaf_path , ontology_graph , gpad_gpi_output_version ):
@@ -626,7 +658,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
626658 :param metadata_dir: The directory containing the metadata files
627659 :param gpad: Produce GPAD files
628660 :param gpad_gpi_output_version: The version of the GPAD and GPI files to produce
629- :param ttl: Produce TTL files
661+ :param ttl: Produce TTL files
630662 :param target: The directory to put the files in
631663 :param ontology: The ontology to use for validation
632664 :param exclude: Datasets to exclude
@@ -662,7 +694,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
662694 replace_existing_files = not skip_existing_files ,
663695 only_dataset = only_dataset )
664696
665- click .echo ("Downloaded GAF sources: {}" . format ( downloaded_gaf_sources ) )
697+ click .echo ("Downloaded GAF sources" )
666698 # extract the titles for the go rules, this is a dictionary comprehension
667699 rule_metadata = metadata .yamldown_lookup (os .path .join (absolute_metadata , "rules" ))
668700 goref_metadata = metadata .yamldown_lookup (os .path .join (absolute_metadata , "gorefs" ))
@@ -755,6 +787,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
755787 ontology_graph , noctua_gpad_src , paint_gaf_src ,
756788 gpi , gpad_gpi_output_version )
757789
790+
758791 end_gaf = mixin_a_dataset (valid_gaf , [noctua_metadata , paint_metadata ],
759792 group_metadata ["id" ], dataset , absolute_target ,
760793 ontology_graph , gpipaths = gpi_list , base_download_url = base_download_url ,
@@ -766,8 +799,7 @@ def produce(ctx, group, metadata_dir, gpad, gpad_gpi_output_version, ttl, target
766799 click .echo ("Executing the isoform fixing step in validate.produce..." )
767800 # run the resulting gaf through one last parse and replace, to handle the isoforms
768801 # see: https://github.com/geneontology/go-site/issues/2291
769- click .echo ("path to end gaf _temp.gaf: {}" .format (end_gaf ))
770- click .echo (os .path )
802+ click .echo ("path to end gaf _temp.gaf" )
771803
772804 click .echo (os .path .split (end_gaf )[0 ])
773805 temp_output_gaf_path = os .path .join (os .path .split (end_gaf )[0 ], "{}_temp.gaf" .format (dataset ))
0 commit comments