66'''
77
88import os , sys , string , random , subprocess , time , operator , math , datetime , numpy #pysam
9- from collections import defaultdict
9+ from collections import defaultdict , namedtuple
1010import shutil
1111import multiprocessing
1212import shlex
@@ -17,15 +17,15 @@ from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL
1717import argparse
1818import signal
1919from multiprocessing import Pool
20- from Bio import SeqIO
20+ from Bio import SeqIO , AlignIO
2121from glob import glob
2222from pathlib import Path
2323
2424
2525import extend as ext
2626from tqdm import tqdm
2727
28- __version__ = "2.0.4 "
28+ __version__ = "2.0.5 "
2929reroot_tree = True #use --midpoint-reroot
3030random_seeded = random .Random (42 )
3131
@@ -141,6 +141,54 @@ def handler(signum, frame):
141141
142142signal .signal (signal .SIGINT , handler )
143143
144+ def xmfa_to_maf (xmfa_path , maf_path , all_input_paths ):
145+ sample_delim = '#'
146+ SeqInfo = namedtuple ("SeqInfo" , "cid seq_length" )
147+ hdr_block_pattern = re .compile ("##SequenceIndex (\d+)\n ##SequenceFile (.+)\n ##SequenceHeader >\s*(\S+).*\n ##SequenceLength (\d+)bp" )
148+ idx_to_fname = {}
149+ ref_fname = ""
150+ with open (xmfa_path ) as xmfa_in :
151+ next (xmfa_in ) # skip version
152+ line = next (xmfa_in )
153+ seq_count = int (re .match ("#SequenceCount (\d+)\n " , line ).groups ()[0 ])
154+ for i in range (seq_count ):
155+ info_block = ""
156+ for _ in range (4 ):
157+ info_block += next (xmfa_in )
158+ parsed_info = hdr_block_pattern .match (info_block ).groups ()
159+ fname = parsed_info [1 ]
160+ if fname .endswith (".ref" ) and ref_fname == "" :
161+ fname = fname [:- 4 ]
162+ ref_fname = fname
163+ idx_to_fname [parsed_info [0 ]] = fname
164+
165+ fname_to_info = defaultdict (list )
166+ for f in all_input_paths :
167+ fname = Path (f ).name
168+ if fname .endswith (".ref" ):
169+ fname = fname [:- 4 ]
170+ fname_to_info [fname ] = [SeqInfo (rec .id .split (" " )[0 ], len (rec .seq )) for rec in SeqIO .parse (f , "fasta" )]
171+
172+ seqid_pattern = re .compile (r'^cluster(\d+) s(\d+):p(\d+)' )
173+ with open (maf_path , 'w' ) as maf_out :
174+ for aln in AlignIO .parse (xmfa_path , "mauve" ):
175+ for rec_idx , rec in enumerate (aln ):
176+ aln_len = rec .annotations ["end" ] - rec .annotations ["start" ]
177+ cluster_idx , contig_idx , startpos = [int (x ) for x in seqid_pattern .match (rec .id ).groups ()]
178+ fname = idx_to_fname [rec .name ]
179+ if rec_idx == 0 :
180+ maf_out .write (f"a cluster={ cluster_idx } \n " )
181+ elif fname == ref_fname :
182+ continue
183+ rec_info = fname_to_info [fname ][contig_idx - 1 ]
184+ maf_out .write (f"s\t { '.' .join (fname .split ('.' )[:- 1 ])} { sample_delim } { rec_info .cid } " )
185+ maf_out .write (f"\t { startpos } " )
186+ maf_out .write (f"\t { aln_len } " )
187+ maf_out .write (f"\t { '+' if rec .annotations ['strand' ] == 1 else '-' } " )
188+ maf_out .write (f"\t { rec_info .seq_length } " )
189+ maf_out .write (f"\t { rec .seq } \n " )
190+ maf_out .write ("\n " )
191+ return
144192
145193#TODO Merge run fns
146194def run_phipack (query ,seqlen ,workingdir ):
@@ -487,6 +535,10 @@ def parse_args():
487535 "--vcf" ,
488536 action = "store_true" ,
489537 help = "Generate VCF file." )
538+ misc_args .add_argument (
539+ "--no-maf" ,
540+ action = "store_true" ,
541+ help = "Do not generage MAF file (XMFA only)" )
490542 misc_args .add_argument (
491543 "-p" ,
492544 "--threads" ,
@@ -893,6 +945,7 @@ def parse_input_files(input_files, curated, validate_input):
893945 '''
894946 # global ff, hdr, seq
895947 fnafiles = []
948+ fname_to_info = defaultdict (list )
896949 for input_file in input_files [:]:
897950 # If biopython cannot parse the input as a fasta, kick it out of the list and move on:
898951 if validate_input :
@@ -1242,9 +1295,7 @@ SETTINGS:
12421295 # fnafiles.remove(ref)
12431296
12441297 #sort reference by largest replicon to smallest
1245- if ref in fnafiles :
1246- fnafiles = [f for f in fnafiles if f != ref ]
1247- elif sortem and os .path .exists (ref ) and not autopick_ref :
1298+ if sortem and os .path .exists (ref ) and not autopick_ref :
12481299 sequences = SeqIO .parse (ref , "fasta" )
12491300 new_ref = os .path .join (outputDir , os .path .basename (ref )+ ".ref" )
12501301 SeqIO .write (sequences , new_ref , "fasta" )
@@ -1453,7 +1504,7 @@ SETTINGS:
14531504
14541505 finalfiles = sorted (finalfiles )
14551506 random_seeded .shuffle (finalfiles )
1456- totseqs = len (finalfiles )
1507+ totseqs = len (finalfiles ) + 1 # Add one to include reference
14571508
14581509 #initiate parallelPhiPack tasks
14591510 run_recomb_filter = 0
@@ -1705,6 +1756,12 @@ SETTINGS:
17051756 for lcb in new_lcbs :
17061757 partition .write_aln_to_xmfa (lcb , out_f )
17071758
1759+
1760+ # Generate MAF file
1761+ if not args .no_maf :
1762+ logger .debug (f"Writing MAF file to { Path (parsnp_output ).with_suffix ('.maf' )} " )
1763+ xmfa_to_maf (parsnp_output , Path (parsnp_output ).with_suffix (".maf" ), finalfiles + [ref ])
1764+
17081765 #add genbank here, if present
17091766 if len (genbank_ref ) != 0 :
17101767 rnc = f"harvesttools -q -o { outputDir } /parsnp.ggr -x { parsnp_output } "
0 commit comments