Skip to content

Commit 8e0d41c

Browse files
authored
Merge pull request #151 from marbl/add-maf-option
Add maf option
2 parents e6402b1 + 5d438e0 commit 8e0d41c

File tree

2 files changed

+65
-8
lines changed

2 files changed

+65
-8
lines changed

logger.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
COLOR_SEQ = "\033[1;%dm"
1616
BOLD_SEQ = "\033[1m"
1717

18-
MIN_TQDM_INTERVAL=30
18+
MIN_TQDM_INTERVAL=5
1919

2020

2121
# Logging redirect copied from https://stackoverflow.com/questions/14897756/python-progress-bar-through-logging-module

parsnp

Lines changed: 64 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
'''
77

88
import os, sys, string, random, subprocess, time, operator, math, datetime, numpy #pysam
9-
from collections import defaultdict
9+
from collections import defaultdict, namedtuple
1010
import shutil
1111
import multiprocessing
1212
import shlex
@@ -17,15 +17,15 @@ from logger import logger, TqdmToLogger, MIN_TQDM_INTERVAL
1717
import argparse
1818
import signal
1919
from multiprocessing import Pool
20-
from Bio import SeqIO
20+
from Bio import SeqIO, AlignIO
2121
from glob import glob
2222
from pathlib import Path
2323

2424

2525
import extend as ext
2626
from tqdm import tqdm
2727

28-
__version__ = "2.0.4"
28+
__version__ = "2.0.5"
2929
reroot_tree = True #use --midpoint-reroot
3030
random_seeded = random.Random(42)
3131

@@ -141,6 +141,54 @@ def handler(signum, frame):
141141

142142
signal.signal(signal.SIGINT, handler)
143143

144+
def xmfa_to_maf(xmfa_path, maf_path, all_input_paths):
145+
sample_delim = '#'
146+
SeqInfo = namedtuple("SeqInfo", "cid seq_length")
147+
hdr_block_pattern = re.compile("##SequenceIndex (\d+)\n##SequenceFile (.+)\n##SequenceHeader >\s*(\S+).*\n##SequenceLength (\d+)bp")
148+
idx_to_fname = {}
149+
ref_fname = ""
150+
with open(xmfa_path) as xmfa_in:
151+
next(xmfa_in) # skip version
152+
line = next(xmfa_in)
153+
seq_count = int(re.match("#SequenceCount (\d+)\n", line).groups()[0])
154+
for i in range(seq_count):
155+
info_block = ""
156+
for _ in range(4):
157+
info_block += next(xmfa_in)
158+
parsed_info = hdr_block_pattern.match(info_block).groups()
159+
fname = parsed_info[1]
160+
if fname.endswith(".ref") and ref_fname == "":
161+
fname = fname[:-4]
162+
ref_fname = fname
163+
idx_to_fname[parsed_info[0]] = fname
164+
165+
fname_to_info = defaultdict(list)
166+
for f in all_input_paths:
167+
fname = Path(f).name
168+
if fname.endswith(".ref"):
169+
fname = fname[:-4]
170+
fname_to_info[fname] = [SeqInfo(rec.id.split(" ")[0], len(rec.seq)) for rec in SeqIO.parse(f, "fasta")]
171+
172+
seqid_pattern = re.compile(r'^cluster(\d+) s(\d+):p(\d+)')
173+
with open(maf_path, 'w') as maf_out:
174+
for aln in AlignIO.parse(xmfa_path, "mauve"):
175+
for rec_idx, rec in enumerate(aln):
176+
aln_len = rec.annotations["end"] - rec.annotations["start"]
177+
cluster_idx, contig_idx, startpos = [int(x) for x in seqid_pattern.match(rec.id).groups()]
178+
fname = idx_to_fname[rec.name]
179+
if rec_idx == 0:
180+
maf_out.write(f"a cluster={cluster_idx}\n")
181+
elif fname == ref_fname:
182+
continue
183+
rec_info = fname_to_info[fname][contig_idx-1]
184+
maf_out.write(f"s\t{'.'.join(fname.split('.')[:-1])}{sample_delim}{rec_info.cid}")
185+
maf_out.write(f"\t{startpos}")
186+
maf_out.write(f"\t{aln_len}")
187+
maf_out.write(f"\t{'+' if rec.annotations['strand'] == 1 else '-'}")
188+
maf_out.write(f"\t{rec_info.seq_length}")
189+
maf_out.write(f"\t{rec.seq}\n")
190+
maf_out.write("\n")
191+
return
144192

145193
#TODO Merge run fns
146194
def run_phipack(query,seqlen,workingdir):
@@ -487,6 +535,10 @@ def parse_args():
487535
"--vcf",
488536
action = "store_true",
489537
help = "Generate VCF file.")
538+
misc_args.add_argument(
539+
"--no-maf",
540+
action = "store_true",
541+
help = "Do not generage MAF file (XMFA only)")
490542
misc_args.add_argument(
491543
"-p",
492544
"--threads",
@@ -893,6 +945,7 @@ def parse_input_files(input_files, curated, validate_input):
893945
'''
894946
# global ff, hdr, seq
895947
fnafiles = []
948+
fname_to_info = defaultdict(list)
896949
for input_file in input_files[:]:
897950
# If biopython cannot parse the input as a fasta, kick it out of the list and move on:
898951
if validate_input:
@@ -1242,9 +1295,7 @@ SETTINGS:
12421295
# fnafiles.remove(ref)
12431296

12441297
#sort reference by largest replicon to smallest
1245-
if ref in fnafiles:
1246-
fnafiles = [f for f in fnafiles if f != ref]
1247-
elif sortem and os.path.exists(ref) and not autopick_ref:
1298+
if sortem and os.path.exists(ref) and not autopick_ref:
12481299
sequences = SeqIO.parse(ref, "fasta")
12491300
new_ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
12501301
SeqIO.write(sequences, new_ref, "fasta")
@@ -1453,7 +1504,7 @@ SETTINGS:
14531504

14541505
finalfiles = sorted(finalfiles)
14551506
random_seeded.shuffle(finalfiles)
1456-
totseqs = len(finalfiles)
1507+
totseqs = len(finalfiles) + 1 # Add one to include reference
14571508

14581509
#initiate parallelPhiPack tasks
14591510
run_recomb_filter = 0
@@ -1705,6 +1756,12 @@ SETTINGS:
17051756
for lcb in new_lcbs:
17061757
partition.write_aln_to_xmfa(lcb, out_f)
17071758

1759+
1760+
# Generate MAF file
1761+
if not args.no_maf:
1762+
logger.debug(f"Writing MAF file to {Path(parsnp_output).with_suffix('.maf')}")
1763+
xmfa_to_maf(parsnp_output, Path(parsnp_output).with_suffix(".maf"), finalfiles + [ref])
1764+
17081765
#add genbank here, if present
17091766
if len(genbank_ref) != 0:
17101767
rnc = f"harvesttools -q -o {outputDir}/parsnp.ggr -x {parsnp_output}"

0 commit comments

Comments
 (0)