Skip to content

Commit 9b4bcc2

Browse files
committed
Merge branch 'master' of github.com:marbl/parsnp
2 parents 277383d + 148bdd6 commit 9b4bcc2

File tree

1 file changed

+50
-64
lines changed

1 file changed

+50
-64
lines changed

parsnp

Lines changed: 50 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ from glob import glob
2222
from pathlib import Path
2323

2424

25-
import extend as ext
2625
from tqdm import tqdm
2726

2827
__version__ = "2.0.6"
@@ -913,18 +912,35 @@ def make_genome_and_reference_output_strings(ref, genbank_files):
913912
return sortem, ref_string, genome_string, ref
914913

915914

915+
def readfa(fp):
916+
"""
917+
Fasta parser taken from readfq
918+
"""
919+
last = None # this is a buffer keeping the last unprocessed line
920+
while True: # mimic closure; is it a bad idea?
921+
if not last: # the first record or a record following a fastq
922+
for l in fp: # search for the start of the next record
923+
if l[0] == '>': # fasta header line
924+
last = l[:-1] # save this line
925+
break
926+
if not last: break
927+
name, seqs, last = last[1:].partition(" ")[0], [], None
928+
for l in fp: # read the sequence
929+
if l[0] in '>':
930+
last = l[:-1]
931+
break
932+
seqs.append(l[:-1])
933+
934+
yield name, ''.join(seqs) # yield a fasta record
935+
if not last: break
936+
916937
def check_ref_genome_aligned(ref):
917-
# global ff, hdr, seq, line, reflen
938+
reflen = 0
918939
with open(ref, 'r') as ff:
919-
hdr = ff.readline()
920-
seq = ff.read()
921-
if hdr[0] != ">":
922-
logger.critical("Reference {} has improperly formatted header.".format(ref))
923-
sys.exit(1)
924-
for line in seq.split('\n'):
925-
if '-' in line and line[0] != ">":
926-
logger.warning("Reference genome sequence %s has '-' in the sequence!" % ((ref)))
927-
reflen = len(seq) - seq.count('\n')
940+
for hdr, seq in readfa(ff):
941+
if '-' in seq:
942+
logger.warning(f"Reference genome sequence {hdr} in {ref} has '-' in the sequence!")
943+
reflen += len(seq)
928944

929945
return reflen
930946

@@ -962,22 +978,18 @@ def parse_input_files(input_files, curated, validate_input):
962978

963979
# Old version of the parser:
964980
with open(input_file, 'r') as ff:
965-
hdr = ff.readline()
966-
seq = ff.read()
967-
name_flag = True
968-
seqlen = len(seq) - seq.count('\n')
969-
if hdr[0] != ">":
970-
logger.error("{} has improperly formatted header. Skip!".format(input_file))
981+
concat_seq = ""
982+
for hdr, seq in readfa(ff):
983+
concat_seq += seq
984+
985+
seqlen = len(concat_seq)
986+
if '-' in concat_seq:
987+
logger.error("Genome sequence %s seems to be aligned! Skip!" % ((input_file)))
971988
continue
972-
elif '-' in seq:
973-
seq = seq.split('\n')
974-
if any('-' in l and ('>' not in l) for l in seq):
975-
logger.error("Genome sequence %s seems to be aligned! Skip!" % ((input_file)))
976-
continue
977989
elif seqlen <= 20:
978990
logger.error("File %s is less than or equal to 20bp in length. Skip!" % (input_file))
979991
continue
980-
sizediff = float(reflen) / float(seqlen)
992+
sizediff = float(reflen) / seqlen
981993

982994
# Argument for ignoring any issues with the input/references:
983995
if curated:
@@ -1296,31 +1308,15 @@ SETTINGS:
12961308

12971309
#sort reference by largest replicon to smallest
12981310
if sortem and os.path.exists(ref) and not autopick_ref:
1299-
sequences = SeqIO.parse(ref, "fasta")
1311+
with open(ref, 'r') as ff:
1312+
seq_dict = {hdr: seq for hdr, seq in readfa(ff)}
1313+
seqs_sorted_by_len = sorted(seq_dict.items(), key=lambda kv: -len(kv[1]))
13001314
new_ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
1301-
SeqIO.write(sequences, new_ref, "fasta")
1315+
with open(new_ref, 'w') as ffo:
1316+
for hdr, seq in seqs_sorted_by_len:
1317+
ffo.write(f">{hdr}\n")
1318+
ffo.write(f"{seq}\n")
13021319
ref = new_ref
1303-
# logger.debug("Sorting reference replicons")
1304-
# ff = open(ref, 'r')
1305-
# seqs = ff.read().split(">")[1:]
1306-
# seq_dict = {}
1307-
# seq_len = {}
1308-
# for seq in seqs:
1309-
# try:
1310-
# hdr, seq = seq.split("\n",1)
1311-
# except ValueError:
1312-
# # TODO Why do we ignore when theres a header but no sequence?
1313-
# continue
1314-
# seq_dict[hdr] = seq
1315-
# seq_len[hdr] = len(seq) - seq.count('\n')
1316-
# seq_len_sort = sorted(iter(seq_len.items()), key=operator.itemgetter(1), reverse=True)
1317-
# ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
1318-
# ffo = open(ref, 'w')
1319-
# for hdr, seq in seq_len_sort:
1320-
# ffo.write(">%s\n"%(hdr))
1321-
# ffo.write("%s"%(seq_dict[hdr]))
1322-
# ff.close()
1323-
# ffo.close()
13241320
else:
13251321
ref = genbank_ref
13261322

@@ -1479,27 +1475,17 @@ SETTINGS:
14791475
# More stuff to autopick the reference if needed:
14801476
orig_auto_ref = auto_ref
14811477
if os.path.exists(auto_ref) and autopick_ref:
1482-
#TODO This code block is duplicated
1483-
ff = open(auto_ref, 'r')
1484-
seqs = ff.read().split(">")[1:]
14851478
seq_dict = {}
14861479
seq_len = {}
1487-
for seq in seqs:
1488-
try:
1489-
hdr, seq = seq.split("\n",1)
1490-
except ValueError:
1491-
continue
1492-
seq_dict[hdr] = seq
1493-
seq_len[hdr] = len(seq) - seq.count('\n')
1494-
seq_len_sort = sorted(seq_len.iteritems(), key=operator.itemgetter(1))
1495-
seq_len_sort.reverse()
1480+
with open(auto_ref, 'r') as ff:
1481+
seq_dict = {hdr: seq for hdr, seq in readfa(ff)}
1482+
1483+
seqs_sorted_by_len = sorted(seq_dict.items(), key=lambda kv: -len(kv[1]))
14961484
auto_ref = os.path.join(outputDir, os.path.basename(auto_ref)+".ref")
1497-
ffo = open(ref, 'w')
1498-
for item in seq_len_sort:
1499-
ffo.write(">%s\n"%(item[0]))
1500-
ffo.write(seq_dict[item[0]])
1501-
ff.close()
1502-
ffo.close()
1485+
with open(ref, 'w') as ffo:
1486+
for hdr, seq in seqs_sorted_by_len:
1487+
ffo.write(f">{hdr}\n")
1488+
ffo.write(f"{seq}\n")
15031489
ref = auto_ref
15041490

15051491
finalfiles = sorted(finalfiles)

0 commit comments

Comments
 (0)