@@ -22,7 +22,6 @@ from glob import glob
2222from pathlib import Path
2323
2424
25- import extend as ext
2625from tqdm import tqdm
2726
2827__version__ = "2.0.6"
@@ -913,18 +912,35 @@ def make_genome_and_reference_output_strings(ref, genbank_files):
913912 return sortem , ref_string , genome_string , ref
914913
915914
915+ def readfa (fp ):
916+ """
917+ Fasta parser taken from readfq
918+ """
919+ last = None # this is a buffer keeping the last unprocessed line
920+ while True : # mimic closure; is it a bad idea?
921+ if not last : # the first record or a record following a fastq
922+ for l in fp : # search for the start of the next record
923+ if l [0 ] == '>' : # fasta header line
924+ last = l [:- 1 ] # save this line
925+ break
926+ if not last : break
927+ name , seqs , last = last [1 :].partition (" " )[0 ], [], None
928+ for l in fp : # read the sequence
929+ if l [0 ] in '>' :
930+ last = l [:- 1 ]
931+ break
932+ seqs .append (l [:- 1 ])
933+
934+ yield name , '' .join (seqs ) # yield a fasta record
935+ if not last : break
936+
916937def check_ref_genome_aligned (ref ):
917- # global ff, hdr, seq, line, reflen
938+ reflen = 0
918939 with open (ref , 'r' ) as ff :
919- hdr = ff .readline ()
920- seq = ff .read ()
921- if hdr [0 ] != ">" :
922- logger .critical ("Reference {} has improperly formatted header." .format (ref ))
923- sys .exit (1 )
924- for line in seq .split ('\n ' ):
925- if '-' in line and line [0 ] != ">" :
926- logger .warning ("Reference genome sequence %s has '-' in the sequence!" % ((ref )))
927- reflen = len (seq ) - seq .count ('\n ' )
940+ for hdr , seq in readfa (ff ):
941+ if '-' in seq :
942+ logger .warning (f"Reference genome sequence { hdr } in { ref } has '-' in the sequence!" )
943+ reflen += len (seq )
928944
929945 return reflen
930946
@@ -962,22 +978,18 @@ def parse_input_files(input_files, curated, validate_input):
962978
963979 # Old version of the parser:
964980 with open (input_file , 'r' ) as ff :
965- hdr = ff .readline ()
966- seq = ff .read ()
967- name_flag = True
968- seqlen = len (seq ) - seq .count ('\n ' )
969- if hdr [0 ] != ">" :
970- logger .error ("{} has improperly formatted header. Skip!" .format (input_file ))
981+ concat_seq = ""
982+ for hdr , seq in readfa (ff ):
983+ concat_seq += seq
984+
985+ seqlen = len (concat_seq )
986+ if '-' in concat_seq :
987+ logger .error ("Genome sequence %s seems to be aligned! Skip!" % ((input_file )))
971988 continue
972- elif '-' in seq :
973- seq = seq .split ('\n ' )
974- if any ('-' in l and ('>' not in l ) for l in seq ):
975- logger .error ("Genome sequence %s seems to be aligned! Skip!" % ((input_file )))
976- continue
977989 elif seqlen <= 20 :
978990 logger .error ("File %s is less than or equal to 20bp in length. Skip!" % (input_file ))
979991 continue
980- sizediff = float (reflen ) / float ( seqlen )
992+ sizediff = float (reflen ) / seqlen
981993
982994 # Argument for ignoring any issues with the input/references:
983995 if curated :
@@ -1296,31 +1308,15 @@ SETTINGS:
12961308
12971309 #sort reference by largest replicon to smallest
12981310 if sortem and os .path .exists (ref ) and not autopick_ref :
1299- sequences = SeqIO .parse (ref , "fasta" )
1311+ with open (ref , 'r' ) as ff :
1312+ seq_dict = {hdr : seq for hdr , seq in readfa (ff )}
1313+ seqs_sorted_by_len = sorted (seq_dict .items (), key = lambda kv : - len (kv [1 ]))
13001314 new_ref = os .path .join (outputDir , os .path .basename (ref )+ ".ref" )
1301- SeqIO .write (sequences , new_ref , "fasta" )
1315+ with open (new_ref , 'w' ) as ffo :
1316+ for hdr , seq in seqs_sorted_by_len :
1317+ ffo .write (f">{ hdr } \n " )
1318+ ffo .write (f"{ seq } \n " )
13021319 ref = new_ref
1303- # logger.debug("Sorting reference replicons")
1304- # ff = open(ref, 'r')
1305- # seqs = ff.read().split(">")[1:]
1306- # seq_dict = {}
1307- # seq_len = {}
1308- # for seq in seqs:
1309- # try:
1310- # hdr, seq = seq.split("\n",1)
1311- # except ValueError:
1312- # # TODO Why do we ignore when theres a header but no sequence?
1313- # continue
1314- # seq_dict[hdr] = seq
1315- # seq_len[hdr] = len(seq) - seq.count('\n')
1316- # seq_len_sort = sorted(iter(seq_len.items()), key=operator.itemgetter(1), reverse=True)
1317- # ref = os.path.join(outputDir, os.path.basename(ref)+".ref")
1318- # ffo = open(ref, 'w')
1319- # for hdr, seq in seq_len_sort:
1320- # ffo.write(">%s\n"%(hdr))
1321- # ffo.write("%s"%(seq_dict[hdr]))
1322- # ff.close()
1323- # ffo.close()
13241320 else :
13251321 ref = genbank_ref
13261322
@@ -1479,27 +1475,17 @@ SETTINGS:
14791475 # More stuff to autopick the reference if needed:
14801476 orig_auto_ref = auto_ref
14811477 if os .path .exists (auto_ref ) and autopick_ref :
1482- #TODO This code block is duplicated
1483- ff = open (auto_ref , 'r' )
1484- seqs = ff .read ().split (">" )[1 :]
14851478 seq_dict = {}
14861479 seq_len = {}
1487- for seq in seqs :
1488- try :
1489- hdr , seq = seq .split ("\n " ,1 )
1490- except ValueError :
1491- continue
1492- seq_dict [hdr ] = seq
1493- seq_len [hdr ] = len (seq ) - seq .count ('\n ' )
1494- seq_len_sort = sorted (seq_len .iteritems (), key = operator .itemgetter (1 ))
1495- seq_len_sort .reverse ()
1480+ with open (auto_ref , 'r' ) as ff :
1481+ seq_dict = {hdr : seq for hdr , seq in readfa (ff )}
1482+
1483+ seqs_sorted_by_len = sorted (seq_dict .items (), key = lambda kv : - len (kv [1 ]))
14961484 auto_ref = os .path .join (outputDir , os .path .basename (auto_ref )+ ".ref" )
1497- ffo = open (ref , 'w' )
1498- for item in seq_len_sort :
1499- ffo .write (">%s\n " % (item [0 ]))
1500- ffo .write (seq_dict [item [0 ]])
1501- ff .close ()
1502- ffo .close ()
1485+ with open (ref , 'w' ) as ffo :
1486+ for hdr , seq in seqs_sorted_by_len :
1487+ ffo .write (f">{ hdr } \n " )
1488+ ffo .write (f"{ seq } \n " )
15031489 ref = auto_ref
15041490
15051491 finalfiles = sorted (finalfiles )
0 commit comments