44"""
55
66import datetime
7+ import bisect
78import os
89import re
910import string
11+ import struct
1012import sys
1113import shutil
1214import warnings
@@ -163,7 +165,6 @@ def __getitem__(self, n):
163165 slice_stop = 0
164166 start = self_end - slice_stop
165167 end = self_start + slice_stop
166- #print(locals())
167168 else :
168169 start = self_start + slice_start
169170 end = self_start + slice_stop + correction_factor
@@ -345,6 +346,39 @@ def __len__(self):
345346 return self .rlen
346347
347348
349+ class BgzfBlock (namedtuple ('BgzfBlock' , ['cstart' , 'clen' , 'ustart' , 'ulen' ])):
350+
351+ __slots__ = ()
352+
353+ def __getitem__ (self , key ):
354+ if type (key ) == str :
355+ return getattr (self , key )
356+ return tuple .__getitem__ (self , key )
357+
358+ def as_bytes (self ):
359+ return struct .pack ('<QQ' , self .cstart , self .ustart )
360+
361+ def __lt__ (self , other ):
362+ """Compare BgzfBlock ustart to an integer, or to another BgzfBlock by ustart."""
363+ if isinstance (other , BgzfBlock ):
364+ return self .ustart < other .ustart
365+ elif isinstance (other , (int , float )):
366+ return self .ustart < other
367+ return NotImplemented
368+
369+ def __len__ (self ):
370+ return self .ulen
371+
372+ @property
373+ def empty (self ):
374+ """Check for the EOF marker, which is an empty BGZF block.
375+ https://github.com/biopython/biopython/blob/master/Bio/bgzf.py#L171"""
376+ if self .ulen == 0 :
377+ return True
378+ else :
379+ return False
380+
381+
348382class Faidx (object ):
349383 """ A python implementation of samtools faidx FASTA indexing """
350384
@@ -373,6 +407,33 @@ def __init__(self,
373407 as_raw: optional parameter to specify whether to return sequences as a
374408 Sequence() object or as a raw string.
375409 Default: False (i.e. return a Sequence() object).
410+ strict_bounds: if True, will raise FetchError if the requested region
411+ is outside the bounds of the sequence.
412+ read_ahead: number of bases to read ahead when fetching a sequence.
413+ mutable: if True, the Faidx object will be mutable, allowing
414+ modification of the FASTA file in place. This is not supported
415+ for BGZF files, and will raise an UnsupportedCompressionFormat
416+ exception if the file is compressed.
417+ split_char: if not None, the keys will be split on this character
418+ duplicate_action: how to handle duplicate keys in the index. Options are:
419+ "stop" (default): raise ValueError if a duplicate key is found
420+ "first": keep the first occurrence of the key
421+ "last": keep the last occurrence of the key
422+ "longest": keep the longest sequence for the key
423+ "shortest": keep the shortest sequence for the key
424+ "drop": drop the duplicate key
425+ filt_function: a function that filters the keys returned by key_function.
426+ The function should take a single argument (the key) and return
427+ True if the key should be included, or False if it should be excluded.
428+ one_based_attributes: if True, the start and end attributes of the
429+ Sequence object will be one-based (i.e. start=1, end=length).
430+ read_long_names: if True, the index will use the full sequence name
431+ (including any whitespace) as the key. If False, the sequence name
432+ will be split on whitespace and the first part will be used as the key.
433+ sequence_always_upper: if True, the sequence will always be returned
434+ in uppercase, regardless of the case in the FASTA file.
435+ rebuild: if True, the index will be rebuilt if it is stale or does not exist.
436+ build_index: if True, the index will be built if it does not exist or is stale.
376437 """
377438
378439 if fsspec and isinstance (filename , fsspec .core .OpenFile ):
@@ -412,6 +473,7 @@ def __init__(self,
412473 raise TypeError ("indexname expected NoneType, str, os.PathLike or fsspec.OpenFile, got: %r" % indexname )
413474
414475 if self .filename .lower ().endswith (('.bgz' , '.gz' )):
476+ self .gzi_indexname = self .filename + '.gzi'
415477 # Only try to import Bio if we actually need the bgzf reader.
416478 try :
417479 from Bio import bgzf
@@ -457,10 +519,6 @@ def __init__(self,
457519 self .duplicate_action = duplicate_action
458520 self .as_raw = as_raw
459521 self .default_seq = default_seq
460- if self ._bgzf and self .default_seq is not None :
461- raise FetchError (
462- "The default_seq argument is not supported with using BGZF compression. Please decompress your FASTA file and try again."
463- )
464522 if self ._bgzf :
465523 self .strict_bounds = True
466524 else :
@@ -481,6 +539,7 @@ def __init__(self,
481539 self .mutable = mutable
482540 with self .lock : # lock around index generation so only one thread calls method
483541
542+ # Check if .fai index exists
484543 if self ._fai_fs is None :
485544 index_exists = os .path .exists (self .indexname )
486545 else :
@@ -497,6 +556,19 @@ def __init__(self,
497556 else :
498557 index_is_stale = False
499558
559+ # If BGZF and .gzi is missing, force rebuild of .fai before .gzi
560+ if self ._bgzf and not os .path .exists (self .gzi_indexname ):
561+ # Always rebuild .fai if .gzi is missing for BGZF
562+ try :
563+ self .build_index ()
564+ except FastaIndexingError :
565+ self .file .close ()
566+ raise
567+ # After .fai is rebuilt, continue to build .gzi below
568+ index_exists = True
569+ index_is_stale = False
570+
571+ # Usual .fai build logic
500572 if (
501573 build_index
502574 and (not index_exists or (index_is_stale and rebuild ))
@@ -507,6 +579,17 @@ def __init__(self,
507579 self .file .close ()
508580 raise
509581
582+ # BGZF: handle .gzi index
583+ if self ._bgzf :
584+ if os .path .exists (self .gzi_indexname ) and getmtime (self .gzi_indexname ) >= getmtime (self .gzi_indexname ):
585+ self .read_gzi ()
586+ elif os .path .exists (self .gzi_indexname ) and getmtime (self .gzi_indexname ) < getmtime (self .gzi_indexname ) and not rebuild :
587+ self .read_gzi ()
588+ warnings .warn ("BGZF Index file {0} is older than FASTA file {1}." .format (self .gzi_indexname , self .filename ), RuntimeWarning )
589+ else :
590+ self .build_gzi ()
591+ self .write_gzi ()
592+
510593 try :
511594 self .read_fai ()
512595 except Exception :
@@ -634,8 +717,7 @@ def build_index(self):
634717 "Bad sequence name %s at line %s." %
635718 (line .rstrip ('\n \r ' ), str (i )))
636719 offset += line_blen
637- thisoffset = fastafile .tell (
638- ) if self ._bgzf else offset
720+ thisoffset = offset
639721 else : # check line and advance offset
640722 if not blen :
641723 blen = line_blen
@@ -683,6 +765,48 @@ def build_index(self):
683765 % self .indexname )
684766 elif isinstance (e , FastaIndexingError ):
685767 raise e
768+
769+ def build_gzi (self ):
770+ """ Build the htslib .gzi index format """
771+ from Bio import bgzf
772+ with open (self .filename , 'rb' ) as bgzf_file :
773+ self .gzi_index = []
774+ for i , values in enumerate (bgzf .BgzfBlocks (bgzf_file )):
775+ self .gzi_index .append (BgzfBlock (* values ))
776+ # htslib expects the last block to be an empty block, which is the EOF marker,
777+ # and discards it from the index before writing it to disk.
778+ eof = self .gzi_index .pop ()
779+ if not eof .empty :
780+ raise IOError ("BGZF EOF marker not found. File %s is not a valid BGZF file." % self .filename )
781+ # htslib discards the first block which has cstart=0, ustart=0, and ulen=0.
782+ # https://github.com/samtools/htslib/blob/d677f345fe35d451587319ca38ac611862a46e1b/bgzf.c#L2401
783+ first_block = self .gzi_index .pop (0 )
784+
785+ def write_gzi (self ):
786+ """ Write the on disk format for the htslib .gzi index
787+ https://github.com/samtools/htslib/issues/473
788+ See note about file format in
789+ https://github.com/samtools/htslib/blob/d677f345fe35d451587319ca38ac611862a46e1b/bgzf.c#L2382-L2384
790+ """
791+ with open (self .gzi_indexname , 'wb' ) as bzi_file :
792+ bzi_file .write (struct .pack ('<Q' , len (self .gzi_index )))
793+ for block in self .gzi_index :
794+ bzi_file .write (block .as_bytes ())
795+
796+ def read_gzi (self ):
797+ """ Read the on disk format for the htslib .gzi index
798+ https://github.com/samtools/htslib/issues/473"""
799+ from ctypes import c_uint64 , sizeof
800+ with open (self .gzi_indexname , 'rb' ) as bzi_file :
801+ number_of_blocks = struct .unpack ('<Q' , bzi_file .read (sizeof (c_uint64 )))[0 ]
802+ # htslib discards the first block which has cstart=0, ustart=0, and ulen=0.
803+ self .gzi_index = [BgzfBlock (0 , None , 0 , None )]
804+ for i in range (number_of_blocks ):
805+ cstart , ustart = struct .unpack ('<QQ' , bzi_file .read (sizeof (c_uint64 ) * 2 ))
806+ if cstart == '' or ustart == '' :
807+ raise IndexError ("Unexpected end of .gzi file. " )
808+ else :
809+ self .gzi_index .append (BgzfBlock (cstart , None , ustart , None ))
686810
687811 def write_fai (self ):
688812 with self .lock :
@@ -749,34 +873,42 @@ def from_file(self, rname, start, end, internals=False):
749873 newlines_to_end = int (end / i .lenc ) if i .lenc else 0
750874 newlines_inside = newlines_to_end - newlines_before
751875 newline_blen = i .lenb - i .lenc
752- seq_blen = newlines_inside * newline_blen + seq_len
753- bstart = i .offset + newlines_before * newline_blen + start0
876+ seq_blen = ( newlines_inside * newline_blen ) + seq_len
877+ bstart = i .offset + ( newlines_before * newline_blen ) + start0
754878 if seq_blen < 0 and self .strict_bounds :
755879 raise FetchError ("Requested coordinates start={0:n} end={1:n} are "
756880 "invalid.\n " .format (start , end ))
757881 elif end > i .rlen and self .strict_bounds :
882+ if self ._bgzf :
883+ raise FetchError ("Requested end coordinate {0:n} outside of {1}. "
884+ "strict_bounds=False is incompatible with BGZF compressed files."
885+ "\n " .format (end , rname ))
758886 raise FetchError ("Requested end coordinate {0:n} outside of {1}. "
759887 "\n " .format (end , rname ))
760888
761889 with self .lock :
762- if self ._bgzf : # We can't add to virtual offsets, so we need to read from the beginning of the record and trim the beginning if needed
763- self .file .seek (i .offset )
764- chunk = start0 + (newlines_before * newline_blen ) + (newlines_inside * newline_blen ) + seq_len
765- chunk_seq = self .file .read (chunk ).decode ()
766- seq = chunk_seq [start0 + newlines_before :]
890+ if self ._bgzf :
891+ from Bio import bgzf
892+ insertion_i = bisect .bisect_left (self .gzi_index , bstart )
893+ if insertion_i == 0 : # bisect_left already returns index to the left if values are the same
894+ start_block = self .gzi_index [insertion_i ]
895+ else :
896+ start_block = self .gzi_index [insertion_i - 1 ]
897+ within_block_offset = bstart - start_block .ustart
898+ self .file .seek (bgzf .make_virtual_offset (start_block .cstart , within_block_offset ))
767899 else :
768900 self .file .seek (bstart )
769901
770- # If the requested sequence exceeds len(FastaRecord), return as much as possible
771- if bstart + seq_blen > i .bend and not self .strict_bounds :
772- seq_blen = i .bend - bstart
773- # Otherwise it should be safe to read the sequence
774- if seq_blen > 0 :
775- seq = self .file .read (seq_blen ).decode ()
776- # If the requested sequence is negative, we will pad the empty string with default_seq.
777- # This was changed to support #155 with strict_bounds=True.
778- elif seq_blen <= 0 :
779- seq = ''
902+ # If the requested sequence exceeds len(FastaRecord), return as much as possible
903+ if bstart + seq_blen > i .bend and not self .strict_bounds :
904+ seq_blen = i .bend - bstart
905+ # Otherwise it should be safe to read the sequence
906+ if seq_blen > 0 :
907+ seq = self .file .read (seq_blen ).decode ()
908+ # If the requested sequence is negative, we will pad the empty string with default_seq.
909+ # This was changed to support #155 with strict_bounds=True.
910+ elif seq_blen <= 0 :
911+ seq = ''
780912
781913 if not internals :
782914 return seq .replace ('\n ' , '' ).replace ('\r ' , '' )
@@ -1446,8 +1578,7 @@ def get_valid_filename(s):
14461578 'chromosome_6.fa'
14471579 """
14481580 s = str (s ).strip ().replace (' ' , '_' )
1449- return re .sub (r'(?u)[^-\w.]' , '' , s )
1450-
1581+ return re .sub (r'(?u)[^-\w.]' , '' , s )
14511582
14521583if __name__ == "__main__" :
14531584 import doctest
0 commit comments