Skip to content

Commit de5cdf8

Browse files
authored
Merge pull request #164 from mdshw5/samtools_bgzf_compatibility
Support for constructing and using GZI format files for BGZF compressed FASTA
2 parents e70abc5 + ad3d878 commit de5cdf8

File tree

6 files changed

+322
-33
lines changed

6 files changed

+322
-33
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,6 @@ tests/data/chr22*
1313
.eggs/
1414
tests/data/genes.fasta.gz
1515
*~
16+
tests/data/genes.fasta.lower
17+
tests/data/genes.fasta
18+
tests/data/issue_141.fasta

pyfaidx/__init__.py

100644100755
Lines changed: 157 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44
"""
55

66
import datetime
7+
import bisect
78
import os
89
import re
910
import string
11+
import struct
1012
import sys
1113
import shutil
1214
import warnings
@@ -163,7 +165,6 @@ def __getitem__(self, n):
163165
slice_stop = 0
164166
start = self_end - slice_stop
165167
end = self_start + slice_stop
166-
#print(locals())
167168
else:
168169
start = self_start + slice_start
169170
end = self_start + slice_stop + correction_factor
@@ -345,6 +346,39 @@ def __len__(self):
345346
return self.rlen
346347

347348

349+
class BgzfBlock(namedtuple('BgzfBlock', ['cstart', 'clen', 'ustart', 'ulen'])):
350+
351+
__slots__ = ()
352+
353+
def __getitem__(self, key):
354+
if type(key) == str:
355+
return getattr(self, key)
356+
return tuple.__getitem__(self, key)
357+
358+
def as_bytes(self):
359+
return struct.pack('<QQ', self.cstart, self.ustart)
360+
361+
def __lt__(self, other):
362+
"""Compare BgzfBlock ustart to an integer, or to another BgzfBlock by ustart."""
363+
if isinstance(other, BgzfBlock):
364+
return self.ustart < other.ustart
365+
elif isinstance(other, (int, float)):
366+
return self.ustart < other
367+
return NotImplemented
368+
369+
def __len__(self):
370+
return self.ulen
371+
372+
@property
373+
def empty(self):
374+
"""Check for the EOF marker, which is an empty BGZF block.
375+
https://github.com/biopython/biopython/blob/master/Bio/bgzf.py#L171"""
376+
if self.ulen == 0:
377+
return True
378+
else:
379+
return False
380+
381+
348382
class Faidx(object):
349383
""" A python implementation of samtools faidx FASTA indexing """
350384

@@ -373,6 +407,33 @@ def __init__(self,
373407
as_raw: optional parameter to specify whether to return sequences as a
374408
Sequence() object or as a raw string.
375409
Default: False (i.e. return a Sequence() object).
410+
strict_bounds: if True, will raise FetchError if the requested region
411+
is outside the bounds of the sequence.
412+
read_ahead: number of bases to read ahead when fetching a sequence.
413+
mutable: if True, the Faidx object will be mutable, allowing
414+
modification of the FASTA file in place. This is not supported
415+
for BGZF files, and will raise an UnsupportedCompressionFormat
416+
exception if the file is compressed.
417+
split_char: if not None, the keys will be split on this character
418+
duplicate_action: how to handle duplicate keys in the index. Options are:
419+
"stop" (default): raise ValueError if a duplicate key is found
420+
"first": keep the first occurrence of the key
421+
"last": keep the last occurrence of the key
422+
"longest": keep the longest sequence for the key
423+
"shortest": keep the shortest sequence for the key
424+
"drop": drop the duplicate key
425+
filt_function: a function that filters the keys returned by key_function.
426+
The function should take a single argument (the key) and return
427+
True if the key should be included, or False if it should be excluded.
428+
one_based_attributes: if True, the start and end attributes of the
429+
Sequence object will be one-based (i.e. start=1, end=length).
430+
read_long_names: if True, the index will use the full sequence name
431+
(including any whitespace) as the key. If False, the sequence name
432+
will be split on whitespace and the first part will be used as the key.
433+
sequence_always_upper: if True, the sequence will always be returned
434+
in uppercase, regardless of the case in the FASTA file.
435+
rebuild: if True, the index will be rebuilt if it is stale or does not exist.
436+
build_index: if True, the index will be built if it does not exist or is stale.
376437
"""
377438

378439
if fsspec and isinstance(filename, fsspec.core.OpenFile):
@@ -412,6 +473,7 @@ def __init__(self,
412473
raise TypeError("indexname expected NoneType, str, os.PathLike or fsspec.OpenFile, got: %r" % indexname)
413474

414475
if self.filename.lower().endswith(('.bgz', '.gz')):
476+
self.gzi_indexname = self.filename + '.gzi'
415477
# Only try to import Bio if we actually need the bgzf reader.
416478
try:
417479
from Bio import bgzf
@@ -457,10 +519,6 @@ def __init__(self,
457519
self.duplicate_action = duplicate_action
458520
self.as_raw = as_raw
459521
self.default_seq = default_seq
460-
if self._bgzf and self.default_seq is not None:
461-
raise FetchError(
462-
"The default_seq argument is not supported with using BGZF compression. Please decompress your FASTA file and try again."
463-
)
464522
if self._bgzf:
465523
self.strict_bounds = True
466524
else:
@@ -481,6 +539,7 @@ def __init__(self,
481539
self.mutable = mutable
482540
with self.lock: # lock around index generation so only one thread calls method
483541

542+
# Check if .fai index exists
484543
if self._fai_fs is None:
485544
index_exists = os.path.exists(self.indexname)
486545
else:
@@ -497,6 +556,19 @@ def __init__(self,
497556
else:
498557
index_is_stale = False
499558

559+
# If BGZF and .gzi is missing, force rebuild of .fai before .gzi
560+
if self._bgzf and not os.path.exists(self.gzi_indexname):
561+
# Always rebuild .fai if .gzi is missing for BGZF
562+
try:
563+
self.build_index()
564+
except FastaIndexingError:
565+
self.file.close()
566+
raise
567+
# After .fai is rebuilt, continue to build .gzi below
568+
index_exists = True
569+
index_is_stale = False
570+
571+
# Usual .fai build logic
500572
if (
501573
build_index
502574
and (not index_exists or (index_is_stale and rebuild))
@@ -507,6 +579,17 @@ def __init__(self,
507579
self.file.close()
508580
raise
509581

582+
# BGZF: handle .gzi index
583+
if self._bgzf:
584+
if os.path.exists(self.gzi_indexname) and getmtime(self.gzi_indexname) >= getmtime(self.gzi_indexname):
585+
self.read_gzi()
586+
elif os.path.exists(self.gzi_indexname) and getmtime(self.gzi_indexname) < getmtime(self.gzi_indexname) and not rebuild:
587+
self.read_gzi()
588+
warnings.warn("BGZF Index file {0} is older than FASTA file {1}.".format(self.gzi_indexname, self.filename), RuntimeWarning)
589+
else:
590+
self.build_gzi()
591+
self.write_gzi()
592+
510593
try:
511594
self.read_fai()
512595
except Exception:
@@ -634,8 +717,7 @@ def build_index(self):
634717
"Bad sequence name %s at line %s." %
635718
(line.rstrip('\n\r'), str(i)))
636719
offset += line_blen
637-
thisoffset = fastafile.tell(
638-
) if self._bgzf else offset
720+
thisoffset = offset
639721
else: # check line and advance offset
640722
if not blen:
641723
blen = line_blen
@@ -683,6 +765,48 @@ def build_index(self):
683765
% self.indexname)
684766
elif isinstance(e, FastaIndexingError):
685767
raise e
768+
769+
def build_gzi(self):
770+
""" Build the htslib .gzi index format """
771+
from Bio import bgzf
772+
with open(self.filename, 'rb') as bgzf_file:
773+
self.gzi_index = []
774+
for i, values in enumerate(bgzf.BgzfBlocks(bgzf_file)):
775+
self.gzi_index.append(BgzfBlock(*values))
776+
# htslib expects the last block to be an empty block, which is the EOF marker,
777+
# and discards it from the index before writing it to disk.
778+
eof = self.gzi_index.pop()
779+
if not eof.empty:
780+
raise IOError("BGZF EOF marker not found. File %s is not a valid BGZF file." % self.filename)
781+
# htslib discards the first block which has cstart=0, ustart=0, and ulen=0.
782+
# https://github.com/samtools/htslib/blob/d677f345fe35d451587319ca38ac611862a46e1b/bgzf.c#L2401
783+
first_block = self.gzi_index.pop(0)
784+
785+
def write_gzi(self):
786+
""" Write the on disk format for the htslib .gzi index
787+
https://github.com/samtools/htslib/issues/473
788+
See note about file format in
789+
https://github.com/samtools/htslib/blob/d677f345fe35d451587319ca38ac611862a46e1b/bgzf.c#L2382-L2384
790+
"""
791+
with open(self.gzi_indexname, 'wb') as bzi_file:
792+
bzi_file.write(struct.pack('<Q', len(self.gzi_index)))
793+
for block in self.gzi_index:
794+
bzi_file.write(block.as_bytes())
795+
796+
def read_gzi(self):
797+
""" Read the on disk format for the htslib .gzi index
798+
https://github.com/samtools/htslib/issues/473"""
799+
from ctypes import c_uint64, sizeof
800+
with open(self.gzi_indexname, 'rb') as bzi_file:
801+
number_of_blocks = struct.unpack('<Q', bzi_file.read(sizeof(c_uint64)))[0]
802+
# htslib discards the first block which has cstart=0, ustart=0, and ulen=0.
803+
self.gzi_index = [BgzfBlock(0, None, 0, None)]
804+
for i in range(number_of_blocks):
805+
cstart, ustart = struct.unpack('<QQ', bzi_file.read(sizeof(c_uint64) * 2))
806+
if cstart == '' or ustart == '':
807+
raise IndexError("Unexpected end of .gzi file. ")
808+
else:
809+
self.gzi_index.append(BgzfBlock(cstart, None, ustart, None))
686810

687811
def write_fai(self):
688812
with self.lock:
@@ -749,34 +873,42 @@ def from_file(self, rname, start, end, internals=False):
749873
newlines_to_end = int(end / i.lenc) if i.lenc else 0
750874
newlines_inside = newlines_to_end - newlines_before
751875
newline_blen = i.lenb - i.lenc
752-
seq_blen = newlines_inside * newline_blen + seq_len
753-
bstart = i.offset + newlines_before * newline_blen + start0
876+
seq_blen = (newlines_inside * newline_blen) + seq_len
877+
bstart = i.offset + (newlines_before * newline_blen) + start0
754878
if seq_blen < 0 and self.strict_bounds:
755879
raise FetchError("Requested coordinates start={0:n} end={1:n} are "
756880
"invalid.\n".format(start, end))
757881
elif end > i.rlen and self.strict_bounds:
882+
if self._bgzf:
883+
raise FetchError("Requested end coordinate {0:n} outside of {1}. "
884+
"strict_bounds=False is incompatible with BGZF compressed files."
885+
"\n".format(end, rname))
758886
raise FetchError("Requested end coordinate {0:n} outside of {1}. "
759887
"\n".format(end, rname))
760888

761889
with self.lock:
762-
if self._bgzf: # We can't add to virtual offsets, so we need to read from the beginning of the record and trim the beginning if needed
763-
self.file.seek(i.offset)
764-
chunk = start0 + (newlines_before * newline_blen) + (newlines_inside * newline_blen) + seq_len
765-
chunk_seq = self.file.read(chunk).decode()
766-
seq = chunk_seq[start0 + newlines_before:]
890+
if self._bgzf:
891+
from Bio import bgzf
892+
insertion_i = bisect.bisect_left(self.gzi_index, bstart)
893+
if insertion_i == 0: # bisect_left already returns index to the left if values are the same
894+
start_block = self.gzi_index[insertion_i]
895+
else:
896+
start_block = self.gzi_index[insertion_i - 1]
897+
within_block_offset = bstart - start_block.ustart
898+
self.file.seek(bgzf.make_virtual_offset(start_block.cstart, within_block_offset))
767899
else:
768900
self.file.seek(bstart)
769901

770-
# If the requested sequence exceeds len(FastaRecord), return as much as possible
771-
if bstart + seq_blen > i.bend and not self.strict_bounds:
772-
seq_blen = i.bend - bstart
773-
# Otherwise it should be safe to read the sequence
774-
if seq_blen > 0:
775-
seq = self.file.read(seq_blen).decode()
776-
# If the requested sequence is negative, we will pad the empty string with default_seq.
777-
# This was changed to support #155 with strict_bounds=True.
778-
elif seq_blen <= 0:
779-
seq = ''
902+
# If the requested sequence exceeds len(FastaRecord), return as much as possible
903+
if bstart + seq_blen > i.bend and not self.strict_bounds:
904+
seq_blen = i.bend - bstart
905+
# Otherwise it should be safe to read the sequence
906+
if seq_blen > 0:
907+
seq = self.file.read(seq_blen).decode()
908+
# If the requested sequence is negative, we will pad the empty string with default_seq.
909+
# This was changed to support #155 with strict_bounds=True.
910+
elif seq_blen <= 0:
911+
seq = ''
780912

781913
if not internals:
782914
return seq.replace('\n', '').replace('\r', '')
@@ -1446,8 +1578,7 @@ def get_valid_filename(s):
14461578
'chromosome_6.fa'
14471579
"""
14481580
s = str(s).strip().replace(' ', '_')
1449-
return re.sub(r'(?u)[^-\w.]', '', s)
1450-
1581+
return re.sub(r'(?u)[^-\w.]', '', s)
14511582

14521583
if __name__ == "__main__":
14531584
import doctest

pyfaidx/cli.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
#!/usr/bin/env python
2+
23
import argparse
34
import sys
45
import os.path
56
import re
67
from pyfaidx import Fasta, wrap_sequence, FetchError, ucsc_split, bed_split, get_valid_filename
78
from collections import defaultdict
9+
from operator import itemgetter
10+
from heapq import nlargest
11+
from itertools import repeat
812

913
def detect_fasta_newline(filepath):
1014
"""Detect the newline style used in a FASTA file by reading the first non-header line."""
@@ -118,7 +122,7 @@ def mask_sequence(args):
118122
span = len(fasta[rname][start:end])
119123
fasta[rname][start:end] = span * args.default_seq
120124
elif args.mask_by_case:
121-
fasta[rname][start:end] = fasta[rname][start:end].lowercase()
125+
fasta[rname][start:end] = str(fasta[rname][start:end]).lower()
122126

123127

124128
def split_regions(args):
@@ -153,7 +157,13 @@ def transform_sequence(args, fasta, name, start=None, end=None):
153157
C = nucs.pop('C', 0)
154158
G = nucs.pop('G', 0)
155159
N = nucs.pop('N', 0)
156-
others = '|'.join([':'.join((k, str(v))) for k, v in nucs.items()])
160+
# If there are other nucleotides, we will report them as well
161+
# Set others to 0 if no other nucleotides are present
162+
if not nucs:
163+
others = 0
164+
else:
165+
others = '|'.join([':'.join((k, str(v))) for k, v in nucs.items()])
166+
# Return the nucleotide counts in a tab-separated format
157167
return '{sname}\t{sstart}\t{send}\t{A}\t{T}\t{C}\t{G}\t{N}\t{others}\n'.format(sname=s.name, sstart=s.start, send=s.end, **locals())
158168
elif args.transform == 'transposed':
159169
return '{name}\t{start}\t{end}\t{seq}\n'.format(name=s.name, start=s.start, end=s.end, seq=str(s))
@@ -252,13 +262,15 @@ def __init__(self, iterable=None, **kwds):
252262
def __missing__(self, key):
253263
return 0
254264

265+
255266
def most_common(self, n=None):
256267
'''List the n most common elements and their counts from the most
257268
common to the least. If n is None, then list all element counts.
258269
'''
270+
items = self.items()
259271
if n is None:
260-
return sorted(self.iteritems(), key=itemgetter(1), reverse=True)
261-
return nlargest(n, self.iteritems(), key=itemgetter(1))
272+
return sorted(items, key=itemgetter(1), reverse=True)
273+
return nlargest(n, items, key=itemgetter(1))
262274

263275
def elements(self):
264276
'''Iterator over elements repeating each as many times as its count.
@@ -267,7 +279,7 @@ def elements(self):
267279
elements() will ignore it.
268280
269281
'''
270-
for elem, count in self.iteritems():
282+
for elem, count in self.items():
271283
for _ in repeat(None, count):
272284
yield elem
273285

tests/test_Fasta_bgzip.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ def remove_index():
2121
pass # some tests may delete this file
2222

2323
@pytest.mark.skipif(not bio, reason="Biopython is not installed.")
24-
@pytest.mark.xfail
2524
def test_build_issue_126(remove_index):
2625
""" Samtools BGZF index should be identical to pyfaidx BGZF index """
2726
expect_index = (

0 commit comments

Comments
 (0)