Implement fsspec in place of open #168

Closed · wants to merge 7 commits
2 changes: 1 addition & 1 deletion .travis.yml
@@ -11,7 +11,7 @@ python:
- 'pypy'
- 'pypy3'
install:
- pip install cython pysam requests coverage pyfasta pyvcf numpy
- pip install cython pysam requests coverage pyfasta pyvcf numpy fsspec
- pip install biopython || true
- python setup.py install
- if [ ! -f samtools-1.2 ]; then curl -sL https://github.com/samtools/samtools/releases/download/1.2/samtools-1.2.tar.bz2 | tar -xjv; fi
1 change: 0 additions & 1 deletion dev-requirements.txt
@@ -2,4 +2,3 @@ six
nose
biopython
setuptools >= 0.7
mock; python_version < '3.3'
89 changes: 39 additions & 50 deletions pyfaidx/__init__.py
@@ -15,6 +15,7 @@
from math import ceil
from os.path import getmtime
from threading import Lock
import fsspec

from six import PY2, PY3, integer_types, string_types
from six.moves import zip_longest
@@ -323,6 +324,7 @@ def __init__(self,
strict_bounds=False,
read_ahead=None,
mutable=False,
compression='infer',
split_char=None,
duplicate_action="stop",
filt_function=lambda x: True,
@@ -340,47 +342,18 @@
Default: False (i.e. return a Sequence() object).
"""
self.filename = filename
self._fasta_opener = fsspec.open
self._compression = compression

if filename.lower().endswith('.bgz') or filename.lower().endswith(
'.gz'):
# Only try to import Bio if we actually need the bgzf reader.
try:
from Bio import bgzf
from Bio import __version__ as bgzf_version
from distutils.version import LooseVersion
if LooseVersion(bgzf_version) < LooseVersion('1.73'):
raise ImportError
except ImportError:
raise ImportError(
"BioPython >= 1.73 must be installed to read block gzip files.")
else:
self._fasta_opener = bgzf.open
self._bgzf = True
elif filename.lower().endswith('.bz2') or filename.lower().endswith(
'.zip'):
raise UnsupportedCompressionFormat(
"Compressed FASTA is only supported in BGZF format. Use "
"bgzip to compresss your FASTA.")
else:
self._fasta_opener = open
self._bgzf = False
self._bgzf = False

try:
self.file = self._fasta_opener(filename, 'r+b'
if mutable else 'rb')
except (ValueError, IOError) as e:
if str(e).find('BGZF') > -1:
raise UnsupportedCompressionFormat(
"Compressed FASTA is only supported in BGZF format. Use "
"the samtools bgzip utility (instead of gzip) to "
"compress your FASTA.")
else:
raise FastaNotFoundError(
"Cannot read FASTA file %s" % filename)
# use the open() method instead of the more standard context manager
self.file = self._fasta_opener(filename, 'r+' if mutable else 'r', compression=self._compression).open()

self.indexname = filename + '.fai'
self.read_long_names = read_long_names
self.key_function = key_function

try:
key_fn_test = self.key_function(
"TestingReturnType of_key_function")
@@ -390,6 +363,7 @@ def __init__(self,
format(type(key_fn_test)))
except Exception as e:
pass

self.filt_function = filt_function
assert duplicate_action in ("stop", "first", "last", "longest",
"shortest", "drop")
@@ -514,8 +488,8 @@ def read_fai(self):

def build_index(self):
try:
with self._fasta_opener(self.filename, 'rb') as fastafile:
with open(self.indexname, 'w') as indexfile:
with self._fasta_opener(self.filename, 'r', compression=self._compression) as fastafile:
Owner commented:
@hardingnj The mode here should be 'rb' if possible so that later code can see the line terminator characters.
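A hedged sketch of that suggestion (the file name is illustrative, not from this PR): opening in binary mode preserves the line terminators, so `len(line)` is the true on-disk byte length, and characters are counted on a decoded copy.

```python
import fsspec

# Hypothetical: binary mode keeps '\r\n' intact, so byte offsets stay correct.
with fsspec.open("example.fa.gz", mode="rb", compression="infer") as fastafile:
    for line in fastafile:
        line_blen = len(line)                          # bytes, newline included
        line_clen = len(line.decode().rstrip("\r\n"))  # characters, newline stripped
```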

with fsspec.open(self.indexname, 'w') as indexfile:
rname = None # reference sequence name
offset = 0 # binary offset of end of current line
rlen = 0 # reference character length
Expand All @@ -527,7 +501,6 @@ def build_index(self):
lastline = None
for i, line in enumerate(fastafile):
line_blen = len(line)
line = line.decode()
line_clen = len(line.rstrip('\n\r'))
lastline = i
# write an index line
@@ -550,9 +523,10 @@
rlen = 0
clen = 0
bad_lines = []
try: # must catch empty deflines (actually these might be okay: https://github.com/samtools/htslib/pull/258)
rname = line.rstrip('\n\r')[1:].split()[
0] # duplicates are detected with read_fai
try:
# must catch empty deflines
# (actually these might be okay: https://github.com/samtools/htslib/pull/258)
rname = line.rstrip('\n\r')[1:].split()[0] # duplicates are detected with read_fai
except IndexError:
raise FastaIndexingError(
"Bad sequence name %s at line %s." %
@@ -675,7 +649,7 @@ def from_file(self, rname, start, end, internals=False):
if self._bgzf: # We can't add to virtual offsets, so we need to read from the beginning of the record and trim the beginning if needed
self.file.seek(i.offset)
chunk = start0 + newlines_before + newlines_inside + seq_len
chunk_seq = self.file.read(chunk).decode()
chunk_seq = self.file.read(chunk)
seq = chunk_seq[start0 + newlines_before:]
else:
self.file.seek(bstart)
@@ -685,7 +659,7 @@
seq_blen = i.bend - bstart
# Otherwise it should be safe to read the sequence
if seq_blen > 0:
seq = self.file.read(seq_blen).decode()
seq = self.file.read(seq_blen)
# If the requested sequence is negative, we will pad the empty string with default_seq.
# This was changed to support #155 with strict_bounds=True.
elif seq_blen <= 0:
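For context on the removed `.decode()` calls in this hunk: with the handle opened in text mode as on this branch, reads already return `str` rather than `bytes`. A tiny hypothetical check (the path is illustrative):

```python
import fsspec

# Hypothetical: text-mode fsspec reads return str, so .decode() is unnecessary.
with fsspec.open("example.fa", mode="r", compression="infer") as fh:
    chunk = fh.read(60)
    assert isinstance(chunk, str)
```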
Expand Down Expand Up @@ -742,15 +716,15 @@ def to_file(self, rname, start, end, seq):
newline_char = '\n'
self.file.seek(internals['bstart'])
if internals['newlines_inside'] == 0:
self.file.write(seq.encode())
self.file.write(seq)
elif internals['newlines_inside'] > 0:
n = 0
m = file_seq.index(newline_char)
while m < len(seq):
self.file.write(''.join([seq[n:m], newline_char]).encode())
self.file.write(''.join([seq[n:m], newline_char]))
n = m
m += line_len
self.file.write(seq[n:].encode())
self.file.write(seq[n:])
self.file.flush()

def get_long_name(self, rname):
@@ -766,7 +740,7 @@ def _long_name_from_index_record(self, index_record):
prev_bend = index_record.prev_bend
defline_end = index_record.offset
self.file.seek(prev_bend)
return self.file.read(defline_end - prev_bend).decode()[1:-1]
return self.file.read(defline_end - prev_bend)[1:-1]

def _long_name_from_bgzf(self, index_record):
""" Return the full sequence defline and description. Internal method passing IndexRecord
@@ -779,7 +753,7 @@ def _long_name_from_bgzf(self, index_record):
self.file.seek(prev_bend)
defline = []
while True:
chunk = self.file.read(4096).decode()
chunk = self.file.read(4096)
defline.append(chunk)
if '\n' in chunk or '\r' in chunk:
break
@@ -866,6 +840,10 @@ def __repr__(self):
def __len__(self):
return self._fa.faidx.index[self.name].rlen

def __array__(self):
import numpy as np
return np.array([val for record in self for val in record.seq])
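A hedged usage note for the `__array__` hook added above (the file and record names are made up): NumPy can now coerce a `FastaRecord` directly, producing one single-character element per base.

```python
import numpy as np
from pyfaidx import Fasta

# Hypothetical usage of the __array__ protocol on a FastaRecord.
fasta = Fasta("test.fa")        # illustrative file name
arr = np.array(fasta["chr1"])   # shape (record length,), one character per base
print(arr.shape, arr[:5])
```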

@property
def unpadded_len(self):
""" Returns the length of the contig without 5' and 3' N padding.
@@ -930,14 +908,14 @@ def __array_interface__(self):
'shape': (len(self), ),
'typestr': '|S1',
'version': 3,
'data': buffer(str(self).encode('ascii'))
'data': buffer(str(self))
}


class MutableFastaRecord(FastaRecord):
def __init__(self, name, fa):
super(MutableFastaRecord, self).__init__(name, fa)
if self._fa.faidx._fasta_opener != open:
if self._fa.faidx._bgzf:
raise UnsupportedCompressionFormat(
"BGZF compressed FASTA is not supported for MutableFastaRecord. "
"Please decompress your FASTA file.")
@@ -1021,6 +999,17 @@ def __getitem__(self, rname):
if isinstance(rname, integer_types):
rname = next(islice(self.records.keys(), rname, None))
try:
#
Author commented:
cruft- crept in when trying to debug!

# x = self.records[rname]
# print(rname)
# # print(x)
# # print(type(x))
# print(len(x))
# #print(x[0])
# print(list(x))
# import numpy as np
# y = np.array(x)
# print(y.shape)
return self.records[rname]
except KeyError:
raise KeyError("{0} not in {1}.".format(rname, self.filename))