-
Notifications
You must be signed in to change notification settings - Fork 74
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement fsspec in place of open #168
Closed
Closed
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
477db4d
Add fsspec for open
8a7de5b
Remove print statement
e0e9a38
Add conda compat
393804b
Add fsspec to ci
bc21ce3
Print resulting index to inspect
mdshw5 9679b97
Further work on fsspec
6b3af27
Merge branch 'implement_fsspec' of github.com:hardingnj/pyfaidx into …
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,4 +2,3 @@ six | |
nose | ||
biopython | ||
setuptools >= 0.7 | ||
mock; python_version < '3.3' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ | |
from math import ceil | ||
from os.path import getmtime | ||
from threading import Lock | ||
import fsspec | ||
|
||
from six import PY2, PY3, integer_types, string_types | ||
from six.moves import zip_longest | ||
|
@@ -323,6 +324,7 @@ def __init__(self, | |
strict_bounds=False, | ||
read_ahead=None, | ||
mutable=False, | ||
compression='infer', | ||
split_char=None, | ||
duplicate_action="stop", | ||
filt_function=lambda x: True, | ||
|
@@ -340,47 +342,18 @@ def __init__(self, | |
Default: False (i.e. return a Sequence() object). | ||
""" | ||
self.filename = filename | ||
self._fasta_opener = fsspec.open | ||
self._compression = compression | ||
|
||
if filename.lower().endswith('.bgz') or filename.lower().endswith( | ||
'.gz'): | ||
# Only try to import Bio if we actually need the bgzf reader. | ||
try: | ||
from Bio import bgzf | ||
from Bio import __version__ as bgzf_version | ||
from distutils.version import LooseVersion | ||
if LooseVersion(bgzf_version) < LooseVersion('1.73'): | ||
raise ImportError | ||
except ImportError: | ||
raise ImportError( | ||
"BioPython >= 1.73 must be installed to read block gzip files.") | ||
else: | ||
self._fasta_opener = bgzf.open | ||
self._bgzf = True | ||
elif filename.lower().endswith('.bz2') or filename.lower().endswith( | ||
'.zip'): | ||
raise UnsupportedCompressionFormat( | ||
"Compressed FASTA is only supported in BGZF format. Use " | ||
"bgzip to compresss your FASTA.") | ||
else: | ||
self._fasta_opener = open | ||
self._bgzf = False | ||
self._bgzf = False | ||
|
||
try: | ||
self.file = self._fasta_opener(filename, 'r+b' | ||
if mutable else 'rb') | ||
except (ValueError, IOError) as e: | ||
if str(e).find('BGZF') > -1: | ||
raise UnsupportedCompressionFormat( | ||
"Compressed FASTA is only supported in BGZF format. Use " | ||
"the samtools bgzip utility (instead of gzip) to " | ||
"compress your FASTA.") | ||
else: | ||
raise FastaNotFoundError( | ||
"Cannot read FASTA file %s" % filename) | ||
# use the open method instead of the more standard context | ||
self.file = self._fasta_opener(filename, 'r+' if mutable else 'r', compression=self._compression).open() | ||
|
||
self.indexname = filename + '.fai' | ||
self.read_long_names = read_long_names | ||
self.key_function = key_function | ||
|
||
try: | ||
key_fn_test = self.key_function( | ||
"TestingReturnType of_key_function") | ||
|
@@ -390,6 +363,7 @@ def __init__(self, | |
format(type(key_fn_test))) | ||
except Exception as e: | ||
pass | ||
|
||
self.filt_function = filt_function | ||
assert duplicate_action in ("stop", "first", "last", "longest", | ||
"shortest", "drop") | ||
|
@@ -514,8 +488,8 @@ def read_fai(self): | |
|
||
def build_index(self): | ||
try: | ||
with self._fasta_opener(self.filename, 'rb') as fastafile: | ||
with open(self.indexname, 'w') as indexfile: | ||
with self._fasta_opener(self.filename, 'r', compression=self._compression) as fastafile: | ||
with fsspec.open(self.indexname, 'w') as indexfile: | ||
rname = None # reference sequence name | ||
offset = 0 # binary offset of end of current line | ||
rlen = 0 # reference character length | ||
|
@@ -527,7 +501,6 @@ def build_index(self): | |
lastline = None | ||
for i, line in enumerate(fastafile): | ||
line_blen = len(line) | ||
line = line.decode() | ||
line_clen = len(line.rstrip('\n\r')) | ||
lastline = i | ||
# write an index line | ||
|
@@ -550,9 +523,10 @@ def build_index(self): | |
rlen = 0 | ||
clen = 0 | ||
bad_lines = [] | ||
try: # must catch empty deflines (actually these might be okay: https://github.com/samtools/htslib/pull/258) | ||
rname = line.rstrip('\n\r')[1:].split()[ | ||
0] # duplicates are detected with read_fai | ||
try: | ||
# must catch empty deflines | ||
# (actually these might be okay: https://github.com/samtools/htslib/pull/258) | ||
rname = line.rstrip('\n\r')[1:].split()[0] # duplicates are detected with read_fai | ||
except IndexError: | ||
raise FastaIndexingError( | ||
"Bad sequence name %s at line %s." % | ||
|
@@ -675,7 +649,7 @@ def from_file(self, rname, start, end, internals=False): | |
if self._bgzf: # We can't add to virtual offsets, so we need to read from the beginning of the record and trim the beginning if needed | ||
self.file.seek(i.offset) | ||
chunk = start0 + newlines_before + newlines_inside + seq_len | ||
chunk_seq = self.file.read(chunk).decode() | ||
chunk_seq = self.file.read(chunk) | ||
seq = chunk_seq[start0 + newlines_before:] | ||
else: | ||
self.file.seek(bstart) | ||
|
@@ -685,7 +659,7 @@ def from_file(self, rname, start, end, internals=False): | |
seq_blen = i.bend - bstart | ||
# Otherwise it should be safe to read the sequence | ||
if seq_blen > 0: | ||
seq = self.file.read(seq_blen).decode() | ||
seq = self.file.read(seq_blen) | ||
# If the requested sequence is negative, we will pad the empty string with default_seq. | ||
# This was changed to support #155 with strict_bounds=True. | ||
elif seq_blen <= 0: | ||
|
@@ -742,15 +716,15 @@ def to_file(self, rname, start, end, seq): | |
newline_char = '\n' | ||
self.file.seek(internals['bstart']) | ||
if internals['newlines_inside'] == 0: | ||
self.file.write(seq.encode()) | ||
self.file.write(seq) | ||
elif internals['newlines_inside'] > 0: | ||
n = 0 | ||
m = file_seq.index(newline_char) | ||
while m < len(seq): | ||
self.file.write(''.join([seq[n:m], newline_char]).encode()) | ||
self.file.write(''.join([seq[n:m], newline_char])) | ||
n = m | ||
m += line_len | ||
self.file.write(seq[n:].encode()) | ||
self.file.write(seq[n:]) | ||
self.file.flush() | ||
|
||
def get_long_name(self, rname): | ||
|
@@ -766,7 +740,7 @@ def _long_name_from_index_record(self, index_record): | |
prev_bend = index_record.prev_bend | ||
defline_end = index_record.offset | ||
self.file.seek(prev_bend) | ||
return self.file.read(defline_end - prev_bend).decode()[1:-1] | ||
return self.file.read(defline_end - prev_bend)[1:-1] | ||
|
||
def _long_name_from_bgzf(self, index_record): | ||
""" Return the full sequence defline and description. Internal method passing IndexRecord | ||
|
@@ -779,7 +753,7 @@ def _long_name_from_bgzf(self, index_record): | |
self.file.seek(prev_bend) | ||
defline = [] | ||
while True: | ||
chunk = self.file.read(4096).decode() | ||
chunk = self.file.read(4096) | ||
defline.append(chunk) | ||
if '\n' in chunk or '\r' in chunk: | ||
break | ||
|
@@ -866,6 +840,10 @@ def __repr__(self): | |
def __len__(self): | ||
return self._fa.faidx.index[self.name].rlen | ||
|
||
def __array__(self): | ||
import numpy as np | ||
return np.array([val for record in self for val in record.seq]) | ||
|
||
@property | ||
def unpadded_len(self): | ||
""" Returns the length of the contig without 5' and 3' N padding. | ||
|
@@ -930,14 +908,14 @@ def __array_interface__(self): | |
'shape': (len(self), ), | ||
'typestr': '|S1', | ||
'version': 3, | ||
'data': buffer(str(self).encode('ascii')) | ||
'data': buffer(str(self)) | ||
} | ||
|
||
|
||
class MutableFastaRecord(FastaRecord): | ||
def __init__(self, name, fa): | ||
super(MutableFastaRecord, self).__init__(name, fa) | ||
if self._fa.faidx._fasta_opener != open: | ||
if self._fa.faidx._bgzf: | ||
raise UnsupportedCompressionFormat( | ||
"BGZF compressed FASTA is not supported for MutableFastaRecord. " | ||
"Please decompress your FASTA file.") | ||
|
@@ -1021,6 +999,17 @@ def __getitem__(self, rname): | |
if isinstance(rname, integer_types): | ||
rname = next(islice(self.records.keys(), rname, None)) | ||
try: | ||
# | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. cruft- crept in when trying to debug! |
||
# x = self.records[rname] | ||
# print(rname) | ||
# # print(x) | ||
# # print(type(x)) | ||
# print(len(x)) | ||
# #print(x[0]) | ||
# print(list(x)) | ||
# import numpy as np | ||
# y = np.array(x) | ||
# print(y.shape) | ||
return self.records[rname] | ||
except KeyError: | ||
raise KeyError("{0} not in {1}.".format(rname, self.filename)) | ||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@hardingnj The mode here should be 'rb' if possible so that later code can see the line terminator characters.