Implement fsspec in place of open #168

Closed · wants to merge 7 commits
2 changes: 1 addition & 1 deletion .travis.yml
@@ -11,7 +11,7 @@ python:
- 'pypy'
- 'pypy3'
install:
- pip install cython pysam requests coverage pyfasta pyvcf numpy
- pip install cython pysam requests coverage pyfasta pyvcf numpy fsspec
- pip install biopython || true
- python setup.py install
- if [ ! -f samtools-1.2 ]; then curl -sL https://github.com/samtools/samtools/releases/download/1.2/samtools-1.2.tar.bz2 | tar -xjv; fi
1 change: 0 additions & 1 deletion dev-requirements.txt
@@ -2,4 +2,3 @@ six
nose
biopython
setuptools >= 0.7
mock; python_version < '3.3'
89 changes: 39 additions & 50 deletions pyfaidx/__init__.py
@@ -15,6 +15,7 @@
from math import ceil
from os.path import getmtime
from threading import Lock
import fsspec

from six import PY2, PY3, integer_types, string_types
from six.moves import zip_longest
@@ -323,6 +324,7 @@ def __init__(self,
strict_bounds=False,
read_ahead=None,
mutable=False,
compression='infer',
split_char=None,
duplicate_action="stop",
filt_function=lambda x: True,
@@ -340,47 +342,18 @@
Default: False (i.e. return a Sequence() object).
"""
self.filename = filename
self._fasta_opener = fsspec.open
self._compression = compression

if filename.lower().endswith('.bgz') or filename.lower().endswith(
'.gz'):
# Only try to import Bio if we actually need the bgzf reader.
try:
from Bio import bgzf
from Bio import __version__ as bgzf_version
from distutils.version import LooseVersion
if LooseVersion(bgzf_version) < LooseVersion('1.73'):
raise ImportError
except ImportError:
raise ImportError(
"BioPython >= 1.73 must be installed to read block gzip files.")
else:
self._fasta_opener = bgzf.open
self._bgzf = True
elif filename.lower().endswith('.bz2') or filename.lower().endswith(
'.zip'):
raise UnsupportedCompressionFormat(
"Compressed FASTA is only supported in BGZF format. Use "
"bgzip to compresss your FASTA.")
else:
self._fasta_opener = open
self._bgzf = False
self._bgzf = False

try:
self.file = self._fasta_opener(filename, 'r+b'
if mutable else 'rb')
except (ValueError, IOError) as e:
if str(e).find('BGZF') > -1:
raise UnsupportedCompressionFormat(
"Compressed FASTA is only supported in BGZF format. Use "
"the samtools bgzip utility (instead of gzip) to "
"compress your FASTA.")
else:
raise FastaNotFoundError(
"Cannot read FASTA file %s" % filename)
# use the open() method instead of the more standard context manager
self.file = self._fasta_opener(filename, 'r+' if mutable else 'r', compression=self._compression).open()

self.indexname = filename + '.fai'
self.read_long_names = read_long_names
self.key_function = key_function

try:
key_fn_test = self.key_function(
"TestingReturnType of_key_function")
@@ -390,6 +363,7 @@ def __init__(self,
format(type(key_fn_test)))
except Exception as e:
pass

self.filt_function = filt_function
assert duplicate_action in ("stop", "first", "last", "longest",
"shortest", "drop")
@@ -514,8 +488,8 @@ def read_fai(self):

def build_index(self):
try:
with self._fasta_opener(self.filename, 'rb') as fastafile:
with open(self.indexname, 'w') as indexfile:
with self._fasta_opener(self.filename, 'r', compression=self._compression) as fastafile:
Owner commented:
@hardingnj The mode here should be 'rb' if possible so that later code can see the line terminator characters.
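A hedged sketch of that suggestion (the file name is illustrative, not from this PR): opening in binary mode preserves the line terminators, so `len(line)` is the true on-disk byte length, and characters are counted on a decoded copy.

```python
import fsspec

# Hypothetical: binary mode keeps '\r\n' intact, so byte offsets stay correct.
with fsspec.open("example.fa.gz", mode="rb", compression="infer") as fastafile:
    for line in fastafile:
        line_blen = len(line)                          # bytes, newline included
        line_clen = len(line.decode().rstrip("\r\n"))  # characters, newline stripped
```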

with fsspec.open(self.indexname, 'w') as indexfile:
rname = None # reference sequence name
offset = 0 # binary offset of end of current line
rlen = 0 # reference character length
Expand All @@ -527,7 +501,6 @@ def build_index(self):
lastline = None
for i, line in enumerate(fastafile):
line_blen = len(line)
line = line.decode()
line_clen = len(line.rstrip('\n\r'))
lastline = i
# write an index line
@@ -550,9 +523,10 @@
rlen = 0
clen = 0
bad_lines = []
try: # must catch empty deflines (actually these might be okay: https://github.com/samtools/htslib/pull/258)
rname = line.rstrip('\n\r')[1:].split()[
0] # duplicates are detected with read_fai
try:
# must catch empty deflines
# (actually these might be okay: https://github.com/samtools/htslib/pull/258)
rname = line.rstrip('\n\r')[1:].split()[0] # duplicates are detected with read_fai
except IndexError:
raise FastaIndexingError(
"Bad sequence name %s at line %s." %
@@ -675,7 +649,7 @@ def from_file(self, rname, start, end, internals=False):
if self._bgzf: # We can't add to virtual offsets, so we need to read from the beginning of the record and trim the beginning if needed
self.file.seek(i.offset)
chunk = start0 + newlines_before + newlines_inside + seq_len
chunk_seq = self.file.read(chunk).decode()
chunk_seq = self.file.read(chunk)
seq = chunk_seq[start0 + newlines_before:]
else:
self.file.seek(bstart)
@@ -685,7 +659,7 @@
seq_blen = i.bend - bstart
# Otherwise it should be safe to read the sequence
if seq_blen > 0:
seq = self.file.read(seq_blen).decode()
seq = self.file.read(seq_blen)
# If the requested sequence is negative, we will pad the empty string with default_seq.
# This was changed to support #155 with strict_bounds=True.
elif seq_blen <= 0:
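For context on the removed `.decode()` calls in this hunk: with the handle opened in text mode as on this branch, reads already return `str` rather than `bytes`. A tiny hypothetical check (the path is illustrative):

```python
import fsspec

# Hypothetical: text-mode fsspec reads return str, so .decode() is unnecessary.
with fsspec.open("example.fa", mode="r", compression="infer") as fh:
    chunk = fh.read(60)
    assert isinstance(chunk, str)
```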
Expand Down Expand Up @@ -742,15 +716,15 @@ def to_file(self, rname, start, end, seq):
newline_char = '\n'
self.file.seek(internals['bstart'])
if internals['newlines_inside'] == 0:
self.file.write(seq.encode())
self.file.write(seq)
elif internals['newlines_inside'] > 0:
n = 0
m = file_seq.index(newline_char)
while m < len(seq):
self.file.write(''.join([seq[n:m], newline_char]).encode())
self.file.write(''.join([seq[n:m], newline_char]))
n = m
m += line_len
self.file.write(seq[n:].encode())
self.file.write(seq[n:])
self.file.flush()

def get_long_name(self, rname):
@@ -766,7 +740,7 @@ def _long_name_from_index_record(self, index_record):
prev_bend = index_record.prev_bend
defline_end = index_record.offset
self.file.seek(prev_bend)
return self.file.read(defline_end - prev_bend).decode()[1:-1]
return self.file.read(defline_end - prev_bend)[1:-1]

def _long_name_from_bgzf(self, index_record):
""" Return the full sequence defline and description. Internal method passing IndexRecord
@@ -779,7 +753,7 @@ def _long_name_from_bgzf(self, index_record):
self.file.seek(prev_bend)
defline = []
while True:
chunk = self.file.read(4096).decode()
chunk = self.file.read(4096)
defline.append(chunk)
if '\n' in chunk or '\r' in chunk:
break
@@ -866,6 +840,10 @@ def __repr__(self):
def __len__(self):
return self._fa.faidx.index[self.name].rlen

def __array__(self):
import numpy as np
return np.array([val for record in self for val in record.seq])
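A hedged usage note for the `__array__` hook added above (the file and record names are made up): NumPy can now coerce a `FastaRecord` directly, producing one single-character element per base.

```python
import numpy as np
from pyfaidx import Fasta

# Hypothetical usage of the __array__ protocol on a FastaRecord.
fasta = Fasta("test.fa")        # illustrative file name
arr = np.array(fasta["chr1"])   # shape (record length,), one character per base
print(arr.shape, arr[:5])
```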

@property
def unpadded_len(self):
""" Returns the length of the contig without 5' and 3' N padding.
@@ -930,14 +908,14 @@ def __array_interface__(self):
'shape': (len(self), ),
'typestr': '|S1',
'version': 3,
'data': buffer(str(self).encode('ascii'))
'data': buffer(str(self))
}


class MutableFastaRecord(FastaRecord):
def __init__(self, name, fa):
super(MutableFastaRecord, self).__init__(name, fa)
if self._fa.faidx._fasta_opener != open:
if self._fa.faidx._bgzf:
raise UnsupportedCompressionFormat(
"BGZF compressed FASTA is not supported for MutableFastaRecord. "
"Please decompress your FASTA file.")
@@ -1021,6 +999,17 @@ def __getitem__(self, rname):
if isinstance(rname, integer_types):
rname = next(islice(self.records.keys(), rname, None))
try:
#
Author commented:
cruft- crept in when trying to debug!

# x = self.records[rname]
# print(rname)
# # print(x)
# # print(type(x))
# print(len(x))
# #print(x[0])
# print(list(x))
# import numpy as np
# y = np.array(x)
# print(y.shape)
return self.records[rname]
except KeyError:
raise KeyError("{0} not in {1}.".format(rname, self.filename))