Skip to content

Commit

Permalink
Merge pull request #11 from slowkoni/feature/mt-default-loop-true
Browse files Browse the repository at this point in the history
MT updates: 'N' removal now optional; loop allowed on rCRS/RSRS & synonyms.
  • Loading branch information
matstrand authored Apr 25, 2019
2 parents b033bfc + d92eabd commit 773659e
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 8 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ backward-compatibility then you may load it directly by accession (`NC_001807.4`
The rCRS mitochondria sequence contains an 'N' base at position 3106-3107 to
preserve legacy nucleotide numbering. This can be useful for using legacy
coordinates but but is impractical when working with sequences that are
expected to align to observed human mitochondrial sequences. SeqSeek removes this `N`:
expected to align to observed human mitochondrial sequences. SeqSeek
removes this `N` unless it is explicitly requested by passing `RCRS_N_remove=False`.

```python
Chromosome('MT').sequence(3106, 3107) # => ''
Expand Down
12 changes: 7 additions & 5 deletions seqseek/chromosome.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

from .exceptions import TooManyLoops, MissingDataError
from .lib import (BUILD37, BUILD38, get_data_directory, sorted_nicely,
BUILD37_ACCESSIONS, BUILD38_ACCESSIONS, ACCESSION_LENGTHS, RCRS_ACCESSION)
BUILD37_ACCESSIONS, BUILD38_ACCESSIONS, ACCESSION_LENGTHS,
RCRS_ACCESSION, MITOCHONDRIA_NAMES)


class Chromosome(object):
Expand All @@ -12,7 +13,7 @@ class Chromosome(object):
BUILD38: BUILD38_ACCESSIONS
}

def __init__(self, chromosome_name, assembly=BUILD37, loop=False):
def __init__(self, chromosome_name, assembly=BUILD37, loop=False, RCRS_N_remove=True):
"""
Usage:
Expand All @@ -28,6 +29,7 @@ def __init__(self, chromosome_name, assembly=BUILD37, loop=False):
self.name = str(chromosome_name)
self.assembly = assembly
self.loop = loop
self.RCRS_N_remove = RCRS_N_remove

self.validate_assembly()
self.validate_name()
Expand Down Expand Up @@ -56,7 +58,7 @@ def validate_name(self):
name=self.name))

def validate_loop(self):
if self.loop and self.name != 'MT':
if self.loop and self.name not in MITOCHONDRIA_NAMES:
raise ValueError('Loop may only be specified for the mitochondria.')

def validate_coordinates(self, start, end):
Expand Down Expand Up @@ -87,7 +89,7 @@ def sorted_chromosome_length_tuples(cls, assembly):
ACCESSION_LENGTHS.keys()).index(name_to_accession[pair[0]]))

def filename(self):
return '{}.fa'.format(self.accession)
return '{}.fa'.format(self.accession)

def path(self):
data_dir = get_data_directory()
Expand Down Expand Up @@ -127,7 +129,7 @@ def sequence(self, start, end):
# The rCRS mito contig contains an 'N' base at position 3107 to preserve legacy
# nucleotide numbering. We remove it because it is not part of the observed
# sequence. See http://www.mitomap.org/MITOMAP/HumanMitoSeq
if self.accession == RCRS_ACCESSION:
if self.accession == RCRS_ACCESSION and self.RCRS_N_remove is True:
sequence = sequence.replace('N', '')

return sequence
3 changes: 3 additions & 0 deletions seqseek/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,9 @@
'NT_167251.1': 1680828,
}

MITOCHONDRIA_NAMES = {'MT', 'RSRS', BUILD37_ACCESSIONS['MT'], BUILD37_ACCESSIONS['RSRS'],
BUILD38_ACCESSIONS['MT'], BUILD38_ACCESSIONS['RSRS']}


def get_data_directory():
default = os.path.expanduser(DEFAULT_DATA_DIR)
Expand Down
6 changes: 6 additions & 0 deletions seqseek/tests/test_functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,12 @@ def test_chrMT_sequence(self):
seq = Chromosome('MT').sequence(5, 10)
self.assertEqual(seq, expected_seq)

def test_rCRS_sequence_retain_N(self):
expected_seq = 'GATCACAGGTCTNTCACCCT'
seq = Chromosome('MT', RCRS_N_remove=False).sequence(0, 20)
self.assertEqual(seq, expected_seq)
self.assertTrue('N' in seq) # the N base was *not* removed

def test_mito_loop_end(self):
expected_seq = 'CTTCACCCTGATCACAGGT'

Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@

setup(
name='seqseek',
version='0.3.3',
version='0.4.1',
url='https://github.com/23andMe/seqseek',
download_url = 'https://github.com/23andMe/seqseek/tarball/0.3.3',
download_url = 'https://github.com/23andMe/seqseek/tarball/0.4.1',
author='23andMe Engineering',
author_email=['[email protected]'],
description='Easy access to human reference genome sequences',
Expand Down

0 comments on commit 773659e

Please sign in to comment.