Skip to content

Commit 773659e

Browse files
authored
Merge pull request #11 from slowkoni/feature/mt-default-loop-true
MT updates: 'N' removal now optional; loop allowed on rCRS/RSRS & synonyms.
2 parents b033bfc + d92eabd commit 773659e

File tree

5 files changed

+20
-8
lines changed

5 files changed

+20
-8
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,8 @@ backward-compatibility then you may load it directly by accession (`NC_001807.4`
9494
The rCRS mitochondria sequence contains an 'N' base at position 3106-3107 to
9595
preserve legacy nucleotide numbering. This can be useful for using legacy
9696
coordinates but is impractical when working with sequences that are
97-
expected to align to observed human mitochondrial sequences. SeqSeek removes this `N`:
97+
expected to align to observed human mitochondrial sequences. SeqSeek
98+
removes this `N` unless it is explicitly requested by passing `RCRS_N_remove=False`.
9899

99100
```python
100101
Chromosome('MT').sequence(3106, 3107) # => ''

seqseek/chromosome.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22

33
from .exceptions import TooManyLoops, MissingDataError
44
from .lib import (BUILD37, BUILD38, get_data_directory, sorted_nicely,
5-
BUILD37_ACCESSIONS, BUILD38_ACCESSIONS, ACCESSION_LENGTHS, RCRS_ACCESSION)
5+
BUILD37_ACCESSIONS, BUILD38_ACCESSIONS, ACCESSION_LENGTHS,
6+
RCRS_ACCESSION, MITOCHONDRIA_NAMES)
67

78

89
class Chromosome(object):
@@ -12,7 +13,7 @@ class Chromosome(object):
1213
BUILD38: BUILD38_ACCESSIONS
1314
}
1415

15-
def __init__(self, chromosome_name, assembly=BUILD37, loop=False):
16+
def __init__(self, chromosome_name, assembly=BUILD37, loop=False, RCRS_N_remove=True):
1617
"""
1718
Usage:
1819
@@ -28,6 +29,7 @@ def __init__(self, chromosome_name, assembly=BUILD37, loop=False):
2829
self.name = str(chromosome_name)
2930
self.assembly = assembly
3031
self.loop = loop
32+
self.RCRS_N_remove = RCRS_N_remove
3133

3234
self.validate_assembly()
3335
self.validate_name()
@@ -56,7 +58,7 @@ def validate_name(self):
5658
name=self.name))
5759

5860
def validate_loop(self):
59-
if self.loop and self.name != 'MT':
61+
if self.loop and self.name not in MITOCHONDRIA_NAMES:
6062
raise ValueError('Loop may only be specified for the mitochondria.')
6163

6264
def validate_coordinates(self, start, end):
@@ -87,7 +89,7 @@ def sorted_chromosome_length_tuples(cls, assembly):
8789
ACCESSION_LENGTHS.keys()).index(name_to_accession[pair[0]]))
8890

8991
def filename(self):
90-
return '{}.fa'.format(self.accession)
92+
return '{}.fa'.format(self.accession)
9193

9294
def path(self):
9395
data_dir = get_data_directory()
@@ -127,7 +129,7 @@ def sequence(self, start, end):
127129
# The rCRS mito contig contains an 'N' base at position 3107 to preserve legacy
128130
# nucleotide numbering. We remove it by default because it is not part of the observed
129131
# sequence. See http://www.mitomap.org/MITOMAP/HumanMitoSeq
130-
if self.accession == RCRS_ACCESSION:
132+
if self.accession == RCRS_ACCESSION and self.RCRS_N_remove is True:
131133
sequence = sequence.replace('N', '')
132134

133135
return sequence

seqseek/lib.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,9 @@
152152
'NT_167251.1': 1680828,
153153
}
154154

155+
MITOCHONDRIA_NAMES = {'MT', 'RSRS', BUILD37_ACCESSIONS['MT'], BUILD37_ACCESSIONS['RSRS'],
156+
BUILD38_ACCESSIONS['MT'], BUILD38_ACCESSIONS['RSRS']}
157+
155158

156159
def get_data_directory():
157160
default = os.path.expanduser(DEFAULT_DATA_DIR)

seqseek/tests/test_functional.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,12 @@ def test_chrMT_sequence(self):
7171
seq = Chromosome('MT').sequence(5, 10)
7272
self.assertEqual(seq, expected_seq)
7373

74+
def test_rCRS_sequence_retain_N(self):
75+
expected_seq = 'GATCACAGGTCTNTCACCCT'
76+
seq = Chromosome('MT', RCRS_N_remove=False).sequence(0, 20)
77+
self.assertEqual(seq, expected_seq)
78+
self.assertTrue('N' in seq) # the N base was *not* removed
79+
7480
def test_mito_loop_end(self):
7581
expected_seq = 'CTTCACCCTGATCACAGGT'
7682

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@
1515

1616
setup(
1717
name='seqseek',
18-
version='0.3.3',
18+
version='0.4.1',
1919
url='https://github.com/23andMe/seqseek',
20-
download_url = 'https://github.com/23andMe/seqseek/tarball/0.3.3',
20+
download_url = 'https://github.com/23andMe/seqseek/tarball/0.4.1',
2121
author='23andMe Engineering',
2222
author_email=['[email protected]'],
2323
description='Easy access to human reference genome sequences',

0 commit comments

Comments
 (0)