diff --git a/README.md b/README.md index ae47319..ef676bf 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,8 @@ backward-compatibility then you may load it directly by accession (`NC_001807.4` The rCRS mitochondria sequence contains an 'N' base at position 3106-3107 to preserve legacy nucleotide numbering. This can be useful for using legacy coordinates but but is impractical when working with sequences that are -expected to align to observed human mitochondrial sequences. SeqSeek removes this `N`: +expected to align to observed human mitochondrial sequences. SeqSeek +removes this `N` unless it is explicitly requested by passing `RCRS_N_remove=False`. ```python Chromosome('MT').sequence(3106, 3107) # => '' diff --git a/seqseek/chromosome.py b/seqseek/chromosome.py index 007c6fa..34607f9 100644 --- a/seqseek/chromosome.py +++ b/seqseek/chromosome.py @@ -2,7 +2,8 @@ from .exceptions import TooManyLoops, MissingDataError from .lib import (BUILD37, BUILD38, get_data_directory, sorted_nicely, - BUILD37_ACCESSIONS, BUILD38_ACCESSIONS, ACCESSION_LENGTHS, RCRS_ACCESSION) + BUILD37_ACCESSIONS, BUILD38_ACCESSIONS, ACCESSION_LENGTHS, + RCRS_ACCESSION, MITOCHONDRIA_NAMES) class Chromosome(object): @@ -12,7 +13,7 @@ class Chromosome(object): BUILD38: BUILD38_ACCESSIONS } - def __init__(self, chromosome_name, assembly=BUILD37, loop=False): + def __init__(self, chromosome_name, assembly=BUILD37, loop=False, RCRS_N_remove=True): """ Usage: @@ -28,6 +29,7 @@ def __init__(self, chromosome_name, assembly=BUILD37, loop=False): self.name = str(chromosome_name) self.assembly = assembly self.loop = loop + self.RCRS_N_remove = RCRS_N_remove self.validate_assembly() self.validate_name() @@ -56,7 +58,7 @@ def validate_name(self): name=self.name)) def validate_loop(self): - if self.loop and self.name != 'MT': + if self.loop and self.name not in MITOCHONDRIA_NAMES: raise ValueError('Loop may only be specified for the mitochondria.') def validate_coordinates(self, start, end): @@ -87,7 +89,7 @@ def sorted_chromosome_length_tuples(cls, assembly): ACCESSION_LENGTHS.keys()).index(name_to_accession[pair[0]])) def filename(self): - return '{}.fa'.format(self.accession) + return '{}.fa'.format(self.accession) def path(self): data_dir = get_data_directory() @@ -127,7 +129,7 @@ def sequence(self, start, end): # The rCRS mito contig contains an 'N' base at position 3107 to preserve legacy # nucleotide numbering. We remove it because it is not part of the observed # sequence. See http://www.mitomap.org/MITOMAP/HumanMitoSeq - if self.accession == RCRS_ACCESSION: + if self.accession == RCRS_ACCESSION and self.RCRS_N_remove is True: sequence = sequence.replace('N', '') return sequence diff --git a/seqseek/lib.py b/seqseek/lib.py index 03ece83..93a70cd 100644 --- a/seqseek/lib.py +++ b/seqseek/lib.py @@ -152,6 +152,9 @@ 'NT_167251.1': 1680828, } +MITOCHONDRIA_NAMES = {'MT', 'RSRS', BUILD37_ACCESSIONS['MT'], BUILD37_ACCESSIONS['RSRS'], + BUILD38_ACCESSIONS['MT'], BUILD38_ACCESSIONS['RSRS']} + def get_data_directory(): default = os.path.expanduser(DEFAULT_DATA_DIR) diff --git a/seqseek/tests/test_functional.py b/seqseek/tests/test_functional.py index 67d5a75..d68858b 100644 --- a/seqseek/tests/test_functional.py +++ b/seqseek/tests/test_functional.py @@ -71,6 +71,12 @@ def test_chrMT_sequence(self): seq = Chromosome('MT').sequence(5, 10) self.assertEqual(seq, expected_seq) + def test_rCRS_sequence_retain_N(self): + expected_seq = 'GATCACAGGTCTNTCACCCT' + seq = Chromosome('MT', RCRS_N_remove=False).sequence(0, 20) + self.assertEqual(seq, expected_seq) + self.assertTrue('N' in seq) # the N base was *not* removed + def test_mito_loop_end(self): expected_seq = 'CTTCACCCTGATCACAGGT' diff --git a/setup.py b/setup.py index c642b9c..eb9fbc9 100755 --- a/setup.py +++ b/setup.py @@ -15,9 +15,9 @@ setup( name='seqseek', - version='0.3.3', + version='0.4.1', url='https://github.com/23andMe/seqseek', - download_url = 'https://github.com/23andMe/seqseek/tarball/0.3.3', + download_url = 'https://github.com/23andMe/seqseek/tarball/0.4.1', author='23andMe Engineering', author_email=['mstrand@23andme.com'], description='Easy access to human reference genome sequences',