Skip to content

Commit 297c5e6

Browse files
committed
🚧 Error when translating sequence lengths indivisible by 3
… instead of silently padding with N so that the trailing partial codon translates to 'X'. A sequence with 1 or 2 extra bases indicates a problem with the data, and it doesn't make sense to end the translation with 'X', which implies a complete codon of 3 bases. 🚧: see FIXME in code
1 parent ea8c7e4 commit 297c5e6

File tree

2 files changed

+25
-8
lines changed

2 files changed

+25
-8
lines changed

‎augur/translate.py‎

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,25 +51,26 @@ def safe_translate(sequence):
5151
>>> safe_translate("")
5252
''
5353
>>> safe_translate("ATGT")
54-
'MX'
54+
Traceback (most recent call last):
55+
...
56+
ValueError: Sequence not divisible by 3.
5557
"""
5658
from Bio.Data.CodonTable import TranslationError
5759
from Bio.Seq import CodonTable
5860

59-
#sequences not mod 3 give messy BiopythonWarning, so avoid by padding.
6061
if len(sequence)%3:
61-
sequence_padded = sequence + "N"*(3-len(sequence)%3)
62-
else:
63-
sequence_padded = sequence
62+
raise ValueError("Sequence not divisible by 3.")
63+
# FIXME: handle this elsewhere since a proper error message needs more context - which sequence from which file?
64+
6465
try:
6566
# Attempt translation by extracting the sequence according to the
6667
# Biopython SeqFeature; in-frame gaps of three will translate as '-'
67-
translated_sequence = str(Seq.Seq(sequence_padded).translate(gap='-'))
68+
translated_sequence = str(Seq.Seq(sequence).translate(gap='-'))
6869
except TranslationError:
6970
# Any other codon like '-AA' or 'NNT' etc will fail. Translate codons
7071
# one by one.
7172
codon_table = CodonTable.ambiguous_dna_by_name['Standard'].forward_table
72-
str_seq = str(sequence_padded)
73+
str_seq = str(sequence)
7374
codons = np.frombuffer(str_seq[:len(str_seq) - len(str_seq) % 3].encode(), dtype='S3').astype("U")
7475
assert len(codons) > 0
7576
aas = []

‎tests/test_translate.py‎

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pathlib import Path
55
import sys
66

7+
import pytest
78
from Bio.Seq import Seq
89
from Bio.SeqFeature import SeqFeature, FeatureLocation
910

@@ -23,14 +24,29 @@ def test_safe_translate(self):
2324
(('ATG---',), 'M-'),
2425
(('ATGTAG',), 'M*'),
2526
(('',), ''),
26-
(('ATGT',), 'MX'),
2727
(('ATGA-G',), 'MX')]
2828

2929
# input each pair into the function and check
3030
for pair in params_and_outs:
3131
params, out = pair
3232
assert translate.safe_translate(*params) == out
3333

34+
def test_safe_translate_errors(self):
35+
'''
36+
Test that safe_translate raises ValueError when the sequence length is not divisible by 3
37+
'''
38+
invalid_sequences = [
39+
'A',
40+
'AT',
41+
'ATGT',
42+
'ATGTA',
43+
'ATGTAGA',
44+
]
45+
46+
for seq in invalid_sequences:
47+
with pytest.raises(ValueError, match="Sequence not divisible by 3"):
48+
translate.safe_translate(seq)
49+
3450
def test_translate_feature(self):
3551
'''
3652
Test translate_feature from a dictionary of given nucleotides to dictionary of translated amino acids

0 commit comments

Comments
 (0)