Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,16 @@

## __NEXT__

### Major Changes

* ancestral, translate: These will now error when the length of any reference gene is indivisible by 3, instead of silently padding with N to translate to 'X'. [#1895][] (@victorlin)

### Features

* `augur curate apply-record-annotations` will now warn if an annotation was unnecessary, often indicative of the upstream data being updated. [#1893][] (@jameshadfield)

[#1893]: https://github.com/nextstrain/augur/pull/1893
[#1895]: https://github.com/nextstrain/augur/issues/1895

## 31.5.0 (17 September 2025)

Expand Down
42 changes: 39 additions & 3 deletions augur/io/sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,16 +212,52 @@ def load_features(reference, feature_names=None):
Raises
------
AugurError
If the reference file doesn't exist, or is malformed / empty
If the reference file doesn't exist, is malformed / empty, or has errors
"""
#checks explicitly for GFF otherwise assumes Genbank
if not os.path.isfile(reference):
raise AugurError(f"reference sequence file {reference!r} not found")

if '.gff' in reference.lower():
return _read_gff(reference, feature_names)
features = _read_gff(reference, feature_names)
else:
return _read_genbank(reference, feature_names)
features = _read_genbank(reference, feature_names)

if errors := find_feature_errors(features):
raise AugurError(dedent(f"""\
Reference file {reference!r} has errors:
{indented_list(errors, " ")}"""))

return features


def find_feature_errors(features):
"""
Find and return errors for features parsed from a GFF/GenBank reference
file.

Parameters
----------
features : dict
keys: feature names, values: :py:class:`Bio.SeqFeature.SeqFeature`

Returns
-------
list of str
Error messages
"""
errors = []

for feature_name, feat in features.items():
if feature_name == 'nuc':
continue

# Error if feature length is not a multiple of 3.
length = len(feat.location)
if length % 3:
errors.append(f"{feature_name!r} has length {length} which is not a multiple of 3.")

return errors


def _read_nuc_annotation_from_gff(record, reference):
Expand Down
9 changes: 2 additions & 7 deletions augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def translate_feature(aln, feature):
return translations


def translate_vcf_feature(sequences, ref, feature, feature_name):
def translate_vcf_feature(sequences, ref, feature):
'''Translates a subsequence of input nucleotide sequences.

Parameters
Expand Down Expand Up @@ -156,11 +156,6 @@ def str_reverse_comp(str_seq):
prot['positions'] = []

refNuc = str(feature.extract( SeqRecord(seq=Seq(ref)) ).seq)
# Need to get ref translation to store. check if multiple of 3 for sanity.
# will be padded in safe_translate if not
if len(refNuc)%3:
print_err(f"Gene length of {feature_name!r} is not a multiple of 3. will pad with N")

ref_aa_seq = safe_translate(refNuc)
prot['reference'] = ref_aa_seq

Expand Down Expand Up @@ -425,7 +420,7 @@ def run(args):
if fname=='nuc':
continue
try:
translations[fname] = translate_vcf_feature(sequences, ref, feat, fname)
translations[fname] = translate_vcf_feature(sequences, ref, feat)
reference_translations[fname] = translations[fname]['reference']
except NoVariationError:
features_without_variation.append(fname)
Expand Down
49 changes: 49 additions & 0 deletions tests/functional/translate/cram/basic-error-checking.t
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,52 @@ Missing reference file
> --output-node-data "aa_muts.json" > /dev/null
ERROR: reference sequence file '.+/reference.doesnt-exist.gff' not found (re)
[2]

Gene length not divisible by 3

$ cat >invalid.gff <<~~
> ##gff-version 3
> ##sequence-region reference_name 1 50
> reference_name RefSeq region 1 50 . + . ID=reference_name
> reference_name RefSeq gene 10 23 . + . Name=gene1;gene=gene1
> reference_name RefSeq gene 36 48 . - . Name=gene2;gene=gene2
> ~~

$ ${AUGUR} translate \
> --tree $ANC_DATA/tree.nwk \
> --ancestral-sequences $ANC_DATA/nt_muts.ref-seq.json \
> --reference-sequence invalid.gff \
> --output-node-data "aa_muts.json"
ERROR: Reference file 'invalid.gff' has errors:
'gene1' has length 14 which is not a multiple of 3.
'gene2' has length 13 which is not a multiple of 3.
[2]

Gene with compound location not divisible by 3

$ cat >invalid.gb <<~~
> LOCUS Test 100 bp DNA linear VRL 01-JAN-2025
> DEFINITION Test genome for compound location validation
> FEATURES Location/Qualifiers
> source 1..100
> CDS join(10..20,35..49)
> /gene="test_gene"
> ORIGIN
> 1 aaaaaaaaaa tgccctgcgg gtaaaaaaaa aaaaactact tgaccataaa
> 51 aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa aaaaaaaaaa
> //
> ~~

$ ${AUGUR} translate \
> --tree $ANC_DATA/tree.nwk \
> --ancestral-sequences $ANC_DATA/nt_muts.ref-seq.json \
> --reference-sequence invalid.gb \
> --output-node-data "aa_muts.json"
.* BiopythonParserWarning: Attempting to parse malformed locus line: (re)
.* (re)
.* (re)
.* (re)
.* (re)
ERROR: Reference file 'invalid.gb' has errors:
'test_gene' has length 26 which is not a multiple of 3.
[2]
Loading