Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

## __NEXT__

* `augur refine` will now warn when building a time tree if none, or fewer than half, of the sequence IDs in the tree match metadata IDs, and will suggest using `--metadata-id-columns` to explicitly set the correct ID column. [#XXXX][] (@corneliusroemer)
* `augur curate apply-record-annotations` will now warn if an annotation was unnecessary, often indicative of the upstream data being updated. [#1893][] (@jameshadfield)

[#XXXX]: https://github.com/nextstrain/augur/pull/XXXX
[#1893]: https://github.com/nextstrain/augur/pull/1893

## 31.5.0 (17 September 2025)
Expand Down
19 changes: 19 additions & 0 deletions augur/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,25 @@ def run(args):
except InvalidYearBounds as error:
raise AugurError(f"Invalid value for --year-bounds: {error}")

# Check for mismatches between sequence IDs and metadata IDs.
#
# Tip names from the tree are intersected with the keys of `dates`. A warning
# is written to stderr (execution continues either way) when:
#   * no tip name is a key of `dates`, or
#   * fewer than half of the tip names are keys of `dates`.
# Both warnings share the same closing advice, so the message is assembled
# from a branch-specific part plus a common tail instead of being duplicated.
terminal_names = {n.name for n in T.get_terminals()}
matched_ids = terminal_names.intersection(dates.keys())

if not matched_ids:
    mismatch_headline = "\nWARNING: No matches found between sequence IDs in the tree and metadata IDs!"
    mismatch_details = [
        f" - Tree has {len(terminal_names)} sequences",
        f" - Metadata has {len(dates)} entries",
    ]
    mismatch_consequence = "\nThis will prevent time tree inference from working correctly."
elif len(matched_ids) < len(terminal_names) * 0.5:  # under 50% of tips matched
    mismatch_headline = f"\nWARNING: Only {len(matched_ids)}/{len(terminal_names)} sequence IDs match metadata IDs!"
    mismatch_details = []
    mismatch_consequence = "\nThis may prevent time tree inference from working correctly."
else:
    # At least half of the tips matched: stay silent.
    mismatch_headline = None

if mismatch_headline is not None:
    mismatch_lines = [
        mismatch_headline,
        *mismatch_details,
        f" - Metadata is using '{metadata_object.id_column}' as the ID column",
        mismatch_consequence,
        "You may need to explicitly set the metadata ID column using --metadata-id-columns.",
        f"By default, the columns {DEFAULT_ID_COLUMNS} are tried in order.\n",
    ]
    # One print per line, matching the original line-by-line stderr output.
    for mismatch_line in mismatch_lines:
        print(mismatch_line, file=sys.stderr)

# save input state string for later export
for n in T.get_terminals():
if n.name in metadata.index and METADATA_DATE_COLUMN in metadata.columns:
Expand Down
Loading