Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion augur/dates/ambiguous_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def resolve_uncertain_int(uncertain_string, min_or_max):
elif min_or_max == "max":
result = int(uncertain_string.replace("X", "9"))
else:
raise "Tried to resolve an uncertain integer to something other than `min` or `max`."
raise ValueError("Tried to resolve an uncertain integer to something other than `min` or `max`.")

if result == 0:
# A date component cannot be 0. Well, year can, but...
Expand Down
11 changes: 6 additions & 5 deletions augur/filter/subsample.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,9 +303,9 @@ def get_probabilistic_group_sizes(groups, target_group_size, random_seed=None):
return max_sizes_per_group


TARGET_SIZE_COLUMN = '_augur_filter_target_size'
INPUT_SIZE_COLUMN = '_augur_filter_input_size'
OUTPUT_SIZE_COLUMN = '_augur_filter_subsampling_output_size'
TARGET_SIZE_COLUMN = 'augur_filter_target_size'
INPUT_SIZE_COLUMN = 'augur_filter_input_size'
OUTPUT_SIZE_COLUMN = 'augur_filter_subsampling_output_size'


def get_weighted_group_sizes(
Expand Down Expand Up @@ -349,11 +349,12 @@ def get_weighted_group_sizes(
weights[OUTPUT_SIZE_COLUMN] = weights[[INPUT_SIZE_COLUMN, TARGET_SIZE_COLUMN]].min(axis=1)

# Warn on any under-sampled groups
for _, row in weights.iterrows():
for row in weights.itertuples():
row = row._asdict()
if row[INPUT_SIZE_COLUMN] < row[TARGET_SIZE_COLUMN]:
sequences = _n('sequence', 'sequences', int(row[TARGET_SIZE_COLUMN]))
are = _n('is', 'are', int(row[INPUT_SIZE_COLUMN]))
group = list(f'{col}={value!r}' for col, value in row[group_by].items())
group = list(f'{col}={row[col]!r}' for col in group_by)
print_err(f"WARNING: Targeted {row[TARGET_SIZE_COLUMN]} {sequences} for group {group} but only {row[INPUT_SIZE_COLUMN]} {are} available.")

if output_sizes_file:
Expand Down
4 changes: 2 additions & 2 deletions augur/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from collections import OrderedDict
from io import RawIOBase
from shlex import quote as shquote
from typing import List, Union
from typing import List, Optional, Union
from .__version__ import __version__

from augur.data import as_file
Expand Down Expand Up @@ -459,7 +459,7 @@ def read_bed_file(bed_file):
Sorted list of unique zero-indexed sites
"""
in_header = True
initial_chrom_value: str | None = None
initial_chrom_value: Optional[str] = None
mask_sites: list[int] = []

bed_file_size = os.path.getsize(bed_file)
Expand Down
35 changes: 27 additions & 8 deletions augur/validate_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,23 @@

import sys
from collections import defaultdict
from typing import Any, DefaultDict, Set, Tuple, TypedDict


# Statistics recorded for a single node attribute: an occurrence count,
# the set of observed values, and an ``onAllNodes`` flag.
AttrInfo = TypedDict(
    "AttrInfo",
    {"count": int, "values": Set[Any], "onAllNodes": bool},
)


TreeAttrs = DefaultDict[str, AttrInfo]
"""
Mapping from attribute name to object with values and statistics.
"""

def _new_attr_info() -> AttrInfo:
    """
    Return a fresh, empty :py:class:`AttrInfo` (zero count, no values,
    ``onAllNodes`` False). Used as the ``default_factory`` for the
    ``defaultdict`` built while collecting tree attributes.
    """
    fresh: AttrInfo = {
        "count": 0,
        "values": set(),
        "onAllNodes": False,
    }
    return fresh


def ensure_no_duplicate_names(root, ValidateError):
"""
Expand All @@ -21,19 +38,21 @@ def recurse(node):
recurse(root)


def collectTreeAttrsV2(root, warn):
def collectTreeAttrsV2(root, warn) -> Tuple[TreeAttrs, int]:
"""
Collect all keys specified on `node["node_attrs"]` throughout the tree
and the values associated with them. Note that this will only look at
attributes which are themselves objects with a `value` property.
I.e. a node attribute `node["node_attrs"]["div"] -> numeric` will not
be collected.
Returns a tuple.
return[0]: dict of `node_attr_property` -> x, where x is a dict with
keys `count` -> INT, `values` -> SET, `onAllNodes` -> BOOL.
return[1]: INT of number of terminal nodes in tree

Returns
-------
:py:class:`TreeAttrs`
    Mapping from each collected attribute name to its statistics
    (occurrence count, set of observed values, ``onAllNodes`` flag)
:py:class:`int`
    Number of terminal nodes in tree
"""
seen = defaultdict(lambda: {"count": 0, "values": set(), "onAllNodes": False})
seen = defaultdict(_new_attr_info)
num_nodes, num_terminal = (0, 0)
def recurse(node):
nonlocal num_nodes, num_terminal
Expand Down Expand Up @@ -177,12 +196,12 @@ def warn(msg):
return not warnings


def collectTreeAttrsV1(root):
def collectTreeAttrsV1(root) -> Tuple[TreeAttrs, int]:
"""
Collect all keys specified on node->attr (or node->traits) throughout the tree
If the values of these keys are strings, then also collect the values
"""
seen = defaultdict(lambda: {"count": 0, "values": set(), "onAllNodes": False})
seen = defaultdict(_new_attr_info)
num_nodes, num_terminal = (0, 0)
def recurse(node):
nonlocal num_nodes, num_terminal
Expand Down
1 change: 0 additions & 1 deletion pyrightconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
"reportArgumentType": false,
"reportAttributeAccessIssue": false,
"reportCallIssue": false,
"reportGeneralTypeIssues": false,
"reportIncompatibleMethodOverride": false,
"reportMissingImports": false,
"reportMissingModuleSource": false,
Expand Down
38 changes: 19 additions & 19 deletions tests/functional/filter/cram/subsample-weighted-and-uniform-mix.t
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ Weight locations A:B as 2:1. This is reflected in target_group_sizes.tsv below.
> --output-metadata filtered.tsv 2>/dev/null

$ cat target_group_sizes.tsv | tsv-pretty
location weight _augur_filter_target_size _augur_filter_input_size _augur_filter_subsampling_output_size
A 2 67 100 67
B 1 33 150 33
location weight augur_filter_target_size augur_filter_input_size augur_filter_subsampling_output_size
A 2 67 100 67
B 1 33 150 33

There are also enough rows per group that the output metadata directly reflects
the target group sizes.
Expand All @@ -60,9 +60,9 @@ Using 1:1 weights is similarly straightforward, with 50 sequences from each loca
> --output-strains strains.txt 2>/dev/null

$ cat target_group_sizes.tsv | tsv-pretty
location weight _augur_filter_target_size _augur_filter_input_size _augur_filter_subsampling_output_size
A 1 50 100 50
B 1 50 150 50
location weight augur_filter_target_size augur_filter_input_size augur_filter_subsampling_output_size
A 1 50 100 50
B 1 50 150 50

Keep the 1:1 location weighting, but add uniform sampling on year.
The uniform sampling happens "within" each weighted column value, so the 1:1
Expand All @@ -79,12 +79,12 @@ available per location.
> --output-strains strains.txt 2>/dev/null

$ cat target_group_sizes.tsv | tsv-pretty
year location weight _augur_filter_target_size _augur_filter_input_size _augur_filter_subsampling_output_size
2000 A 0.5 25 50 25
2000 B 0.3333333333333333 16 50 16
2001 A 0.5 25 50 25
2001 B 0.3333333333333333 16 50 16
2002 B 0.3333333333333333 17 50 17
year location weight augur_filter_target_size augur_filter_input_size augur_filter_subsampling_output_size
2000 A 0.5 25 50 25
2000 B 0.3333333333333333 16 50 16
2001 A 0.5 25 50 25
2001 B 0.3333333333333333 16 50 16
2002 B 0.3333333333333333 17 50 17

If a single sequence is added for group (2002,A), the weighting now appears
"equal" among all years and locations.
Expand All @@ -110,13 +110,13 @@ requested 17, so the total number of sequences outputted is lower than requested
83 strains passed all filters

$ cat target_group_sizes.tsv | tsv-pretty
year location weight _augur_filter_target_size _augur_filter_input_size _augur_filter_subsampling_output_size
2000 A 0.3333333333333333 17 50 17
2000 B 0.3333333333333333 16 50 16
2001 A 0.3333333333333333 16 50 16
2001 B 0.3333333333333333 16 50 16
2002 A 0.3333333333333333 17 1 1
2002 B 0.3333333333333333 17 50 17
year location weight augur_filter_target_size augur_filter_input_size augur_filter_subsampling_output_size
2000 A 0.3333333333333333 17 50 17
2000 B 0.3333333333333333 16 50 16
2001 A 0.3333333333333333 16 50 16
2001 B 0.3333333333333333 16 50 16
2002 A 0.3333333333333333 17 1 1
2002 B 0.3333333333333333 17 50 17

$ wc -l strains.txt
\s*83 .* (re)
Loading