Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion augur/dates/ambiguous_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def resolve_uncertain_int(uncertain_string, min_or_max):
elif min_or_max == "max":
result = int(uncertain_string.replace("X", "9"))
else:
raise "Tried to resolve an uncertain integer to something other than `min` or `max`."
raise ValueError("Tried to resolve an uncertain integer to something other than `min` or `max`.")

if result == 0:
# A date component cannot be 0. Well, year can, but...
Expand Down
11 changes: 6 additions & 5 deletions augur/filter/subsample.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,9 +303,9 @@ def get_probabilistic_group_sizes(groups, target_group_size, random_seed=None):
return max_sizes_per_group


TARGET_SIZE_COLUMN = '_augur_filter_target_size'
INPUT_SIZE_COLUMN = '_augur_filter_input_size'
OUTPUT_SIZE_COLUMN = '_augur_filter_subsampling_output_size'
TARGET_SIZE_COLUMN = 'augur_filter_target_size'
INPUT_SIZE_COLUMN = 'augur_filter_input_size'
OUTPUT_SIZE_COLUMN = 'augur_filter_subsampling_output_size'


def get_weighted_group_sizes(
Expand Down Expand Up @@ -349,11 +349,12 @@ def get_weighted_group_sizes(
weights[OUTPUT_SIZE_COLUMN] = weights[[INPUT_SIZE_COLUMN, TARGET_SIZE_COLUMN]].min(axis=1)

# Warn on any under-sampled groups
for _, row in weights.iterrows():
for row in weights.itertuples():
row = row._asdict()
if row[INPUT_SIZE_COLUMN] < row[TARGET_SIZE_COLUMN]:
sequences = _n('sequence', 'sequences', int(row[TARGET_SIZE_COLUMN]))
are = _n('is', 'are', int(row[INPUT_SIZE_COLUMN]))
group = list(f'{col}={value!r}' for col, value in row[group_by].items())
group = list(f'{col}={row[col]!r}' for col in group_by)
print_err(f"WARNING: Targeted {row[TARGET_SIZE_COLUMN]} {sequences} for group {group} but only {row[INPUT_SIZE_COLUMN]} {are} available.")

if output_sizes_file:
Expand Down
4 changes: 2 additions & 2 deletions augur/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from collections import OrderedDict
from io import RawIOBase
from shlex import quote as shquote
from typing import List, Union
from typing import List, Optional, Union
from .__version__ import __version__

from augur.data import as_file
Expand Down Expand Up @@ -459,7 +459,7 @@ def read_bed_file(bed_file):
Sorted list of unique zero-indexed sites
"""
in_header = True
initial_chrom_value: str | None = None
initial_chrom_value: Optional[str] = None
mask_sites: list[int] = []

bed_file_size = os.path.getsize(bed_file)
Expand Down
35 changes: 27 additions & 8 deletions augur/validate_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,23 @@

import sys
from collections import defaultdict
from typing import Any, DefaultDict, Set, Tuple, TypedDict


# Statistics recorded for a single node attribute: an occurrence count,
# the set of observed values, and an ``onAllNodes`` flag.
AttrInfo = TypedDict(
    "AttrInfo",
    {"count": int, "values": Set[Any], "onAllNodes": bool},
)


TreeAttrs = DefaultDict[str, AttrInfo]
"""
Mapping from attribute name to object with values and statistics.
"""

def _new_attr_info() -> AttrInfo:
    """
    Return a fresh, empty :py:class:`AttrInfo` (zero count, no values,
    ``onAllNodes`` False). Used as the ``default_factory`` for the
    ``defaultdict`` built while collecting tree attributes.
    """
    fresh: AttrInfo = {
        "count": 0,
        "values": set(),
        "onAllNodes": False,
    }
    return fresh


def ensure_no_duplicate_names(root, ValidateError):
"""
Expand All @@ -21,19 +38,21 @@ def recurse(node):
recurse(root)


def collectTreeAttrsV2(root, warn):
def collectTreeAttrsV2(root, warn) -> Tuple[TreeAttrs, int]:
"""
Collect all keys specified on `node["node_attrs"]` throughout the tree
and the values associated with them. Note that this will only look at
attributes which are themselves objects with a `value` property.
I.e. a node attribute `node["node_attrs"]["div"] -> numeric` will not
be collected.
Returns a tuple.
return[0]: dict of `node_attr_property` -> x, where x is a dict with
keys `count` -> INT, `values` -> SET, `onAllNodes` -> BOOL.
return[1]: INT of number of terminal nodes in tree

Returns
-------
:py:class:`TreeAttrs`
    Mapping from each collected attribute name to its statistics
    (occurrence count, set of observed values, ``onAllNodes`` flag)
:py:class:`int`
    Number of terminal nodes in tree
"""
seen = defaultdict(lambda: {"count": 0, "values": set(), "onAllNodes": False})
seen = defaultdict(_new_attr_info)
num_nodes, num_terminal = (0, 0)
def recurse(node):
nonlocal num_nodes, num_terminal
Expand Down Expand Up @@ -177,12 +196,12 @@ def warn(msg):
return not warnings


def collectTreeAttrsV1(root):
def collectTreeAttrsV1(root) -> Tuple[TreeAttrs, int]:
"""
Collect all keys specified on node->attr (or node->traits) throughout the tree
If the values of these keys are strings, then also collect the values
"""
seen = defaultdict(lambda: {"count": 0, "values": set(), "onAllNodes": False})
seen = defaultdict(_new_attr_info)
num_nodes, num_terminal = (0, 0)
def recurse(node):
nonlocal num_nodes, num_terminal
Expand Down
1 change: 0 additions & 1 deletion pyrightconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
"reportArgumentType": false,
"reportAttributeAccessIssue": false,
"reportCallIssue": false,
"reportGeneralTypeIssues": false,
"reportIncompatibleMethodOverride": false,
"reportMissingImports": false,
"reportMissingModuleSource": false,
Expand Down
38 changes: 19 additions & 19 deletions tests/functional/filter/cram/subsample-weighted-and-uniform-mix.t
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ Weight locations A:B as 2:1. This is reflected in target_group_sizes.tsv below.
> --output-metadata filtered.tsv 2>/dev/null

$ cat target_group_sizes.tsv | tsv-pretty
location weight _augur_filter_target_size _augur_filter_input_size _augur_filter_subsampling_output_size
A 2 67 100 67
B 1 33 150 33
location weight augur_filter_target_size augur_filter_input_size augur_filter_subsampling_output_size
A 2 67 100 67
B 1 33 150 33

There are also enough rows per group that the output metadata directly reflects
the target group sizes.
Expand All @@ -60,9 +60,9 @@ Using 1:1 weights is similarly straightforward, with 50 sequences from each loca
> --output-strains strains.txt 2>/dev/null

$ cat target_group_sizes.tsv | tsv-pretty
location weight _augur_filter_target_size _augur_filter_input_size _augur_filter_subsampling_output_size
A 1 50 100 50
B 1 50 150 50
location weight augur_filter_target_size augur_filter_input_size augur_filter_subsampling_output_size
A 1 50 100 50
B 1 50 150 50

Keep the 1:1 location weighting, but add uniform sampling on year.
The uniform sampling happens "within" each weighted column value, so the 1:1
Expand All @@ -79,12 +79,12 @@ available per location.
> --output-strains strains.txt 2>/dev/null

$ cat target_group_sizes.tsv | tsv-pretty
year location weight _augur_filter_target_size _augur_filter_input_size _augur_filter_subsampling_output_size
2000 A 0.5 25 50 25
2000 B 0.3333333333333333 16 50 16
2001 A 0.5 25 50 25
2001 B 0.3333333333333333 16 50 16
2002 B 0.3333333333333333 17 50 17
year location weight augur_filter_target_size augur_filter_input_size augur_filter_subsampling_output_size
2000 A 0.5 25 50 25
2000 B 0.3333333333333333 16 50 16
2001 A 0.5 25 50 25
2001 B 0.3333333333333333 16 50 16
2002 B 0.3333333333333333 17 50 17

If a single sequence is added for group (2002,A), the weighting now appears
"equal" among all years and locations.
Expand All @@ -110,13 +110,13 @@ requested 17, so the total number of sequences outputted is lower than requested
83 strains passed all filters

$ cat target_group_sizes.tsv | tsv-pretty
year location weight _augur_filter_target_size _augur_filter_input_size _augur_filter_subsampling_output_size
2000 A 0.3333333333333333 17 50 17
2000 B 0.3333333333333333 16 50 16
2001 A 0.3333333333333333 16 50 16
2001 B 0.3333333333333333 16 50 16
2002 A 0.3333333333333333 17 1 1
2002 B 0.3333333333333333 17 50 17
year location weight augur_filter_target_size augur_filter_input_size augur_filter_subsampling_output_size
2000 A 0.3333333333333333 17 50 17
2000 B 0.3333333333333333 16 50 16
2001 A 0.3333333333333333 16 50 16
2001 B 0.3333333333333333 16 50 16
2002 A 0.3333333333333333 17 1 1
2002 B 0.3333333333333333 17 50 17

$ wc -l strains.txt
\s*83 .* (re)
Loading