Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions augur/filter/subsample.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,9 +303,9 @@ def get_probabilistic_group_sizes(groups, target_group_size, random_seed=None):
return max_sizes_per_group


TARGET_SIZE_COLUMN = '_augur_filter_target_size'
INPUT_SIZE_COLUMN = '_augur_filter_input_size'
OUTPUT_SIZE_COLUMN = '_augur_filter_subsampling_output_size'
TARGET_SIZE_COLUMN = 'augur_filter_target_size'
INPUT_SIZE_COLUMN = 'augur_filter_input_size'
OUTPUT_SIZE_COLUMN = 'augur_filter_subsampling_output_size'


def get_weighted_group_sizes(
Expand Down Expand Up @@ -349,11 +349,12 @@ def get_weighted_group_sizes(
weights[OUTPUT_SIZE_COLUMN] = weights[[INPUT_SIZE_COLUMN, TARGET_SIZE_COLUMN]].min(axis=1)

# Warn on any under-sampled groups
for _, row in weights.iterrows():
for row in weights.itertuples():
row = row._asdict()
if row[INPUT_SIZE_COLUMN] < row[TARGET_SIZE_COLUMN]:
sequences = _n('sequence', 'sequences', int(row[TARGET_SIZE_COLUMN]))
are = _n('is', 'are', int(row[INPUT_SIZE_COLUMN]))
group = list(f'{col}={value!r}' for col, value in row[group_by].items())
group = list(f'{col}={row[col]!r}' for col in group_by)
print_err(f"WARNING: Targeted {row[TARGET_SIZE_COLUMN]} {sequences} for group {group} but only {row[INPUT_SIZE_COLUMN]} {are} available.")

if output_sizes_file:
Expand Down
38 changes: 19 additions & 19 deletions tests/functional/filter/cram/subsample-weighted-and-uniform-mix.t
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ Weight locations A:B as 2:1. This is reflected in target_group_sizes.tsv below.
> --output-metadata filtered.tsv 2>/dev/null

$ cat target_group_sizes.tsv | tsv-pretty
location weight _augur_filter_target_size _augur_filter_input_size _augur_filter_subsampling_output_size
A 2 67 100 67
B 1 33 150 33
location weight augur_filter_target_size augur_filter_input_size augur_filter_subsampling_output_size
A 2 67 100 67
B 1 33 150 33

There are also enough rows per group that the output metadata directly reflects
the target group sizes.
Expand All @@ -60,9 +60,9 @@ Using 1:1 weights is similarly straightforward, with 50 sequences from each loca
> --output-strains strains.txt 2>/dev/null

$ cat target_group_sizes.tsv | tsv-pretty
location weight _augur_filter_target_size _augur_filter_input_size _augur_filter_subsampling_output_size
A 1 50 100 50
B 1 50 150 50
location weight augur_filter_target_size augur_filter_input_size augur_filter_subsampling_output_size
A 1 50 100 50
B 1 50 150 50

Keep the 1:1 location weighting, but add uniform sampling on year.
The uniform sampling happens "within" each weighted column value, so the 1:1
Expand All @@ -79,12 +79,12 @@ available per location.
> --output-strains strains.txt 2>/dev/null

$ cat target_group_sizes.tsv | tsv-pretty
year location weight _augur_filter_target_size _augur_filter_input_size _augur_filter_subsampling_output_size
2000 A 0.5 25 50 25
2000 B 0.3333333333333333 16 50 16
2001 A 0.5 25 50 25
2001 B 0.3333333333333333 16 50 16
2002 B 0.3333333333333333 17 50 17
year location weight augur_filter_target_size augur_filter_input_size augur_filter_subsampling_output_size
2000 A 0.5 25 50 25
2000 B 0.3333333333333333 16 50 16
2001 A 0.5 25 50 25
2001 B 0.3333333333333333 16 50 16
2002 B 0.3333333333333333 17 50 17

If a single sequence is added for group (2002,A), the weighting now appears
"equal" among all years and locations.
Expand All @@ -110,13 +110,13 @@ requested 17, so the total number of sequences outputted is lower than requested
83 strains passed all filters

$ cat target_group_sizes.tsv | tsv-pretty
year location weight _augur_filter_target_size _augur_filter_input_size _augur_filter_subsampling_output_size
2000 A 0.3333333333333333 17 50 17
2000 B 0.3333333333333333 16 50 16
2001 A 0.3333333333333333 16 50 16
2001 B 0.3333333333333333 16 50 16
2002 A 0.3333333333333333 17 1 1
2002 B 0.3333333333333333 17 50 17
year location weight augur_filter_target_size augur_filter_input_size augur_filter_subsampling_output_size
2000 A 0.3333333333333333 17 50 17
2000 B 0.3333333333333333 16 50 16
2001 A 0.3333333333333333 16 50 16
2001 B 0.3333333333333333 16 50 16
2002 A 0.3333333333333333 17 1 1
2002 B 0.3333333333333333 17 50 17

$ wc -l strains.txt
\s*83 .* (re)
Loading