Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

filter: Improve help text #1430

Draft
wants to merge 7 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
### Bug Fixes

* filter: Updated docs with an example of tiered subsampling. [#1425][] (@victorlin)
* filter: Added clarity to help text shown when running `augur filter --help` and rendered on the docs page. [#1430][] (@victorlin)

[#1425]: https://github.com/nextstrain/augur/pull/1425
[#1429]: https://github.com/nextstrain/augur/pull/1429
[#1430]: https://github.com/nextstrain/augur/pull/1430

## 24.2.3 (23 February 2024)

Expand Down
213 changes: 154 additions & 59 deletions augur/filter/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Filter and subsample a sequence set.
"""
from augur.argparse_ import SKIP_AUTO_DEFAULT_IN_HELP
from augur.dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT
from augur.filter.io import ACCEPTED_TYPES, column_type_pair
from augur.io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN
Expand All @@ -14,74 +15,168 @@ def register_arguments(parser):
Kept as a separate function from `register_parser` to continue to support
unit tests that use this function to create argparser.
"""
input_group = parser.add_argument_group("inputs", "metadata and sequences to be filtered")
input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata")
input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format")
input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used.")
input_group.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
input_group = parser.add_argument_group(
title="Inputs",
description="Metadata and sequences to be filtered.")
input_group.add_argument('--metadata', required=True, metavar="FILE",
help=f"Sequence metadata. {SKIP_AUTO_DEFAULT_IN_HELP}")
input_group.add_argument('--sequences', '-s', metavar="FILE",
help=f"Sequences in FASTA or VCF format. {SKIP_AUTO_DEFAULT_IN_HELP}")
input_group.add_argument('--sequence-index', metavar="FILE",
help=f"""Sequence composition report generated by augur index. If not
provided, an index will be created on the fly.
{SKIP_AUTO_DEFAULT_IN_HELP}""")
input_group.add_argument('--metadata-chunk-size', type=int, metavar="N", default=100000,
help="""Maximum number of metadata records to read into memory at a
time. Increasing this number can speed up filtering at the cost of
more memory used.""")
input_group.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", metavar="COLUMN",
help="""Names of possible metadata columns containing strain identifier
information, ordered by priority. Only one ID column will be
inferred.""")
input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", metavar="DELIMITER",
help="""Delimiters to accept when reading a metadata file. Only one
delimiter will be inferred.""")

metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata")
metadata_filter_group.add_argument(
'--query',
help="""Filter samples by attribute.
Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
(e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
metadata_filter_group = parser.add_argument_group(
title="Metadata filters",
description="Filters to apply to metadata.")
metadata_filter_group.add_argument('--query', metavar="QUERY",
help=f"""Filter strains by attribute. Uses Pandas DataFrame querying, see
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query
for syntax. (e.g., --query "country == 'Colombia'" or --query
"(country == 'USA' & (division == 'Washington'))")
{SKIP_AUTO_DEFAULT_IN_HELP}"""
)
metadata_filter_group.add_argument('--query-columns', type=column_type_pair, nargs="+", help=f"""
Use alongside --query to specify columns and data types in the format 'column:type', where type is one of ({','.join(ACCEPTED_TYPES)}).
Automatic type inference will be attempted on all unspecified columns used in the query.
Example: region:str coverage:float.
""")
metadata_filter_group.add_argument('--min-date', type=numeric_date_type, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--max-date', type=numeric_date_type, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
metadata_filter_group.add_argument('--exclude-where', nargs='+',
help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND")
metadata_filter_group.add_argument('--exclude-all', action="store_true", help="exclude all strains by default. Use this with the include arguments to select a specific subset of strains.")
metadata_filter_group.add_argument('--include', type=str, nargs="+", help="file(s) with list of strains to include regardless of priorities, subsampling, or absence of an entry in --sequences.")
metadata_filter_group.add_argument('--include-where', nargs='+', help="""
Include samples with these values. ex: host=rat. Multiple values are
processed as OR (having any of those specified will be included), not
AND. This rule is applied last and ensures any strains matching these
rules will be included regardless of priorities, subsampling, or absence
of an entry in --sequences.""")
metadata_filter_group.add_argument('--query-columns', type=column_type_pair, nargs="+", metavar="COLUMN",
help=f"""Use alongside --query to specify columns and data types in the
format 'column:type', where type is one of
{sorted(ACCEPTED_TYPES)}. Automatic type inference will be
attempted on all unspecified columns used in the query. Example:
region:str coverage:float. {SKIP_AUTO_DEFAULT_IN_HELP}""")
metadata_filter_group.add_argument('--min-date', type=numeric_date_type, metavar="DATE",
help=f"""Minimal cutoff for date, the cutoff date is inclusive; may be
specified as: {SUPPORTED_DATE_HELP_TEXT}
{SKIP_AUTO_DEFAULT_IN_HELP}""")
metadata_filter_group.add_argument('--max-date', type=numeric_date_type, metavar="DATE",
help=f"""Maximal cutoff for date, the cutoff date is inclusive; may be
specified as: {SUPPORTED_DATE_HELP_TEXT}
{SKIP_AUTO_DEFAULT_IN_HELP}""")
metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'], metavar="LEVEL",
help=f"""Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g.,
2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An
ambiguous year makes the corresponding month and day ambiguous,
too, even if those fields have unambiguous values (e.g.,
"201X-10-01"). Similarly, an ambiguous month makes the
corresponding day ambiguous (e.g., "2010-XX-01").
{SKIP_AUTO_DEFAULT_IN_HELP}""")
metadata_filter_group.add_argument('--exclude', type=str, nargs="+", metavar="FILE",
help=f"""File(s) with list of strain IDs to exclude. The ID column is
determined by --metadata-id-columns.
{SKIP_AUTO_DEFAULT_IN_HELP}""")
metadata_filter_group.add_argument('--exclude-where', nargs='+', metavar="CONDITION",
help=f"""Exclude strains matching these conditions. Ex: \"host=rat\" or
\"host!=rat\". Multiple values are processed as OR (matching any of
those specified will be excluded), not AND. {SKIP_AUTO_DEFAULT_IN_HELP}""")
metadata_filter_group.add_argument('--exclude-all', action="store_true",
help=f"""Exclude all strains by default. Use this with the include
arguments to select a specific subset of strains.
{SKIP_AUTO_DEFAULT_IN_HELP}""")
metadata_filter_group.add_argument('--include', type=str, nargs="+", metavar="FILE",
help=f"""File(s) with list of strain IDs to include regardless of
priorities, subsampling, or absence of an entry in --sequences.
The ID column is determined by --metadata-id-columns.
{SKIP_AUTO_DEFAULT_IN_HELP}""")
metadata_filter_group.add_argument('--include-where', nargs='+', metavar="CONDITION",
help=f"""Include strains with these values. ex: host=rat. Multiple
values are processed as OR (having any of those specified will be
included), not AND. This rule is applied last and ensures any
strains matching these rules will be included regardless of
priorities, subsampling, or absence of an entry in --sequences.
{SKIP_AUTO_DEFAULT_IN_HELP}""")

sequence_filter_group = parser.add_argument_group("sequence filters", "filters to apply to sequence data")
sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)")
sequence_filter_group.add_argument('--max-length', type=int, help="maximum length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)")
sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
sequence_filter_group = parser.add_argument_group(
title="Sequence filters",
description="Filters to apply to sequence data.")
sequence_filter_group.add_argument('--min-length', type=int, metavar="N",
help=f"""Minimal length of the sequences, only counting standard
nucleotide characters A, C, G, or T (case-insensitive).
{SKIP_AUTO_DEFAULT_IN_HELP}""")
sequence_filter_group.add_argument('--max-length', type=int, metavar="N",
help=f"""Maximum length of the sequences, only counting standard
nucleotide characters A, C, G, or T (case-insensitive).
{SKIP_AUTO_DEFAULT_IN_HELP}""")
sequence_filter_group.add_argument('--non-nucleotide', action='store_true',
help=f"""Exclude sequences that contain illegal characters.
{SKIP_AUTO_DEFAULT_IN_HELP}""")

subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")
subsample_group.add_argument('--group-by', nargs='+', help=f"""
categories with respect to subsample.
Notes:
(1) Grouping by {sorted(constants.GROUP_BY_GENERATED_COLUMNS)} is only supported when there is a {METADATA_DATE_COLUMN!r} column in the metadata.
(2) 'week' uses the ISO week numbering system, where a week starts on a Monday and ends on a Sunday.
subsample_group = parser.add_argument_group(
title="Subsampling",
description="Options to subsample filtered data.")
subsample_group.add_argument('--group-by', nargs='+', metavar="COLUMN",
help=f"""Categories with respect to subsample. Notes:
(1) Grouping by {sorted(constants.GROUP_BY_GENERATED_COLUMNS)} is only
supported when there is a {METADATA_DATE_COLUMN!r} column in the
metadata.
(2) 'week' uses the ISO week numbering system, where a week starts on a
Monday and ends on a Sunday.
(3) 'month' and 'week' grouping cannot be used together.
(4) Custom columns {sorted(constants.GROUP_BY_GENERATED_COLUMNS)} in the metadata are ignored for grouping. Please rename them if you want to use their values for grouping.""")
(4) Custom columns {sorted(constants.GROUP_BY_GENERATED_COLUMNS)} in the
metadata are ignored for grouping. Please rename them if you want to
use their values for grouping.
{SKIP_AUTO_DEFAULT_IN_HELP}""")

subsample_limits_group = subsample_group.add_mutually_exclusive_group()
subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences; can be used without the group_by argument")
subsample_limits_group.add_argument('--sequences-per-group', type=int, metavar="N",
help=f"""Subsample to no more than this number of strains per
category. {SKIP_AUTO_DEFAULT_IN_HELP}""")
subsample_limits_group.add_argument('--subsample-max-sequences', type=int, metavar="N",
help=f"""Subsample to no more than this number of strains; can be used
without --group-by. {SKIP_AUTO_DEFAULT_IN_HELP}""")

probabilistic_sampling_group = subsample_group.add_mutually_exclusive_group()
probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Allow probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true',
help="""Allow probabilistic sampling during subsampling. This is useful
when there are more groups than requested strains. This option only
applies when `--subsample-max-sequences` is provided.""")
probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
subsample_group.add_argument('--priority', type=str, help="""tab-delimited file with list of priority scores for strains (e.g., "<strain>\\t<priority>") and no header.
When scores are provided, Augur converts scores to floating point values, sorts strains within each subsampling group from highest to lowest priority, and selects the top N strains per group where N is the calculated or requested number of strains per group.
Higher numbers indicate higher priority.
Since priorities represent relative values between strains, these values can be arbitrary.""")
subsample_group.add_argument('--subsample-seed', type=int, help="random number generator seed to allow reproducible subsampling (with same input data).")

output_group = parser.add_argument_group("outputs", "options related to outputs, at least one of the possible representations of filtered data (--output, --output-metadata, --output-strains) is required")
output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format")
output_group.add_argument('--output-metadata', help="metadata for strains that passed filters")
output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)")
output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.")
output_group.add_argument(
'--empty-output-reporting',
subsample_group.add_argument('--priority', type=str, metavar="FILE",
help=f"""Tab-delimited file with list of priority scores for strains
(e.g., "<strain ID>\\t<priority>") and no header. When scores are
provided, Augur converts scores to floating point values, sorts
strains within each subsampling group from highest to lowest
priority, and selects the top N strains per group where N is the
calculated or requested number of strains per group. Higher
numbers indicate higher priority. Since priorities represent
relative values between strains, these values can be arbitrary.
The ID column is determined by --metadata-id-columns.
{SKIP_AUTO_DEFAULT_IN_HELP}""")
subsample_group.add_argument('--subsample-seed', type=int, metavar="N",
help=f"""Random number generator seed to allow reproducible subsampling
(with same input data). {SKIP_AUTO_DEFAULT_IN_HELP}""")

output_group = parser.add_argument_group(
title="Outputs",
description="""Options related to outputs. At least one of the possible
representations of filtered data (--output,
--output-metadata, --output-strains) is required.""")
output_group.add_argument('--output', '--output-sequences', '-o', metavar="FILE",
help=f"""Filtered sequences in FASTA format.
{SKIP_AUTO_DEFAULT_IN_HELP}""")
output_group.add_argument('--output-metadata', metavar="FILE",
help=f"""Metadata for strains that passed filters.
{SKIP_AUTO_DEFAULT_IN_HELP}""")
output_group.add_argument('--output-strains', metavar="FILE",
help=f"""List of strain IDs that passed filters (no header). The ID
column is determined by --metadata-id-columns.
{SKIP_AUTO_DEFAULT_IN_HELP}""")
output_group.add_argument('--output-log', metavar="FILE",
help=f"""Tab-delimited file with one row for each filtered strain and
the reason it was filtered. Keyword arguments used for a given
filter are reported in JSON format in a `kwargs` column.
{SKIP_AUTO_DEFAULT_IN_HELP}""")
output_group.add_argument('--empty-output-reporting',
type=EmptyOutputReportingMethod.argtype,
choices=list(EmptyOutputReportingMethod),
default=EmptyOutputReportingMethod.ERROR,
Expand Down
Loading
Loading