nextstrain · victorlin · Mar 5, 2024 · Mar 5, 2024 · Mar 5, 2024 · Mar 5, 2024
diff --git a/CHANGES.md b/CHANGES.md
@@ -9,9 +9,11 @@
 ### Bug Fixes
 
 * filter: Updated docs with an example of tiered subsampling. [#1425][] (@victorlin)
+* filter: Clarified the help text shown when running `augur filter --help` and rendered on the docs page. [#1430][] (@victorlin)
 
 [#1425]: https://github.com/nextstrain/augur/pull/1425
 [#1429]: https://github.com/nextstrain/augur/pull/1429
+[#1430]: https://github.com/nextstrain/augur/pull/1430
 
 ## 24.2.3 (23 February 2024)
 

diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py
@@ -1,6 +1,7 @@
 """
 Filter and subsample a sequence set.
 """
+from augur.argparse_ import SKIP_AUTO_DEFAULT_IN_HELP
 from augur.dates import numeric_date_type, SUPPORTED_DATE_HELP_TEXT
 from augur.filter.io import ACCEPTED_TYPES, column_type_pair
 from augur.io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN
@@ -14,74 +15,168 @@ def register_arguments(parser):
     Kept as a separate function than `register_parser` to continue to support
     unit tests that use this function to create argparser.
     """
-    input_group = parser.add_argument_group("inputs", "metadata and sequences to be filtered")
-    input_group.add_argument('--metadata', required=True, metavar="FILE", help="sequence metadata")
-    input_group.add_argument('--sequences', '-s', help="sequences in FASTA or VCF format")
-    input_group.add_argument('--sequence-index', help="sequence composition report generated by augur index. If not provided, an index will be created on the fly.")
-    input_group.add_argument('--metadata-chunk-size', type=int, default=100000, help="maximum number of metadata records to read into memory at a time. Increasing this number can speed up filtering at the cost of more memory used.")
-    input_group.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.")
-    input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.")
+    input_group = parser.add_argument_group(
+        title="Inputs",
+        description="Metadata and sequences to be filtered.")
+    input_group.add_argument('--metadata', required=True, metavar="FILE",
+        help=f"Sequence metadata. {SKIP_AUTO_DEFAULT_IN_HELP}")
+    input_group.add_argument('--sequences', '-s', metavar="FILE",
+        help=f"Sequences in FASTA or VCF format. {SKIP_AUTO_DEFAULT_IN_HELP}")
+    input_group.add_argument('--sequence-index', metavar="FILE",
+        help=f"""Sequence composition report generated by augur index. If not
+              provided, an index will be created on the fly.
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    input_group.add_argument('--metadata-chunk-size', type=int, metavar="N", default=100000,
+        help="""Maximum number of metadata records to read into memory at a
+             time. Increasing this number can speed up filtering at the cost of
+             more memory used.""")
+    input_group.add_argument('--metadata-id-columns', default=DEFAULT_ID_COLUMNS, nargs="+", metavar="COLUMN",
+        help="""Names of possible metadata columns containing strain identifier
+             information, ordered by priority. Only one ID column will be
+             inferred.""")
+    input_group.add_argument('--metadata-delimiters', default=DEFAULT_DELIMITERS, nargs="+", metavar="DELIMITER",
+        help="""Delimiters to accept when reading a metadata file. Only one
+             delimiter will be inferred.""")
 
-    metadata_filter_group = parser.add_argument_group("metadata filters", "filters to apply to metadata")
-    metadata_filter_group.add_argument(
-        '--query',
-        help="""Filter samples by attribute.
-        Uses Pandas Dataframe querying, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query for syntax.
-        (e.g., --query "country == 'Colombia'" or --query "(country == 'USA' & (division == 'Washington'))")"""
+    metadata_filter_group = parser.add_argument_group(
+        title="Metadata filters",
+        description="Filters to apply to metadata.")
+    metadata_filter_group.add_argument('--query', metavar="QUERY",
+        help=f"""Filter strains by attribute. Uses Pandas DataFrame querying, see
+              https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query
+              for syntax. (e.g., --query "country == 'Colombia'" or --query
+              "(country == 'USA' & (division == 'Washington'))")
+              {SKIP_AUTO_DEFAULT_IN_HELP}"""
     )
-    metadata_filter_group.add_argument('--query-columns', type=column_type_pair, nargs="+", help=f"""
-        Use alongside --query to specify columns and data types in the format 'column:type', where type is one of ({','.join(ACCEPTED_TYPES)}).
-        Automatic type inference will be attempted on all unspecified columns used in the query.
-        Example: region:str coverage:float.
-    """)
-    metadata_filter_group.add_argument('--min-date', type=numeric_date_type, help=f"minimal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
-    metadata_filter_group.add_argument('--max-date', type=numeric_date_type, help=f"maximal cutoff for date, the cutoff date is inclusive; may be specified as: {SUPPORTED_DATE_HELP_TEXT}")
-    metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'],
-                                help='Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g., 2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous year makes the corresponding month and day ambiguous, too, even if those fields have unambiguous values (e.g., "201X-10-01"). Similarly, an ambiguous month makes the corresponding day ambiguous (e.g., "2010-XX-01").')
-    metadata_filter_group.add_argument('--exclude', type=str, nargs="+", help="file(s) with list of strains to exclude")
-    metadata_filter_group.add_argument('--exclude-where', nargs='+',
-                                help="Exclude samples matching these conditions. Ex: \"host=rat\" or \"host!=rat\". Multiple values are processed as OR (matching any of those specified will be excluded), not AND")
-    metadata_filter_group.add_argument('--exclude-all', action="store_true", help="exclude all strains by default. Use this with the include arguments to select a specific subset of strains.")
-    metadata_filter_group.add_argument('--include', type=str, nargs="+", help="file(s) with list of strains to include regardless of priorities, subsampling, or absence of an entry in --sequences.")
-    metadata_filter_group.add_argument('--include-where', nargs='+', help="""
-        Include samples with these values. ex: host=rat. Multiple values are
-        processed as OR (having any of those specified will be included), not
-        AND. This rule is applied last and ensures any strains matching these
-        rules will be included regardless of priorities, subsampling, or absence
-        of an entry in --sequences.""")
+    metadata_filter_group.add_argument('--query-columns', type=column_type_pair, nargs="+", metavar="COLUMN",
+        help=f"""Use alongside --query to specify columns and data types in the
+              format 'column:type', where type is one of
+              {sorted(ACCEPTED_TYPES)}. Automatic type inference will be
+              attempted on all unspecified columns used in the query. Example:
+              region:str coverage:float. {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    metadata_filter_group.add_argument('--min-date', type=numeric_date_type, metavar="DATE",
+        help=f"""Minimal cutoff for date, the cutoff date is inclusive; may be
+              specified as: {SUPPORTED_DATE_HELP_TEXT}
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    metadata_filter_group.add_argument('--max-date', type=numeric_date_type, metavar="DATE",
+        help=f"""Maximal cutoff for date, the cutoff date is inclusive; may be
+              specified as: {SUPPORTED_DATE_HELP_TEXT}
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    metadata_filter_group.add_argument('--exclude-ambiguous-dates-by', choices=['any', 'day', 'month', 'year'], metavar="LEVEL",
+        help=f"""Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g.,
+              2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An
+              ambiguous year makes the corresponding month and day ambiguous,
+              too, even if those fields have unambiguous values (e.g.,
+              "201X-10-01"). Similarly, an ambiguous month makes the
+              corresponding day ambiguous (e.g., "2010-XX-01").
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    metadata_filter_group.add_argument('--exclude', type=str, nargs="+", metavar="FILE",
+        help=f"""File(s) with list of strain IDs to exclude. The ID column is
+             determined by --metadata-id-columns.
+             {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    metadata_filter_group.add_argument('--exclude-where', nargs='+', metavar="CONDITION",
+        help=f"""Exclude strains matching these conditions. Ex: \"host=rat\" or
+              \"host!=rat\". Multiple values are processed as OR (matching any of
+              those specified will be excluded), not AND. {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    metadata_filter_group.add_argument('--exclude-all', action="store_true",
+        help=f"""Exclude all strains by default. Use this with the include
+              arguments to select a specific subset of strains.
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    metadata_filter_group.add_argument('--include', type=str, nargs="+", metavar="FILE",
+        help=f"""File(s) with list of strain IDs to include regardless of
+              priorities, subsampling, or absence of an entry in --sequences.
+              The ID column is determined by --metadata-id-columns.
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    metadata_filter_group.add_argument('--include-where', nargs='+', metavar="CONDITION",
+        help=f"""Include strains with these values. ex: host=rat. Multiple
+              values are processed as OR (having any of those specified will be
+              included), not AND. This rule is applied last and ensures any
+              strains matching these rules will be included regardless of
+              priorities, subsampling, or absence of an entry in --sequences.
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
 
-    sequence_filter_group = parser.add_argument_group("sequence filters", "filters to apply to sequence data")
-    sequence_filter_group.add_argument('--min-length', type=int, help="minimal length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)")
-    sequence_filter_group.add_argument('--max-length', type=int, help="maximum length of the sequences, only counting standard nucleotide characters A, C, G, or T (case-insensitive)")
-    sequence_filter_group.add_argument('--non-nucleotide', action='store_true', help="exclude sequences that contain illegal characters")
+    sequence_filter_group = parser.add_argument_group(
+        title="Sequence filters",
+        description="Filters to apply to sequence data.")
+    sequence_filter_group.add_argument('--min-length', type=int, metavar="N",
+        help=f"""Minimal length of the sequences, only counting standard
+              nucleotide characters A, C, G, or T (case-insensitive).
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    sequence_filter_group.add_argument('--max-length', type=int, metavar="N",
+        help=f"""Maximum length of the sequences, only counting standard
+              nucleotide characters A, C, G, or T (case-insensitive).
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    sequence_filter_group.add_argument('--non-nucleotide', action='store_true',
+        help=f"""Exclude sequences that contain illegal characters.
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
 
-    subsample_group = parser.add_argument_group("subsampling", "options to subsample filtered data")
-    subsample_group.add_argument('--group-by', nargs='+', help=f"""
-        categories with respect to subsample.
-        Notes:
-        (1) Grouping by {sorted(constants.GROUP_BY_GENERATED_COLUMNS)} is only supported when there is a {METADATA_DATE_COLUMN!r} column in the metadata.
-        (2) 'week' uses the ISO week numbering system, where a week starts on a Monday and ends on a Sunday.
+    subsample_group = parser.add_argument_group(
+        title="Subsampling",
+        description="Options to subsample filtered data.")
+    subsample_group.add_argument('--group-by', nargs='+', metavar="COLUMN",
+        help=f"""Categories with respect to subsample. Notes:
+        (1) Grouping by {sorted(constants.GROUP_BY_GENERATED_COLUMNS)} is only
+            supported when there is a {METADATA_DATE_COLUMN!r} column in the
+            metadata.
+        (2) 'week' uses the ISO week numbering system, where a week starts on a
+            Monday and ends on a Sunday.
         (3) 'month' and 'week' grouping cannot be used together.
-        (4) Custom columns {sorted(constants.GROUP_BY_GENERATED_COLUMNS)} in the metadata are ignored for grouping. Please rename them if you want to use their values for grouping.""")
+        (4) Custom columns {sorted(constants.GROUP_BY_GENERATED_COLUMNS)} in the
+            metadata are ignored for grouping. Please rename them if you want to
+            use their values for grouping.
+        {SKIP_AUTO_DEFAULT_IN_HELP}""")
+
     subsample_limits_group = subsample_group.add_mutually_exclusive_group()
-    subsample_limits_group.add_argument('--sequences-per-group', type=int, help="subsample to no more than this number of sequences per category")
-    subsample_limits_group.add_argument('--subsample-max-sequences', type=int, help="subsample to no more than this number of sequences; can be used without the group_by argument")
+    subsample_limits_group.add_argument('--sequences-per-group', type=int, metavar="N",
+        help=f"""Subsample to no more than this number of strains per
+              category. {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    subsample_limits_group.add_argument('--subsample-max-sequences', type=int, metavar="N",
+        help=f"""Subsample to no more than this number of strains; can be used
+              without --group-by. {SKIP_AUTO_DEFAULT_IN_HELP}""")
+
     probabilistic_sampling_group = subsample_group.add_mutually_exclusive_group()
-    probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true', help="Allow probabilistic sampling during subsampling. This is useful when there are more groups than requested sequences. This option only applies when `--subsample-max-sequences` is provided.")
+    probabilistic_sampling_group.add_argument('--probabilistic-sampling', action='store_true',
+        help="""Allow probabilistic sampling during subsampling. This is useful
+             when there are more groups than requested strains. This option only
+             applies when `--subsample-max-sequences` is provided.""")
     probabilistic_sampling_group.add_argument('--no-probabilistic-sampling', action='store_false', dest='probabilistic_sampling')
-    subsample_group.add_argument('--priority', type=str, help="""tab-delimited file with list of priority scores for strains (e.g., "<strain>\\t<priority>") and no header.
-    When scores are provided, Augur converts scores to floating point values, sorts strains within each subsampling group from highest to lowest priority, and selects the top N strains per group where N is the calculated or requested number of strains per group.
-    Higher numbers indicate higher priority.
-    Since priorities represent relative values between strains, these values can be arbitrary.""")
-    subsample_group.add_argument('--subsample-seed', type=int, help="random number generator seed to allow reproducible subsampling (with same input data).")
 
-    output_group = parser.add_argument_group("outputs", "options related to outputs, at least one of the possible representations of filtered data (--output, --output-metadata, --output-strains) is required")
-    output_group.add_argument('--output', '--output-sequences', '-o', help="filtered sequences in FASTA format")
-    output_group.add_argument('--output-metadata', help="metadata for strains that passed filters")
-    output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)")
-    output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.")
-    output_group.add_argument(
-        '--empty-output-reporting',
+    subsample_group.add_argument('--priority', type=str, metavar="FILE",
+        help=f"""Tab-delimited file with list of priority scores for strains
+              (e.g., "<strain ID>\\t<priority>") and no header. When scores are
+              provided, Augur converts scores to floating point values, sorts
+              strains within each subsampling group from highest to lowest
+              priority, and selects the top N strains per group where N is the
+              calculated or requested number of strains per group. Higher
+              numbers indicate higher priority. Since priorities represent
+              relative values between strains, these values can be arbitrary.
+              The ID column is determined by --metadata-id-columns.
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    subsample_group.add_argument('--subsample-seed', type=int, metavar="N",
+        help=f"""Random number generator seed to allow reproducible subsampling
+              (with same input data). {SKIP_AUTO_DEFAULT_IN_HELP}""")
+
+    output_group = parser.add_argument_group(
+        title="Outputs",
+        description="""Options related to outputs. At least one of the possible
+                    representations of filtered data (--output,
+                    --output-metadata, --output-strains) is required.""")
+    output_group.add_argument('--output', '--output-sequences', '-o', metavar="FILE",
+        help=f"""Filtered sequences in FASTA format.
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    output_group.add_argument('--output-metadata', metavar="FILE",
+        help=f"""Metadata for strains that passed filters.
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    output_group.add_argument('--output-strains', metavar="FILE",
+        help=f"""List of strain IDs that passed filters (no header). The ID
+              column is determined by --metadata-id-columns.
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    output_group.add_argument('--output-log', metavar="FILE",
+        help=f"""Tab-delimited file with one row for each filtered strain and
+              the reason it was filtered. Keyword arguments used for a given
+              filter are reported in JSON format in a `kwargs` column.
+              {SKIP_AUTO_DEFAULT_IN_HELP}""")
+    output_group.add_argument('--empty-output-reporting',
         type=EmptyOutputReportingMethod.argtype,
         choices=list(EmptyOutputReportingMethod),
         default=EmptyOutputReportingMethod.ERROR,