nextstrain
diff --git a/‎.gitattributes‎
Lines changed: 4 additions & 0 deletions b/‎.gitattributes‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎.github/workflows/ci.yaml‎
Lines changed: 20 additions & 0 deletions b/‎.github/workflows/ci.yaml‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎CHANGES.md‎
Lines changed: 4 additions & 0 deletions b/‎CHANGES.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎augur/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎augur/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎augur/data/schema-subsample-config.json‎
Lines changed: 140 additions & 0 deletions b/‎augur/data/schema-subsample-config.json‎
Lines changed: 140 additions & 0 deletions
diff --git a/‎augur/dates/__init__.py‎
Lines changed: 4 additions & 3 deletions b/‎augur/dates/__init__.py‎
Lines changed: 4 additions & 3 deletions
@@ -0,0 +1,4 @@
+# This is a large generated file.  Although it is text, its diff is not
+# useful to show routinely.  A diff can be forced as needed, e.g. with
+# `git diff --text`.
+/augur/data/schema-subsample-config.json -diff
@@ -384,6 +384,26 @@ jobs:
             exit 1
           fi
 
+  check-subsample-config-schema:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/setup-python@v5
+
+      - uses: actions/checkout@v5
+
+      - run: pip install .[dev]
+
+      - run: ./devel/regenerate-subsample-schema
+
+      - name: Check for changes
+        run: |
+          if [[ -n $(git status --porcelain) ]]; then
+            git add .
+            git diff --staged >&2
+            echo "There are changes that affect the augur subsample config schema. Please regenerate by running devel/regenerate-subsample-schema." >&2
+            exit 1
+          fi
+
   release:
     # Only run when called by the release workflow on the default branch
     if: github.workflow_ref == format('{0}/.github/workflows/release.yaml@refs/heads/{1}', github.repository, github.event.repository.default_branch)
 
@@ -2,6 +2,10 @@
 
 ## __NEXT__
 
+* A new command, `augur subsample`, supports complex subsampling using file-based configuration. See the updated [Filtering and Subsampling guide][] for a comparison with `augur filter`. [#635][] (@victorlin)
+
+[#635]: https://github.com/nextstrain/augur/issues/635
+[Filtering and Subsampling guide]: https://docs.nextstrain.org/en/latest/guides/bioinformatics/filtering-and-subsampling.html
 
 ## 31.4.0 (14 August 2025)
 
 
@@ -25,6 +25,7 @@
     "merge",
     "index",
     "filter",
+    "subsample",
     "mask",
     "align",
     "tree",
 
@@ -0,0 +1,140 @@
+{
+    "_description": "This file is generated by devel/regenerate-subsample-schema. Do not edit manually - edit the script instead.",
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "$id": "https://nextstrain.org/schemas/augur/subsample-config/v1",
+    "title": "Configuration file to be supplied to `augur subsample --config`",
+    "type": "object",
+    "additionalProperties": false,
+    "required": [
+        "samples"
+    ],
+    "$defs": {
+        "sampleProperties": {
+            "type": "object",
+            "additionalProperties": false,
+            "properties": {
+                "exclude": {
+                    "oneOf": [
+                        {
+                            "type": "string"
+                        },
+                        {
+                            "type": "array",
+                            "items": {
+                                "type": "string"
+                            }
+                        }
+                    ],
+                    "description": "File(s) with list of strains to exclude. Paths must be relative to the\nworking directory."
+                },
+                "exclude_all": {
+                    "type": "boolean",
+                    "description": "Exclude all strains by default. Use this with the include arguments to\nselect a specific subset of strains."
+                },
+                "exclude_ambiguous_dates_by": {
+                    "type": "string",
+                    "enum": [
+                        "any",
+                        "day",
+                        "month",
+                        "year"
+                    ],
+                    "description": "Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g.,\n2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous\nyear makes the corresponding month and day ambiguous, too, even if those\nfields have unambiguous values (e.g., \"201X-10-01\"). Similarly, an\nambiguous month makes the corresponding day ambiguous (e.g.,\n\"2010-XX-01\")."
+                },
+                "exclude_where": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    },
+                    "description": "Exclude sequences matching these conditions. Ex: \"host=rat\" or\n\"host!=rat\". Multiple values are processed as OR (matching any of those\nspecified will be excluded), not AND."
+                },
+                "include": {
+                    "oneOf": [
+                        {
+                            "type": "string"
+                        },
+                        {
+                            "type": "array",
+                            "items": {
+                                "type": "string"
+                            }
+                        }
+                    ],
+                    "description": "File(s) with list of strains to include regardless of priorities,\nsubsampling, or absence of an entry in sequences. Paths must be relative\nto the working directory."
+                },
+                "include_where": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    },
+                    "description": "Include sequences with these values. ex: host=rat. Multiple values are\nprocessed as OR (having any of those specified will be included), not\nAND. This rule is applied last and ensures any strains matching these\nrules will be included regardless of priorities, subsampling, or absence\nof an entry in sequences."
+                },
+                "min_date": {
+                    "type": "string",
+                    "description": "Minimal cutoff for date (inclusive). Supported formats:\n\n1. an Augur-style numeric date with the year as the integer part (e.g.\n   2020.42) or\n2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or\n3. a backwards-looking relative date in ISO 8601 duration format with\n   optional P prefix (e.g. '1W', 'P1W')"
+                },
+                "max_date": {
+                    "type": "string",
+                    "description": "Maximal cutoff for date (inclusive). Supported formats:\n\n1. an Augur-style numeric date with the year as the integer part (e.g.\n   2020.42) or\n2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or\n3. a backwards-looking relative date in ISO 8601 duration format with\n   optional P prefix (e.g. '1W', 'P1W')"
+                },
+                "min_length": {
+                    "type": "integer",
+                    "description": "Minimal length of the sequences, only counting standard nucleotide\ncharacters A, C, G, or T (case-insensitive)."
+                },
+                "max_length": {
+                    "type": "integer",
+                    "description": "Maximum length of the sequences, only counting standard nucleotide\ncharacters A, C, G, or T (case-insensitive)."
+                },
+                "non_nucleotide": {
+                    "type": "boolean",
+                    "description": "Exclude sequences that contain illegal characters."
+                },
+                "query": {
+                    "type": "string",
+                    "description": "Filter sequences by attribute. Uses `Pandas DataFrame query syntax`__.\n(e.g., \"country == 'Colombia'\" or \"(country == 'USA' & (division ==\n'Washington'))\")\n\n__ https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query"
+                },
+                "query_columns": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    },
+                    "description": "Use alongside query to specify columns and data types in the format\n'column:type', where type is one of\n(bool,float,int,str). Automatic type inference will be\nattempted on all unspecified columns used in the query. Example:\nregion:str coverage:float."
+                },
+                "group_by": {
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    },
+                    "description": "Grouping columns for subsampling. Notes:\n\n(1) Grouping by ['month', 'week', 'year'] is only\n    supported when there is a 'date' column in the\n    metadata.\n(2) 'week' uses the ISO week numbering system, where a week starts on a\n    Monday and ends on a Sunday.\n(3) 'month' and 'week' grouping cannot be used together.\n(4) Custom columns ['month', 'week', 'year'] in the\n    metadata are ignored for grouping. Please rename them if you want to\n    use their values for grouping."
+                },
+                "group_by_weights": {
+                    "type": "string",
+                    "description": "TSV file defining weights for grouping. Path must be relative to the\nworking directory. Requirements:\n\n(1) Lines starting with '#' are treated as comment lines.\n(2) The first non-comment line must be a header row.\n(3) There must be a numeric ``weight`` column (weights can take on any\n    non-negative values).\n(4) Other columns must be a subset of grouping columns, with\n    combinations of values covering all combinations present in the\n    metadata.\n(5) This option only applies when grouping columns and a total sample\n    size are provided.\n(6) This option can only be used when probabilistic sampling is allowed.\n\nNotes:\n\n(1) Any grouping columns absent from this file will be given equal\n    weighting across all values *within* groups defined by the other\n    weighted columns.\n(2) An entry with the value ``default`` under all columns will be\n    treated as the default weight for specific groups present in the\n    metadata but missing from the weights file. If there is no default\n    weight and the metadata contains rows that are not covered by the\n    given weights, augur filter will exit with an error."
+                },
+                "probabilistic_sampling": {
+                    "type": "boolean",
+                    "description": "Allow probabilistic sampling during subsampling. This is useful when\nthere are more groups than requested sequences. This option only applies\nwhen a total sample size is provided."
+                },
+                "sequences_per_group": {
+                    "type": "integer",
+                    "description": "Select no more than this number of sequences per category."
+                },
+                "max_sequences": {
+                    "type": "integer",
+                    "description": "Select no more than this number of sequences (i.e. total sample\nsize). Can be used without grouping columns."
+                }
+            }
+        }
+    },
+    "properties": {
+        "samples": {
+            "type": "object",
+            "minProperties": 1,
+            "patternProperties": {
+                "^.+$": {
+                    "$ref": "#/$defs/sampleProperties"
+                }
+            }
+        }
+    }
+}
@@ -13,10 +13,11 @@
 from .ambiguous_date import AmbiguousDate
 
 SUPPORTED_DATE_HELP_TEXT = dedent("""\
-    1. an Augur-style numeric date with the year as the integer part (e.g. 2020.42) or
+    1. an Augur-style numeric date with the year as the integer part (e.g.
+       2020.42) or
     2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or
-    3. a backwards-looking relative date in ISO 8601 duration format with optional P prefix (e.g. '1W', 'P1W')
-""")
+    3. a backwards-looking relative date in ISO 8601 duration format with
+       optional P prefix (e.g. '1W', 'P1W')""")
 
 def date_to_numeric(date: datetime.date) -> float:
     """Wrapper around treetime.utils.numeric_date that ensures a float is returned."""