You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Copy file name to clipboard. Expand all lines: CHANGES.md
+4 Lines changed: 4 additions & 0 deletions
Display the source diff
Display the rich diff
Original file line number
Diff line number
Diff line change
@@ -2,6 +2,10 @@
2
2
3
3
## __NEXT__
4
4
5
+
* A new command, `augur subsample`, supports complex subsampling using file-based configuration. See the updated [Filtering and Subsampling guide][] for a comparison with `augur filter`. [#635][] (@victorlin)
"title": "Configuration file to be supplied to `augur subsample --config`",
6
+
"type": "object",
7
+
"additionalProperties": false,
8
+
"required": [
9
+
"samples"
10
+
],
11
+
"$defs": {
12
+
"sampleProperties": {
13
+
"type": "object",
14
+
"additionalProperties": false,
15
+
"properties": {
16
+
"exclude": {
17
+
"oneOf": [
18
+
{
19
+
"type": "string"
20
+
},
21
+
{
22
+
"type": "array",
23
+
"items": {
24
+
"type": "string"
25
+
}
26
+
}
27
+
],
28
+
"description": "File(s) with list of strains to exclude. Paths must be relative to the\nworking directory."
29
+
},
30
+
"exclude_all": {
31
+
"type": "boolean",
32
+
"description": "Exclude all strains by default. Use this with the include arguments to\nselect a specific subset of strains."
33
+
},
34
+
"exclude_ambiguous_dates_by": {
35
+
"type": "string",
36
+
"enum": [
37
+
"any",
38
+
"day",
39
+
"month",
40
+
"year"
41
+
],
42
+
"description": "Exclude ambiguous dates by day (e.g., 2020-09-XX), month (e.g.,\n2020-XX-XX), year (e.g., 200X-10-01), or any date fields. An ambiguous\nyear makes the corresponding month and day ambiguous, too, even if those\nfields have unambiguous values (e.g., \"201X-10-01\"). Similarly, an\nambiguous month makes the corresponding day ambiguous (e.g.,\n\"2010-XX-01\")."
43
+
},
44
+
"exclude_where": {
45
+
"type": "array",
46
+
"items": {
47
+
"type": "string"
48
+
},
49
+
"description": "Exclude sequences matching these conditions. Ex: \"host=rat\" or\n\"host!=rat\". Multiple values are processed as OR (matching any of those\nspecified will be excluded), not AND."
50
+
},
51
+
"include": {
52
+
"oneOf": [
53
+
{
54
+
"type": "string"
55
+
},
56
+
{
57
+
"type": "array",
58
+
"items": {
59
+
"type": "string"
60
+
}
61
+
}
62
+
],
63
+
"description": "File(s) with list of strains to include regardless of priorities,\nsubsampling, or absence of an entry in sequences. Paths must be relative\nto the working directory."
64
+
},
65
+
"include_where": {
66
+
"type": "array",
67
+
"items": {
68
+
"type": "string"
69
+
},
70
+
"description": "Include sequences with these values. Ex: \"host=rat\". Multiple values are\nprocessed as OR (having any of those specified will be included), not\nAND. This rule is applied last and ensures any strains matching these\nrules will be included regardless of priorities, subsampling, or absence\nof an entry in sequences."
71
+
},
72
+
"min_date": {
73
+
"type": "string",
74
+
"description": "Minimal cutoff for date (inclusive). Supported formats:\n\n1. an Augur-style numeric date with the year as the integer part (e.g.\n 2020.42) or\n2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or\n3. a backwards-looking relative date in ISO 8601 duration format with\n optional P prefix (e.g. '1W', 'P1W')"
75
+
},
76
+
"max_date": {
77
+
"type": "string",
78
+
"description": "Maximal cutoff for date (inclusive). Supported formats:\n\n1. an Augur-style numeric date with the year as the integer part (e.g.\n 2020.42) or\n2. a date in ISO 8601 date format (i.e. YYYY-MM-DD) (e.g. '2020-06-04') or\n3. a backwards-looking relative date in ISO 8601 duration format with\n optional P prefix (e.g. '1W', 'P1W')"
79
+
},
80
+
"min_length": {
81
+
"type": "integer",
82
+
"description": "Minimum length of the sequences, only counting standard nucleotide\ncharacters A, C, G, or T (case-insensitive)."
83
+
},
84
+
"max_length": {
85
+
"type": "integer",
86
+
"description": "Maximum length of the sequences, only counting standard nucleotide\ncharacters A, C, G, or T (case-insensitive)."
87
+
},
88
+
"non_nucleotide": {
89
+
"type": "boolean",
90
+
"description": "Exclude sequences that contain illegal characters."
"description": "Use alongside query to specify columns and data types in the format\n'column:type', where type is one of\n(bool,float,int,str). Automatic type inference will be\nattempted on all unspecified columns used in the query. Example:\nregion:str coverage:float."
102
+
},
103
+
"group_by": {
104
+
"type": "array",
105
+
"items": {
106
+
"type": "string"
107
+
},
108
+
"description": "Grouping columns for subsampling. Notes:\n\n(1) Grouping by ['month', 'week', 'year'] is only\n supported when there is a 'date' column in the\n metadata.\n(2) 'week' uses the ISO week numbering system, where a week starts on a\n Monday and ends on a Sunday.\n(3) 'month' and 'week' grouping cannot be used together.\n(4) Custom columns ['month', 'week', 'year'] in the\n metadata are ignored for grouping. Please rename them if you want to\n use their values for grouping."
109
+
},
110
+
"group_by_weights": {
111
+
"type": "string",
112
+
"description": "TSV file defining weights for grouping. Path must be relative to the\nworking directory. Requirements:\n\n(1) Lines starting with '#' are treated as comment lines.\n(2) The first non-comment line must be a header row.\n(3) There must be a numeric ``weight`` column (weights can take on any\n non-negative values).\n(4) Other columns must be a subset of grouping columns, with\n combinations of values covering all combinations present in the\n metadata.\n(5) This option only applies when grouping columns and a total sample\n size are provided.\n(6) This option can only be used when probabilistic sampling is allowed.\n\nNotes:\n\n(1) Any grouping columns absent from this file will be given equal\n weighting across all values *within* groups defined by the other\n weighted columns.\n(2) An entry with the value ``default`` under all columns will be\n treated as the default weight for specific groups present in the\n metadata but missing from the weights file. If there is no default\n weight and the metadata contains rows that are not covered by the\n given weights, augur filter will exit with an error."
113
+
},
114
+
"probabilistic_sampling": {
115
+
"type": "boolean",
116
+
"description": "Allow probabilistic sampling during subsampling. This is useful when\nthere are more groups than requested sequences. This option only applies\nwhen a total sample size is provided."
117
+
},
118
+
"sequences_per_group": {
119
+
"type": "integer",
120
+
"description": "Select no more than this number of sequences per category."
121
+
},
122
+
"max_sequences": {
123
+
"type": "integer",
124
+
"description": "Select no more than this number of sequences (i.e. total sample\nsize). Can be used without grouping columns."
0 commit comments