Skip to content

Commit 92202ab

Browse files
authored
Merge pull request #703 from nextstrain/epiweeks
Annotate epiweek as color and filter option
2 parents ec0575c + d0593f2 commit 92202ab

File tree

6 files changed

+153
-3
lines changed

6 files changed

+153
-3
lines changed

defaults/auspice_config.json

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,11 @@
112112
"key": "region_exposure",
113113
"title": "Region of exposure",
114114
"type": "categorical"
115+
},
116+
{
117+
"key": "epiweek",
118+
"title": "Epiweek (CDC)",
119+
"type": "continuous"
115120
}
116121
],
117122
"geo_resolutions": [
@@ -136,7 +141,7 @@
136141
"division",
137142
"location",
138143
"host",
139-
"author"
144+
"epiweek"
140145
],
141146
"panels": [
142147
"tree",

docs/change_log.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,20 @@
33
As of April 2021, we use major version numbers (e.g. v2) to reflect backward incompatible changes to the workflow that likely require you to update your Nextstrain installation.
44
We also use this change log to document new features that maintain backward compatibility, indicating these features by the date they were added.
55

6+
## v8 (19 Aug 2021)
7+
8+
### Major changes
9+
10+
- Annotate CDC-style epiweeks (e.g., "202019") as a color-by and filter option in Auspice JSONs ([#703](https://github.com/nextstrain/ncov/pull/703)). This functionality requires [the Python epiweeks package](https://pypi.org/project/epiweeks/). You will need to update your software environment to include this package, depending on how you run your builds.
11+
- If you use the Nextstrain CLI with Docker, update the Docker image with `nextstrain update` and then run your builds as usual with `nextstrain build`.
12+
- If you use the Nextstrain CLI without Docker, run your builds with `nextstrain build . --use-conda <...other options...>`.
13+
- If you use Snakemake, run your builds with `snakemake --use-conda <...other options...>`.
14+
- If you manage your own Conda environment, install epiweeks manually in the environment with `conda install -c bioconda epiweeks`.
15+
16+
### Features
17+
18+
- Update Conda environment to use [Augur 13.0.0](https://github.com/nextstrain/augur/blob/master/CHANGES.md#1300-17-august-2021) for an improved filtering experience ([#703](https://github.com/nextstrain/ncov/pull/703)).
19+
620
## New features since last version update
721

822
- 11 August 2021: Add support for "Sequences" and "Patient status metadata" downloads from GISAID's search interface including [documentation in the tutorial of how to use these data](https://docs.nextstrain.org/en/latest/tutorials/SARS-CoV-2/steps/data-prep.html#curate-data-from-gisaid-search-and-downloads). ([#701](https://github.com/nextstrain/ncov/pull/701))

scripts/calculate_epiweek.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#!/usr/bin/env python3
2+
import argparse
3+
from augur.utils import write_json
4+
import epiweeks
5+
import pandas as pd
6+
7+
8+
if __name__ == '__main__':
9+
parser = argparse.ArgumentParser(
10+
usage="Calculate epiweeks for dates in the given metadata",
11+
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
12+
)
13+
parser.add_argument("--metadata", required=True, help="metadata with a 'date' column")
14+
parser.add_argument("--attribute-name", default="epiweek", help="name to store annotations of epiweeks in JSON output")
15+
parser.add_argument("--output-node-data", required=True, help="node data JSON with epiweek annotations")
16+
17+
args = parser.parse_args()
18+
19+
# Read metadata with pandas because Augur's read_metadata utility does not
20+
# support metadata without a "strain" or "name" field.
21+
metadata = pd.read_csv(
22+
args.metadata,
23+
sep=None,
24+
engine="python",
25+
skipinitialspace=True,
26+
dtype={
27+
"strain": "string",
28+
"name": "string",
29+
}
30+
).fillna("")
31+
32+
# Find records with unambiguous dates.
33+
metadata_with_dates = metadata.loc[~metadata["date"].str.contains("X"), ["strain", "date"]].copy()
34+
35+
# Convert date strings to timestamps.
36+
metadata_with_dates["date"] = pd.to_datetime(metadata_with_dates["date"])
37+
38+
# Calculate epiweeks from date objects as a new annotation.
39+
metadata_with_dates["epiweek"] = metadata_with_dates["date"].apply(lambda date: epiweeks.Week.fromdate(date).cdcformat())
40+
41+
# Create a node data object with epiweeks.
42+
node_data = {}
43+
for record in metadata_with_dates.to_dict(orient="records"):
44+
node_data[record["strain"]] = {
45+
args.attribute_name: record["epiweek"],
46+
}
47+
48+
# Save node data.
49+
write_json({"nodes": node_data}, args.output_node_data)

scripts/fix-colorings.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,65 @@
11
import argparse
22
import json
3+
import re
4+
from numpy import linspace
5+
from math import floor
6+
7+
def adjust_coloring_for_epiweeks(dataset):
8+
"""
9+
If an auspice JSON specifies a colouring with the key "epiweek" (case sensitive) then we create a categorical
10+
colorscale which evenly spaces the canonical nextstrain rainbow across the observed time window.
11+
12+
NOTE: epiweek must be in CDC format ("YYYYMM") but this may be relaxed to include ISO format in the future.
13+
"""
14+
EPIKEY="epiweek"
15+
try:
16+
(cidx, coloring) = [(i, c) for i, c in enumerate(dataset['meta'].get("colorings", [])) if c['key']==EPIKEY][0]
17+
except IndexError: # coloring doesn't define an epiweek
18+
return
19+
20+
# remove any duplicate coloring entries in the JSON to ensure the entry we edit is the one used by Auspice
21+
# (NOTE: this is augur bug https://github.com/nextstrain/augur/issues/719)
22+
dataset['meta']['colorings'] = [c for i,c in enumerate(dataset['meta']['colorings']) if not (c['key']==EPIKEY and i!=cidx)]
23+
24+
# delay import to support older setups not using epiweeks package
25+
from epiweeks import Year, Week
26+
27+
observed_values = set()
28+
def recurse(node):
29+
value = node.get("node_attrs", {}).get(EPIKEY, {}).get("value", False)
30+
if value:
31+
# we validate using both the epiweeks package and a regex (epiweeks will perform coercion of non-valid data into valid data)
32+
if not re.match(r'^(\d{4})(\d{2})$', value):
33+
raise(ValueError(f"Epiweek value {value} was not in format YYYYMM."))
34+
week = Week.fromstring(value, system="cdc") # raises ValueError if not valid
35+
observed_values.add(week)
36+
for child in node.get("children", []):
37+
recurse(child)
38+
try:
39+
recurse(dataset["tree"])
40+
except ValueError as e:
41+
print(str(e))
42+
print("Skipping color scale creation for epiweek.")
43+
return
44+
observed_values = sorted(list(observed_values))
45+
46+
## generate epiweeks across the entire observed range for color generation
47+
epiweeks = [ observed_values[0] ]
48+
while epiweeks[-1] < observed_values[-1]:
49+
epiweeks.append(epiweeks[-1]+1)
50+
## generate rainbow colour scale across epiweeks.
51+
## Since a "default" augur install does not include matplotlib, rather than interpolating between values in the scale
52+
## we reuse them. This only applies when n(epiweeks)>30, where distinguising between colors is problematic anyway.
53+
rainbow = ["#511EA8", "#482BB6", "#4039C3", "#3F4ACA", "#3E5CD0", "#416CCE", "#447CCD", "#4989C4", "#4E96BC", "#559FB0", "#5DA8A4", "#66AE96", "#6FB388", "#7AB77C", "#85BA6F", "#91BC64", "#9DBE5A", "#AABD53", "#B6BD4B", "#C2BA46", "#CDB642", "#D6B03F", "#DDA83C", "#E29D39", "#E69036", "#E67F33", "#E56D30", "#E2592C", "#DF4428", "#DC2F24"]
54+
color_indicies = [floor(x) for x in linspace(0, len(rainbow), endpoint=False, num=len(epiweeks))]
55+
coloring['scale'] = [
56+
[epiweek.cdcformat(), rainbow[color_indicies[i]]]
57+
for i,epiweek in enumerate(epiweeks)
58+
if epiweek in observed_values
59+
]
60+
## auspice will order the legend according to the provided color scale, so there is no need to set
61+
## `coloring['legend']` unless we want to restrict this for some reason.
62+
coloring['type'] = 'categorical' # force the scale type to be categorical
363

464
if __name__ == '__main__':
565
parser = argparse.ArgumentParser(
@@ -23,5 +83,7 @@
2383

2484
input_json["meta"]["colorings"] = fixed_colorings
2585

86+
adjust_coloring_for_epiweeks(input_json)
87+
2688
with open(args.output, 'w') as f:
2789
json.dump(input_json, f, indent=2)

workflow/envs/nextstrain.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@ channels:
44
- bioconda
55
- defaults
66
dependencies:
7-
- augur=12.0.0
7+
- augur=13.0.0
8+
- epiweeks=2.1.2
89
- iqtree=2.1.2
910
- mafft=7.475
1011
- nextalign=0.2.0

workflow/snakemake_rules/main_workflow.smk

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1124,6 +1124,24 @@ rule logistic_growth:
11241124
--output {output.node_data} 2>&1 | tee {log}
11251125
"""
11261126

1127+
rule calculate_epiweeks:
1128+
input:
1129+
metadata="results/{build_name}/metadata_adjusted.tsv.xz",
1130+
output:
1131+
node_data="results/{build_name}/epiweeks.json",
1132+
benchmark:
1133+
"benchmarks/calculate_epiweeks_{build_name}.txt",
1134+
conda:
1135+
config["conda_environment"],
1136+
log:
1137+
"logs/calculate_epiweeks_{build_name}.txt",
1138+
shell:
1139+
"""
1140+
python3 scripts/calculate_epiweek.py \
1141+
--metadata {input.metadata} \
1142+
--output-node-data {output.node_data}
1143+
"""
1144+
11271145
def export_title(wildcards):
11281146
# TODO: maybe we could replace this with a config entry for full/human-readable build name?
11291147
location_name = wildcards.build_name
@@ -1158,7 +1176,8 @@ def _get_node_data_by_wildcards(wildcards):
11581176
rules.traits.output.node_data,
11591177
rules.logistic_growth.output.node_data,
11601178
rules.aa_muts_explicit.output.node_data,
1161-
rules.distances.output.node_data
1179+
rules.distances.output.node_data,
1180+
rules.calculate_epiweeks.output.node_data
11621181
]
11631182

11641183
if "run_pangolin" in config and config["run_pangolin"]:

0 commit comments

Comments
 (0)