Merge pull request #703 from nextstrain/epiweeks

huddlej · web-flow · commit 92202ab92162 · 2021-08-19T15:04:13.000-07:00
Annotate epiweek as color and filter option
diff --git a/defaults/auspice_config.json b/defaults/auspice_config.json
@@ -112,6 +112,11 @@
       "key": "region_exposure",
       "title": "Region of exposure",
       "type": "categorical"
+    },
+    {
+      "key": "epiweek",
+      "title": "Epiweek (CDC)",
+      "type": "continuous"
     }
   ],
   "geo_resolutions": [
@@ -136,7 +141,7 @@
     "division",
     "location",
     "host",
-    "author"
+    "epiweek"
   ],
   "panels": [
     "tree",
diff --git a/docs/change_log.md b/docs/change_log.md
@@ -3,6 +3,20 @@
 As of April 2021, we use major version numbers (e.g. v2) to reflect backward incompatible changes to the workflow that likely require you to update your Nextstrain installation.
 We also use this change log to document new features that maintain backward compatibility, indicating these features by the date they were added.
 
+## v8 (19 Aug 2021)
+
+### Major changes
+
+- Annotate CDC-style epiweeks (e.g., "202019") as a color-by and filter option in Auspice JSONs ([#703](https://github.com/nextstrain/ncov/pull/703)). This functionality requires [the Python epiweeks package](https://pypi.org/project/epiweeks/). How you update your software environment to include this package depends on how you run your builds:
+  - If you use the Nextstrain CLI with Docker, update the Docker image with `nextstrain update` and then run your builds as usual with `nextstrain build`.
+  - If you use the Nextstrain CLI without Docker, run your builds with `nextstrain build . --use-conda <...other options...>`.
+  - If you use Snakemake, run your builds with `snakemake --use-conda <...other options...>`.
+  - If you manage your own Conda environment, install epiweeks manually in the environment with `conda install -c bioconda epiweeks`.
+
+### Features
+
+- Update Conda environment to use [Augur 13.0.0](https://github.com/nextstrain/augur/blob/master/CHANGES.md#1300-17-august-2021) for an improved filtering experience ([#703](https://github.com/nextstrain/ncov/pull/703)).
+
 ## New features since last version update
 
  - 11 August 2021: Add support for "Sequences" and "Patient status metadata" downloads from GISAID's search interface including [documentation in the tutorial of how to use these data](https://docs.nextstrain.org/en/latest/tutorials/SARS-CoV-2/steps/data-prep.html#curate-data-from-gisaid-search-and-downloads). ([#701](https://github.com/nextstrain/ncov/pull/701))
diff --git a/scripts/calculate_epiweek.py b/scripts/calculate_epiweek.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+import argparse
+from augur.utils import write_json
+import epiweeks
+import pandas as pd
+
+
+if __name__ == '__main__':
+    # Command-line entry point: read the metadata table, derive a CDC epiweek
+    # for every record with a complete date, and write the result as a node
+    # data JSON keyed by strain name.
+    parser = argparse.ArgumentParser(
+        # Use `description` (not `usage`) so argparse still generates its own
+        # usage line listing the available options.
+        description="Calculate epiweeks for dates in the given metadata",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument("--metadata", required=True, help="metadata with a 'date' column")
+    parser.add_argument("--attribute-name", default="epiweek", help="name to store annotations of epiweeks in JSON output")
+    parser.add_argument("--output-node-data", required=True, help="node data JSON with epiweek annotations")
+
+    args = parser.parse_args()
+
+    # Read metadata with pandas because Augur's read_metadata utility does not
+    # support metadata without a "strain" or "name" field.
+    metadata = pd.read_csv(
+        args.metadata,
+        sep=None,
+        engine="python",
+        skipinitialspace=True,
+        dtype={
+            "strain": "string",
+            "name": "string",
+        }
+    ).fillna("")
+
+    # Find records with complete, unambiguous dates (YYYY-MM-DD). Masked dates
+    # ("2020-03-XX"), partial dates ("2020" or "2020-03"), and missing dates
+    # cannot be assigned to a single epiweek, so they are skipped instead of
+    # being silently parsed as the first day of the year or month.
+    has_complete_date = metadata["date"].astype(str).str.match(r"^\d{4}-\d{2}-\d{2}$")
+    metadata_with_dates = metadata.loc[has_complete_date, ["strain", "date"]].copy()
+
+    # Convert date strings to timestamps. Strings that look complete but are
+    # not real calendar dates (e.g., "2020-13-45") become NaT and are dropped.
+    metadata_with_dates["date"] = pd.to_datetime(
+        metadata_with_dates["date"],
+        format="%Y-%m-%d",
+        errors="coerce",
+    )
+    metadata_with_dates = metadata_with_dates.dropna(subset=["date"])
+
+    # Calculate epiweeks from date objects as a new annotation.
+    metadata_with_dates["epiweek"] = metadata_with_dates["date"].apply(lambda date: epiweeks.Week.fromdate(date).cdcformat())
+
+    # Create a node data object with epiweeks, one entry per strain.
+    node_data = {
+        strain: {args.attribute_name: epiweek}
+        for strain, epiweek in zip(metadata_with_dates["strain"], metadata_with_dates["epiweek"])
+    }
+
+    # Save node data.
+    write_json({"nodes": node_data}, args.output_node_data)
diff --git a/scripts/fix-colorings.py b/scripts/fix-colorings.py
@@ -1,5 +1,65 @@
 import argparse
 import json
+import re
+from numpy import linspace
+from math import floor
+
+def adjust_coloring_for_epiweeks(dataset):
+    """
+    If an auspice JSON specifies a colouring with the key "epiweek" (case sensitive) then we create a categorical
+    colorscale which evenly spaces the canonical nextstrain rainbow across the observed time window.
+
+    The dataset is modified in place: duplicate "epiweek" colorings are removed, and the remaining one receives a
+    `scale` (one `[epiweek, hex colour]` pair per observed epiweek, in chronological order) and has its `type`
+    forced to "categorical". If no node carries an epiweek value, or any value is invalid, a message is printed
+    and the colour scale is not created. Always returns None.
+
+    NOTE: epiweek must be in CDC format ("YYYYWW", e.g. "202019") but this may be relaxed to include ISO format in the future.
+    """
+    EPIKEY="epiweek"
+    matches = [(i, c) for i, c in enumerate(dataset['meta'].get("colorings", [])) if c['key']==EPIKEY]
+    if not matches: # coloring doesn't define an epiweek
+        return
+    (cidx, coloring) = matches[0]
+
+    # remove any duplicate coloring entries in the JSON to ensure the entry we edit is the one used by Auspice
+    # (NOTE: this is augur bug https://github.com/nextstrain/augur/issues/719)
+    dataset['meta']['colorings'] = [c for i,c in enumerate(dataset['meta']['colorings']) if not (c['key']==EPIKEY and i!=cidx)]
+
+    # Collect the distinct raw epiweek values in pre-order. An explicit stack is used instead of recursion so
+    # that very deep (ladder-like) trees cannot exceed Python's recursion limit.
+    raw_values = {}  # dict used as an insertion-ordered set
+    stack = [dataset["tree"]]
+    while stack:
+        node = stack.pop()
+        value = node.get("node_attrs", {}).get(EPIKEY, {}).get("value", False)
+        if value:
+            raw_values[value] = None
+        stack.extend(reversed(node.get("children", [])))
+
+    # Without any observed value there is no time window to spread the colours across.
+    if not raw_values:
+        print("No epiweek values were found in the tree.")
+        print("Skipping color scale creation for epiweek.")
+        return
+
+    # delay import to support older setups not using epiweeks package
+    from epiweeks import Week
+
+    observed_values = set()
+    try:
+        for value in raw_values:
+            # we validate using both the epiweeks package and a regex (epiweeks will perform coercion of non-valid data into valid data)
+            if not isinstance(value, str) or not re.fullmatch(r'(\d{4})(\d{2})', value):
+                raise ValueError(f"Epiweek value {value} was not in format YYYYWW.")
+            observed_values.add(Week.fromstring(value, system="cdc")) # raises ValueError if not valid
+    except ValueError as e:
+        print(str(e))
+        print("Skipping color scale creation for epiweek.")
+        return
+
+    ## generate epiweeks across the entire observed range for color generation
+    last_week = max(observed_values)
+    week_range = [ min(observed_values) ]
+    while week_range[-1] < last_week:
+        week_range.append(week_range[-1]+1)
+    ## generate rainbow colour scale across epiweeks.
+    ## Since a "default" augur install does not include matplotlib, rather than interpolating between values in the scale
+    ## we reuse them. This only applies when n(epiweeks)>30, where distinguishing between colors is problematic anyway.
+    rainbow = ["#511EA8", "#482BB6", "#4039C3", "#3F4ACA", "#3E5CD0", "#416CCE", "#447CCD", "#4989C4", "#4E96BC", "#559FB0", "#5DA8A4", "#66AE96", "#6FB388", "#7AB77C", "#85BA6F", "#91BC64", "#9DBE5A", "#AABD53", "#B6BD4B", "#C2BA46", "#CDB642", "#D6B03F", "#DDA83C", "#E29D39", "#E69036", "#E67F33", "#E56D30", "#E2592C", "#DF4428", "#DC2F24"]
+    color_indices = [floor(x) for x in linspace(0, len(rainbow), endpoint=False, num=len(week_range))]
+    # Colours are assigned over the full week range (so gaps in sampling keep their spacing in the rainbow), but
+    # only weeks that were actually observed are listed in the scale.
+    coloring['scale'] = [
+        [week.cdcformat(), rainbow[color_index]]
+        for week, color_index in zip(week_range, color_indices)
+        if week in observed_values
+    ]
+    ## auspice will order the legend according to the provided color scale, so there is no need to set
+    ## `coloring['legend']` unless we want to restrict this for some reason.
+    coloring['type'] = 'categorical' # force the scale type to be categorical
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
@@ -23,5 +83,7 @@
 
     input_json["meta"]["colorings"] = fixed_colorings
 
+    adjust_coloring_for_epiweeks(input_json)
+
     with open(args.output, 'w') as f:
         json.dump(input_json, f, indent=2)
diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml
@@ -4,7 +4,8 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - augur=12.0.0
+  - augur=13.0.0
+  - epiweeks=2.1.2
   - iqtree=2.1.2
   - mafft=7.475
   - nextalign=0.2.0
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
@@ -1124,6 +1124,24 @@ rule logistic_growth:
             --output {output.node_data} 2>&1 | tee {log}
         """
 
+# Annotate strains with epiweeks calculated from the "date" column of the
+# adjusted metadata, producing a node data JSON for export.
+rule calculate_epiweeks:
+    input:
+        metadata="results/{build_name}/metadata_adjusted.tsv.xz",
+    output:
+        node_data="results/{build_name}/epiweeks.json",
+    benchmark:
+        "benchmarks/calculate_epiweeks_{build_name}.txt",
+    conda:
+        config["conda_environment"],
+    log:
+        "logs/calculate_epiweeks_{build_name}.txt",
+    shell:
+        # Send the script's output to the declared log file, as other rules in this workflow do.
+        """
+        python3 scripts/calculate_epiweek.py \
+            --metadata {input.metadata} \
+            --output-node-data {output.node_data} 2>&1 | tee {log}
+        """
+
 def export_title(wildcards):
     # TODO: maybe we could replace this with a config entry for full/human-readable build name?
     location_name = wildcards.build_name
@@ -1158,7 +1176,8 @@ def _get_node_data_by_wildcards(wildcards):
         rules.traits.output.node_data,
         rules.logistic_growth.output.node_data,
         rules.aa_muts_explicit.output.node_data,
-        rules.distances.output.node_data
+        rules.distances.output.node_data,
+        rules.calculate_epiweeks.output.node_data
     ]
 
     if "run_pangolin" in config and config["run_pangolin"]: