Merge pull request #1160 from nextstrain/nextclade-v3

joverlee521 · web-flow · commit 80c06c15f066 · 2024-10-30T09:21:42.000-07:00
Migrate to Nextclade v3
diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml
@@ -71,7 +71,7 @@ files:
   mutational_fitness_distance_map: "defaults/mutational_fitness_distance_map.json"
   sites_to_mask: "defaults/sites_ignored_for_tree_topology.txt"
 
-# Define genes to translate during alignment by nextalign.
+# Define genes to translate during alignment by nextclade.
 genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"]
 
 # Filter settings
diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md
@@ -5,6 +5,10 @@ We also use this change log to document new features that maintain backward comp
 
 ## New features since last version update
 
+## v14 (23 October 2024)
+
+- 23 October 2024: Update workflow to use Nextclade v3. This includes the removal of unused mutation summary script and rules that expected Nextclade v2 outputs. Dropping the mutation summary rules removed the need for the full alignment rule `align` to produce the insertions and translations outputs, so they have been removed. The `build_align` rule no longer produces a separate `insertions.tsv` since insertions are now included in the `nextclade_qc.tsv`. [PR 1160](https://github.com/nextstrain/ncov/pull/1160)
+
 - 2 October 2024: Include a new parameter for `clade_recency` under `colors`. This parameter is used to define which clades should receive a color from the standard rainbow palette. A value of `6M` will cause clades with strains in the tree sampled within the last 6 months to be colored and earlier strains to not receive a color (and be colored in a palette of grays by Auspice). This `clade_recency` parameter is used in `builds.yaml` in `nextstrain_profiles` to color clades according for the `1m`, `2m`, `6m` and `all-time` timepoints. If `clade_recency` is not supplied then all clades will be colored. [PR 1132](https://github.com/nextstrain/ncov/pull/1132)
 
 - 30 September 2024: Use population-based weighted sampling for `nextstrain_profiles`. This requires a minimum Augur version of 25.3.0. PRs [1106](https://github.com/nextstrain/ncov/pull/1106), [1150](https://github.com/nextstrain/ncov/pull/1150), [1151](https://github.com/nextstrain/ncov/pull/1151)
diff --git a/docs/src/reference/workflow-config-file.rst b/docs/src/reference/workflow-config-file.rst
@@ -431,7 +431,7 @@ genes
 -----
 
 -  type: array
--  description: A list of genes for which ``nextalign`` should generate amino acid sequences during the alignment process. Gene names must match the names provided in the gene map from the ``annotation`` parameter.
+-  description: A list of genes for which ``nextclade`` should generate amino acid sequences during the alignment process. Gene names must match the names provided in the gene map from the ``annotation`` parameter.
 -  default: ``["ORF1a", "ORF1b", "S", "ORF3a", "M", "N"]``
 -  used in rules: ``align``, ``build_align``, ``translate``, ``mutational_fitness``
 
@@ -513,17 +513,17 @@ alignment_reference
 ~~~~~~~~~~~~~~~~~~~
 
 -  type: string
--  description: Path to a FASTA-formatted sequence to use for alignment with ``nextalign``
+-  description: Path to a FASTA-formatted sequence to use for alignment with ``nextclade``
 -  default: ``defaults/reference_seq.fasta``
--  used in rules: ``align``, ``proximity_score`` (subsampling), ``build_align``, ``build_mutation_summary``
+-  used in rules: ``align``, ``proximity_score`` (subsampling)
 
 annotation
 ~~~~~~~~~~
 
 -  type: string
--  description: Path to a GFF-formatted annotation of gene coordinates (e.g., a “gene map”) for use by ``nextalign`` and mutation summaries.
+-  description: Path to a GFF-formatted annotation of gene coordinates (e.g., a “gene map”) for use by ``nextclade`` for codon-aware alignment.
 -  default: ``defaults/annotation.gff``
--  used in rules: ``align``, ``build_align``, ``build_mutation_summary``
+-  used in rules: ``align``
 
 outgroup
 ~~~~~~~~
diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
@@ -18,7 +18,6 @@ slack_token: ~
 slack_channel: "#ncov-gisaid-updates"
 
 genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"]
-use_nextalign: true
 include_hcov19_prefix: True
 
 files:
diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -17,7 +17,6 @@ slack_token: ~
 slack_channel: "#ncov-gisaid-updates"
 
 genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"]
-use_nextalign: true
 include_hcov19_prefix: True
 
 files:
diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml
@@ -17,7 +17,6 @@ slack_token: ~
 slack_channel: "#ncov-genbank-updates"
 
 genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"]
-use_nextalign: true
 include_hcov19_prefix: False
 
 files:
diff --git a/scripts/mutation_summary.py b/scripts/mutation_summary.py
diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml
@@ -7,8 +7,7 @@ dependencies:
   - augur=22.4.0
   - epiweeks=2.1.2
   - iqtree=2.2.0.3
-  - nextalign=2.14.0
-  - nextclade=2.14.0
+  - nextclade=3.9.0
   - pangolin=3.1.20
   - pangolearn=2022.01.20
   - python>=3.8*
diff --git a/workflow/snakemake_rules/export_for_nextstrain.smk b/workflow/snakemake_rules/export_for_nextstrain.smk
@@ -93,38 +93,6 @@ rule export_all_regions:
         """
 
 
-rule mutation_summary:
-    message: "Summarizing {input.alignment}"
-    input:
-        alignment = rules.align.output.alignment,
-        insertions = rules.align.output.insertions,
-        translations = rules.align.output.translations,
-        reference = config["files"]["alignment_reference"],
-        genemap = config["files"]["annotation"]
-    output:
-        mutation_summary = "results/mutation_summary_{origin}.tsv.xz"
-    log:
-        "logs/mutation_summary_{origin}.txt"
-    benchmark:
-        "benchmarks/mutation_summary_{origin}.txt"
-    params:
-        outdir = "results/translations",
-        basename = "seqs_{origin}",
-        genes=config["genes"],
-    conda: config["conda_environment"]
-    shell:
-        r"""
-        python3 scripts/mutation_summary.py \
-            --alignment {input.alignment} \
-            --insertions {input.insertions} \
-            --directory {params.outdir} \
-            --basename {params.basename} \
-            --reference {input.reference} \
-            --genes {params.genes:q} \
-            --genemap {input.genemap} \
-            --output {output.mutation_summary} 2>&1 | tee {log}
-        """
-
 #
 # Rule for generating a per-build auspice config
 #
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
@@ -65,15 +65,13 @@ rule align:
         """
     input:
         sequences = lambda wildcards: _get_path_for_input("sequences", wildcards.origin),
-        genemap = config["files"]["annotation"],
+        # Annotation used for codon-aware nucleotide alignment
+        # <https://docs.nextstrain.org/projects/nextclade/en/stable/user/algorithm/01-sequence-alignment.html>
+        annotation = config["files"]["annotation"],
         reference = config["files"]["alignment_reference"]
     output:
         alignment = "results/aligned_{origin}.fasta.xz",
-        insertions = "results/insertions_{origin}.tsv",
-        translations = expand("results/translations/seqs_{{origin}}.gene.{gene}.fasta.xz", gene=config.get('genes', ['S']))
     params:
-        output_translations = lambda w: f"results/translations/seqs_{w.origin}.gene.{{gene}}.fasta",
-        output_translations_toxz = "results/translations/seqs_{origin}.gene.*.fasta",
         strain_prefixes=config["strip_strain_prefixes"],
         # Strip the compression suffix for the intermediate output from the aligner.
         uncompressed_alignment=lambda wildcards, output: Path(output.alignment).with_suffix(""),
@@ -92,15 +90,12 @@ rule align:
             --sequences {input.sequences} \
             --strip-prefixes {params.strain_prefixes:q} \
             --output /dev/stdout 2> {params.sanitize_log} \
-            | nextalign run \
+            | nextclade run \
             --jobs={threads} \
-            --reference {input.reference} \
-            --genemap {input.genemap} \
-            --output-translations {params.output_translations} \
-            --output-fasta {params.uncompressed_alignment} \
-            --output-insertions {output.insertions} > {log} 2>&1;
+            --input-ref {input.reference} \
+            --input-annotation {input.annotation} \
+            --output-fasta {params.uncompressed_alignment} > {log} 2>&1;
         xz -2 -T {threads} {params.uncompressed_alignment};
-        xz -2 -T {threads} {params.output_translations_toxz}
         """
 
 def _get_subsampling_settings(wildcards):
@@ -466,8 +461,8 @@ rule prepare_nextclade:
     conda: config["conda_environment"]
     shell:
         r"""
-        nextclade2 --version
-        nextclade2 dataset get --name {params.name} --output-zip {output.nextclade_dataset}
+        nextclade --version
+        nextclade dataset get --name {params.name} --output-zip {output.nextclade_dataset}
         """
 
 rule build_align:
@@ -481,14 +476,13 @@ rule build_align:
         nextclade_dataset = "data/sars-cov-2-nextclade-defaults.zip",
     output:
         alignment = "results/{build_name}/aligned.fasta",
-        insertions = "results/{build_name}/insertions.tsv",
         nextclade_qc = 'results/{build_name}/nextclade_qc.tsv',
         translations = expand("results/{{build_name}}/translations/aligned.gene.{gene}.fasta", gene=config.get('genes', ['S']))
     params:
         outdir = "results/{build_name}/translations",
         strain_prefixes=config["strip_strain_prefixes"],
         sanitize_log="logs/sanitize_sequences_before_nextclade_{build_name}.txt",
-        output_translations = lambda w: f"results/{w.build_name}/translations/aligned.gene.{{gene}}.fasta"
+        output_translations = lambda w: f"results/{w.build_name}/translations/aligned.gene.{{cds}}.fasta"
     log:
         "logs/align_{build_name}.txt"
     benchmark:
@@ -503,13 +497,12 @@ rule build_align:
             --sequences {input.sequences} \
             --strip-prefixes {params.strain_prefixes:q} \
             --output /dev/stdout 2> {params.sanitize_log} \
-            | nextclade2 run \
+            | nextclade run \
             --jobs {threads} \
             --input-dataset {input.nextclade_dataset} \
             --output-tsv {output.nextclade_qc} \
             --output-fasta {output.alignment} \
-            --output-translations {params.output_translations} \
-            --output-insertions {output.insertions} 2>&1 | tee {log}
+            --output-translations {params.output_translations} 2>&1 | tee {log}
         """
 
 rule join_metadata_and_nextclade_qc:
@@ -936,34 +929,6 @@ rule translate:
             --output {output.node_data} 2>&1 | tee {log}
         """
 
-rule build_mutation_summary:
-    message: "Summarizing {input.alignment}"
-    input:
-        alignment = rules.build_align.output.alignment,
-        insertions = rules.build_align.output.insertions,
-        translations = rules.build_align.output.translations,
-        reference = config["files"]["alignment_reference"],
-        genemap = config["files"]["annotation"]
-    output:
-        mutation_summary = "results/{build_name}/mutation_summary.tsv"
-    log:
-        "logs/mutation_summary_{build_name}.txt"
-    params:
-        outdir = "results/{build_name}/translations",
-        basename = "aligned"
-    conda: config["conda_environment"]
-    shell:
-        r"""
-        python3 scripts/mutation_summary.py \
-            --alignment {input.alignment} \
-            --insertions {input.insertions} \
-            --directory {params.outdir} \
-            --basename {params.basename} \
-            --reference {input.reference} \
-            --genemap {input.genemap} \
-            --output {output.mutation_summary} 2>&1 | tee {log}
-        """
-
 rule distances:
     input:
         tree = rules.refine.output.tree,