Skip to content

Commit c5699a6

Browse files
authored
Merge pull request #842 from nextstrain/nextclade-align
Nextclade align
2 parents 59da03e + 46fa71e commit c5699a6

File tree

6 files changed

+204
-13
lines changed

6 files changed

+204
-13
lines changed

defaults/parameters.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,7 @@ sanitize_metadata:
3737
- "Lineage=pango_lineage"
3838
- "Pangolin version=pangolin_version"
3939
- "Variant=variant"
40-
- "AA Substitutions=aa_substitutions"
41-
- "aaSubstitutions=aa_substitutions"
40+
- "AA Substitutions=aaSubstitutions"
4241
- "Submission date=date_submitted"
4342
- "Is reference?=is_reference"
4443
- "Is complete?=is_complete"

docs/src/reference/change_log.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
As of April 2021, we use major version numbers (e.g. v2) to reflect backward incompatible changes to the workflow that likely require you to update your Nextstrain installation.
44
We also use this change log to document new features that maintain backward compatibility, indicating these features by the date they were added.
55

6+
## v11 (3 February 2022)
7+
8+
- Run Nextclade QC and filtering on the final sample set before building a tree. Nextclade also runs `nextalign` under the hood. Importantly, this enables filtering the final sample set to omit strains with many reversions and/or possible contaminants, significantly improving the quality of Omicron trees. [See the original pull request for more details](https://github.com/nextstrain/ncov/pull/842). To disable this filtering by Nextclade quality control metrics, set `skip_diagnostics: true` in [the `filter` section of your build configuration file](https://docs.nextstrain.org/projects/ncov/en/latest/reference/configuration.html#filter).
9+
610
## New features since last version update
711

812
- 29 January 2022: Update "mutational fitness" coloring based on latest results from [Obermeyer et al model](https://www.medrxiv.org/content/10.1101/2021.09.07.21263228v1) via [github.com/broadinstitute/pyro-cov/](https://github.com/broadinstitute/pyro-cov/blob/master/paper/mutations.tsv).

docs/src/reference/configuration.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,11 @@ Builds support any named attributes that can be referenced by subsampling scheme
209209
* description: Minimum collection date for strains to include in the analysis used by `augur filter --min-date`. Dates can be numeric floating point values (e.g., `2019.74`) or ISO 8601-style strings (e.g., `2019-10-01`).
210210
* default: `2019.74`
211211

212+
### skip_diagnostics
213+
* type: boolean
214+
* description: When `true`, skip filtering by Nextclade quality control metrics such as clock rate deviation, number of SNP clusters, and possible contamination.
215+
* default: `false`
216+
212217
## frequencies
213218
### min_date
214219
* type: float or string
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Copied from https://github.com/nextstrain/ncov-ingest/blob/master/bin/join-metadata-and-clades
4+
"""
5+
import argparse
6+
import sys
7+
from datetime import datetime
8+
import pandas as pd
9+
import numpy as np
10+
11+
# Metadata column in front of which the Nextclade clade column is placed.
INSERT_BEFORE_THIS_COLUMN = "pango_lineage"
# Index (join key) column of the metadata table and of the Nextclade TSV.
METADATA_JOIN_COLUMN_NAME = 'strain'
NEXTCLADE_JOIN_COLUMN_NAME = 'seqName'
# Placeholder written wherever an annotation value is missing.
VALUE_MISSING_DATA = '?'

# Expected divergence gained per day, used as the clock model in main().
# NOTE(review): presumably 0.0007 subs/site/year * 29903 sites / 365 days -- confirm.
rate_per_day = 0.0007 * 29903 / 365
# Proleptic Gregorian ordinal of 2020-01-01; sample dates are measured from it.
reference_day = datetime(2020, 1, 1).toordinal()

# Nextclade output column name -> column name used in the joined metadata.
# Insertion order matters: it determines the order of the expected columns.
column_map = {
    # clade assignment and summary counts
    "clade": "Nextstrain_clade",
    "totalMissing": "missing_data",
    "totalSubstitutions": "divergence",
    "totalNonACGTNs": "nonACGTN",
    # private mutation counts
    "privateNucMutations.totalUnlabeledSubstitutions": "rare_mutations",
    "privateNucMutations.totalReversionSubstitutions": "reversion_mutations",
    "privateNucMutations.totalLabeledSubstitutions": "potential_contaminants",
    # per-rule QC statuses
    "qc.missingData.status": "QC_missing_data",
    "qc.mixedSites.status": "QC_mixed_sites",
    "qc.privateMutations.status": "QC_rare_mutations",
    "qc.snpClusters.status": "QC_snp_clusters",
    "qc.frameShifts.status": "QC_frame_shifts",
    "qc.stopCodons.status": "QC_stop_codons",
    # raw mutation lists, kept under their original names where unchanged
    "frameShifts": "frame_shifts",
    "deletions": "deletions",
    "insertions": "insertions",
    "substitutions": "substitutions",
    "aaSubstitutions": "aaSubstitutions",
}

# Intended dtypes for numeric annotation columns. Not applied anywhere in the
# visible code (the only cast that referenced it is disabled).
preferred_types = {
    "divergence": "int32",
    "nonACGTN": "int32",
    "missing_data": "int32",
    "snp_clusters": "int32",
    "rare_mutations": "int32",
}
47+
48+
def reorder_columns(result: pd.DataFrame):
    """
    Return ``result`` with the clade column moved directly in front of
    ``INSERT_BEFORE_THIS_COLUMN``; all other columns keep their order.

    Raises ``ValueError`` if either the clade column or the anchor column is
    not present in ``result``.
    """
    clade_column = column_map['clade']
    ordered = result.columns.tolist()
    # Take the clade column out first so the anchor position is computed on
    # the remaining columns only.
    ordered.remove(clade_column)
    anchor = ordered.index(INSERT_BEFORE_THIS_COLUMN)
    ordered[anchor:anchor] = [clade_column]
    return result[ordered]
57+
58+
59+
def parse_args():
    """
    Parse the command line.

    Two positional arguments are required: ``first_file`` (read by main() as
    the metadata TSV) and ``second_file`` (read by main() as the Nextclade
    TSV). The optional ``-o`` gives the output destination and defaults to
    the ``sys.stdout`` stream object.
    """
    cli = argparse.ArgumentParser(
        description="Joins metadata file with Nextclade clade output",
    )
    for positional in ("first_file", "second_file"):
        cli.add_argument(positional)
    cli.add_argument("-o", default=sys.stdout)
    return cli.parse_args()
67+
68+
def datestr_to_ordinal(x):
    """
    Convert a ``YYYY-MM-DD`` date string to its proleptic Gregorian ordinal.

    Returns ``np.nan`` when ``x`` is not a string (e.g. a missing value) or
    does not match the format exactly (e.g. an ambiguous date such as
    ``2020-01-XX``).
    """
    try:
        return datetime.strptime(x, "%Y-%m-%d").toordinal()
    # ValueError: string does not match the format; TypeError: not a string.
    # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
    except (ValueError, TypeError):
        return np.nan
73+
74+
def isfloat(value):
    """
    Return True if ``float(value)`` succeeds, False otherwise.

    Non-numeric strings (including the empty string) and objects that cannot
    be converted at all, such as ``None``, yield False instead of raising.
    """
    try:
        float(value)
    # ValueError: unparsable string; TypeError: unsupported type such as None.
    except (ValueError, TypeError):
        return False
    return True
80+
81+
def main():
    """
    Join sample metadata with Nextclade QC output and add a clock deviation.

    Reads the metadata TSV (``first_file``) and the Nextclade TSV
    (``second_file``), left-joins the renamed Nextclade columns onto the
    metadata by strain name, computes a ``clock_deviation`` column, replaces
    missing annotation values with ``VALUE_MISSING_DATA`` and writes the
    result as TSV to ``-o`` (standard output by default).

    If the metadata already carries every expected annotation column and
    ``clock_deviation`` has no missing values, the metadata is written out
    unchanged and the Nextclade file is not read.
    """
    args = parse_args()

    metadata = pd.read_csv(
        args.first_file,
        index_col=METADATA_JOIN_COLUMN_NAME,
        sep='\t',
        low_memory=False,
    )

    # Check for existing annotations in the given metadata. Skip join with
    # Nextclade QC file, if those annotations already exist and none of the
    # columns have empty values. In the case where metadata were combined from
    # different sources with and without annotations, the "clock_deviation"
    # column will exist but some values will be missing. We handle this case as
    # if the annotations do not exist at all and reannotate all columns. We
    # cannot look for missing values across all expected columns as evidence of
    # incomplete annotations, since a complete annotation by Nextclade will
    # include missing values for some columns by design.
    expected_columns = list(column_map.values()) + ["clock_deviation"]
    existing_annotation_columns = metadata.columns.intersection(expected_columns)
    already_annotated = (
        len(existing_annotation_columns) == len(expected_columns)
        and metadata["clock_deviation"].isnull().sum() == 0
    )
    if already_annotated:
        # Report on stderr: the output defaults to stdout, and the notice must
        # never end up inside the TSV.
        print(
            f"Metadata file '{args.first_file}' has already been annotated with Nextclade QC columns. Skipping re-annotation.",
            file=sys.stderr,
        )
        metadata.to_csv(args.o, sep="\t")
        return

    # Read the Nextclade output and rename its columns to be more descriptive.
    # na_filter=False keeps empty fields as empty strings instead of NaN.
    clades = pd.read_csv(
        args.second_file,
        index_col=NEXTCLADE_JOIN_COLUMN_NAME,
        sep='\t',
        low_memory=False,
        na_filter=False,
    ).rename(columns=column_map)

    # Keep only the mapped columns that this Nextclade output actually has.
    clade_columns = clades.columns.intersection(list(column_map.values()))
    clades = clades[clade_columns]

    # Left join on the index so every metadata row is kept. Metadata columns
    # that clash with a Nextclade column get the "_original" suffix.
    result = pd.merge(
        metadata, clades,
        left_index=True,
        right_index=True,
        how='left',
        suffixes=("_original", ""),
    )

    all_clades = result.Nextstrain_clade.unique()
    # Sample date as ordinal day (NaN if unparsable) and divergence as float
    # (NaN if not numeric).
    t = result["date"].apply(datestr_to_ordinal)
    div_array = np.array([float(x) if isfloat(x) else np.nan for x in result.divergence])

    # Mean deviation from the clock expectation for each clade with more than
    # 100 strains; every other clade falls back to a fixed offset of 2.0.
    offset_by_clade = {}
    for clade in all_clades:
        ind = result.Nextstrain_clade == clade
        if ind.sum() > 100:
            deviation = div_array[ind] - (t[ind] - reference_day)*rate_per_day
            offset_by_clade[clade] = np.mean(deviation[~np.isnan(deviation)])

    offset = result["Nextstrain_clade"].apply(lambda x: offset_by_clade.get(x, 2.0))

    # Divergence minus the clade-adjusted clock expectation, truncated toward
    # zero. NaN entries are masked *before* the integer cast (casting NaN to
    # int is undefined) and restored afterwards; the column stays integer
    # when nothing is missing.
    raw_deviation = np.asarray(
        div_array - ((t - reference_day)*rate_per_day + offset), dtype=float
    )
    missing = np.isnan(raw_deviation)
    clock_deviation = np.where(missing, 0.0, raw_deviation).astype(int)
    if missing.any():
        clock_deviation = np.where(missing, np.nan, clock_deviation)
    result["clock_deviation"] = clock_deviation

    # Mark missing annotations explicitly. Columns absent from this Nextclade
    # output were already tolerated above, so skip them here as well.
    for col in expected_columns:
        if col in result.columns:
            result[col] = result[col].fillna(VALUE_MISSING_DATA)

    # Move the new column so that it's next to other clade columns
    if INSERT_BEFORE_THIS_COLUMN in result.columns:
        result = reorder_columns(result)

    result.to_csv(args.o, index_label=METADATA_JOIN_COLUMN_NAME, sep='\t')


if __name__ == '__main__':
    main()

workflow/envs/nextstrain.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ dependencies:
88
- epiweeks=2.1.2
99
- iqtree=2.1.4-beta
1010
- nextalign=1.9.0
11+
- nextclade=1.10.1
1112
- pangolin=3.1.17
1213
- pangolearn=2021.12.06
1314
- python>=3.7*

workflow/snakemake_rules/main_workflow.smk

Lines changed: 46 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -444,19 +444,35 @@ rule combine_samples:
444444
--output-metadata {output.metadata} 2>&1 | tee {log}
445445
"""
446446

447+
# Fetch the Nextclade dataset directory that build_align consumes through
# `--input-dataset`.
rule prepare_nextclade:
    message:
        """
        Downloading reference files for nextclade (used for alignment and qc).
        """
    output:
        nextclade_dataset = directory("data/sars-cov-2-nextclade-defaults"),
    params:
        name = "sars-cov-2",
    # Run in the workflow's conda environment, like the other rules that
    # declare one, so the `nextclade` binary listed in that environment is
    # available under `--use-conda`.
    conda: config["conda_environment"]
    shell:
        """
        nextclade dataset get --name {params.name} --output-dir {output.nextclade_dataset}
        """
460+
447461
rule build_align:
448462
message:
449463
"""
450-
Aligning sequences to {input.reference}
464+
Running nextclade QC and aligning sequences to {input.reference}
451465
- gaps relative to reference are considered real
452466
"""
453467
input:
454468
sequences = rules.combine_samples.output.sequences,
455469
genemap = config["files"]["annotation"],
456-
reference = config["files"]["alignment_reference"]
470+
reference = config["files"]["alignment_reference"],
471+
nextclade_dataset = rules.prepare_nextclade.output.nextclade_dataset,
457472
output:
458473
alignment = "results/{build_name}/aligned.fasta",
459474
insertions = "results/{build_name}/insertions.tsv",
475+
nextclade_qc = 'results/{build_name}/nextclade_qc.tsv',
460476
translations = expand("results/{{build_name}}/translations/aligned.gene.{gene}.fasta", gene=config.get('genes', ['S']))
461477
params:
462478
outdir = "results/{build_name}/translations",
@@ -472,22 +488,41 @@ rule build_align:
472488
mem_mb=3000
473489
shell:
474490
"""
475-
xz -c -d {input.sequences} | nextalign \
476-
--jobs={threads} \
491+
xz -c -d {input.sequences} | nextclade run \
492+
--jobs {threads} \
493+
--input-fasta /dev/stdin \
477494
--reference {input.reference} \
478-
--genemap {input.genemap} \
479-
--genes {params.genes} \
480-
--sequences /dev/stdin \
495+
--input-dataset {input.nextclade_dataset} \
496+
--output-tsv {output.nextclade_qc} \
481497
--output-dir {params.outdir} \
482498
--output-basename {params.basename} \
483499
--output-fasta {output.alignment} \
484-
--output-insertions {output.insertions} > {log} 2>&1
500+
--output-insertions {output.insertions} 2>&1 | tee {log}
501+
"""
502+
503+
# Add the Nextclade QC columns to the subsampled metadata of a build by
# running scripts/join-metadata-and-clades.py; stdout and stderr of the
# script are copied to the rule's log file.
rule join_metadata_and_nextclade_qc:
    input:
        metadata = "results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
        nextclade_qc = "results/{build_name}/nextclade_qc.tsv",
    output:
        metadata = "results/{build_name}/metadata_with_nextclade_qc.tsv",
    conda: config["conda_environment"]
    benchmark:
        "benchmarks/join_metadata_and_nextclade_qc_{build_name}.txt",
    log:
        "logs/join_metadata_and_nextclade_qc_{build_name}.txt",
    shell:
        """
        python3 scripts/join-metadata-and-clades.py \
            {input.metadata} \
            {input.nextclade_qc} \
            -o {output.metadata} 2>&1 | tee {log}
        """
486521

487522
rule diagnostic:
488523
message: "Scanning metadata {input.metadata} for problematic sequences. Removing sequences with >{params.clock_filter} deviation from the clock and with more than {params.snp_clusters}."
489524
input:
490-
metadata = "results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
525+
metadata = "results/{build_name}/metadata_with_nextclade_qc.tsv",
491526
output:
492527
to_exclude = "results/{build_name}/excluded_by_diagnostics.txt"
493528
params:
@@ -596,7 +631,7 @@ rule index:
596631

597632
rule annotate_metadata_with_index:
598633
input:
599-
metadata="results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
634+
metadata="results/{build_name}/metadata_with_nextclade_qc.tsv",
600635
sequence_index = "results/{build_name}/sequence_index.tsv",
601636
output:
602637
metadata="results/{build_name}/metadata_with_index.tsv",
@@ -716,7 +751,7 @@ rule adjust_metadata_regions:
716751
Adjusting metadata for build '{wildcards.build_name}'
717752
"""
718753
input:
719-
metadata="results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
754+
metadata="results/{build_name}/metadata_with_index.tsv",
720755
output:
721756
metadata = "results/{build_name}/metadata_adjusted.tsv.xz"
722757
params:

0 commit comments

Comments
 (0)