Merge branch 'vep_annotation' into 'master'

Manavalan Gajapathy · Manavalan Gajapathy · commit 624418aacd55 · 2021-01-29T12:56:40.000-06:00
VCF annotation using VEP

See merge request center-for-computational-genomics-and-data-science/sciops/ditto!1
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,94 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# DotEnv configuration
+.env
+
+# conda
+.conda
+
+# Database
+*.db
+*.rdb
+
+# Pycharm
+.idea
+
+# Jupyter NB Checkpoints
+.ipynb_checkpoints/
+
+# exclude data from source control by default
+# data/
+variant_annotation/data/
+
+#snakemake
+.snakemake/
+
+
+# exclude test data used for development
+to_be_deleted/test_data/data/ref
+to_be_deleted/test_data/data/reads
+
+#logs
+logs/
+
+# vscode
+.vscode/
+
+# .java/fonts dir gets created when creating the fastqc conda env
+.java/
+
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "variant_annotation/configs/snakemake_profile"]
+	path = variant_annotation/configs/snakemake_profile
+	url = git@gitlab.rc.uab.edu:center-for-computational-genomics-and-data-science/sciops/pipelines/small_variant_caller_pipeline.git
+[submodule "variant_annotation/configs/snakemake_slurm_profile"]
+	path = variant_annotation/configs/snakemake_slurm_profile
+	url = git@gitlab.rc.uab.edu:center-for-computational-genomics-and-data-science/sciops/external-projects/snakemake_slurm_profile.git
diff --git a/variant_annotation/.test/data/processed/vep/testing_variants_hg38_vep-annotated.vcf.gz b/variant_annotation/.test/data/processed/vep/testing_variants_hg38_vep-annotated.vcf.gz
diff --git a/variant_annotation/.test/data/raw/testing_variants_hg38.vcf b/variant_annotation/.test/data/raw/testing_variants_hg38.vcf
diff --git a/variant_annotation/README.md b/variant_annotation/README.md
@@ -0,0 +1,66 @@
+# Variant annotation
+
+Annotates variants in a VCF file using Variant Effect Predictor (VEP).
+
+Script [`src/run_pipeline.sh`](src/run_pipeline.sh) runs the snakemake workflow, which sets up VEP and then uses it for annotation.
+
+## Setup
+
+1. Create necessary directories to store log files
+
+```sh
+cd variant_annotation
+mkdir -p logs/rule_logs
+```
+
+2. Create dataset config YAML and populate with paths
+
+```sh
+touch ~/.ditto_datasets.yaml
+```
+
+Enter path info into the YAML file in the following format
+
+```yml
+cadd_snv: "/path/to/data/cadd/hg38/v1.6/whole_genome_SNVs.tsv.gz"
+cadd_indel: "/path/to/data/cadd/raw/hg38/v1.6/gnomad.genomes.r3.0.indel.tsv.gz"
+gerp: "/path/to/data/gerp/processed/hg38/v1.6/gerp_score_hg38.bg.gz"
+gnomad_genomes: "/path/to/data/gnomad/v3.0/data/gnomad.genomes.r3.0.sites.vcf.bgz"
+clinvar: "/path/to/data/clinvar/data/grch38/20210119/clinvar_20210119.vcf.gz"
+dbNSFP: "/path/to/data/dbnsfp/processed/v4.1a_20200616/dbNSFP4.1a_variant.complete.bgz"
+```
+
+## Datasets in custom format
+
+Two of the datasets listed in the datasets YAML require custom formatting for use with the VEP annotator. The following
+describes the formatting process that needs to be performed for each of them.
+
+**gerp:**
+
+ - GERP is extracted from the annotation database distributed by CADD found [here](https://cadd.gs.washington.edu/download)
+ - Format GERP base-wise RS scores from extracted annotation file into final compressed BedGraph file
+
+**dbNSFP:**
+
+ - dbNSFP data is extracted from dbNSFP zip found [here](https://sites.google.com/site/jpopgen/dbNSFP)
+ - Per-chromosome tab-separated value files are extracted from the zip, sorted by GRCh38/hg38 coordinates, joined
+ into a single file, bgzipped and indexed.
+
+All other dataset files listed in the config file are usable in the format provided by their originating source.
+
+## How to run
+
+- To run in current session (Note: only runs main Snakemake process in current session, Snakemake will still send jobs
+ to Slurm):
+
+    ```sh
+    cd variant_annotation
+    ./src/run_pipeline.sh -v .test/data/raw/testing_variants_hg38.vcf -o .test/data/processed/vep -d ~/.ditto_datasets.yaml
+    ```
+
+- To run it as slurm job:
+
+    ```sh
+    cd variant_annotation
+    ./src/run_pipeline.sh -s -v .test/data/raw/testing_variants_hg38.vcf -o .test/data/processed/vep -d ~/.ditto_datasets.yaml
+    ```
diff --git a/variant_annotation/configs/cluster_config.json b/variant_annotation/configs/cluster_config.json
@@ -0,0 +1,10 @@
+{
+    "__default__": {
+        "ntasks": 1,
+        "partition": "short",
+        "cpus-per-task": "{threads}",
+        "mem": "20G",
+        "output": "logs/rule_logs/{rule}-%j.log",
+        "error": "logs/rule_logs/{rule}-%j.err"
+    }
+}
diff --git a/variant_annotation/configs/env/vep.yaml b/variant_annotation/configs/env/vep.yaml
@@ -0,0 +1,6 @@
+channels:
+  - bioconda
+  - conda-forge
+dependencies:
+  - ensembl-vep =102
+  - bcftools =1.10.2
diff --git a/variant_annotation/configs/snakemake_slurm_profile b/variant_annotation/configs/snakemake_slurm_profile
@@ -0,0 +1 @@
+Subproject commit 4ecaf55d398ebfdf8415dff50c26beea0237c34d
diff --git a/variant_annotation/src/Snakefile b/variant_annotation/src/Snakefile
@@ -0,0 +1,135 @@
+"""
+This pipeline annotates VCF using Variant Effect Predictor
+1. Sets up VEP cache and plugins
+2. Using cache, plugins and other locally available datasets, annotates variants in VCF
+"""
+
+from pathlib import Path
+
+# datasets to use for annotations: path to a YAML file, supplied under config key "datasets"; its keys (cadd_snv, gerp, ...) are read from `config` further down
+configfile: config["datasets"]
+
+
+#### VEP parameters ####
+VEP_CACHE = 'homo_sapiens_refseq'  # names the cache directory, is passed to vep_install --SPECIES, and switches on --refseq when it contains 'refseq'
+SPECIES = 'homo_sapiens'  # passed to vep --species
+REF_BUILD = "GRCh38"  # passed to vep_install --ASSEMBLY and vep --assembly
+ENSEMBL_DATASET_VERSION = "102"  # passed to --CACHE_VERSION / --cache_version and to the plugins wrapper as `release`
+HGVS = False  # when True, --hgvs is added to the vep command
+STATS = False  # when False, vep is run with --no_stats
+
+### I/O parameters
+INPUT_VCF = config["vcf"]  # input VCF path as given in the config; converted to Path after the extension check
+PROCESSED_DIR = Path(config["outdir"])  # directory that receives the annotated VCF
+EXTERNAL_DIR = Path("data/external")  # relative directory holding the VEP cache and plugins
+
+if not (INPUT_VCF.endswith('vcf') or INPUT_VCF.endswith('vcf.gz')):
+    print (f"Error: Input file extension not in expected format: found {INPUT_VCF}, expecting *.vcf or *.vcf.gz")
+    raise SystemExit(1)
+
+INPUT_VCF = Path(INPUT_VCF)
+OUTPUT_VCF = PROCESSED_DIR / ((INPUT_VCF.name).rstrip(".gz").rstrip(".vcf") + "_vep-annotated.vcf.gz")
+
+
+rule all:  # first rule in the file, i.e. the default target: requests the final annotated VCF
+    input:
+        OUTPUT_VCF
+
+
+rule get_vep_cache:  # runs vep_install to populate EXTERNAL_DIR/vep/cache/<VEP_CACHE>
+    output:
+        cache = directory(EXTERNAL_DIR / "vep" / "cache" / VEP_CACHE),
+    params:
+        species = VEP_CACHE,  # the cache name (not SPECIES) is what gets passed to --SPECIES
+        build = REF_BUILD,
+        release = ENSEMBL_DATASET_VERSION,
+        plugins = "CADD"  # NOTE(review): no plugins directory is given to vep_install, so this presumably installs outside EXTERNAL_DIR; plugins are also fetched by rule get_vep_plugins -- confirm this is still needed
+    message:
+        "Retrieves VEP cache data"
+    conda:
+        "../configs/env/vep.yaml"
+    shell:  # per the VEP installer docs, --AUTO letters select what to install: c = cache, f = FASTA, p = plugins
+        r"""
+        vep_install --AUTO cfp \
+            --SPECIES {params.species} \
+            --ASSEMBLY {params.build} \
+            --PLUGINS {params.plugins} \
+            --CACHE_VERSION {params.release} \
+            --CACHEDIR {output.cache} \
+            --CONVERT \
+            --NO_UPDATE
+        """
+
+
+rule get_vep_plugins:  # produces EXTERNAL_DIR/vep/plugins, which annotate_variants passes to vep --dir_plugins
+    output:
+        directory(EXTERNAL_DIR / "vep" / "plugins"),
+    message:
+        "Downloads VEP plugins"
+    params:
+        release = ENSEMBL_DATASET_VERSION  # presumably selects the plugin release matching the cache version -- verify against the wrapper docs
+    wrapper:  # the download itself is delegated to this pinned snakemake wrapper
+        "0.59.2/bio/vep/plugins"
+
+
+rule annotate_variants:
+    input:
+        calls = INPUT_VCF,
+        cache = EXTERNAL_DIR / "vep" / "cache" / VEP_CACHE,
+        plugins = EXTERNAL_DIR / "vep" / "plugins",
+        cadd_snv = config['cadd_snv'],
+        cadd_indel = config['cadd_indel'],
+        gerp = config['gerp'],
+        gnomad_genomes = config['gnomad_genomes'],
+        clinvar = config['clinvar'],
+        dbNSFP = config['dbNSFP'],
+    output:
+        calls = OUTPUT_VCF,
+    message:
+        "Annotated vcf using VEP with CADD, gnomad-exomes, gnomad-genomes and GERP. "
+        f"VEP cache used: {VEP_CACHE}, ref build: {REF_BUILD}, Ensemble version: {ENSEMBL_DATASET_VERSION}"
+    params:
+        release = ENSEMBL_DATASET_VERSION,
+        species = SPECIES,
+        build = REF_BUILD,
+        refseq_flag = "--refseq" if 'refseq' in VEP_CACHE else "",
+        hgvs_flag = "--hgvs" if HGVS else "",
+        stats_flag = lambda wildcards, output: f"--stats_file {output.stats}" if STATS else "--no_stats",
+        gnomad_fields = "AC,AN,AF,AF_afr,AF_afr_female,AF_afr_male,AF_ami,AF_ami_female,AF_ami_male,AF_amr,AF_amr_female,AF_amr_male,AF_asj,AF_asj_female," \
+                        "AF_asj_male,AF_eas,AF_eas_female,AF_eas_male,AF_female,AF_fin,AF_fin_female,AF_fin_male,AF_male,AF_nfe,AF_nfe_female,AF_nfe_male," \
+                        "AF_oth,AF_oth_female,AF_oth_male,AF_raw,AF_sas,AF_sas_female,AF_sas_male",
+        clinvar_fields = "AF_ESP,AF_EXAC,AF_TGP,ALLELEID,CLNDN,CLNDNINCL,CLNDISDB,CLNDISDBINCL,CLNREVSTAT,CLNSIG,CLNSIGCONF,CLNSIGINCL,CLNVC,GENEINFO,MC,ORIGIN,RS,SSR",
+        dbNSFP_fields = "LRT_score,MutationTaster_score,MutationAssessor_score,FATHMM_score,PROVEAN_score,VEST4_score,MetaSVM_score,MetaLR_score,M-CAP_score," \
+                        "CADD_phred,DANN_score,fathmm-MKL_coding_score,GenoCanyon_score,integrated_fitCons_score,GERP++_RS,phyloP100way_vertebrate,phyloP30way_mammalian," \
+                        "phastCons100way_vertebrate,phastCons30way_mammalian,SiPhy_29way_logOdds,Eigen-raw_coding,Eigen-raw_coding_rankscore,Eigen-phred_coding," \
+                        "Eigen-PC-raw_coding,Eigen-PC-raw_coding_rankscore,Eigen-PC-phred_coding",
+        warnings_file = lambda wildcards, output: str(output.calls).replace('.vcf.gz', '_STDOUT_warnings.txt'),
+    threads: 8
+    conda:
+        "../configs/env/vep.yaml"
+    shell:
+        r"""
+        # using bcftools view as it might catch vcf-related errors (https://stackoverflow.com/a/63371639/3998252)
+        bcftools view {input.calls} | \
+            vep --fork {threads} \
+                --format vcf \
+                --vcf \
+                --offline \
+                --cache \
+                --cache_version {params.release} \
+                --species {params.species} \
+                --assembly {params.build} \
+                {params.refseq_flag} {params.hgvs_flag} \
+                --sift s --polyphen s \
+                --dir_cache {input.cache} \
+                --dir_plugins {input.plugins} \
+                --plugin CADD,{input.cadd_snv},{input.cadd_indel} \
+                --plugin dbNSFP,{input.dbNSFP},{params.dbNSFP_fields} \
+                --custom {input.gerp},GERP,bed \
+                --custom {input.gnomad_genomes},gnomADv3,vcf,exact,0,{params.gnomad_fields} \
+                --custom {input.clinvar},clinvar,vcf,exact,0,{params.clinvar_fields} \
+                {params.stats_flag} \
+                --warning_file {params.warnings_file} \
+                --compress_output bgzip \
+                --output_file {output.calls}
+        """
diff --git a/variant_annotation/src/run_pipeline.sh b/variant_annotation/src/run_pipeline.sh