Description
Describe the issue
I am trying to annotate my cohort VCF (70 WES patients), previously analyzed using the GATK GenotypeGVCF pipeline, with VEP in Singularity.
I need the gnomAD_AF frequencies in order to subsequently perform an association test on rare variants against a public database such as gnomAD.
I have read all the recommendations and command suggestions, but despite this, no gnomAD annotation appears in my VCF and the corresponding fields in the CSQ are empty.
Should I perhaps use a custom mode as suggested in some examples and download the VCF files from version 4.1, or is there something else I should do? Furthermore, I did not understand if, in the case I need to use the custom function, I have to download each file individually from
https://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38/variation_genotype/gnomad/v4.1/exomes/
and then add them all one by one to the --custom option?
I hope you can help me, thank you very much.
System
- VEP version: 114.1
- VEP Cache version: 114.1
- Perl version: v5.30.0
- OS: Ubuntu
- tabix installed : Yes
Full VEP command line
singularity exec \
--no-home \
-B "${bind_cache}" \
-B "${bind_ref}" \
-B "${bind_vcfs}" \
-B "${bind_out}" \
-B "${bind_cadd}" \
-B "${bind_plugins}" \
"${sandbox_dir}" \
vep \
--vcf --compress_output bgzip \
--cache --offline --force_overwrite \
--dir_cache /opt/vep/.vep \
--dir_plugins /plugins \
--verbose \
--af_gnomad \
--safe \
--fasta /data/hg38/hg38.fa \
--fork ${SLURM_CPUS_PER_TASK} \
--input_file ${INVCF} \
--output_file ${OUTVCF} \
--synonyms ${synonyms} \
--plugin CADD,snv=/data/cadd/whole_genome_SNVs.tsv.gz,indels=/data/cadd/gnomad.genomes.r4.0.indel.tsv.gz \
--plugin LoFtool \
--keep_csq
Full error message
Here is the debug code for the missing information:
vcf_macro <- "cohort_MACRO_MELAS/cohort_MACRO_PNRR.vep3.vcf.gz"
> hdr <- scanVcfHeader(vcf_macro)
Warnings:
In .bcfHeaderAsSimpleList(header) :
duplicate keys in header will be forced to unique rownames
> info_df <- info(hdr)
> csq_desc <- info_df["CSQ","Description"]
> cat("CSQ header:\n", csq_desc, "\n\n")
CSQ header:
Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|gnomADe_AF|gnomADe_AFR_AF|gnomADe_AMR_AF|gnomADe_ASJ_AF|gnomADe_EAS_AF|gnomADe_FIN_AF|gnomADe_MID_AF|gnomADe_NFE_AF|gnomADe_REMAINING_AF|gnomADe_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CADD_PHRED|CADD_RAW|LoFtool
> fields <- sub(".*Format: (.*)$", "\\1", csq_desc)
> field_list <- strsplit(fields, "[|,]")[[1]] %>% trimws()
> cat("CSQ sub‐fields:\n", paste0(seq_along(field_list),":", field_list, collapse="\n"), "\n\n")
CSQ sub‐fields:
1:Allele
2:Consequence
3:IMPACT
4:SYMBOL
5:Gene
6:Feature_type
7:Feature
8:BIOTYPE
9:EXON
10:INTRON
11:HGVSc
12:HGVSp
13:cDNA_position
14:CDS_position
15:Protein_position
16:Amino_acids
17:Codons
18:Existing_variation
19:DISTANCE
20:STRAND
21:FLAGS
22:SYMBOL_SOURCE
23:HGNC_ID
24:gnomADe_AF
25:gnomADe_AFR_AF
26:gnomADe_AMR_AF
27:gnomADe_ASJ_AF
28:gnomADe_EAS_AF
29:gnomADe_FIN_AF
30:gnomADe_MID_AF
31:gnomADe_NFE_AF
32:gnomADe_REMAINING_AF
33:gnomADe_SAS_AF
34:CLIN_SIG
35:SOMATIC
36:PHENO
37:CADD_PHRED
38:CADD_RAW
39:LoFtool
> cat("All fileds INFO available:\n")
All fileds INFO available:
> print(rownames(info_df))
[1] "AC" "AF" "AN"
[4] "AS_BaseQRankSum" "AS_FS" "AS_InbreedingCoeff"
[7] "AS_MQ" "AS_MQRankSum" "AS_QD"
[10] "AS_RAW_BaseQRankSum" "AS_RAW_MQ" "AS_RAW_MQRankSum"
[13] "AS_RAW_ReadPosRankSum" "AS_ReadPosRankSum" "AS_SB_TABLE"
[16] "AS_SOR" "BaseQRankSum" "DP"
[19] "END" "ExcessHet" "FS"
[22] "InbreedingCoeff" "MLEAC" "MLEAF"
[25] "MQ" "MQRankSum" "NEGATIVE_TRAIN_SITE"
[28] "PG" "POSITIVE_TRAIN_SITE" "QD"
[31] "RAW_MQandDP" "ReadPosRankSum" "SOR"
[34] "VQSLOD" "culprit" "CSQ"
> param <- ScanVcfParam(info=rownames(info_df), geno="DP",
+ which=GRanges("chr1", IRanges(1,1e6)))
> vcf_small <- readVcf(vcf_macro, "hg38", param)
[W::hts_idx_load3] The index file is older than the data file: cohort_MACRO_MELAS/cohort_MACRO_PNRR.vep3.vcf.gz.tbi
Warnings:
In .bcfHeaderAsSimpleList(header) :
duplicate keys in header will be forced to unique rownames
> csq_list <- info(vcf_small)$CSQ
> i0 <- which(lengths(csq_list)>0)[1]
> entry0 <- unlist(csq_list[i0])[1]
> parts <- strsplit(entry0, "\\|")[[1]]
> cat("Esempio CSQ completo:\n", entry0, "\n\n")
Esempio CSQ completo:
G|missense_variant|MODERATE|OR4F5|ENSG00000186092|Transcript|ENST00000641515|protein_coding|3/3||||544|484|162|T/A|Aca/Gca|||1||HGNC|HGNC:14825||||||||||||||4.335|0.395063|
> for (j in seq_along(field_list)) {
+ cat(sprintf("%2d %-12s = %s\n",
+ j, field_list[j],
+ ifelse(parts[j]=="", "<NA>", parts[j])))
+ }
1 Allele = G
2 Consequence = missense_variant
3 IMPACT = MODERATE
4 SYMBOL = OR4F5
5 Gene = ENSG00000186092
6 Feature_type = Transcript
7 Feature = ENST00000641515
8 BIOTYPE = protein_coding
9 EXON = 3/3
10 INTRON = <NA>
11 HGVSc = <NA>
12 HGVSp = <NA>
13 cDNA_position = 544
14 CDS_position = 484
15 Protein_position = 162
16 Amino_acids = T/A
17 Codons = Aca/Gca
18 Existing_variation = <vuoto>
19 DISTANCE = <vuoto>
20 STRAND = 1
21 FLAGS = <NA>
22 SYMBOL_SOURCE = HGNC
23 HGNC_ID = HGNC:14825
24 gnomADe_AF = <NA>
25 gnomADe_AFR_AF = <NA>
26 gnomADe_AMR_AF = <NA>
27 gnomADe_ASJ_AF = <NA>
28 gnomADe_EAS_AF = <NA>
29 gnomADe_FIN_AF = <NA>
30 gnomADe_MID_AF = <NA>
31 gnomADe_NFE_AF = <NA>
32 gnomADe_REMAINING_AF = <NA>
33 gnomADe_SAS_AF = <NA>
34 CLIN_SIG = <NA>
35 SOMATIC = <NA>
36 PHENO = <NA>
37 CADD_PHRED = 4.335
38 CADD_RAW = 0.395063
39 LoFtool = NA