BAM INDEX: wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/data/WGS/WGS_EA_N_1.bwa.dedup.bai
BAM INDEX: wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/data/WGS/WGS_EA_T_1.bwa.dedup.bai
- wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/data/WES/WES_EA_N_1.bwa.dedup.bam
- wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/data/WES/WES_EA_N_1.bwa.dedup.bai
- wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/data/WES/WES_EA_T_1.bwa.dedup.bam
- wget https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/data/WES/WES_EA_T_1.bwa.dedup.bai
Download the GRCh38 reference genome along with the known-sites files from GDC/SEQC2; these files will be used for alignment and variant calling.
The reference files are hosted on the GDC/NCI website (https://gdc.cancer.gov/about-data/gdc-data-processing/gdc-reference-files).
## GRCh38 reference FASTA from GDC. The GDC URL is an opaque UUID, so -O sets the local file name.
wget https://api.gdc.cancer.gov/data/254f697d-310d-4d7d-a27b-27fbf767a834 -O GRCh38.d1.vd1.fa.tar.gz
tar -xvzf GRCh38.d1.vd1.fa.tar.gz
## Companion archive for the same reference (BWA index files, per the archive name).
wget https://api.gdc.cancer.gov/data/25217ec9-af07-4a17-8db9-101271ee7225 -O GRCh38.d1.vd1_BWA.tar.gz
tar -xvzf GRCh38.d1.vd1_BWA.tar.gz
## dbSNP 146 known sites from the GATK hg38 resource bundle (anonymous FTP login, no password).
wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org:21/bundle/hg38/dbsnp_146.hg38.vcf.gz
## 1000 Genomes phase 3 indels remapped to GRCh38.
wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/other_mapping_resources/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz
## Please edit line 22 of this file (shown below): add the missing double quote before dbSNP so that it reads
Description="dbSNP ssID of the allele"
##INFO=<ID=ssID,Number=A,Type=String,Description=dbSNP ssID of the allele">
## Please edit line 42: change "POS=POS-1" to "POS_POS-1"
##INFO=<ID=POS=POS1,Number=0,Type=Flag,Description="POS has been adjusted due to missing REF in NCBI VCF file">
## Mills and 1000G gold-standard indels (b38 primary assembly), downloaded into the current directory.
wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/other_mapping_resources/Mills_and_1000G_gold_standard.indels.b38.primary_assembly.vcf.gz
Before starting the analysis run, please install the following software, which will be handy for some data processing. Download accessory tools:
## Switch to /mnt/disks/local; the GATK archive below is downloaded and unpacked here.
cd /mnt/disks/local
## Download and install GATK
wget https://github.com/broadinstitute/gatk/releases/download/4.2.0.0/gatk-4.2.0.0.zip
unzip gatk-4.2.0.0.zip
## Install samtools
sudo apt install samtools
## Install Tabix
sudo apt install tabix
## Index the reference FASTA.
## NOTE(review): this path is relative to the current directory, while the dictionary step
## below uses /home/ubuntu/Refs -- confirm both point at the same FASTA.
samtools faidx Refs/GRCh38.d1.vd1.fa
## Create the sequence dictionary file from the FASTA.
## NOTE(review): R=/O= is the legacy Picard argument style; current GATK docs show -R/-O -- confirm it is still accepted.
gatk-4.2.0.0/gatk CreateSequenceDictionary R=/home/ubuntu/Refs/GRCh38.d1.vd1.fa O=/home/ubuntu/Refs/GRCh38.d1.vd1.dict
#### Caller1: Mutect2 variant calling ($GPU=2,4,8), here we used $GPU=4
## Set GPU before running (e.g. GPU=4). Every option line except the last must end
## with a backslash, otherwise the shell ends the command early and tries to run the
## remaining options as a separate command.
pbrun mutectcaller --ref Refs/GRCh38.d1.vd1.fa \
--in-tumor-bam WGS_EA_T_1.bwa.dedup.bam \
--in-normal-bam WGS_EA_N_1.bwa.dedup.bam \
--in-tumor-recal-file WGS_EA_T_1.bwa.dedup.recal.txt \
--in-normal-recal-file WGS_EA_N_1.bwa.dedup.recal.txt \
--out-vcf WGS_EA_TN-1-mutect2.vcf \
--tumor-name WGS_EA_T_1 \
--normal-name WGS_EA_N_1 \
--num-gpus "$GPU"
#### Caller2: MuSE
##### Step1: MuSE call
## Runs MuSE in "call" mode on the tumor/normal BAM pair with 16 threads.
pbrun muse \
  --ref Refs/GRCh38.d1.vd1.fa \
  --in-tumor-bam WGS_EA_T_1.bwa.dedup.bam \
  --in-normal-bam WGS_EA_N_1.bwa.dedup.bam \
  --out-file WGS_EA_TN-1-muse.call \
  --num-threads 16 \
  --mode call
##### Step2: MuSE sump
## Runs MuSE in "sump" mode on the call file, with dbSNP as input, and writes a VCF.
pbrun muse \
  --ref Refs/GRCh38.d1.vd1.fa \
  --in-callfile WGS_EA_TN-1-muse.call.MuSE.txt \
  --datatype G \
  --out-vcf WGS_EA_TN-1-muse.call.MuSE.vcf \
  --num-threads 16 \
  --mode sump \
  --in-dbsnp Refs/dbsnp_146.hg38.vcf.gz
#### Caller3: Somatic Sniper
## Runs SomaticSniper on the tumor/normal BAM pair and writes a VCF.
## The last option line has no trailing backslash, so the command ends there
## instead of absorbing whatever line follows it.
pbrun somaticsniper --ref Refs/GRCh38.d1.vd1.fa \
--in-tumor-bam WGS_EA_T_1.bwa.dedup.bam \
--in-normal-bam WGS_EA_N_1.bwa.dedup.bam \
--min-mapq 1 --no-gain --no-loh --out-format vcf \
--out-file WGS_EA_TN-1-somaticsniper.vcf \
--num-threads 16
#### Caller4: LoFreq
## Runs LoFreq on the tumor/normal BAM pair; results go to the pbrun_LoFreq directory.
## Set GPU before running (e.g. GPU=4); it is quoted so it always expands to a single argument.
pbrun lofreq --ref Refs/GRCh38.d1.vd1.fa \
--in-tumor-bam WGS_EA_T_1.bwa.dedup.bam \
--in-normal-bam WGS_EA_N_1.bwa.dedup.bam \
--output-dir pbrun_LoFreq \
--num-threads 4 --num-gpus "$GPU" \
--in-dbsnp-file Refs/dbsnp_146.hg38.vcf.gz