Skip to content

Commit b9bbbdc

Browse files
author
bshifaw
authored
Wdlupdate (#23)
* added haplotypcaller nio, added disk sizing within task. * update gatk image to 4.0.6.0 , minor correction to picard command in JD * Added conditional task to convert input CRAM to bam before running haplotypcaller. * updated gatk version in joint discovery json
1 parent 3087acc commit b9bbbdc

6 files changed

+346
-34
lines changed

haplotypecaller-gvcf-gatk4-nio.wdl

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
## Copyright Broad Institute, 2017
2+
##
3+
## This WDL workflow runs HaplotypeCaller from GATK4 in GVCF mode on a single sample
4+
## according to the GATK Best Practices (June 2016), scattered across intervals.
5+
##
6+
## Requirements/expectations :
7+
## - One analysis-ready BAM file for a single sample (as identified in RG:SM)
8+
## - Set of variant calling intervals lists for the scatter, provided in a file
9+
##
10+
## Outputs :
11+
## - One GVCF file and its index
12+
##
13+
## Cromwell version support
14+
## - Successfully tested on v31
15+
## - Does not work on versions < v23 due to output syntax
16+
##
17+
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
18+
##
19+
## LICENSING :
20+
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
21+
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
22+
## be subject to different licenses. Users are responsible for checking that they are
23+
## authorized to run all programs before running this script. Please see the dockers
24+
## for detailed licensing information pertaining to the included programs.
25+
26+
# WORKFLOW DEFINITION
27+
workflow HaplotypeCallerGvcf_GATK4 {
28+
File input_bam
29+
File input_bam_index
30+
File ref_dict
31+
File ref_fasta
32+
File ref_fasta_index
33+
File scattered_calling_intervals_list
34+
35+
Boolean? make_gvcf
36+
Boolean making_gvcf = select_first([make_gvcf,true])
37+
38+
String? gatk_docker_override
39+
String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.0.6.0"])
40+
String? gatk_path_override
41+
String gatk_path = select_first([gatk_path_override, "/gatk/gatk"])
42+
String? gitc_docker_override
43+
String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"])
44+
45+
Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list)
46+
47+
#is the input a cram file?
48+
Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram"
49+
50+
String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam")
51+
String vcf_basename = sample_basename
52+
String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz"
53+
String output_filename = vcf_basename + output_suffix
54+
55+
# We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller.
56+
# If we take the number we are scattering by and reduce by 20 we will have enough disk space
57+
# to account for the fact that the data is quite uneven across the shards.
58+
Int potential_hc_divisor = length(scattered_calling_intervals) - 20
59+
Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1
60+
61+
62+
if ( is_cram ) {
63+
call CramToBamTask {
64+
input:
65+
input_cram = input_bam,
66+
sample_name = sample_basename,
67+
ref_dict = ref_dict,
68+
ref_fasta = ref_fasta,
69+
ref_fasta_index = ref_fasta_index,
70+
docker = gitc_docker
71+
}
72+
}
73+
74+
# Call variants in parallel over grouped calling intervals
75+
scatter (interval_file in scattered_calling_intervals) {
76+
77+
# Generate GVCF by interval
78+
call HaplotypeCaller {
79+
input:
80+
input_bam = select_first([CramToBamTask.output_bam, input_bam]),
81+
input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]),
82+
interval_list = interval_file,
83+
output_filename = output_filename,
84+
ref_dict = ref_dict,
85+
ref_fasta = ref_fasta,
86+
ref_fasta_index = ref_fasta_index,
87+
hc_scatter = hc_divisor,
88+
make_gvcf = making_gvcf,
89+
docker = gatk_docker,
90+
gatk_path = gatk_path
91+
}
92+
}
93+
94+
# Merge per-interval GVCFs
95+
call MergeGVCFs {
96+
input:
97+
input_vcfs = HaplotypeCaller.output_vcf,
98+
input_vcfs_indexes = HaplotypeCaller.output_vcf_index,
99+
output_filename = output_filename,
100+
docker = gatk_docker,
101+
gatk_path = gatk_path
102+
}
103+
104+
# Outputs that will be retained when execution is complete
105+
output {
106+
File output_vcf = MergeGVCFs.output_vcf
107+
File output_vcf_index = MergeGVCFs.output_vcf_index
108+
}
109+
}
110+
111+
# TASK DEFINITIONS
112+
113+
task CramToBamTask {
114+
# Command parameters
115+
File ref_fasta
116+
File ref_fasta_index
117+
File ref_dict
118+
File input_cram
119+
String sample_name
120+
121+
# Runtime parameters
122+
String docker
123+
Int? machine_mem_gb
124+
Int? disk_space_gb
125+
Boolean use_ssd = false
126+
Int? preemptible_attempts
127+
128+
Float output_bam_size = size(input_cram, "GB") / 0.60
129+
Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
130+
Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20
131+
132+
command {
133+
set -e
134+
set -o pipefail
135+
136+
samtools view -h -T ${ref_fasta} ${input_cram} |
137+
samtools view -b -o ${sample_name}.bam -
138+
samtools index -b ${sample_name}.bam
139+
mv ${sample_name}.bam.bai ${sample_name}.bai
140+
}
141+
runtime {
142+
docker: docker
143+
memory: select_first([machine_mem_gb, 15]) + " GB"
144+
disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
145+
preemptible: preemptible_attempts
146+
}
147+
output {
148+
File output_bam = "${sample_name}.bam"
149+
File output_bai = "${sample_name}.bai"
150+
}
151+
}
152+
153+
# HaplotypeCaller per-sample in GVCF mode
154+
task HaplotypeCaller {
155+
String input_bam
156+
String input_bam_index
157+
File interval_list
158+
String output_filename
159+
File ref_dict
160+
File ref_fasta
161+
File ref_fasta_index
162+
Float? contamination
163+
Boolean make_gvcf
164+
Int hc_scatter
165+
166+
String gatk_path
167+
String? java_options
168+
String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"])
169+
170+
# Runtime parameters
171+
String docker
172+
Int? mem_gb
173+
Int? disk_space_gb
174+
Boolean use_ssd = false
175+
Int? preemptible_attempts
176+
177+
Int machine_mem_gb = select_first([mem_gb, 7])
178+
Int command_mem_gb = machine_mem_gb - 1
179+
180+
Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
181+
Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20
182+
183+
command <<<
184+
set -e
185+
186+
${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \
187+
HaplotypeCaller \
188+
-R ${ref_fasta} \
189+
-I ${input_bam} \
190+
-L ${interval_list} \
191+
-O ${output_filename} \
192+
-contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf}
193+
>>>
194+
195+
runtime {
196+
docker: docker
197+
memory: machine_mem_gb + " GB"
198+
disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
199+
preemptible: select_first([preemptible_attempts, 3])
200+
}
201+
202+
output {
203+
File output_vcf = "${output_filename}"
204+
File output_vcf_index = "${output_filename}.tbi"
205+
}
206+
}
207+
# Merge GVCFs generated per-interval for the same sample
208+
task MergeGVCFs {
209+
Array[File] input_vcfs
210+
Array[File] input_vcfs_indexes
211+
String output_filename
212+
213+
String gatk_path
214+
215+
# Runtime parameters
216+
String docker
217+
Int? mem_gb
218+
Int? disk_space_gb
219+
Boolean use_ssd = false
220+
Int? preemptible_attempts
221+
222+
Int machine_mem_gb = select_first([mem_gb, 3])
223+
Int command_mem_gb = machine_mem_gb - 1
224+
225+
command <<<
226+
set -e
227+
228+
${gatk_path} --java-options "-Xmx${command_mem_gb}G" \
229+
MergeVcfs \
230+
--INPUT ${sep=' --INPUT ' input_vcfs} \
231+
--OUTPUT ${output_filename}
232+
>>>
233+
234+
runtime {
235+
docker: docker
236+
memory: machine_mem_gb + " GB"
237+
disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD"
238+
preemptible: select_first([preemptible_attempts, 3])
239+
}
240+
241+
242+
output {
243+
File output_vcf = "${output_filename}"
244+
File output_vcf_index = "${output_filename}.tbi"
245+
}
246+
}
247+
Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
{
22
"##_COMMENT1": "INPUT BAM",
3-
"HaplotypeCallerGvcf_GATK4.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bam",
4-
"HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bai",
3+
"#HaplotypeCallerGvcf_GATK4.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bam",
4+
"#HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bai",
5+
"HaplotypeCallerGvcf_GATK4.input_bam": "gs://broad-public-datasets/NA12878/NA12878.cram",
6+
"HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://broad-public-datasets/NA12878/NA12878.cram.crai",
57

68
"##_COMMENT2": "REFERENCE FILES",
79
"HaplotypeCallerGvcf_GATK4.ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict",
@@ -10,27 +12,33 @@
1012

1113
"##_COMMENT3": "INTERVALS",
1214
"HaplotypeCallerGvcf_GATK4.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/hg38_wgs_scattered_calling_intervals.txt",
15+
16+
"##_COMMENT4": "MISCELLANEOUS PARAMETERS",
17+
"#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.make_gvcf": "True",
18+
"#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.contamination": "Float? (optional)",
1319

14-
"##_COMMENT4": "DOCKERS",
15-
"HaplotypeCallerGvcf_GATK4.gatk_docker": "broadinstitute/gatk:4.0.4.0",
20+
"##_COMMENT5": "DOCKERS",
21+
"#HaplotypeCallerGvcf_GATK4.gatk_docker_override": "String? (optional)",
22+
"#HaplotypeCallerGvcf_GATK4.gitc_docker_override": "String? (optional)",
1623

17-
"##_COMMENT5": "PATHS",
18-
"HaplotypeCallerGvcf_GATK4.gatk_path": "/gatk/gatk",
19-
"HaplotypeCallerGvcf_GATK4.HaplotypeCaller.make_gvcf": "True",
20-
"##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.contamination": "(optional) Float?",
24+
"##_COMMENT6": "PATHS",
25+
"#HaplotypeCallerGvcf_GATK4.gatk_path_override": "String? (optional)",
2126

22-
"##_COMMENT6": "JAVA OPTIONS",
23-
"##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.java_options": "(optional) String?",
27+
"##_COMMENT7": "JAVA OPTIONS",
28+
"#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.java_options": "String? (optional)",
2429

25-
"##_COMMENT7": "MEMORY ALLOCATION",
26-
"##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.mem_gb": "(optional) Int?",
27-
"##HaplotypeCallerGvcf_GATK4.MergeGVCFs.mem_gb": "(optional) Int?",
30+
"##_COMMENT8": "MEMORY ALLOCATION",
31+
"#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.mem_gb": "Int? (optional)",
32+
"#HaplotypeCallerGvcf_GATK4.MergeGVCFs.mem_gb": "Int? (optional)",
33+
"#HaplotypeCallerGvcf_GATK4.CramToBamTask.machine_mem_gb": "Int? (optional)",
2834

29-
"##_COMMENT8": "DISK SIZE ALLOCATION",
30-
"##HaplotypeCallerGvcf_GATK4.MergeGVCFs.disk_space_gb": "(optional) Int?",
31-
"##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.disk_space_gb": "(optional) Int?",
35+
"##_COMMENT9": "DISK SIZE ALLOCATION",
36+
"#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.disk_space_gb": "Int? (optional)",
37+
"#HaplotypeCallerGvcf_GATK4.MergeGVCFs.disk_space_gb": "Int? (optional)",
38+
"#HaplotypeCallerGvcf_GATK4.CramToBamTask.disk_space_gb": "Int? (optional)",
3239

33-
"##_COMMENT9": "PREEMPTION",
34-
"##HaplotypeCallerGvcf_GATK4.HaplotypeCaller.preemptible_attempts": "(optional) Int?",
35-
"##HaplotypeCallerGvcf_GATK4.MergeGVCFs.preemptible_attempts": "(optional) Int?"
40+
"##_COMMENT10": "PREEMPTION",
41+
"#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.preemptible_attempts": "Int? (optional)",
42+
"#HaplotypeCallerGvcf_GATK4.MergeGVCFs.preemptible_attempts": "Int? (optional)",
43+
"#HaplotypeCallerGvcf_GATK4.CramToBamTask.preemptible_attempts": "Int? (optional)"
3644
}

0 commit comments

Comments
 (0)