|
| 1 | +## Copyright Broad Institute, 2017 |
| 2 | +## |
| 3 | +## This WDL workflow runs HaplotypeCaller from GATK4 in GVCF mode on a single sample |
| 4 | +## according to the GATK Best Practices (June 2016), scattered across intervals. |
| 5 | +## |
| 6 | +## Requirements/expectations : |
| 7 | +## - One analysis-ready BAM file for a single sample (as identified in RG:SM) |
| 8 | +## - Set of variant calling intervals lists for the scatter, provided in a file |
| 9 | +## |
| 10 | +## Outputs : |
| 11 | +## - One GVCF file and its index |
| 12 | +## |
| 13 | +## Cromwell version support |
| 14 | +## - Successfully tested on v31 |
| 15 | +## - Does not work on versions < v23 due to output syntax |
| 16 | +## |
| 17 | +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. |
| 18 | +## |
| 19 | +## LICENSING : |
| 20 | +## This script is released under the WDL source code license (BSD-3) (see LICENSE in |
| 21 | +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may |
| 22 | +## be subject to different licenses. Users are responsible for checking that they are |
| 23 | +## authorized to run all programs before running this script. Please see the dockers |
| 24 | +## for detailed licensing information pertaining to the included programs. |
| 25 | +
|
# WORKFLOW DEFINITION
#
# Per-sample germline variant calling: optionally converts CRAM input to BAM,
# scatters HaplotypeCaller over the supplied interval lists, then merges the
# per-interval (G)VCF shards into a single output (G)VCF plus index.
workflow HaplotypeCallerGvcf_GATK4 {
  # Analysis-ready alignments for one sample (BAM or CRAM) and matching index.
  File input_bam
  File input_bam_index
  # Reference genome: sequence dictionary, FASTA, and FASTA index.
  File ref_dict
  File ref_fasta
  File ref_fasta_index
  # File whose lines are paths to the interval lists used for the scatter.
  File scattered_calling_intervals_list

  # When true (the default), run HaplotypeCaller in GVCF mode (-ERC GVCF).
  Boolean? make_gvcf
  Boolean making_gvcf = select_first([make_gvcf,true])

  # Docker images and GATK launcher path, each overridable by the caller.
  String? gatk_docker_override
  String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.0.6.0"])
  String? gatk_path_override
  String gatk_path = select_first([gatk_path_override, "/gatk/gatk"])
  String? gitc_docker_override
  String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"])

  # One scatter shard per line of the intervals list file.
  Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list)

  # Is the input a CRAM file? Decided purely by the filename extension.
  Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram"

  # Output name is derived from the input filename with its extension removed.
  String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam")
  String vcf_basename = sample_basename
  String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz"
  String output_filename = vcf_basename + output_suffix

  # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller.
  # If we take the number we are scattering by and reduce by 20 we will have enough disk space
  # to account for the fact that the data is quite uneven across the shards.
  # Clamped to a minimum of 1 so the divisor is always valid.
  Int potential_hc_divisor = length(scattered_calling_intervals) - 20
  Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1


  # CRAM input is first converted to BAM so HaplotypeCaller receives a BAM.
  if ( is_cram ) {
    call CramToBamTask {
      input:
        input_cram = input_bam,
        sample_name = sample_basename,
        ref_dict = ref_dict,
        ref_fasta = ref_fasta,
        ref_fasta_index = ref_fasta_index,
        docker = gitc_docker
    }
  }

  # Call variants in parallel over grouped calling intervals
  scatter (interval_file in scattered_calling_intervals) {

    # Generate GVCF by interval. select_first prefers the converted BAM when
    # the CRAM branch ran; otherwise it falls back to the original input BAM.
    call HaplotypeCaller {
      input:
        input_bam = select_first([CramToBamTask.output_bam, input_bam]),
        input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]),
        interval_list = interval_file,
        output_filename = output_filename,
        ref_dict = ref_dict,
        ref_fasta = ref_fasta,
        ref_fasta_index = ref_fasta_index,
        hc_scatter = hc_divisor,
        make_gvcf = making_gvcf,
        docker = gatk_docker,
        gatk_path = gatk_path
    }
  }

  # Merge per-interval GVCFs into one file; every shard writes the same
  # output_filename, so the merged file keeps that name.
  call MergeGVCFs {
    input:
      input_vcfs = HaplotypeCaller.output_vcf,
      input_vcfs_indexes = HaplotypeCaller.output_vcf_index,
      output_filename = output_filename,
      docker = gatk_docker,
      gatk_path = gatk_path
  }

  # Outputs that will be retained when execution is complete
  output {
    File output_vcf = MergeGVCFs.output_vcf
    File output_vcf_index = MergeGVCFs.output_vcf_index
  }
}
| 110 | + |
| 111 | +# TASK DEFINITIONS |
| 112 | +
|
# Converts a CRAM to an indexed BAM using samtools, producing
# <sample_name>.bam and <sample_name>.bai.
task CramToBamTask {
  # Command parameters
  File ref_fasta
  File ref_fasta_index
  File ref_dict
  File input_cram
  String sample_name

  # Runtime parameters
  String docker
  Int? machine_mem_gb
  Int? disk_space_gb
  Boolean use_ssd = false
  Int? preemptible_attempts

  # CRAM is assumed to be ~60% of the equivalent BAM size, so the BAM we
  # produce needs size(cram)/0.60 of space in addition to the inputs.
  Float output_bam_size = size(input_cram, "GB") / 0.60
  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
  # 20 GB of headroom on top of input + output + reference.
  Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20

  command {
    set -e
    set -o pipefail

    samtools view -h -T ${ref_fasta} ${input_cram} |
    samtools view -b -o ${sample_name}.bam -
    samtools index -b ${sample_name}.bam
    mv ${sample_name}.bam.bai ${sample_name}.bai
  }
  runtime {
    docker: docker
    memory: select_first([machine_mem_gb, 15]) + " GB"
    disks: "local-disk " + select_first([disk_space_gb, disk_size]) + (if use_ssd then " SSD" else " HDD")
    # Default to 3 preemptible attempts, matching HaplotypeCaller and
    # MergeGVCFs; previously an unset optional was passed through unresolved.
    preemptible: select_first([preemptible_attempts, 3])
  }
  output {
    File output_bam = "${sample_name}.bam"
    File output_bai = "${sample_name}.bai"
  }
}
| 152 | + |
# HaplotypeCaller per-sample in GVCF mode
#
# Runs GATK4 HaplotypeCaller on one interval list, emitting either a GVCF
# (-ERC GVCF, when make_gvcf is true) or a plain VCF, plus its .tbi index.
task HaplotypeCaller {
  # Declared String rather than File; value comes from the workflow's
  # select_first over CramToBamTask.output_bam / input_bam.
  # NOTE(review): size() below is applied to this String — presumably the
  # engine coerces/localizes it; confirm against the target Cromwell version.
  String input_bam
  String input_bam_index
  File interval_list
  String output_filename
  File ref_dict
  File ref_fasta
  File ref_fasta_index
  # Contamination fraction passed to -contamination; defaults to 0 in the command.
  Float? contamination
  Boolean make_gvcf
  # Number of shards this sample is scattered across; used to prorate disk.
  Int hc_scatter

  String gatk_path
  # Extra JVM options; defaults cap GC time so a thrashing JVM fails fast.
  String? java_options
  String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"])

  # Runtime parameters
  String docker
  Int? mem_gb
  Int? disk_space_gb
  Boolean use_ssd = false
  Int? preemptible_attempts

  # Leave 1 GB of machine memory for non-JVM overhead.
  Int machine_mem_gb = select_first([mem_gb, 7])
  Int command_mem_gb = machine_mem_gb - 1

  Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
  # Per-shard disk: this shard's portion of the BAM (+30 GB padding before
  # dividing) plus the full reference, plus 20 GB headroom.
  Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20

  command <<<
  set -e

    ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \
      HaplotypeCaller \
      -R ${ref_fasta} \
      -I ${input_bam} \
      -L ${interval_list} \
      -O ${output_filename} \
      -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf}
  >>>

  runtime {
    docker: docker
    memory: machine_mem_gb + " GB"
    disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }

  output {
    File output_vcf = "${output_filename}"
    # .tbi is produced because output_filename ends in .vcf.gz (set by the workflow).
    File output_vcf_index = "${output_filename}.tbi"
  }
}
# Merge GVCFs generated per-interval for the same sample
#
# Runs Picard-style MergeVcfs (via the GATK4 launcher) over the scattered
# shards, producing one combined (G)VCF and its .tbi index.
task MergeGVCFs {
  # Per-interval shard VCFs and their indexes (indexes are localized but not
  # named on the command line; MergeVcfs finds them next to the inputs).
  Array[File] input_vcfs
  Array[File] input_vcfs_indexes
  String output_filename

  String gatk_path

  # Runtime parameters
  String docker
  Int? mem_gb
  Int? disk_space_gb
  Boolean use_ssd = false
  Int? preemptible_attempts

  # Leave 1 GB of machine memory for non-JVM overhead.
  Int machine_mem_gb = select_first([mem_gb, 3])
  Int command_mem_gb = machine_mem_gb - 1

  command <<<
  set -e

    # sep expands to "--INPUT a.vcf.gz --INPUT b.vcf.gz ..." across all shards.
    ${gatk_path} --java-options "-Xmx${command_mem_gb}G"  \
      MergeVcfs \
      --INPUT ${sep=' --INPUT ' input_vcfs} \
      --OUTPUT ${output_filename}
  >>>

  runtime {
    docker: docker
    memory: machine_mem_gb + " GB"
    # Fixed 100 GB default here (no computed estimate, unlike the other tasks).
    disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD"
    preemptible: select_first([preemptible_attempts, 3])
  }


  output {
    File output_vcf = "${output_filename}"
    # .tbi index produced because output_filename ends in .vcf.gz.
    File output_vcf_index = "${output_filename}.tbi"
  }
}
| 247 | + |
0 commit comments