1 ## Copyright Broad Institute, 2017
3 ## This WDL workflow runs HaplotypeCaller from GATK4 in GVCF mode on a single sample
4 ## according to the GATK Best Practices (June 2016), scattered across intervals.
6 ## Requirements/expectations :
7 ## - One analysis-ready BAM file for a single sample (as identified in RG:SM)
8 ## - A set of variant calling interval lists for the scatter, provided in a file
11 ## - One GVCF file and its index
13 ## Cromwell version support
14 ## - Successfully tested on v31
15 ## - Does not work on versions < v23 due to output syntax
17 ## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
20 ## This script is released under the WDL source code license (BSD-3) (see LICENSE in
21 ## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
22 ## be subject to different licenses. Users are responsible for checking that they are
23 ## authorized to run all programs before running this script. Please see the dockers
24 ## for detailed licensing information pertaining to the included programs.
27 workflow HaplotypeCallerGvcf_GATK4 {
33 File scattered_calling_intervals_list
36 Boolean making_gvcf = select_first([make_gvcf,true])
38 String? gatk_docker_override
39 String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.0.6.0"])
40 String? gatk_path_override
41 String gatk_path = select_first([gatk_path_override, "gatk"])
42 String? gitc_docker_override
43 String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"])
45 Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list)
47 # Is the input a CRAM file?
# Remove everything up to and including the last "." of the input file name and
# compare what remains (the extension) to "cram".
48 Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram"
# Strip the matching extension (".cram" or ".bam") to obtain the sample base name.
50 String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam")
51 String vcf_basename = sample_basename
# The output is named "<basename>.g.vcf.gz" when making a GVCF, "<basename>.vcf.gz" otherwise.
52 String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz"
53 String output_filename = vcf_basename + output_suffix
58 input_cram = input_bam,
59 sample_name = sample_basename,
61 ref_fasta = ref_fasta,
62 ref_fasta_index = ref_fasta_index,
67 # Call variants in parallel over grouped calling intervals
68 scatter (interval_file in scattered_calling_intervals) {
70 # Generate GVCF by interval
71 call HaplotypeCaller {
73 input_bam = select_first([CramToBamTask.output_bam, input_bam]),
74 input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]),
75 interval_list = interval_file,
76 output_filename = output_filename,
78 ref_fasta = ref_fasta,
79 ref_fasta_index = ref_fasta_index,
80 make_gvcf = making_gvcf,
86 # Merge per-interval GVCFs
89 input_vcfs = HaplotypeCaller.output_vcf,
90 input_vcfs_indexes = HaplotypeCaller.output_vcf_index,
91 output_filename = output_filename,
96 # Outputs that will be retained when execution is complete
98 File output_vcf = MergeGVCFs.output_vcf
99 File output_vcf_index = MergeGVCFs.output_vcf_index
117 Boolean use_ssd = false
118 Int? preemptible_attempts
# Disk sizing: the BAM produced from the CRAM is estimated as the CRAM size divided by 0.60.
120 Float output_bam_size = size(input_cram, "GB") / 0.60
# Combined size of the reference FASTA, its index and its dictionary.
121 Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
# Input CRAM + estimated BAM + reference files, rounded up, plus a fixed 20 GB of padding.
122 Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20
128 samtools view -h -T ${ref_fasta} ${input_cram} |
129 samtools view -b -o ${sample_name}.bam -
130 samtools index -b ${sample_name}.bam
131 mv ${sample_name}.bam.bai ${sample_name}.bai
135 memory: select_first([machine_mem_gb, 15]) + " GB"
136 disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
# Use the same preemptible default (3 attempts) as the other tasks when no override is given.
137 preemptible: select_first([preemptible_attempts, 3])
140 File output_bam = "${sample_name}.bam"
141 File output_bai = "${sample_name}.bai"
145 # HaplotypeCaller per-sample, in GVCF mode when make_gvcf is true
146 task HaplotypeCaller {
150 String output_filename
159 String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"])
165 Boolean use_ssd = false
166 Int? preemptible_attempts
# Machine memory defaults to 7 GB unless mem_gb is supplied.
168 Int machine_mem_gb = select_first([mem_gb, 7])
# The JVM heap limit (-Xmx in the command) is set to one GB less than the machine memory.
169 Int command_mem_gb = machine_mem_gb - 1
# Combined size of the reference FASTA, its index and its dictionary.
171 Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB")
# Input BAM + reference files, rounded up, plus a fixed 20 GB of padding.
172 Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20
177 ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \
181 -L ${interval_list} \
182 -O ${output_filename} \
183 -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf}
188 memory: machine_mem_gb + " GB"
189 disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
190 preemptible: select_first([preemptible_attempts, 3])
194 File output_vcf = "${output_filename}"
195 File output_vcf_index = "${output_filename}.tbi"
198 # Merge GVCFs generated per-interval for the same sample
200 Array[File] input_vcfs
201 Array[File] input_vcfs_indexes
202 String output_filename
210 Boolean use_ssd = false
211 Int? preemptible_attempts
# Machine memory defaults to 3 GB unless mem_gb is supplied.
213 Int machine_mem_gb = select_first([mem_gb, 3])
# The JVM heap limit (-Xmx in the command) is set to one GB less than the machine memory.
214 Int command_mem_gb = machine_mem_gb - 1
219 ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \
221 --INPUT ${sep=' --INPUT ' input_vcfs} \
222 --OUTPUT ${output_filename}
227 memory: machine_mem_gb + " GB"
228 disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD"
229 preemptible: select_first([preemptible_attempts, 3])
234 File output_vcf = "${output_filename}"
235 File output_vcf_index = "${output_filename}.tbi"