## Copyright Broad Institute, 2018
## This WDL implements the joint discovery and VQSR filtering portion of the GATK
## Best Practices (June 2016) for germline SNP and Indel discovery in human
## whole-genome sequencing (WGS) and exome sequencing data.
## Requirements/expectations:
## - One or more GVCFs produced by HaplotypeCaller in GVCF mode
## - Bare minimum 1 WGS sample or 30 exome samples. Gene panels are not supported.
## Outputs:
## - A VCF file and its index, filtered using variant quality score recalibration
##   (VQSR) with genotypes for all samples present in the input VCF. All sites that
##   are present in the input VCF are retained; filtered sites are annotated as such
##   in the FILTER field.
## Note about VQSR wiring:
## The SNP and INDEL models are built in parallel, but then the corresponding
## recalibrations are applied in series. Because the INDEL model is generally ready
## first (there are fewer indels than SNPs), we set INDEL recalibration to
## be applied first to the input VCF, while the SNP model is still being built. By
## the time the SNP model is available, the indel-recalibrated file is ready to
## serve as input for applying the SNP recalibration. If we did it the other way around,
## we would have to wait until the SNP recal file was available despite the INDEL
## recal file being there already, then apply SNP recalibration, then apply INDEL
## recalibration. This would lead to a longer wall clock time for complete workflow
## execution. Wiring the INDEL recalibration to be applied first solves the problem.
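##
## Rough sketch of that wiring (names refer to the calls defined below):
##
##   IndelsVariantRecalibrator ----------------------------.
##   SNP VQSR (classic run, or model + scattered runs) ----+--> ApplyRecalibration (per shard;
##   HardFilterAndMakeSitesOnlyVcf (per-shard VCFs) -------'      INDEL recal applied first, then SNP)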
## Cromwell version support
## - Successfully tested on v31
## - Does not work on versions < v23 due to output syntax
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
## For program versions, see docker containers.
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker
## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
## licensing information pertaining to the included programs.

# JAVA_OPTS="-Dsystem.input-read-limits.lines=500000 -Dbackend.providers.Local.config.concurrent-job-limit=16" cromwell run germline2-joint-discovery-gatk4-local.wdl -i germline2-joint-discovery-gatk4-local.hg38.wgs.inputs.json > cromwell.germline2.log &
workflow JointGenotyping {
  File unpadded_intervals_file

  Array[String] sample_names
  Array[File] input_gvcfs
  Array[File] input_gvcfs_indices

  Array[String] snp_recalibration_tranche_values
  Array[String] snp_recalibration_annotation_values
  Array[String] indel_recalibration_tranche_values
  Array[String] indel_recalibration_annotation_values

  File eval_interval_list
  File hapmap_resource_vcf
  File hapmap_resource_vcf_index
  File omni_resource_vcf
  File omni_resource_vcf_index
  File one_thousand_genomes_resource_vcf
  File one_thousand_genomes_resource_vcf_index
  File mills_resource_vcf
  File mills_resource_vcf_index
  File axiomPoly_resource_vcf
  File axiomPoly_resource_vcf_index
  File dbsnp_resource_vcf = dbsnp_vcf
  File dbsnp_resource_vcf_index = dbsnp_vcf_index
  # ExcessHet is a phred-scaled p-value. We want to cut off anything more extreme than
  # a z-score of -4.5, which corresponds to a p-value of 3.4e-06; phred-scaled, that is 54.69.
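  # (Worked out: Phred = -10 * log10(p), so -10 * log10(3.4e-06) ~= 54.69; the
  # VariantFiltration step below flags sites whose ExcessHet exceeds this value.)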
  Float excess_het_threshold = 54.69
  Float snp_filter_level
  Float indel_filter_level
  Int SNP_VQSR_downsampleFactor

  Int num_of_original_intervals = length(read_lines(unpadded_intervals_file))
  Int num_gvcfs = length(input_gvcfs)
  # Make an interval list with roughly a 2.5:1 ratio of intervals to samples in the callset
  Int possible_merge_count = floor(num_of_original_intervals / num_gvcfs / 2.5)
  Int merge_count = if possible_merge_count > 1 then possible_merge_count else 1
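  # Worked example (hypothetical numbers): with 10,000 original intervals and 40 input
  # GVCFs, possible_merge_count = floor(10000 / 40 / 2.5) = 100, so up to 100 adjacent
  # intervals are combined per output interval; for very large callsets the quotient can
  # drop below 1, in which case merge_count falls back to 1 and nothing is combined.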
  call DynamicallyCombineIntervals {
      intervals = unpadded_intervals_file,
      merge_count = merge_count

  Array[String] unpadded_intervals = read_lines(DynamicallyCombineIntervals.output_intervals)
  scatter (idx in range(length(unpadded_intervals))) {
    # the batch_size value was carefully chosen here as it
    # is the optimal value for the amount of memory allocated
    # within the task; please do not change it without consulting
    # the Hellbender (GATK engine) team!
        sample_names = sample_names,
        interval = unpadded_intervals[idx],
        workspace_dir_name = "genomicsdb",
        input_gvcfs = input_gvcfs,
        input_gvcfs_indices = input_gvcfs_indices,
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path,

        workspace_tar = ImportGVCFs.output_genomicsdb,
        interval = unpadded_intervals[idx],
        output_vcf_filename = "output.vcf.gz",
        ref_fasta = ref_fasta,
        ref_fasta_index = ref_fasta_index,
        dbsnp_vcf = dbsnp_vcf,
        dbsnp_vcf_index = dbsnp_vcf_index,
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    call HardFilterAndMakeSitesOnlyVcf {
        vcf = GenotypeGVCFs.output_vcf,
        vcf_index = GenotypeGVCFs.output_vcf_index,
        excess_het_threshold = excess_het_threshold,
        variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz",
        sites_only_vcf_filename = callset_name + "." + idx + ".sites_only.variant_filtered.vcf.gz",
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
  call GatherVcfs as SitesOnlyGatherVcf {
      input_vcfs_fofn = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf,
      input_vcf_indexes_fofn = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index,
      output_vcf_name = callset_name + ".sites_only.vcf.gz",
      disk_size = medium_disk,
      docker = gatk_docker,
      gatk_path = gatk_path
  call IndelsVariantRecalibrator {
      sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf,
      sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index,
      recalibration_filename = callset_name + ".indels.recal",
      tranches_filename = callset_name + ".indels.tranches",
      recalibration_tranche_values = indel_recalibration_tranche_values,
      recalibration_annotation_values = indel_recalibration_annotation_values,
      mills_resource_vcf = mills_resource_vcf,
      mills_resource_vcf_index = mills_resource_vcf_index,
      axiomPoly_resource_vcf = axiomPoly_resource_vcf,
      axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index,
      dbsnp_resource_vcf = dbsnp_resource_vcf,
      dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
      disk_size = small_disk,
      docker = gatk_docker,
      gatk_path = gatk_path
  if (num_gvcfs > 10000) {
    call SNPsVariantRecalibratorCreateModel {
        sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf,
        sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index,
        recalibration_filename = callset_name + ".snps.recal",
        tranches_filename = callset_name + ".snps.tranches",
        recalibration_tranche_values = snp_recalibration_tranche_values,
        recalibration_annotation_values = snp_recalibration_annotation_values,
        downsampleFactor = SNP_VQSR_downsampleFactor,
        model_report_filename = callset_name + ".snps.model.report",
        hapmap_resource_vcf = hapmap_resource_vcf,
        hapmap_resource_vcf_index = hapmap_resource_vcf_index,
        omni_resource_vcf = omni_resource_vcf,
        omni_resource_vcf_index = omni_resource_vcf_index,
        one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
        one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
        dbsnp_resource_vcf = dbsnp_resource_vcf,
        dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
        disk_size = small_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) {
      call SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered {
          sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx],
          sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx],
          recalibration_filename = callset_name + ".snps." + idx + ".recal",
          tranches_filename = callset_name + ".snps." + idx + ".tranches",
          recalibration_tranche_values = snp_recalibration_tranche_values,
          recalibration_annotation_values = snp_recalibration_annotation_values,
          model_report = SNPsVariantRecalibratorCreateModel.model_report,
          hapmap_resource_vcf = hapmap_resource_vcf,
          hapmap_resource_vcf_index = hapmap_resource_vcf_index,
          omni_resource_vcf = omni_resource_vcf,
          omni_resource_vcf_index = omni_resource_vcf_index,
          one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
          one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
          dbsnp_resource_vcf = dbsnp_resource_vcf,
          dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
          disk_size = small_disk,
          docker = gatk_docker,
          gatk_path = gatk_path

    call GatherTranches as SNPGatherTranches {
        input_fofn = SNPsVariantRecalibratorScattered.tranches,
        output_filename = callset_name + ".snps.gathered.tranches",
        disk_size = small_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
  if (num_gvcfs <= 10000) {
    call SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic {
        sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf,
        sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index,
        recalibration_filename = callset_name + ".snps.recal",
        tranches_filename = callset_name + ".snps.tranches",
        recalibration_tranche_values = snp_recalibration_tranche_values,
        recalibration_annotation_values = snp_recalibration_annotation_values,
        hapmap_resource_vcf = hapmap_resource_vcf,
        hapmap_resource_vcf_index = hapmap_resource_vcf_index,
        omni_resource_vcf = omni_resource_vcf,
        omni_resource_vcf_index = omni_resource_vcf_index,
        one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
        one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
        dbsnp_resource_vcf = dbsnp_resource_vcf,
        dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
        disk_size = small_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
  # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly.
  # For anything larger, we need to keep the VCF sharded and gather the metrics collected on the shards.
  Boolean is_small_callset = num_gvcfs <= 1000
  scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) {
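    # The SNP recalibration inputs below use whichever SNP VQSR branch actually ran:
    # the scattered recalibration outputs (callsets over 10,000 GVCFs) if they are
    # defined, otherwise the single "classic" run; select_first() unwraps the
    # optional outputs of the two conditional branches.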
    call ApplyRecalibration {
        recalibrated_vcf_filename = callset_name + ".filtered." + idx + ".vcf.gz",
        input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx],
        input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx],
        indels_recalibration = IndelsVariantRecalibrator.recalibration,
        indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index,
        indels_tranches = IndelsVariantRecalibrator.tranches,
        snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]),
        snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]),
        snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]),
        indel_filter_level = indel_filter_level,
        snp_filter_level = snp_filter_level,
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    # for large callsets we need to collect metrics from the shards and gather them later
    if (!is_small_callset) {
      call CollectVariantCallingMetrics as CollectMetricsSharded {
          input_vcf = ApplyRecalibration.recalibrated_vcf,
          input_vcf_index = ApplyRecalibration.recalibrated_vcf_index,
          metrics_filename_prefix = callset_name + "." + idx,
          dbsnp_vcf = dbsnp_vcf,
          dbsnp_vcf_index = dbsnp_vcf_index,
          interval_list = eval_interval_list,
          disk_size = medium_disk,
          docker = gatk_docker,
          gatk_path = gatk_path
  # for small callsets we can gather the VCF shards and then collect metrics on it
  if (is_small_callset) {
    call GatherVcfs as FinalGatherVcf {
        input_vcfs_fofn = ApplyRecalibration.recalibrated_vcf,
        input_vcf_indexes_fofn = ApplyRecalibration.recalibrated_vcf_index,
        output_vcf_name = callset_name + ".vcf.gz",
        disk_size = huge_disk,
        docker = gatk_docker,
        gatk_path = gatk_path

    call CollectVariantCallingMetrics as CollectMetricsOnFullVcf {
        input_vcf = FinalGatherVcf.output_vcf,
        input_vcf_index = FinalGatherVcf.output_vcf_index,
        metrics_filename_prefix = callset_name,
        dbsnp_vcf = dbsnp_vcf,
        dbsnp_vcf_index = dbsnp_vcf_index,
        interval_list = eval_interval_list,
        disk_size = large_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
  # for large callsets we still need to gather the sharded metrics
  if (!is_small_callset) {
        input_details_fofn = select_all(CollectMetricsSharded.detail_metrics_file),
        input_summaries_fofn = select_all(CollectMetricsSharded.summary_metrics_file),
        output_prefix = callset_name,
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    # outputs from the small callset path through the wdl
    FinalGatherVcf.output_vcf
    FinalGatherVcf.output_vcf_index
    CollectMetricsOnFullVcf.detail_metrics_file
    CollectMetricsOnFullVcf.summary_metrics_file

    # outputs from the large callset path through the wdl
    # (note that we do not list ApplyRecalibration here because it is run in both paths)
    GatherMetrics.detail_metrics_file
    GatherMetrics.summary_metrics_file
    # output the interval list generated/used by this workflow run
    DynamicallyCombineIntervals.output_intervals

task GetNumberOfSamples {
    wc -l ${sample_name_map} | awk '{print $1}'

    Int sample_count = read_int(stdout())

  Array[String] sample_names
  Array[File] input_gvcfs
  Array[File] input_gvcfs_indices
  String workspace_dir_name

    gvcfs = ['${sep="','" input_gvcfs}']
    sample_names = ['${sep="','" sample_names}']

    if len(gvcfs) != len(sample_names):

    with open("inputs.list", "w") as fi:
      for i in range(len(gvcfs)):
        fi.write(sample_names[i] + "\t" + gvcfs[i] + "\n")
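    # inputs.list now maps each sample name to its GVCF path, one tab-separated pair
    # per line, which is the format GenomicsDBImport expects for --sample-name-map.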
    # The memory setting here is very important and must be several GB lower
    # than the total memory allocated to the VM because this tool uses
    # a significant amount of non-heap memory for native libraries.
    # Also, testing has shown that the multithreaded reader initialization
    # does not scale well beyond 5 threads, so don't increase beyond that.
    ${gatk_path} --java-options "-Xmx4g -Xms4g" \
    --genomicsdb-workspace-path ${workspace_dir_name} \
    --batch-size ${batch_size} \
    --sample-name-map inputs.list \

    tar -cf ${workspace_dir_name}.tar ${workspace_dir_name}

    disks: "local-disk " + disk_size + " HDD"

    File output_genomicsdb = "${workspace_dir_name}.tar"

  String output_vcf_filename

    tar -xf ${workspace_tar}
    WORKSPACE=$( basename ${workspace_tar} .tar)

    ${gatk_path} --java-options "-Xmx5g -Xms5g" \
      -O ${output_vcf_filename} \
      -G StandardAnnotation \
      --only-output-calls-starting-in-intervals \
      --use-new-qual-calculator \
      -V gendb://$WORKSPACE \

    disks: "local-disk " + disk_size + " HDD"

    File output_vcf = "${output_vcf_filename}"
    File output_vcf_index = "${output_vcf_filename}.tbi"

task HardFilterAndMakeSitesOnlyVcf {
  Float excess_het_threshold
  String variant_filtered_vcf_filename
  String sites_only_vcf_filename

    ${gatk_path} --java-options "-Xmx3g -Xms3g" \
      --filter-expression "ExcessHet > ${excess_het_threshold}" \
      --filter-name ExcessHet \
      -O ${variant_filtered_vcf_filename} \

    ${gatk_path} --java-options "-Xmx3g -Xms3g" \
      --INPUT ${variant_filtered_vcf_filename} \
      --OUTPUT ${sites_only_vcf_filename}

    disks: "local-disk " + disk_size + " HDD"

    File variant_filtered_vcf = "${variant_filtered_vcf_filename}"
    File variant_filtered_vcf_index = "${variant_filtered_vcf_filename}.tbi"
    File sites_only_vcf = "${sites_only_vcf_filename}"
    File sites_only_vcf_index = "${sites_only_vcf_filename}.tbi"

task IndelsVariantRecalibrator {
  String recalibration_filename
  String tranches_filename

  Array[String] recalibration_tranche_values
  Array[String] recalibration_annotation_values

  File sites_only_variant_filtered_vcf
  File sites_only_variant_filtered_vcf_index

  File mills_resource_vcf
  File axiomPoly_resource_vcf
  File dbsnp_resource_vcf
  File mills_resource_vcf_index
  File axiomPoly_resource_vcf_index
  File dbsnp_resource_vcf_index

    ${gatk_path} --java-options "-Xmx24g -Xms24g" \
      VariantRecalibrator \
      -V ${sites_only_variant_filtered_vcf} \
      -O ${recalibration_filename} \
      --tranches-file ${tranches_filename} \
      --trust-all-polymorphic \
      -tranche ${sep=' -tranche ' recalibration_tranche_values} \
      -an ${sep=' -an ' recalibration_annotation_values} \
      -resource mills,known=false,training=true,truth=true,prior=12:${mills_resource_vcf} \
      -resource axiomPoly,known=false,training=true,truth=false,prior=10:${axiomPoly_resource_vcf} \
      -resource dbsnp,known=true,training=false,truth=false,prior=2:${dbsnp_resource_vcf}

    disks: "local-disk " + disk_size + " HDD"

    File recalibration = "${recalibration_filename}"
    File recalibration_index = "${recalibration_filename}.idx"
    File tranches = "${tranches_filename}"

task SNPsVariantRecalibratorCreateModel {
  String recalibration_filename
  String tranches_filename
  String model_report_filename

  Array[String] recalibration_tranche_values
  Array[String] recalibration_annotation_values

  File sites_only_variant_filtered_vcf
  File sites_only_variant_filtered_vcf_index

  File hapmap_resource_vcf
  File omni_resource_vcf
  File one_thousand_genomes_resource_vcf
  File dbsnp_resource_vcf
  File hapmap_resource_vcf_index
  File omni_resource_vcf_index
  File one_thousand_genomes_resource_vcf_index
  File dbsnp_resource_vcf_index

    ${gatk_path} --java-options "-Xmx100g -Xms100g" \
      VariantRecalibrator \
      -V ${sites_only_variant_filtered_vcf} \
      -O ${recalibration_filename} \
      --tranches-file ${tranches_filename} \
      --trust-all-polymorphic \
      -tranche ${sep=' -tranche ' recalibration_tranche_values} \
      -an ${sep=' -an ' recalibration_annotation_values} \
      --sample-every-Nth-variant ${downsampleFactor} \
      --output-model ${model_report_filename} \
      -resource hapmap,known=false,training=true,truth=true,prior=15:${hapmap_resource_vcf} \
      -resource omni,known=false,training=true,truth=true,prior=12:${omni_resource_vcf} \
      -resource 1000G,known=false,training=true,truth=false,prior=10:${one_thousand_genomes_resource_vcf} \
      -resource dbsnp,known=true,training=false,truth=false,prior=7:${dbsnp_resource_vcf}

    disks: "local-disk " + disk_size + " HDD"

    File model_report = "${model_report_filename}"

task SNPsVariantRecalibrator {
  String recalibration_filename
  String tranches_filename

  Array[String] recalibration_tranche_values
  Array[String] recalibration_annotation_values

  File sites_only_variant_filtered_vcf
  File sites_only_variant_filtered_vcf_index

  File hapmap_resource_vcf
  File omni_resource_vcf
  File one_thousand_genomes_resource_vcf
  File dbsnp_resource_vcf
  File hapmap_resource_vcf_index
  File omni_resource_vcf_index
  File one_thousand_genomes_resource_vcf_index
  File dbsnp_resource_vcf_index

    ${gatk_path} --java-options "-Xmx3g -Xms3g" \
      VariantRecalibrator \
      -V ${sites_only_variant_filtered_vcf} \
      -O ${recalibration_filename} \
      --tranches-file ${tranches_filename} \
      --trust-all-polymorphic \
      -tranche ${sep=' -tranche ' recalibration_tranche_values} \
      -an ${sep=' -an ' recalibration_annotation_values} \
      ${"--input-model " + model_report + " --output-tranches-for-scatter "} \
      -resource hapmap,known=false,training=true,truth=true,prior=15:${hapmap_resource_vcf} \
      -resource omni,known=false,training=true,truth=true,prior=12:${omni_resource_vcf} \
      -resource 1000G,known=false,training=true,truth=false,prior=10:${one_thousand_genomes_resource_vcf} \
      -resource dbsnp,known=true,training=false,truth=false,prior=7:${dbsnp_resource_vcf}

    disks: "local-disk " + disk_size + " HDD"

    File recalibration = "${recalibration_filename}"
    File recalibration_index = "${recalibration_filename}.idx"
    File tranches = "${tranches_filename}"

task GatherTranches {
  Array[File] input_fofn
  String output_filename

    ${gatk_path} --java-options "-Xmx6g -Xms6g" \
      --input ${sep=" --input " input_fofn} \
      --output ${output_filename}

    disks: "local-disk " + disk_size + " HDD"

    File tranches = "${output_filename}"

task ApplyRecalibration {
  String recalibrated_vcf_filename

  File indels_recalibration
  File indels_recalibration_index
  File snps_recalibration
  File snps_recalibration_index

  Float indel_filter_level
  Float snp_filter_level

    ${gatk_path} --java-options "-Xmx5g -Xms5g" \
      -O tmp.indel.recalibrated.vcf \
      --recal-file ${indels_recalibration} \
      --tranches-file ${indels_tranches} \
      --truth-sensitivity-filter-level ${indel_filter_level} \
      --create-output-variant-index true \

    ${gatk_path} --java-options "-Xmx5g -Xms5g" \
      -O ${recalibrated_vcf_filename} \
      -V tmp.indel.recalibrated.vcf \
      --recal-file ${snps_recalibration} \
      --tranches-file ${snps_tranches} \
      --truth-sensitivity-filter-level ${snp_filter_level} \
      --create-output-variant-index true \

    disks: "local-disk " + disk_size + " HDD"

    File recalibrated_vcf = "${recalibrated_vcf_filename}"
    File recalibrated_vcf_index = "${recalibrated_vcf_filename}.tbi"

  Array[File] input_vcfs_fofn
  Array[File] input_vcf_indexes_fofn
  String output_vcf_name

    # --ignore-safety-checks makes a big performance difference, so we include it in our invocation
    ${gatk_path} --java-options "-Xmx6g -Xms6g" \
      --ignore-safety-checks \
      --gather-type BLOCK \
      --input ${sep=" --input " input_vcfs_fofn} \
      --output ${output_vcf_name}

    ${gatk_path} --java-options "-Xmx6g -Xms6g" \
      --feature-file ${output_vcf_name}

    disks: "local-disk " + disk_size + " HDD"

    File output_vcf = "${output_vcf_name}"
    File output_vcf_index = "${output_vcf_name}.tbi"

task CollectVariantCallingMetrics {
  String metrics_filename_prefix

    ${gatk_path} --java-options "-Xmx6g -Xms6g" \
      CollectVariantCallingMetrics \
      --INPUT ${input_vcf} \
      --DBSNP ${dbsnp_vcf} \
      --SEQUENCE_DICTIONARY ${ref_dict} \
      --OUTPUT ${metrics_filename_prefix} \
      --TARGET_INTERVALS ${interval_list}

    File detail_metrics_file = "${metrics_filename_prefix}.variant_calling_detail_metrics"
    File summary_metrics_file = "${metrics_filename_prefix}.variant_calling_summary_metrics"

    disks: "local-disk " + disk_size + " HDD"

  Array[File] input_details_fofn
  Array[File] input_summaries_fofn

    ${gatk_path} --java-options "-Xmx2g -Xms2g" \
      AccumulateVariantCallingMetrics \
      --INPUT ${sep=" --INPUT " input_details_fofn} \
      --OUTPUT ${output_prefix}

    disks: "local-disk " + disk_size + " HDD"

    File detail_metrics_file = "${output_prefix}.variant_calling_detail_metrics"
    File summary_metrics_file = "${output_prefix}.variant_calling_summary_metrics"

task DynamicallyCombineIntervals {
    def parse_interval(interval):
        colon_split = interval.split(":")
        chromosome = colon_split[0]
        dash_split = colon_split[1].split("-")
        start = int(dash_split[0])
        end = int(dash_split[1])
        return chromosome, start, end

    def add_interval(chr, start, end):
        lines_to_write.append(chr + ":" + str(start) + "-" + str(end))
        return chr, start, end
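    # Walk the interval list, chaining together runs of adjacent intervals (same contig
    # and start == previous end + 1) until chain_count intervals have been merged, then
    # emit the combined interval and start a new chain; any leftover chain is flushed at the end.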
    chain_count = ${merge_count}
    l_chr, l_start, l_end = "", 0, 0
    with open("${intervals}") as f:
        with open("out.intervals", "w") as f1:
            for line in f.readlines():
                w_chr, w_start, w_end = parse_interval(line)
                # reached number to combine, so spit out and start over
                if count == chain_count:
                    l_chr, l_start, l_end = add_interval(w_chr, w_start, w_end)
                    w_chr, w_start, w_end = parse_interval(line)

                c_chr, c_start, c_end = parse_interval(line)
                # if adjacent keep the chain going
                if c_chr == w_chr and c_start == w_end + 1:
                # not adjacent, end here and start a new chain
                    l_chr, l_start, l_end = add_interval(w_chr, w_start, w_end)
                    w_chr, w_start, w_end = parse_interval(line)
            if l_chr != w_chr or l_start != w_start or l_end != w_end:
                add_interval(w_chr, w_start, w_end)
            f1.writelines("\n".join(lines_to_write))

    # docker: "python:2.7"

    File output_intervals = "out.intervals"