# modified: pixi.toml
# [GalaxyCodeBases.git] / etc / gatk-wdl / germline2-joint-discovery-gatk4-local.wdl
# blob 25ea0b75ba7e3a9df40d7bb93c56e3317315b8f1
## Copyright Broad Institute, 2018
##
## This WDL implements the joint discovery and VQSR filtering portion of the GATK
## Best Practices (June 2016) for germline SNP and Indel discovery in human
## whole-genome sequencing (WGS) and exome sequencing data.
##
## Requirements/expectations :
## - One or more GVCFs produced by HaplotypeCaller in GVCF mode
## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported.
##
## Outputs :
## - A VCF file and its index, filtered using variant quality score recalibration
##   (VQSR) with genotypes for all samples present in the input VCF. All sites that
##   are present in the input VCF are retained; filtered sites are annotated as such
##   in the FILTER field.
##
## Note about VQSR wiring :
## The SNP and INDEL models are built in parallel, but then the corresponding
## recalibrations are applied in series. Because the INDEL model is generally ready
## first (because there are fewer indels than SNPs) we set INDEL recalibration to
## be applied first to the input VCF, while the SNP model is still being built. By
## the time the SNP model is available, the indel-recalibrated file is available to
## serve as input to apply the SNP recalibration. If we did it the other way around,
## we would have to wait until the SNP recal file was available despite the INDEL
## recal file being there already, then apply SNP recalibration, then apply INDEL
## recalibration. This would lead to a longer wall clock time for complete workflow
## execution. Wiring the INDEL recalibration to be applied first solves the problem.
##
## Cromwell version support
## - Successfully tested on v31
## - Does not work on versions < v23 due to output syntax
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
## For program versions, see docker containers.
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker
## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
## licensing information pertaining to the included programs.

# Run with:
# JAVA_OPTS="-Dsystem.input-read-limits.lines=500000 -Dbackend.providers.Local.config.concurrent-job-limit=16" cromwell run germline2-joint-discovery-gatk4-local.wdl -i germline2-joint-discovery-gatk4-local.hg38.wgs.inputs.json > cromwell.germline2.log &
# Joint genotyping + VQSR filtering of one or more HaplotypeCaller GVCFs.
# Produces a filtered multi-sample VCF plus variant-calling metrics; see the
# file header for the overall VQSR wiring rationale.
workflow JointGenotyping {
  File unpadded_intervals_file

  String callset_name

  File ref_fasta
  File ref_fasta_index
  File ref_dict

  File dbsnp_vcf
  File dbsnp_vcf_index

  Array[String] sample_names
  Array[File] input_gvcfs
  Array[File] input_gvcfs_indices

  String gatk_docker
  String gatk_path

  Int small_disk
  Int medium_disk
  Int large_disk
  Int huge_disk

  Array[String] snp_recalibration_tranche_values
  Array[String] snp_recalibration_annotation_values
  Array[String] indel_recalibration_tranche_values
  Array[String] indel_recalibration_annotation_values

  File eval_interval_list
  File hapmap_resource_vcf
  File hapmap_resource_vcf_index
  File omni_resource_vcf
  File omni_resource_vcf_index
  File one_thousand_genomes_resource_vcf
  File one_thousand_genomes_resource_vcf_index
  File mills_resource_vcf
  File mills_resource_vcf_index
  File axiomPoly_resource_vcf
  File axiomPoly_resource_vcf_index
  File dbsnp_resource_vcf = dbsnp_vcf
  File dbsnp_resource_vcf_index = dbsnp_vcf_index

  # ExcessHet is a phred-scaled p-value. We want a cutoff of anything more extreme
  # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69
  Float excess_het_threshold = 54.69
  Float snp_filter_level
  Float indel_filter_level
  Int SNP_VQSR_downsampleFactor

  Int num_of_original_intervals = length(read_lines(unpadded_intervals_file))
  Int num_gvcfs = length(input_gvcfs)

  # Make a 2.5:1 interval number to samples in callset ratio interval list
  Int possible_merge_count = floor(num_of_original_intervals / num_gvcfs / 2.5)
  Int merge_count = if possible_merge_count > 1 then possible_merge_count else 1

  call DynamicallyCombineIntervals {
    input:
      intervals = unpadded_intervals_file,
      merge_count = merge_count
  }

  Array[String] unpadded_intervals = read_lines(DynamicallyCombineIntervals.output_intervals)

  scatter (idx in range(length(unpadded_intervals))) {
    # the batch_size value was carefully chosen here as it
    # is the optimal value for the amount of memory allocated
    # within the task; please do not change it without consulting
    # the Hellbender (GATK engine) team!
    call ImportGVCFs {
      input:
        sample_names = sample_names,
        interval = unpadded_intervals[idx],
        workspace_dir_name = "genomicsdb",
        input_gvcfs = input_gvcfs,
        input_gvcfs_indices = input_gvcfs_indices,
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path,
        batch_size = 50
    }

    call GenotypeGVCFs {
      input:
        workspace_tar = ImportGVCFs.output_genomicsdb,
        interval = unpadded_intervals[idx],
        output_vcf_filename = "output.vcf.gz",
        ref_fasta = ref_fasta,
        ref_fasta_index = ref_fasta_index,
        ref_dict = ref_dict,
        dbsnp_vcf = dbsnp_vcf,
        dbsnp_vcf_index = dbsnp_vcf_index,
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }

    call HardFilterAndMakeSitesOnlyVcf {
      input:
        vcf = GenotypeGVCFs.output_vcf,
        vcf_index = GenotypeGVCFs.output_vcf_index,
        excess_het_threshold = excess_het_threshold,
        variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz",
        sites_only_vcf_filename = callset_name + "." + idx + ".sites_only.variant_filtered.vcf.gz",
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }
  }

  call GatherVcfs as SitesOnlyGatherVcf {
    input:
      input_vcfs_fofn = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf,
      input_vcf_indexes_fofn = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index,
      output_vcf_name = callset_name + ".sites_only.vcf.gz",
      disk_size = medium_disk,
      docker = gatk_docker,
      gatk_path = gatk_path
  }

  call IndelsVariantRecalibrator {
    input:
      sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf,
      sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index,
      recalibration_filename = callset_name + ".indels.recal",
      tranches_filename = callset_name + ".indels.tranches",
      recalibration_tranche_values = indel_recalibration_tranche_values,
      recalibration_annotation_values = indel_recalibration_annotation_values,
      mills_resource_vcf = mills_resource_vcf,
      mills_resource_vcf_index = mills_resource_vcf_index,
      axiomPoly_resource_vcf = axiomPoly_resource_vcf,
      axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index,
      dbsnp_resource_vcf = dbsnp_resource_vcf,
      dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
      disk_size = small_disk,
      docker = gatk_docker,
      gatk_path = gatk_path
  }

  # Huge callsets (> 10000 gvcfs): build one downsampled SNP model, apply it
  # per shard, then gather the per-shard tranches.
  if (num_gvcfs > 10000) {
    call SNPsVariantRecalibratorCreateModel {
      input:
        sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf,
        sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index,
        recalibration_filename = callset_name + ".snps.recal",
        tranches_filename = callset_name + ".snps.tranches",
        recalibration_tranche_values = snp_recalibration_tranche_values,
        recalibration_annotation_values = snp_recalibration_annotation_values,
        downsampleFactor = SNP_VQSR_downsampleFactor,
        model_report_filename = callset_name + ".snps.model.report",
        hapmap_resource_vcf = hapmap_resource_vcf,
        hapmap_resource_vcf_index = hapmap_resource_vcf_index,
        omni_resource_vcf = omni_resource_vcf,
        omni_resource_vcf_index = omni_resource_vcf_index,
        one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
        one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
        dbsnp_resource_vcf = dbsnp_resource_vcf,
        dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
        disk_size = small_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }

    scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) {
      call SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered {
        input:
          sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx],
          sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx],
          recalibration_filename = callset_name + ".snps." + idx + ".recal",
          tranches_filename = callset_name + ".snps." + idx + ".tranches",
          recalibration_tranche_values = snp_recalibration_tranche_values,
          recalibration_annotation_values = snp_recalibration_annotation_values,
          model_report = SNPsVariantRecalibratorCreateModel.model_report,
          hapmap_resource_vcf = hapmap_resource_vcf,
          hapmap_resource_vcf_index = hapmap_resource_vcf_index,
          omni_resource_vcf = omni_resource_vcf,
          omni_resource_vcf_index = omni_resource_vcf_index,
          one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
          one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
          dbsnp_resource_vcf = dbsnp_resource_vcf,
          dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
          disk_size = small_disk,
          docker = gatk_docker,
          gatk_path = gatk_path
      }
    }

    call GatherTranches as SNPGatherTranches {
      input:
        input_fofn = SNPsVariantRecalibratorScattered.tranches,
        output_filename = callset_name + ".snps.gathered.tranches",
        disk_size = small_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }
  }

  # Smaller callsets: single classic SNP VariantRecalibrator run, no scatter.
  if (num_gvcfs <= 10000) {
    call SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic {
      input:
        sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf,
        sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index,
        recalibration_filename = callset_name + ".snps.recal",
        tranches_filename = callset_name + ".snps.tranches",
        recalibration_tranche_values = snp_recalibration_tranche_values,
        recalibration_annotation_values = snp_recalibration_annotation_values,
        hapmap_resource_vcf = hapmap_resource_vcf,
        hapmap_resource_vcf_index = hapmap_resource_vcf_index,
        omni_resource_vcf = omni_resource_vcf,
        omni_resource_vcf_index = omni_resource_vcf_index,
        one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
        one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
        dbsnp_resource_vcf = dbsnp_resource_vcf,
        dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
        disk_size = small_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }
  }

  # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly.
  # For anything larger, we need to keep the VCF sharded and gather metrics collected from them.
  Boolean is_small_callset = num_gvcfs <= 1000

  scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) {
    call ApplyRecalibration {
      input:
        recalibrated_vcf_filename = callset_name + ".filtered." + idx + ".vcf.gz",
        input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx],
        input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx],
        indels_recalibration = IndelsVariantRecalibrator.recalibration,
        indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index,
        indels_tranches = IndelsVariantRecalibrator.tranches,
        snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]),
        snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]),
        snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]),
        indel_filter_level = indel_filter_level,
        snp_filter_level = snp_filter_level,
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }

    # for large callsets we need to collect metrics from the shards and gather them later
    if (!is_small_callset) {
      call CollectVariantCallingMetrics as CollectMetricsSharded {
        input:
          input_vcf = ApplyRecalibration.recalibrated_vcf,
          input_vcf_index = ApplyRecalibration.recalibrated_vcf_index,
          metrics_filename_prefix = callset_name + "." + idx,
          dbsnp_vcf = dbsnp_vcf,
          dbsnp_vcf_index = dbsnp_vcf_index,
          interval_list = eval_interval_list,
          ref_dict = ref_dict,
          disk_size = medium_disk,
          docker = gatk_docker,
          gatk_path = gatk_path
      }
    }
  }

  # for small callsets we can gather the VCF shards and then collect metrics on it
  if (is_small_callset) {
    call GatherVcfs as FinalGatherVcf {
      input:
        input_vcfs_fofn = ApplyRecalibration.recalibrated_vcf,
        input_vcf_indexes_fofn = ApplyRecalibration.recalibrated_vcf_index,
        output_vcf_name = callset_name + ".vcf.gz",
        disk_size = huge_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }

    call CollectVariantCallingMetrics as CollectMetricsOnFullVcf {
      input:
        input_vcf = FinalGatherVcf.output_vcf,
        input_vcf_index = FinalGatherVcf.output_vcf_index,
        metrics_filename_prefix = callset_name,
        dbsnp_vcf = dbsnp_vcf,
        dbsnp_vcf_index = dbsnp_vcf_index,
        interval_list = eval_interval_list,
        ref_dict = ref_dict,
        disk_size = large_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }
  }

  # for large callsets we still need to gather the sharded metrics
  if (!is_small_callset) {
    call GatherMetrics {
      input:
        input_details_fofn = select_all(CollectMetricsSharded.detail_metrics_file),
        input_summaries_fofn = select_all(CollectMetricsSharded.summary_metrics_file),
        output_prefix = callset_name,
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }
  }

  output {
    # outputs from the small callset path through the wdl
    FinalGatherVcf.output_vcf
    FinalGatherVcf.output_vcf_index
    CollectMetricsOnFullVcf.detail_metrics_file
    CollectMetricsOnFullVcf.summary_metrics_file

    # outputs from the large callset path through the wdl
    # (note that we do not list ApplyRecalibration here because it is run in both paths)
    GatherMetrics.detail_metrics_file
    GatherMetrics.summary_metrics_file

    # output the interval list generated/used by this run workflow
    DynamicallyCombineIntervals.output_intervals
  }
}
# Count the samples in a sample-name map (one sample per line).
# NOTE(review): this task is not called by the workflow above — presumably kept
# for the sample-map input variant of this pipeline; confirm before removing.
task GetNumberOfSamples {
  File sample_name_map
  String docker

  command <<<
    wc -l ${sample_name_map} | awk '{print $1}'
  >>>
  runtime {
    # docker: docker
    memory: "1 GB"
    preemptible: 5
  }
  output {
    Int sample_count = read_int(stdout())
  }
}
# Import the per-sample GVCFs into a GenomicsDB workspace restricted to one
# interval, then tar the workspace so it can flow between tasks as a File.
task ImportGVCFs {
  Array[String] sample_names
  Array[File] input_gvcfs
  Array[File] input_gvcfs_indices
  String interval

  String workspace_dir_name

  String gatk_path
  String docker
  Int disk_size
  Int batch_size

  command <<<
    set -e
    set -o pipefail

    # Build the tab-separated sample-name map GenomicsDBImport expects;
    # abort if the name and gvcf arrays are not parallel.
    python << CODE
    gvcfs = ['${sep="','" input_gvcfs}']
    sample_names = ['${sep="','" sample_names}']

    if len(gvcfs)!= len(sample_names):
      exit(1)

    with open("inputs.list", "w") as fi:
      for i in range(len(gvcfs)):
        fi.write(sample_names[i] + "\t" + gvcfs[i] + "\n") 

    CODE

    # The memory setting here is very important and must be several GB lower
    # than the total memory allocated to the VM because this tool uses
    # a significant amount of non-heap memory for native libraries.
    # Also, testing has shown that the multithreaded reader initialization
    # does not scale well beyond 5 threads, so don't increase beyond that.
    ${gatk_path} --java-options "-Xmx4g -Xms4g" \
    GenomicsDBImport \
    --genomicsdb-workspace-path ${workspace_dir_name} \
    --batch-size ${batch_size} \
    -L ${interval} \
    --sample-name-map inputs.list \
    --reader-threads 5 \
    -ip 500

    tar -cf ${workspace_dir_name}.tar ${workspace_dir_name}
  >>>
  runtime {
    # docker: docker
    memory: "7 GB"
    cpu: "2"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File output_genomicsdb = "${workspace_dir_name}.tar"
  }
}
# Joint-genotype a tarred GenomicsDB workspace over one interval, emitting a
# multi-sample VCF (+ .tbi index) annotated against dbSNP.
task GenotypeGVCFs {
  File workspace_tar
  String interval

  String output_vcf_filename

  File ref_fasta
  File ref_fasta_index
  File ref_dict

  File dbsnp_vcf
  File dbsnp_vcf_index

  String gatk_path
  String docker
  Int disk_size

  command <<<
    set -e

    tar -xf ${workspace_tar}
    WORKSPACE=$( basename ${workspace_tar} .tar)

    ${gatk_path} --java-options "-Xmx5g -Xms5g" \
     GenotypeGVCFs \
     -R ${ref_fasta} \
     -O ${output_vcf_filename} \
     -D ${dbsnp_vcf} \
     -G StandardAnnotation \
     --only-output-calls-starting-in-intervals \
     --use-new-qual-calculator \
     -V gendb://$WORKSPACE \
     -L ${interval}
  >>>
  runtime {
    # docker: docker
    memory: "7 GB"
    cpu: "2"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File output_vcf = "${output_vcf_filename}"
    File output_vcf_index = "${output_vcf_filename}.tbi"
  }
}
# Annotate sites exceeding the ExcessHet threshold with an ExcessHet FILTER,
# then strip genotypes to produce a sites-only VCF for recalibrator training.
task HardFilterAndMakeSitesOnlyVcf {
  File vcf
  File vcf_index
  Float excess_het_threshold

  String variant_filtered_vcf_filename
  String sites_only_vcf_filename
  String gatk_path

  String docker
  Int disk_size

  command {
    set -e

    ${gatk_path} --java-options "-Xmx3g -Xms3g" \
      VariantFiltration \
      --filter-expression "ExcessHet > ${excess_het_threshold}" \
      --filter-name ExcessHet \
      -O ${variant_filtered_vcf_filename} \
      -V ${vcf}

    ${gatk_path} --java-options "-Xmx3g -Xms3g" \
      MakeSitesOnlyVcf \
      --INPUT ${variant_filtered_vcf_filename} \
      --OUTPUT ${sites_only_vcf_filename}
  }
  runtime {
    # docker: docker
    memory: "3.5 GB"
    cpu: "1"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File variant_filtered_vcf = "${variant_filtered_vcf_filename}"
    File variant_filtered_vcf_index = "${variant_filtered_vcf_filename}.tbi"
    File sites_only_vcf = "${sites_only_vcf_filename}"
    File sites_only_vcf_index = "${sites_only_vcf_filename}.tbi"
  }
}
# Build the INDEL VQSR model from the sites-only callset using the
# Mills / Axiom / dbSNP training resources.
task IndelsVariantRecalibrator {
  String recalibration_filename
  String tranches_filename

  Array[String] recalibration_tranche_values
  Array[String] recalibration_annotation_values

  File sites_only_variant_filtered_vcf
  File sites_only_variant_filtered_vcf_index

  File mills_resource_vcf
  File axiomPoly_resource_vcf
  File dbsnp_resource_vcf
  File mills_resource_vcf_index
  File axiomPoly_resource_vcf_index
  File dbsnp_resource_vcf_index

  String gatk_path
  String docker
  Int disk_size

  command {
    ${gatk_path} --java-options "-Xmx24g -Xms24g" \
      VariantRecalibrator \
      -V ${sites_only_variant_filtered_vcf} \
      -O ${recalibration_filename} \
      --tranches-file ${tranches_filename} \
      --trust-all-polymorphic \
      -tranche ${sep=' -tranche ' recalibration_tranche_values} \
      -an ${sep=' -an ' recalibration_annotation_values} \
      -mode INDEL \
      --max-gaussians 4 \
      -resource mills,known=false,training=true,truth=true,prior=12:${mills_resource_vcf} \
      -resource axiomPoly,known=false,training=true,truth=false,prior=10:${axiomPoly_resource_vcf} \
      -resource dbsnp,known=true,training=false,truth=false,prior=2:${dbsnp_resource_vcf}
  }
  runtime {
    # docker: docker
    memory: "26 GB"
    cpu: "2"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File recalibration = "${recalibration_filename}"
    File recalibration_index = "${recalibration_filename}.idx"
    File tranches = "${tranches_filename}"
  }
}
# Build (only) the SNP VQSR model on a downsampled sites-only VCF; the model
# report is later applied per shard by SNPsVariantRecalibrator. Used on the
# huge-callset path.
task SNPsVariantRecalibratorCreateModel {
  String recalibration_filename
  String tranches_filename
  Int downsampleFactor
  String model_report_filename

  Array[String] recalibration_tranche_values
  Array[String] recalibration_annotation_values

  File sites_only_variant_filtered_vcf
  File sites_only_variant_filtered_vcf_index

  File hapmap_resource_vcf
  File omni_resource_vcf
  File one_thousand_genomes_resource_vcf
  File dbsnp_resource_vcf
  File hapmap_resource_vcf_index
  File omni_resource_vcf_index
  File one_thousand_genomes_resource_vcf_index
  File dbsnp_resource_vcf_index

  String gatk_path
  String docker
  Int disk_size

  command {
    ${gatk_path} --java-options "-Xmx100g -Xms100g" \
      VariantRecalibrator \
      -V ${sites_only_variant_filtered_vcf} \
      -O ${recalibration_filename} \
      --tranches-file ${tranches_filename} \
      --trust-all-polymorphic \
      -tranche ${sep=' -tranche ' recalibration_tranche_values} \
      -an ${sep=' -an ' recalibration_annotation_values} \
      -mode SNP \
      --sample-every-Nth-variant ${downsampleFactor} \
      --output-model ${model_report_filename} \
      --max-gaussians 6 \
      -resource hapmap,known=false,training=true,truth=true,prior=15:${hapmap_resource_vcf} \
      -resource omni,known=false,training=true,truth=true,prior=12:${omni_resource_vcf} \
      -resource 1000G,known=false,training=true,truth=false,prior=10:${one_thousand_genomes_resource_vcf} \
      -resource dbsnp,known=true,training=false,truth=false,prior=7:${dbsnp_resource_vcf}
  }
  runtime {
    # docker: docker
    memory: "104 GB"
    cpu: "2"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File model_report = "${model_report_filename}"
  }
}
# Run SNP VariantRecalibrator. With a model_report this applies a pre-built
# model to one shard (and emits scatter-ready tranches); without one it is the
# classic single-shot recalibration.
task SNPsVariantRecalibrator {
  String recalibration_filename
  String tranches_filename
  File? model_report

  Array[String] recalibration_tranche_values
  Array[String] recalibration_annotation_values

  File sites_only_variant_filtered_vcf
  File sites_only_variant_filtered_vcf_index

  File hapmap_resource_vcf
  File omni_resource_vcf
  File one_thousand_genomes_resource_vcf
  File dbsnp_resource_vcf
  File hapmap_resource_vcf_index
  File omni_resource_vcf_index
  File one_thousand_genomes_resource_vcf_index
  File dbsnp_resource_vcf_index

  String gatk_path
  String docker
  Int disk_size

  command {
    ${gatk_path} --java-options "-Xmx3g -Xms3g" \
      VariantRecalibrator \
      -V ${sites_only_variant_filtered_vcf} \
      -O ${recalibration_filename} \
      --tranches-file ${tranches_filename} \
      --trust-all-polymorphic \
      -tranche ${sep=' -tranche ' recalibration_tranche_values} \
      -an ${sep=' -an ' recalibration_annotation_values} \
      -mode SNP \
      ${"--input-model " + model_report + " --output-tranches-for-scatter "} \
      --max-gaussians 6 \
      -resource hapmap,known=false,training=true,truth=true,prior=15:${hapmap_resource_vcf} \
      -resource omni,known=false,training=true,truth=true,prior=12:${omni_resource_vcf} \
      -resource 1000G,known=false,training=true,truth=false,prior=10:${one_thousand_genomes_resource_vcf} \
      -resource dbsnp,known=true,training=false,truth=false,prior=7:${dbsnp_resource_vcf}
  }
  runtime {
    # docker: docker
    memory: "3.5 GB"
    cpu: "2"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File recalibration = "${recalibration_filename}"
    File recalibration_index = "${recalibration_filename}.idx"
    File tranches = "${tranches_filename}"
  }
}
# Merge per-shard tranches files produced on the scattered SNP VQSR path.
task GatherTranches {
  Array[File] input_fofn
  String output_filename

  String gatk_path

  String docker
  Int disk_size

  command <<<
    set -e
    set -o pipefail

    ${gatk_path} --java-options "-Xmx6g -Xms6g" \
      GatherTranches \
      --input ${sep=" --input " input_fofn}  \
      --output ${output_filename}
  >>>
  runtime {
    # docker: docker
    memory: "7 GB"
    cpu: "2"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File tranches = "${output_filename}"
  }
}
# Apply VQSR: INDEL recalibration first into a temp VCF, then SNP
# recalibration on top of it (see the file header for why this order).
task ApplyRecalibration {
  String recalibrated_vcf_filename
  File input_vcf
  File input_vcf_index
  File indels_recalibration
  File indels_recalibration_index
  File indels_tranches
  File snps_recalibration
  File snps_recalibration_index
  File snps_tranches

  Float indel_filter_level
  Float snp_filter_level

  String gatk_path
  String docker
  Int disk_size

  command {
    set -e

    ${gatk_path} --java-options "-Xmx5g -Xms5g" \
      ApplyVQSR \
      -O tmp.indel.recalibrated.vcf \
      -V ${input_vcf} \
      --recal-file ${indels_recalibration} \
      --tranches-file ${indels_tranches} \
      --truth-sensitivity-filter-level ${indel_filter_level} \
      --create-output-variant-index true \
      -mode INDEL

    ${gatk_path} --java-options "-Xmx5g -Xms5g" \
      ApplyVQSR \
      -O ${recalibrated_vcf_filename} \
      -V tmp.indel.recalibrated.vcf \
      --recal-file ${snps_recalibration} \
      --tranches-file ${snps_tranches} \
      --truth-sensitivity-filter-level ${snp_filter_level} \
      --create-output-variant-index true \
      -mode SNP
  }
  runtime {
    # docker: docker
    memory: "7 GB"
    cpu: "1"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File recalibrated_vcf = "${recalibrated_vcf_filename}"
    File recalibrated_vcf_index = "${recalibrated_vcf_filename}.tbi"
  }
}
# Block-gather positionally-sorted VCF shards into one VCF and index it.
task GatherVcfs {
  Array[File] input_vcfs_fofn
  Array[File] input_vcf_indexes_fofn
  String output_vcf_name

  String gatk_path
  String docker
  Int disk_size

  command <<<
    set -e
    set -o pipefail

    # ignoreSafetyChecks make a big performance difference so we include it in our invocation
    ${gatk_path} --java-options "-Xmx6g -Xms6g" \
    GatherVcfsCloud \
    --ignore-safety-checks \
    --gather-type BLOCK \
    --input ${sep=" --input " input_vcfs_fofn} \
    --output ${output_vcf_name}

    ${gatk_path} --java-options "-Xmx6g -Xms6g" \
    IndexFeatureFile \
    --feature-file ${output_vcf_name}
  >>>
  runtime {
    # docker: docker
    memory: "7 GB"
    cpu: "1"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File output_vcf = "${output_vcf_name}"
    File output_vcf_index = "${output_vcf_name}.tbi"
  }
}
# Collect Picard-style variant calling detail/summary metrics for one VCF
# against dbSNP over the evaluation intervals.
task CollectVariantCallingMetrics {
  File input_vcf
  File input_vcf_index

  String metrics_filename_prefix
  File dbsnp_vcf
  File dbsnp_vcf_index
  File interval_list
  File ref_dict

  String gatk_path
  String docker
  Int disk_size

  command {
    ${gatk_path} --java-options "-Xmx6g -Xms6g" \
      CollectVariantCallingMetrics \
      --INPUT ${input_vcf} \
      --DBSNP ${dbsnp_vcf} \
      --SEQUENCE_DICTIONARY ${ref_dict} \
      --OUTPUT ${metrics_filename_prefix} \
      --THREAD_COUNT 8 \
      --TARGET_INTERVALS ${interval_list}
  }
  output {
    File detail_metrics_file = "${metrics_filename_prefix}.variant_calling_detail_metrics"
    File summary_metrics_file = "${metrics_filename_prefix}.variant_calling_summary_metrics"
  }
  runtime {
    # docker: docker
    memory: "7 GB"
    cpu: 2
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
}
# Accumulate sharded variant-calling metrics into a single detail/summary pair.
# NOTE(review): input_summaries_fofn is declared (and localized) but never
# referenced by the command; AccumulateVariantCallingMetrics appears to locate
# the summary files from the detail inputs — confirm against the tool docs.
task GatherMetrics {
  Array[File] input_details_fofn
  Array[File] input_summaries_fofn

  String output_prefix

  String gatk_path
  String docker
  Int disk_size

  command <<<
    set -e
    set -o pipefail

    ${gatk_path} --java-options "-Xmx2g -Xms2g" \
    AccumulateVariantCallingMetrics \
    --INPUT ${sep=" --INPUT " input_details_fofn} \
    --OUTPUT ${output_prefix}
  >>>
  runtime {
    # docker: docker
    memory: "3 GB"
    cpu: "1"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File detail_metrics_file = "${output_prefix}.variant_calling_detail_metrics"
    File summary_metrics_file = "${output_prefix}.variant_calling_summary_metrics"
  }
}
# Combine runs of up to merge_count adjacent intervals (same chromosome,
# contiguous chr:start-end coordinates) into single intervals, shrinking the
# scatter width for the import/genotyping steps.
# Fix: the original initialized l_chr but assigned/read a typo'd l_char; if no
# chain was ever flushed before EOF the final comparison raised NameError. The
# name is now consistently l_chr.
task DynamicallyCombineIntervals {
  File intervals
  Int merge_count

  command {
    python2 << CODE
    def parse_interval(interval):
        colon_split = interval.split(":")
        chromosome = colon_split[0]
        dash_split = colon_split[1].split("-")
        start = int(dash_split[0])
        end = int(dash_split[1])
        return chromosome, start, end

    def add_interval(chr, start, end):
        lines_to_write.append(chr + ":" + str(start) + "-" + str(end))
        return chr, start, end

    count = 0
    chain_count = ${merge_count}
    l_chr, l_start, l_end = "", 0, 0
    lines_to_write = []
    with open("${intervals}") as f:
        with open("out.intervals", "w") as f1:
            for line in f.readlines():
                # initialization
                if count == 0:
                    w_chr, w_start, w_end = parse_interval(line)
                    count = 1
                    continue
                # reached number to combine, so spit out and start over
                if count == chain_count:
                    l_chr, l_start, l_end = add_interval(w_chr, w_start, w_end)
                    w_chr, w_start, w_end = parse_interval(line)
                    count = 1
                    continue

                c_chr, c_start, c_end = parse_interval(line)
                # if adjacent keep the chain going
                if c_chr == w_chr and c_start == w_end + 1:
                    w_end = c_end
                    count += 1
                    continue
                # not adjacent, end here and start a new chain
                else:
                    l_chr, l_start, l_end = add_interval(w_chr, w_start, w_end)
                    w_chr, w_start, w_end = parse_interval(line)
                    count = 1
            # flush the trailing chain unless it was the one just written
            if l_chr != w_chr or l_start != w_start or l_end != w_end:
                add_interval(w_chr, w_start, w_end)
            f1.writelines("\n".join(lines_to_write))
    CODE
  }

  runtime {
    memory: "3 GB"
    preemptible: 5
    # docker: "python:2.7"
  }

  output {
    File output_intervals = "out.intervals"
  }
}