# modified: pixi.toml
# [GalaxyCodeBases.git] / etc / gatk-wdl / germline2-joint-discovery-gatk4-local.wdl
# blob 25ea0b75ba7e3a9df40d7bb93c56e3317315b8f1
## Copyright Broad Institute, 2018
##
## This WDL implements the joint discovery and VQSR filtering portion of the GATK
## Best Practices (June 2016) for germline SNP and Indel discovery in human
## whole-genome sequencing (WGS) and exome sequencing data.
##
## Requirements/expectations :
## - One or more GVCFs produced by HaplotypeCaller in GVCF mode
## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported.
##
## Outputs :
## - A VCF file and its index, filtered using variant quality score recalibration
##   (VQSR) with genotypes for all samples present in the input VCF. All sites that
##   are present in the input VCF are retained; filtered sites are annotated as such
##   in the FILTER field.
##
## Note about VQSR wiring :
## The SNP and INDEL models are built in parallel, but then the corresponding
## recalibrations are applied in series. Because the INDEL model is generally ready
## first (because there are fewer indels than SNPs) we set INDEL recalibration to
## be applied first to the input VCF, while the SNP model is still being built. By
## the time the SNP model is available, the indel-recalibrated file is available to
## serve as input to apply the SNP recalibration. If we did it the other way around,
## we would have to wait until the SNP recal file was available despite the INDEL
## recal file being there already, then apply SNP recalibration, then apply INDEL
## recalibration. This would lead to a longer wall clock time for complete workflow
## execution. Wiring the INDEL recalibration to be applied first solves the problem.
##
## Cromwell version support
## - Successfully tested on v31
## - Does not work on versions < v23 due to output syntax
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
## For program versions, see docker containers.
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker
## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
## licensing information pertaining to the included programs.

# Run with:
# JAVA_OPTS="-Dsystem.input-read-limits.lines=500000 -Dbackend.providers.Local.config.concurrent-job-limit=16" cromwell run germline2-joint-discovery-gatk4-local.wdl -i germline2-joint-discovery-gatk4-local.hg38.wgs.inputs.json > cromwell.germline2.log &
# Joint genotyping + VQSR filtering of one or more HaplotypeCaller GVCFs.
# Produces a filtered multi-sample VCF plus variant-calling metrics; see the
# file header for the overall VQSR wiring rationale.
workflow JointGenotyping {
  File unpadded_intervals_file

  String callset_name

  File ref_fasta
  File ref_fasta_index
  File ref_dict

  File dbsnp_vcf
  File dbsnp_vcf_index

  Array[String] sample_names
  Array[File] input_gvcfs
  Array[File] input_gvcfs_indices

  String gatk_docker
  String gatk_path

  Int small_disk
  Int medium_disk
  Int large_disk
  Int huge_disk

  Array[String] snp_recalibration_tranche_values
  Array[String] snp_recalibration_annotation_values
  Array[String] indel_recalibration_tranche_values
  Array[String] indel_recalibration_annotation_values

  File eval_interval_list
  File hapmap_resource_vcf
  File hapmap_resource_vcf_index
  File omni_resource_vcf
  File omni_resource_vcf_index
  File one_thousand_genomes_resource_vcf
  File one_thousand_genomes_resource_vcf_index
  File mills_resource_vcf
  File mills_resource_vcf_index
  File axiomPoly_resource_vcf
  File axiomPoly_resource_vcf_index
  File dbsnp_resource_vcf = dbsnp_vcf
  File dbsnp_resource_vcf_index = dbsnp_vcf_index

  # ExcessHet is a phred-scaled p-value. We want a cutoff of anything more extreme
  # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69
  Float excess_het_threshold = 54.69
  Float snp_filter_level
  Float indel_filter_level
  Int SNP_VQSR_downsampleFactor

  Int num_of_original_intervals = length(read_lines(unpadded_intervals_file))
  Int num_gvcfs = length(input_gvcfs)

  # Make a 2.5:1 interval number to samples in callset ratio interval list
  Int possible_merge_count = floor(num_of_original_intervals / num_gvcfs / 2.5)
  Int merge_count = if possible_merge_count > 1 then possible_merge_count else 1

  call DynamicallyCombineIntervals {
    input:
      intervals = unpadded_intervals_file,
      merge_count = merge_count
  }

  Array[String] unpadded_intervals = read_lines(DynamicallyCombineIntervals.output_intervals)

  scatter (idx in range(length(unpadded_intervals))) {
    # the batch_size value was carefully chosen here as it
    # is the optimal value for the amount of memory allocated
    # within the task; please do not change it without consulting
    # the Hellbender (GATK engine) team!
    call ImportGVCFs {
      input:
        sample_names = sample_names,
        interval = unpadded_intervals[idx],
        workspace_dir_name = "genomicsdb",
        input_gvcfs = input_gvcfs,
        input_gvcfs_indices = input_gvcfs_indices,
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path,
        batch_size = 50
    }

    call GenotypeGVCFs {
      input:
        workspace_tar = ImportGVCFs.output_genomicsdb,
        interval = unpadded_intervals[idx],
        output_vcf_filename = "output.vcf.gz",
        ref_fasta = ref_fasta,
        ref_fasta_index = ref_fasta_index,
        ref_dict = ref_dict,
        dbsnp_vcf = dbsnp_vcf,
        dbsnp_vcf_index = dbsnp_vcf_index,
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }

    call HardFilterAndMakeSitesOnlyVcf {
      input:
        vcf = GenotypeGVCFs.output_vcf,
        vcf_index = GenotypeGVCFs.output_vcf_index,
        excess_het_threshold = excess_het_threshold,
        variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz",
        sites_only_vcf_filename = callset_name + "." + idx + ".sites_only.variant_filtered.vcf.gz",
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }
  }

  call GatherVcfs as SitesOnlyGatherVcf {
    input:
      input_vcfs_fofn = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf,
      input_vcf_indexes_fofn = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index,
      output_vcf_name = callset_name + ".sites_only.vcf.gz",
      disk_size = medium_disk,
      docker = gatk_docker,
      gatk_path = gatk_path
  }

  call IndelsVariantRecalibrator {
    input:
      sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf,
      sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index,
      recalibration_filename = callset_name + ".indels.recal",
      tranches_filename = callset_name + ".indels.tranches",
      recalibration_tranche_values = indel_recalibration_tranche_values,
      recalibration_annotation_values = indel_recalibration_annotation_values,
      mills_resource_vcf = mills_resource_vcf,
      mills_resource_vcf_index = mills_resource_vcf_index,
      axiomPoly_resource_vcf = axiomPoly_resource_vcf,
      axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index,
      dbsnp_resource_vcf = dbsnp_resource_vcf,
      dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
      disk_size = small_disk,
      docker = gatk_docker,
      gatk_path = gatk_path
  }

  # Huge callsets (> 10000 gvcfs): build one downsampled SNP model, apply it
  # per shard, then gather the per-shard tranches.
  if (num_gvcfs > 10000) {
    call SNPsVariantRecalibratorCreateModel {
      input:
        sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf,
        sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index,
        recalibration_filename = callset_name + ".snps.recal",
        tranches_filename = callset_name + ".snps.tranches",
        recalibration_tranche_values = snp_recalibration_tranche_values,
        recalibration_annotation_values = snp_recalibration_annotation_values,
        downsampleFactor = SNP_VQSR_downsampleFactor,
        model_report_filename = callset_name + ".snps.model.report",
        hapmap_resource_vcf = hapmap_resource_vcf,
        hapmap_resource_vcf_index = hapmap_resource_vcf_index,
        omni_resource_vcf = omni_resource_vcf,
        omni_resource_vcf_index = omni_resource_vcf_index,
        one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
        one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
        dbsnp_resource_vcf = dbsnp_resource_vcf,
        dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
        disk_size = small_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }

    scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) {
      call SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered {
        input:
          sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx],
          sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx],
          recalibration_filename = callset_name + ".snps." + idx + ".recal",
          tranches_filename = callset_name + ".snps." + idx + ".tranches",
          recalibration_tranche_values = snp_recalibration_tranche_values,
          recalibration_annotation_values = snp_recalibration_annotation_values,
          model_report = SNPsVariantRecalibratorCreateModel.model_report,
          hapmap_resource_vcf = hapmap_resource_vcf,
          hapmap_resource_vcf_index = hapmap_resource_vcf_index,
          omni_resource_vcf = omni_resource_vcf,
          omni_resource_vcf_index = omni_resource_vcf_index,
          one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
          one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
          dbsnp_resource_vcf = dbsnp_resource_vcf,
          dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
          disk_size = small_disk,
          docker = gatk_docker,
          gatk_path = gatk_path
      }
    }

    call GatherTranches as SNPGatherTranches {
      input:
        input_fofn = SNPsVariantRecalibratorScattered.tranches,
        output_filename = callset_name + ".snps.gathered.tranches",
        disk_size = small_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }
  }

  # Smaller callsets: single classic SNP VariantRecalibrator run, no scatter.
  if (num_gvcfs <= 10000) {
    call SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic {
      input:
        sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf,
        sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index,
        recalibration_filename = callset_name + ".snps.recal",
        tranches_filename = callset_name + ".snps.tranches",
        recalibration_tranche_values = snp_recalibration_tranche_values,
        recalibration_annotation_values = snp_recalibration_annotation_values,
        hapmap_resource_vcf = hapmap_resource_vcf,
        hapmap_resource_vcf_index = hapmap_resource_vcf_index,
        omni_resource_vcf = omni_resource_vcf,
        omni_resource_vcf_index = omni_resource_vcf_index,
        one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
        one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
        dbsnp_resource_vcf = dbsnp_resource_vcf,
        dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
        disk_size = small_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }
  }

  # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly.
  # For anything larger, we need to keep the VCF sharded and gather metrics collected from them.
  Boolean is_small_callset = num_gvcfs <= 1000

  scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) {
    call ApplyRecalibration {
      input:
        recalibrated_vcf_filename = callset_name + ".filtered." + idx + ".vcf.gz",
        input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx],
        input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx],
        indels_recalibration = IndelsVariantRecalibrator.recalibration,
        indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index,
        indels_tranches = IndelsVariantRecalibrator.tranches,
        snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]),
        snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]),
        snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]),
        indel_filter_level = indel_filter_level,
        snp_filter_level = snp_filter_level,
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }

    # for large callsets we need to collect metrics from the shards and gather them later
    if (!is_small_callset) {
      call CollectVariantCallingMetrics as CollectMetricsSharded {
        input:
          input_vcf = ApplyRecalibration.recalibrated_vcf,
          input_vcf_index = ApplyRecalibration.recalibrated_vcf_index,
          metrics_filename_prefix = callset_name + "." + idx,
          dbsnp_vcf = dbsnp_vcf,
          dbsnp_vcf_index = dbsnp_vcf_index,
          interval_list = eval_interval_list,
          ref_dict = ref_dict,
          disk_size = medium_disk,
          docker = gatk_docker,
          gatk_path = gatk_path
      }
    }
  }

  # for small callsets we can gather the VCF shards and then collect metrics on it
  if (is_small_callset) {
    call GatherVcfs as FinalGatherVcf {
      input:
        input_vcfs_fofn = ApplyRecalibration.recalibrated_vcf,
        input_vcf_indexes_fofn = ApplyRecalibration.recalibrated_vcf_index,
        output_vcf_name = callset_name + ".vcf.gz",
        disk_size = huge_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }

    call CollectVariantCallingMetrics as CollectMetricsOnFullVcf {
      input:
        input_vcf = FinalGatherVcf.output_vcf,
        input_vcf_index = FinalGatherVcf.output_vcf_index,
        metrics_filename_prefix = callset_name,
        dbsnp_vcf = dbsnp_vcf,
        dbsnp_vcf_index = dbsnp_vcf_index,
        interval_list = eval_interval_list,
        ref_dict = ref_dict,
        disk_size = large_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }
  }

  # for large callsets we still need to gather the sharded metrics
  if (!is_small_callset) {
    call GatherMetrics {
      input:
        input_details_fofn = select_all(CollectMetricsSharded.detail_metrics_file),
        input_summaries_fofn = select_all(CollectMetricsSharded.summary_metrics_file),
        output_prefix = callset_name,
        disk_size = medium_disk,
        docker = gatk_docker,
        gatk_path = gatk_path
    }
  }

  output {
    # outputs from the small callset path through the wdl
    FinalGatherVcf.output_vcf
    FinalGatherVcf.output_vcf_index
    CollectMetricsOnFullVcf.detail_metrics_file
    CollectMetricsOnFullVcf.summary_metrics_file

    # outputs from the large callset path through the wdl
    # (note that we do not list ApplyRecalibration here because it is run in both paths)
    GatherMetrics.detail_metrics_file
    GatherMetrics.summary_metrics_file

    # output the interval list generated/used by this run workflow
    DynamicallyCombineIntervals.output_intervals
  }
}
# Count the samples in a sample-name map (one sample per line).
# NOTE(review): this task is not called by the workflow above — presumably kept
# for the sample-map input variant of this pipeline; confirm before removing.
task GetNumberOfSamples {
  File sample_name_map
  String docker

  command <<<
    wc -l ${sample_name_map} | awk '{print $1}'
  >>>
  runtime {
    # docker: docker
    memory: "1 GB"
    preemptible: 5
  }
  output {
    Int sample_count = read_int(stdout())
  }
}
# Import the per-sample GVCFs into a GenomicsDB workspace restricted to one
# interval, then tar the workspace so it can flow between tasks as a File.
task ImportGVCFs {
  Array[String] sample_names
  Array[File] input_gvcfs
  Array[File] input_gvcfs_indices
  String interval

  String workspace_dir_name

  String gatk_path
  String docker
  Int disk_size
  Int batch_size

  command <<<
    set -e
    set -o pipefail

    # Build the tab-separated sample-name map GenomicsDBImport expects;
    # abort if the name and gvcf arrays are not parallel.
    python << CODE
    gvcfs = ['${sep="','" input_gvcfs}']
    sample_names = ['${sep="','" sample_names}']

    if len(gvcfs)!= len(sample_names):
      exit(1)

    with open("inputs.list", "w") as fi:
      for i in range(len(gvcfs)):
        fi.write(sample_names[i] + "\t" + gvcfs[i] + "\n") 

    CODE

    # The memory setting here is very important and must be several GB lower
    # than the total memory allocated to the VM because this tool uses
    # a significant amount of non-heap memory for native libraries.
    # Also, testing has shown that the multithreaded reader initialization
    # does not scale well beyond 5 threads, so don't increase beyond that.
    ${gatk_path} --java-options "-Xmx4g -Xms4g" \
    GenomicsDBImport \
    --genomicsdb-workspace-path ${workspace_dir_name} \
    --batch-size ${batch_size} \
    -L ${interval} \
    --sample-name-map inputs.list \
    --reader-threads 5 \
    -ip 500

    tar -cf ${workspace_dir_name}.tar ${workspace_dir_name}
  >>>
  runtime {
    # docker: docker
    memory: "7 GB"
    cpu: "2"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File output_genomicsdb = "${workspace_dir_name}.tar"
  }
}
# Joint-genotype a tarred GenomicsDB workspace over one interval, emitting a
# multi-sample VCF (+ .tbi index) annotated against dbSNP.
task GenotypeGVCFs {
  File workspace_tar
  String interval

  String output_vcf_filename

  File ref_fasta
  File ref_fasta_index
  File ref_dict

  File dbsnp_vcf
  File dbsnp_vcf_index

  String gatk_path
  String docker
  Int disk_size

  command <<<
    set -e

    tar -xf ${workspace_tar}
    WORKSPACE=$( basename ${workspace_tar} .tar)

    ${gatk_path} --java-options "-Xmx5g -Xms5g" \
     GenotypeGVCFs \
     -R ${ref_fasta} \
     -O ${output_vcf_filename} \
     -D ${dbsnp_vcf} \
     -G StandardAnnotation \
     --only-output-calls-starting-in-intervals \
     --use-new-qual-calculator \
     -V gendb://$WORKSPACE \
     -L ${interval}
  >>>
  runtime {
    # docker: docker
    memory: "7 GB"
    cpu: "2"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File output_vcf = "${output_vcf_filename}"
    File output_vcf_index = "${output_vcf_filename}.tbi"
  }
}
# Annotate sites exceeding the ExcessHet threshold with an ExcessHet FILTER,
# then strip genotypes to produce a sites-only VCF for recalibrator training.
task HardFilterAndMakeSitesOnlyVcf {
  File vcf
  File vcf_index
  Float excess_het_threshold

  String variant_filtered_vcf_filename
  String sites_only_vcf_filename
  String gatk_path

  String docker
  Int disk_size

  command {
    set -e

    ${gatk_path} --java-options "-Xmx3g -Xms3g" \
      VariantFiltration \
      --filter-expression "ExcessHet > ${excess_het_threshold}" \
      --filter-name ExcessHet \
      -O ${variant_filtered_vcf_filename} \
      -V ${vcf}

    ${gatk_path} --java-options "-Xmx3g -Xms3g" \
      MakeSitesOnlyVcf \
      --INPUT ${variant_filtered_vcf_filename} \
      --OUTPUT ${sites_only_vcf_filename}
  }
  runtime {
    # docker: docker
    memory: "3.5 GB"
    cpu: "1"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File variant_filtered_vcf = "${variant_filtered_vcf_filename}"
    File variant_filtered_vcf_index = "${variant_filtered_vcf_filename}.tbi"
    File sites_only_vcf = "${sites_only_vcf_filename}"
    File sites_only_vcf_index = "${sites_only_vcf_filename}.tbi"
  }
}
# Build the INDEL VQSR model from the sites-only callset using the
# Mills / Axiom / dbSNP training resources.
task IndelsVariantRecalibrator {
  String recalibration_filename
  String tranches_filename

  Array[String] recalibration_tranche_values
  Array[String] recalibration_annotation_values

  File sites_only_variant_filtered_vcf
  File sites_only_variant_filtered_vcf_index

  File mills_resource_vcf
  File axiomPoly_resource_vcf
  File dbsnp_resource_vcf
  File mills_resource_vcf_index
  File axiomPoly_resource_vcf_index
  File dbsnp_resource_vcf_index

  String gatk_path
  String docker
  Int disk_size

  command {
    ${gatk_path} --java-options "-Xmx24g -Xms24g" \
      VariantRecalibrator \
      -V ${sites_only_variant_filtered_vcf} \
      -O ${recalibration_filename} \
      --tranches-file ${tranches_filename} \
      --trust-all-polymorphic \
      -tranche ${sep=' -tranche ' recalibration_tranche_values} \
      -an ${sep=' -an ' recalibration_annotation_values} \
      -mode INDEL \
      --max-gaussians 4 \
      -resource mills,known=false,training=true,truth=true,prior=12:${mills_resource_vcf} \
      -resource axiomPoly,known=false,training=true,truth=false,prior=10:${axiomPoly_resource_vcf} \
      -resource dbsnp,known=true,training=false,truth=false,prior=2:${dbsnp_resource_vcf}
  }
  runtime {
    # docker: docker
    memory: "26 GB"
    cpu: "2"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File recalibration = "${recalibration_filename}"
    File recalibration_index = "${recalibration_filename}.idx"
    File tranches = "${tranches_filename}"
  }
}
# Build (only) the SNP VQSR model on a downsampled sites-only VCF; the model
# report is later applied per shard by SNPsVariantRecalibrator. Used on the
# huge-callset path.
task SNPsVariantRecalibratorCreateModel {
  String recalibration_filename
  String tranches_filename
  Int downsampleFactor
  String model_report_filename

  Array[String] recalibration_tranche_values
  Array[String] recalibration_annotation_values

  File sites_only_variant_filtered_vcf
  File sites_only_variant_filtered_vcf_index

  File hapmap_resource_vcf
  File omni_resource_vcf
  File one_thousand_genomes_resource_vcf
  File dbsnp_resource_vcf
  File hapmap_resource_vcf_index
  File omni_resource_vcf_index
  File one_thousand_genomes_resource_vcf_index
  File dbsnp_resource_vcf_index

  String gatk_path
  String docker
  Int disk_size

  command {
    ${gatk_path} --java-options "-Xmx100g -Xms100g" \
      VariantRecalibrator \
      -V ${sites_only_variant_filtered_vcf} \
      -O ${recalibration_filename} \
      --tranches-file ${tranches_filename} \
      --trust-all-polymorphic \
      -tranche ${sep=' -tranche ' recalibration_tranche_values} \
      -an ${sep=' -an ' recalibration_annotation_values} \
      -mode SNP \
      --sample-every-Nth-variant ${downsampleFactor} \
      --output-model ${model_report_filename} \
      --max-gaussians 6 \
      -resource hapmap,known=false,training=true,truth=true,prior=15:${hapmap_resource_vcf} \
      -resource omni,known=false,training=true,truth=true,prior=12:${omni_resource_vcf} \
      -resource 1000G,known=false,training=true,truth=false,prior=10:${one_thousand_genomes_resource_vcf} \
      -resource dbsnp,known=true,training=false,truth=false,prior=7:${dbsnp_resource_vcf}
  }
  runtime {
    # docker: docker
    memory: "104 GB"
    cpu: "2"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File model_report = "${model_report_filename}"
  }
}
# Run SNP VariantRecalibrator. With a model_report this applies a pre-built
# model to one shard (and emits scatter-ready tranches); without one it is the
# classic single-shot recalibration.
task SNPsVariantRecalibrator {
  String recalibration_filename
  String tranches_filename
  File? model_report

  Array[String] recalibration_tranche_values
  Array[String] recalibration_annotation_values

  File sites_only_variant_filtered_vcf
  File sites_only_variant_filtered_vcf_index

  File hapmap_resource_vcf
  File omni_resource_vcf
  File one_thousand_genomes_resource_vcf
  File dbsnp_resource_vcf
  File hapmap_resource_vcf_index
  File omni_resource_vcf_index
  File one_thousand_genomes_resource_vcf_index
  File dbsnp_resource_vcf_index

  String gatk_path
  String docker
  Int disk_size

  command {
    ${gatk_path} --java-options "-Xmx3g -Xms3g" \
      VariantRecalibrator \
      -V ${sites_only_variant_filtered_vcf} \
      -O ${recalibration_filename} \
      --tranches-file ${tranches_filename} \
      --trust-all-polymorphic \
      -tranche ${sep=' -tranche ' recalibration_tranche_values} \
      -an ${sep=' -an ' recalibration_annotation_values} \
      -mode SNP \
      ${"--input-model " + model_report + " --output-tranches-for-scatter "} \
      --max-gaussians 6 \
      -resource hapmap,known=false,training=true,truth=true,prior=15:${hapmap_resource_vcf} \
      -resource omni,known=false,training=true,truth=true,prior=12:${omni_resource_vcf} \
      -resource 1000G,known=false,training=true,truth=false,prior=10:${one_thousand_genomes_resource_vcf} \
      -resource dbsnp,known=true,training=false,truth=false,prior=7:${dbsnp_resource_vcf}
  }
  runtime {
    # docker: docker
    memory: "3.5 GB"
    cpu: "2"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File recalibration = "${recalibration_filename}"
    File recalibration_index = "${recalibration_filename}.idx"
    File tranches = "${tranches_filename}"
  }
}
# Merge per-shard tranches files produced on the scattered SNP VQSR path.
task GatherTranches {
  Array[File] input_fofn
  String output_filename

  String gatk_path

  String docker
  Int disk_size

  command <<<
    set -e
    set -o pipefail

    ${gatk_path} --java-options "-Xmx6g -Xms6g" \
      GatherTranches \
      --input ${sep=" --input " input_fofn}  \
      --output ${output_filename}
  >>>
  runtime {
    # docker: docker
    memory: "7 GB"
    cpu: "2"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File tranches = "${output_filename}"
  }
}
# Apply VQSR: INDEL recalibration first into a temp VCF, then SNP
# recalibration on top of it (see the file header for why this order).
task ApplyRecalibration {
  String recalibrated_vcf_filename
  File input_vcf
  File input_vcf_index
  File indels_recalibration
  File indels_recalibration_index
  File indels_tranches
  File snps_recalibration
  File snps_recalibration_index
  File snps_tranches

  Float indel_filter_level
  Float snp_filter_level

  String gatk_path
  String docker
  Int disk_size

  command {
    set -e

    ${gatk_path} --java-options "-Xmx5g -Xms5g" \
      ApplyVQSR \
      -O tmp.indel.recalibrated.vcf \
      -V ${input_vcf} \
      --recal-file ${indels_recalibration} \
      --tranches-file ${indels_tranches} \
      --truth-sensitivity-filter-level ${indel_filter_level} \
      --create-output-variant-index true \
      -mode INDEL

    ${gatk_path} --java-options "-Xmx5g -Xms5g" \
      ApplyVQSR \
      -O ${recalibrated_vcf_filename} \
      -V tmp.indel.recalibrated.vcf \
      --recal-file ${snps_recalibration} \
      --tranches-file ${snps_tranches} \
      --truth-sensitivity-filter-level ${snp_filter_level} \
      --create-output-variant-index true \
      -mode SNP
  }
  runtime {
    # docker: docker
    memory: "7 GB"
    cpu: "1"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File recalibrated_vcf = "${recalibrated_vcf_filename}"
    File recalibrated_vcf_index = "${recalibrated_vcf_filename}.tbi"
  }
}
# Block-gather positionally-sorted VCF shards into one VCF and index it.
task GatherVcfs {
  Array[File] input_vcfs_fofn
  Array[File] input_vcf_indexes_fofn
  String output_vcf_name

  String gatk_path
  String docker
  Int disk_size

  command <<<
    set -e
    set -o pipefail

    # ignoreSafetyChecks make a big performance difference so we include it in our invocation
    ${gatk_path} --java-options "-Xmx6g -Xms6g" \
    GatherVcfsCloud \
    --ignore-safety-checks \
    --gather-type BLOCK \
    --input ${sep=" --input " input_vcfs_fofn} \
    --output ${output_vcf_name}

    ${gatk_path} --java-options "-Xmx6g -Xms6g" \
    IndexFeatureFile \
    --feature-file ${output_vcf_name}
  >>>
  runtime {
    # docker: docker
    memory: "7 GB"
    cpu: "1"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File output_vcf = "${output_vcf_name}"
    File output_vcf_index = "${output_vcf_name}.tbi"
  }
}
# Collect Picard-style variant calling detail/summary metrics for one VCF
# against dbSNP over the evaluation intervals.
task CollectVariantCallingMetrics {
  File input_vcf
  File input_vcf_index

  String metrics_filename_prefix
  File dbsnp_vcf
  File dbsnp_vcf_index
  File interval_list
  File ref_dict

  String gatk_path
  String docker
  Int disk_size

  command {
    ${gatk_path} --java-options "-Xmx6g -Xms6g" \
      CollectVariantCallingMetrics \
      --INPUT ${input_vcf} \
      --DBSNP ${dbsnp_vcf} \
      --SEQUENCE_DICTIONARY ${ref_dict} \
      --OUTPUT ${metrics_filename_prefix} \
      --THREAD_COUNT 8 \
      --TARGET_INTERVALS ${interval_list}
  }
  output {
    File detail_metrics_file = "${metrics_filename_prefix}.variant_calling_detail_metrics"
    File summary_metrics_file = "${metrics_filename_prefix}.variant_calling_summary_metrics"
  }
  runtime {
    # docker: docker
    memory: "7 GB"
    cpu: 2
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
}
# Accumulate sharded variant-calling metrics into a single detail/summary pair.
# NOTE(review): input_summaries_fofn is declared (and localized) but never
# referenced by the command; AccumulateVariantCallingMetrics appears to locate
# the summary files from the detail inputs — confirm against the tool docs.
task GatherMetrics {
  Array[File] input_details_fofn
  Array[File] input_summaries_fofn

  String output_prefix

  String gatk_path
  String docker
  Int disk_size

  command <<<
    set -e
    set -o pipefail

    ${gatk_path} --java-options "-Xmx2g -Xms2g" \
    AccumulateVariantCallingMetrics \
    --INPUT ${sep=" --INPUT " input_details_fofn} \
    --OUTPUT ${output_prefix}
  >>>
  runtime {
    # docker: docker
    memory: "3 GB"
    cpu: "1"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: 5
  }
  output {
    File detail_metrics_file = "${output_prefix}.variant_calling_detail_metrics"
    File summary_metrics_file = "${output_prefix}.variant_calling_summary_metrics"
  }
}
# Combine runs of up to merge_count adjacent intervals (same chromosome,
# contiguous chr:start-end coordinates) into single intervals, shrinking the
# scatter width for the import/genotyping steps.
# Fix: the original initialized l_chr but assigned/read a typo'd l_char; if no
# chain was ever flushed before EOF the final comparison raised NameError. The
# name is now consistently l_chr.
task DynamicallyCombineIntervals {
  File intervals
  Int merge_count

  command {
    python2 << CODE
    def parse_interval(interval):
        colon_split = interval.split(":")
        chromosome = colon_split[0]
        dash_split = colon_split[1].split("-")
        start = int(dash_split[0])
        end = int(dash_split[1])
        return chromosome, start, end

    def add_interval(chr, start, end):
        lines_to_write.append(chr + ":" + str(start) + "-" + str(end))
        return chr, start, end

    count = 0
    chain_count = ${merge_count}
    l_chr, l_start, l_end = "", 0, 0
    lines_to_write = []
    with open("${intervals}") as f:
        with open("out.intervals", "w") as f1:
            for line in f.readlines():
                # initialization
                if count == 0:
                    w_chr, w_start, w_end = parse_interval(line)
                    count = 1
                    continue
                # reached number to combine, so spit out and start over
                if count == chain_count:
                    l_chr, l_start, l_end = add_interval(w_chr, w_start, w_end)
                    w_chr, w_start, w_end = parse_interval(line)
                    count = 1
                    continue

                c_chr, c_start, c_end = parse_interval(line)
                # if adjacent keep the chain going
                if c_chr == w_chr and c_start == w_end + 1:
                    w_end = c_end
                    count += 1
                    continue
                # not adjacent, end here and start a new chain
                else:
                    l_chr, l_start, l_end = add_interval(w_chr, w_start, w_end)
                    w_chr, w_start, w_end = parse_interval(line)
                    count = 1
            # flush the trailing chain unless it was the one just written
            if l_chr != w_chr or l_start != w_start or l_end != w_end:
                add_interval(w_chr, w_start, w_end)
            f1.writelines("\n".join(lines_to_write))
    CODE
  }

  runtime {
    memory: "3 GB"
    preemptible: 5
    # docker: "python:2.7"
  }

  output {
    File output_intervals = "out.intervals"
  }
}