etc/gatk-wdl/fm2gatk/germline.wdl

   1 version 1.0
   2
   3 # Copyright (c) 2018 Leiden University Medical Center
   4 #
   5 # Permission is hereby granted, free of charge, to any person obtaining a copy
   6 # of this software and associated documentation files (the "Software"), to deal
   7 # in the Software without restriction, including without limitation the rights
   8 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   9 # copies of the Software, and to permit persons to whom the Software is
  10 # furnished to do so, subject to the following conditions:
  11 #
  12 # The above copyright notice and this permission notice shall be included in
  13 # all copies or substantial portions of the Software.
  14 #
  15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21 # SOFTWARE.
  22
  23 import "sample.wdl" as sampleWf
  24 import "structural-variantcalling/structural-variantcalling.wdl" as structuralVariantCalling
  25 import "gatk-variantcalling/single-sample-variantcalling.wdl" as variantCallingWorkflow
  26 import "gatk-variantcalling/jointgenotyping.wdl" as jgwf
  27 import "gatk-variantcalling/calculate-regions.wdl" as calcRegions
  28 import "structs.wdl" as structs
  29 import "tasks/biowdl.wdl" as biowdl
  30 import "tasks/common.wdl" as common
  31 import "tasks/multiqc.wdl" as multiqc
  32 import "tasks/chunked-scatter.wdl" as chunkedScatter
  33
  34 workflow Germline {
  35     input {
  36         File sampleConfigFile
  37         String outputDir = "."
  38         File referenceFasta
  39         File referenceFastaFai
  40         File referenceFastaDict
  41         BwaIndex bwaIndex
  42         File dockerImagesFile
  43         File dbsnpVCF
  44         File dbsnpVCFIndex
  45         File? regions
  46         File? XNonParRegions
  47         File? YNonParRegions
  48         Boolean jointgenotyping = true
  49         Boolean singleSampleGvcf = false
  50         String? adapterForward = "AGATCGGAAGAG"  # Illumina universal adapter
  51         String? adapterReverse = "AGATCGGAAGAG"  # Illumina universal adapter
  52         String platform = "illumina"
  53         Boolean useBwaKit = false
  54         Int scatterSizeMillions = 1000
  55         Int? scatterSize
  56         Int bwaThreads = 4
  57         # Only run multiQC if the user specified an outputDir
  58         Boolean runSVcalling = false
  59     }
  60     meta {allowNestedInputs: true}
  61
  62     Boolean mergeVcfs = !jointgenotyping || singleSampleGvcf
  63
  64     # Parse docker Tags configuration and sample sheet
  65     call common.YamlToJson as ConvertDockerImagesFile {
  66         input:
  67             yaml = dockerImagesFile
  68     }
  69     Map[String, String] dockerImages = read_json(ConvertDockerImagesFile.json)
  70
  71     call biowdl.InputConverter as ConvertSampleConfig {
  72         input:
  73             samplesheet = sampleConfigFile,
  74             dockerImage = dockerImages["biowdl-input-converter"]
  75     }
  76     SampleConfig sampleConfig = read_json(ConvertSampleConfig.json)
  77
  78     call calcRegions.CalculateRegions as calculateRegions {
  79         input:
  80             referenceFasta = referenceFasta,
  81             referenceFastaFai = referenceFastaFai,
  82             referenceFastaDict = referenceFastaDict,
  83             XNonParRegions = XNonParRegions,
  84             YNonParRegions = YNonParRegions,
  85             regions = regions,
  86             scatterSize = scatterSize,
  87             scatterSizeMillions = scatterSizeMillions,
  88             dockerImages = dockerImages
  89     }
  90
  91     call chunkedScatter.ScatterRegions as scatterList {
  92         input:
  93             inputFile = select_first([regions, referenceFastaFai]),
  94             scatterSize = scatterSize,
  95             scatterSizeMillions = scatterSizeMillions,
  96             dockerImage = dockerImages["chunked-scatter"]
  97     }
  98
  99     # Running sample subworkflow
 100     scatter (sample in sampleConfig.samples) {
 101         String sampleIds = sample.id
 102         String sampleDir = outputDir + "/samples/" + sample.id + "/"
 103         call sampleWf.SampleWorkflow as sampleWorkflow {
 104             input:
 105                 sampleDir = sampleDir,
 106                 sample = sample,
 107                 referenceFasta = referenceFasta,
 108                 referenceFastaFai = referenceFastaFai,
 109                 referenceFastaDict = referenceFastaDict,
 110                 bwaIndex = bwaIndex,
 111                 dbsnpVCF = dbsnpVCF,
 112                 dbsnpVCFIndex = dbsnpVCFIndex,
 113                 adapterForward = adapterForward,
 114                 adapterReverse = adapterReverse,
 115                 useBwaKit = useBwaKit,
 116                 dockerImages = dockerImages,
 117                 scatters = scatterList.scatters,
 118                 bwaThreads = bwaThreads,
 119                 platform = platform
 120         }
 121
 122         call variantCallingWorkflow.SingleSampleCalling as SingleSampleCalling {
 123             input:
 124                 bam = sampleWorkflow.recalibratedBam,
 125                 bamIndex = sampleWorkflow.recalibratedBamIndex,
 126                 gender = select_first([sample.gender, "unknown"]),
 127                 sampleName = sample.id,
 128                 outputDir = sampleDir,
 129                 referenceFasta = referenceFasta,
 130                 referenceFastaFai = referenceFastaFai,
 131                 referenceFastaDict = referenceFastaDict,
 132                 dbsnpVCF = dbsnpVCF,
 133                 dbsnpVCFIndex = dbsnpVCFIndex,
 134                 XNonParRegions = calculateRegions.Xregions,
 135                 YNonParRegions = calculateRegions.Yregions,
 136                 statsRegions = regions,
 137                 autosomalRegionScatters = calculateRegions.autosomalRegionScatters,
 138                 gvcf = jointgenotyping,
 139                 mergeVcf = mergeVcfs,
 140                 dockerImages = dockerImages
 141         }
 142
 143
 144         if (runSVcalling) {
 145             call structuralVariantCalling.SVcalling as svCalling {
 146                 input:
 147                     bamFile = sampleWorkflow.markdupBam,
 148                     bamIndex = sampleWorkflow.markdupBamIndex,
 149                     referenceFasta = referenceFasta,
 150                     referenceFastaFai = referenceFastaFai,
 151                     referenceFastaDict = referenceFastaDict,
 152                     bwaIndex = bwaIndex,
 153                     sample = sample.id,
 154                     outputDir = sampleDir,
 155                     dockerImages = dockerImages
 156             }
 157         }
 158     }
 159
 160     if (jointgenotyping) {
 161         call jgwf.JointGenotyping as JointGenotyping {
 162             input:
 163                 gvcfFiles = flatten(SingleSampleCalling.vcfScatters),
 164                 gvcfFilesIndex = flatten(SingleSampleCalling.vcfIndexScatters),
 165                 outputDir = outputDir,
 166                 vcfBasename = "multisample",
 167                 referenceFasta = referenceFasta,
 168                 referenceFastaFai = referenceFastaFai,
 169                 referenceFastaDict = referenceFastaDict,
 170                 sampleIds = sampleIds,
 171                 dbsnpVCF = dbsnpVCF,
 172                 dbsnpVCFIndex = dbsnpVCFIndex,
 173                 regions = regions,
 174                 scatterSize = scatterSize,
 175                 dockerImages = dockerImages
 176         }
 177     }
 178
 179     Array[File] allReports = flatten([
 180         flatten(sampleWorkflow.reports), flatten(SingleSampleCalling.reports), select_first([JointGenotyping.reports, []])
 181         ])
 182
 183     call multiqc.MultiQC as multiqcTask {
 184         input:
 185             reports = allReports,
 186             outDir = outputDir,
 187             dockerImage = dockerImages["multiqc"]
 188     }
 189
 190     output {
 191         File multiqcReport = multiqcTask.multiqcReport
 192         Array[File] reports = allReports
 193         File? multiSampleVcf = JointGenotyping.multisampleVcf
 194         File? multisampleVcfIndex = JointGenotyping.multisampleVcfIndex
 195         File? multisampleGVcf = JointGenotyping.multisampleGVcf
 196         File? multisampleGVcfIndex = JointGenotyping.multisampleGVcfIndex
 197         Array[File] singleSampleVcfs = if jointgenotyping then [] else select_all(SingleSampleCalling.outputVcf)
 198         Array[File] singleSampleVcfsIndex = if jointgenotyping then [] else select_all(SingleSampleCalling.outputVcfIndex)
 199         Array[File] singleSampleGvcfs = if jointgenotyping then select_all(SingleSampleCalling.outputVcf) else []
 200         Array[File] singleSampleGvcfsIndex = if jointgenotyping then select_all(SingleSampleCalling.outputVcfIndex) else []
 201         Array[File] recalibratedBams = sampleWorkflow.recalibratedBam
 202         Array[File] recalibratedBamIndexes = sampleWorkflow.recalibratedBamIndex
 203         Array[File] markdupBams = sampleWorkflow.markdupBam
 204         Array[File] markdupBamIndexes = sampleWorkflow.markdupBamIndex
 205         Array[File?] cleverVCFs = svCalling.cleverVcf
 206         Array[File?] matecleverVCFs = svCalling.cleverVcf
 207         Array[File?] mantaVCFs = svCalling.mantaVcf
 208         Array[File?] dellyVCFs = svCalling.dellyVcf
 209         Array[File?] survivorVCFs = svCalling.survivorVcf
 210         Array[Array[File]?] renamedVCFs = svCalling.renamedVcfs
 211     }
 212
 213     parameter_meta {
 214         sampleConfigFile: {description: "The samplesheet, including sample ids, library ids, readgroup ids and fastq file locations.",
 215                            category: "required"}
 216         outputDir: {description: "The directory the output should be written to.", category: "common"}
 217         referenceFasta: { description: "The reference fasta file", category: "required" }
 218         referenceFastaFai: { description: "Fasta index (.fai) file of the reference", category: "required" }
 219         referenceFastaDict: { description: "Sequence dictionary (.dict) file of the reference", category: "required" }
 220         dbsnpVCF: { description: "dbsnp VCF file used for checking known sites", category: "required"}
 221         dbsnpVCFIndex: { description: "Index (.tbi) file for the dbsnp VCF", category: "required"}
 222         bwaIndex: {description: "The BWA index files.", category: "required"}
 223         dockerImagesFile: {description: "A YAML file describing the docker image used for the tasks. The dockerImages.yml provided with the pipeline is recommended.",
 224                            category: "advanced"}
 225         regions: {description: "A bed file describing the regions to call variants for.", category: "common"}
 226         runSVcalling: {description: "Whether or not Structural-variantcalling should be run.", category: "advanced"}
 227         runMultiQC: {description: "Whether or not MultiQC should be run.", category: "advanced"}
 228         XNonParRegions: {description: "Bed file with the non-PAR regions of X.", category: "common"}
 229         YNonParRegions: {description: "Bed file with the non-PAR regions of Y.", category: "common"}
 230         useBwaKit: {description: "Whether or not BWA kit should be used. If false BWA mem will be used.", category: "advanced"}
 231         adapterForward: {description: "The adapter to be removed from the reads first or single end reads.", category: "common"}
 232         adapterReverse: {description: "The adapter to be removed from the reads second end reads.", category: "common"}
 233         platform: {description: "The platform used for sequencing.", category: "advanced"}
 234         scatterSize: {description: "The size of the scattered regions in bases for the GATK subworkflows. Scattering is used to speed up certain processes. The genome will be seperated into multiple chunks (scatters) which will be processed in their own job, allowing for parallel processing. Higher values will result in a lower number of jobs. The optimal value here will depend on the available resources.",
 235               category: "advanced"}
 236         scatterSizeMillions:{ description: "Same as scatterSize, but is multiplied by 1000000 to get scatterSize. This allows for setting larger values more easily.",
 237                               category: "advanced"}
 238         jointgenotyping: {description: "Whether to perform jointgenotyping (using HaplotypeCaller to call GVCFs and merge them with GenotypeGVCFs) or not",
 239                   category: "common"}
 240         singleSampleGvcf: {description: "Whether to output single-sample gvcfs", category: "common"}
 241         bwaThreads: {description: "The amount of threads for the alignment process", category: "advanced"}
 242     }
 243 }