1 # Run Funcotator on a set of Segments from GATK4 ModelSegments.
3 # Output filenames are <basename of input_seg_file>.funcotated.tsv and <basename of input_seg_file>.funcotated.tsv.gene_list.txt
5 # Description of inputs:
8 # File input_seg_file - Seg file from GATK ModelSegments
9 # String gatk_docker - GATK Docker image in which to run
10 # File ref_fasta - Reference FASTA file.
11 # File ref_fasta_fai - Reference FASTA file index.
12 # File ref_fasta_dict - Reference FASTA file sequence dictionary.
13 # File variant_vcf_to_funcotate - Variant Call Format (VCF) file containing the variants to annotate.
14 # File variant_vcf_to_funcotate_index - Index file corresponding to the input Variant Call Format (VCF) file containing the variants to annotate.
15 # String funcotator_ref_version - Version of the reference being used. Either `hg19` or `hg38`.
16 # Boolean compress - Whether to compress the resulting output file.
19 # interval_list - Intervals to be used for traversal. If specified will only traverse the given intervals.
20 # funcotator_data_sources_tar_gz - Path to tar.gz containing the data sources for Funcotator to create annotations.
21 # transcript_selection_mode - Method of detailed transcript selection. This will select the transcript for detailed annotation (either `CANONICAL` or `BEST_EFFECT`).
22 # transcript_selection_list - Set of transcript IDs to use for annotation to override selected transcript.
23 # annotation_defaults - Annotations to include in all annotated variants if the annotation is not specified in the data sources (in the format <ANNOTATION>:<VALUE>). This will add the specified annotation to every annotated variant if it is not already present.
24 # annotation_overrides - Override values for annotations (in the format <ANNOTATION>:<VALUE>). Replaces existing annotations of the given name with given values.
25 # funcotator_excluded_fields - output fields to drop. These are just names of column headers.
26 # gatk4_jar_override - Override Jar file containing GATK 4.0. Use this when overriding the docker JAR or when using a backend without docker.
27 # extra_args - Extra command-line arguments to pass through to Funcotator. (e.g. " --exclude-field foo_field --exclude-field bar_field ")
28 # is_removing_untared_datasources - (Default: false) Set this to true when running on-prem or local to reduce the risk of making copies of the datasources in your output directories.
30 # This WDL needs to decide whether to use the ``gatk_jar`` or ``gatk4_jar_override`` for the jar location. As of cromwell-0.24,
31 # this logic *must* go into each task. Therefore, there is a lot of duplicated code. This allows users to specify a jar file
32 # independent of what is in the docker file. See the README.md for more info.
34 workflow CNVFuncotateSegmentsWorkflow {
40 String funcotator_ref_version
41 File? gatk4_jar_override
42 File? funcotator_data_sources_tar_gz
43 String? transcript_selection_mode
44 File? transcript_selection_list
45 Array[String]? annotation_defaults
46 Array[String]? annotation_overrides
47 Array[String]? funcotator_excluded_fields
51 # Set to true when running local or on-prem
52 Boolean? is_removing_untared_datasources
58 Boolean use_ssd = false  # non-optional: it always has a value, matching the task's `Boolean use_ssd = false`
60 Int? preemptible_attempts
62 call FuncotateSegments {
64 input_seg_file = input_seg_file,
65 ref_fasta = ref_fasta,
66 ref_fasta_fai = ref_fasta_fai,
67 ref_fasta_dict = ref_fasta_dict,
68 funcotator_ref_version = funcotator_ref_version,
69 gatk4_jar_override = gatk4_jar_override,
70 funcotator_data_sources_tar_gz = funcotator_data_sources_tar_gz,
71 transcript_selection_mode = transcript_selection_mode,
72 transcript_selection_list = transcript_selection_list,
73 annotation_defaults = annotation_defaults,
74 annotation_overrides = annotation_overrides,
75 funcotator_excluded_fields = funcotator_excluded_fields,
76 interval_list = interval_list,
77 extra_args = extra_args,
78 is_removing_untared_datasources = is_removing_untared_datasources,
79 gatk_docker = gatk_docker,
81 disk_space_gb = disk_space_gb,
84 preemptible_attempts = preemptible_attempts
88 File funcotated_seg_simple_tsv = FuncotateSegments.funcotated_seg_simple_tsv
89 File funcotated_gene_list_tsv = FuncotateSegments.funcotated_gene_list_tsv
93 task FuncotateSegments {
99 String funcotator_ref_version
100 File? gatk4_jar_override
101 File? funcotator_data_sources_tar_gz = "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124s.tar.gz"
102 String? transcript_selection_mode = "CANONICAL"
103 File? transcript_selection_list
104 Array[String]? annotation_defaults
105 Array[String]? annotation_overrides
106 Array[String]? funcotator_excluded_fields
110 # Set to true when running local or on-prem
111 Boolean? is_removing_untared_datasources
117 Boolean use_ssd = false
119 Int? preemptible_attempts
121 # You may have to change the following two parameter values depending on the task requirements
122 Int default_ram_mb = 3000  # machine memory in MB, used when mem_gb is not supplied
123 # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb).
124 Int default_disk_space_gb = 100  # disk size in GB, used when disk_space_gb is not supplied
126 # Mem is in units of GB but our command and memory runtime values are in MB
127 Int machine_mem_mb = if defined(mem_gb) then mem_gb *1000 else default_ram_mb
128 Int command_mem_mb = machine_mem_mb - 1000  # value passed to the JVM as -Xmx; kept 1000 MB below the runtime memory request
130 ## Process input args: each *_arg string holds the leading command-line flag when its optional input is defined, otherwise "".
131 String transcript_selection_arg = if defined(transcript_selection_list) then " --transcript-list " else ""
132 String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else ""
133 String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else ""
134 String excluded_fields_args = if defined(funcotator_excluded_fields) then " --exclude-field " else ""
135 String interval_list_arg = if defined(interval_list) then " -L " else ""
136 String extra_args_arg = select_first([extra_args, ""])
137 Boolean is_removing_untared_datasources_final = select_first([is_removing_untared_datasources, false])  # defaults to false, as documented for is_removing_untared_datasources
138 String removing_untared_datasources = if is_removing_untared_datasources_final then "echo Removing $DATA_SOURCES_FOLDER && rm -Rf $DATA_SOURCES_FOLDER " else " echo Not bothering to remove datasources."
139 String basename_input_seg_file = basename(input_seg_file)  # output names are built from the file name only, not its full path
143 ${"export GATK_LOCAL_JAR=" + gatk4_jar_override}  # honor gatk4_jar_override when supplied; expands to nothing otherwise
145 # Extract our data sources:
146 echo "Extracting data sources zip file..."
147 mkdir datasources_dir
148 tar zxvf ${funcotator_data_sources_tar_gz} -C datasources_dir --strip-components 1
149 DATA_SOURCES_FOLDER="$PWD/datasources_dir"
151 # Run FuncotateSegments:
152 gatk --java-options "-Xmx${command_mem_mb}m" FuncotateSegments \
153 --data-sources-path $DATA_SOURCES_FOLDER \
154 --ref-version ${funcotator_ref_version} \
155 --output-file-format SEG \
157 --segments ${input_seg_file} \
158 -O ${basename_input_seg_file}.funcotated.tsv \
159 ${interval_list_arg} ${default="" interval_list} \
160 ${"--transcript-selection-mode " + transcript_selection_mode} \
161 ${transcript_selection_arg}${default="" sep=" --transcript-list " transcript_selection_list} \
162 ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \
163 ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} \
164 ${excluded_fields_args}${default="" sep=" --exclude-field " funcotator_excluded_fields} \
167 ${removing_untared_datasources}
171 docker: "${gatk_docker}"  # run in the image supplied through the gatk_docker input
172 memory: machine_mem_mb + " MB"
173 disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
174 cpu: select_first([cpu, 1])
175 preemptible: select_first([preemptible_attempts, 5])
179 File funcotated_seg_simple_tsv = "${basename_input_seg_file}.funcotated.tsv"
180 File funcotated_gene_list_tsv = "${basename_input_seg_file}.funcotated.tsv.gene_list.txt"