5 gen_trial_jbrowse.pl - creates jbrowse instances that include base tracks and accession vcf tracks for each trial name supplied
9 gen_trial_jbrowse.pl -v [absolute path through base instance to dir containing vcfs] -b [absolute path to base instance] -d [name of database to search for trials] -h [db host]
11 =head1 COMMAND-LINE OPTIONS
13 -v absolute path through base instance to dir containing vcfs
14 -b absolute path to base instance
20 Takes a directory containing individual vcf files, including filtered and imputed versions, and creates jbrowse instances by symlinking to all necessary files in a base instance, then generating a uniq tracks.conf file and appending dataset info to jbrowse.conf. Should be run from /data/json/trials dir in jbrowse instance.
23 Bryan Ellerbrock (bje24@cornell.edu) - Oct 2015
30 use Bio
::Chado
::Schema
;
31 use CXGN
::DB
::InsertDBH
;
32 use CXGN
::Trial
::TrialLayout
;
35 our ($opt_v, $opt_b, $opt_h, $opt_d);
37 #-----------------------------------------------------------------------
38 # define paths & array of vcf_file names. Open file handles to read trial lists and append datasets to jbrowse.conf
39 #-----------------------------------------------------------------------
43 my $vcf_dir_path = $opt_v;
46 my $link_path = $opt_b;
47 my $url_path = $opt_v;
48 $url_path =~ s
:$link_path/(.+):$1:;
49 print STDERR
"url path = $url_path \n";
51 my $accessions_found = 0;
52 my ($file_type, $out, $header,@tracks,$imp_track,$filt_track,$h,$q,$dbh,%accession,%control);
53 my @files = ` dir -GD -1 --hide *.tbi $vcf_dir_path ` ;
57 my $schema= Bio
::Chado
::Schema
->connect( sub { $dbh->get_actual_dbh() });
59 open (CONF
, ">>", "../../../jbrowse.conf") || die "Can't open conf file jbrowse.conf!\n";
61 open (LOG
, ">>", "gen_trial.log") || die "Can't open log file!\n";
63 #-----------------------------------------------------------------------
64 # connect to database and extract trial names and ids
65 #-----------------------------------------------------------------------
67 # store database handle and schema
69 $dbh = CXGN
::DB
::InsertDBH
->new( { dbhost
=>$dbhost,
71 dbargs
=> {AutoCommit
=> 0,
77 # prepare and execute sql to extract trial names and ids
79 $q = "SELECT distinct(project.project_id), project.name FROM project LEFT JOIN projectprop AS year ON (project.project_id = year.project_id) LEFT JOIN projectprop AS location ON (project.project_id = location.project_id) LEFT JOIN project_relationship ON (project.project_id = project_relationship.subject_project_id) LEFT JOIN project as program ON (project_relationship.object_project_id=program.project_id) LEFT JOIN projectprop as project_type ON (project.project_id=project_type.project_id) LEFT JOIN cvterm AS type_cvterm ON (project_type.type_id = type_cvterm.cvterm_id) WHERE (year.type_id=76395 OR year.type_id IS NULL) and (location.type_id=76920 OR location.type_id IS NULL) and (project_type.type_id in (76919,76918) OR project_type.type_id IS NULL)";
85 #-----------------------------------------------------------------------
86 # for each trial name, prepare conf file output
87 #-----------------------------------------------------------------------
89 while (my ($trial_id, $trial_name) = $h->fetchrow_array) {
92 print STDERR
"trial name = $trial_name \n";
94 print STDERR
"trial id = $trial_id \n";
96 $out = "$trial_id/tracks.conf";
97 $header = "[general]\ndataset_id = Trial_$trial_id\n";
99 #-----------------------------------------------------------------------
100 # extract list of accessions associated with trial from database
101 #-----------------------------------------------------------------------
103 my $trial_layout = CXGN
::Trial
::TrialLayout
->new({schema
=> $schema, trial_id
=> $trial_id, experiment_type
=>'field_layout'} );
104 my $accession_names_ref = $trial_layout->get_accession_names();
105 my $control_names_ref = $trial_layout->get_control_names();
107 for my $accession (@
$accession_names_ref) {
108 # print STDERR "Trial $trial_name contains accession";
109 print Dumper
($accession);
110 my %accession_hash = %$accession;
111 push (@accession_names, $accession_hash{'accession_name'});
113 for my $control (@
$control_names_ref) {
114 # print STDERR " Trial $trial_name contains control";
115 print Dumper
($control);
116 my %control_hash = %$control;
117 push (@accession_names, $control_hash{'accession_name'});
119 print STDERR
" Trial $trial_name contains accessions @$accession_names_ref \n";
120 print STDERR
" Trial $trial_name contains controls @$control_names_ref \n";
122 #-----------------------------------------------------------------------
123 # for each accession, locate matching indiv vcf files and construct necessary text for tracks.conf
124 #-----------------------------------------------------------------------
126 for my $accession_name (@accession_names) {
127 print STDERR
"Working on trial $trial_name and accession $accession_name! \n";
128 print LOG
"Working on trial $trial_name and accession $accession_name! \n";
130 for my $file (@files) {
133 next if !m/filtered/ && !m/imputed/;
134 $_ =~ s/([^.]+)_2015_V6.*/$1/s;
135 next if ($_ ne $accession_name);
136 print STDERR
"Matched vcf file basename $_ to accession name $accession_name !\n";
137 print LOG
"Matched vcf file basename $_ to accession name $accession_name !\n";
139 $file_type =~ s/.+_([imputedflr]+).vcf.gz/$1/s;
140 if ($file_type eq 'filtered') { #Handle filtered vcf files
141 print STDERR
"Working on filtered file $file \n" ;
142 my $path = $url_path . "/" . $file ;
144 $key =~ s/(.+).vcf.gz/$1/s;
147 [ tracks . ' . $key . ' ]
148 hooks.modify = function( track, feature, div ) { div.style.backgroundColor = track.config.variantIsHeterozygous(feature);}
149 variantIsHeterozygous = function( feature ) {
150 var genotypes = feature.get(\'genotypes\');
151 for( var sampleName in genotypes ) {
153 var gtString = genotypes[sampleName].GT.values[0];
154 if( ! /^1([\|\/]1)*$/.test( gtString) && ! /^0([\|\/]0)*$/.test( gtString ) )
158 if( /^1([\|\/]1)*$/.test( gtString) )
162 storeClass = JBrowse/Store/SeqFeature/VCFTabix
163 urlTemplate = ' . $path .'
164 category = Diversity/NEXTGEN/Unimputed
165 type = JBrowse/View/Track/HTMLVariants
166 metadata.Description = Homozygous reference: Green Heterozygous: Red Homozygous alternate: Blue Filtered to remove SNPs with a depth of 0 and SNPs with 2 or more alt alleles.
167 metadata.Link = <a href=ftp://cassavabase.org/jbrowse/diversity/igdBuildWithV6_\
168 hapmap_20150404_vcfs/' . $file . '>Download VCF File</a>
169 metadata.Provider = NEXTGEN Cassava project
170 metadata.Accession = ' . $accession_name . '
173 push @tracks, $filt_track;
175 } elsif ($file_type eq 'imputed') { #Handle imputed vcf files
176 print STDERR
"Working on imputed file $file \n" ;
177 my $path = $url_path . "/" . $file ;
179 $key =~ s/(.+).vcf.gz/$1/s;
182 [ tracks . ' . $key . ' ]
183 hooks.modify = function( track, feature, div ) { div.style.backgroundColor = track.config.variantIsHeterozygous(feature); div.style.opacity = track.config.variantIsImputed(feature) ? \'0.33\' : \'1.0\'; }
184 variantIsHeterozygous = function( feature ) {
185 var genotypes = feature.get(\'genotypes\');
186 for( var sampleName in genotypes ) {
188 var gtString = genotypes[sampleName].GT.values[0];
189 if( ! /^1([\|\/]1)*$/.test( gtString) && ! /^0([\|\/]0)*$/.test( gtString ) )
193 if( /^1([\|\/]1)*$/.test( gtString) )
196 variantIsImputed = function( feature ) {
197 var genotypes = feature.get(\'genotypes\');
198 for( var sampleName in genotypes ) {
200 var dpString = genotypes[sampleName].DP.values[0];
201 if( /^0$/.test( dpString) )
208 storeClass = JBrowse/Store/SeqFeature/VCFTabix
209 urlTemplate = ' . $path .'
210 category = Diversity/NEXTGEN/Imputed
211 type = JBrowse/View/Track/HTMLVariants
212 metadata.Description = Homozygous reference: Green Heterozygous: Red Homozygous alternate: Blue Values imputed with GLMNET are shown at 1/3 opacity.
213 metadata.Link = <a href=ftp://cassavabase.org/jbrowse/diversity/igdBuildWithV6_\
214 hapmap_20150404_vcfs/' . $file . '>Download VCF File</a>
215 metadata.Provider = NEXTGEN Cassava project
216 metadata.Accession = ' . $accession_name .'
219 push @tracks, $imp_track;
228 #-----------------------------------------------------------------------
229 # if matching vcf files were not found, skip to next trial name
230 #-----------------------------------------------------------------------
232 next unless (@tracks);
234 #-----------------------------------------------------------------------
235 # otherwise increment count, save the new tracks, and move on to next accession
236 #-----------------------------------------------------------------------
239 foreach my $value (@tracks) {
240 push (@conf_info, $value);
243 print STDERR
"Saved accession $accession_name track info, moving onto next accession. Current accessions found = $accessions_found\n";
244 print LOG
"Saved accession $accession_name track info, moving onto next accession. Current accessions found = $accessions_found\n";
248 #-----------------------------------------------------------------------
249 # once all accessions have been searched, check count and only proceed if we've found at least 2
250 #-----------------------------------------------------------------------
251 if ($accessions_found < 2) {
252 print STDERR
"$accessions_found accessions in this trial have vcf files. Skipping it and moving onto next trial \n";
254 undef @accession_names;
255 $accessions_found = 0;
259 #-----------------------------------------------------------------------
260 # if 2 or more have been found, set up jbrowse instance, unless it already exisits
261 #-----------------------------------------------------------------------
263 print STDERR
"Woo! $accessions_found accessions in this trial have vcf files. Setting up jbrowse instance, then moving onto next trial \n";
264 print LOG
"Woo! $accessions_found accessions in this trial have vcf files. Setting up jbrowse instance, then moving onto next trial \n";
267 unless (-d
$trial_id) {
269 `sudo mkdir -p $trial_id`;
270 `sudo ln -sf $link_path/data_files $trial_id/data_files`;
271 `sudo ln -sf $link_path/seq $trial_id/seq`;
272 `sudo ln -sf $link_path/tracks $trial_id/tracks`;
273 `sudo ln -sf $link_path/trackList.json $trial_id/trackList.json`;
274 `sudo ln -sf $link_path/readme $trial_id/readme`;
275 `sudo touch $trial_id/tracks.conf && sudo chmod a+wrx $trial_id/tracks.conf`;
277 #-----------------------------------------------------------------------
278 # then append dataset info to jbrowse.conf, and create and populate trial specific tracks.conf file
279 #-----------------------------------------------------------------------
281 my $dataset = "[datasets.Trial_$trial_id]\n" . "url = ?data=data/json/trials/$trial_id\n" . "name = $trial_name\n\n";
283 open (OUT
, ">", $out) || die "Can't open out file $out! \n";
285 print OUT
join ("\n", @conf_info), "\n";
286 undef @conf_info; # reset arrays to be used for next trial
287 undef @accession_names;
288 $accessions_found = 0;
292 #-----------------------------------------------------------------------
293 # if instance already exists, just append tracks to tracks.conf
294 #-----------------------------------------------------------------------
296 open (OUT
, ">>", $out) || die "Can't open out file $out! \n";
297 print OUT
join ("\n", @conf_info), "\n";
298 undef @conf_info; # reset arrays to be used for next trial
299 undef @accession_names;
300 $accessions_found = 0;