Merge pull request #5230 from solgenomics/topic/open_pollinated
[sgn.git] / bin / jbrowse_vcf_tools / gen_trial_jbrowse.pl
blob8530163387e94bd17d5a050516914b80181f41ad
1 #!/usr/bin/perl -w
3 =head1
5 gen_trial_jbrowse.pl - creates jbrowse instances that include base tracks and accession vcf tracks for each trial name supplied
7 =head1 SYNOPSIS
9 gen_trial_jbrowse.pl -v [absolute path through base instance to dir containing vcfs] -b [absolute path to base instance] -d [name of database to search for trials] -h [db host]
11 =head1 COMMAND-LINE OPTIONS
13 -v absolute path through base instance to dir containing vcfs
14 -b absolute path to base instance
15 -h database hostname
16 -d database name
18 =head1 DESCRIPTION
20 Takes a directory containing individual vcf files, including filtered and imputed versions, and creates jbrowse instances by symlinking to all necessary files in a base instance, then generating a uniq tracks.conf file and appending dataset info to jbrowse.conf. Should be run from /data/json/trials dir in jbrowse instance.
21 =head1 AUTHOR
23 Bryan Ellerbrock (bje24@cornell.edu) - Oct 2015
25 =cut
27 use strict;
28 use File::Slurp;
29 use Getopt::Std;
30 use Bio::Chado::Schema;
31 use CXGN::DB::InsertDBH;
32 use CXGN::Trial::TrialLayout;
33 use Data::Dumper;
35 our ($opt_v, $opt_b, $opt_h, $opt_d);
37 #-----------------------------------------------------------------------
38 # define paths & array of vcf_file names. Open file handles to read trial lists and append datasets to jbrowse.conf
39 #-----------------------------------------------------------------------
41 getopts('v:b:h:d:');
43 my $vcf_dir_path = $opt_v;
44 my $dbhost = $opt_h;
45 my $dbname = $opt_d;
46 my $link_path = $opt_b;
47 my $url_path = $opt_v;
48 $url_path =~ s:$link_path/(.+):$1:;
49 print STDERR "url path = $url_path \n";
51 my $accessions_found = 0;
52 my ($file_type, $out, $header,@tracks,$imp_track,$filt_track,$h,$q,$dbh,%accession,%control);
53 my @files = ` dir -GD -1 --hide *.tbi $vcf_dir_path ` ;
54 my $accession_name;
55 my @accession_names;
56 my @conf_info;
57 my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() });
59 open (CONF, ">>", "../../../jbrowse.conf") || die "Can't open conf file jbrowse.conf!\n";
61 open (LOG, ">>", "gen_trial.log") || die "Can't open log file!\n";
63 #-----------------------------------------------------------------------
64 # connect to database and extract trial names and ids
65 #-----------------------------------------------------------------------
67 # store database handle and schema
69 $dbh = CXGN::DB::InsertDBH->new( { dbhost=>$dbhost,
70 dbname=>$dbname,
71 dbargs => {AutoCommit => 0,
72 RaiseError => 1}
77 # prepare and execute sql to extract trial names and ids
79 $q = "SELECT distinct(project.project_id), project.name FROM project LEFT JOIN projectprop AS year ON (project.project_id = year.project_id) LEFT JOIN projectprop AS location ON (project.project_id = location.project_id) LEFT JOIN project_relationship ON (project.project_id = project_relationship.subject_project_id) LEFT JOIN project as program ON (project_relationship.object_project_id=program.project_id) LEFT JOIN projectprop as project_type ON (project.project_id=project_type.project_id) LEFT JOIN cvterm AS type_cvterm ON (project_type.type_id = type_cvterm.cvterm_id) WHERE (year.type_id=76395 OR year.type_id IS NULL) and (location.type_id=76920 OR location.type_id IS NULL) and (project_type.type_id in (76919,76918) OR project_type.type_id IS NULL)";
81 $h=$dbh->prepare($q);
83 $h->execute();
85 #-----------------------------------------------------------------------
86 # for each trial name, prepare conf file output
87 #-----------------------------------------------------------------------
89 while (my ($trial_id, $trial_name) = $h->fetchrow_array) {
91 chomp $trial_name;
92 print STDERR "trial name = $trial_name \n";
93 chomp $trial_id;
94 print STDERR "trial id = $trial_id \n";
96 $out = "$trial_id/tracks.conf";
97 $header = "[general]\ndataset_id = Trial_$trial_id\n";
99 #-----------------------------------------------------------------------
100 # extract list of accessions associated with trial from database
101 #-----------------------------------------------------------------------
103 my $trial_layout = CXGN::Trial::TrialLayout->new({schema => $schema, trial_id => $trial_id, experiment_type=>'field_layout'} );
104 my $accession_names_ref = $trial_layout->get_accession_names();
105 my $control_names_ref = $trial_layout->get_control_names();
107 for my $accession (@$accession_names_ref) {
108 # print STDERR "Trial $trial_name contains accession";
109 print Dumper($accession);
110 my %accession_hash = %$accession;
111 push (@accession_names, $accession_hash{'accession_name'});
113 for my $control (@$control_names_ref) {
114 # print STDERR " Trial $trial_name contains control";
115 print Dumper($control);
116 my %control_hash = %$control;
117 push (@accession_names, $control_hash{'accession_name'});
119 print STDERR " Trial $trial_name contains accessions @$accession_names_ref \n";
120 print STDERR " Trial $trial_name contains controls @$control_names_ref \n";
122 #-----------------------------------------------------------------------
123 # for each accession, locate matching indiv vcf files and construct necessary text for tracks.conf
124 #-----------------------------------------------------------------------
126 for my $accession_name (@accession_names) {
127 print STDERR "Working on trial $trial_name and accession $accession_name! \n";
128 print LOG "Working on trial $trial_name and accession $accession_name! \n";
130 for my $file (@files) {
131 chomp $file;
132 $_ = $file;
133 next if !m/filtered/ && !m/imputed/;
134 $_ =~ s/([^.]+)_2015_V6.*/$1/s;
135 next if ($_ ne $accession_name);
136 print STDERR "Matched vcf file basename $_ to accession name $accession_name !\n";
137 print LOG "Matched vcf file basename $_ to accession name $accession_name !\n";
138 $file_type = $file;
139 $file_type =~ s/.+_([imputedflr]+).vcf.gz/$1/s;
140 if ($file_type eq 'filtered') { #Handle filtered vcf files
141 print STDERR "Working on filtered file $file \n" ;
142 my $path = $url_path . "/" . $file ;
143 my $key = $file;
144 $key =~ s/(.+).vcf.gz/$1/s;
146 $filt_track = '
147 [ tracks . ' . $key . ' ]
148 hooks.modify = function( track, feature, div ) { div.style.backgroundColor = track.config.variantIsHeterozygous(feature);}
149 variantIsHeterozygous = function( feature ) {
150 var genotypes = feature.get(\'genotypes\');
151 for( var sampleName in genotypes ) {
152 try {
153 var gtString = genotypes[sampleName].GT.values[0];
154 if( ! /^1([\|\/]1)*$/.test( gtString) && ! /^0([\|\/]0)*$/.test( gtString ) )
155 return \'red\';
156 } catch(e) {}
158 if( /^1([\|\/]1)*$/.test( gtString) )
159 return \'blue\';
161 key = ' . $key .'
162 storeClass = JBrowse/Store/SeqFeature/VCFTabix
163 urlTemplate = ' . $path .'
164 category = Diversity/NEXTGEN/Unimputed
165 type = JBrowse/View/Track/HTMLVariants
166 metadata.Description = Homozygous reference: Green Heterozygous: Red Homozygous alternate: Blue Filtered to remove SNPs with a depth of 0 and SNPs with 2 or more alt alleles.
167 metadata.Link = <a href=ftp://cassavabase.org/jbrowse/diversity/igdBuildWithV6_\
168 hapmap_20150404_vcfs/' . $file . '>Download VCF File</a>
169 metadata.Provider = NEXTGEN Cassava project
170 metadata.Accession = ' . $accession_name . '
171 label = ' . $key . '
173 push @tracks, $filt_track;
175 } elsif ($file_type eq 'imputed') { #Handle imputed vcf files
176 print STDERR "Working on imputed file $file \n" ;
177 my $path = $url_path . "/" . $file ;
178 my $key = $file;
179 $key =~ s/(.+).vcf.gz/$1/s;
181 $imp_track = '
182 [ tracks . ' . $key . ' ]
183 hooks.modify = function( track, feature, div ) { div.style.backgroundColor = track.config.variantIsHeterozygous(feature); div.style.opacity = track.config.variantIsImputed(feature) ? \'0.33\' : \'1.0\'; }
184 variantIsHeterozygous = function( feature ) {
185 var genotypes = feature.get(\'genotypes\');
186 for( var sampleName in genotypes ) {
187 try {
188 var gtString = genotypes[sampleName].GT.values[0];
189 if( ! /^1([\|\/]1)*$/.test( gtString) && ! /^0([\|\/]0)*$/.test( gtString ) )
190 return \'red\';
191 } catch(e) {}
193 if( /^1([\|\/]1)*$/.test( gtString) )
194 return \'blue\';
196 variantIsImputed = function( feature ) {
197 var genotypes = feature.get(\'genotypes\');
198 for( var sampleName in genotypes ) {
199 try {
200 var dpString = genotypes[sampleName].DP.values[0];
201 if( /^0$/.test( dpString) )
202 return true;
203 } catch(e) {}
205 return false;
207 key = ' . $key .'
208 storeClass = JBrowse/Store/SeqFeature/VCFTabix
209 urlTemplate = ' . $path .'
210 category = Diversity/NEXTGEN/Imputed
211 type = JBrowse/View/Track/HTMLVariants
212 metadata.Description = Homozygous reference: Green Heterozygous: Red Homozygous alternate: Blue Values imputed with GLMNET are shown at 1/3 opacity.
213 metadata.Link = <a href=ftp://cassavabase.org/jbrowse/diversity/igdBuildWithV6_\
214 hapmap_20150404_vcfs/' . $file . '>Download VCF File</a>
215 metadata.Provider = NEXTGEN Cassava project
216 metadata.Accession = ' . $accession_name .'
217 label = ' . $key . '
219 push @tracks, $imp_track;
221 } else {
223 next;
228 #-----------------------------------------------------------------------
229 # if matching vcf files were not found, skip to next trial name
230 #-----------------------------------------------------------------------
232 next unless (@tracks);
234 #-----------------------------------------------------------------------
235 # otherwise increment count, save the new tracks, and move on to next accession
236 #-----------------------------------------------------------------------
238 $accessions_found++;
239 foreach my $value (@tracks) {
240 push (@conf_info, $value);
242 undef @tracks;
243 print STDERR "Saved accession $accession_name track info, moving onto next accession. Current accessions found = $accessions_found\n";
244 print LOG "Saved accession $accession_name track info, moving onto next accession. Current accessions found = $accessions_found\n";
248 #-----------------------------------------------------------------------
249 # once all accessions have been searched, check count and only proceed if we've found at least 2
250 #-----------------------------------------------------------------------
251 if ($accessions_found < 2) {
252 print STDERR "$accessions_found accessions in this trial have vcf files. Skipping it and moving onto next trial \n";
253 undef @conf_info;
254 undef @accession_names;
255 $accessions_found = 0;
256 next;
259 #-----------------------------------------------------------------------
260 # if 2 or more have been found, set up jbrowse instance, unless it already exisits
261 #-----------------------------------------------------------------------
263 print STDERR "Woo! $accessions_found accessions in this trial have vcf files. Setting up jbrowse instance, then moving onto next trial \n";
264 print LOG "Woo! $accessions_found accessions in this trial have vcf files. Setting up jbrowse instance, then moving onto next trial \n";
267 unless (-d $trial_id) {
269 `sudo mkdir -p $trial_id`;
270 `sudo ln -sf $link_path/data_files $trial_id/data_files`;
271 `sudo ln -sf $link_path/seq $trial_id/seq`;
272 `sudo ln -sf $link_path/tracks $trial_id/tracks`;
273 `sudo ln -sf $link_path/trackList.json $trial_id/trackList.json`;
274 `sudo ln -sf $link_path/readme $trial_id/readme`;
275 `sudo touch $trial_id/tracks.conf && sudo chmod a+wrx $trial_id/tracks.conf`;
277 #-----------------------------------------------------------------------
278 # then append dataset info to jbrowse.conf, and create and populate trial specific tracks.conf file
279 #-----------------------------------------------------------------------
281 my $dataset = "[datasets.Trial_$trial_id]\n" . "url = ?data=data/json/trials/$trial_id\n" . "name = $trial_name\n\n";
282 print CONF $dataset;
283 open (OUT, ">", $out) || die "Can't open out file $out! \n";
284 print OUT $header;
285 print OUT join ("\n", @conf_info), "\n";
286 undef @conf_info; # reset arrays to be used for next trial
287 undef @accession_names;
288 $accessions_found = 0;
289 next;
292 #-----------------------------------------------------------------------
293 # if instance already exists, just append tracks to tracks.conf
294 #-----------------------------------------------------------------------
296 open (OUT, ">>", $out) || die "Can't open out file $out! \n";
297 print OUT join ("\n", @conf_info), "\n";
298 undef @conf_info; # reset arrays to be used for next trial
299 undef @accession_names;
300 $accessions_found = 0;
301 next;