adding fernbase logo to git
[sgn.git] / bin / jbrowse_vcf_tools / gen_accession_jbrowse.pl
blobc10d396bf2e3923829bf5c5e096e54735aeafc5f
1 #!/usr/bin/perl -w
3 =head1
5 gen_accession_jbrowse.pl - creates jbrowse instances that include base tracks and accession vcf tracks for each accession name supplied
7 =head1 SYNOPSIS
9 gen_accession_jbrowse.pl -v [absolute path through base instance to dir containing vcfs] -b [absolute path to base instance] -d [name of database to search for trials] -h [db host]
11 =head1 COMMAND-LINE OPTIONS
13 -v absolute path through base instance to dir containing vcfs
14 -b absolute path to base instance
15 -h database hostname
16 -d database name
18 =head1 DESCRIPTION
20 Takes a directory containing individual vcf files, including filtered and imputed versions, and creates or updates jbrowse instances by symlinking to all necessary files in a base instance, then generating a uniq tracks.conf file and appending dataset info to jbrowse.conf. Should be run from /data/json/accessions dir in jbrowse instance.
22 =head1 AUTHOR
24 Bryan Ellerbrock (bje24@cornell.edu) - Oct 2015
26 =cut
28 use strict;
29 use File::Slurp;
30 use Getopt::Std;
31 use Bio::Chado::Schema;
32 use CXGN::DB::InsertDBH;
34 our ($opt_v, $opt_b, $opt_h, $opt_d);
36 #-----------------------------------------------------------------------
37 # define paths & array of vcf_file names. Open file handles to read accession lists and append datasets to jbrowse.conf
38 #-----------------------------------------------------------------------
40 getopts('v:b:h:d:');
42 my $vcf_dir_path = $opt_v;
43 my $dbhost = $opt_h;
44 my $dbname = $opt_d;
45 my $link_path = $opt_b;
46 my $url_path = $opt_v;
47 $url_path =~ s:$link_path/(.+):$1:;
48 print STDERR "url path = $url_path \n";
50 my ($file_type,$out,$header,@tracks,$imp_track,$filt_track,$h,$q,$dbh);
51 my @files = ` dir -GD -1 --hide *.tbi $vcf_dir_path ` ;
53 open (CONF, ">>", "../../../jbrowse.conf") || die "Can't open conf file jbrowse.conf!\n";
55 open (LOG, ">>", "gen_accession.log") || die "Can't open log file!\n";
57 #-----------------------------------------------------------------------
58 # connect to database and extract unique acccesion names
59 #-----------------------------------------------------------------------
61 #my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() });
63 $dbh = CXGN::DB::InsertDBH->new( { dbhost=>$dbhost,
64 dbname=>$dbname,
65 dbargs => {AutoCommit => 0,
66 RaiseError => 1}
71 $q = "SELECT distinct(stock.stock_id), stock.uniquename from stock where type_id = 76392 order by stock.uniquename";
73 $h=$dbh->prepare($q);
75 $h->execute();
77 #-----------------------------------------------------------------------
78 # for each accession name, locate matching indiv vcf files and construct necessary text for tracks.conf
79 #-----------------------------------------------------------------------
81 #print STDERR "fetchrow array = $h->fetchrow_array \n";
83 while (my ($id, $name) = $h->fetchrow_array) {
84 chomp $id;
85 chomp $name;
86 print LOG "id = $id and name = $name \n";
87 $out = "$id/tracks.conf";
88 $header = "[general]\ndataset_id = Accession_$id\n";
90 for my $file (@files) {
91 chomp $file;
92 $_ = $file;
93 next if !m/filtered/ && !m/imputed/;
94 $_ =~ s/([^.]+)_2015_V6.*/$1/s;
95 next if ($_ ne $name);
96 print STDERR "Matched vcf file basename $_ to accession name $name !\n";
97 print LOG "Matched vcf file basename $_ to accession name $name !\n";
98 $file_type = $file;
99 $file_type =~ s/.+_([imputedflr]+).vcf.gz/$1/s;
100 print LOG "filetype = $file_type \n";
101 if ($file_type eq 'filtered') { #Handle filtered vcf files
102 print LOG "Working on filtered file $file \n" ;
103 print STDERR "Working on filtered file $file \n" ;
104 my $path = $url_path . "/" . $file ;
105 my $key = $file;
106 $key =~ s/(.+).vcf.gz/$1/s;
107 print LOG "Key = $key \n";
109 $filt_track = '
110 [ tracks . ' . $key . ' ]
111 hooks.modify = function( track, feature, div ) { div.style.backgroundColor = track.config.variantIsHeterozygous(feature);}
112 variantIsHeterozygous = function( feature ) {
113 var genotypes = feature.get(\'genotypes\');
114 for( var sampleName in genotypes ) {
115 try {
116 var gtString = genotypes[sampleName].GT.values[0];
117 if( ! /^1([\|\/]1)*$/.test( gtString) && ! /^0([\|\/]0)*$/.test( gtString ) )
118 return \'red\';
119 } catch(e) {}
121 if( /^1([\|\/]1)*$/.test( gtString) )
122 return \'blue\';
124 key = ' . $key .'
125 storeClass = JBrowse/Store/SeqFeature/VCFTabix
126 urlTemplate = ' . $path .'
127 category = Diversity/NEXTGEN/Unimputed
128 type = JBrowse/View/Track/HTMLVariants
129 metadata.Description = Homozygous reference: Green Heterozygous: Red Homozygous alternate: Blue Filtered to remove SNPs with a depth of 0 and SNPs with 2 or more alt alleles.
130 metadata.Link = <a href=ftp://cassavabase.org/jbrowse/diversity/igdBuildWithV6_hapmap_20150404_vcfs/' . $file . '>Download VCF File</a>
131 metadata.Provider = NEXTGEN Cassava project
132 metadata.Accession = ' . $name . '
133 label = ' . $key . '
135 push @tracks, $filt_track;
137 } elsif ($file_type eq 'imputed') { #Handle imputed vcf files
139 print STDERR "Working on imputed file $file \n" ;
140 print LOG "Working on imputed file $file \n" ;
141 my $path = $url_path . "/" . $file ;
142 my $key = $file;
143 $key =~ s/(.+).vcf.gz/$1/s;
144 print LOG "Key = $key \n";
146 $imp_track = '
147 [ tracks . ' . $key . ' ]
148 hooks.modify = function( track, feature, div ) { div.style.backgroundColor = track.config.variantIsHeterozygous(feature); div.style.opacity = track.config.variantIsImputed(feature) ? \'0.33\' : \'1.0\'; }
149 variantIsHeterozygous = function( feature ) {
150 var genotypes = feature.get(\'genotypes\');
151 for( var sampleName in genotypes ) {
152 try {
153 var gtString = genotypes[sampleName].GT.values[0];
154 if( ! /^1([\|\/]1)*$/.test( gtString) && ! /^0([\|\/]0)*$/.test( gtString ) )
155 return \'red\';
156 } catch(e) {}
158 if( /^1([\|\/]1)*$/.test( gtString) )
159 return \'blue\';
161 variantIsImputed = function( feature ) {
162 var genotypes = feature.get(\'genotypes\');
163 for( var sampleName in genotypes ) {
164 try {
165 var dpString = genotypes[sampleName].DP.values[0];
166 if( /^0$/.test( dpString) )
167 return true;
168 } catch(e) {}
170 return false;
172 key = ' . $key .'
173 storeClass = JBrowse/Store/SeqFeature/VCFTabix
174 urlTemplate = ' . $path .'
175 category = Diversity/NEXTGEN/Imputed
176 type = JBrowse/View/Track/HTMLVariants
177 metadata.Description = Homozygous reference: Green Heterozygous: Red Homozygous alternate: Blue Values imputed with GLMNET are shown at 1/3 opacity.
178 metadata.Link = <a href=ftp://cassavabase.org/jbrowse/diversity/igdBuildWithV6_hapmap_20150404_vcfs/' . $file . '>Download VCF File</a>
179 metadata.Provider = NEXTGEN Cassava project
180 metadata.Accession = ' . $name . '
181 label = ' . $key . '
183 push @tracks, $imp_track
185 } else {
187 next;
192 #-----------------------------------------------------------------------
193 # if matching vcf files were not found, skip to next accession name
194 #-----------------------------------------------------------------------
196 next unless (@tracks);
198 #-----------------------------------------------------------------------
199 # unless it already exisits, create dir for new jbrowse instance and create symlinks to files in base jbrowse instance
200 #-----------------------------------------------------------------------
202 unless (-d $id) {
203 `sudo mkdir -p $id`;
204 `sudo ln -sf $link_path/data_files $id/data_files`;
205 `sudo ln -sf $link_path/seq $id/seq`;
206 `sudo ln -sf $link_path/tracks $id/tracks`;
207 `sudo ln -sf $link_path/trackList.json $id/trackList.json`;
208 `sudo ln -sf $link_path/readme $id/readme`;
209 `sudo touch $id/tracks.conf && sudo chmod a+wrx $id/tracks.conf`;
211 #-----------------------------------------------------------------------
212 # also append dataset info to jbrowse.conf, and create and populate accession specific tracks.conf file
213 #-----------------------------------------------------------------------
215 my $dataset = "[datasets.Accession_$id]\n" . "url = ?data=data/json/accessions/$id\n" . "name = $name\n\n";
216 print CONF $dataset;
217 open (OUT, ">", $out) || die "Can't open out file $out! \n";
218 print OUT $header;
219 print OUT join ("\n", @tracks), "\n";
220 undef @tracks;
223 #-----------------------------------------------------------------------
224 # if we are just adding a new track, append to tracks.conf
225 #-----------------------------------------------------------------------
227 open (OUT, ">>", $out) || die "Can't open out file $out! \n";
228 print OUT join ("\n", @tracks), "\n";
229 undef @tracks;