bin/jbrowse_vcf_tools/gen_accession_jbrowse.pl

   1 #!/usr/bin/perl -w
   2
   3 =head1
   4
   5 gen_accession_jbrowse.pl - creates jbrowse instances that include base tracks and accession vcf tracks for each accession name supplied
   6
   7 =head1 SYNOPSIS
   8
   9     gen_accession_jbrowse.pl -v [absolute path through base instance to dir containing vcfs] -b [absolute path to base instance] -d [name of database to search for trials] -h [db host]
  10
  11 =head1 COMMAND-LINE OPTIONS
  12
  13  -v absolute path through base instance to dir containing vcfs
  14  -b absolute path to base instance
  15  -h database hostname
  16  -d database name
  17
  18 =head1 DESCRIPTION
  19
  20 Takes a directory containing individual vcf files, including filtered and imputed versions, and creates or updates jbrowse instances by symlinking to all necessary files in a base instance,  then generating a uniq tracks.conf file and appending dataset info to jbrowse.conf. Should be run from /data/json/accessions dir in jbrowse instance.
  21
  22 =head1 AUTHOR
  23
  24 Bryan Ellerbrock (bje24@cornell.edu) - Oct 2015
  25
  26 =cut
  27
  28 use strict;
  29 use File::Slurp;
  30 use Getopt::Std;
  31 use Bio::Chado::Schema;
  32 use CXGN::DB::InsertDBH;
  33
  34 our ($opt_v, $opt_b, $opt_h, $opt_d);
  35
  36 #-----------------------------------------------------------------------
  37 # define paths & array of vcf_file names. Open file handles to read accession lists and append datasets to jbrowse.conf
  38 #-----------------------------------------------------------------------
  39
  40 getopts('v:b:h:d:');
  41
  42 my $vcf_dir_path = $opt_v;
  43 my $dbhost = $opt_h;
  44 my $dbname = $opt_d;
  45 my $link_path = $opt_b;
  46 my $url_path = $opt_v;
  47 $url_path =~ s:$link_path/(.+):$1:;
  48 print STDERR "url path = $url_path \n";
  49
  50 my ($file_type,$out,$header,@tracks,$imp_track,$filt_track,$h,$q,$dbh);
  51 my @files = ` dir -GD -1 --hide *.tbi $vcf_dir_path ` ;
  52
  53 open (CONF, ">>", "../../../jbrowse.conf") || die "Can't open conf file jbrowse.conf!\n";
  54
  55 open (LOG, ">>", "gen_accession.log") || die "Can't open log file!\n";
  56
  57 #-----------------------------------------------------------------------
  58 # connect to database and extract unique acccesion names
  59 #-----------------------------------------------------------------------
  60
  61 #my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() });
  62
  63 $dbh = CXGN::DB::InsertDBH->new( { dbhost=>$dbhost,
  64                                       dbname=>$dbname,
  65                                    dbargs => {AutoCommit => 0,
  66                                               RaiseError => 1}
  67                                  }
  68     );
  69
  70
  71 $q = "SELECT distinct(stock.stock_id), stock.uniquename from stock where type_id = 76392 order by stock.uniquename";
  72
  73 $h=$dbh->prepare($q);
  74
  75 $h->execute();
  76
  77 #-----------------------------------------------------------------------
  78 # for each accession name, locate matching indiv vcf files and construct necessary text for tracks.conf
  79 #-----------------------------------------------------------------------
  80
  81 #print STDERR "fetchrow array = $h->fetchrow_array \n";
  82
  83 while (my ($id, $name) = $h->fetchrow_array) {
  84     chomp $id;
  85     chomp $name;
  86     print LOG "id = $id and name = $name \n";
  87     $out = "$id/tracks.conf";
  88     $header = "[general]\ndataset_id = Accession_$id\n";
  89
  90     for my $file (@files) {
  91         chomp $file;
  92         $_ = $file;
  93         next if !m/filtered/ && !m/imputed/;
  94         $_ =~ s/([^.]+)_2015_V6.*/$1/s;
  95         next if ($_ ne $name);
  96         print STDERR "Matched vcf file basename $_ to accession name $name !\n";
  97         print LOG "Matched vcf file basename $_ to accession name $name !\n";
  98         $file_type = $file;
  99         $file_type =~ s/.+_([imputedflr]+).vcf.gz/$1/s;
 100         print LOG "filetype = $file_type \n";
 101         if ($file_type eq 'filtered') {                            #Handle filtered vcf files
 102             print LOG "Working on filtered file $file \n" ;
 103             print STDERR "Working on filtered file $file \n" ;
 104             my $path = $url_path . "/" . $file ;
 105             my $key = $file;
 106             $key =~ s/(.+).vcf.gz/$1/s;
 107             print LOG "Key = $key \n";
 108
 109             $filt_track = '
 110 [ tracks . ' . $key . ' ]
 111     hooks.modify = function( track, feature, div ) { div.style.backgroundColor = track.config.variantIsHeterozygous(feature);}
 112 variantIsHeterozygous = function( feature ) {
 113     var genotypes = feature.get(\'genotypes\');
 114     for( var sampleName in genotypes ) {
 115         try {
 116             var gtString = genotypes[sampleName].GT.values[0];
 117             if( ! /^1([\|\/]1)*$/.test( gtString) && ! /^0([\|\/]0)*$/.test( gtString ) )
 118                 return \'red\';
 119         } catch(e) {}
 120     }
 121     if( /^1([\|\/]1)*$/.test( gtString) )
 122                 return \'blue\';
 123   }
 124 key = ' . $key  .'
 125 storeClass = JBrowse/Store/SeqFeature/VCFTabix
 126 urlTemplate = ' . $path .'
 127 category = Diversity/NEXTGEN/Unimputed
 128 type = JBrowse/View/Track/HTMLVariants
 129 metadata.Description = Homozygous reference: Green      Heterozygous: Red               Homozygous alternate: Blue      Filtered to remove SNPs with a depth of 0 and SNPs with 2 or more alt alleles.
 130 metadata.Link = <a href=ftp://cassavabase.org/jbrowse/diversity/igdBuildWithV6_hapmap_20150404_vcfs/' . $file . '>Download VCF File</a>
 131 metadata.Provider = NEXTGEN Cassava project
 132 metadata.Accession = ' . $name . '
 133 label = ' . $key  . '
 134 ' ;
 135 push @tracks, $filt_track;
 136
 137         } elsif ($file_type eq 'imputed') {                        #Handle imputed vcf files
 138
 139             print STDERR "Working on imputed file $file \n" ;
 140             print LOG "Working on imputed file $file \n" ;
 141             my $path = $url_path . "/" . $file ;
 142             my $key = $file;
 143             $key =~ s/(.+).vcf.gz/$1/s;
 144             print LOG "Key = $key \n";
 145
 146             $imp_track = '
 147 [ tracks . ' . $key . ' ]
 148 hooks.modify = function( track, feature, div ) { div.style.backgroundColor = track.config.variantIsHeterozygous(feature);  div.style.opacity = track.config.variantIsImputed(feature) ? \'0.33\' : \'1.0\'; }
 149 variantIsHeterozygous = function( feature ) {
 150     var genotypes = feature.get(\'genotypes\');
 151     for( var sampleName in genotypes ) {
 152         try {
 153             var gtString = genotypes[sampleName].GT.values[0];
 154             if( ! /^1([\|\/]1)*$/.test( gtString) && ! /^0([\|\/]0)*$/.test( gtString ) )
 155                 return \'red\';
 156         } catch(e) {}
 157     }
 158         if( /^1([\|\/]1)*$/.test( gtString) )
 159                 return \'blue\';
 160   }
 161 variantIsImputed = function( feature ) {
 162     var genotypes = feature.get(\'genotypes\');
 163     for( var sampleName in genotypes ) {
 164         try {
 165             var dpString = genotypes[sampleName].DP.values[0];
 166             if( /^0$/.test( dpString) )
 167                 return true;
 168         } catch(e) {}
 169     }
 170     return false;
 171   }
 172 key = ' . $key  .'
 173 storeClass = JBrowse/Store/SeqFeature/VCFTabix
 174 urlTemplate = ' . $path .'
 175 category = Diversity/NEXTGEN/Imputed
 176 type = JBrowse/View/Track/HTMLVariants
 177 metadata.Description = Homozygous reference: Green      Heterozygous: Red               Homozygous alternate: Blue      Values imputed with GLMNET are shown at 1/3 opacity.
 178 metadata.Link = <a href=ftp://cassavabase.org/jbrowse/diversity/igdBuildWithV6_hapmap_20150404_vcfs/' . $file . '>Download VCF File</a>
 179 metadata.Provider = NEXTGEN Cassava project
 180 metadata.Accession = ' . $name . '
 181 label = ' . $key  . '
 182 ' ;
 183 push @tracks, $imp_track
 184
 185         } else {
 186
 187         next;
 188
 189         }
 190 }
 191
 192 #-----------------------------------------------------------------------
 193 # if matching vcf files were not found, skip to next accession name
 194 #-----------------------------------------------------------------------
 195
 196 next unless (@tracks);
 197
 198 #-----------------------------------------------------------------------
 199 # unless it already exisits, create dir for new jbrowse instance and create symlinks to files in base jbrowse instance
 200 #-----------------------------------------------------------------------
 201
 202 unless (-d $id) {
 203     `sudo mkdir -p $id`;
 204     `sudo ln -sf $link_path/data_files $id/data_files`;
 205     `sudo ln -sf $link_path/seq $id/seq`;
 206     `sudo ln -sf $link_path/tracks $id/tracks`;
 207     `sudo ln -sf $link_path/trackList.json $id/trackList.json`;
 208     `sudo ln -sf $link_path/readme $id/readme`;
 209     `sudo touch $id/tracks.conf && sudo chmod a+wrx $id/tracks.conf`;
 210
 211 #-----------------------------------------------------------------------
 212 # also append dataset info to jbrowse.conf, and create and populate accession specific tracks.conf file
 213 #-----------------------------------------------------------------------
 214
 215 my $dataset = "[datasets.Accession_$id]\n" . "url  = ?data=data/json/accessions/$id\n" . "name = $name\n\n";
 216 print CONF $dataset;
 217 open (OUT, ">", $out) || die "Can't open out file $out! \n";
 218 print OUT $header;
 219 print OUT join ("\n", @tracks), "\n";
 220 undef @tracks;
 221 }
 222
 223 #-----------------------------------------------------------------------
 224 # if we are just adding a new track, append to tracks.conf
 225 #-----------------------------------------------------------------------
 226
 227 open (OUT, ">>", $out) || die "Can't open out file $out! \n";
 228 print OUT join ("\n", @tracks), "\n";
 229 undef @tracks;
 230 }
 231