bin/load_snps.pl

   1  #!/usr/bin/perl
   2
   3 # basic script to load snp genotypes
   4
   5 # usage: load_snps.pl -H hostname -D dbname -i infile [-o outfile] [-t]
   6
   7 # In general, row headings are the accession name (or synonym), which is looked up in the stock table, and column headings are the marker name or alias.
   8
   9 # copy and edit this file as necessary
  10 # common changes include the following:
  11
  12
  13 =head1 NAME
  14
  15
  16
  17 load_snps.pl - a script to load snp genotypes into the SGN database (see the sgn.snp table).
  18
  19 =head1 DESCRIPTION
  20
  21 usage: load_snps.pl -H hostname -D dbname -i infile [-o outfile] [-t]
  22
  23 Options:
  24
  25 =over 5
  26
  27 =item -H
  28
  29 The hostname of the server hosting the database.
  30
  31 =item -D
  32
  33 the name of the database
  34
  35 =item -t
  36
  37 (optional) test mode. Rollback after the script terminates. Database should not be affected. Good for test runs.
  38
  39
  40 =item -i
  41
  42 tab-delimited infile with the snp genotypes (one row per stock, one column per marker)
  43
  44 =item -o
  45
  46 outfile for catching errors and other messages
  47
  48 =back
  49
  50 The tab-delimited snp genotype file must have stocks and markers which already exist in the database.
  51 Non-existing markers will be skipped. A stock that cannot be found aborts the load, and all changes are rolled back.
  52
  53
  54 =head1 AUTHORS
  55
  56 Naama Menda <nm249@cornell.edu>
  57
  58
  59 =cut
  60
  61 use strict;
  62 use warnings;
  63
  64 use CXGN::Tools::File::Spreadsheet;
  65 use CXGN::Tools::Text;
  66 use File::Slurp;
  67 use Bio::Chado::Schema;
  68
  69 use CXGN::Marker;
  70 use CXGN::Marker::Tools;
  71 use CXGN::DB::Connection;
  72 use CXGN::DB::InsertDBH;
  73
  74 use Data::Dumper;
  75 use CXGN::DB::SQLWrappers;
  76
  77 use Getopt::Std;
  78
  79
  80 our ($opt_H, $opt_D, $opt_i, $opt_t, $opt_o);
  81
  82 getopts('H:D:ti:o:');
  83
  84
  85
  86 my $dbh = CXGN::DB::InsertDBH->new({
  87     dbname => $opt_D,
  88     dbhost => $opt_H,
  89     dbargs => {AutoCommit => 0,
  90                RaiseError => 1}
  91                                    });
  92 my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } , );
  93               #                           { on_connect_do => ['SET search_path TO public'], }, );
  94
  95 my $sql=CXGN::DB::SQLWrappers->new($dbh);
  96
  97 eval {
  98
  99     # make an object to give us the values from the spreadsheet
 100     my $ss = CXGN::Tools::File::Spreadsheet->new($opt_i);
 101     my @stocks = $ss->row_labels(); # row labels are the marker names
 102     my @markers = $ss->column_labels(); # column labels are the headings for the data columns
 103
 104     for my $stock_name (@stocks) {
 105         print "stockname = $stock_name\n";
 106         my $stock_id = $schema->resultset("Cv::Cvterm")->search( {
 107             name => 'solcap number' } )->
 108                 search_related('stockprops' , { value => $stock_name } )->
 109                 first->stock_id or die("No stock found for solcap number $stock_name! \n\n");
 110         message( "*************Stock name = $stock_name, id = $stock_id\n" );
 111         for my $marker_name (@markers) {
 112             print "marker: $marker_name\n";
 113             my @marker_ids =  CXGN::Marker::Tools::marker_name_to_ids($dbh,$marker_name);
 114             if (@marker_ids>1) { die "Too many IDs found for marker '$marker_name'" }
 115             # just get the first ID in the list (if the list is longer than 1, we've already died)
 116             my $marker_id = $marker_ids[0];
 117
 118             if(!$marker_id) {
 119                 message("Marker $marker_name does not exist! Skipping!!\n");
 120                 next;
 121             }
 122             else {  message( "Marker name : $marker_name, marker_id found: $marker_id\n" ) ; }
 123
 124             my $genotype=$ss->value_at($stock_name,$marker_name)
 125                 or message("No genotype found for stock $stock_name and marker $marker_name!");
 126             print "genotype: $genotype\n";
 127             if ($genotype !~ /[a-zA-Z]/ ) {
 128                 message("non-snp genotype ($genotype) . Skipping!!");
 129                 next;
 130             }
 131             my $snp_genotype =$sql->insert_unless_exists('snp',{marker_id=>$marker_id, snp_nucleotide => $genotype, stock_id=> $stock_id } );
 132         }
 133     }
 134 };
 135
 136 if ($@) {
 137     print $@;
 138     print "Failed; rolling back.\n";
 139     $dbh->rollback();
 140 }
 141 else {
 142     print"Succeeded.\n";
 143     if ($opt_t) {
 144         print"Rolling back.\n";
 145         $dbh->rollback();
 146     }
 147     else  {
 148         print"Committing.\n";
 149         $dbh->commit();
 150     }
 151 }
 152
 153 sub message {
 154     my $message = shift;
 155     print $message;
 156     write_file( $opt_o,  {append => 1 }, $message . "\n" )  if $opt_o;
 157 }