Merge pull request #5199 from solgenomics/topic/tracking_transformation
[sgn.git] / bin / load_blast.pl
blobd31c08d31520790b8dfffd16bdb52dba13c3f9fc
1 #!/usr/bin/perl
3 =head1 NAME
5 load_blast.pl - loading blast tables into cxgn databases
7 =head1 SYNOPSIS
9 load_blast.pl -H [dbhost] -D [dbname] -i [infile]
11 =head1 COMMAND-LINE OPTIONS
13 =head2 ARGUMENTS
15 =over 5
17 =item -H
19 host name (required) e.g. "localhost"
21 =item -D
23 database name (required) e.g. "cxgn_cassava"
25 =item -i
27 path to infile (required)
29 =back
31 =head2 FLAGS
33 =over 5
35 =item -t
37 Test run. Rolling back at the end.
39 =back
41 =head1 DESCRIPTION
43 This script populates blast tables (sgn.blast_db, sgn.blast_db_group and the linking table). Each column in the spreadsheet represents a single blast fasta file. Connections to the blast_group are also made.
45 The input file is xlsx format, and should have the following columns (column order is not important):
47 file_base # starts where blast_path ends in sgn_local.conf
48 title
49 type # either nucleotide or protein
50 source_url
51 lookup_url
52 update_freq
53 info_url
54 index_seqs
55 blast_db_group
56 web_interface_visible
57 description
58 jbrowse_src
60 Only file_base, title, type and blast_db_group are required. web_interface_visible is 'T' by default.
62 =head1 AUTHOR
64 Lukas Mueller <lam87@cornell.edu>, Nov 2022
66 =cut
68 use strict;
70 use Getopt::Std;
71 use Data::Dumper;
72 use JSON::Any;
73 use JSON::PP;
74 use Carp qw /croak/ ;
75 use Try::Tiny;
76 use Pod::Usage;
77 use Spreadsheet::XLSX;
78 use Bio::Chado::Schema;
79 use CXGN::People::Person;
80 use CXGN::People::Schema;
81 use SGN::Schema;
82 use CXGN::DB::InsertDBH;
83 use SGN::Model::Cvterm;
84 use Text::Iconv;
# Command-line options (see POD): -H host, -D database, -i input file,
# -t test run (roll back at the end).
our ($opt_H, $opt_D, $opt_i, $opt_t);

getopts('H:i:tD:');

# Copy the Getopt::Std package globals into descriptive lexicals.
my ($dbhost, $dbname, $file) = ($opt_H, $opt_D, $opt_i);

# Echo the run configuration so load logs are self-describing.
print STDERR "Input file: $file\n";
print STDERR "DB host: $dbhost\n";
print STDERR "DB name: $dbname\n";
print STDERR "Rollback: $opt_t\n";
# Bail out with the full POD usage text unless all required options were given
# (pod2usage exits the script).
# NOTE(review): the closing braces for this span were elided in the extracted
# source; restored here to make the code syntactically valid.
if (!$opt_H || !$opt_D || !$opt_i) {
    pod2usage(-verbose => 2, -message => "Must provide options -H (hostname), -D (database name), -i (input file)\n");
}

# One physical connection, shared by both schemas below, so all statements
# participate in the same transaction.
my $dbh = CXGN::DB::InsertDBH->new( { dbhost => $dbhost,
                                      dbname => $dbname,
                                      dbargs => { AutoCommit => 1,
                                                  RaiseError => 1 }
                                    } );

my $sgn_schema = SGN::Schema->connect( sub { $dbh->get_actual_dbh() } );

my $schema = Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } );

# The blast tables live in the sgn schema; widen the search path so they are
# reachable without schema qualification.
$dbh->do('SET search_path TO public,sgn,sgn_people');

# Spreadsheet::XLSX requires a Text::Iconv converter for cell contents.
my $converter = Text::Iconv->new("utf-8", "windows-1251");

my $excel = Spreadsheet::XLSX->new($opt_i, $converter);
# All loading work happens inside this coderef so it can run in a single
# transaction via $schema->txn_do() below.
# NOTE(review): many closing braces in this span were elided in the extracted
# source; restored here. Also: the original re-parsed the header row once per
# data row (same result every iteration) -- it is now parsed once per sheet.
my $coderef = sub {

    foreach my $sheet (@{ $excel->{Worksheet} }) {

        printf("Sheet: %s\n", $sheet->{Name});

        # Spreadsheet::XLSX may leave MaxRow undef for empty sheets.
        $sheet->{MaxRow} ||= $sheet->{MinRow};

        print STDERR "MIN ROW = ".$sheet->{MinRow}."\n";

        # --- parse header (row 0) ---
        my @required_headers = qw | file_base title type blast_db_group |;

        my %ch;       # header name -> column index
        my @header = @{ $sheet->{Cells}->[0] };

        for (my $i = 0; $i < @header; $i++) {
            print STDERR $header[$i]->{Val}."\n";
            $ch{ $header[$i]->{Val} } = $i;
        }

        print STDERR "HEADERS: ".Dumper(\%ch);
        print STDERR "REQUIRED: ".Dumper(\@required_headers);

        my @missing_headers;
        foreach my $h (@required_headers) {
            if (!exists($ch{$h})) {
                push @missing_headers, $h;
            }
        }

        if (@missing_headers) {
            print STDERR "Required headers include: ".join(", ", @required_headers)."\n";
            print STDERR "Missing: ".join(", ", @missing_headers)."\n";
            die "Check file format for header requirements.";
        }

        # --- data rows (row 0 is the header) ---
        foreach my $row (1 .. $sheet->{MaxRow}) {

            $sheet->{MaxCol} ||= $sheet->{MinCol};

            my @fields = map { $_->{Val} } @{ $sheet->{Cells}->[$row] };

            my %data;
            for (my $n = 0; $n < @header; $n++) {
                if ($fields[$n]) {   # NOTE: also skips cells holding 0 or ''
                    $data{ $header[$n]->{Val} } = $fields[$n];
                }
            }

            print STDERR "DATA: ".Dumper(\%data);

            if (!$data{file_base} || !$data{title} || !$data{type} || !$data{blast_db_group}) {
                print STDERR "Not enough information provided in row ".join(", ", @fields).". Skipping.\n";
                next();
            }

            # Group row is created on demand; ordinal 20 is the default sort
            # position for new groups.
            my $group_row = $sgn_schema->resultset("BlastDbGroup")->find_or_create( { name => $data{blast_db_group}, ordinal => 20 } );

            my $blast_db_group_id = $group_row->blast_db_group_id();

            # Renamed from $row to avoid shadowing the row index above.
            my $db_row = $sgn_schema->resultset("BlastDb")->find( { title => $data{title} } );

            # NOTE(review): the POD lists a jbrowse_src column, but it is not
            # copied into the insert hash below -- confirm whether sgn.blast_db
            # has that column before adding it.
            my $insert = {
                file_base => $data{file_base},
                title => $data{title},
                type => $data{type},
                source_url => $data{source_url},
                lookup_url => $data{lookup_url},
                update_freq => $data{update_freq},
                info_url => $data{info_url},
                index_seqs => $data{index_seqs},
                blast_db_group_id => $blast_db_group_id,
                web_interface_visible => $data{web_interface_visible} || 'T',
                description => $data{description},
            };

            if ($db_row) {
                print STDERR "updating blast dataset $data{title}...\n";
                $db_row->update($insert);
            }
            else {
                $db_row = $sgn_schema->resultset("BlastDb")->find_or_create($insert);
            }

            # Maintain the blast_db <-> blast_db_group link table.
            my $grow = $sgn_schema->resultset("BlastDbBlastDbGroup")->find_or_create(
                {
                    blast_db_id => $db_row->blast_db_id(),
                    blast_db_group_id => $blast_db_group_id,
                });
        }
    }
};
# Execute the load in one transaction. txn_do() commits automatically when the
# coderef succeeds and rolls back if it dies.
# NOTE(review): because txn_do() has already committed by the time it returns,
# dying in the -t branch below cannot roll the transaction back -- confirm; the
# die probably needs to happen inside $coderef for -t to work as advertised.
try {
    $schema->txn_do($coderef);

    if (!$opt_t) {
        print "Transaction succeeded! Committing user data! \n\n";
    }
    else {
        die "Not storing, rolling back\n";
    }
} catch {
    # Transaction failed
    die "An error occurred! Rolling back!" . $_ . "\n";
};

$dbh->disconnect();