Merge pull request #5199 from solgenomics/topic/tracking_transformation
[sgn.git] / bin / load_blast.pl
blobd31c08d31520790b8dfffd16bdb52dba13c3f9fc
1 #!/usr/bin/perl
3 =head1 NAME
5 load_blast.pl - loading blast tables into cxgn databases
7 =head1 SYNOPSIS
9 load_blast.pl -H [dbhost] -D [dbname] -i [infile]
11 =head1 COMMAND-LINE OPTIONS
13 =head2 ARGUMENTS
15 =over 5
17 =item -H
19 host name (required) e.g. "localhost"
21 =item -D
23 database name (required) e.g. "cxgn_cassava"
25 =item -i
27 path to infile (required)
29 =back
31 =head2 FLAGS
33 =over 5
35 =item -t
37 Test run. Rolling back at the end.
39 =back
41 =head1 DESCRIPTION
43 This script populates blast tables (sgn.blast_db, sgn.blast_db_group and the linking table). Each column in the spreadsheet represents a single blast fasta file. Connections to the blast_group are also made.
45 The input file is xlsx format, and should have the following columns (column order is not important):
47 file_base # starts where blast_path ends in sgn_local.conf
48 title
49 type # either nucleotide or protein
50 source_url
51 lookup_url
52 update_freq
53 info_url
54 index_seqs
55 blast_db_group
56 web_interface_visible
57 description
58 jbrowse_src
60 Only file_base, title, type and blast_db_group are required. web_interface_visible is 'T' by default.
62 =head1 AUTHOR
64 Lukas Mueller <lam87@cornell.edu>, Nov 2022
66 =cut
68 use strict;
70 use Getopt::Std;
71 use Data::Dumper;
72 use JSON::Any;
73 use JSON::PP;
74 use Carp qw /croak/ ;
75 use Try::Tiny;
76 use Pod::Usage;
77 use Spreadsheet::XLSX;
78 use Bio::Chado::Schema;
79 use CXGN::People::Person;
80 use CXGN::People::Schema;
81 use SGN::Schema;
82 use CXGN::DB::InsertDBH;
83 use SGN::Model::Cvterm;
84 use Text::Iconv;
# Command-line options (see POD): -H host, -D database, -i input file,
# -t test run (roll back at the end).
our ($opt_H, $opt_D, $opt_i, $opt_t);

getopts('H:i:tD:');

# Copy the Getopt::Std package globals into descriptive lexicals.
my ($dbhost, $dbname, $file) = ($opt_H, $opt_D, $opt_i);

# Echo the run configuration so load logs are self-describing.
print STDERR "Input file: $file\n";
print STDERR "DB host: $dbhost\n";
print STDERR "DB name: $dbname\n";
print STDERR "Rollback: $opt_t\n";
# Bail out with the full POD usage text unless all required options were given
# (pod2usage exits the script).
# NOTE(review): the closing braces for this span were elided in the extracted
# source; restored here to make the code syntactically valid.
if (!$opt_H || !$opt_D || !$opt_i) {
    pod2usage(-verbose => 2, -message => "Must provide options -H (hostname), -D (database name), -i (input file)\n");
}

# One physical connection, shared by both schemas below, so all statements
# participate in the same transaction.
my $dbh = CXGN::DB::InsertDBH->new( { dbhost => $dbhost,
                                      dbname => $dbname,
                                      dbargs => { AutoCommit => 1,
                                                  RaiseError => 1 }
                                    } );

my $sgn_schema = SGN::Schema->connect( sub { $dbh->get_actual_dbh() } );

my $schema = Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } );

# The blast tables live in the sgn schema; widen the search path so they are
# reachable without schema qualification.
$dbh->do('SET search_path TO public,sgn,sgn_people');

# Spreadsheet::XLSX requires a Text::Iconv converter for cell contents.
my $converter = Text::Iconv->new("utf-8", "windows-1251");

my $excel = Spreadsheet::XLSX->new($opt_i, $converter);
# All loading work happens inside this coderef so it can run in a single
# transaction via $schema->txn_do() below.
# NOTE(review): many closing braces in this span were elided in the extracted
# source; restored here. Also: the original re-parsed the header row once per
# data row (same result every iteration) -- it is now parsed once per sheet.
my $coderef = sub {

    foreach my $sheet (@{ $excel->{Worksheet} }) {

        printf("Sheet: %s\n", $sheet->{Name});

        # Spreadsheet::XLSX may leave MaxRow undef for empty sheets.
        $sheet->{MaxRow} ||= $sheet->{MinRow};

        print STDERR "MIN ROW = ".$sheet->{MinRow}."\n";

        # --- parse header (row 0) ---
        my @required_headers = qw | file_base title type blast_db_group |;

        my %ch;       # header name -> column index
        my @header = @{ $sheet->{Cells}->[0] };

        for (my $i = 0; $i < @header; $i++) {
            print STDERR $header[$i]->{Val}."\n";
            $ch{ $header[$i]->{Val} } = $i;
        }

        print STDERR "HEADERS: ".Dumper(\%ch);
        print STDERR "REQUIRED: ".Dumper(\@required_headers);

        my @missing_headers;
        foreach my $h (@required_headers) {
            if (!exists($ch{$h})) {
                push @missing_headers, $h;
            }
        }

        if (@missing_headers) {
            print STDERR "Required headers include: ".join(", ", @required_headers)."\n";
            print STDERR "Missing: ".join(", ", @missing_headers)."\n";
            die "Check file format for header requirements.";
        }

        # --- data rows (row 0 is the header) ---
        foreach my $row (1 .. $sheet->{MaxRow}) {

            $sheet->{MaxCol} ||= $sheet->{MinCol};

            my @fields = map { $_->{Val} } @{ $sheet->{Cells}->[$row] };

            my %data;
            for (my $n = 0; $n < @header; $n++) {
                if ($fields[$n]) {   # NOTE: also skips cells holding 0 or ''
                    $data{ $header[$n]->{Val} } = $fields[$n];
                }
            }

            print STDERR "DATA: ".Dumper(\%data);

            if (!$data{file_base} || !$data{title} || !$data{type} || !$data{blast_db_group}) {
                print STDERR "Not enough information provided in row ".join(", ", @fields).". Skipping.\n";
                next();
            }

            # Group row is created on demand; ordinal 20 is the default sort
            # position for new groups.
            my $group_row = $sgn_schema->resultset("BlastDbGroup")->find_or_create( { name => $data{blast_db_group}, ordinal => 20 } );

            my $blast_db_group_id = $group_row->blast_db_group_id();

            # Renamed from $row to avoid shadowing the row index above.
            my $db_row = $sgn_schema->resultset("BlastDb")->find( { title => $data{title} } );

            # NOTE(review): the POD lists a jbrowse_src column, but it is not
            # copied into the insert hash below -- confirm whether sgn.blast_db
            # has that column before adding it.
            my $insert = {
                file_base => $data{file_base},
                title => $data{title},
                type => $data{type},
                source_url => $data{source_url},
                lookup_url => $data{lookup_url},
                update_freq => $data{update_freq},
                info_url => $data{info_url},
                index_seqs => $data{index_seqs},
                blast_db_group_id => $blast_db_group_id,
                web_interface_visible => $data{web_interface_visible} || 'T',
                description => $data{description},
            };

            if ($db_row) {
                print STDERR "updating blast dataset $data{title}...\n";
                $db_row->update($insert);
            }
            else {
                $db_row = $sgn_schema->resultset("BlastDb")->find_or_create($insert);
            }

            # Maintain the blast_db <-> blast_db_group link table.
            my $grow = $sgn_schema->resultset("BlastDbBlastDbGroup")->find_or_create(
                {
                    blast_db_id => $db_row->blast_db_id(),
                    blast_db_group_id => $blast_db_group_id,
                });
        }
    }
};
# Execute the load in one transaction. txn_do() commits automatically when the
# coderef succeeds and rolls back if it dies.
# NOTE(review): because txn_do() has already committed by the time it returns,
# dying in the -t branch below cannot roll the transaction back -- confirm; the
# die probably needs to happen inside $coderef for -t to work as advertised.
try {
    $schema->txn_do($coderef);

    if (!$opt_t) {
        print "Transaction succeeded! Committing user data! \n\n";
    }
    else {
        die "Not storing, rolling back\n";
    }
} catch {
    # Transaction failed
    die "An error occurred! Rolling back!" . $_ . "\n";
};

$dbh->disconnect();