5 load_blast.pl - loading blast tables into cxgn databases
9 load_blast.pl -H [dbhost] -D [dbname] -i [infile]
11 =head1 COMMAND-LINE OPTIONS
19 host name (required) e.g. "localhost"
23 database name (required) e.g. "cxgn_cassava"
27 path to infile (required)
37 Test run. Rolling back at the end.
43 This script populates blast tables (sgn.blast_db, sgn.blast_db_group and the linking table). Each column in the spreadsheet represents a single blast fasta file. Connections to the blast_group are also made.
45 The input file is xlsx format, and should have the following columns (column order is not important):
47 file_base # starts where blast_path ends in sgn_local.conf
49 type # either nucleotide or protein
60 Only file_base, title, type and blast_db_group are required. web_interface_visible is 'T' by default.
64 Lukas Mueller <lam87@cornell.edu>, Nov 2022
77 use Spreadsheet
::XLSX
;
78 use Bio
::Chado
::Schema
;
79 use CXGN
::People
::Person
;
80 use CXGN
::People
::Schema
;
82 use CXGN
::DB
::InsertDBH
;
83 use SGN
::Model
::Cvterm
;
86 our ($opt_H, $opt_D, $opt_i, $opt_t);
94 print STDERR
"Input file: $file\n";
95 print STDERR
"DB host: $dbhost\n";
96 print STDERR
"DB name: $dbname\n";
97 print STDERR
"Rollback: $opt_t\n";
99 if (!$opt_H || !$opt_D || !$opt_i) {
100 pod2usage
(-verbose
=> 2, -message
=> "Must provide options -H (hostname), -D (database name), -i (input file)\n");
103 my $dbh = CXGN
::DB
::InsertDBH
->new( { dbhost
=>$dbhost,
105 dbargs
=> {AutoCommit
=> 1,
110 my $sgn_schema = SGN
::Schema
->connect( sub { $dbh->get_actual_dbh() } );
112 my $schema= Bio
::Chado
::Schema
->connect( sub { $dbh->get_actual_dbh() } );
113 $dbh->do('SET search_path TO public,sgn,sgn_people');
115 my $converter = Text
::Iconv
->new("utf-8", "windows-1251");
117 my $excel = Spreadsheet
::XLSX
->new($opt_i, $converter);
121 foreach my $sheet (@
{$excel->{Worksheet
}}) {
123 printf("Sheet: %s\n", $sheet->{Name
});
125 $sheet->{MaxRow
} ||= $sheet->{MinRow
};
127 print STDERR
"MIN ROW = ".$sheet->{MinRow
}."\n";
131 my @required_headers = qw
| file_base title type blast_db_group
|;
136 foreach my $row ($sheet->{MinRow
} .. $sheet->{MaxRow
}) {
137 @header = @
{$sheet->{Cells
}->[0]};
139 for(my $i =0; $i< @header; $i++) {
140 print STDERR
$header[$i]->{Val
}."\n";
141 $ch{$header[$i]->{Val
} } = $i;
145 print STDERR
"HEADERS: ".Dumper
(\
%ch);
146 print STDERR
"REQUIRED: ".Dumper
(\
@required_headers);
149 foreach my $h (@required_headers) {
150 if (!exists($ch{$h})) {
151 push @missing_headers, $h;
155 if (@missing_headers) {
156 print STDERR
"Required headers include: ". join(", ", @required_headers)."\n";
157 print STDERR
"Missing: ".join(", ", @missing_headers)."\n";
159 die "Check file format for header requirements.";
162 foreach my $row (1 .. $sheet->{MaxRow
}) {
164 $sheet->{MaxCol
} ||= $sheet->{MinCol
};
166 my @fields = map { $_ ->{Val
} } @
{$sheet->{Cells
}->[$row]};
169 for(my $n =0; $n< @header; $n++) {
171 $data{$header[$n]->{Val
}} = $fields[$n];
175 print STDERR
"DATA: ".Dumper
(\
%data);
177 if (!$data{file_base
} || !$data{title
} || !$data{type
} || !$data{blast_db_group
}) {
178 print STDERR
"Not enough information provided in row ".join(", ", @fields).". Skipping.\n";
182 my $group_row = $sgn_schema->resultset("BlastDbGroup")->find_or_create( { name
=> $data{blast_db_group
}, ordinal
=> 20 });
184 my $blast_db_group_id;
188 # $group_row = $sgn_schema->resultset("BlastDbGroup")->find_or_create(#
190 # name => $data{blast_db_group},
194 $blast_db_group_id = $group_row->blast_db_group_id();
197 my $row = $sgn_schema->resultset("BlastDb")->find( { title
=> $data{title
} } );
200 file_base
=> $data{file_base
},
201 title
=> $data{title
},
203 source_url
=> $data{source_url
},
204 lookup_url
=> $data{lookup_url
},
205 update_freq
=> $data{update_freq
},
206 info_url
=> $data{info_url
},
207 index_seqs
=> $data{index_seqs
},
208 blast_db_group_id
=> $blast_db_group_id,
209 web_interface_visible
=> $data{web_interface_visible
} || 'T',
210 description
=> $data{description
},
215 print STDERR
"upading blast dataset $data{title}...\n";
222 $row = $sgn_schema->resultset("BlastDb")->find_or_create($data);
225 my $grow = $sgn_schema->resultset("BlastDbBlastDbGroup")->find_or_create(
227 blast_db_id
=> $row->blast_db_id(),
228 blast_db_group_id
=> $blast_db_group_id,
235 $schema->txn_do($coderef);
237 print "Transaction succeeded! Commiting user data! \n\n";
240 die "Not storing, rolling back\n";
245 die "An error occured! Rolling back!" . $_ . "\n";