make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / bin / bdb_update_blast_dbs.pl
blob8a32b98104be1ce234ef228f9219dcdb1dbb5c63
1 #!/usr/bin/env perl
3 use strict;
4 use warnings;
5 use English;
6 use Carp;
7 #$Carp::Verbose = 1;
8 use FindBin;
9 use Getopt::Std;
10 use Data::Dumper;
11 use File::Spec;
12 use File::Temp qw/tempfile/;
14 use SGN::Schema;
15 use CXGN::Blast;
16 use CXGN::DB::InsertDBH;
# Print an optional error message followed by the command-line help, then
# terminate the script with die (the text therefore goes to STDERR).
#
# Arguments:
#   $sgn_schema - schema handle; accepted so existing calls keep working,
#                 but not used here (the per-database file_base listing
#                 that needed it is disabled).
#   $message    - optional error text; shown as "Error: <text>" above the
#                 help when non-empty.
#
# Never returns.
sub usage {
    my $sgn_schema = shift;
    my $message    = shift || '';
    $message = "Error: $message\n" if $message;

    # The option list below must stay in sync with the getopts() spec
    # ('axt:d:f:H:D:p:U:h') used by the main script.
    die <<EOU;
$message
Usage:

  $FindBin::Script [ options ] -d <path>

  Go over all the BLAST databases we keep in stock and update them if
  needed.  When run with just the -d option, goes over all the BLAST
  dbs listed in the sgn.blast_db table and updates them if needed,
  putting them under the top-level BLAST db path given with the -d
  option.

  Options:

  -H <dbhost>

  -D <dbname>

  -p <password>  (if not supplied, will prompt)

  -U <dbuser>    (if -p option is supplied)

  -d <path>      required.  path where all blast DB files are expected to go.

  -t <path>      path to put tempfiles.  must be writable.  Defaults to /tmp.

  -x             dry run, just print what you would update

  -f <db name>   force-update the DB with the given file base (e.g. 'genbank/nr')

  -a             force update all dbs in blast dir - override needs_update and
                 run makeblastdb on all blast datasets found in filebase

  -h             print this help message and exit

EOU
}
# Command-line options, filled in by getopts (see usage() for their meaning).
our %opt;
getopts('axt:d:f:H:D:p:U:h',\%opt) or die "Invalid arguments";

$opt{t} ||= File::Spec->tmpdir;

# Handle -h and validate -d BEFORE opening any database connection, so that
# asking for help or mistyping the path neither needs a reachable database
# nor triggers a password prompt.  usage() does not use its schema argument,
# so undef is passed; usage() dies and never returns.
usage(undef) if $opt{h};
$opt{d} or usage(undef, '-d option is required');
-d $opt{d} or usage(undef, "directory $opt{d} not found");

# -H / -D are optional; substitute empty strings in the log line so that an
# unset option does not raise "uninitialized value" warnings.
my ($dbhost_label, $dbname_label) = map { defined $_ ? $_ : '' } @opt{qw(H D)};
print STDERR "Connecting to database... $dbhost_label $dbname_label\n";

my $dbh;

if (!$opt{p}) {
    # No password on the command line: per the usage text this path prompts.
    $dbh = CXGN::DB::InsertDBH->new( { dbhost => $opt{H}, dbname => $opt{D} });
}
else {
    # NOTE(review): CXGN::DB::Connection is not loaded explicitly in this
    # file; presumably CXGN::DB::InsertDBH pulls it in -- confirm.
    $dbh = CXGN::DB::Connection->new( { dbhost => $opt{H}, dbname => $opt{D}, dbpass => $opt{p}, dbuser => $opt{U} });
}

print STDERR "Creating schema object...\n";
my $sgn_schema = SGN::Schema->connect( sub{ $dbh->get_actual_dbh() });

# Construct the Blast object with the blast db path given with -d.  The
# object itself is not used again below; the constructor call is kept in
# case it has side effects.
my $bdbs = CXGN::Blast->new( sgn_schema => $sgn_schema, dbpath => $opt{d} );

# With -f only the database with that file_base is considered; otherwise
# every database known to the schema is.
my @dbs = $opt{f} ? CXGN::Blast->search( $sgn_schema, $opt{d}, file_base => $opt{f} )
                  : CXGN::Blast->retrieve_all($sgn_schema, $opt{d});
unless(@dbs) {
    print $opt{f} ? "No database found with file_base='$opt{f}'.\n"
                  : "No dbs found in database.\n";
}
# Number of databases formatted successfully, and the messages for the
# ones that failed.  $count starts at 0 so the summary line is well defined
# even when nothing needed an update.
my $count = 0;
my @errs;

foreach my $db (@dbs) {

    print STDERR "Processing database ".$db->title(). "\n". $db->file_base . "\n" ;

    # Skip databases that are current, unless an update is being forced
    # with -f (single db) or -a (all dbs).
    unless($opt{f} || $db->needs_update || $opt{a}) {
        print $db->file_base." is up to date.\n";
        next;
    }

    print STDERR "checking source url..\n";

    # source_url is no longer used: the source FASTA file for each database
    # is expected to already be present under the blast base directory
    # given with -d.  Join with File::Spec so the path is correct whether
    # or not -d was given with a trailing slash.
    my $file_path = File::Spec->catfile( $opt{d}, $db->file_base );

    if( $opt{x} ) {
        # dry run: report only
        print "Would update ".$db->file_base." from file ".$file_path."\n";
        next;
    }
    print "Updating ".$db->file_base." from file...\n";

    # The eval block returns 1 on success, so a failure is detected from
    # its return value rather than from the global $EVAL_ERROR alone.
    my $formatted_ok = eval {

        print STDERR "Checking permissions...\n";

        # check whether we have permissions to do the format
        if( my $perm_error = $db->check_format_permissions() ) {
            die "Cannot format ".$db->file_base.":\n$perm_error";
        }

        print STDERR "Reading source file (".$file_path.")...\n";

        # format it into the correct place
        print STDERR "Formatting database...";
        $db->format_from_file($file_path);

        print $db->file_base." done.\n";
        1;
    };

    unless( $formatted_ok ) {
        my $failure = $EVAL_ERROR || "unknown error\n";
        print STDERR "Update failed for ".$db->file_base.":\n$failure";
        push(@errs , "Update failed for ".$db->file_base.":\n$failure\n");
        next;
    }

    # Only successful formats are reported as "Updated" in the summary.
    $count++;
}

print STDERR "Updated $count blast dbs\n";
print STDERR join(", ", @errs);

$dbh->disconnect();