add more STDERR output for merge function.
[cxgn-corelibs.git] / lib / CXGN / BlastDB.pm
blobe0ef4165d6f4cdaa1219a2bfba7947cdf9132e3f
1 package CXGN::BlastDB;
3 =head1 NAME
5 CXGN::BlastDB - a BLAST database that we keep in stock and updated.
7 =head1 SYNOPSIS
9 ### SIMPLE MECHANICS
11 my $db = CXGN::BlastDB->from_id(10); #get by ID
13 my @dbs = CXGN::BlastDB->retrieve_all(); #get all blastDB objects
15 #search ilike by title
16 my @dbs = CXGN::BlastDB->search_ilike( title => '%solanum%' );
18 #change the title of a blast db in memory
19 $dbs[0]->title( 'Sequences from Tomatoes' );
21 #write the changes to the DB object to the database
22 $dbs[0]->update;
24 #do a blast against this database
25 CXGN::Tools::Run->run( 'blastall',
26 -m => 8,
27 -i => 'myseqs.seq',
28 -d => $dbs[0]->full_file_basename,
29 -p => 'blastn',
30 -e => '1e-10',
31 -o => 'myreport.m8.blast',
34 ### NIFTY THINGS
36 #get a handle on our managed copy of the NCBI UniVec database
37 my ($uv) = CXGN::BlastDB->search_ilike( title => '%univec%');
39 #set the path to our blast databases repository
40 CXGN::BlastDB->dbpath('/data/shared/blast/databases');
42 #does it need to be updated?
43 print "univec needs updating\n" if $uv->needs_update;
45 #list the files that are part of our univec DB
46 my @uv_files = $uv->list_files;
47 #returns ( '/data/shared/blast/databases/screening/vector/UniVec.nin',
48 # '/data/shared/blast/databases/screening/vector/UniVec.nhr',
49 # '/data/shared/blast/databases/screening/vector/UniVec.nsq',
50 # )
52 #how many sequences does it have in it?
53 print "this copy of univec has ".$uv->sequence_count." sequences in it\n";
55 #we've got an updated copy of univec here, let's format it and install it
56 #in place
57 $uv->format_from_file('new_univec.seq');
59 #i'll plop another formatted copy of univec in my home dir too
60 CXGN::BlastDB->dbpath('/home/rob/blast');
61 $uv->format_from_file('new_univec.seq');
62 #that will have done a mkpath -p /home/rob/blast/screening/vector,
63 #then it will have put the Univec.nin, nhr, and nsq files there.
65 =head1 DESCRIPTION
67 This is a handle on a BLAST database we manage. Each of
68 these objects corresponds to a row in the sgn.blast_db table and
69 a set of files in the filesystem. at a place specified by this class's
70 dbpath() data member (see dbpath docs below). This path defaults to
71 the value of the 'blast_db_path' configuration variable (see L<CXGN::Config>).
73 =head1 METHODS
75 =cut
77 use strict;
78 use Carp;
79 use File::Spec;
80 use File::Basename;
81 use File::Copy;
82 use File::Path;
83 use POSIX;
85 use List::MoreUtils qw/ uniq /;
87 use Memoize;
89 use CXGN::BlastDB::Config;
90 use CXGN::Tools::List qw/any min all max/;
91 use CXGN::Tools::Run;
93 use Bio::BLAST::Database;
95 use base qw/CXGN::CDBI::Class::DBI Class::Data::Inheritable/;
96 __PACKAGE__->table('blast_db');
98 #define our database columns with class::dbi
99 our @primary_key_names = ('blast_db_id');
101 our @column_names =
102 ('blast_db_id', #-database serial number
103 'file_base', #-basename of the database files, with a path prepended,
104 # e.g. 'genbank/nr'
105 'title', #-title of the database, e.g. 'NCBI Non-redundant proteins'
106 'type', #-type, either 'protein' or 'nucleotide'
107 'source_url', #-the URL new copies of this database can be fetched from
108 'lookup_url', #-printf-style format string that can be used to generate
109 # a URL where a user can get more info on a sequence in
110 # this blast db
111 'info_url', #-URL that gives information about the contents of this DB
112 'update_freq', #-frequency of updating this blast database
113 'index_seqs', #- corresponds to formatdb's -o option. Set true if formatdb
114 # should be given a '-o T'. This is used if you later want to
115 # fetch specific sequences out of this blast db
116 'web_interface_visible', #whether the blast web interface should display this DB
117 'blast_db_group_id', #ID of the blast DB group this db belondgs to,
118 #if any. used for displaying them in the web
119 #interface
120 'description', # text description of the database, display on the database details page
122 __PACKAGE__->columns( Primary => @primary_key_names, );
123 __PACKAGE__->columns( All => @column_names, );
124 __PACKAGE__->columns( Essential => @column_names, );
125 __PACKAGE__->sequence( 'blast_db_blast_db_id_seq' );
127 __PACKAGE__->has_a( blast_db_group_id => 'CXGN::BlastDB::Group');
129 =head2 from_id
131 Usage: my $bdb = CXGN::BlastDB->from_id(12);
132 Desc : retrieve a BlastDB object using its ID number
133 Ret : a BlastDB object for that id number, or undef if none found
134 Args : the id number of the object to retrieve
135 Side Effects: accesses the database
137 =cut
139 sub from_id { shift->retrieve(@_); }
141 #only document title and type for external users of this module,
142 #some nicer wrappers will be provided for using the other information
144 =head2 title
146 Usage: $db->title
147 Desc : get/set the title of this blast DB object
148 Ret : string containing the title, e.g. 'NCBI Non-redundant proteins'
149 Args : optional new value for the title
151 Note: must run $db->update for changes to be written to the db, unless you've
152 set $db->autoupdate.
154 =head2 type
156 Usage: $db->type
157 Desc : get the type of this blast db, whether it holds
158 proteins or nucleotides
159 Ret : 'protein' or 'nucleotide'
160 Args : optional new value for the type, either 'protein' or 'nucleotide'
162 Note: must run $db->update for changes to be written to the db, unless you've
163 set $db->autoupdate.
165 =head2 file_base
167 Usage: $db->file_base;
168 Desc : get/set the basename and path relative to 'blast_db_path' config var
169 Ret : the path and basename, e.g. 'genbank/nr' or 'screening/organelle/ATH_mitochondria'
170 Args : (optional) new string containing subpath and basename
171 Side Effects: none
173 Note: must run $db->update for changes to be written to the db, unless you've
174 set $db->autoupdate.
176 =cut
178 =head2 genomic_libraries_annotated
180 Desc: get the L<CXGN::Genomic::Library> objects that are slated as using this
181 blast database for annotation
182 Args: none
183 Ret : array of L<CXGN::Genomic::Library> objects
185 =cut
187 __PACKAGE__->has_many( genomic_libraries_annotated =>
188 [ 'CXGN::Genomic::LibraryAnnotationDB' => 'library_id' ],
191 =head2 file_modtime
193 Desc: get the earliest unix modification time of the database files
194 Args: none
195 Ret : unix modification time of the database files, or nothing if does not exist
196 Side Effects:
197 Example:
199 =cut
201 sub file_modtime {
202 my $self = shift;
203 return unless $self->_fileset;
204 return $self->_fileset->file_modtime;
207 =head2 format_time
209 Usage: my $time = $db->format_time;
210 Desc : get the format time of these db files
211 Ret : the value time() would have returned when
212 this database was last formatted, or undef
213 if that could not be determined (like if the
214 files aren't there)
215 Args : none
216 Side Effects: runs 'fastacmd' to extract the formatting
217 time from the database files
219 NOTE: This function assumes that the computer that
220 last formatted this database had the same time zone
221 set as the computer we are running on.
223 =cut
225 sub format_time {
226 my ($self) = @_;
227 return unless $self->_fileset;
228 return $self->_fileset->format_time;
231 =head2 full_file_basename
233 Desc:
234 Args: none
235 Ret : full path to the blast database file basename,
236 Side Effects: none
237 Example:
239 my $basename = $db->full_file_basename;
240 #returns '/data/shared/blast/databases/genbank/nr'
242 =cut
244 sub full_file_basename {
245 my $this = shift;
246 my $class = ref $this;
248 return scalar File::Spec->catfile( $class->dbpath,
249 $this->file_base,
254 =head2 list_files
256 Usage: my @files = $db->list_files;
257 Desc : get the list of files that belong to this blast database
258 Ret : list of full paths to all files belonging to this blast database,
259 Args : none
260 Side Effects: looks in the filesystem
262 =cut
264 sub list_files {
265 my $self = shift;
266 return unless $self->_fileset;
267 $self->_fileset->list_files();
270 =head2 files_are_complete
272 Usage: print "complete!" if $db->files_are_complete;
273 Desc : tell whether this blast db has a complete set of files on disk
274 Ret : true if the set of files on disk looks complete,
275 false if not
276 Args : none
277 Side Effects: lists files on disk
279 =cut
281 sub files_are_complete {
282 my ($self) = @_;
283 return unless $self->_fileset;
284 return $self->_fileset->files_are_complete;
287 =head2 is_split
289 Usage: print "that thing is split, yo" if $db->is_split;
290 Desc : determine whether this database is in multiple parts
291 Ret : true if this database has been split into multiple
292 files by formatdb (e.g. nr.00.pin, nr.01.pin, etc.)
293 Args : none
294 Side Effects: looks in filesystem
296 =cut
298 sub is_split {
299 my ($self) = @_;
300 return unless $self->_fileset;
301 return $self->_fileset->is_split;
304 =head2 is_indexed
306 Usage: $bdb->is_indexed
307 Desc : checks whether this blast db is indexed on disk to support
308 individual sequence retrieval. note that this is different
309 from index_seqs(), which is the flag of whether this db
310 _should_ be indexed.
311 Args : none
312 Ret : false if not on disk or not indexed, true if indexed
314 =cut
316 sub is_indexed {
317 my ( $self ) = @_;
318 return unless $self->_fileset;
319 return $self->_fileset->files_are_complete && $self->_fileset->indexed_seqs;
323 =head2 sequences_count
325 Desc: get the number of sequences in this blast database
326 Args: none
327 Ret : number of distinct sequences in this blast database, or undef
328 if it could not be determined due to some error or other
329 Side Effects: runs 'fastacmd' to get stats on the blast database file
331 =cut
333 sub sequences_count {
334 my $self = shift;
335 return unless $self->_fileset;
336 return $self->_fileset->sequences_count;
339 =head2 is_contaminant_for
341 This method doesn't work yet.
343 Usage: my $is_contam = $bdb->is_contaminant_for($lib);
344 Desc : return whether this BlastDB contains sequences
345 from something that would be considered a contaminant
346 in the given CXGN::Genomic::Library
347 Ret : 1 or 0
348 Args : a CXGN::Genomic::Library object
350 =cut
352 __PACKAGE__->has_many( _lib_annots => 'CXGN::Genomic::LibraryAnnotationDB' );
353 sub is_contaminant_for {
354 my ($this,$lib) = @_;
356 #return true if any arguments are true
357 return any( map { $_->is_contaminant && $_->library_id == $lib } $this->_lib_annots);
360 =head2 needs_update
362 Usage: print "you should update ".$db->title if $db->needs_update;
363 Desc : check whether this blast DB needs to be updated
364 Ret : true if this database's files need an update or are missing,
365 false otherwise
366 Args : none
367 Side Effects: runs format_time(), which runs `fastacmd`
369 =cut
371 sub needs_update {
372 my ($self) = @_;
374 #it of course needs an update if it is not complete
375 return 1 unless $self->files_are_complete;
377 my $modtime = $self->format_time();
379 #if no modtime, files must not even be there
380 return 1 unless $modtime;
382 #manually updated DBs never _need_ updates if their
383 #files are there
384 return 0 if $self->update_freq eq 'manual';
386 #also need update if it is set to be indexed but is not indexed
387 return 1 if $self->index_seqs && ! $self->is_indexed;
389 #figure out the maximum number of seconds we'll tolerate
390 #the files being out of date
391 my $max_time_offset = 60 * 60 * 24 * do { #figure out number of days
392 if( $self->update_freq eq 'daily' ) { 1 }
393 elsif( $self->update_freq eq 'weekly' ) { 7 }
394 elsif( $self->update_freq eq 'monthly' ) { 31 }
395 else {
396 confess "invalid update_freq ".$self->update_freq;
400 #subtract from modtime and make a decision
401 return time-$modtime > $max_time_offset ? 1 : 0;
405 =head2 check_format_permissions
407 Usage: $bdb->check_format_from_file() or die "cannot format!\n";
408 Desc : check directory existence and file permissions to see if a
409 format_from_file() is likely to succeed. This is useful,
410 for example, when you have a script that downloads some
411 remote database and you'd like to check first whether
412 we even have permissions to format before you take the
413 time to download something.
414 Args : none
415 Ret : nothing if everything looks good,
416 otherwise a string error message summarizing the reason
417 for failure
418 Side Effects: reads from filesystem, may stat some files
420 =cut
422 sub check_format_permissions {
423 my ($self,$ffbn) = @_;
424 croak "ffbn arg is no longer supported, maybe you should just use a new Bio::BLAST::Database object" if $ffbn;
425 return unless $self->_fileset('write');
426 return $self->_fileset('write')->check_format_permissions;
429 =head2 format_from_file
431 Usage: $db->format_from_file('mysequences.seq');
432 Desc : format this blast database from the given source file,
433 into its proper place on disk, overwriting the files already
434 present
435 Ret : nothing meaningful
436 Args : filename containing sequences,
437 Side Effects: runs 'formatdb' to format the given sequences,
438 dies on failure
440 =cut
442 sub format_from_file {
443 my ($self,$seqfile,$ffbn) = @_;
444 $ffbn and croak "ffbn arg no longer supported. maybe you should make a new Bio::BLAST::Database object";
446 $self->_fileset('write')
447 ->format_from_file( seqfile => $seqfile, indexed_seqs => $self->index_seqs, title => $self->title );
450 =head2 to_fasta
452 Usage: my $fasta_fh = $bdb->to_fasta;
453 Desc : get the contents of this blast database in FASTA format
454 Ret : an IO::Pipe filehandle, or nothing if it could not be opened
455 Args : none
456 Side Effects: runs 'fastacmd' in a forked process, cleaning up its output,
457 and passing it to you
459 =cut
461 sub to_fasta {
462 my ($self) = @_;
463 return unless $self->_fileset;
464 return $self->_fileset->to_fasta;
467 =head2 get_sequence
469 Usage: my $seq = $bdb->get_sequence('LE_HBa0001A02');
470 Desc : get a particular sequence from this db
471 Args : sequence name to retrieve
472 Ret : Bio::PrimarySeqI object, or nothing if not found or
473 if db does not exist
474 Side Effects: dies on error, like if this db is not indexed
476 =cut
478 sub get_sequence {
479 my ($self, $seqname) = @_;
480 return unless $self->_fileset;
481 return $self->_fileset->get_sequence($seqname);
484 =head2 dbpath
486 Usage: CXGN::BlastDB->dbpath('/data/cluster/blast/databases');
487 Desc : class method to get/set the location where all blast database
488 files are expected to be found. Defaults to the value of the
489 CXGN configuration variable 'blast_db_path'.
490 Ret : the current base path
491 Args : (optional) new base path
492 Side Effects: gets/sets a piece of CLASS-WIDE data
494 =cut
496 #mk_classdata is from Class::Data::Inheritable. good little module,
497 #you should look at it
498 __PACKAGE__->mk_classdata( dbpath => CXGN::BlastDB::Config->load->{'blast_db_path'} );
500 =head2 identifier_url
502 Usage: my $url = $db->identifier_url('some ident from this bdb');
503 Desc : get a URL to look up more information on this identifier.
504 first tries to make a URL using the lookup_url column in the
505 sgn.blast_db table, then tries to use identifier_url() from
506 L<CXGN::Tools::Identifiers>
507 Args : the identifier to lookup, assumed
508 to be from this blast db
509 Ret : a URL, or undef if none could be found
510 Side Effects: Example:
512 =cut
514 sub identifier_url {
515 my ($self,$ident) = @_;
516 $ident or croak 'must pass an identifier to link';
518 return $self->lookup_url
519 ? sprintf($self->lookup_url,$ident)
520 : do { require CXGN::Tools::Identifiers; CXGN::Tools::Identifiers::identifier_url($ident) };
523 # accessor that holds our encapsulated Bio::BLAST::Database
524 memoize '_fileset',
525 NORMALIZER => sub { #< need to take the full_file_basename (really the dbpath) into account for the memoization
526 my $s = shift; join ',',$s,@_,$s->full_file_basename
528 sub _fileset {
529 my ($self,$write) = @_;
530 my $ffbn = $self->full_file_basename;
531 return Bio::BLAST::Database->open( full_file_basename => $ffbn,
532 type => $self->type,
533 ($write ? ( write => 1,
534 create_dirs => 1,
536 : (),
541 =head1 MAINTAINER
543 Robert Buels
545 =head1 AUTHOR
547 Robert Buels, E<lt>rmb32@cornell.eduE<gt>
549 =head1 COPYRIGHT & LICENSE
551 Copyright 2009 Boyce Thompson Institute for Plant Research
553 This program is free software; you can redistribute it and/or modify
554 it under the same terms as Perl itself.
556 =cut
558 package CXGN::BlastDB::Group;
559 use strict;
560 use English;
561 use Carp;
563 use base qw/CXGN::CDBI::Class::DBI Class::Data::Inheritable/;
564 __PACKAGE__->table(__PACKAGE__->qualify_schema('sgn') . '.blast_db_group');
566 #define our database columns with class::dbi
567 our @primary_key_names = ('blast_db_group_id');
569 our @column_names =
570 ('blast_db_group_id',
571 'name',
572 'ordinal'
574 __PACKAGE__->columns( Primary => @primary_key_names, );
575 __PACKAGE__->columns( All => @column_names, );
576 __PACKAGE__->columns( Essential => @column_names, );
577 __PACKAGE__->sequence( __PACKAGE__->base_schema('sgn'). '.blast_db_group_blast_db_group_id_seq' );
579 __PACKAGE__->has_many( blast_dbs => 'CXGN::BlastDB', {order_by => 'title'} );
582 ####
583 1; # do not remove
584 ####