Merge pull request #42 from solgenomics/topic/duplicate_image_warning
[cxgn-corelibs.git] / lib / CXGN / BlastDB.pm
blob490f5dfbf12bbe1e4e23cf30935ca8e393382200
1 package CXGN::BlastDB;
3 =head1 NAME
5 CXGN::BlastDB - a BLAST database that we keep in stock and updated.
7 NOTE!!! This object is deprecated and used only in conjunction with
8 cgi-bin scripts. An equivalent object based on DBIx::Class and Moose
9 can be found in the sgn/ repo, called CXGN::Blast.
11 =head1 SYNOPSIS
13 ### SIMPLE MECHANICS
15 my $db = CXGN::BlastDB->from_id(10); #get by ID
17 my @dbs = CXGN::BlastDB->retrieve_all(); #get all blastDB objects
19 #search ilike by title
20 my @dbs = CXGN::BlastDB->search_ilike( title => '%solanum%' );
22 #change the title of a blast db in memory
23 $dbs[0]->title( 'Sequences from Tomatoes' );
25 #write the changes to the DB object to the database
26 $dbs[0]->update;
28 #do a blast against this database
29 CXGN::Tools::Run->run( 'blastall',
30 -m => 8,
31 -i => 'myseqs.seq',
32 -d => $dbs[0]->full_file_basename,
33 -p => 'blastn',
34 -e => '1e-10',
35 -o => 'myreport.m8.blast',
38 ### NIFTY THINGS
40 #get a handle on our managed copy of the NCBI UniVec database
41 my ($uv) = CXGN::BlastDB->search_ilike( title => '%univec%');
43 #set the path to our blast databases repository
44 CXGN::BlastDB->dbpath('/data/shared/blast/databases');
46 #does it need to be updated?
47 print "univec needs updating\n" if $uv->needs_update;
49 #list the files that are part of our univec DB
50 my @uv_files = $uv->list_files;
51 #returns ( '/data/shared/blast/databases/screening/vector/UniVec.nin',
52 # '/data/shared/blast/databases/screening/vector/UniVec.nhr',
53 # '/data/shared/blast/databases/screening/vector/UniVec.nsq',
54 # )
56 #how many sequences does it have in it?
57 print "this copy of univec has ".$uv->sequences_count." sequences in it\n";
59 #we've got an updated copy of univec here, let's format it and install it
60 #in place
61 $uv->format_from_file('new_univec.seq');
63 #i'll plop another formatted copy of univec in my home dir too
64 CXGN::BlastDB->dbpath('/home/rob/blast');
65 $uv->format_from_file('new_univec.seq');
66 #that will have done a mkpath -p /home/rob/blast/screening/vector,
67 #then it will have put the Univec.nin, nhr, and nsq files there.
69 =head1 DESCRIPTION
71 This is a handle on a BLAST database we manage. Each of
72 these objects corresponds to a row in the sgn.blast_db table and
73 a set of files in the filesystem, at a place specified by this class's
74 dbpath() data member (see dbpath docs below). This path defaults to
75 the value of the 'blast_db_path' configuration variable (see L<CXGN::Config>).
77 =head1 METHODS
79 =cut
81 use strict;
82 use Carp;
83 use File::Spec;
84 use File::Basename;
85 use File::Copy;
86 use File::Path;
87 use POSIX;
89 use List::MoreUtils qw/ uniq /;
91 use Memoize;
93 use CXGN::BlastDB::Config;
94 use CXGN::Tools::List qw/any min all max/;
95 use CXGN::Tools::Run;
97 use Bio::BLAST::Database;
99 use base qw/CXGN::CDBI::Class::DBI Class::Data::Inheritable/;
# Bind this class to its table and declare its columns for Class::DBI.
__PACKAGE__->table('blast_db');

our @primary_key_names = qw( blast_db_id );

our @column_names = (
    # database serial number
    'blast_db_id',

    # basename of the database files, with a path prepended,
    # e.g. 'genbank/nr'
    'file_base',

    # title of the database, e.g. 'NCBI Non-redundant proteins'
    'title',

    # type, either 'protein' or 'nucleotide'
    'type',

    # the URL new copies of this database can be fetched from
    'source_url',

    # printf-style format string that can be used to generate a URL where
    # a user can get more info on a sequence in this blast db
    'lookup_url',

    # URL that gives information about the contents of this DB
    'info_url',

    # frequency of updating this blast database
    'update_freq',

    # corresponds to formatdb's -o option: set true if formatdb should be
    # given a '-o T', which is needed to later fetch specific sequences
    # out of this blast db
    'index_seqs',

    # whether the blast web interface should display this DB
    'web_interface_visible',

    # ID of the blast DB group this db belongs to, if any; used for
    # displaying them in the web interface
    'blast_db_group_id',

    # text description of the database, displayed on the database
    # details page
    'description',
);

__PACKAGE__->columns( Primary   => @primary_key_names );
__PACKAGE__->columns( All       => @column_names );
__PACKAGE__->columns( Essential => @column_names );
__PACKAGE__->sequence('blast_db_blast_db_id_seq');

__PACKAGE__->has_a( blast_db_group_id => 'CXGN::BlastDB::Group' );
133 =head2 from_id
135 Usage: my $bdb = CXGN::BlastDB->from_id(12);
136 Desc : retrieve a BlastDB object using its ID number
137 Ret : a BlastDB object for that id number, or undef if none found
138 Args : the id number of the object to retrieve
139 Side Effects: accesses the database
141 =cut
# Forward all arguments to retrieve() on the invocant and hand back
# whatever it returns.
sub from_id {
    my ( $class, @args ) = @_;
    return $class->retrieve(@args);
}
145 #only document title and type for external users of this module,
146 #some nicer wrappers will be provided for using the other information
148 =head2 title
150 Usage: $db->title
151 Desc : get/set the title of this blast DB object
152 Ret : string containing the title, e.g. 'NCBI Non-redundant proteins'
153 Args : optional new value for the title
155 Note: must run $db->update for changes to be written to the db, unless you've
156 set $db->autoupdate.
158 =head2 type
160 Usage: $db->type
161 Desc : get/set the type of this blast db, whether it holds
162 proteins or nucleotides
163 Ret : 'protein' or 'nucleotide'
164 Args : optional new value for the type, either 'protein' or 'nucleotide'
166 Note: must run $db->update for changes to be written to the db, unless you've
167 set $db->autoupdate.
169 =head2 file_base
171 Usage: $db->file_base;
172 Desc : get/set the basename and path relative to 'blast_db_path' config var
173 Ret : the path and basename, e.g. 'genbank/nr' or 'screening/organelle/ATH_mitochondria'
174 Args : (optional) new string containing subpath and basename
175 Side Effects: none
177 Note: must run $db->update for changes to be written to the db, unless you've
178 set $db->autoupdate.
180 =cut
182 =head2 genomic_libraries_annotated
184 Desc: get the L<CXGN::Genomic::Library> objects that are slated as using this
185 blast database for annotation
186 Args: none
187 Ret : array of L<CXGN::Genomic::Library> objects
189 =cut
# Class::DBI has_many relationship; the mapping goes through the
# CXGN::Genomic::LibraryAnnotationDB class via its library_id accessor.
__PACKAGE__->has_many(
    genomic_libraries_annotated =>
        [ 'CXGN::Genomic::LibraryAnnotationDB' => 'library_id' ],
);
195 =head2 file_modtime
197 Desc: get the earliest unix modification time of the database files
198 Args: none
199 Ret : unix modification time of the database files, or nothing if does not exist
200 Side Effects:
201 Example:
203 =cut
# Delegate to the fileset's file_modtime(); returns nothing when no
# fileset is available.
sub file_modtime {
    my ($self) = @_;
    my $fileset = $self->_fileset
        or return;
    return $fileset->file_modtime;
}
211 =head2 format_time
213 Usage: my $time = $db->format_time;
214 Desc : get the format time of these db files
215 Ret : the value time() would have returned when
216 this database was last formatted, or undef
217 if that could not be determined (like if the
218 files aren't there)
219 Args : none
220 Side Effects: runs 'fastacmd' to extract the formatting
221 time from the database files
223 NOTE: This function assumes that the computer that
224 last formatted this database had the same time zone
225 set as the computer we are running on.
227 =cut
# Delegate to the fileset's format_time(); returns nothing when no
# fileset is available.
sub format_time {
    my $self    = shift;
    my $fileset = $self->_fileset;
    return unless $fileset;
    return $fileset->format_time;
}
235 =head2 full_file_basename
237 Desc:
238 Args: none
239 Ret : full path to the blast database file basename,
240 Side Effects: none
241 Example:
243 my $basename = $db->full_file_basename;
244 #returns '/data/shared/blast/databases/genbank/nr'
246 =cut
# Join the class-wide dbpath with this object's file_base.  The class is
# taken from ref($this), so this must be called on an instance.
sub full_file_basename {
    my ($this) = @_;
    my $class = ref $this;

    my @path_parts = ( $class->dbpath, $this->file_base );
    return scalar File::Spec->catfile(@path_parts);
}
258 =head2 list_files
260 Usage: my @files = $db->list_files;
261 Desc : get the list of files that belong to this blast database
262 Ret : list of full paths to all files belonging to this blast database,
263 Args : none
264 Side Effects: looks in the filesystem
266 =cut
# Delegate to the fileset's list_files(); returns nothing when no
# fileset is available.
sub list_files {
    my ($self) = @_;
    my $fileset = $self->_fileset
        or return;
    return $fileset->list_files();
}
274 =head2 files_are_complete
276 Usage: print "complete!" if $db->files_are_complete;
277 Desc : tell whether this blast db has a complete set of files on disk
278 Ret : true if the set of files on disk looks complete,
279 false if not
280 Args : none
281 Side Effects: lists files on disk
283 =cut
# Delegate to the fileset's files_are_complete(); returns nothing
# (false) when no fileset is available.
sub files_are_complete {
    my $self    = shift;
    my $fileset = $self->_fileset;
    return unless $fileset;
    return $fileset->files_are_complete;
}
291 =head2 is_split
293 Usage: print "that thing is split, yo" if $db->is_split;
294 Desc : determine whether this database is in multiple parts
295 Ret : true if this database has been split into multiple
296 files by formatdb (e.g. nr.00.pin, nr.01.pin, etc.)
297 Args : none
298 Side Effects: looks in filesystem
300 =cut
# Delegate to the fileset's is_split(); returns nothing (false) when no
# fileset is available.
sub is_split {
    my $self = shift;
    my $fileset = $self->_fileset
        or return;
    return $fileset->is_split;
}
308 =head2 is_indexed
310 Usage: $bdb->is_indexed
311 Desc : checks whether this blast db is indexed on disk to support
312 individual sequence retrieval. note that this is different
313 from index_seqs(), which is the flag of whether this db
314 _should_ be indexed.
315 Args : none
316 Ret : false if not on disk or not indexed, true if indexed
318 =cut
# True only when the fileset reports both a complete set of files and
# indexed sequences; returns nothing (false) when no fileset is available.
sub is_indexed {
    my $self    = shift;
    my $fileset = $self->_fileset;
    return unless $fileset;
    return $fileset->files_are_complete && $fileset->indexed_seqs;
}
327 =head2 sequences_count
329 Desc: get the number of sequences in this blast database
330 Args: none
331 Ret : number of distinct sequences in this blast database, or undef
332 if it could not be determined due to some error or other
333 Side Effects: runs 'fastacmd' to get stats on the blast database file
335 =cut
# Delegate to the fileset's sequences_count(); returns nothing when no
# fileset is available.
sub sequences_count {
    my ($self) = @_;
    my $fileset = $self->_fileset
        or return;
    return $fileset->sequences_count;
}
343 =head2 is_contaminant_for
345 This method doesn't work yet.
347 Usage: my $is_contam = $bdb->is_contaminant_for($lib);
348 Desc : return whether this BlastDB contains sequences
349 from something that would be considered a contaminant
350 in the given CXGN::Genomic::Library
351 Ret : 1 or 0
352 Args : a CXGN::Genomic::Library object
354 =cut
# Private has_many relationship used by is_contaminant_for() below.
__PACKAGE__->has_many( _lib_annots => 'CXGN::Genomic::LibraryAnnotationDB' );

# Returns the result of any() over one flag per annotation row: the row
# is flagged when it is a contaminant and its library_id equals $lib.
# NOTE(review): $lib is compared with numeric ==, while the POD describes
# the argument as a library object -- confirm what callers should pass.
sub is_contaminant_for {
    my ( $this, $lib ) = @_;

    my @annotations = $this->_lib_annots;

    #return true if any arguments are true
    return any(
        map { $_->is_contaminant && $_->library_id == $lib } @annotations
    );
}
364 =head2 needs_update
366 Usage: print "you should update ".$db->title if $db->needs_update;
367 Desc : check whether this blast DB needs to be updated
368 Ret : true if this database's files need an update or are missing,
369 false otherwise
370 Args : none
371 Side Effects: runs format_time(), which runs `fastacmd`
373 =cut
# Decide whether this database's files are missing, unindexed when they
# should be indexed, or older than the update_freq setting allows.
# Returns 1 or 0; confesses on an unrecognized update_freq.
sub needs_update {
    my $self = shift;

    # an incomplete set of files always needs an update
    $self->files_are_complete
        or return 1;

    # no format time means the files must not even be there
    my $formatted_at = $self->format_time();
    $formatted_at
        or return 1;

    # manually updated DBs never _need_ updates if their files are there
    if ( $self->update_freq eq 'manual' ) {
        return 0;
    }

    # set to be indexed, but not indexed on disk
    if ( $self->index_seqs && !$self->is_indexed ) {
        return 1;
    }

    # number of days we tolerate the files being out of date
    my %days_for = (
        daily   => 1,
        weekly  => 7,
        monthly => 31,
    );
    my $freq = $self->update_freq;
    exists $days_for{$freq}
        or confess "invalid update_freq ".$freq;

    my $max_age_seconds = 60 * 60 * 24 * $days_for{$freq};
    my $age_seconds     = time - $formatted_at;

    return $age_seconds > $max_age_seconds ? 1 : 0;
}
409 =head2 check_format_permissions
411 Usage: my $err = $bdb->check_format_permissions; die "cannot format: $err\n" if $err;
412 Desc : check directory existence and file permissions to see if a
413 format_from_file() is likely to succeed. This is useful,
414 for example, when you have a script that downloads some
415 remote database and you'd like to check first whether
416 we even have permissions to format before you take the
417 time to download something.
418 Args : none
419 Ret : nothing if everything looks good,
420 otherwise a string error message summarizing the reason
421 for failure
422 Side Effects: reads from filesystem, may stat some files
424 =cut
# Ask the write-mode fileset whether formatting is permitted and return
# its answer.  Croaks if the unsupported second argument is passed;
# returns nothing when no write-mode fileset is available.
sub check_format_permissions {
    my ( $self, $ffbn ) = @_;
    if ($ffbn) {
        croak "ffbn arg is no longer supported, maybe you should just use a new Bio::BLAST::Database object";
    }

    my $writable = $self->_fileset('write')
        or return;
    return $writable->check_format_permissions;
}
433 =head2 format_from_file
435 Usage: $db->format_from_file('mysequences.seq');
436 Desc : format this blast database from the given source file,
437 into its proper place on disk, overwriting the files already
438 present
439 Ret : nothing meaningful
440 Args : filename containing sequences,
441 Side Effects: runs 'formatdb' to format the given sequences,
442 dies on failure
444 =cut
# Format this database from $seqfile via the write-mode fileset, passing
# along this object's index_seqs flag and title.  Croaks if the
# unsupported third argument is passed.
sub format_from_file {
    my ( $self, $seqfile, $ffbn ) = @_;
    croak "ffbn arg no longer supported. maybe you should make a new Bio::BLAST::Database object"
        if $ffbn;

    my $writable = $self->_fileset('write');
    return $writable->format_from_file(
        seqfile      => $seqfile,
        indexed_seqs => $self->index_seqs,
        title        => $self->title,
    );
}
454 =head2 to_fasta
456 Usage: my $fasta_fh = $bdb->to_fasta;
457 Desc : get the contents of this blast database in FASTA format
458 Ret : an IO::Pipe filehandle, or nothing if it could not be opened
459 Args : none
460 Side Effects: runs 'fastacmd' in a forked process, cleaning up its output,
461 and passing it to you
463 =cut
# Delegate to the fileset's to_fasta(); returns nothing when no fileset
# is available.
sub to_fasta {
    my $self    = shift;
    my $fileset = $self->_fileset;
    return unless $fileset;
    return $fileset->to_fasta;
}
471 =head2 get_sequence
473 Usage: my $seq = $bdb->get_sequence('LE_HBa0001A02');
474 Desc : get a particular sequence from this db
475 Args : sequence name to retrieve
476 Ret : Bio::PrimarySeqI object, or nothing if not found or
477 if db does not exist
478 Side Effects: dies on error, like if this db is not indexed
480 =cut
# Delegate to the fileset's get_sequence() with the given sequence name;
# returns nothing when no fileset is available.
sub get_sequence {
    my ( $self, $seqname ) = @_;
    my $fileset = $self->_fileset
        or return;
    return $fileset->get_sequence($seqname);
}
488 =head2 dbpath
490 Usage: CXGN::BlastDB->dbpath('/data/cluster/blast/databases');
491 Desc : class method to get/set the location where all blast database
492 files are expected to be found. Defaults to the value of the
493 CXGN configuration variable 'blast_db_path'.
494 Ret : the current base path
495 Args : (optional) new base path
496 Side Effects: gets/sets a piece of CLASS-WIDE data
498 =cut
# mk_classdata comes from Class::Data::Inheritable.  The initial value of
# dbpath is the 'blast_db_path' entry of the loaded CXGN::BlastDB::Config.
__PACKAGE__->mk_classdata(
    dbpath => CXGN::BlastDB::Config->load->{blast_db_path},
);
504 =head2 identifier_url
506 Usage: my $url = $db->identifier_url('some ident from this bdb');
507 Desc : get a URL to look up more information on this identifier.
508 first tries to make a URL using the lookup_url column in the
509 sgn.blast_db table, then tries to use identifier_url() from
510 L<CXGN::Tools::Identifiers>
511 Args : the identifier to lookup, assumed
512 to be from this blast db
513 Ret : a URL, or undef if none could be found
514 Side Effects: Example:
516 =cut
# Build a URL for looking up $ident.
#
# Args : the identifier to link (required; must be defined and non-empty)
# Ret  : sprintf() of this object's lookup_url format with the identifier
#        when lookup_url is set; otherwise whatever
#        CXGN::Tools::Identifiers::identifier_url() returns for it
# Dies : croaks when no identifier is passed
sub identifier_url {
    my ( $self, $ident ) = @_;

    # Test for defined-and-non-empty rather than plain truth, so that an
    # identifier of "0" is accepted instead of being treated as missing.
    croak 'must pass an identifier to link'
        unless defined $ident && length $ident;

    my $url_format = $self->lookup_url;
    return sprintf( $url_format, $ident ) if $url_format;

    # Only load the fallback module when it is actually needed.
    require CXGN::Tools::Identifiers;
    return CXGN::Tools::Identifiers::identifier_url($ident);
}
527 # accessor that holds our encapsulated Bio::BLAST::Database
528 memoize '_fileset',
529 NORMALIZER => sub { #< need to take the full_file_basename (really the dbpath) into account for the memoization
530 my $s = shift; join ',',$s,@_,$s->full_file_basename
532 sub _fileset {
533 my ($self,$write) = @_;
534 my $ffbn = $self->full_file_basename;
535 return Bio::BLAST::Database->open( full_file_basename => $ffbn,
536 type => $self->type,
537 ($write ? ( write => 1,
538 create_dirs => 1,
540 : (),
545 =head1 MAINTAINER
547 Robert Buels
549 =head1 AUTHOR
551 Robert Buels, E<lt>rmb32@cornell.eduE<gt>
553 =head1 COPYRIGHT & LICENSE
555 Copyright 2009 Boyce Thompson Institute for Plant Research
557 This program is free software; you can redistribute it and/or modify
558 it under the same terms as Perl itself.
560 =cut
# Class::DBI class for the blast_db_group table in the 'sgn' schema.
# Each group has many CXGN::BlastDB rows, ordered by title.
package CXGN::BlastDB::Group;
use strict;
use English;
use Carp;

use base qw/CXGN::CDBI::Class::DBI Class::Data::Inheritable/;

__PACKAGE__->table(
    __PACKAGE__->qualify_schema('sgn') . '.blast_db_group'
);

# database columns, declared with Class::DBI
our @primary_key_names = qw( blast_db_group_id );
our @column_names      = qw( blast_db_group_id name ordinal );

__PACKAGE__->columns( Primary   => @primary_key_names );
__PACKAGE__->columns( All       => @column_names );
__PACKAGE__->columns( Essential => @column_names );

__PACKAGE__->sequence(
    __PACKAGE__->base_schema('sgn') . '.blast_db_group_blast_db_group_id_seq'
);

__PACKAGE__->has_many(
    blast_dbs => 'CXGN::BlastDB',
    { order_by => 'title' },
);

####
1; # do not remove
####
588 ####