5 CXGN::BlastDB - a BLAST database that we keep in stock and updated.
7 NOTE!!! This object is deprecated and used only in conjunction with
8 cgi-bin scripts. An equivalent object based on DBIx::Class and Moose
9 can be found in the sgn/ repo, called CXGN::Blast.
15 my $db = CXGN::BlastDB->from_id(10); #get by ID
17 my @dbs = CXGN::BlastDB->retrieve_all(); #get all blastDB objects
19 #search ilike by title
20 my @dbs = CXGN::BlastDB->search_ilike( title => '%solanum%' );
22 #change the title of a blast db in memory
23 $dbs[0]->title( 'Sequences from Tomatoes' );
25 #write the changes to the DB object to the database
28 #do a blast against this database
29 CXGN::Tools::Run->run( 'blastall',
32 -d => $dbs[0]->full_file_basename,
35 -o => 'myreport.m8.blast',
40 #get a handle on our managed copy of the NCBI UniVec database
41 my ($uv) = CXGN::BlastDB->search_ilike( title => '%univec%');
43 #set the path to our blast databases repository
44 CXGN::BlastDB->dbpath('/data/shared/blast/databases');
46 #does it need to be updated?
47 print "univec needs updating\n" if $uv->needs_update;
49 #list the files that are part of our univec DB
50 my @uv_files = $uv->list_files;
51 #returns ( '/data/shared/blast/databases/screening/vector/UniVec.nin',
52 # '/data/shared/blast/databases/screening/vector/UniVec.nhr',
53 # '/data/shared/blast/databases/screening/vector/UniVec.nsq',
56 #how many sequences does it have in it?
57 print "this copy of univec has ".$uv->sequences_count." sequences in it\n";
59 #we've got an updated copy of univec here, let's format it and install it
61 $uv->format_from_file('new_univec.seq');
63 #i'll plop another formatted copy of univec in my home dir too
64 CXGN::BlastDB->dbpath('/home/rob/blast');
65 $uv->format_from_file('new_univec.seq');
66 #that will have done a mkpath -p /home/rob/blast/screening/vector,
67 #then it will have put the Univec.nin, nhr, and nsq files there.
71 This is a handle on a BLAST database we manage. Each of
72 these objects corresponds to a row in the sgn.blast_db table and
73 a set of files in the filesystem, at a place specified by this class's
74 dbpath() data member (see dbpath docs below). This path defaults to
75 the value of the 'blast_db_path' configuration variable (see L<CXGN::Config>).
89 use List
::MoreUtils qw
/ uniq /;
93 use CXGN
::BlastDB
::Config
;
94 use CXGN
::Tools
::List qw
/any min all max/;
97 use Bio
::BLAST
::Database
;
99 use base qw
/CXGN::CDBI::Class::DBI Class::Data::Inheritable/;
100 __PACKAGE__
->table('blast_db');
102 #define our database columns with class::dbi
103 our @primary_key_names = ('blast_db_id');
106 ('blast_db_id', #-database serial number
107 'file_base', #-basename of the database files, with a path prepended,
109 'title', #-title of the database, e.g. 'NCBI Non-redundant proteins'
110 'type', #-type, either 'protein' or 'nucleotide'
111 'source_url', #-the URL new copies of this database can be fetched from
112 'lookup_url', #-printf-style format string that can be used to generate
113 # a URL where a user can get more info on a sequence in
115 'info_url', #-URL that gives information about the contents of this DB
116 'update_freq', #-frequency of updating this blast database
117 'index_seqs', #- corresponds to formatdb's -o option. Set true if formatdb
118 # should be given a '-o T'. This is used if you later want to
119 # fetch specific sequences out of this blast db
120 'web_interface_visible', #whether the blast web interface should display this DB
121 'blast_db_group_id', #ID of the blast DB group this db belondgs to,
122 #if any. used for displaying them in the web
124 'description', # text description of the database, display on the database details page
# Register the column groups for the blast_db table.
# 'Primary' holds the single key column named in @primary_key_names.
126 __PACKAGE__
->columns( Primary
=> @primary_key_names, );
# 'All' and 'Essential' are deliberately given the same list
# (@column_names), so every declared column belongs to both groups.
127 __PACKAGE__
->columns( All
=> @column_names, );
128 __PACKAGE__
->columns( Essential
=> @column_names, );
# Name of the database sequence associated with this table
# (presumably the one feeding blast_db_id -- confirm against the schema).
129 __PACKAGE__
->sequence( 'blast_db_blast_db_id_seq' );

# Declare blast_db_group_id as a has_a relationship to
# CXGN::BlastDB::Group (the group class defined later in this file).
131 __PACKAGE__
->has_a( blast_db_group_id
=> 'CXGN::BlastDB::Group');
135 Usage: my $bdb = CXGN::BlastDB->from_id(12);
136 Desc : retrieve a BlastDB object using its ID number
137 Ret : a BlastDB object for that id number, or undef if none found
138 Args : the id number of the object to retrieve
139 Side Effects: accesses the database
# Look up a BlastDB object by its ID.
# Thin alias: every argument is forwarded untouched to the invocant's
# retrieve() method and whatever retrieve() returns is handed back,
# in the caller's context.
sub from_id {
    my ($invocant, @lookup_args) = @_;
    return $invocant->retrieve(@lookup_args);
}
145 #only document title and type for external users of this module,
146 #some nicer wrappers will be provided for using the other information
151 Desc : get/set the title of this blast DB object
152 Ret : string containing the title, e.g. 'NCBI Non-redundant proteins'
153 Args : optional new value for the title
155 Note: must run $db->update for changes to be written to the db, unless you've
161 Desc : get the type of this blast db, whether it holds
162 proteins or nucleotides
163 Ret : 'protein' or 'nucleotide'
164 Args : optional new value for the type, either 'protein' or 'nucleotide'
166 Note: must run $db->update for changes to be written to the db, unless you've
171 Usage: $db->file_base;
172 Desc : get/set the basename and path relative to 'blast_db_path' config var
173 Ret : the path and basename, e.g. 'genbank/nr' or 'screening/organelle/ATH_mitochondria'
174 Args : (optional) new string containing subpath and basename
177 Note: must run $db->update for changes to be written to the db, unless you've
182 =head2 genomic_libraries_annotated
184 Desc: get the L<CXGN::Genomic::Library> objects that are slated as using this
185 blast database for annotation
187 Ret : array of L<CXGN::Genomic::Library> objects
191 __PACKAGE__
->has_many( genomic_libraries_annotated
=>
192 [ 'CXGN::Genomic::LibraryAnnotationDB' => 'library_id' ],
197 Desc: get the earliest unix modification time of the database files
199 Ret : unix modification time of the database files, or nothing if does not exist
207 return unless $self->_fileset;
208 return $self->_fileset->file_modtime;
213 Usage: my $time = $db->format_time;
214 Desc : get the format time of these db files
215 Ret : the value time() would have returned when
216 this database was last formatted, or undef
217 if that could not be determined (like if the
220 Side Effects: runs 'fastacmd' to extract the formatting
221 time from the database files
223 NOTE: This function assumes that the computer that
224 last formatted this database had the same time zone
225 set as the computer we are running on.
231 return unless $self->_fileset;
232 return $self->_fileset->format_time;
235 =head2 full_file_basename
239 Ret : full path to the blast database file basename,
243 my $basename = $db->full_file_basename;
244 #returns '/data/shared/blast/databases/genbank/nr'
248 sub full_file_basename
{
250 my $class = ref $this;
252 return scalar File
::Spec
->catfile( $class->dbpath,
260 Usage: my @files = $db->list_files;
261 Desc : get the list of files that belong to this blast database
262 Ret : list of full paths to all files belonging to this blast database,
264 Side Effects: looks in the filesystem
270 return unless $self->_fileset;
271 $self->_fileset->list_files();
274 =head2 files_are_complete
276 Usage: print "complete!" if $db->files_are_complete;
277 Desc : tell whether this blast db has a complete set of files on disk
278 Ret : true if the set of files on disk looks complete,
281 Side Effects: lists files on disk
285 sub files_are_complete
{
287 return unless $self->_fileset;
288 return $self->_fileset->files_are_complete;
293 Usage: print "that thing is split, yo" if $db->is_split;
294 Desc : determine whether this database is in multiple parts
295 Ret : true if this database has been split into multiple
296 files by formatdb (e.g. nr.00.pin, nr.01.pin, etc.)
298 Side Effects: looks in filesystem
304 return unless $self->_fileset;
305 return $self->_fileset->is_split;
310 Usage: $bdb->is_indexed
311 Desc : checks whether this blast db is indexed on disk to support
312 individual sequence retrieval. note that this is different
313 from index_seqs(), which is the flag of whether this db
316 Ret : false if not on disk or not indexed, true if indexed
322 return unless $self->_fileset;
323 return $self->_fileset->files_are_complete && $self->_fileset->indexed_seqs;
327 =head2 sequences_count
329 Desc: get the number of sequences in this blast database
331 Ret : number of distinct sequences in this blast database, or undef
332 if it could not be determined due to some error or other
333 Side Effects: runs 'fastacmd' to get stats on the blast database file
337 sub sequences_count
{
339 return unless $self->_fileset;
340 return $self->_fileset->sequences_count;
343 =head2 is_contaminant_for
345 This method doesn't work yet.
347 Usage: my $is_contam = $bdb->is_contaminant_for($lib);
348 Desc : return whether this BlastDB contains sequences
349 from something that would be considered a contaminant
350 in the given CXGN::Genomic::Library
352 Args : a CXGN::Genomic::Library object
356 __PACKAGE__
->has_many( _lib_annots
=> 'CXGN::Genomic::LibraryAnnotationDB' );
357 sub is_contaminant_for
{
358 my ($this,$lib) = @_;
360 #return true if any arguments are true
361 return any
( map { $_->is_contaminant && $_->library_id == $lib } $this->_lib_annots);
366 Usage: print "you should update ".$db->title if $db->needs_update;
367 Desc : check whether this blast DB needs to be updated
368 Ret : true if this database's files need an update or are missing,
371 Side Effects: runs format_time(), which runs `fastacmd`
378 #it of course needs an update if it is not complete
379 return 1 unless $self->files_are_complete;
381 my $modtime = $self->format_time();
383 #if no modtime, files must not even be there
384 return 1 unless $modtime;
386 #manually updated DBs never _need_ updates if their
388 return 0 if $self->update_freq eq 'manual';
390 #also need update if it is set to be indexed but is not indexed
391 return 1 if $self->index_seqs && ! $self->is_indexed;
393 #figure out the maximum number of seconds we'll tolerate
394 #the files being out of date
395 my $max_time_offset = 60 * 60 * 24 * do { #figure out number of days
396 if( $self->update_freq eq 'daily' ) { 1 }
397 elsif( $self->update_freq eq 'weekly' ) { 7 }
398 elsif( $self->update_freq eq 'monthly' ) { 31 }
400 confess
"invalid update_freq ".$self->update_freq;
404 #subtract from modtime and make a decision
405 return time-$modtime > $max_time_offset ?
1 : 0;
409 =head2 check_format_permissions
411 Usage: $bdb->check_format_permissions() or die "cannot format!\n";
412 Desc : check directory existence and file permissions to see if a
413 format_from_file() is likely to succeed. This is useful,
414 for example, when you have a script that downloads some
415 remote database and you'd like to check first whether
416 we even have permissions to format before you take the
417 time to download something.
419 Ret : nothing if everything looks good,
420 otherwise a string error message summarizing the reason
422 Side Effects: reads from filesystem, may stat some files
426 sub check_format_permissions
{
427 my ($self,$ffbn) = @_;
428 croak
"ffbn arg is no longer supported, maybe you should just use a new Bio::BLAST::Database object" if $ffbn;
429 return unless $self->_fileset('write');
430 return $self->_fileset('write')->check_format_permissions;
433 =head2 format_from_file
435 Usage: $db->format_from_file('mysequences.seq');
436 Desc : format this blast database from the given source file,
437 into its proper place on disk, overwriting the files already
439 Ret : nothing meaningful
440 Args : filename containing sequences,
441 Side Effects: runs 'formatdb' to format the given sequences,
446 sub format_from_file
{
447 my ($self,$seqfile,$ffbn) = @_;
448 $ffbn and croak
"ffbn arg no longer supported. maybe you should make a new Bio::BLAST::Database object";
450 $self->_fileset('write')
451 ->format_from_file( seqfile
=> $seqfile, indexed_seqs
=> $self->index_seqs, title
=> $self->title );
456 Usage: my $fasta_fh = $bdb->to_fasta;
457 Desc : get the contents of this blast database in FASTA format
458 Ret : an IO::Pipe filehandle, or nothing if it could not be opened
460 Side Effects: runs 'fastacmd' in a forked process, cleaning up its output,
461 and passing it to you
467 return unless $self->_fileset;
468 return $self->_fileset->to_fasta;
473 Usage: my $seq = $bdb->get_sequence('LE_HBa0001A02');
474 Desc : get a particular sequence from this db
475 Args : sequence name to retrieve
476 Ret : Bio::PrimarySeqI object, or nothing if not found or
478 Side Effects: dies on error, like if this db is not indexed
483 my ($self, $seqname) = @_;
484 return unless $self->_fileset;
485 return $self->_fileset->get_sequence($seqname);
490 Usage: CXGN::BlastDB->dbpath('/data/cluster/blast/databases');
491 Desc : class method to get/set the location where all blast database
492 files are expected to be found. Defaults to the value of the
493 CXGN configuration variable 'blast_db_path'.
494 Ret : the current base path
495 Args : (optional) new base path
496 Side Effects: gets/sets a piece of CLASS-WIDE data
500 #mk_classdata is from Class::Data::Inheritable. good little module,
501 #you should look at it
502 __PACKAGE__
->mk_classdata( dbpath
=> CXGN
::BlastDB
::Config
->load->{'blast_db_path'} );
504 =head2 identifier_url
506 Usage: my $url = $db->identifier_url('some ident from this bdb');
507 Desc : get a URL to look up more information on this identifier.
508 first tries to make a URL using the lookup_url column in the
509 sgn.blast_db table, then tries to use identifier_url() from
510 L<CXGN::Tools::Identifiers>
511 Args : the identifier to lookup, assumed
512 to be from this blast db
513 Ret : a URL, or undef if none could be found
514 Side Effects: Example:
519 my ($self,$ident) = @_;
520 $ident or croak
'must pass an identifier to link';
522 return $self->lookup_url
523 ?
sprintf($self->lookup_url,$ident)
524 : do { require CXGN
::Tools
::Identifiers
; CXGN
::Tools
::Identifiers
::identifier_url
($ident) };
527 # accessor that holds our encapsulated Bio::BLAST::Database
529 NORMALIZER
=> sub { #< need to take the full_file_basename (really the dbpath) into account for the memoization
530 my $s = shift; join ',',$s,@_,$s->full_file_basename
533 my ($self,$write) = @_;
534 my $ffbn = $self->full_file_basename;
535 return Bio
::BLAST
::Database
->open( full_file_basename
=> $ffbn,
537 ($write ?
( write => 1,
551 Robert Buels, E<lt>rmb32@cornell.eduE<gt>
553 =head1 COPYRIGHT & LICENSE
555 Copyright 2009 Boyce Thompson Institute for Plant Research
557 This program is free software; you can redistribute it and/or modify
558 it under the same terms as Perl itself.
562 package CXGN
::BlastDB
::Group
;
567 use base qw
/CXGN::CDBI::Class::DBI Class::Data::Inheritable/;
568 __PACKAGE__
->table(__PACKAGE__
->qualify_schema('sgn') . '.blast_db_group');
570 #define our database columns with class::dbi
571 our @primary_key_names = ('blast_db_group_id');
574 ('blast_db_group_id',
# Register the column groups for the blast_db_group table.
# 'Primary' holds the single key column named in @primary_key_names.
578 __PACKAGE__
->columns( Primary
=> @primary_key_names, );
# 'All' and 'Essential' share the same list (@column_names).
579 __PACKAGE__
->columns( All
=> @column_names, );
580 __PACKAGE__
->columns( Essential
=> @column_names, );
# Schema-prefixed name of the sequence associated with this table.
# NOTE(review): the table name above is built with qualify_schema('sgn')
# while this sequence name is built with base_schema('sgn') -- confirm
# that both resolve to the same schema in every deployment.
581 __PACKAGE__
->sequence( __PACKAGE__
->base_schema('sgn'). '.blast_db_group_blast_db_group_id_seq' );

# Adds a blast_dbs() relationship to CXGN::BlastDB objects,
# with an order_by of 'title'.
583 __PACKAGE__
->has_many( blast_dbs
=> 'CXGN::BlastDB', {order_by
=> 'title'} );