5 CXGN::BlastDB - a BLAST database that we keep in stock and updated.
11 my $db = CXGN::BlastDB->from_id(10); #get by ID
13 my @dbs = CXGN::BlastDB->retrieve_all(); #get all blastDB objects
15 #search ilike by title
16 my @dbs = CXGN::BlastDB->search_ilike( title => '%solanum%' );
18 #change the title of a blast db in memory
19 $dbs[0]->title( 'Sequences from Tomatoes' );
21 #write the changes to the DB object to the database
24 #do a blast against this database
25 CXGN::Tools::Run->run( 'blastall',
28 -d => $dbs[0]->full_file_basename,
31 -o => 'myreport.m8.blast',
36 #get a handle on our managed copy of the NCBI UniVec database
37 my ($uv) = CXGN::BlastDB->search_ilike( title => '%univec%');
39 #set the path to our blast databases repository
40 CXGN::BlastDB->dbpath('/data/shared/blast/databases');
42 #does it need to be updated?
43 print "univec needs updating\n" if $uv->needs_update;
45 #list the files that are part of our univec DB
46 my @uv_files = $uv->list_files;
47 #returns ( '/data/shared/blast/databases/screening/vector/UniVec.nin',
48 # '/data/shared/blast/databases/screening/vector/UniVec.nhr',
49 # '/data/shared/blast/databases/screening/vector/UniVec.nsq',
52 #how many sequences does it have in it?
53 print "this copy of univec has ".$uv->sequence_count." sequences in it\n";
55 #we've got an updated copy of univec here, let's format it and install it
57 $uv->format_from_file('new_univec.seq');
59 #i'll plop another formatted copy of univec in my home dir too
60 CXGN::BlastDB->dbpath('/home/rob/blast');
61 $uv->format_from_file('new_univec.seq');
62 #that will have done a mkpath -p /home/rob/blast/screening/vector,
63 #then it will have put the Univec.nin, nhr, and nsq files there.
67 This is a handle on a BLAST database we manage. Each of
68 these objects corresponds to a row in the sgn.blast_db table and
69 a set of files in the filesystem. at a place specified by this class's
70 dbpath() data member (see dbpath docs below). This path defaults to
71 the value of the 'blast_db_path' configuration variable (see L<CXGN::Config>).
85 use List
::MoreUtils qw
/ uniq /;
89 use CXGN
::BlastDB
::Config
;
90 use CXGN
::Tools
::List qw
/any min all max/;
93 use Bio
::BLAST
::Database
;
95 use base qw
/CXGN::CDBI::Class::DBI Class::Data::Inheritable/;
96 __PACKAGE__
->table('blast_db');
98 #define our database columns with class::dbi
99 our @primary_key_names = ('blast_db_id');
102 ('blast_db_id', #-database serial number
103 'file_base', #-basename of the database files, with a path prepended,
105 'title', #-title of the database, e.g. 'NCBI Non-redundant proteins'
106 'type', #-type, either 'protein' or 'nucleotide'
107 'source_url', #-the URL new copies of this database can be fetched from
108 'lookup_url', #-printf-style format string that can be used to generate
109 # a URL where a user can get more info on a sequence in
111 'info_url', #-URL that gives information about the contents of this DB
112 'update_freq', #-frequency of updating this blast database
113 'index_seqs', #- corresponds to formatdb's -o option. Set true if formatdb
114 # should be given a '-o T'. This is used if you later want to
115 # fetch specific sequences out of this blast db
116 'web_interface_visible', #whether the blast web interface should display this DB
117 'blast_db_group_id', #ID of the blast DB group this db belondgs to,
118 #if any. used for displaying them in the web
120 'description', # text description of the database, display on the database details page
122 __PACKAGE__
->columns( Primary
=> @primary_key_names, );
123 __PACKAGE__
->columns( All
=> @column_names, );
124 __PACKAGE__
->columns( Essential
=> @column_names, );
125 __PACKAGE__
->sequence( 'blast_db_blast_db_id_seq' );
127 __PACKAGE__
->has_a( blast_db_group_id
=> 'CXGN::BlastDB::Group');
131 Usage: my $bdb = CXGN::BlastDB->from_id(12);
132 Desc : retrieve a BlastDB object using its ID number
133 Ret : a BlastDB object for that id number, or undef if none found
134 Args : the id number of the object to retrieve
135 Side Effects: accesses the database
139 sub from_id
{ shift->retrieve(@_); }
141 #only document title and type for external users of this module,
142 #some nicer wrappers will be provided for using the other information
147 Desc : get/set the title of this blast DB object
148 Ret : string containing the title, e.g. 'NCBI Non-redundant proteins'
149 Args : optional new value for the title
151 Note: must run $db->update for changes to be written to the db, unless you've
157 Desc : get the type of this blast db, whether it holds
158 proteins or nucleotides
159 Ret : 'protein' or 'nucleotide'
160 Args : optional new value for the type, either 'protein' or 'nucleotide'
162 Note: must run $db->update for changes to be written to the db, unless you've
167 Usage: $db->file_base;
168 Desc : get/set the basename and path relative to 'blast_db_path' config var
169 Ret : the path and basename, e.g. 'genbank/nr' or 'screening/organelle/ATH_mitochondria'
170 Args : (optional) new string containing subpath and basename
173 Note: must run $db->update for changes to be written to the db, unless you've
178 =head2 genomic_libraries_annotated
180 Desc: get the L<CXGN::Genomic::Library> objects that are slated as using this
181 blast database for annotation
183 Ret : array of L<CXGN::Genomic::Library> objects
187 __PACKAGE__
->has_many( genomic_libraries_annotated
=>
188 [ 'CXGN::Genomic::LibraryAnnotationDB' => 'library_id' ],
193 Desc: get the earliest unix modification time of the database files
195 Ret : unix modification time of the database files, or nothing if does not exist
203 return unless $self->_fileset;
204 return $self->_fileset->file_modtime;
209 Usage: my $time = $db->format_time;
210 Desc : get the format time of these db files
211 Ret : the value time() would have returned when
212 this database was last formatted, or undef
213 if that could not be determined (like if the
216 Side Effects: runs 'fastacmd' to extract the formatting
217 time from the database files
219 NOTE: This function assumes that the computer that
220 last formatted this database had the same time zone
221 set as the computer we are running on.
227 return unless $self->_fileset;
228 return $self->_fileset->format_time;
231 =head2 full_file_basename
235 Ret : full path to the blast database file basename,
239 my $basename = $db->full_file_basename;
240 #returns '/data/shared/blast/databases/genbank/nr'
244 sub full_file_basename
{
246 my $class = ref $this;
248 return scalar File
::Spec
->catfile( $class->dbpath,
256 Usage: my @files = $db->list_files;
257 Desc : get the list of files that belong to this blast database
258 Ret : list of full paths to all files belonging to this blast database,
260 Side Effects: looks in the filesystem
266 return unless $self->_fileset;
267 $self->_fileset->list_files();
270 =head2 files_are_complete
272 Usage: print "complete!" if $db->files_are_complete;
273 Desc : tell whether this blast db has a complete set of files on disk
274 Ret : true if the set of files on disk looks complete,
277 Side Effects: lists files on disk
281 sub files_are_complete
{
283 return unless $self->_fileset;
284 return $self->_fileset->files_are_complete;
289 Usage: print "that thing is split, yo" if $db->is_split;
290 Desc : determine whether this database is in multiple parts
291 Ret : true if this database has been split into multiple
292 files by formatdb (e.g. nr.00.pin, nr.01.pin, etc.)
294 Side Effects: looks in filesystem
300 return unless $self->_fileset;
301 return $self->_fileset->is_split;
306 Usage: $bdb->is_indexed
307 Desc : checks whether this blast db is indexed on disk to support
308 individual sequence retrieval. note that this is different
309 from index_seqs(), which is the flag of whether this db
312 Ret : false if not on disk or not indexed, true if indexed
318 return unless $self->_fileset;
319 return $self->_fileset->files_are_complete && $self->_fileset->indexed_seqs;
323 =head2 sequences_count
325 Desc: get the number of sequences in this blast database
327 Ret : number of distinct sequences in this blast database, or undef
328 if it could not be determined due to some error or other
329 Side Effects: runs 'fastacmd' to get stats on the blast database file
333 sub sequences_count
{
335 return unless $self->_fileset;
336 return $self->_fileset->sequences_count;
339 =head2 is_contaminant_for
341 This method doesn't work yet.
343 Usage: my $is_contam = $bdb->is_contaminant_for($lib);
344 Desc : return whether this BlastDB contains sequences
345 from something that would be considered a contaminant
346 in the given CXGN::Genomic::Library
348 Args : a CXGN::Genomic::Library object
352 __PACKAGE__
->has_many( _lib_annots
=> 'CXGN::Genomic::LibraryAnnotationDB' );
353 sub is_contaminant_for
{
354 my ($this,$lib) = @_;
356 #return true if any arguments are true
357 return any
( map { $_->is_contaminant && $_->library_id == $lib } $this->_lib_annots);
362 Usage: print "you should update ".$db->title if $db->needs_update;
363 Desc : check whether this blast DB needs to be updated
364 Ret : true if this database's files need an update or are missing,
367 Side Effects: runs format_time(), which runs `fastacmd`
374 #it of course needs an update if it is not complete
375 return 1 unless $self->files_are_complete;
377 my $modtime = $self->format_time();
379 #if no modtime, files must not even be there
380 return 1 unless $modtime;
382 #manually updated DBs never _need_ updates if their
384 return 0 if $self->update_freq eq 'manual';
386 #also need update if it is set to be indexed but is not indexed
387 return 1 if $self->index_seqs && ! $self->is_indexed;
389 #figure out the maximum number of seconds we'll tolerate
390 #the files being out of date
391 my $max_time_offset = 60 * 60 * 24 * do { #figure out number of days
392 if( $self->update_freq eq 'daily' ) { 1 }
393 elsif( $self->update_freq eq 'weekly' ) { 7 }
394 elsif( $self->update_freq eq 'monthly' ) { 31 }
396 confess
"invalid update_freq ".$self->update_freq;
400 #subtract from modtime and make a decision
401 return time-$modtime > $max_time_offset ?
1 : 0;
405 =head2 check_format_permissions
407 Usage: $bdb->check_format_from_file() or die "cannot format!\n";
408 Desc : check directory existence and file permissions to see if a
409 format_from_file() is likely to succeed. This is useful,
410 for example, when you have a script that downloads some
411 remote database and you'd like to check first whether
412 we even have permissions to format before you take the
413 time to download something.
415 Ret : nothing if everything looks good,
416 otherwise a string error message summarizing the reason
418 Side Effects: reads from filesystem, may stat some files
422 sub check_format_permissions
{
423 my ($self,$ffbn) = @_;
424 croak
"ffbn arg is no longer supported, maybe you should just use a new Bio::BLAST::Database object" if $ffbn;
425 return unless $self->_fileset('write');
426 return $self->_fileset('write')->check_format_permissions;
429 =head2 format_from_file
431 Usage: $db->format_from_file('mysequences.seq');
432 Desc : format this blast database from the given source file,
433 into its proper place on disk, overwriting the files already
435 Ret : nothing meaningful
436 Args : filename containing sequences,
437 Side Effects: runs 'formatdb' to format the given sequences,
442 sub format_from_file
{
443 my ($self,$seqfile,$ffbn) = @_;
444 $ffbn and croak
"ffbn arg no longer supported. maybe you should make a new Bio::BLAST::Database object";
446 $self->_fileset('write')
447 ->format_from_file( seqfile
=> $seqfile, indexed_seqs
=> $self->index_seqs, title
=> $self->title );
452 Usage: my $fasta_fh = $bdb->to_fasta;
453 Desc : get the contents of this blast database in FASTA format
454 Ret : an IO::Pipe filehandle, or nothing if it could not be opened
456 Side Effects: runs 'fastacmd' in a forked process, cleaning up its output,
457 and passing it to you
463 return unless $self->_fileset;
464 return $self->_fileset->to_fasta;
469 Usage: my $seq = $bdb->get_sequence('LE_HBa0001A02');
470 Desc : get a particular sequence from this db
471 Args : sequence name to retrieve
472 Ret : Bio::PrimarySeqI object, or nothing if not found or
474 Side Effects: dies on error, like if this db is not indexed
479 my ($self, $seqname) = @_;
480 return unless $self->_fileset;
481 return $self->_fileset->get_sequence($seqname);
486 Usage: CXGN::BlastDB->dbpath('/data/cluster/blast/databases');
487 Desc : class method to get/set the location where all blast database
488 files are expected to be found. Defaults to the value of the
489 CXGN configuration variable 'blast_db_path'.
490 Ret : the current base path
491 Args : (optional) new base path
492 Side Effects: gets/sets a piece of CLASS-WIDE data
496 #mk_classdata is from Class::Data::Inheritable. good little module,
497 #you should look at it
498 __PACKAGE__
->mk_classdata( dbpath
=> CXGN
::BlastDB
::Config
->load->{'blast_db_path'} );
500 =head2 identifier_url
502 Usage: my $url = $db->identifier_url('some ident from this bdb');
503 Desc : get a URL to look up more information on this identifier.
504 first tries to make a URL using the lookup_url column in the
505 sgn.blast_db table, then tries to use identifier_url() from
506 L<CXGN::Tools::Identifiers>
507 Args : the identifier to lookup, assumed
508 to be from this blast db
509 Ret : a URL, or undef if none could be found
510 Side Effects: Example:
515 my ($self,$ident) = @_;
516 $ident or croak
'must pass an identifier to link';
518 return $self->lookup_url
519 ?
sprintf($self->lookup_url,$ident)
520 : do { require CXGN
::Tools
::Identifiers
; CXGN
::Tools
::Identifiers
::identifier_url
($ident) };
523 # accessor that holds our encapsulated Bio::BLAST::Database
525 NORMALIZER
=> sub { #< need to take the full_file_basename (really the dbpath) into account for the memoization
526 my $s = shift; join ',',$s,@_,$s->full_file_basename
529 my ($self,$write) = @_;
530 my $ffbn = $self->full_file_basename;
531 return Bio
::BLAST
::Database
->open( full_file_basename
=> $ffbn,
533 ($write ?
( write => 1,
547 Robert Buels, E<lt>rmb32@cornell.eduE<gt>
549 =head1 COPYRIGHT & LICENSE
551 Copyright 2009 Boyce Thompson Institute for Plant Research
553 This program is free software; you can redistribute it and/or modify
554 it under the same terms as Perl itself.
558 package CXGN
::BlastDB
::Group
;
563 use base qw
/CXGN::CDBI::Class::DBI Class::Data::Inheritable/;
564 __PACKAGE__
->table(__PACKAGE__
->qualify_schema('sgn') . '.blast_db_group');
566 #define our database columns with class::dbi
567 our @primary_key_names = ('blast_db_group_id');
570 ('blast_db_group_id',
574 __PACKAGE__
->columns( Primary
=> @primary_key_names, );
575 __PACKAGE__
->columns( All
=> @column_names, );
576 __PACKAGE__
->columns( Essential
=> @column_names, );
577 __PACKAGE__
->sequence( __PACKAGE__
->base_schema('sgn'). '.blast_db_group_blast_db_group_id_seq' );
579 __PACKAGE__
->has_many( blast_dbs
=> 'CXGN::BlastDB', {order_by
=> 'title'} );