6 CXGN::Blast - a BLAST database that we keep in stock and updated.
12 This object has been derived from CXGN::BlastDB (in cxgn-corelibs) and
13 refactored to work with DBIx::Class (instead of Class::DBI) and Moose.
15 The constructor now requires some additional arguments:
17 my $db = CXGN::Blast->new( { blast_db_id => $x,
22 (this standard constructor now replaces the previous from_id() constructor).
24 my @dbs = CXGN::Blast->retrieve_all(); #get all blastDB objects
26 #change the title of a blast db in memory
27 $dbs[0]->title( 'Sequences from Tomatoes' );
29 Updating the object in the database is not supported right now.
31 #do a blast against this database
32 CXGN::Tools::Run->run( 'blastall',
35 -d => $dbs[0]->full_file_basename,
38 -o => 'myreport.m8.blast',
43 #does it need to be updated?
44 print "univec needs updating\n" if $uv->needs_update;
46 #list the files that are part of our univec DB
47 my @uv_files = $uv->list_files;
48 #returns ( '/data/shared/blast/databases/screening/vector/UniVec.nin',
49 # '/data/shared/blast/databases/screening/vector/UniVec.nhr',
50 # '/data/shared/blast/databases/screening/vector/UniVec.nsq',
53 #how many sequences does it have in it?
54 print "this copy of univec has ".$uv->sequences_count." sequences in it\n";
56 #we've got an updated copy of univec here, let's format it and install it
58 $uv->format_from_file('new_univec.seq');
60 #i'll plop another formatted copy of univec in my home dir too
61 $uv->dbpath('/home/rob/blast');
62 $uv->format_from_file('new_univec.seq');
63 #that will have done a mkpath -p /home/rob/blast/screening/vector,
64 #then it will have put the Univec.nin, nhr, and nsq files there.
68 This is a handle on a BLAST database we manage. Each of
69 these objects corresponds to a row in the sgn.blast_db table and
70 a set of files in the filesystem, at a place specified by the
71 dbpath() accessor (see dbpath docs below). This path defaults to
72 the value of the 'blast_db_path' configuration variable (see L<CXGN::Config>).
88 use List
::MoreUtils qw
/ uniq /;
92 use CXGN
::Tools
::List qw
/any min all max/;
95 use Bio
::BLAST
::Database
;
97 has
'sgn_schema' => ( isa
=> 'SGN::Schema',
102 has
'blast_db_id' => ( isa
=> 'Int',
107 has
'dbpath' => ( isa
=> 'Maybe[Str]',
112 has
'file_base' => ( isa
=> 'Maybe[Str]',
116 has
'title' => ( isa
=> 'Str',
121 has
'type' => ( isa
=> 'Maybe[Str]',
125 has
'source_url' => ( isa
=> 'Maybe[Str]',
130 has
'lookup_url' => (isa
=> 'Maybe[Str]',
135 has
'info_url' => ( isa
=> 'Maybe[Str]',
139 has
'update_freq' => ( isa
=> 'Maybe[Str]',
143 has
'index_seqs' => ( isa
=> 'Bool',
147 has
'web_interface_visible' => ( isa
=> 'Bool',
151 has
'description' => (isa
=> 'Maybe[Str]',
155 ###our @column_names =
156 # ('blast_db_id', #-database serial number
157 # 'file_base', #-basename of the database files, with a path prepended,
158 # # e.g. 'genbank/nr'
159 # 'title', #-title of the database, e.g. 'NCBI Non-redundant proteins'
160 # 'type', #-type, either 'protein' or 'nucleotide'
161 # 'source_url', #-the URL new copies of this database can be fetched from
162 # 'lookup_url', #-printf-style format string that can be used to generate
163 # # a URL where a user can get more info on a sequence in
165 # 'info_url', #-URL that gives information about the contents of this DB
166 # 'update_freq', #-frequency of updating this blast database
167 # 'index_seqs', #- corresponds to formatdb's -o option. Set true if formatdb
168 # # should be given a '-o T'. This is used if you later want to
169 # # fetch specific sequences out of this blast db
170 # 'web_interface_visible', #whether the blast web interface should display this DB
171 # 'blast_db_group_id', #ID of the blast DB group this db belongs to,
172 # #if any. used for displaying them in the web
174 # 'description', # text description of the database, display on the database details page
180 if ($self->blast_db_id) {
181 my $row = $self->sgn_schema()->resultset("BlastDb")->find( { blast_db_id
=> $self->blast_db_id() } );
182 if (!$row) { die "The blast_db_id with the id ".$self->blast_db_id()." does not exist in this database\n"; }
183 $self->file_base($row->file_base());
184 $self->title($row->title());
185 $self->type($row->type());
186 $self->source_url($row->source_url());
187 $self->lookup_url($row->lookup_url());
188 $self->info_url($row->info_url());
189 $self->update_freq($row->update_freq());
190 $self->index_seqs($row->index_seqs());
191 $self->web_interface_visible($row->web_interface_visible());
192 $self->description($row->description());
195 print STDERR
"No blast_db_id provided. Creating empty object...\n";
203 my $sgn_schema = shift;
206 my @dbs = $class->search($sgn_schema, $dbpath);
213 my $sgn_schema = shift;
217 my $rs = $sgn_schema->resultset("BlastDb")->search( { %search } );
221 while (my $db = $rs->next()) {
222 my $bdbo = CXGN
::Blast
->new( sgn_schema
=> $sgn_schema, dbpath
=> $dbpath, blast_db_id
=> $db->blast_db_id() );
230 ## Note: replaced by standard moose constructor and blast_db_id arg
232 # Usage: my $bdb = CXGN::BlastDB->from_id(12);
233 # Desc : retrieve a BlastDB object using its ID number
234 # Ret : a BlastDB object for that id number, or undef if none found
235 # Args : the id number of the object to retrieve
236 # Side Effects: accesses the database
241 # shift->retrieve(@_);
244 #Only document title and type for external users of this module,
245 #some nicer wrappers will be provided for using the other information
250 Desc : get/set the title of this blast DB object
251 Ret : string containing the title, e.g. 'NCBI Non-redundant proteins'
252 Args : optional new value for the title
254 Note: must run $db->update for changes to be written to the db, unless you've
260 Desc : get/set the type of this blast db, whether it holds
261 proteins or nucleotides
262 Ret : 'protein' or 'nucleotide'
263 Args : optional new value for the type, either 'protein' or 'nucleotide'
265 Note: must run $db->update for changes to be written to the db, unless you've
270 Usage: $db->file_base;
271 Desc : get/set the basename and path relative to 'blast_db_path' config var
272 Ret : the path and basename, e.g. 'genbank/nr' or 'screening/organelle/ATH_mitochondria'
273 Args : (optional) new string containing subpath and basename
276 Note: must run $db->update for changes to be written to the db, unless you've
281 =head2 genomic_libraries_annotated
283 Desc: get the L<CXGN::Genomic::Library> objects that are slated as using this
284 blast database for annotation
286 Ret : array of L<CXGN::Genomic::Library> objects
292 Desc: get the earliest unix modification time of the database files
294 Ret : unix modification time of the database files, or nothing if does not exist
302 return unless $self->_fileset;
303 return $self->_fileset->file_modtime;
308 Usage: my $time = $db->format_time;
309 Desc : get the format time of these db files
310 Ret : the value time() would have returned when
311 this database was last formatted, or undef
312 if that could not be determined (like if the
315 Side Effects: runs 'fastacmd' to extract the formatting
316 time from the database files
318 NOTE: This function assumes that the computer that
319 last formatted this database had the same time zone
320 set as the computer we are running on.
326 return unless $self->_fileset;
327 return $self->_fileset->format_time;
330 =head2 full_file_basename
334 Ret : full path to the blast database file basename,
338 my $basename = $db->full_file_basename;
339 #returns '/data/shared/blast/databases/genbank/nr'
343 sub full_file_basename
{
346 return scalar File
::Spec
->catfile( $self->dbpath,
354 Usage: my @files = $db->list_files;
355 Desc : get the list of files that belong to this blast database
356 Ret : list of full paths to all files belonging to this blast database,
358 Side Effects: looks in the filesystem
364 return unless $self->_fileset;
365 $self->_fileset->list_files();
368 =head2 files_are_complete
370 Usage: print "complete!" if $db->files_are_complete;
371 Desc : tell whether this blast db has a complete set of files on disk
372 Ret : true if the set of files on disk looks complete,
375 Side Effects: lists files on disk
379 sub files_are_complete
{
381 return unless $self->_fileset;
382 return $self->_fileset->files_are_complete;
387 Usage: print "that thing is split, yo" if $db->is_split;
388 Desc : determine whether this database is in multiple parts
389 Ret : true if this database has been split into multiple
390 files by formatdb (e.g. nr.00.pin, nr.01.pin, etc.)
392 Side Effects: looks in filesystem
398 return unless $self->_fileset;
399 return $self->_fileset->is_split;
404 Usage: $bdb->is_indexed
405 Desc : checks whether this blast db is indexed on disk to support
406 individual sequence retrieval. note that this is different
407 from index_seqs(), which is the flag of whether this db
410 Ret : false if not on disk or not indexed, true if indexed
416 return unless $self->_fileset;
417 return $self->_fileset->files_are_complete && $self->_fileset->indexed_seqs;
421 =head2 sequences_count
423 Desc: get the number of sequences in this blast database
425 Ret : number of distinct sequences in this blast database, or undef
426 if it could not be determined due to some error or other
427 Side Effects: runs 'fastacmd' to get stats on the blast database file
431 sub sequences_count
{
433 return unless $self->_fileset;
434 return $self->_fileset->sequences_count;
437 =head2 is_contaminant_for
439 This method doesn't work yet.
441 Usage: my $is_contam = $bdb->is_contaminant_for($lib);
442 Desc : return whether this BlastDB contains sequences
443 from something that would be considered a contaminant
444 in the given CXGN::Genomic::Library
446 Args : a CXGN::Genomic::Library object
450 #__PACKAGE__->has_many( _lib_annots => 'CXGN::Genomic::LibraryAnnotationDB' );
451 sub is_contaminant_for
{
452 my ($this,$lib) = @_;
454 #return true if any arguments are true
455 return any
( map { $_->is_contaminant && $_->library_id == $lib } $this->_lib_annots);
460 Usage: print "you should update ".$db->title if $db->needs_update;
461 Desc : check whether this blast DB needs to be updated
462 Ret : true if this database's files need an update or are missing,
465 Side Effects: runs format_time(), which runs `fastacmd`
472 #it of course needs an update if it is not complete
473 return 1 unless $self->files_are_complete;
475 my $modtime = $self->format_time();
477 #if no modtime, files must not even be there
478 return 1 unless $modtime;
480 #manually updated DBs never _need_ updates if their
482 return 0 if $self->update_freq eq 'manual';
484 #also need update if it is set to be indexed but is not indexed
485 return 1 if $self->index_seqs && ! $self->is_indexed;
487 #figure out the maximum number of seconds we'll tolerate
488 #the files being out of date
489 my $max_time_offset = 60 * 60 * 24 * do { #figure out number of days
490 if( $self->update_freq eq 'daily' ) { 1 }
491 elsif( $self->update_freq eq 'weekly' ) { 7 }
492 elsif( $self->update_freq eq 'monthly' ) { 31 }
494 confess
"invalid update_freq ".$self->update_freq;
498 #subtract from modtime and make a decision
499 return time-$modtime > $max_time_offset ?
1 : 0;
503 =head2 check_format_permissions
505 Usage: my $err = $bdb->check_format_permissions(); die "cannot format: $err\n" if $err;
506 Desc : check directory existence and file permissions to see if a
507 format_from_file() is likely to succeed. This is useful,
508 for example, when you have a script that downloads some
509 remote database and you'd like to check first whether
510 we even have permissions to format before you take the
511 time to download something.
513 Ret : nothing if everything looks good,
514 otherwise a string error message summarizing the reason
516 Side Effects: reads from filesystem, may stat some files
520 sub check_format_permissions
{
521 my ($self,$ffbn) = @_;
522 croak
"ffbn arg is no longer supported, maybe you should just use a new Bio::BLAST::Database object" if $ffbn;
523 return unless $self->_fileset('write');
524 return $self->_fileset('write')->check_format_permissions;
527 =head2 format_from_file
529 Usage: $db->format_from_file('mysequences.seq');
530 Desc : format this blast database from the given source file,
531 into its proper place on disk, overwriting the files already
533 Ret : nothing meaningful
534 Args : filename containing sequences,
535 Side Effects: runs 'formatdb' to format the given sequences,
540 sub format_from_file
{
541 my ($self,$seqfile,$ffbn) = @_;
542 $ffbn and croak
"ffbn arg no longer supported. maybe you should make a new Bio::BLAST::Database object";
544 $self->_fileset('write')
545 ->format_from_file( seqfile
=> $seqfile, indexed_seqs
=> $self->index_seqs, title
=> $self->title );
550 Usage: my $fasta_fh = $bdb->to_fasta;
551 Desc : get the contents of this blast database in FASTA format
552 Ret : an IO::Pipe filehandle, or nothing if it could not be opened
554 Side Effects: runs 'fastacmd' in a forked process, cleaning up its output,
555 and passing it to you
561 return unless $self->_fileset;
562 return $self->_fileset->to_fasta;
567 Usage: my $seq = $bdb->get_sequence('LE_HBa0001A02');
568 Desc : get a particular sequence from this db
569 Args : sequence name to retrieve
570 Ret : Bio::PrimarySeqI object, or nothing if not found or
572 Side Effects: dies on error, like if this db is not indexed
577 my ($self, $seqname) = @_;
578 return unless $self->_fileset;
579 return $self->_fileset->get_sequence($seqname);
584 Usage: $bdb->dbpath('/data/cluster/blast/databases');
585 Desc : object method to get/set the location where all blast database
586 files are expected to be found. Defaults to the value of the
587 CXGN configuration variable 'blast_db_path'.
588 Ret : the current base path
589 Args : (optional) new base path
590 Side Effects: gets/sets the dbpath attribute of this object (no longer class-wide data)
594 #mk_classdata is from Class::Data::Inheritable. good little module,
595 #you should look at it
596 #__PACKAGE__->mk_classdata( dbpath => CXGN::BlastDB::Config->load->{'blast_db_path'} );
598 =head2 identifier_url
600 Usage: my $url = $db->identifier_url('some ident from this bdb');
601 Desc : get a URL to look up more information on this identifier.
602 first tries to make a URL using the lookup_url column in the
603 sgn.blast_db table, then tries to use identifier_url() from
604 L<CXGN::Tools::Identifiers>
605 Args : the identifier to lookup, assumed
606 to be from this blast db
607 Ret : a URL, or undef if none could be found
608 Side Effects: Example:
613 my ($self,$ident) = @_;
614 $ident or croak
'must pass an identifier to link';
616 return $self->lookup_url
617 ?
sprintf($self->lookup_url,$ident)
618 : do { require CXGN
::Tools
::Identifiers
; CXGN
::Tools
::Identifiers
::identifier_url
($ident) };
621 # accessor that holds our encapsulated Bio::BLAST::Database
623 NORMALIZER
=> sub { #< need to take the full_file_basename (really the dbpath) into account for the memoization
624 my $s = shift; join ',',$s,@_,$s->full_file_basename
628 my ($self,$write) = @_;
629 my $ffbn = $self->full_file_basename;
630 return Bio
::BLAST
::Database
->open( full_file_basename
=> $ffbn,
632 ($write ?
( write => 1,
642 Original maintainer: Robert Buels
643 Refactored by Lukas Nov 2016.
647 Robert Buels, E<lt>rmb32@cornell.eduE<gt>
649 =head1 COPYRIGHT & LICENSE
651 Copyright 2009 Boyce Thompson Institute for Plant Research
653 This program is free software; you can redistribute it and/or modify
654 it under the same terms as Perl itself.
658 # package CXGN::BlastDB::Group;
663 # use base qw/CXGN::CDBI::Class::DBI Class::Data::Inheritable/;
664 # __PACKAGE__->table(__PACKAGE__->qualify_schema('sgn') . '.blast_db_group');
666 # #define our database columns with class::dbi
667 # our @primary_key_names = ('blast_db_group_id');
669 # our @column_names =
670 # ('blast_db_group_id',
674 # __PACKAGE__->columns( Primary => @primary_key_names, );
675 # __PACKAGE__->columns( All => @column_names, );
676 # __PACKAGE__->columns( Essential => @column_names, );
677 # __PACKAGE__->sequence( __PACKAGE__->base_schema('sgn'). '.blast_db_group_blast_db_group_id_seq' );
679 # __PACKAGE__->has_many( blast_dbs => 'CXGN::BlastDB', {order_by => 'title'} );