6 CXGN::Blast - a BLAST database that we keep in stock and updated.
12 This object has been derived from CXGN::BlastDB (in cxgn-corelibs) and
13 refactored to work with DBIx::Class (instead of Class::DBI) and Moose.
15 The constructor now requires some additional arguments:
17 my $db = CXGN::Blast->new( { blast_db_id => $x,
22 (this standard constructor now replaces the previous from_id() constructor).
24 my @dbs = CXGN::Blast->retrieve_all(); #get all blastDB objects
26 #change the title of a blast db in memory
27 $dbs[0]->title( 'Sequences from Tomatoes' );
29 Updating the object in the database is not supported right now.
31 #do a blast against this database
32 CXGN::Tools::Run->run( 'blastall',
35 -d => $dbs[0]->full_file_basename,
38 -o => 'myreport.m8.blast',
43 #does it need to be updated?
44 print "univec needs updating\n" if $uv->needs_update;
46 #list the files that are part of our univec DB
47 my @uv_files = $uv->list_files;
48 #returns ( '/data/shared/blast/databases/screening/vector/UniVec.nin',
49 # '/data/shared/blast/databases/screening/vector/UniVec.nhr',
50 # '/data/shared/blast/databases/screening/vector/UniVec.nsq',
53 #how many sequences does it have in it?
54 print "this copy of univec has ".$uv->sequence_count." sequences in it\n";
56 #we've got an updated copy of univec here, let's format it and install it
58 $uv->format_from_file('new_univec.seq');
60 #i'll plop another formatted copy of univec in my home dir too
61 $bdb->dbpath('/home/rob/blast');
62 $uv->format_from_file('new_univec.seq');
63 #that will have done a mkpath -p /home/rob/blast/screening/vector,
64 #then it will have put the Univec.nin, nhr, and nsq files there.
68 This is a handle on a BLAST database we manage. Each of
69 these objects corresponds to a row in the sgn.blast_db table and
70 a set of files in the filesystem. at a place specified by the
71 dbpath() accessor (see dbpath docs below). This path defaults to
72 the value of the 'blast_db_path' configuration variable (see L<CXGN::Config>).
88 use List
::MoreUtils qw
/ uniq /;
92 use CXGN
::Tools
::List qw
/any min all max/;
95 use Bio
::BLAST2
::Database
;
97 has
'sgn_schema' => ( isa
=> 'SGN::Schema',
102 has
'blast_db_id' => ( isa
=> 'Int',
107 has
'dbpath' => ( isa
=> 'Maybe[Str]',
112 has
'file_base' => ( isa
=> 'Maybe[Str]',
116 has
'title' => ( isa
=> 'Str',
121 has
'type' => ( isa
=> 'Maybe[Str]',
125 has
'source_url' => ( isa
=> 'Maybe[Str]',
130 has
'lookup_url' => (isa
=> 'Maybe[Str]',
135 has
'info_url' => ( isa
=> 'Maybe[Str]',
139 has
'update_freq' => ( isa
=> 'Maybe[Str]',
143 has
'index_seqs' => ( isa
=> 'Bool',
147 has
'web_interface_visible' => ( isa
=> 'Bool',
151 has
'description' => (isa
=> 'Maybe[Str]',
155 has
'jbrowse_src' => (isa
=> 'Maybe[Str]',
162 ###our @column_names =
163 # ('blast_db_id', #-database serial number
164 # 'file_base', #-basename of the database files, with a path prepended,
165 # # e.g. 'genbank/nr'
166 # 'title', #-title of the database, e.g. 'NCBI Non-redundant proteins'
167 # 'type', #-type, either 'protein' or 'nucleotide'
168 # 'source_url', #-the URL new copies of this database can be fetched from
169 # 'lookup_url', #-printf-style format string that can be used to generate
170 # # a URL where a user can get more info on a sequence in
172 # 'info_url', #-URL that gives information about the contents of this DB
173 # 'update_freq', #-frequency of updating this blast database
174 # 'index_seqs', #- corresponds to formatdb's -o option. Set true if formatdb
175 # # should be given a '-o T'. This is used if you later want to
176 # # fetch specific sequences out of this blast db
177 # 'web_interface_visible', #whether the blast web interface should display this DB
178 # 'blast_db_group_id', #ID of the blast DB group this db belondgs to,
179 # #if any. used for displaying them in the web
181 # 'description', # text description of the database, display on the database details page
187 if ($self->blast_db_id) {
188 my $row = $self->sgn_schema()->resultset("BlastDb")->find( { blast_db_id
=> $self->blast_db_id() } );
189 if (!$row) { die "The blast_db_id with the id ".$self->blast_db_id()." does not exist in this database\n"; }
190 $self->file_base($row->file_base());
191 $self->title($row->title());
192 $self->type($row->type());
193 $self->source_url($row->source_url());
194 $self->lookup_url($row->lookup_url());
195 $self->info_url($row->info_url());
196 $self->update_freq($row->update_freq());
197 $self->index_seqs($row->index_seqs());
198 $self->web_interface_visible($row->web_interface_visible());
199 $self->description($row->description());
200 $self->jbrowse_src($row->jbrowse_src());
203 print STDERR
"No blast_db_id provided. Creating empty object...\n";
211 my $sgn_schema = shift;
214 my @dbs = $class->search($sgn_schema, $dbpath);
221 my $sgn_schema = shift;
225 my $rs = $sgn_schema->resultset("BlastDb")->search( { %search } );
229 while (my $db = $rs->next()) {
230 my $bdbo = CXGN
::Blast
->new( sgn_schema
=> $sgn_schema, dbpath
=> $dbpath, blast_db_id
=> $db->blast_db_id() );
238 ## Note: replaced by standard moose constructor and blast_db_id arg
240 # Usage: my $bdb = CXGN::BlastDB->from_id(12);
241 # Desc : retrieve a BlastDB object using its ID number
242 # Ret : a BlastDB object for that id number, or undef if none found
243 # Args : the id number of the object to retrieve
244 # Side Effects: accesses the database
249 # shift->retrieve(@_);
252 #Only document title and type for external users of this module,
253 #some nicer wrappers will be provided for using the other information
258 Desc : get/set the title of this blast DB object
259 Ret : string containing the title, e.g. 'NCBI Non-redundant proteins'
260 Args : optional new value for the title
262 Note: must run $db->update for changes to be written to the db, unless you've
268 Desc : get the type of this blast db, whether it holds
269 proteins or nucleotides
270 Ret : 'protein' or 'nucleotide'
271 Args : optional new value for the type, either 'protein' or 'nucleotide'
273 Note: must run $db->update for changes to be written to the db, unless you've
278 Usage: $db->file_base;
279 Desc : get/set the basename and path relative to 'blast_db_path' config var
280 Ret : the path and basename, e.g. 'genbank/nr' or 'screening/organelle/ATH_mitochondria'
281 Args : (optional) new string containing subpath and basename
284 Note: must run $db->update for changes to be written to the db, unless you've
289 =head2 genomic_libraries_annotated
291 Desc: get the L<CXGN::Genomic::Library> objects that are slated as using this
292 blast database for annotation
294 Ret : array of L<CXGN::Genomic::Library> objects
300 Desc: get the earliest unix modification time of the database files
302 Ret : unix modification time of the database files, or nothing if does not exist
310 return unless $self->_fileset;
311 return $self->_fileset->file_modtime;
316 Usage: my $time = $db->format_time;
317 Desc : get the format time of these db files
318 Ret : the value time() would have returned when
319 this database was last formatted, or undef
320 if that could not be determined (like if the
323 Side Effects: runs 'fastacmd' to extract the formatting
324 time from the database files
326 NOTE: This function assumes that the computer that
327 last formatted this database had the same time zone
328 set as the computer we are running on.
334 return unless $self->_fileset;
335 return $self->_fileset->format_time;
338 =head2 full_file_basename
342 Ret : full path to the blast database file basename,
346 my $basename = $db->full_file_basename;
347 #returns '/data/shared/blast/databases/genbank/nr'
351 sub full_file_basename
{
354 return scalar File
::Spec
->catfile( $self->dbpath,
362 Usage: my @files = $db->list_files;
363 Desc : get the list of files that belong to this blast database
364 Ret : list of full paths to all files belonging to this blast database,
366 Side Effects: looks in the filesystem
372 return unless $self->_fileset;
373 $self->_fileset->list_files();
376 =head2 files_are_complete
378 Usage: print "complete!" if $db->files_are_complete;
379 Desc : tell whether this blast db has a complete set of files on disk
380 Ret : true if the set of files on disk looks complete,
383 Side Effects: lists files on disk
387 sub files_are_complete
{
389 return unless $self->_fileset;
390 return $self->_fileset->files_are_complete;
395 Usage: print "that thing is split, yo" if $db->is_split;
396 Desc : determine whether this database is in multiple parts
397 Ret : true if this database has been split into multiple
398 files by formatdb (e.g. nr.00.pin, nr.01.pin, etc.)
400 Side Effects: looks in filesystem
406 return unless $self->_fileset;
407 return $self->_fileset->is_split;
412 Usage: $bdb->is_indexed
413 Desc : checks whether this blast db is indexed on disk to support
414 individual sequence retrieval. note that this is different
415 from index_seqs(), which is the flag of whether this db
418 Ret : false if not on disk or not indexed, true if indexed
424 return unless $self->_fileset;
425 return $self->_fileset->files_are_complete && $self->_fileset->indexed_seqs;
429 =head2 sequences_count
431 Desc: get the number of sequences in this blast database
433 Ret : number of distinct sequences in this blast database, or undef
434 if it could not be determined due to some error or other
435 Side Effects: runs 'fastacmd' to get stats on the blast database file
439 sub sequences_count
{
441 return unless $self->_fileset;
442 return $self->_fileset->sequences_count;
445 =head2 is_contaminant_for
447 This method doesn't work yet.
449 Usage: my $is_contam = $bdb->is_contaminant_for($lib);
450 Desc : return whether this BlastDB contains sequences
451 from something that would be considered a contaminant
452 in the given CXGN::Genomic::Library
454 Args : a CXGN::Genomic::Library object
458 #__PACKAGE__->has_many( _lib_annots => 'CXGN::Genomic::LibraryAnnotationDB' );
459 sub is_contaminant_for
{
460 my ($this,$lib) = @_;
462 #return true if any arguments are true
463 return any
( map { $_->is_contaminant && $_->library_id == $lib } $this->_lib_annots);
468 Usage: print "you should update ".$db->title if $db->needs_update;
469 Desc : check whether this blast DB needs to be updated
470 Ret : true if this database's files need an update or are missing,
473 Side Effects: runs format_time(), which runs `fastacmd`
480 #it of course needs an update if it is not complete
481 return 1 unless $self->files_are_complete;
483 my $modtime = $self->format_time();
485 #if no modtime, files must not even be there
486 return 1 unless $modtime;
488 #manually updated DBs never _need_ updates if their
490 return 0 if $self->update_freq eq 'manual';
492 #also need update if it is set to be indexed but is not indexed
493 return 1 if $self->index_seqs && ! $self->is_indexed;
495 #figure out the maximum number of seconds we'll tolerate
496 #the files being out of date
497 my $max_time_offset = 60 * 60 * 24 * do { #figure out number of days
498 if( $self->update_freq eq 'daily' ) { 1 }
499 elsif( $self->update_freq eq 'weekly' ) { 7 }
500 elsif( $self->update_freq eq 'monthly' ) { 31 }
502 confess
"invalid update_freq ".$self->update_freq;
506 #subtract from modtime and make a decision
507 return time-$modtime > $max_time_offset ?
1 : 0;
511 =head2 check_format_permissions
513 Usage: $bdb->check_format_from_file() or die "cannot format!\n";
514 Desc : check directory existence and file permissions to see if a
515 format_from_file() is likely to succeed. This is useful,
516 for example, when you have a script that downloads some
517 remote database and you'd like to check first whether
518 we even have permissions to format before you take the
519 time to download something.
521 Ret : nothing if everything looks good,
522 otherwise a string error message summarizing the reason
524 Side Effects: reads from filesystem, may stat some files
528 sub check_format_permissions
{
529 my ($self,$ffbn) = @_;
530 croak
"ffbn arg is no longer supported, maybe you should just use a new Bio::BLAST2::Database object" if $ffbn;
531 return unless $self->_fileset('write');
532 return $self->_fileset('write')->check_format_permissions;
535 =head2 format_from_file
537 Usage: $db->format_from_file('mysequences.seq');
538 Desc : format this blast database from the given source file,
539 into its proper place on disk, overwriting the files already
541 Ret : nothing meaningful
542 Args : filename containing sequences,
543 Side Effects: runs 'formatdb' to format the given sequences,
548 sub format_from_file
{
549 my ($self,$seqfile,$ffbn) = @_;
550 $ffbn and croak
"ffbn arg no longer supported. maybe you should make a new Bio::BLAST2::Database object";
552 $self->_fileset('write')
553 ->format_from_file( seqfile
=> $seqfile, indexed_seqs
=> $self->index_seqs, title
=> $self->title );
558 Usage: my $fasta_fh = $bdb->to_fasta;
559 Desc : get the contents of this blast database in FASTA format
560 Ret : an IO::Pipe filehandle, or nothing if it could not be opened
562 Side Effects: runs 'fastacmd' in a forked process, cleaning up its output,
563 and passing it to you
569 return unless $self->_fileset;
570 return $self->_fileset->to_fasta;
575 Usage: my $seq = $bdb->get_sequence('LE_HBa0001A02');
576 Desc : get a particular sequence from this db
577 Args : sequence name to retrieve
578 Ret : Bio::PrimarySeqI object, or nothing if not found or
580 Side Effects: dies on error, like if this db is not indexed
585 my ($self, $seqname) = @_;
586 return unless $self->_fileset;
587 return $self->_fileset->get_sequence($seqname);
592 Usage: $bdb->dbpath('/data/cluster/blast/databases');
593 Desc : object method to get/set the location where all blast database
594 files are expected to be found. Defaults to the value of the
595 CXGN configuration variable 'blast_db_path'.
596 Ret : the current base path
597 Args : (optional) new base path
598 Side Effects: gets/sets a piece of CLASS-WIDE data
602 #mk_classdata is from Class::Data::Inheritable. good little module,
603 #you should look at it
604 #__PACKAGE__->mk_classdata( dbpath => CXGN::BlastDB::Config->load->{'blast_db_path'} );
606 =head2 identifier_url
608 Usage: my $url = $db->identifier_url('some ident from this bdb');
609 Desc : get a URL to look up more information on this identifier.
610 first tries to make a URL using the lookup_url column in the
611 sgn.blast_db table, then tries to use identifier_url() from
612 L<CXGN::Tools::Identifiers>
613 Args : the identifier to lookup, assumed
614 to be from this blast db
615 Ret : a URL, or undef if none could be found
616 Side Effects: Example:
621 my ($self,$ident) = @_;
622 $ident or croak
'must pass an identifier to link';
624 return $self->lookup_url
625 ?
sprintf($self->lookup_url,$ident)
626 : do { require CXGN
::Tools
::Identifiers
; CXGN
::Tools
::Identifiers
::identifier_url
($ident) };
629 # accessor that holds our encapsulated Bio::BLAST2::Database
631 NORMALIZER
=> sub { #< need to take the full_file_basename (really the dbpath) into account for the memoization
632 my $s = shift; join ',',$s,@_,$s->full_file_basename
636 my ($self,$write) = @_;
637 my $ffbn = $self->full_file_basename;
638 return Bio
::BLAST2
::Database
->open( full_file_basename
=> $ffbn,
640 ($write ?
( write => 1,
650 Original maintainer: Robert Buels
651 Refactored by Lukas Nov 2016.
655 Robert Buels, E<lt>rmb32@cornell.eduE<gt>
657 =head1 COPYRIGHT & LICENSE
659 Copyright 2009 Boyce Thompson Institute for Plant Research
661 This program is free software; you can redistribute it and/or modify
662 it under the same terms as Perl itself.
666 # package CXGN::BlastDB::Group;
671 # use base qw/CXGN::CDBI::Class::DBI Class::Data::Inheritable/;
672 # __PACKAGE__->table(__PACKAGE__->qualify_schema('sgn') . '.blast_db_group');
674 # #define our database columns with class::dbi
675 # our @primary_key_names = ('blast_db_group_id');
677 # our @column_names =
678 # ('blast_db_group_id',
682 # __PACKAGE__->columns( Primary => @primary_key_names, );
683 # __PACKAGE__->columns( All => @column_names, );
684 # __PACKAGE__->columns( Essential => @column_names, );
685 # __PACKAGE__->sequence( __PACKAGE__->base_schema('sgn'). '.blast_db_group_blast_db_group_id_seq' );
687 # __PACKAGE__->has_many( blast_dbs => 'CXGN::BlastDB', {order_by => 'title'} );