Merge pull request #2890 from solgenomics/topic/check_login_always
[sgn.git] / lib / CXGN / Blast.pm
bloba9d850e46959ed231cd758caf5010a19fec5e087
2 package CXGN::Blast;
4 =head1 NAME
6 CXGN::Blast - a BLAST database that we keep in stock and updated.
8 =head1 SYNOPSIS
10 ### SIMPLE MECHANICS
12 This object has been derived from CXGN::BlastDB (in cxgn-corelibs) and
13 refactored to work with DBIx::Class (instead of Class::DBI) and Moose.
15 The constructor now requires some additional arguments:
17 my $db = CXGN::Blast->new( { blast_db_id => $x,
18 sgn_schema => $s,
19 dbpath => $p,
20 });
22 (this standard constructor now replaces the previous from_id() constructor).
24 my @dbs = CXGN::Blast->retrieve_all(); #get all blastDB objects
26 #change the title of a blast db in memory
27 $dbs[0]->title( 'Sequences from Tomatoes' );
29 Updating the object in the database is not supported right now.
31 #do a blast against this database
32 CXGN::Tools::Run->run( 'blastall',
33 -m => 8,
34 -i => 'myseqs.seq',
35 -d => $dbs[0]->full_file_basename,
36 -p => 'blastn',
37 -e => '1e-10',
38 -o => 'myreport.m8.blast',
41 ### NIFTY THINGS
43 #does it need to be updated?
44 print "univec needs updating\n" if $uv->needs_update;
46 #list the files that are part of our univec DB
47 my @uv_files = $uv->list_files;
48 #returns ( '/data/shared/blast/databases/screening/vector/UniVec.nin',
49 # '/data/shared/blast/databases/screening/vector/UniVec.nhr',
50 # '/data/shared/blast/databases/screening/vector/UniVec.nsq',
51 # )
53 #how many sequences does it have in it?
54 print "this copy of univec has ".$uv->sequence_count." sequences in it\n";
56 #we've got an updated copy of univec here, let's format it and install it
57 #in place
58 $uv->format_from_file('new_univec.seq');
60 #i'll plop another formatted copy of univec in my home dir too
61 $bdb->dbpath('/home/rob/blast');
62 $uv->format_from_file('new_univec.seq');
63 #that will have done a mkpath -p /home/rob/blast/screening/vector,
64 #then it will have put the Univec.nin, nhr, and nsq files there.
66 =head1 DESCRIPTION
68 This is a handle on a BLAST database we manage. Each of
69 these objects corresponds to a row in the sgn.blast_db table and
70 a set of files in the filesystem. at a place specified by the
71 dbpath() accessor (see dbpath docs below). This path defaults to
72 the value of the 'blast_db_path' configuration variable (see L<CXGN::Config>).
74 =head1 METHODS
76 =cut
78 use Moose;
80 use strict;
81 use Carp;
82 use File::Spec;
83 use File::Basename;
84 use File::Copy;
85 use File::Path;
86 use POSIX;
88 use List::MoreUtils qw/ uniq /;
90 use Memoize;
92 use CXGN::Tools::List qw/any min all max/;
93 use CXGN::Tools::Run;
95 use Bio::BLAST2::Database;
97 has 'sgn_schema' => ( isa => 'SGN::Schema',
98 is => 'ro',
99 required => 1,
102 has 'blast_db_id' => ( isa => 'Int',
103 is => 'ro',
104 default => 0,
107 has 'dbpath' => ( isa => 'Maybe[Str]',
108 is => 'rw',
109 required => 1,
112 has 'file_base' => ( isa => 'Maybe[Str]',
113 is => 'rw',
116 has 'title' => ( isa => 'Str',
117 is => 'rw',
118 default => '',
121 has 'type' => ( isa => 'Maybe[Str]',
122 is => 'rw',
125 has 'source_url' => ( isa => 'Maybe[Str]',
126 is => 'rw',
127 default => '',
130 has 'lookup_url' => (isa => 'Maybe[Str]',
131 is => 'rw',
132 default => '',
135 has 'info_url' => ( isa => 'Maybe[Str]',
136 is => 'rw',
139 has 'update_freq' => ( isa => 'Maybe[Str]',
140 is => 'rw',
143 has 'index_seqs' => ( isa => 'Bool',
144 is => 'rw',
147 has 'web_interface_visible' => ( isa => 'Bool',
148 is => 'rw',
151 has 'description' => (isa => 'Maybe[Str]',
152 is => 'rw',
155 has 'jbrowse_src' => (isa => 'Maybe[Str]',
156 is => 'rw',
162 ###our @column_names =
163 # ('blast_db_id', #-database serial number
164 # 'file_base', #-basename of the database files, with a path prepended,
165 # # e.g. 'genbank/nr'
166 # 'title', #-title of the database, e.g. 'NCBI Non-redundant proteins'
167 # 'type', #-type, either 'protein' or 'nucleotide'
168 # 'source_url', #-the URL new copies of this database can be fetched from
169 # 'lookup_url', #-printf-style format string that can be used to generate
170 # # a URL where a user can get more info on a sequence in
171 # # this blast db
172 # 'info_url', #-URL that gives information about the contents of this DB
173 # 'update_freq', #-frequency of updating this blast database
174 # 'index_seqs', #- corresponds to formatdb's -o option. Set true if formatdb
175 # # should be given a '-o T'. This is used if you later want to
176 # # fetch specific sequences out of this blast db
177 # 'web_interface_visible', #whether the blast web interface should display this DB
178 # 'blast_db_group_id', #ID of the blast DB group this db belondgs to,
179 # #if any. used for displaying them in the web
180 # #interface
181 # 'description', # text description of the database, display on the database details page
182 ### );
184 sub BUILD {
185 my $self = shift;
187 if ($self->blast_db_id) {
188 my $row = $self->sgn_schema()->resultset("BlastDb")->find( { blast_db_id => $self->blast_db_id() } );
189 if (!$row) { die "The blast_db_id with the id ".$self->blast_db_id()." does not exist in this database\n"; }
190 $self->file_base($row->file_base());
191 $self->title($row->title());
192 $self->type($row->type());
193 $self->source_url($row->source_url());
194 $self->lookup_url($row->lookup_url());
195 $self->info_url($row->info_url());
196 $self->update_freq($row->update_freq());
197 $self->index_seqs($row->index_seqs());
198 $self->web_interface_visible($row->web_interface_visible());
199 $self->description($row->description());
200 $self->jbrowse_src($row->jbrowse_src());
202 else {
203 print STDERR "No blast_db_id provided. Creating empty object...\n";
207 # class function
209 sub retrieve_all {
210 my $class = shift;
211 my $sgn_schema = shift;
212 my $dbpath = shift;
214 my @dbs = $class->search($sgn_schema, $dbpath);
216 return @dbs;
219 sub search {
220 my $class = shift;
221 my $sgn_schema = shift;
222 my $dbpath = shift;
223 my %search = @_;
225 my $rs = $sgn_schema->resultset("BlastDb")->search( { %search } );
227 my @dbs = ();
229 while (my $db = $rs->next()) {
230 my $bdbo = CXGN::Blast->new( sgn_schema => $sgn_schema, dbpath => $dbpath, blast_db_id => $db->blast_db_id() );
231 push @dbs, $bdbo;
233 return @dbs;
236 # =head2 from_id
238 ## Note: replaced by standard moose constructor and blast_db_id arg
240 # Usage: my $bdb = CXGN::BlastDB->from_id(12);
241 # Desc : retrieve a BlastDB object using its ID number
242 # Ret : a BlastDB object for that id number, or undef if none found
243 # Args : the id number of the object to retrieve
244 # Side Effects: accesses the database
246 # =cut
248 # sub from_id {
249 # shift->retrieve(@_);
252 #Only document title and type for external users of this module,
253 #some nicer wrappers will be provided for using the other information
255 =head2 title
257 Usage: $db->title
258 Desc : get/set the title of this blast DB object
259 Ret : string containing the title, e.g. 'NCBI Non-redundant proteins'
260 Args : optional new value for the title
262 Note: must run $db->update for changes to be written to the db, unless you've
263 set $db->autoupdate.
265 =head2 type
267 Usage: $db->type
268 Desc : get the type of this blast db, whether it holds
269 proteins or nucleotides
270 Ret : 'protein' or 'nucleotide'
271 Args : optional new value for the type, either 'protein' or 'nucleotide'
273 Note: must run $db->update for changes to be written to the db, unless you've
274 set $db->autoupdate.
276 =head2 file_base
278 Usage: $db->file_base;
279 Desc : get/set the basename and path relative to 'blast_db_path' config var
280 Ret : the path and basename, e.g. 'genbank/nr' or 'screening/organelle/ATH_mitochondria'
281 Args : (optional) new string containing subpath and basename
282 Side Effects: none
284 Note: must run $db->update for changes to be written to the db, unless you've
285 set $db->autoupdate.
287 =cut
289 =head2 genomic_libraries_annotated
291 Desc: get the L<CXGN::Genomic::Library> objects that are slated as using this
292 blast database for annotation
293 Args: none
294 Ret : array of L<CXGN::Genomic::Library> objects
296 =cut
298 =head2 file_modtime
300 Desc: get the earliest unix modification time of the database files
301 Args: none
302 Ret : unix modification time of the database files, or nothing if does not exist
303 Side Effects:
304 Example:
306 =cut
308 sub file_modtime {
309 my $self = shift;
310 return unless $self->_fileset;
311 return $self->_fileset->file_modtime;
314 =head2 format_time
316 Usage: my $time = $db->format_time;
317 Desc : get the format time of these db files
318 Ret : the value time() would have returned when
319 this database was last formatted, or undef
320 if that could not be determined (like if the
321 files aren't there)
322 Args : none
323 Side Effects: runs 'fastacmd' to extract the formatting
324 time from the database files
326 NOTE: This function assumes that the computer that
327 last formatted this database had the same time zone
328 set as the computer we are running on.
330 =cut
332 sub format_time {
333 my ($self) = @_;
334 return unless $self->_fileset;
335 return $self->_fileset->format_time;
338 =head2 full_file_basename
340 Desc:
341 Args: none
342 Ret : full path to the blast database file basename,
343 Side Effects: none
344 Example:
346 my $basename = $db->full_file_basename;
347 #returns '/data/shared/blast/databases/genbank/nr'
349 =cut
351 sub full_file_basename {
352 my $self = shift;
354 return scalar File::Spec->catfile( $self->dbpath,
355 $self->file_base,
360 =head2 list_files
362 Usage: my @files = $db->list_files;
363 Desc : get the list of files that belong to this blast database
364 Ret : list of full paths to all files belonging to this blast database,
365 Args : none
366 Side Effects: looks in the filesystem
368 =cut
370 sub list_files {
371 my $self = shift;
372 return unless $self->_fileset;
373 $self->_fileset->list_files();
376 =head2 files_are_complete
378 Usage: print "complete!" if $db->files_are_complete;
379 Desc : tell whether this blast db has a complete set of files on disk
380 Ret : true if the set of files on disk looks complete,
381 false if not
382 Args : none
383 Side Effects: lists files on disk
385 =cut
387 sub files_are_complete {
388 my ($self) = @_;
389 return unless $self->_fileset;
390 return $self->_fileset->files_are_complete;
393 =head2 is_split
395 Usage: print "that thing is split, yo" if $db->is_split;
396 Desc : determine whether this database is in multiple parts
397 Ret : true if this database has been split into multiple
398 files by formatdb (e.g. nr.00.pin, nr.01.pin, etc.)
399 Args : none
400 Side Effects: looks in filesystem
402 =cut
404 sub is_split {
405 my ($self) = @_;
406 return unless $self->_fileset;
407 return $self->_fileset->is_split;
410 =head2 is_indexed
412 Usage: $bdb->is_indexed
413 Desc : checks whether this blast db is indexed on disk to support
414 individual sequence retrieval. note that this is different
415 from index_seqs(), which is the flag of whether this db
416 _should_ be indexed.
417 Args : none
418 Ret : false if not on disk or not indexed, true if indexed
420 =cut
422 sub is_indexed {
423 my ( $self ) = @_;
424 return unless $self->_fileset;
425 return $self->_fileset->files_are_complete && $self->_fileset->indexed_seqs;
429 =head2 sequences_count
431 Desc: get the number of sequences in this blast database
432 Args: none
433 Ret : number of distinct sequences in this blast database, or undef
434 if it could not be determined due to some error or other
435 Side Effects: runs 'fastacmd' to get stats on the blast database file
437 =cut
439 sub sequences_count {
440 my $self = shift;
441 return unless $self->_fileset;
442 return $self->_fileset->sequences_count;
445 =head2 is_contaminant_for
447 This method doesn't work yet.
449 Usage: my $is_contam = $bdb->is_contaminant_for($lib);
450 Desc : return whether this BlastDB contains sequences
451 from something that would be considered a contaminant
452 in the given CXGN::Genomic::Library
453 Ret : 1 or 0
454 Args : a CXGN::Genomic::Library object
456 =cut
458 #__PACKAGE__->has_many( _lib_annots => 'CXGN::Genomic::LibraryAnnotationDB' );
459 sub is_contaminant_for {
460 my ($this,$lib) = @_;
462 #return true if any arguments are true
463 return any( map { $_->is_contaminant && $_->library_id == $lib } $this->_lib_annots);
466 =head2 needs_update
468 Usage: print "you should update ".$db->title if $db->needs_update;
469 Desc : check whether this blast DB needs to be updated
470 Ret : true if this database's files need an update or are missing,
471 false otherwise
472 Args : none
473 Side Effects: runs format_time(), which runs `fastacmd`
475 =cut
477 sub needs_update {
478 my ($self) = @_;
480 #it of course needs an update if it is not complete
481 return 1 unless $self->files_are_complete;
483 my $modtime = $self->format_time();
485 #if no modtime, files must not even be there
486 return 1 unless $modtime;
488 #manually updated DBs never _need_ updates if their
489 #files are there
490 return 0 if $self->update_freq eq 'manual';
492 #also need update if it is set to be indexed but is not indexed
493 return 1 if $self->index_seqs && ! $self->is_indexed;
495 #figure out the maximum number of seconds we'll tolerate
496 #the files being out of date
497 my $max_time_offset = 60 * 60 * 24 * do { #figure out number of days
498 if( $self->update_freq eq 'daily' ) { 1 }
499 elsif( $self->update_freq eq 'weekly' ) { 7 }
500 elsif( $self->update_freq eq 'monthly' ) { 31 }
501 else {
502 confess "invalid update_freq ".$self->update_freq;
506 #subtract from modtime and make a decision
507 return time-$modtime > $max_time_offset ? 1 : 0;
511 =head2 check_format_permissions
513 Usage: $bdb->check_format_from_file() or die "cannot format!\n";
514 Desc : check directory existence and file permissions to see if a
515 format_from_file() is likely to succeed. This is useful,
516 for example, when you have a script that downloads some
517 remote database and you'd like to check first whether
518 we even have permissions to format before you take the
519 time to download something.
520 Args : none
521 Ret : nothing if everything looks good,
522 otherwise a string error message summarizing the reason
523 for failure
524 Side Effects: reads from filesystem, may stat some files
526 =cut
528 sub check_format_permissions {
529 my ($self,$ffbn) = @_;
530 croak "ffbn arg is no longer supported, maybe you should just use a new Bio::BLAST2::Database object" if $ffbn;
531 return unless $self->_fileset('write');
532 return $self->_fileset('write')->check_format_permissions;
535 =head2 format_from_file
537 Usage: $db->format_from_file('mysequences.seq');
538 Desc : format this blast database from the given source file,
539 into its proper place on disk, overwriting the files already
540 present
541 Ret : nothing meaningful
542 Args : filename containing sequences,
543 Side Effects: runs 'formatdb' to format the given sequences,
544 dies on failure
546 =cut
548 sub format_from_file {
549 my ($self,$seqfile,$ffbn) = @_;
550 $ffbn and croak "ffbn arg no longer supported. maybe you should make a new Bio::BLAST2::Database object";
552 $self->_fileset('write')
553 ->format_from_file( seqfile => $seqfile, indexed_seqs => $self->index_seqs, title => $self->title );
556 =head2 to_fasta
558 Usage: my $fasta_fh = $bdb->to_fasta;
559 Desc : get the contents of this blast database in FASTA format
560 Ret : an IO::Pipe filehandle, or nothing if it could not be opened
561 Args : none
562 Side Effects: runs 'fastacmd' in a forked process, cleaning up its output,
563 and passing it to you
565 =cut
567 sub to_fasta {
568 my ($self) = @_;
569 return unless $self->_fileset;
570 return $self->_fileset->to_fasta;
573 =head2 get_sequence
575 Usage: my $seq = $bdb->get_sequence('LE_HBa0001A02');
576 Desc : get a particular sequence from this db
577 Args : sequence name to retrieve
578 Ret : Bio::PrimarySeqI object, or nothing if not found or
579 if db does not exist
580 Side Effects: dies on error, like if this db is not indexed
582 =cut
584 sub get_sequence {
585 my ($self, $seqname) = @_;
586 return unless $self->_fileset;
587 return $self->_fileset->get_sequence($seqname);
590 =head2 dbpath
592 Usage: $bdb->dbpath('/data/cluster/blast/databases');
593 Desc : object method to get/set the location where all blast database
594 files are expected to be found. Defaults to the value of the
595 CXGN configuration variable 'blast_db_path'.
596 Ret : the current base path
597 Args : (optional) new base path
598 Side Effects: gets/sets a piece of CLASS-WIDE data
600 =cut
602 #mk_classdata is from Class::Data::Inheritable. good little module,
603 #you should look at it
604 #__PACKAGE__->mk_classdata( dbpath => CXGN::BlastDB::Config->load->{'blast_db_path'} );
606 =head2 identifier_url
608 Usage: my $url = $db->identifier_url('some ident from this bdb');
609 Desc : get a URL to look up more information on this identifier.
610 first tries to make a URL using the lookup_url column in the
611 sgn.blast_db table, then tries to use identifier_url() from
612 L<CXGN::Tools::Identifiers>
613 Args : the identifier to lookup, assumed
614 to be from this blast db
615 Ret : a URL, or undef if none could be found
616 Side Effects: Example:
618 =cut
620 sub identifier_url {
621 my ($self,$ident) = @_;
622 $ident or croak 'must pass an identifier to link';
624 return $self->lookup_url
625 ? sprintf($self->lookup_url,$ident)
626 : do { require CXGN::Tools::Identifiers; CXGN::Tools::Identifiers::identifier_url($ident) };
629 # accessor that holds our encapsulated Bio::BLAST2::Database
630 memoize '_fileset',
631 NORMALIZER => sub { #< need to take the full_file_basename (really the dbpath) into account for the memoization
632 my $s = shift; join ',',$s,@_,$s->full_file_basename
635 sub _fileset {
636 my ($self,$write) = @_;
637 my $ffbn = $self->full_file_basename;
638 return Bio::BLAST2::Database->open( full_file_basename => $ffbn,
639 type => $self->type,
640 ($write ? ( write => 1,
641 create_dirs => 1,
643 : (),
648 =head1 MAINTAINER
650 Original maintainer: Robert Buels
651 Refactored by Lukas Nov 2016.
653 =head1 AUTHOR
655 Robert Buels, E<lt>rmb32@cornell.eduE<gt>
657 =head1 COPYRIGHT & LICENSE
659 Copyright 2009 Boyce Thompson Institute for Plant Research
661 This program is free software; you can redistribute it and/or modify
662 it under the same terms as Perl itself.
664 =cut
666 # package CXGN::BlastDB::Group;
667 # use strict;
668 # use English;
669 # use Carp;
671 # use base qw/CXGN::CDBI::Class::DBI Class::Data::Inheritable/;
672 # __PACKAGE__->table(__PACKAGE__->qualify_schema('sgn') . '.blast_db_group');
674 # #define our database columns with class::dbi
675 # our @primary_key_names = ('blast_db_group_id');
677 # our @column_names =
678 # ('blast_db_group_id',
679 # 'name',
680 # 'ordinal'
681 # );
682 # __PACKAGE__->columns( Primary => @primary_key_names, );
683 # __PACKAGE__->columns( All => @column_names, );
684 # __PACKAGE__->columns( Essential => @column_names, );
685 # __PACKAGE__->sequence( __PACKAGE__->base_schema('sgn'). '.blast_db_group_blast_db_group_id_seq' );
687 # __PACKAGE__->has_many( blast_dbs => 'CXGN::BlastDB', {order_by => 'title'} );
690 ####
691 1; # do not remove
692 ####