brapi refactor error response
[sgn.git] / lib / CXGN / Blast.pm
blob3374acb08ebac464e9ab853073b9326f9e0f39ec
2 package CXGN::Blast;
4 =head1 NAME
6 CXGN::Blast - a BLAST database that we keep in stock and updated.
8 =head1 SYNOPSIS
10 ### SIMPLE MECHANICS
12 This object has been derived from CXGN::BlastDB (in cxgn-corelibs) and
13 refactored to work with DBIx::Class (instead of Class::DBI) and Moose.
15 The constructor now requires some additional arguments:
17 my $db = CXGN::Blast->new( { blast_db_id => $x,
18 sgn_schema => $s,
19 dbpath => $p,
20 });
22 (this standard constructor now replaces the previous from_id() constructor).
24 my @dbs = CXGN::Blast->retrieve_all($s, $p); #get all blastDB objects (needs the schema and dbpath)
26 #change the title of a blast db in memory
27 $dbs[0]->title( 'Sequences from Tomatoes' );
29 Updating the object in the database is not supported right now.
31 #do a blast against this database
32 CXGN::Tools::Run->run( 'blastall',
33 -m => 8,
34 -i => 'myseqs.seq',
35 -d => $dbs[0]->full_file_basename,
36 -p => 'blastn',
37 -e => '1e-10',
38 -o => 'myreport.m8.blast',
41 ### NIFTY THINGS
43 #does it need to be updated?
44 print "univec needs updating\n" if $uv->needs_update;
46 #list the files that are part of our univec DB
47 my @uv_files = $uv->list_files;
48 #returns ( '/data/shared/blast/databases/screening/vector/UniVec.nin',
49 # '/data/shared/blast/databases/screening/vector/UniVec.nhr',
50 # '/data/shared/blast/databases/screening/vector/UniVec.nsq',
51 # )
53 #how many sequences does it have in it?
54 print "this copy of univec has ".$uv->sequences_count." sequences in it\n";
56 #we've got an updated copy of univec here, let's format it and install it
57 #in place
58 $uv->format_from_file('new_univec.seq');
60 #i'll plop another formatted copy of univec in my home dir too
61 $uv->dbpath('/home/rob/blast');
62 $uv->format_from_file('new_univec.seq');
63 #that will have done a mkpath -p /home/rob/blast/screening/vector,
64 #then it will have put the Univec.nin, nhr, and nsq files there.
66 =head1 DESCRIPTION
68 This is a handle on a BLAST database we manage. Each of
69 these objects corresponds to a row in the sgn.blast_db table and
70 a set of files in the filesystem, at a place specified by the
71 dbpath() accessor (see dbpath docs below). This path must be passed to
72 the constructor (historically it defaulted to the value of the 'blast_db_path' configuration variable, see L<CXGN::Config>).
74 =head1 METHODS
76 =cut
78 use Moose;
80 use strict;
81 use Carp;
82 use File::Spec;
83 use File::Basename;
84 use File::Copy;
85 use File::Path;
86 use POSIX;
88 use List::MoreUtils qw/ uniq /;
90 use Memoize;
92 use CXGN::Tools::List qw/any min all max/;
93 use CXGN::Tools::Run;
95 use Bio::BLAST::Database;
# Schema handle used by BUILD to look up the BlastDb row.
has 'sgn_schema' => (
    is       => 'ro',
    isa      => 'SGN::Schema',
    required => 1,
);

# Database serial number; the default of 0 makes BUILD skip the row lookup.
has 'blast_db_id' => (
    is      => 'ro',
    isa     => 'Int',
    default => 0,
);

# Base directory under which the blast database files are looked up.
has 'dbpath' => (
    is       => 'rw',
    isa      => 'Maybe[Str]',
    required => 1,
);

# Basename of the database files, with a path prepended, e.g. 'genbank/nr'.
has 'file_base' => (
    is  => 'rw',
    isa => 'Maybe[Str]',
);

# Title of the database, e.g. 'NCBI Non-redundant proteins'.
has 'title' => (
    is      => 'rw',
    isa     => 'Str',
    default => '',
);

# Either 'protein' or 'nucleotide'.
has 'type' => (
    is  => 'rw',
    isa => 'Maybe[Str]',
);

# URL new copies of this database can be fetched from.
has 'source_url' => (
    is      => 'rw',
    isa     => 'Maybe[Str]',
    default => '',
);

# printf-style format string used to build a per-sequence lookup URL.
has 'lookup_url' => (
    is      => 'rw',
    isa     => 'Maybe[Str]',
    default => '',
);

# URL that gives information about the contents of this DB.
has 'info_url' => (
    is  => 'rw',
    isa => 'Maybe[Str]',
);

# Frequency of updating this blast database.
has 'update_freq' => (
    is  => 'rw',
    isa => 'Maybe[Str]',
);

# Whether the database should be indexed for single-sequence retrieval.
has 'index_seqs' => (
    is  => 'rw',
    isa => 'Bool',
);

# Whether the blast web interface should display this DB.
has 'web_interface_visible' => (
    is  => 'rw',
    isa => 'Bool',
);

# Text description of the database.
has 'description' => (
    is  => 'rw',
    isa => 'Maybe[Str]',
);
155 ###our @column_names =
156 # ('blast_db_id', #-database serial number
157 # 'file_base', #-basename of the database files, with a path prepended,
158 # # e.g. 'genbank/nr'
159 # 'title', #-title of the database, e.g. 'NCBI Non-redundant proteins'
160 # 'type', #-type, either 'protein' or 'nucleotide'
161 # 'source_url', #-the URL new copies of this database can be fetched from
162 # 'lookup_url', #-printf-style format string that can be used to generate
163 # # a URL where a user can get more info on a sequence in
164 # # this blast db
165 # 'info_url', #-URL that gives information about the contents of this DB
166 # 'update_freq', #-frequency of updating this blast database
167 # 'index_seqs', #- corresponds to formatdb's -o option. Set true if formatdb
168 # # should be given a '-o T'. This is used if you later want to
169 # # fetch specific sequences out of this blast db
170 # 'web_interface_visible', #whether the blast web interface should display this DB
171 # 'blast_db_group_id', #ID of the blast DB group this db belongs to,
172 # #if any. used for displaying them in the web
173 # #interface
174 # 'description', # text description of the database, display on the database details page
175 ### );
# Post-construction hook. When a true blast_db_id was supplied, look up the
# matching BlastDb row through sgn_schema and copy its columns into this
# object's attributes; dies if no such row exists. Without an id, only a
# notice is printed to STDERR and the object is left empty.
sub BUILD {
    my ($self) = @_;

    my $id = $self->blast_db_id;
    if ( !$id ) {
        print STDERR "No blast_db_id provided. Creating empty object...\n";
        return;
    }

    my $row = $self->sgn_schema->resultset("BlastDb")->find( { blast_db_id => $id } );
    die "The blast_db_id with the id ".$id." does not exist in this database\n"
        unless $row;

    # copy each column into the attribute of the same name, in a fixed order
    for my $column (qw(
        file_base title type source_url lookup_url info_url
        update_freq index_seqs web_interface_visible description
    )) {
        $self->$column( $row->$column() );
    }
}
199 # class function
# Class method: return one object per BlastDb row by delegating to search()
# with no search conditions.
# Args: the schema object and the dbpath handed on to each new object.
# Ret : list of objects as returned by search()
sub retrieve_all {
    my ( $class, $sgn_schema, $dbpath ) = @_;

    my @all_dbs = $class->search( $sgn_schema, $dbpath );
    return @all_dbs;
}
# Class method: find BlastDb rows matching the given conditions and wrap each
# one in an object.
# Args: the schema object, the dbpath to give every object, then an optional
#       list of column => value pairs passed to the resultset's search().
# Ret : list of objects, one per matching row (empty list if none match)
# Side Effects: queries the database; each constructed object is built from
#       the row's blast_db_id.
sub search {
    my ( $class, $sgn_schema, $dbpath, %search ) = @_;

    # Build objects of the invoking class (or of the invocant's class when
    # called on an instance) so that subclasses get instances of themselves
    # instead of the hard-coded base class.
    my $target_class = ref($class) || $class;

    my $rs = $sgn_schema->resultset("BlastDb")->search( {%search} );

    my @dbs;
    while ( my $row = $rs->next ) {
        push @dbs, $target_class->new(
            sgn_schema  => $sgn_schema,
            dbpath      => $dbpath,
            blast_db_id => $row->blast_db_id,
        );
    }

    return @dbs;
}
228 # =head2 from_id
230 ## Note: replaced by standard moose constructor and blast_db_id arg
232 # Usage: my $bdb = CXGN::BlastDB->from_id(12);
233 # Desc : retrieve a BlastDB object using its ID number
234 # Ret : a BlastDB object for that id number, or undef if none found
235 # Args : the id number of the object to retrieve
236 # Side Effects: accesses the database
238 # =cut
240 # sub from_id {
241 # shift->retrieve(@_);
244 #Only document title and type for external users of this module,
245 #some nicer wrappers will be provided for using the other information
247 =head2 title
249 Usage: $db->title
250 Desc : get/set the title of this blast DB object
251 Ret : string containing the title, e.g. 'NCBI Non-redundant proteins'
252 Args : optional new value for the title
254 Note: the new value is only held in memory; writing changes back to the
255 database is not supported right now.
257 =head2 type
259 Usage: $db->type
260 Desc : get the type of this blast db, whether it holds
261 proteins or nucleotides
262 Ret : 'protein' or 'nucleotide'
263 Args : optional new value for the type, either 'protein' or 'nucleotide'
265 Note: the new value is only held in memory; writing changes back to the
266 database is not supported right now.
268 =head2 file_base
270 Usage: $db->file_base;
271 Desc : get/set the basename and path relative to 'blast_db_path' config var
272 Ret : the path and basename, e.g. 'genbank/nr' or 'screening/organelle/ATH_mitochondria'
273 Args : (optional) new string containing subpath and basename
274 Side Effects: none
276 Note: the new value is only held in memory; writing changes back to the
277 database is not supported right now.
279 =cut
281 =head2 genomic_libraries_annotated
283 Desc: get the L<CXGN::Genomic::Library> objects that are slated as using this
284 blast database for annotation
285 Args: none
286 Ret : array of L<CXGN::Genomic::Library> objects
288 =cut
290 =head2 file_modtime
292 Desc: get the earliest unix modification time of the database files
293 Args: none
294 Ret : unix modification time of the database files, or nothing if does not exist
295 Side Effects:
296 Example:
298 =cut
sub file_modtime {
    my ($self) = @_;

    # return nothing when no fileset could be opened
    my $fileset = $self->_fileset
        or return;

    return $fileset->file_modtime;
}
306 =head2 format_time
308 Usage: my $time = $db->format_time;
309 Desc : get the format time of these db files
310 Ret : the value time() would have returned when
311 this database was last formatted, or undef
312 if that could not be determined (like if the
313 files aren't there)
314 Args : none
315 Side Effects: runs 'fastacmd' to extract the formatting
316 time from the database files
318 NOTE: This function assumes that the computer that
319 last formatted this database had the same time zone
320 set as the computer we are running on.
322 =cut
sub format_time {
    my $self = shift;

    # return nothing when no fileset could be opened
    my $fileset = $self->_fileset
        or return;

    return $fileset->format_time;
}
330 =head2 full_file_basename
332 Desc:
333 Args: none
334 Ret : full path to the blast database file basename,
335 Side Effects: none
336 Example:
338 my $basename = $db->full_file_basename;
339 #returns '/data/shared/blast/databases/genbank/nr'
341 =cut
sub full_file_basename {
    my ($self) = @_;

    # join the base directory and the relative file basename into one path
    my $base_dir = $self->dbpath;
    my $rel_base = $self->file_base;

    return scalar File::Spec->catfile( $base_dir, $rel_base );
}
352 =head2 list_files
354 Usage: my @files = $db->list_files;
355 Desc : get the list of files that belong to this blast database
356 Ret : list of full paths to all files belonging to this blast database,
357 Args : none
358 Side Effects: looks in the filesystem
360 =cut
sub list_files {
    my ($self) = @_;

    # return nothing when no fileset could be opened
    my $fileset = $self->_fileset
        or return;

    return $fileset->list_files();
}
368 =head2 files_are_complete
370 Usage: print "complete!" if $db->files_are_complete;
371 Desc : tell whether this blast db has a complete set of files on disk
372 Ret : true if the set of files on disk looks complete,
373 false if not
374 Args : none
375 Side Effects: lists files on disk
377 =cut
sub files_are_complete {
    my $self = shift;

    # return nothing (false) when no fileset could be opened
    my $fileset = $self->_fileset
        or return;

    return $fileset->files_are_complete;
}
385 =head2 is_split
387 Usage: print "that thing is split, yo" if $db->is_split;
388 Desc : determine whether this database is in multiple parts
389 Ret : true if this database has been split into multiple
390 files by formatdb (e.g. nr.00.pin, nr.01.pin, etc.)
391 Args : none
392 Side Effects: looks in filesystem
394 =cut
sub is_split {
    my $self = shift;

    # return nothing (false) when no fileset could be opened
    my $fileset = $self->_fileset
        or return;

    return $fileset->is_split;
}
402 =head2 is_indexed
404 Usage: $bdb->is_indexed
405 Desc : checks whether this blast db is indexed on disk to support
406 individual sequence retrieval. note that this is different
407 from index_seqs(), which is the flag of whether this db
408 _should_ be indexed.
409 Args : none
410 Ret : false if not on disk or not indexed, true if indexed
412 =cut
sub is_indexed {
    my $self = shift;

    # return nothing (false) when no fileset could be opened
    my $fileset = $self->_fileset
        or return;

    # indexed only counts if the on-disk file set is complete as well
    return $fileset->files_are_complete && $fileset->indexed_seqs;
}
421 =head2 sequences_count
423 Desc: get the number of sequences in this blast database
424 Args: none
425 Ret : number of distinct sequences in this blast database, or undef
426 if it could not be determined due to some error or other
427 Side Effects: runs 'fastacmd' to get stats on the blast database file
429 =cut
sub sequences_count {
    my ($self) = @_;

    # return nothing when no fileset could be opened
    my $fileset = $self->_fileset
        or return;

    return $fileset->sequences_count;
}
437 =head2 is_contaminant_for
439 This method doesn't work yet.
441 Usage: my $is_contam = $bdb->is_contaminant_for($lib);
442 Desc : return whether this BlastDB contains sequences
443 from something that would be considered a contaminant
444 in the given CXGN::Genomic::Library
445 Ret : 1 or 0
446 Args : a CXGN::Genomic::Library object
448 =cut
450 #__PACKAGE__->has_many( _lib_annots => 'CXGN::Genomic::LibraryAnnotationDB' );
sub is_contaminant_for {
    my ( $this, $lib ) = @_;

    # NOTE(review): no _lib_annots method is defined in the visible source
    # (the has_many declaration that would have provided it is commented
    # out), so this call is expected to die until that is restored.
    my @verdicts =
        map { $_->is_contaminant && $_->library_id == $lib } $this->_lib_annots;

    #true if any annotation flagged this library
    return any(@verdicts);
}
458 =head2 needs_update
460 Usage: print "you should update ".$db->title if $db->needs_update;
461 Desc : check whether this blast DB needs to be updated
462 Ret : true if this database's files need an update or are missing,
463 false otherwise
464 Args : none
465 Side Effects: runs format_time(), which runs `fastacmd`
467 =cut
# Decide whether this database's files are missing, incomplete, unindexed
# when they should be indexed, or older than update_freq allows.
# Ret : 1 if an update is needed, 0 otherwise
# Dies (confess) when update_freq is not one of
#      'manual', 'daily', 'weekly' or 'monthly'.
sub needs_update {
    my ($self) = @_;

    #it of course needs an update if it is not complete
    return 1 unless $self->files_are_complete;

    #if no format time, files must not even be there
    my $modtime = $self->format_time();
    return 1 unless $modtime;

    # update_freq may be undef (it is a Maybe[Str]); normalise it once so the
    # comparisons below do not emit "uninitialized value" warnings
    my $freq = $self->update_freq;
    $freq = '' unless defined $freq;

    #manually updated DBs never _need_ updates if their
    #files are there
    return 0 if $freq eq 'manual';

    #also need update if it is set to be indexed but is not indexed
    return 1 if $self->index_seqs && !$self->is_indexed;

    #figure out the maximum number of days we'll tolerate
    #the files being out of date
    my %max_days_for = (
        daily   => 1,
        weekly  => 7,
        monthly => 31,
    );
    my $max_days = $max_days_for{$freq}
        or confess "invalid update_freq ".$freq;

    my $max_time_offset = 60 * 60 * 24 * $max_days;

    #compare the age of the files against the tolerance
    return time - $modtime > $max_time_offset ? 1 : 0;
}
503 =head2 check_format_permissions
505 Usage: my $err = $bdb->check_format_permissions; die "cannot format: $err\n" if $err;
506 Desc : check directory existence and file permissions to see if a
507 format_from_file() is likely to succeed. This is useful,
508 for example, when you have a script that downloads some
509 remote database and you'd like to check first whether
510 we even have permissions to format before you take the
511 time to download something.
512 Args : none
513 Ret : nothing if everything looks good,
514 otherwise a string error message summarizing the reason
515 for failure
516 Side Effects: reads from filesystem, may stat some files
518 =cut
sub check_format_permissions {
    my ( $self, $ffbn ) = @_;

    # the old second argument is rejected outright
    if ($ffbn) {
        croak "ffbn arg is no longer supported, maybe you should just use a new Bio::BLAST::Database object";
    }

    # ask for the fileset in write mode; return nothing if there is none
    my $writable = $self->_fileset('write')
        or return;

    return $writable->check_format_permissions;
}
527 =head2 format_from_file
529 Usage: $db->format_from_file('mysequences.seq');
530 Desc : format this blast database from the given source file,
531 into its proper place on disk, overwriting the files already
532 present
533 Ret : nothing meaningful
534 Args : filename containing sequences,
535 Side Effects: runs 'formatdb' to format the given sequences,
536 dies on failure
538 =cut
sub format_from_file {
    my ( $self, $seqfile, $ffbn ) = @_;

    # the old third argument is rejected outright
    croak "ffbn arg no longer supported. maybe you should make a new Bio::BLAST::Database object"
        if $ffbn;

    # format through the fileset opened in write mode, passing along this
    # object's indexing flag and title
    my $writable = $self->_fileset('write');
    return $writable->format_from_file(
        seqfile      => $seqfile,
        indexed_seqs => $self->index_seqs,
        title        => $self->title,
    );
}
548 =head2 to_fasta
550 Usage: my $fasta_fh = $bdb->to_fasta;
551 Desc : get the contents of this blast database in FASTA format
552 Ret : an IO::Pipe filehandle, or nothing if it could not be opened
553 Args : none
554 Side Effects: runs 'fastacmd' in a forked process, cleaning up its output,
555 and passing it to you
557 =cut
sub to_fasta {
    my $self = shift;

    # return nothing when no fileset could be opened
    my $fileset = $self->_fileset
        or return;

    return $fileset->to_fasta;
}
565 =head2 get_sequence
567 Usage: my $seq = $bdb->get_sequence('LE_HBa0001A02');
568 Desc : get a particular sequence from this db
569 Args : sequence name to retrieve
570 Ret : Bio::PrimarySeqI object, or nothing if not found or
571 if db does not exist
572 Side Effects: dies on error, like if this db is not indexed
574 =cut
sub get_sequence {
    my $self    = shift;
    my $seqname = shift;

    # return nothing when no fileset could be opened
    my $fileset = $self->_fileset
        or return;

    return $fileset->get_sequence($seqname);
}
582 =head2 dbpath
584 Usage: $bdb->dbpath('/data/cluster/blast/databases');
585 Desc : object method to get/set the location where all blast database
586 files are expected to be found. This is a required constructor
587 argument; it no longer defaults to the 'blast_db_path' configuration variable.
588 Ret : the current base path
589 Args : (optional) new base path
590 Side Effects: gets/sets this object's base path (no longer class-wide data)
592 =cut
594 #mk_classdata is from Class::Data::Inheritable. good little module,
595 #you should look at it
596 #__PACKAGE__->mk_classdata( dbpath => CXGN::BlastDB::Config->load->{'blast_db_path'} );
598 =head2 identifier_url
600 Usage: my $url = $db->identifier_url('some ident from this bdb');
601 Desc : get a URL to look up more information on this identifier.
602 first tries to make a URL using the lookup_url column in the
603 sgn.blast_db table, then tries to use identifier_url() from
604 L<CXGN::Tools::Identifiers>
605 Args : the identifier to lookup, assumed
606 to be from this blast db
607 Ret : a URL, or undef if none could be found
608 Side Effects: Example:
610 =cut
sub identifier_url {
    my ( $self, $ident ) = @_;

    croak 'must pass an identifier to link' unless $ident;

    # prefer this database's own printf-style lookup_url when it has one
    my $url_format = $self->lookup_url;
    if ($url_format) {
        return sprintf( $url_format, $ident );
    }

    # otherwise fall back on the generic identifier lookup, loaded on demand
    require CXGN::Tools::Identifiers;
    return CXGN::Tools::Identifiers::identifier_url($ident);
}
# Private accessor for the encapsulated Bio::BLAST::Database. Results are
# memoized; the cache key is built from the object itself, any extra
# arguments, and the current full_file_basename, so changing dbpath or
# file_base yields a fresh lookup.
memoize '_fileset',
    NORMALIZER => sub {
        my ( $obj, @extra ) = @_;
        return join ',', $obj, @extra, $obj->full_file_basename;
    };

# Args: optional true value to open the database for writing (which also
#       asks for missing directories to be created)
# Ret : whatever Bio::BLAST::Database->open returns for these arguments
sub _fileset {
    my ( $self, $write ) = @_;

    my @open_args = (
        full_file_basename => $self->full_file_basename,
        type               => $self->type,
    );

    # write mode adds the two write-related options
    push @open_args, ( write => 1, create_dirs => 1 ) if $write;

    return Bio::BLAST::Database->open(@open_args);
}
640 =head1 MAINTAINER
642 Original maintainer: Robert Buels
643 Refactored by Lukas Nov 2016.
645 =head1 AUTHOR
647 Robert Buels, E<lt>rmb32@cornell.eduE<gt>
649 =head1 COPYRIGHT & LICENSE
651 Copyright 2009 Boyce Thompson Institute for Plant Research
653 This program is free software; you can redistribute it and/or modify
654 it under the same terms as Perl itself.
656 =cut
658 # package CXGN::BlastDB::Group;
659 # use strict;
660 # use English;
661 # use Carp;
663 # use base qw/CXGN::CDBI::Class::DBI Class::Data::Inheritable/;
664 # __PACKAGE__->table(__PACKAGE__->qualify_schema('sgn') . '.blast_db_group');
666 # #define our database columns with class::dbi
667 # our @primary_key_names = ('blast_db_group_id');
669 # our @column_names =
670 # ('blast_db_group_id',
671 # 'name',
672 # 'ordinal'
673 # );
674 # __PACKAGE__->columns( Primary => @primary_key_names, );
675 # __PACKAGE__->columns( All => @column_names, );
676 # __PACKAGE__->columns( Essential => @column_names, );
677 # __PACKAGE__->sequence( __PACKAGE__->base_schema('sgn'). '.blast_db_group_blast_db_group_id_seq' );
679 # __PACKAGE__->has_many( blast_dbs => 'CXGN::BlastDB', {order_by => 'title'} );
682 ####
683 1; # do not remove
684 ####