Merge pull request #5248 from solgenomics/topic/batch_update_trials
[sgn.git] / lib / Bio / BLAST2 / Database.pm
blob14f9ec7e33853867253dc5c849eb964b4ad5fb41
1 package Bio::BLAST2::Database;
2 BEGIN {
3 $Bio::BLAST2::Database::AUTHORITY = 'cpan:RBUELS';
5 BEGIN {
6 $Bio::BLAST2::Database::VERSION = '0.4';
8 # ABSTRACT: work with formatted BLAST databases
10 use strict;
11 use warnings;
13 use POSIX;
15 use IO::Pipe;
16 use IPC::Cmd qw/ can_run /;
18 use Carp;
19 use Memoize;
20 use Data::Dumper;
22 use File::Basename;
23 use File::Copy;
24 use File::Path;
25 use File::Slurp qw/slurp/;
26 use File::Spec::Functions qw/ splitdir catdir devnull /;
28 use IPC::System::Simple 'systemx';
30 use List::Util qw/ min max /;
31 use List::MoreUtils qw/ all any /;
33 use Bio::BLAST2::Database::Seq;
35 use namespace::clean;
38 use base qw/ Class::Accessor::Fast /;
42 sub new { croak "use open(), not new()" }
44 sub open {
45 my $class = shift;
46 #validate the args
47 @_ % 2 and croak 'invalid args to open()';
48 my %args = @_;
49 my %valid_keys = map {$_ => 1} qw( full_file_basename type write create_dirs );
50 $valid_keys{$_} or croak "invalid param '$_' passed to open()" for keys %args;
52 my $self = $class->SUPER::new(\%args);
54 $self->full_file_basename or croak 'must provide a full_file_basename';
56 unless( $self->type ) {
57 $self->type( $self->_guess_type )
58 or croak 'type not provided, and could not guess it';
61 if( $self->write ) {
62 $self->create_dirs || -d dirname( $self->full_file_basename )
63 or croak 'either directory must exist, or create_dirs must be set true';
65 my $perm_error = $self->check_format_permissions;
66 croak $perm_error if $perm_error;
69 # set some of our attrs from the existing files
70 $self->_read_blastdbcmd_info;
72 if( $self->write ) {
73 return $self;
74 } else {
75 # open succeeds if all the files are there
76 return $self if $self->files_are_complete;
78 carp "cannot open for reading, not a complete set of files:\n",
79 map " - $_\n", $self->list_files;
80 return;
85 __PACKAGE__->mk_accessors('full_file_basename');
88 __PACKAGE__->mk_accessors('create_dirs');
91 __PACKAGE__->mk_accessors('write');
94 __PACKAGE__->mk_accessors('title');
97 sub indexed_seqs { #< indexed_seqs is read-only externally
98 my ($self,@args) = @_;
99 croak 'indexed_seqs() is read-only' if @args;
100 shift->_indexed_seqs;
102 __PACKAGE__->mk_accessors('_indexed_seqs');
105 sub type {
106 my $self = shift;
108 if( @_ ) {
109 my $type = shift;
110 !defined $type || $type eq 'nucleotide' || $type eq 'protein'
111 or croak "invalid type '$type'";
112 $self->{type} = $type;
115 return $self->{type};
120 sub to_fasta {
121 my ($self) = @_;
123 $self->_check_external_tools;
125 my $pipe = IO::Pipe->new;
126 if(my $pid = fork) {
127 $pipe->reader;
128 return $pipe;
129 } elsif(defined $pid) {
130 $pipe->writer;
132 # #figure out the right -D flag to use, depending on blastdbcmd version
133 # my $d = do {
134 # my ($version) = `blastdbcmd -help` =~ /\s+blastdbcmd\s+([\d\.]+)/;
135 # if( _ver_cmp($version,'2.2.14') >= 0) {
136 # '1'
138 # else {
139 # 'T'
141 # };
143 my $cmd = "blastdbcmd -db ".$self->full_file_basename." -entry 'all' |";
144 CORE::open my $fh, "$cmd"
145 or die "Could not run $cmd: $!\n";
146 while (<$fh>) {
147 if(/^>/) {
148 #remove renamed idents for genbank-accessioned databases
149 s/^>gnl\|\S+\s+/>/;
150 #remove renamed idents for local-accessioned databases
151 s/^>lcl\|/>/;
153 print $pipe $_;
155 close $pipe;
156 POSIX::_exit(0);
157 } else {
158 die "could not fork: $!";
161 sub _ver_cmp { #compares two version numbers like '2.2.10' and '2.2.14'
162 my ($v1,$v2) = @_;
163 my @v1 = split /\./,$v1;
164 my @v2 = split /\./,$v2;
165 for(my $i=0;$i<@v2||$i<@v1;$i++) {
166 my $m1 = $v1[$i] || 0;
167 my $m2 = $v2[$i] || 0;
168 my $cmp = $m1 <=> $m2;
169 return $cmp if $cmp;
171 return 0;
174 memoize('_check_external_tools');
175 sub _check_external_tools {
177 my @missing;
178 for my $tool ( qw/ blastdbcmd makeblastdb / ) {
179 unless( can_run( $tool ) ) {
180 push @missing, "External tool `$tool` not found in path. Please install it.\n";
184 croak @missing, "Please install missing tools before using ".__PACKAGE__.".\n"
185 if @missing;
187 return;
191 sub format_from_file {
192 my ($self,%args) = @_;
194 #validate arg keys
195 my %valid_keys = map {$_ => 1} qw/seqfile title indexed_seqs/;
196 $valid_keys{$_} or croak "invalid arg '$_'" foreach keys %args;
198 my $seqfile = $args{seqfile}
199 or croak 'must provide seqfile';
200 my $title = $args{title};
202 $self->_check_external_tools;
204 #check whether the file looks like it's a fasta file
205 CORE::open my $seqfh, '<', $seqfile
206 or croak "Could not open '$seqfile' for reading: $!";
207 while(<$seqfh>) {
208 next unless /\S/; #go to first non-whitespace line
209 croak "$seqfile does not seem to be a valid FASTA file (got line: $_)"
210 unless /^\s*>\s*\S+/;
211 last;
213 close $seqfh;
215 unless( $self->write ) {
216 if( my @files = $self->list_files ) {
217 croak "cannot format from file, files are in the way:\n",map " - $_\n",$self->list_files;
221 #now run makeblastdb, formatting into files with a -blast-db-new
222 #appended to the filebase, so the old databases are still available
223 #while the format is running
224 my $ffbn = $self->full_file_basename;
225 my $new_ffbn = $ffbn; #keep original name so blastdbcmd does not fail #"$ffbn-blast-db-new";
226 my (undef,$ffbn_subdir,undef) = fileparse($ffbn);
227 #make sure the destination directories exist. Create them if not.
228 -d $ffbn_subdir or $self->create_dirs && mkpath([$ffbn_subdir])
229 or croak "Could not make path '$ffbn_subdir': $!\n";
230 unless( -d $ffbn_subdir ) {
231 croak $self->create_dirs ? "Could not create dir '$ffbn_subdir'"
232 : "Directory '$ffbn_subdir' does not exist, and create_dirs not set\n";
234 -w $ffbn_subdir or croak "Directory '$ffbn_subdir' is not writable\n";
236 print STDERR "*** in = $seqfile\n out = $new_ffbn\n title = $title\n dbtype = " . $self->type . "\n indexed seqs = " . $args{indexed_seqs} . "\n\n" ;
237 my $dbtype = ($self->type eq 'protein') ? 'prot' : 'nucl';
239 #makeblastdb -in database.fasta -dbtype [prot|nucl] -parse_seqids
240 eval {
241 systemx( 'makeblastdb',
242 -in => $seqfile,
243 -out => $ffbn,
244 ($title ? (-title => $title) : ()),
245 -dbtype => $dbtype,
246 -parse_seqids,
249 if ($@) {
250 print "Something went wrong with makeblastdb- $@\n";
251 } else { print "makeblastdb done for $ffbn\n" ; }
253 #now if it made an alias file, fix it up to remove the -blast-db-new
254 #and the absolute paths, so that when we move it into place, it works
255 if( my $aliasfile = do {
256 my %exts = ( protein => '.pdb', nucleotide => '.ndb');
257 my $n = $new_ffbn.$exts{$self->type};
258 (-f $n) ? $n : undef;
261 my $aliases = slurp($aliasfile);
262 $aliases =~ s/-blast-db-new//g; #remove the new extension
263 $aliases =~ s/$ffbn_subdir\/*//g; #remove absolute paths
264 CORE::open my $a_fh, '>', $aliasfile or confess "Could not open $aliasfile for writing";
265 print $a_fh $aliases;
266 #closing not necessary for indirect filehandles in lexical variables
267 } else {
268 print STDERR "Cannot read file $new_ffbn " . $self->type . "\n";
270 #list of files we will be replacing
271 my @oldfiles = _list_files($ffbn,$self->type);
273 #move the newly formatted files (almost) seamlessly into place
274 foreach my $newfile ( sort (_list_files($new_ffbn,$self->type)) ) {
275 my $dest = $newfile;
276 #################################
277 #$dest =~ s/-blast-db-new\./\./;
278 #move it into the right place
279 #move( $newfile => $dest );
281 #remove this file from the old files array if it's there,
282 #since it has just been overwritten
283 @oldfiles = grep { $_ ne $dest } @oldfiles;
286 #delete any old files that were not overwritten
287 if(@oldfiles) {
288 unlink @oldfiles;
289 carp "WARNING: these files for database ".$self->full_file_basename." are no longer used and have been removed:\n",map {"-$_\n"} @oldfiles;
293 #and now reread our data from the new database
294 $self->_read_blastdbcmd_info;
298 sub file_modtime {
299 my $this = shift;
300 my ($basename,$ext) = $this->full_file_basename;
301 my $db_mtime = min( map { (stat($_))[9] } $this->list_files );
302 return $db_mtime;
308 __PACKAGE__->mk_accessors('format_time');
312 sub check_format_permissions {
313 my ($self) = @_;
314 my $ffbn = $self->full_file_basename;
316 my $err_str = '';
318 #check the dir
319 my $dir = dirname($ffbn);
320 unless( $self->create_dirs ) {
321 unless( -d $dir ) {
322 $err_str .= "Directory '$dir' does not exist\n";
324 elsif( ! -w $dir ) {
325 $err_str .= "Directory $dir exists, but is not writable\n";
327 } else {
328 my @dirs = splitdir($dir);
329 #use Data::Dumper;
330 #die Dumper \@dirs;
331 pop @dirs while @dirs && ! -d catdir(@dirs);
332 my $d = catdir(@dirs);
333 if( ! @dirs ) {
334 $err_str .= "Entire directory tree for '$dir' does not exist!\n";
336 elsif(! -w $d ) {
337 $err_str .= "Directory $d is not writable, cannot make dirs\n";
342 #check writability of any files that are already there
343 my @files = $self->list_files();
344 foreach (@files) {
345 if( -f && !-w ) {
346 $err_str .= "Blast DB component file $_ exists, but is not overwritable\n";
349 return $err_str if $err_str;
350 return;
354 sub is_split {
355 my ($self) = @_;
356 my $ffbn = $self->full_file_basename;
357 return 1 if grep /^$ffbn\.\d{2,3}\.[np]\w\w$/,$self->list_files;
358 return 0;
362 sub files_are_complete {
363 my ($self) = @_;
365 #list of files belonging to this db
366 my @files = $self->list_files;
368 #certainly not complete if fewer than 3 files
369 return 0 unless @files >= 3;
371 #assemble list of necessary extensions
372 my @necessary_extensions = (qw/sq hr in/, #base database files
373 #add seqid indexes if called for
374 $self->indexed_seqs ? qw/tf to/ : (),
377 #add protein/nucleotide prefix to extensions
378 my $norp = $self->type eq 'protein' ? '.p' : '.n';
379 $_ = $norp.$_ foreach @necessary_extensions;
381 #deal with large, split databases
382 if( $self->is_split ) {
383 #if the database is split, add all of the fragment numbers to
384 #the extensions we have to have
386 #maximum index number of all fragments present
387 my $max_frag_num = 0 + max( map { /\.(\d{2,3})\.[np]\w\w$/ ? $1 : 0 } @files);
389 #make extensions with all of the necessary fragment numbers
390 @necessary_extensions = map { my $ext = $_;
391 map {sprintf(".%02d$ext",$_)} (0..$max_frag_num)
392 } @necessary_extensions;
394 #also remember that we have to have an alias file for split dbs
395 push @necessary_extensions, $norp.'al';
398 #now that we have our list of all the file extensions we need to have,
399 #check if they are actually there
400 my $ffbn = $self->full_file_basename;
401 return all {
402 my $ext = $_;
403 (grep {$_ eq "$ffbn$ext"} @files) ? 1 : 0
404 } @necessary_extensions;
408 sub list_files {
409 my $self = shift;
410 croak "cannot list files without knowing the database type" unless $self->type;
411 _list_files($self->full_file_basename, $self->type);
413 #our internal version of this function just takes a full file basename,
414 #and a db type, and returns all the files that go with that database
415 sub _list_files {
416 my ($ffbn,$type) = @_;
418 #file extensions for each type of blast database
419 my %valid_extensions = ( protein => [qw/.psq .phr .pin .pog .pos .pot .ptf .pto .pdb .phd .phi /],
420 nucleotide => [qw/.nsq .nhr .nin .nog .nos .not .ntf .nto .ndb .nhd .nhi /],
423 #file extensions for _this_ database
424 $valid_extensions{$type} or confess 'invalid type '.$type;
425 my @search_extensions = @{$valid_extensions{$type}};
427 #this gives us all files which have our basename, and one of the right search extensions
428 my @myfiles =
429 grep {
430 my $file = $_;
431 grep {$file =~ /^$ffbn(\.\d{2})?$_$/} @search_extensions
432 } glob("$ffbn*");
434 for (@myfiles) { -f or confess 'sanity check failed' };
436 return @myfiles;
440 __PACKAGE__->mk_accessors('sequences_count');
444 sub get_sequence {
445 my ($self, $seqname) = @_;
447 croak "cannot call get_sequence on an incomplete database!"
448 unless $self->files_are_complete;
450 croak "cannot call get_sequence on a database that has not been indexed for retrieval!"
451 unless $self->indexed_seqs;
453 return Bio::BLAST2::Database::Seq->new(
454 -bdb => $self,
455 -id => $seqname,
459 # internal function to set the title, sequence count, type,
460 # format_time, and indexed_seqs from the set of files on disk and from
461 # the output of blastdbcmd
462 sub _read_blastdbcmd_info {
463 my ($self) = @_;
465 my @files = $self->list_files
466 or return;
468 $self->_check_external_tools;
470 my $ffbn = $self->full_file_basename;
471 my $cmd = "blastdbcmd -db $ffbn -info";
472 my $blastdbcmd = `$cmd 2>&1`;
473 print STDERR "BLASTDBCMD RETURNED: $blastdbcmd\n";
475 my ($title) = $blastdbcmd =~ /Database:\s*([\s\S]+)sequences/
476 or return (print STDERR "could not parse output of blastdbcmd (0):\n$blastdbcmd");
477 $title =~ s/\s*[\d,]+\s*$//;
479 my ($seq_cnt) = $blastdbcmd =~ /([\d,]+)\s*sequences/
480 or return(print STDERR "could not parse output of blastdbcmd (1):\n$blastdbcmd");
481 $seq_cnt =~ s/,//g;
483 my ($datestr) =
484 $blastdbcmd =~ m(
485 Date: \s* ( \w [\S\ ]+ \w )
488 or die "could not parse output of blastdbcmd (2):\n$blastdbcmd";
490 print STDERR "FILES = ".Dumper(\@files);
492 my $indexed = (any {/tf$/} @files) && (any {/to$/} @files);
494 ### set our data
495 $self->type( $self->_guess_type )
496 or confess 'could not determine db type';
498 ### type: $self->type
500 $self->format_time( _parse_datestr($datestr) ); #< will die on failure
501 $title =~ s/\s+$//;
502 $self->title( $title );
503 $self->_indexed_seqs( $indexed );
504 $self->sequences_count( $seq_cnt );
506 sub _guess_type {
507 my ($self) = @_;
508 my $saved_type = $self->type;
510 foreach my $guess (qw( protein nucleotide )) {
511 $self->type( $guess );
512 if( $self->files_are_complete ) {
513 $self->type( $saved_type );
514 return $guess;
518 $self->type( $saved_type );
519 return;
521 sub _parse_datestr {
522 my ($datestr) = @_;
523 my @split = split /\W+/,$datestr;
524 my ($mon,$d,$y,$h,$min,$ampm) = @split
525 or die "could not parse data string '$datestr'";
527 # warn "got $mon,$d,$y,$h,$min,$ampm\n";
528 my %months = do{ my $n = 0; map { $_ => $n++ } qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/};
529 exists $months{$mon} or return;
530 $mon = $months{$mon};
531 $h = 0 if $h == 12;
532 $h += 12 if lc($ampm) eq 'pm';
533 #warn "mktime $min,$h,$d,$mon,".($y-1900)."\n";
534 my $time = mktime(0,$min,$h,$d,$mon,$y-1900,0,0,-1);
535 # warn "$datestr => ".ctime($time)."\n";
536 return $time;
540 1;#do not remove
543 __END__
544 =pod
546 =encoding utf-8
548 =head1 NAME
550 Bio::BLAST2::Database - work with formatted BLAST databases, updated for new NCBI-Blast+ Debian package
552 =head1 SYNOPSIS
554 use Bio::BLAST2::Database;
556 # open an existing bdb for reading
557 my $fs = Bio::BLAST2::Database->open(
558 full_file_basename => '/path/to/my_bdb',
560 # will read from /path/to/my_bdb.nin, /path/to/my_bdb.nsq, etc
562 my @filenames = $fs->list_files;
564 #reopen it for writing
565 $fs = Bio::BLAST2::Database->open(
566 full_file_basename => '/path/to/my_bdb',
567 write => 1,
570 # replace it with a different set of sequences
571 $fs->format_from_file('myseqs.seq');
573 # can also get some metadata about it
574 print "db's title is ".$fs->title;
575 print "db was last formatted on ".localtime( $fs->format_time );
576 print "db file modification was ".localtime( $fs->file_modtime );
578 =head1 DESCRIPTION
580 Each object of this class represents an NCBI-formatted sequence
581 database on disk, which is a set of files, the exact structure of
582 which varies a bit with the type and size of the sequence set.
584 This is mostly an object-oriented wrapper for using NCBI's C<blastdbcmd>
585 and C<makeblastdb> tools.
587 =head1 ATTRIBUTES
589 =head2 full_file_basename
591 Full path to the blast database file basename. This is the entire
592 path to the BLAST database files, except for the final suffixes
593 (C<.nin>, C<.nsq>, etc).
595 my $basename = $db->full_file_basename;
596 #returns '/data/shared/blast/databases/genbank/nr'
598 =head2 create_dirs
600 true/false flag for whether to create any necessary dirs at format time
602 =head2 write
604 true/false flag for whether to write any files that are in the way when formatted
606 =head2 title
608 title of this blast database, if set
610 =head2 indexed_seqs
612 return whether this blast database is indexed
614 =head2 type
616 accessor for type of blastdb. must be set in new(), but open() looks
617 at the existing files and sets this
619 =head1 METHODS
621 =head2 open
623 Usage: my $fs = Bio::BLAST2::Database->open({
624 full_file_basename => $ffbn,
625 write => 1,
626 create_dirs => 1,
628 Desc : open a BlastDB with the given ffbn.
629 Args : hashref of params as:
630 { full_file_basename => full path plus basename of files in this blastdb,
631 type => 'nucleotide' or 'protein'
632 write => default false, set true to write any files in the way,
633 create_dirs => default false, set true to create any necessary directories
634 if formatted
636 Ret : Bio::BLAST2::Database object
637 Side Effects: none if no files are present at the given ffbn. overwise,
638 dies if files are present and write is not specified,
639 or if dir does not exist and create_dirs was not specified
640 Example:
642 =head2 to_fasta
644 Usage: my $fasta_fh = $bdb->to_fasta;
645 Desc : get the contents of this blast database in FASTA format
646 Ret : an IO::Pipe filehandle
647 Args : none
648 Side Effects: runs 'blastdbcmd' in a forked process, cleaning up its output,
649 and passing it to you
651 =head2 format_from_file
653 Usage: $db->format_from_file(seqfile => 'mysequences.seq');
654 Desc : format this blast database from the given source file,
655 into its proper place on disk, overwriting the files already
656 present
657 Ret : nothing meaningful
658 Args : hash-style list as:
659 seqfile => filename containing sequences,
660 title => (optional) title for this blast database,
661 hash_index => (optional) if true, formats the database with
662 indexing (and sets indexed_seqs in this obj)
663 Side Effects: runs 'makeblastdb' to format the given sequences,
664 dies on failure
666 =head2 file_modtime
668 Desc: get the earliest unix modification time of the database files
669 Args: none
670 Ret : unix modification time of the database files
671 Side Effects:
672 Example:
674 =head2 format_time
676 Usage: my $time = $db->format_time;
677 Desc : get the format time of these db files
678 Ret : the value time() would have returned when
679 this database was last formatted, or undef
680 if that could not be determined (like if the
681 files aren't there)
682 Args : none
683 Side Effects: runs 'blastdbcmd' to extract the formatting
684 time from the database files
686 NOTE: This function assumes that the computer that
687 last formatted this database had the same time zone
688 set as the computer we are running on.
689 Also, the time returned by this function is rounded
690 down to the minute, because blastdbcmd does not print
691 the format time in seconds.
693 =head2 check_format_permissions
695 Usage: $bdb->check_format_from_file() or die "cannot format!\n";
696 Desc : check directory existence and file permissions to see if a
697 format_from_file() is likely to succeed. This is useful,
698 for example, when you have a script that downloads some
699 remote database and you'd like to check first whether
700 we even have permissions to format before you take the
701 time to download something.
702 Args : (optional) alternate full file basename to write blast DB to
703 e.g. '/tmp/mytempdir/tester_blast_db'
704 Ret : nothing if everything looks good,
705 otherwise a string error message summarizing the reason
706 for failure
707 Side Effects: reads from filesystem, may stat some files
709 =head2 is_split
711 Usage: print "that thing is split, yo" if $db->is_split;
712 Desc : determine whether this database is in multiple parts
713 Ret : true if this database has been split into multiple
714 files by makeblastdb (e.g. nr.00.pin, nr.01.pin, etc.)
715 Args : none
716 Side Effects: looks in filesystem
718 =head2 files_are_complete
720 Usage: print "complete!" if $db->files_are_complete;
721 Desc : tell whether this blast db has a complete set of files on disk
722 Ret : true if the set of files on disk looks complete,
723 false if not
724 Args : (optional) true value if the files should only be
725 considered complete if the sequences are indexed for retrieval
726 Side Effects: lists files on disk
728 =head2 list_files
730 Usage: my @files = $db->list_files;
731 Desc : get the list of files that belong to this blast database
732 Ret : list of full paths to all files belonging to this blast database,
733 Args : none
734 Side Effects: looks in the filesystem
736 =head2 sequences_count
738 Desc: get the number of sequences in this blast database
739 Args: none
740 Ret : number of distinct sequences in this blast database, or undef
741 if it could not be determined due to some error or other
742 Side Effects: runs 'blastdbcmd' to get stats on the blast database file
744 =head2 get_sequence
746 Usage: my $seq = $fs->get_sequence('LE_HBa0001A02');
747 Desc : get a particular sequence from this db
748 Args : sequence name to retrieve
749 Ret : Bio::PrimarySeqI-implementing object, or nothing if not found
750 Side Effects: dies on error
752 =head1 BASE CLASS(ES)
754 L<Class::Accessor::Fast>
756 =head1 AUTHOR
758 Robert Buels <rmb32@cornell.edu>
760 =head1 COPYRIGHT AND LICENSE
762 This software is copyright (c) 2011 by Robert Buels.
764 This is free software; you can redistribute it and/or modify it under
765 the same terms as the Perl 5 programming language system itself.
767 =cut