1 package Bio
::BLAST2
::Database
;
3 $Bio::BLAST2
::Database
::AUTHORITY
= 'cpan:RBUELS';
6 $Bio::BLAST2
::Database
::VERSION
= '0.4';
8 # ABSTRACT: work with formatted BLAST databases
16 use IPC
::Cmd qw
/ can_run /;
25 use File
::Slurp qw
/slurp/;
26 use File
::Spec
::Functions qw
/ splitdir catdir devnull /;
28 use IPC
::System
::Simple
'systemx';
30 use List
::Util qw
/ min max /;
31 use List
::MoreUtils qw
/ all any /;
33 use Bio
::BLAST2
::Database
::Seq
;
38 use base qw
/ Class::Accessor::Fast /;
42 sub new
{ croak
"use open(), not new()" }
47 @_ % 2 and croak
'invalid args to open()';
49 my %valid_keys = map {$_ => 1} qw( full_file_basename type write create_dirs );
50 $valid_keys{$_} or croak
"invalid param '$_' passed to open()" for keys %args;
52 my $self = $class->SUPER::new
(\
%args);
54 $self->full_file_basename or croak
'must provide a full_file_basename';
56 unless( $self->type ) {
57 $self->type( $self->_guess_type )
58 or croak
'type not provided, and could not guess it';
62 $self->create_dirs || -d dirname
( $self->full_file_basename )
63 or croak
'either directory must exist, or create_dirs must be set true';
65 my $perm_error = $self->check_format_permissions;
66 croak
$perm_error if $perm_error;
69 # set some of our attrs from the existing files
70 $self->_read_blastdbcmd_info;
75 # open succeeds if all the files are there
76 return $self if $self->files_are_complete;
78 carp
"cannot open for reading, not a complete set of files:\n",
79 map " - $_\n", $self->list_files;
85 __PACKAGE__
->mk_accessors('full_file_basename');
88 __PACKAGE__
->mk_accessors('create_dirs');
91 __PACKAGE__
->mk_accessors('write');
94 __PACKAGE__
->mk_accessors('title');
97 sub indexed_seqs
{ #< indexed_seqs is read-only externally
98 my ($self,@args) = @_;
99 croak
'indexed_seqs() is read-only' if @args;
100 shift->_indexed_seqs;
102 __PACKAGE__
->mk_accessors('_indexed_seqs');
110 !defined $type || $type eq 'nucleotide' || $type eq 'protein'
111 or croak
"invalid type '$type'";
112 $self->{type
} = $type;
115 return $self->{type
};
123 $self->_check_external_tools;
125 my $pipe = IO
::Pipe
->new;
129 } elsif(defined $pid) {
132 # #figure out the right -D flag to use, depending on blastdbcmd version
134 # my ($version) = `blastdbcmd -help` =~ /\s+blastdbcmd\s+([\d\.]+)/;
135 # if( _ver_cmp($version,'2.2.14') >= 0) {
143 my $cmd = "blastdbcmd -db ".$self->full_file_basename." -entry 'all' |";
144 CORE
::open my $fh, "$cmd"
145 or die "Could not run $cmd: $!\n";
148 #remove renamed idents for genbank-accessioned databases
150 #remove renamed idents for local-accessioned databases
158 die "could not fork: $!";
161 sub _ver_cmp
{ #compares two version numbers like '2.2.10' and '2.2.14'
163 my @v1 = split /\./,$v1;
164 my @v2 = split /\./,$v2;
165 for(my $i=0;$i<@v2||$i<@v1;$i++) {
166 my $m1 = $v1[$i] || 0;
167 my $m2 = $v2[$i] || 0;
168 my $cmp = $m1 <=> $m2;
174 memoize
('_check_external_tools');
175 sub _check_external_tools
{
178 for my $tool ( qw
/ blastdbcmd makeblastdb / ) {
179 unless( can_run
( $tool ) ) {
180 push @missing, "External tool `$tool` not found in path. Please install it.\n";
184 croak
@missing, "Please install missing tools before using ".__PACKAGE__
.".\n"
191 sub format_from_file
{
192 my ($self,%args) = @_;
195 my %valid_keys = map {$_ => 1} qw
/seqfile title indexed_seqs/;
196 $valid_keys{$_} or croak
"invalid arg '$_'" foreach keys %args;
198 my $seqfile = $args{seqfile
}
199 or croak
'must provide seqfile';
200 my $title = $args{title
};
202 $self->_check_external_tools;
204 #check whether the file looks like it's a fasta file
205 CORE
::open my $seqfh, '<', $seqfile
206 or croak
"Could not open '$seqfile' for reading: $!";
208 next unless /\S/; #go to first non-whitespace line
209 croak
"$seqfile does not seem to be a valid FASTA file (got line: $_)"
210 unless /^\s*>\s*\S+/;
215 unless( $self->write ) {
216 if( my @files = $self->list_files ) {
217 croak
"cannot format from file, files are in the way:\n",map " - $_\n",$self->list_files;
221 #now run makeblastdb, formatting into files with a -blast-db-new
222 #appended to the filebase, so the old databases are still available
223 #while the format is running
224 my $ffbn = $self->full_file_basename;
225 my $new_ffbn = $ffbn; #keep original name so blastdbcmd does not fail #"$ffbn-blast-db-new";
226 my (undef,$ffbn_subdir,undef) = fileparse
($ffbn);
227 #make sure the destination directories exist. Create them if not.
228 -d
$ffbn_subdir or $self->create_dirs && mkpath
([$ffbn_subdir])
229 or croak
"Could not make path '$ffbn_subdir': $!\n";
230 unless( -d
$ffbn_subdir ) {
231 croak
$self->create_dirs ?
"Could not create dir '$ffbn_subdir'"
232 : "Directory '$ffbn_subdir' does not exist, and create_dirs not set\n";
234 -w
$ffbn_subdir or croak
"Directory '$ffbn_subdir' is not writable\n";
236 print STDERR
"*** in = $seqfile\n out = $new_ffbn\n title = $title\n dbtype = " . $self->type . "\n indexed seqs = " . $args{indexed_seqs
} . "\n\n" ;
237 my $dbtype = ($self->type eq 'protein') ?
'prot' : 'nucl';
239 #makeblastdb -in database.fasta -dbtype [prot|nucl] -parse_seqids
241 systemx
( 'makeblastdb',
244 ($title ?
(-title
=> $title) : ()),
250 print "Something went wrong with makeblastdb- $@\n";
251 } else { print "makeblastdb done for $ffbn\n" ; }
253 #now if it made an alias file, fix it up to remove the -blast-db-new
254 #and the absolute paths, so that when we move it into place, it works
255 if( my $aliasfile = do {
256 my %exts = ( protein
=> '.pdb', nucleotide
=> '.ndb');
257 my $n = $new_ffbn.$exts{$self->type};
258 (-f
$n) ?
$n : undef;
261 my $aliases = slurp
($aliasfile);
262 $aliases =~ s/-blast-db-new//g; #remove the new extension
263 $aliases =~ s/$ffbn_subdir\/*//g
; #remove absolute paths
264 CORE
::open my $a_fh, '>', $aliasfile or confess
"Could not open $aliasfile for writing";
265 print $a_fh $aliases;
266 #closing not necessary for indirect filehandles in lexical variables
268 print STDERR
"Cannot read file $new_ffbn " . $self->type . "\n";
270 #list of files we will be replacing
271 my @oldfiles = _list_files
($ffbn,$self->type);
273 #move the newly formatted files (almost) seamlessly into place
274 foreach my $newfile ( sort (_list_files
($new_ffbn,$self->type)) ) {
276 #################################
277 #$dest =~ s/-blast-db-new\./\./;
278 #move it into the right place
279 #move( $newfile => $dest );
281 #remove this file from the old files array if it's there,
282 #since it has just been overwritten
283 @oldfiles = grep { $_ ne $dest } @oldfiles;
286 #delete any old files that were not overwritten
289 carp
"WARNING: these files for database ".$self->full_file_basename." are no longer used and have been removed:\n",map {"-$_\n"} @oldfiles;
293 #and now reread our data from the new database
294 $self->_read_blastdbcmd_info;
300 my ($basename,$ext) = $this->full_file_basename;
301 my $db_mtime = min
( map { (stat($_))[9] } $this->list_files );
308 __PACKAGE__
->mk_accessors('format_time');
312 sub check_format_permissions
{
314 my $ffbn = $self->full_file_basename;
319 my $dir = dirname
($ffbn);
320 unless( $self->create_dirs ) {
322 $err_str .= "Directory '$dir' does not exist\n";
325 $err_str .= "Directory $dir exists, but is not writable\n";
328 my @dirs = splitdir
($dir);
331 pop @dirs while @dirs && ! -d catdir
(@dirs);
332 my $d = catdir
(@dirs);
334 $err_str .= "Entire directory tree for '$dir' does not exist!\n";
337 $err_str .= "Directory $d is not writable, cannot make dirs\n";
342 #check writability of any files that are already there
343 my @files = $self->list_files();
346 $err_str .= "Blast DB component file $_ exists, but is not overwritable\n";
349 return $err_str if $err_str;
356 my $ffbn = $self->full_file_basename;
357 return 1 if grep /^$ffbn\.\d{2,3}\.[np]\w\w$/,$self->list_files;
362 sub files_are_complete
{
365 #list of files belonging to this db
366 my @files = $self->list_files;
368 #certainly not complete if fewer than 3 files
369 return 0 unless @files >= 3;
371 #assemble list of necessary extensions
372 my @necessary_extensions = (qw
/sq hr in/, #base database files
373 #add seqid indexes if called for
374 $self->indexed_seqs ? qw
/tf to/ : (),
377 #add protein/nucleotide prefix to extensions
378 my $norp = $self->type eq 'protein' ?
'.p' : '.n';
379 $_ = $norp.$_ foreach @necessary_extensions;
381 #deal with large, split databases
382 if( $self->is_split ) {
383 #if the database is split, add all of the fragment numbers to
384 #the extensions we have to have
386 #maximum index number of all fragments present
387 my $max_frag_num = 0 + max
( map { /\.(\d{2,3})\.[np]\w\w$/ ?
$1 : 0 } @files);
389 #make extensions with all of the necessary fragment numbers
390 @necessary_extensions = map { my $ext = $_;
391 map {sprintf(".%02d$ext",$_)} (0..$max_frag_num)
392 } @necessary_extensions;
394 #also remember that we have to have an alias file for split dbs
395 push @necessary_extensions, $norp.'al';
398 #now that we have our list of all the file extensions we need to have,
399 #check if they are actually there
400 my $ffbn = $self->full_file_basename;
403 (grep {$_ eq "$ffbn$ext"} @files) ?
1 : 0
404 } @necessary_extensions;
410 croak
"cannot list files without knowing the database type" unless $self->type;
411 _list_files
($self->full_file_basename, $self->type);
413 #our internal version of this function just takes a full file basename,
414 #and a db type, and returns all the files that go with that database
416 my ($ffbn,$type) = @_;
418 #file extensions for each type of blast database
419 my %valid_extensions = ( protein
=> [qw
/.psq .phr .pin .pog .pos .pot .ptf .pto .pdb .phd .phi /],
420 nucleotide
=> [qw
/.nsq .nhr .nin .nog .nos .not .ntf .nto .ndb .nhd .nhi /],
423 #file extensions for _this_ database
424 $valid_extensions{$type} or confess
'invalid type '.$type;
425 my @search_extensions = @
{$valid_extensions{$type}};
427 #this gives us all files which have our basename, and one of the right search extensions
431 grep {$file =~ /^$ffbn(\.\d{2})?$_$/} @search_extensions
434 for (@myfiles) { -f
or confess
'sanity check failed' };
440 __PACKAGE__
->mk_accessors('sequences_count');
445 my ($self, $seqname) = @_;
447 croak
"cannot call get_sequence on an incomplete database!"
448 unless $self->files_are_complete;
450 croak
"cannot call get_sequence on a database that has not been indexed for retrieval!"
451 unless $self->indexed_seqs;
453 return Bio
::BLAST2
::Database
::Seq
->new(
459 # internal function to set the title, sequence count, type,
460 # format_time, and indexed_seqs from the set of files on disk and from
461 # the output of blastdbcmd
462 sub _read_blastdbcmd_info
{
465 my @files = $self->list_files
468 $self->_check_external_tools;
470 my $ffbn = $self->full_file_basename;
471 my $cmd = "blastdbcmd -db $ffbn -info";
472 my $blastdbcmd = `$cmd 2>&1`;
473 print STDERR
"BLASTDBCMD RETURNED: $blastdbcmd\n";
475 my ($title) = $blastdbcmd =~ /Database:\s*([\s\S]+)sequences/
476 or return (print STDERR
"could not parse output of blastdbcmd (0):\n$blastdbcmd");
477 $title =~ s/\s*[\d,]+\s*$//;
479 my ($seq_cnt) = $blastdbcmd =~ /([\d,]+)\s*sequences/
480 or return(print STDERR
"could not parse output of blastdbcmd (1):\n$blastdbcmd");
485 Date
: \s
* ( \w
[\S\
]+ \w
)
488 or die "could not parse output of blastdbcmd (2):\n$blastdbcmd";
490 print STDERR
"FILES = ".Dumper
(\
@files);
492 my $indexed = (any
{/tf$/} @files) && (any
{/to$/} @files);
495 $self->type( $self->_guess_type )
496 or confess
'could not determine db type';
498 ### type: $self->type
500 $self->format_time( _parse_datestr
($datestr) ); #< will die on failure
502 $self->title( $title );
503 $self->_indexed_seqs( $indexed );
504 $self->sequences_count( $seq_cnt );
508 my $saved_type = $self->type;
510 foreach my $guess (qw( protein nucleotide )) {
511 $self->type( $guess );
512 if( $self->files_are_complete ) {
513 $self->type( $saved_type );
518 $self->type( $saved_type );
523 my @split = split /\W+/,$datestr;
524 my ($mon,$d,$y,$h,$min,$ampm) = @split
525 or die "could not parse data string '$datestr'";
527 # warn "got $mon,$d,$y,$h,$min,$ampm\n";
528 my %months = do{ my $n = 0; map { $_ => $n++ } qw
/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/};
529 exists $months{$mon} or return;
530 $mon = $months{$mon};
532 $h += 12 if lc($ampm) eq 'pm';
533 #warn "mktime $min,$h,$d,$mon,".($y-1900)."\n";
534 my $time = mktime
(0,$min,$h,$d,$mon,$y-1900,0,0,-1);
535 # warn "$datestr => ".ctime($time)."\n";
550 Bio::BLAST2::Database - work with formatted BLAST databases, updated for new NCBI-Blast+ Debian package
554 use Bio::BLAST2::Database;
556 # open an existing bdb for reading
557 my $fs = Bio::BLAST2::Database->open(
558 full_file_basename => '/path/to/my_bdb',
560 # will read from /path/to/my_bdb.nin, /path/to/my_bdb.nsq, etc
562 my @filenames = $fs->list_files;
564 #reopen it for writing
565 $fs = Bio::BLAST2::Database->open(
566 full_file_basename => '/path/to/my_bdb',
570 # replace it with a different set of sequences
571 $fs->format_from_file('myseqs.seq');
573 # can also get some metadata about it
574 print "db's title is ".$fs->title;
575 print "db was last formatted on ".localtime( $fs->format_time );
576 print "db file modification was ".localtime( $fs->file_modtime );
580 Each object of this class represents an NCBI-formatted sequence
581 database on disk, which is a set of files, the exact structure of
582 which varies a bit with the type and size of the sequence set.
584 This is mostly an object-oriented wrapper for using NCBI's C<blastdbcmd>
585 and C<makeblastdb> tools.
589 =head2 full_file_basename
591 Full path to the blast database file basename. This is the entire
592 path to the BLAST database files, except for the final suffixes
593 (C<.nin>, C<.nsq>, etc).
595 my $basename = $db->full_file_basename;
596 #returns '/data/shared/blast/databases/genbank/nr'
600 true/false flag for whether to create any necessary dirs at format time
604 true/false flag for whether to write any files that are in the way when formatted
608 title of this blast database, if set
612 return whether this blast database is indexed
616 accessor for type of blastdb. must be set in new(), but open() looks
617 at the existing files and sets this
623 Usage: my $fs = Bio::BLAST2::Database->open({
624 full_file_basename => $ffbn,
628 Desc : open a BlastDB with the given ffbn.
629 Args : hashref of params as:
630 { full_file_basename => full path plus basename of files in this blastdb,
631 type => 'nucleotide' or 'protein'
632 write => default false, set true to write any files in the way,
633 create_dirs => default false, set true to create any necessary directories
636 Ret : Bio::BLAST2::Database object
637 Side Effects: none if no files are present at the given ffbn. overwise,
638 dies if files are present and write is not specified,
639 or if dir does not exist and create_dirs was not specified
644 Usage: my $fasta_fh = $bdb->to_fasta;
645 Desc : get the contents of this blast database in FASTA format
646 Ret : an IO::Pipe filehandle
648 Side Effects: runs 'blastdbcmd' in a forked process, cleaning up its output,
649 and passing it to you
651 =head2 format_from_file
653 Usage: $db->format_from_file(seqfile => 'mysequences.seq');
654 Desc : format this blast database from the given source file,
655 into its proper place on disk, overwriting the files already
657 Ret : nothing meaningful
658 Args : hash-style list as:
659 seqfile => filename containing sequences,
660 title => (optional) title for this blast database,
661 hash_index => (optional) if true, formats the database with
662 indexing (and sets indexed_seqs in this obj)
663 Side Effects: runs 'makeblastdb' to format the given sequences,
668 Desc: get the earliest unix modification time of the database files
670 Ret : unix modification time of the database files
676 Usage: my $time = $db->format_time;
677 Desc : get the format time of these db files
678 Ret : the value time() would have returned when
679 this database was last formatted, or undef
680 if that could not be determined (like if the
683 Side Effects: runs 'blastdbcmd' to extract the formatting
684 time from the database files
686 NOTE: This function assumes that the computer that
687 last formatted this database had the same time zone
688 set as the computer we are running on.
689 Also, the time returned by this function is rounded
690 down to the minute, because blastdbcmd does not print
691 the format time in seconds.
693 =head2 check_format_permissions
695 Usage: $bdb->check_format_from_file() or die "cannot format!\n";
696 Desc : check directory existence and file permissions to see if a
697 format_from_file() is likely to succeed. This is useful,
698 for example, when you have a script that downloads some
699 remote database and you'd like to check first whether
700 we even have permissions to format before you take the
701 time to download something.
702 Args : (optional) alternate full file basename to write blast DB to
703 e.g. '/tmp/mytempdir/tester_blast_db'
704 Ret : nothing if everything looks good,
705 otherwise a string error message summarizing the reason
707 Side Effects: reads from filesystem, may stat some files
711 Usage: print "that thing is split, yo" if $db->is_split;
712 Desc : determine whether this database is in multiple parts
713 Ret : true if this database has been split into multiple
714 files by makeblastdb (e.g. nr.00.pin, nr.01.pin, etc.)
716 Side Effects: looks in filesystem
718 =head2 files_are_complete
720 Usage: print "complete!" if $db->files_are_complete;
721 Desc : tell whether this blast db has a complete set of files on disk
722 Ret : true if the set of files on disk looks complete,
724 Args : (optional) true value if the files should only be
725 considered complete if the sequences are indexed for retrieval
726 Side Effects: lists files on disk
730 Usage: my @files = $db->list_files;
731 Desc : get the list of files that belong to this blast database
732 Ret : list of full paths to all files belonging to this blast database,
734 Side Effects: looks in the filesystem
736 =head2 sequences_count
738 Desc: get the number of sequences in this blast database
740 Ret : number of distinct sequences in this blast database, or undef
741 if it could not be determined due to some error or other
742 Side Effects: runs 'blastdbcmd' to get stats on the blast database file
746 Usage: my $seq = $fs->get_sequence('LE_HBa0001A02');
747 Desc : get a particular sequence from this db
748 Args : sequence name to retrieve
749 Ret : Bio::PrimarySeqI-implementing object, or nothing if not found
750 Side Effects: dies on error
752 =head1 BASE CLASS(ES)
754 L<Class::Accessor::Fast>
758 Robert Buels <rmb32@cornell.edu>
760 =head1 COPYRIGHT AND LICENSE
762 This software is copyright (c) 2011 by Robert Buels.
764 This is free software; you can redistribute it and/or modify it under
765 the same terms as the Perl 5 programming language system itself.