Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / Index / Abstract.pm
blob6ad2ba849672a2c5d56af92c7df26dc24d978047
3 # BioPerl module for Bio::Index::Abstract
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Ewan Birney <birney@sanger.ac.uk>
8 # and James Gilbert <jgrg@sanger.ac.uk>
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Index::Abstract - Abstract interface for indexing a flat file
18 =head1 SYNOPSIS
20 You should not be using this module directly
22 =head1 USING DB_FILE
24 To use DB_File and not SDBM for this index, pass the value:
26 -dbm_package => 'DB_File'
28 to new (see below).
30 =head1 DESCRIPTION
32 This object provides the basic mechanism to associate positions
33 in files with names. The position and filenames are stored in DBM
34 which can then be accessed later on. It is the equivalent of flat
35 file indexing (eg, SRS or efetch).
37 This object is the guts to the mechanism, which will be used by the
38 specific objects inheriting from it.
40 =head1 FEEDBACK
42 =head2 Mailing Lists
44 User feedback is an integral part of the evolution of this and other
45 Bioperl modules. Send your comments and suggestions preferably to one
46 of the Bioperl mailing lists. Your participation is much appreciated.
48 bioperl-l@bioperl.org - General discussion
49 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
51 =head2 Support
53 Please direct usage questions or support issues to the mailing list:
55 I<bioperl-l@bioperl.org>
57 rather than to the module maintainer directly. Many experienced and
58 reponsive experts will be able look at the problem and quickly
59 address it. Please include a thorough description of the problem
60 with code and data examples if at all possible.
62 =head2 Reporting Bugs
64 Report bugs to the Bioperl bug tracking system to help us keep track
65 the bugs and their resolution. Bug reports can be submitted via the
66 web:
68 https://github.com/bioperl/bioperl-live/issues
70 =head1 AUTHOR - Ewan Birney, James Gilbert
72 Email - birney@sanger.ac.uk, jgrg@sanger.ac.uk
74 =head1 APPENDIX
76 The rest of the documentation details each of the object methods. Internal
77 methods are usually preceded with an "_" (underscore).
79 =cut
82 # Let the code begin...
84 package Bio::Index::Abstract;
86 use strict;
87 use Fcntl qw( O_RDWR O_CREAT O_RDONLY );
88 use vars qw( $TYPE_AND_VERSION_KEY
89 $USE_DBM_TYPE $DB_HASH );
92 use Bio::Root::IO;
93 use Symbol;
95 use base qw(Bio::Root::Root);
97 # Generate accessor methods for simple object fields
98 BEGIN {
99 foreach my $func (qw(filename write_flag)) {
100 no strict 'refs';
101 my $field = "_$func";
103 *$func = sub {
104 my( $self, $value ) = @_;
106 if (defined $value) {
107 $self->{$field} = $value;
109 return $self->{$field};
114 =head2 new
116 Usage : $index = Bio::Index::Abstract->new(
117 -filename => $dbm_file,
118 -write_flag => 0,
119 -dbm_package => 'DB_File',
120 -verbose => 0);
121 Function: Returns a new index object. If filename is
122 specified, then open_dbm() is immediately called.
123 Bio::Index::Abstract->new() will usually be called
124 directly only when opening an existing index.
125 Returns : A new index object
126 Args : -filename The name of the dbm index file.
127 -write_flag TRUE if write access to the dbm file is
128 needed.
129 -dbm_package The Perl dbm module to use for the
130 index.
131 -verbose Print debugging output to STDERR if
132 TRUE.
134 =cut
136 sub new {
137 my($class, @args) = @_;
138 my $self = $class->SUPER::new(@args);
139 my( $filename, $write_flag, $dbm_package, $cachesize, $ffactor, $pathtype ) =
140 $self->_rearrange([qw(FILENAME
141 WRITE_FLAG
142 DBM_PACKAGE
143 CACHESIZE
144 FFACTOR
145 PATHTYPE
146 )], @args);
148 # Store any parameters passed
149 $self->filename($filename) if $filename;
150 $self->cachesize($cachesize) if $cachesize;
151 $self->ffactor($ffactor) if $ffactor;
152 $self->write_flag($write_flag) if $write_flag;
153 $self->dbm_package($dbm_package) if $dbm_package;
155 #If user doesn't give a path, we default it to absolute
156 $pathtype ? $self->pathtype($pathtype) : $self->pathtype('absolute');
158 $self->{'_filehandle'} = []; # Array in which to cache SeqIO objects
159 $self->{'_DB'} = {}; # Gets tied to the DBM file
161 # Open database
162 $self->open_dbm() if $filename;
163 return $self;
166 =pod
168 =head2 filename
170 Title : filename
171 Usage : $value = $self->filename();
172 $self->filename($value);
173 Function: Gets or sets the name of the dbm index file.
174 Returns : The current value of filename
175 Args : Value of filename if setting, or none if
176 getting the value.
178 =head2 write_flag
180 Title : write_flag
181 Usage : $value = $self->write_flag();
182 $self->write_flag($value);
183 Function: Gets or sets the value of write_flag, which
184 is whether the dbm file should be opened with
185 write access.
186 Returns : The current value of write_flag (default 0)
187 Args : Value of write_flag if setting, or none if
188 getting the value.
190 =head2 dbm_package
192 Usage : $value = $self->dbm_package();
193 $self->dbm_package($value);
195 Function: Gets or sets the name of the Perl dbm module used.
196 If the value is unset, then it returns the value of
197 the package variable $USE_DBM_TYPE or if that is
198 unset, then it chooses the best available dbm type,
199 choosing 'DB_File' in preference to 'SDBM_File'.
200 Bio::Abstract::Index may work with other dbm file
201 types.
203 Returns : The current value of dbm_package
204 Args : Value of dbm_package if setting, or none if
205 getting the value.
207 =cut
209 sub dbm_package {
210 my( $self, $value ) = @_;
211 my $to_require = 0;
212 if( $value || ! $self->{'_dbm_package'} ) {
213 my $type = $value || $USE_DBM_TYPE || 'DB_File';
214 if( $type =~ /DB_File/i ) {
215 eval {
216 require DB_File;
218 $type = ( $@ ) ? 'SDBM_File' : 'DB_File';
220 if( $type ne 'DB_File' ) {
221 eval { require "$type.pm"; };
222 $self->throw($@) if( $@ );
224 $self->{'_dbm_package'} = $type;
225 if( ! defined $USE_DBM_TYPE ) {
226 $USE_DBM_TYPE = $self->{'_dbm_package'};
229 return $self->{'_dbm_package'};
232 =head2 db
234 Title : db
235 Usage : $index->db
236 Function: Returns a ref to the hash which is tied to the dbm
237 file. Used internally when adding and retrieving
238 data from the database.
239 Example : $db = $index->db();
240 $db->{ $some_key } = $data
241 $data = $index->db->{ $some_key };
242 Returns : ref to HASH
243 Args : NONE
245 =cut
247 sub db {
248 return $_[0]->{'_DB'};
252 =head2 get_stream
254 Title : get_stream
255 Usage : $stream = $index->get_stream( $id );
256 Function: Returns a file handle with the file pointer
257 at the approprite place
259 This provides for a way to get the actual
260 file contents and not an object
262 WARNING: you must parse the record deliminter
263 *yourself*. Abstract won't do this for you
264 So this code
266 $fh = $index->get_stream($myid);
267 while( <$fh> ) {
268 # do something
270 will parse the entire file if you don't put in
271 a last statement in, like
273 while( <$fh> ) {
274 /^\/\// && last; # end of record
275 # do something
278 Returns : A filehandle object
279 Args : string represents the accession number
280 Notes : This method should not be used without forethought
282 =cut
286 sub get_stream {
287 my ($self,$id) = @_;
289 my ($desc,$acc,$out);
290 my $db = $self->db();
292 if (my $rec = $db->{ $id }) {
293 my( @record );
295 my ($file, $begin, $end) = $self->unpack_record( $rec );
297 # Get the (possibly cached) filehandle
298 my $fh = $self->_file_handle( $file );
300 # move to start
301 seek($fh, $begin, 0);
303 return $fh;
304 } else {
305 $self->throw("Unable to find a record for $id in the flat file index");
310 =head2 cachesize
312 Usage : $index->cachesize(1000000)
313 Function: Sets the dbm file cache size for the index.
314 Needs to be set before the DBM file gets opened.
315 Example : $index->cachesize(1000000)
316 Returns : size of the curent cache
318 =cut
320 sub cachesize {
321 my( $self, $size ) = @_;
323 if(defined $size){
324 $self->{'_cachesize'} = $size;
326 return ( $self->{'_cachesize'} );
330 =head2 ffactor
332 Usage : $index->ffactor(1000000)
333 Function: Sets the dbm file fill factor.
334 Needs to be set before the DBM file gets opened.
336 Example : $index->ffactor(1000000)
337 Returns : size of the curent cache
339 =cut
341 sub ffactor {
342 my( $self, $size ) = @_;
344 if(defined $size){
345 $self->{'_ffactor'} = $size;
347 return ( $self->{'_ffactor'} );
351 =head2 open_dbm
353 Usage : $index->open_dbm()
354 Function: Opens the dbm file associated with the index
355 object. Write access is only given if explicitly
356 asked for by calling new(-write => 1) or having set
357 the write_flag(1) on the index object. The type of
358 dbm file opened is that returned by dbm_package().
359 The name of the file to be is opened is obtained by
360 calling the filename() method.
362 Example : $index->_open_dbm()
363 Returns : 1 on success
365 =cut
367 sub open_dbm {
368 my( $self ) = @_;
370 my $filename = $self->filename()
371 or $self->throw("filename() not set");
373 my $db = $self->db();
375 # Close the dbm file if already open (maybe we're getting
376 # or dropping write access
377 if (ref($db) ne 'HASH') {
378 untie($db);
381 # What kind of DBM file are we going to open?
382 my $dbm_type = $self->dbm_package;
384 # Choose mode for opening dbm file (read/write+create or read-only).
385 my $mode_flags = $self->write_flag ? O_RDWR|O_CREAT : O_RDONLY;
387 # Open the dbm file
388 if ($dbm_type eq 'DB_File') {
389 my $hash_inf = DB_File::HASHINFO->new();
390 my $cache = $self->cachesize();
391 my $ffactor = $self->ffactor();
392 if ($cache){
393 $hash_inf->{'cachesize'} = $cache;
395 if ($ffactor){
396 $hash_inf->{'ffactor'} = $ffactor;
398 tie( %$db, $dbm_type, $filename, $mode_flags, 0644, $hash_inf )
399 or $self->throw("Can't open '$dbm_type' dbm file '$filename' : $!");
400 } else {
401 tie( %$db, $dbm_type, $filename, $mode_flags, 0644 )
402 or $self->throw("Can't open '$dbm_type' dbm file '$filename' : $!");
405 # The following methods access data in the dbm file:
407 # Now, if we're a Bio::Index::Abstract caterpillar, then we
408 # transform ourselves into a Bio::Index::<something> butterfly!
409 if( ref($self) eq "Bio::Index::Abstract" ) {
410 my $pkg = $self->_code_base();
411 bless $self, $pkg;
414 # Check or set this is the right kind and version of index
415 $self->_type_and_version();
417 # Check files haven't changed size since they were indexed
418 $self->_check_file_sizes();
420 return 1;
423 =head2 _version
425 Title : _version
426 Usage : $type = $index->_version()
427 Function: Returns a string which identifes the version of an
428 index module. Used to permanently identify an index
429 file as having been created by a particular version
430 of the index module. Must be provided by the sub class
431 Example :
432 Returns :
433 Args : none
435 =cut
437 sub _version {
438 my $self = shift;
439 $self->throw("In Bio::Index::Abstract, no _version method in sub class");
442 =head2 _code_base
444 Title : _code_base
445 Usage : $code = $db->_code_base();
446 Function:
447 Example :
448 Returns : Code package to be used with this
449 Args :
452 =cut
454 sub _code_base {
455 my ($self) = @_;
456 my $code_key = '__TYPE_AND_VERSION';
457 my $record;
459 $record = $self->db->{$code_key};
461 my($code,$version) = $self->unpack_record($record);
462 if( wantarray ) {
463 return ($code,$version);
464 } else {
465 return $code;
470 =head2 _type_and_version
472 Title : _type_and_version
473 Usage : Called by _initalize
474 Function: Checks that the index opened is made by the same index
475 module and version of that module that made it. If the
476 index is empty, then it adds the information to the
477 database.
478 Example :
479 Returns : 1 or exception
480 Args : none
482 =cut
484 sub _type_and_version {
485 my $self = shift;
486 my $key = '__TYPE_AND_VERSION';
487 my $version = $self->_version();
488 my $type = ref $self;
490 # Run check or add type and version key if missing
491 if (my $rec = $self->db->{ $key }) {
492 my( $db_type, $db_version ) = $self->unpack_record($rec);
493 $self->throw("This index file is from version [$db_version] - You need to rebuild it to use module version [$version]")
494 unless $db_version == $version;
495 $self->throw("This index file is type [$db_type] - Can't access it with module for [$type]")
496 unless $db_type eq $type;
497 } else {
498 $self->add_record( $key, $type, $version )
499 or $self->throw("Can't add Type and Version record");
501 return 1;
505 =head2 _check_file_sizes
507 Title : _check_file_sizes
508 Usage : $index->_check_file_sizes()
509 Function: Verifies that the files listed in the database
510 are the same size as when the database was built,
511 or throws an exception. Called by the new()
512 function.
513 Example :
514 Returns : 1 or exception
515 Args :
517 =cut
519 sub _check_file_sizes {
520 my $self = shift;
521 my $num = $self->_file_count() || 0;
523 for (my $i = 0; $i < $num; $i++) {
524 my( $file, $stored_size ) = $self->unpack_record( $self->db->{"__FILE_$i"} );
525 my $size = -s $file;
526 unless ($size == $stored_size) {
527 $self->throw("file $i [ $file ] has changed size $stored_size -> $size. This probably means you need to rebuild the index.");
530 return 1;
534 =head2 make_index
536 Title : make_index
537 Usage : $index->make_index( FILE_LIST )
538 Function: Takes a list of file names, checks that they are
539 all fully qualified, and then calls _filename() on
540 each. It supplies _filename() with the name of the
541 file, and an integer which is stored with each record
542 created by _filename(). Can be called multiple times,
543 and can be used to add to an existing index file.
544 Example : $index->make_index( '/home/seqs1', '/home/seqs2', '/nfs/pub/big_db' );
545 Returns : Number of files indexed
546 Args : LIST OF FILES
548 =cut
550 sub make_index {
551 my($self, @files) = @_;
552 my $count = 0;
553 my $recs = 0;
554 # blow up if write flag is not set. EB fix
556 if( !defined $self->write_flag ) {
557 $self->throw("Attempting to make an index on a read-only database. What about a WRITE flag on opening the index?");
560 # We're really fussy/lazy, expecting all file names to be fully qualified
561 $self->throw("No files to index provided") unless @files;
562 for(my $i=0;$i<scalar @files; $i++) {
563 if( $Bio::Root::IO::FILESPECLOADED && File::Spec->can('rel2abs') ) {
564 if( ! File::Spec->file_name_is_absolute($files[$i])
565 && $self->pathtype() ne 'relative') {
566 $files[$i] = File::Spec->rel2abs($files[$i]);
568 } else {
569 if( $^O =~ /MSWin/i ) {
570 ($files[$i] =~ m|^[A-Za-z]:/|) ||
571 $self->throw("Not an absolute file path '$files[$i]'");
572 } else {
573 ($files[$i] =~ m|^/|) ||
574 $self->throw("Not an absolute file path '$files[$i]'");
577 $self->throw("File does not exist '$files[$i]'") unless -e $files[$i];
580 # Add each file to the index
581 FILE :
582 foreach my $file (@files) {
584 my $i; # index for this file
586 # Get new index for this file and increment file count
587 if ( defined(my $count = $self->_file_count) ) {
588 $i = $count;
589 } else {
590 $i = 0; $self->_file_count(0);
593 # see whether this file has been already indexed
594 my ($record,$number,$size);
596 if( ($record = $self->db->{"__FILENAME_$file"}) ) {
597 ($number,$size) = $self->unpack_record($record);
599 # if it is the same size - fine. Otherwise die
600 if( -s $file == $size ) {
601 $self->warn("File $file already indexed. Skipping...");
602 next FILE;
603 } else {
604 $self->throw("In index, $file has changed size ($size). Indicates that the index is out of date");
608 # index this file
609 $self->debug("Indexing file $file\n");
611 # this is supplied by the subclass and does the serious work
612 $recs += $self->_index_file( $file, $i ); # Specific method for each type of index
614 # Save file name and size for this index
615 $self->add_record("__FILE_$i", $file, -s $file)
616 or $self->throw("Can't add data to file: $file");
617 $self->add_record("__FILENAME_$file", $i, -s $file)
618 or $self->throw("Can't add data to file: $file");
620 # increment file lines
621 $i++; $self->_file_count($i);
622 my $temp;
623 $temp = $self->_file_count();
625 return ($count, $recs);
628 =head2 pathtype
630 Title : pathtype
631 Usage : $index->pathtype($pathtype)
632 Function: Set the type of the file path
633 Only two values are supported, 'relative' or 'absolute'.
634 If the user does not give any value, it is set to
635 absolute by default. Thus it mimics the default
636 behavior of Bio::Index::Abstract module.
637 Example : my $index = Bio::Index::Abstract->(-pathtype => 'relative',
638 -file => $file.inx,
641 $index->pathtype('relative');
642 Returns : Type of the path.
643 Args : String (relative|absolute)
645 =cut
647 sub pathtype {
649 my($self, $type) = @_;
651 if(defined($type)){
652 if($type ne 'absolute' && $type ne 'relative'){
653 $self->throw("Type of path can only be 'relative' or 'absolute', not [$type].");
655 $self->{'_filepathtype'} = $type;
658 return $self->{'_filepathtype'};
662 =head2 _filename
664 Title : _filename
665 Usage : $index->_filename( FILE INT )
666 Function: Indexes the file
667 Example :
668 Returns :
669 Args :
671 =cut
673 sub _index_file {
674 my $self = shift;
676 my $pkg = ref($self);
677 $self->throw("Error: '$pkg' does not provide the _index_file() method");
682 =head2 _file_handle
684 Title : _file_handle
685 Usage : $fh = $index->_file_handle( INT )
686 Function: Returns an open filehandle for the file
687 index INT. On opening a new filehandle it
688 caches it in the @{$index->_filehandle} array.
689 If the requested filehandle is already open,
690 it simply returns it from the array.
691 Example : $first_file_indexed = $index->_file_handle( 0 );
692 Returns : ref to a filehandle
693 Args : INT
695 =cut
697 sub _file_handle {
698 my( $self, $i ) = @_;
700 unless ($self->{'_filehandle'}[$i]) {
701 my @rec = $self->unpack_record($self->db->{"__FILE_$i"})
702 or $self->throw("Can't get filename for index : $i");
703 my $file = $rec[0];
704 open my $fh, '<', $file or $self->throw("Could not read file '$file': $!");
705 $self->{'_filehandle'}[$i] = $fh; # Cache filehandle
707 return $self->{'_filehandle'}[$i];
711 =head2 _file_count
713 Title : _file_count
714 Usage : $index->_file_count( INT )
715 Function: Used by the index building sub in a sub class to
716 track the number of files indexed. Sets or gets
717 the number of files indexed when called with or
718 without an argument.
719 Example :
720 Returns : INT
721 Args : INT
723 =cut
725 sub _file_count {
726 my $self = shift;
727 if (@_) {
728 $self->db->{'__FILE_COUNT'} = shift;
730 return $self->db->{'__FILE_COUNT'};
734 =head2 add_record
736 Title : add_record
737 Usage : $index->add_record( $id, @stuff );
738 Function: Calls pack_record on @stuff, and adds the result
739 of pack_record to the index database under key $id.
740 If $id is a reference to an array, then a new entry
741 is added under a key corresponding to each element
742 of the array.
743 Example : $index->add_record( $id, $fileNumber, $begin, $end )
744 Returns : TRUE on success or FALSE on failure
745 Args : ID LIST
747 =cut
749 sub add_record {
750 my( $self, $id, @rec ) = @_;
751 $self->debug( "Adding key $id\n");
752 if( exists $self->db->{$id} ) {
753 $self->warn("overwriting a current value stored for $id\n");
755 $self->db->{$id} = $self->pack_record( @rec );
756 return 1;
760 =head2 pack_record
762 Title : pack_record
763 Usage : $packed_string = $index->pack_record( LIST )
764 Function: Packs an array of scalars into a single string
765 joined by ASCII 034 (which is unlikely to be used
766 in any of the strings), and returns it.
767 Example : $packed_string = $index->pack_record( $fileNumber, $begin, $end )
768 Returns : STRING or undef
769 Args : LIST
771 =cut
773 sub pack_record {
774 my( $self, @args ) = @_;
775 # Silence undefined warnings
776 @args = map {
777 $_ = (defined $_) ? $_ : '';
778 $_ ;
779 } @args;
780 return join "\034", @args;
783 =head2 unpack_record
785 Title : unpack_record
786 Usage : $index->unpack_record( STRING )
787 Function: Splits the sting provided into an array,
788 splitting on ASCII 034.
789 Example : ( $fileNumber, $begin, $end ) = $index->unpack_record( $self->db->{$id} )
790 Returns : A 3 element ARRAY
791 Args : STRING containing ASCII 034
793 =cut
795 sub unpack_record {
796 my( $self, @args ) = @_;
797 return split /\034/, $args[0];
800 =head2 count_records
802 Title : count_records
803 Usage : $recs = $seqdb->count_records()
804 Function: return count of all recs in the index
805 Example :
806 Returns : a scalar
807 Args : none
810 =cut
812 sub count_records {
813 my ($self,@args) = @_;
814 my $db = $self->db;
815 my $c = 0;
816 while (my($id, $rec) = each %$db) {
817 if( $id =~ /^__/ ) {
818 # internal info
819 next;
821 $c++;
823 return ($c);
827 =head2 DESTROY
829 Title : DESTROY
830 Usage : Called automatically when index goes out of scope
831 Function: Closes connection to database and handles to
832 sequence files
833 Returns : NEVER
834 Args : NONE
837 =cut
839 sub DESTROY {
840 my $self = shift;
841 untie($self->{'_DB'});
842 # An additional undef was the only way to force
843 # the object to drop the open filehandles for ActivePerl
844 undef $self->{'_DB'};