3 # BioPerl module for Bio::Index::Abstract
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Ewan Birney <birney@sanger.ac.uk>
8 # and James Gilbert <jgrg@sanger.ac.uk>
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Index::Abstract - Abstract interface for indexing a flat file
20 You should not be using this module directly
24 To use DB_File and not SDBM for this index, pass the value:
26 -dbm_package => 'DB_File'
32 This object provides the basic mechanism to associate positions
33 in files with names. The position and filenames are stored in DBM
34 which can then be accessed later on. It is the equivalent of flat
35 file indexing (eg, SRS or efetch).
37 This object is the guts to the mechanism, which will be used by the
38 specific objects inheriting from it.
44 User feedback is an integral part of the evolution of this and other
45 Bioperl modules. Send your comments and suggestions preferably to one
46 of the Bioperl mailing lists. Your participation is much appreciated.
48 bioperl-l@bioperl.org - General discussion
49 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53 Please direct usage questions or support issues to the mailing list:
55 I<bioperl-l@bioperl.org>
57 rather than to the module maintainer directly. Many experienced and
58 responsive experts will be able to look at the problem and quickly
59 address it. Please include a thorough description of the problem
60 with code and data examples if at all possible.
64 Report bugs to the Bioperl bug tracking system to help us keep track
65 of the bugs and their resolution. Bug reports can be submitted via the
68 https://github.com/bioperl/bioperl-live/issues
70 =head1 AUTHOR - Ewan Birney, James Gilbert
72 Email - birney@sanger.ac.uk, jgrg@sanger.ac.uk
76 The rest of the documentation details each of the object methods. Internal
77 methods are usually preceded with an "_" (underscore).
82 # Let the code begin...
84 package Bio
::Index
::Abstract
;
87 use Fcntl
qw( O_RDWR O_CREAT O_RDONLY );
88 use vars
qw( $TYPE_AND_VERSION_KEY
89 $USE_DBM_TYPE $DB_HASH );
95 use base qw(Bio::Root::Root);
97 # Generate accessor methods for simple object fields
# One combined getter/setter is produced per name in the list below.
99 foreach my $func (qw(filename write_flag)) {
# The value is kept in the object hash under the name with a leading underscore.
101 my $field = "_$func";
104 my( $self, $value ) = @_;
# Only a defined argument overwrites the stored value, so passing undef never clears it.
106 if (defined $value) {
107 $self->{$field} = $value;
# The stored value is returned whether or not it was just set.
109 return $self->{$field};
116 Usage : $index = Bio::Index::Abstract->new(
117 -filename => $dbm_file,
119 -dbm_package => 'DB_File',
121 Function: Returns a new index object. If filename is
122 specified, then open_dbm() is immediately called.
123 Bio::Index::Abstract->new() will usually be called
124 directly only when opening an existing index.
125 Returns : A new index object
126 Args : -filename The name of the dbm index file.
127 -write_flag TRUE if write access to the dbm file is
129 -dbm_package The Perl dbm module to use for the
131 -verbose Print debugging output to STDERR if
# Build the object through the parent class constructor, then apply the named options.
137 my($class, @args) = @_;
138 my $self = $class->SUPER::new
(@args);
# Extract the recognised named arguments from @args via _rearrange().
139 my( $filename, $write_flag, $dbm_package, $cachesize, $ffactor, $pathtype ) =
140 $self->_rearrange([qw(FILENAME
148 # Store any parameters passed
# Each setter is skipped when its value is false, so 0 and '' are ignored here.
149 $self->filename($filename) if $filename;
150 $self->cachesize($cachesize) if $cachesize;
151 $self->ffactor($ffactor) if $ffactor;
152 $self->write_flag($write_flag) if $write_flag;
153 $self->dbm_package($dbm_package) if $dbm_package;
155 # If the caller does not give a path type, default to 'absolute'
156 $pathtype ?
$self->pathtype($pathtype) : $self->pathtype('absolute');
158 $self->{'_filehandle'} = []; # Array used by _file_handle() to cache opened file handles
159 $self->{'_DB'} = {}; # Gets tied to the DBM file
# The dbm file is opened right away only when a filename was supplied.
162 $self->open_dbm() if $filename;
171 Usage : $value = $self->filename();
172 $self->filename($value);
173 Function: Gets or sets the name of the dbm index file.
174 Returns : The current value of filename
175 Args : Value of filename if setting, or none if
181 Usage : $value = $self->write_flag();
182 $self->write_flag($value);
183 Function: Gets or sets the value of write_flag, which
184 is whether the dbm file should be opened with
186 Returns : The current value of write_flag (default 0)
187 Args : Value of write_flag if setting, or none if
192 Usage : $value = $self->dbm_package();
193 $self->dbm_package($value);
195 Function: Gets or sets the name of the Perl dbm module used.
196 If the value is unset, then it returns the value of
197 the package variable $USE_DBM_TYPE or if that is
198 unset, then it chooses the best available dbm type,
199 choosing 'DB_File' in preference to 'SDBM_File'.
200 Bio::Index::Abstract may work with other dbm file
203 Returns : The current value of dbm_package
204 Args : Value of dbm_package if setting, or none if
210 my( $self, $value ) = @_;
# Resolve the package when one is passed in, or when none has been stored on the object yet.
212 if( $value || ! $self->{'_dbm_package'} ) {
# Preference order: explicit argument, then the package variable $USE_DBM_TYPE, then 'DB_File'.
213 my $type = $value || $USE_DBM_TYPE || 'DB_File';
214 if( $type =~ /DB_File/i ) {
# A non-empty $@ selects SDBM_File instead of DB_File.
# NOTE(review): $@ is presumably set by an attempt to load DB_File just before this line - confirm.
218 $type = ( $@
) ?
'SDBM_File' : 'DB_File';
# Any package other than DB_File is loaded by name; a load failure is re-raised through throw().
220 if( $type ne 'DB_File' ) {
221 eval { require "$type.pm"; };
222 $self->throw($@
) if( $@
);
224 $self->{'_dbm_package'} = $type;
# The first resolved package also becomes the package-level default when none is defined.
225 if( ! defined $USE_DBM_TYPE ) {
226 $USE_DBM_TYPE = $self->{'_dbm_package'};
229 return $self->{'_dbm_package'};
236 Function: Returns a ref to the hash which is tied to the dbm
237 file. Used internally when adding and retrieving
238 data from the database.
239 Example : $db = $index->db();
240 $db->{ $some_key } = $data
241 $data = $index->db->{ $some_key };
242 Returns : ref to HASH
# Returns whatever is stored under '_DB' on the invocant ($_[0]); @_ is not unpacked.
248 return $_[0]->{'_DB'};
255 Usage : $stream = $index->get_stream( $id );
256 Function: Returns a file handle with the file pointer
257 at the appropriate place
259 This provides for a way to get the actual
260 file contents and not an object
262 WARNING: you must parse the record delimiter
263 *yourself*. Abstract won't do this for you
266 $fh = $index->get_stream($myid);
270 will parse the entire file if you don't put in
271 a last statement in, like
274 /^\/\// && last; # end of record
278 Returns : A filehandle object
279 Args : string represents the accession number
280 Notes : This method should not be used without forethought
# NOTE(review): $desc, $acc and $out are not referenced by the statements below - confirm they are still needed.
289 my ($desc,$acc,$out);
290 my $db = $self->db();
# Look the id up in the index hash; an entry that is missing or false does not enter this branch.
292 if (my $rec = $db->{ $id }) {
# The first unpacked field is handed to _file_handle(); the second is used as the seek offset.
295 my ($file, $begin, $end) = $self->unpack_record( $rec );
297 # Get the (possibly cached) filehandle
298 my $fh = $self->_file_handle( $file );
# Whence 0 makes $begin an absolute offset from the start of the file.
301 seek($fh, $begin, 0);
# Error raised for an id that has no usable record (presumably the alternative branch - confirm).
305 $self->throw("Unable to find a record for $id in the flat file index");
312 Usage : $index->cachesize(1000000)
313 Function: Sets the dbm file cache size for the index.
314 Needs to be set before the DBM file gets opened.
315 Example : $index->cachesize(1000000)
316 Returns : size of the current cache
321 my( $self, $size ) = @_;
# The value is kept under '_cachesize'; open_dbm() reads it back through this accessor.
324 $self->{'_cachesize'} = $size;
# Parenthesised return: a one-element list in list context, the stored value in scalar context.
326 return ( $self->{'_cachesize'} );
332 Usage : $index->ffactor(1000000)
333 Function: Sets the dbm file fill factor.
334 Needs to be set before the DBM file gets opened.
336 Example : $index->ffactor(1000000)
337 Returns : the current fill factor
342 my( $self, $size ) = @_;
# The value is kept under '_ffactor'; open_dbm() reads it back through this accessor.
345 $self->{'_ffactor'} = $size;
# Parenthesised return: a one-element list in list context, the stored value in scalar context.
347 return ( $self->{'_ffactor'} );
353 Usage : $index->open_dbm()
354 Function: Opens the dbm file associated with the index
355 object. Write access is only given if explicitly
356 asked for by calling new(-write_flag => 1) or having set
357 the write_flag(1) on the index object. The type of
358 dbm file opened is that returned by dbm_package().
359 The name of the file to be opened is obtained by
360 calling the filename() method.
362 Example : $index->open_dbm()
363 Returns : 1 on success
# A filename is mandatory; an unset or false filename() raises an exception.
370 my $filename = $self->filename()
371 or $self->throw("filename() not set");
# $db is the hash reference that tie() binds to the dbm file below.
373 my $db = $self->db();
375 # Close the dbm file if already open (maybe we're getting
376 # or dropping write access)
377 if (ref($db) ne 'HASH') {
381 # What kind of DBM file are we going to open?
382 my $dbm_type = $self->dbm_package;
384 # Choose mode for opening dbm file (read/write+create or read-only).
385 my $mode_flags = $self->write_flag ? O_RDWR
|O_CREAT
: O_RDONLY
;
# DB_File is tied with an extra HASHINFO object carrying the cache size and fill factor settings.
388 if ($dbm_type eq 'DB_File') {
389 my $hash_inf = DB_File
::HASHINFO
->new();
390 my $cache = $self->cachesize();
391 my $ffactor = $self->ffactor();
393 $hash_inf->{'cachesize'} = $cache;
396 $hash_inf->{'ffactor'} = $ffactor;
# 0644 is the file permission mode handed to the dbm package; a failed tie() raises an exception with $!.
398 tie
( %$db, $dbm_type, $filename, $mode_flags, 0644, $hash_inf )
399 or $self->throw("Can't open '$dbm_type' dbm file '$filename' : $!");
# Same tie() call without the HASHINFO argument.
401 tie
( %$db, $dbm_type, $filename, $mode_flags, 0644 )
402 or $self->throw("Can't open '$dbm_type' dbm file '$filename' : $!");
405 # The following methods access data in the dbm file:
407 # Now, if we're a Bio::Index::Abstract caterpillar, then we
408 # transform ourselves into a Bio::Index::<something> butterfly!
# Applies only when the object is blessed into exactly Bio::Index::Abstract, not into a subclass.
409 if( ref($self) eq "Bio::Index::Abstract" ) {
410 my $pkg = $self->_code_base();
414 # Check or set this is the right kind and version of index
415 $self->_type_and_version();
417 # Check files haven't changed size since they were indexed
418 $self->_check_file_sizes();
426 Usage : $type = $index->_version()
427 Function: Returns a string which identifies the version of an
428 index module. Used to permanently identify an index
429 file as having been created by a particular version
430 of the index module. Must be provided by the sub class
# Base-class placeholder: raises an exception stating that the subclass has not supplied _version().
439 $self->throw("In Bio::Index::Abstract, no _version method in sub class");
445 Usage : $code = $db->_code_base();
448 Returns : Code package to be used with this
# Same key string under which _type_and_version() stores the type/version record.
456 my $code_key = '__TYPE_AND_VERSION';
459 $record = $self->db->{$code_key};
# The packed record is split into two fields: the code (package) name and the version.
461 my($code,$version) = $self->unpack_record($record);
# Returned as a two-element list.
# NOTE(review): a caller in scalar context (e.g. "my $pkg = $self->_code_base();") receives the
# last element, $version, not $code - confirm that callers use list context where $code is wanted.
463 return ($code,$version);
470 =head2 _type_and_version
472 Title : _type_and_version
473 Usage : Called by open_dbm()
474 Function: Checks that the index opened is made by the same index
475 module and version of that module that made it. If the
476 index is empty, then it adds the information to the
479 Returns : 1 or exception
# Verifies the type/version record stored in the index against this object, or stores one.
484 sub _type_and_version
{
486 my $key = '__TYPE_AND_VERSION';
# The expected version comes from _version(); the expected type is the class of this object.
487 my $version = $self->_version();
488 my $type = ref $self;
490 # Run check or add type and version key if missing
491 if (my $rec = $self->db->{ $key }) {
492 my( $db_type, $db_version ) = $self->unpack_record($rec);
# The type must match exactly (string comparison).
493 $self->throw("This index file is type [$db_type] - Can't access it with module for [$type]")
494 unless $db_type eq $type;
# Versions are compared numerically (==), so e.g. '1.0' and '1' count as equal.
495 $self->throw("This index file is from version [$db_version] - You need to rebuild it to use module version [$version]")
496 unless $db_version == $version;
# Store the type/version record; a false return from add_record() raises an exception.
498 $self->add_record( $key, $type, $version )
499 or $self->throw("Can't add Type and Version record");
505 =head2 _check_file_sizes
507 Title : _check_file_sizes
508 Usage : $index->_check_file_sizes()
509 Function: Verifies that the files listed in the database
510 are the same size as when the database was built,
511 or throws an exception. Called by the new()
514 Returns : 1 or exception
# Compares the size recorded for each indexed file with $size and throws on a mismatch.
519 sub _check_file_sizes
{
# A false or unset file count is treated as zero, so the loop body is then never entered.
521 my $num = $self->_file_count() || 0;
523 for (my $i = 0; $i < $num; $i++) {
# Each "__FILE_<n>" record unpacks to the file name and the size stored with it.
524 my( $file, $stored_size ) = $self->unpack_record( $self->db->{"__FILE_$i"} );
# NOTE(review): $size is presumably the current size of $file - confirm where it is assigned.
526 unless ($size == $stored_size) {
527 $self->throw("file $i [ $file ] has changed size $stored_size -> $size. This probably means you need to rebuild the index.");
537 Usage : $index->make_index( FILE_LIST )
538 Function: Takes a list of file names, checks that they are
539 all fully qualified, and then calls _index_file() on
540 each. It supplies _index_file() with the name of the
541 file, and an integer which is stored with each record
542 created by _index_file(). Can be called multiple times,
543 and can be used to add to an existing index file.
544 Example : $index->make_index( '/home/seqs1', '/home/seqs2', '/nfs/pub/big_db' );
545 Returns : Number of files indexed
551 my($self, @files) = @_;
554 # blow up if write flag is not set. EB fix
# Only an undefined write flag is rejected; a defined but false value (e.g. 0) passes this check.
556 if( !defined $self->write_flag ) {
557 $self->throw("Attempting to make an index on a read-only database. What about a WRITE flag on opening the index?");
560 # We're really fussy/lazy, expecting all file names to be fully qualified
561 $self->throw("No files to index provided") unless @files;
562 for(my $i=0;$i<scalar @files; $i++) {
# When File::Spec offers rel2abs, non-absolute names are made absolute unless pathtype() is 'relative'.
563 if( $Bio::Root
::IO
::FILESPECLOADED
&& File
::Spec
->can('rel2abs') ) {
564 if( ! File
::Spec
->file_name_is_absolute($files[$i])
565 && $self->pathtype() ne 'relative') {
566 $files[$i] = File
::Spec
->rel2abs($files[$i]);
# Pattern-based absolute-path check: a drive-letter prefix on Windows, a leading slash elsewhere.
569 if( $^O
=~ /MSWin/i ) {
570 ($files[$i] =~ m
|^[A
-Za
-z
]:/|) ||
571 $self->throw("Not an absolute file path '$files[$i]'");
573 ($files[$i] =~ m
|^/|) ||
574 $self->throw("Not an absolute file path '$files[$i]'");
# Every listed file has to exist before any indexing starts.
577 $self->throw("File does not exist '$files[$i]'") unless -e
$files[$i];
580 # Add each file to the index
582 foreach my $file (@files) {
584 my $i; # index for this file
586 # Get new index for this file and increment file count
# The $count declared inside this condition is scoped to the if statement only.
587 if ( defined(my $count = $self->_file_count) ) {
590 $i = 0; $self->_file_count(0);
593 # see whether this file has been already indexed
594 my ($record,$number,$size);
# "__FILENAME_<name>" records hold the file's index number and the size stored at indexing time.
596 if( ($record = $self->db->{"__FILENAME_$file"}) ) {
597 ($number,$size) = $self->unpack_record($record);
599 # if it is the same size - fine. Otherwise die
600 if( -s
$file == $size ) {
601 $self->warn("File $file already indexed. Skipping...");
604 $self->throw("In index, $file has changed size ($size). Indicates that the index is out of date");
609 $self->debug("Indexing file $file\n");
611 # this is supplied by the subclass and does the serious work
612 $recs += $self->_index_file( $file, $i ); # Specific method for each type of index
614 # Save file name and size for this index
# Two records per file: "__FILE_<n>" maps index to name/size, "__FILENAME_<name>" maps name to index/size.
615 $self->add_record("__FILE_$i", $file, -s
$file)
616 or $self->throw("Can't add data to file: $file");
617 $self->add_record("__FILENAME_$file", $i, -s
$file)
618 or $self->throw("Can't add data to file: $file");
620 # increment the file count
621 $i++; $self->_file_count($i);
623 $temp = $self->_file_count();
# Returns a two-element list: $count and the record total accumulated in $recs.
# NOTE(review): this $count is not the one declared in the if condition above (that one has
# gone out of scope) - confirm which value is meant to be returned.
625 return ($count, $recs);
631 Usage : $index->pathtype($pathtype)
632 Function: Set the type of the file path
633 Only two values are supported, 'relative' or 'absolute'.
634 If the user does not give any value, it is set to
635 absolute by default. Thus it mimics the default
636 behavior of Bio::Index::Abstract module.
637 Example : my $index = Bio::Index::Abstract->new(-pathtype => 'relative',
641 $index->pathtype('relative');
642 Returns : Type of the path.
643 Args : String (relative|absolute)
649 my($self, $type) = @_;
# Only the exact strings 'absolute' and 'relative' are accepted; anything else raises an exception.
652 if($type ne 'absolute' && $type ne 'relative'){
653 $self->throw("Type of path can only be 'relative' or 'absolute', not [$type].");
# The accepted value is kept under '_filepathtype'.
655 $self->{'_filepathtype'} = $type;
658 return $self->{'_filepathtype'};
665 Usage : $index->_index_file( FILE, INT )
666 Function: Indexes the file
# Base-class placeholder: reports the object's class and raises an exception, because the
# subclass is expected to supply _index_file().
676 my $pkg = ref($self);
677 $self->throw("Error: '$pkg' does not provide the _index_file() method");
685 Usage : $fh = $index->_file_handle( INT )
686 Function: Returns an open filehandle for the file
687 index INT. On opening a new filehandle it
688 caches it in the @{$index->{'_filehandle'}} array.
689 If the requested filehandle is already open,
690 it simply returns it from the array.
691 Example : $first_file_indexed = $index->_file_handle( 0 );
692 Returns : ref to a filehandle
698 my( $self, $i ) = @_;
# Open the file only when no handle is cached for this index number yet.
700 unless ($self->{'_filehandle'}[$i]) {
# The "__FILE_<n>" record is unpacked into @rec; an empty result raises an exception.
701 my @rec = $self->unpack_record($self->db->{"__FILE_$i"})
702 or $self->throw("Can't get filename for index : $i");
# Read-only, three-argument open on a lexical handle; a failure raises an exception with $!.
# NOTE(review): $file is presumably the first field of @rec - confirm where it is assigned.
704 open my $fh, '<', $file or $self->throw("Could not read file '$file': $!");
705 $self->{'_filehandle'}[$i] = $fh; # Cache filehandle
# The cached handle is returned on every call.
707 return $self->{'_filehandle'}[$i];
714 Usage : $index->_file_count( INT )
715 Function: Used by the index building sub in a sub class to
716 track the number of files indexed. Sets or gets
717 the number of files indexed when called with or
# The count lives in the index hash itself under the key '__FILE_COUNT'.
# The next argument taken from @_ is stored as the new count.
728 $self->db->{'__FILE_COUNT'} = shift;
# Returns the value currently stored under that key.
730 return $self->db->{'__FILE_COUNT'};
737 Usage : $index->add_record( $id, @stuff );
738 Function: Calls pack_record on @stuff, and adds the result
739 of pack_record to the index database under key $id.
740 If $id is a reference to an array, then a new entry
741 is added under a key corresponding to each element
743 Example : $index->add_record( $id, $fileNumber, $begin, $end )
744 Returns : TRUE on success or FALSE on failure
750 my( $self, $id, @rec ) = @_;
751 $self->debug( "Adding key $id\n");
# An existing entry is not protected: a warning is issued and the value is then replaced.
752 if( exists $self->db->{$id} ) {
753 $self->warn("overwriting a current value stored for $id\n");
# The fields in @rec are packed into one string by pack_record() before being stored.
755 $self->db->{$id} = $self->pack_record( @rec );
763 Usage : $packed_string = $index->pack_record( LIST )
764 Function: Packs an array of scalars into a single string
765 joined by ASCII 034 (which is unlikely to be used
766 in any of the strings), and returns it.
767 Example : $packed_string = $index->pack_record( $fileNumber, $begin, $end )
768 Returns : STRING or undef
774 my( $self, @args ) = @_;
775 # Silence undefined warnings
# An undefined value in $_ is replaced by the empty string.
777 $_ = (defined $_) ?
$_ : '';
# Fields are joined with the single character "\034" (octal escape), the separator
# that unpack_record() splits on.
780 return join "\034", @args;
785 Title : unpack_record
786 Usage : $index->unpack_record( STRING )
787 Function: Splits the string provided into an array,
788 splitting on ASCII 034.
789 Example : ( $fileNumber, $begin, $end ) = $index->unpack_record( $self->db->{$id} )
790 Returns : A list of the fields that were packed into STRING
791 Args : STRING containing ASCII 034
796 my( $self, @args ) = @_;
# Only the first argument is split; the separator is the character "\034" (octal escape).
# The number of fields returned depends on the packed string.
797 return split /\034/, $args[0];
802 Title : count_records
803 Usage : $recs = $seqdb->count_records()
804 Function: return count of all recs in the index
# @args is accepted but not referenced by the statements here.
813 my ($self,@args) = @_;
# Walk every key/value pair of the hash referenced by $db using each().
816 while (my($id, $rec) = each %$db) {
830 Usage : Called automatically when index goes out of scope
831 Function: Closes connection to database and handles to
# NOTE(review): untie() is given the scalar slot $self->{'_DB'}, whereas open_dbm() ties the
# hash it refers to (%$db) - confirm this releases the tie as intended.
841 untie($self->{'_DB'});
842 # An additional undef was the only way to force
843 # the object to drop the open filehandles for ActivePerl
844 undef $self->{'_DB'};