lib/Bio/DB/IndexedBase.pm

   1 #
   2 # BioPerl module for Bio::DB::IndexedBase
   3 #
   4 # You may distribute this module under the same terms as perl itself
   5 #
   6
   7 =head1 NAME
   8
   9 Bio::DB::IndexedBase - Base class for modules using indexed sequence files
  10
  11 =head1 SYNOPSIS
  12
  13   use Bio::DB::XXX; # a made-up class that uses Bio::DB::IndexedBase
  14
  15   # 1/ Bio::SeqIO-style access
  16
  17   # Index some sequence files
  18   my $db = Bio::DB::XXX->new('/path/to/file');    # from a single file
  19   my $db = Bio::DB::XXX->new(['file1', 'file2']); # from multiple files
  20   my $db = Bio::DB::XXX->new('/path/to/files/');  # from a directory
  21
  22   # Get IDs of all the sequences in the database
  23   my @ids = $db->get_all_primary_ids;
  24
  25   # Get a specific sequence
  26   my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
  27
  28   # Loop through all sequences
  29   my $stream = $db->get_PrimarySeq_stream;
  30   while (my $seq = $stream->next_seq) {
  31     # Do something...
  32   }
  33
  34
  35   # 2/ Access via filehandle
  36   my $fh = Bio::DB::XXX->newFh('/path/to/file');
  37   while (my $seq = <$fh>) {
  38     # Do something...
  39   }
  40
  41
  42   # 3/ Tied-hash access
  43   tie %sequences, 'Bio::DB::XXX', '/path/to/file';
  44   print $sequences{'CHROMOSOME_I:1,20000'};
  45
  46 =head1 DESCRIPTION
  47
  48 Bio::DB::IndexedBase provides a base class for modules that want to index
  49 and read sequence files and provides persistent, random access to each sequence
  50 entry, without bringing the entire file into memory. This module is compliant
  51 with the Bio::DB::SeqI interface. Bio::DB::Fasta and Bio::DB::Qual both use
  52 Bio::DB::IndexedBase.
  53
  54 When you initialize the module, you point it at a single file, several files, or
  55 a directory of files. The first time it is run, the module generates an index
  56 of the content of the files using the AnyDBM_File module (BerkeleyDB preferred,
  57 followed by GDBM_File, NDBM_File, and SDBM_File). Subsequently, it uses the
  58 index file to find the sequence file and offset for any requested sequence. If
  59 one of the source files is updated, the module reindexes just that one file. You
  60 can also force reindexing manually at any time. For improved performance, the
  61 module keeps a cache of open filehandles, closing less-recently used ones when
  62 the cache is full.
  63
  64 Entries may have any line length up to 65,536 characters, and different line
  65 lengths are allowed in the same file.  However, within a sequence entry, all
  66 lines must be the same length except for the last. An error will be thrown if
  67 this is not the case!
  68
  69 This module was developed for use with the C. elegans and human genomes, and has
  70 been tested with sequence segments as large as 20 megabases. Indexing the C.
  71 elegans genome (100 megabases of genomic sequence plus 100,000 ESTs) takes ~5
  72 minutes on my 300 MHz pentium laptop. On the same system, average access time
  73 for any 200-mer within the C. elegans genome was E<lt>0.02s.
  74
  75 =head1 DATABASE CREATION AND INDEXING
  76
  77 The two constructors for this class are new() and newFh(). The former creates a
  78 Bio::DB::IndexedBase object which is accessed via method calls. The latter
  79 creates a tied filehandle which can be used Bio::SeqIO style to fetch sequence
  80 objects in a stream fashion. There is also a tied hash interface.
  81
  82 =over
  83
  84 =item $db = Bio::DB::IndexedBase-E<gt>new($path [,%options])
  85
  86 Create a new Bio::DB::IndexedBase object from the files designated by $path.
  87 $path may be a single file, an arrayref of files, or a directory containing
  88 such files.
  89
  90 After the database is created, you can use methods like get_all_primary_ids()
  91 and get_Seq_by_id() to retrieve sequence objects.
  92
  93 =item $fh = Bio::DB::IndexedBase-E<gt>newFh($path [,%options])
  94
  95 Create a tied filehandle opened on a Bio::DB::IndexedBase object. Reading
  96 from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
  97 Bio::SeqIO style. The path and the options should be specified as for new().
  98
  99 =item $obj = tie %db,'Bio::DB::IndexedBase', '/path/to/file' [,@args]
 100
 101 Create a tied-hash by tieing %db to Bio::DB::IndexedBase using the indicated
 102 path to the files. The optional @args list is the same set used by new(). If
 103 successful, tie() returns the tied object, undef otherwise.
 104
 105 Once tied, you can use the hash to retrieve an individual sequence by
 106 its ID, like this:
 107
 108   my $seq = $db{CHROMOSOME_I};
 109
 110 The keys() and values() functions will return the sequence IDs and their
 111 sequences, respectively.  In addition, each() can be used to iterate over the
 112 entire data set:
 113
 114  while (my ($id,$sequence) = each %db) {
 115     print "$id => $sequence\n";
 116  }
 117
 118
 119 When dealing with very large sequences, you can avoid bringing them into memory
 120 by calling each() in a scalar context.  This returns the key only.  You can then
 121 use tied(%db) to recover the Bio::DB::IndexedBase object and call its methods.
 122
 123  while (my $id = each %db) {
 124     print "$id: " . $db{"$id:1,100"} . "\n";
 125     print "$id: ".tied(%db)->length($id)."\n";
 126  }
 127
 128 In addition, you may invoke the FIRSTKEY and NEXTKEY tied hash methods directly
 129 to retrieve the first and next ID in the database, respectively. This allows one to
 130 write the following iterative loop using just the object-oriented interface:
 131
 132  my $db = Bio::DB::IndexedBase->new('/path/to/file');
 133  for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
 134     # do something with sequence
 135  }
 136
 137 =back
 138
 139 =head1 INDEX CONTENT
 140
 141 Several attributes of each sequence are stored in the index file. Given a
 142 sequence ID, these attributes can be retrieved using the following methods:
 143
 144 =over
 145
 146 =item offset($id)
 147
 148 Get the offset of the indicated sequence from the beginning of the file in which
 149 it is located. The offset points to the beginning of the sequence, not the
 150 beginning of the header line.
 151
 152 =item strlen($id)
 153
 154 Get the number of characters in the sequence string.
 155
 156 =item length($id)
 157
 158 Get the number of residues of the sequence.
 159
 160 =item linelen($id)
 161
 162 Get the length of the line for this sequence. If the sequence is wrapped, then
 163 linelen() is likely to be much shorter than strlen().
 164
 165 =item headerlen($id)
 166
 167 Get the length of the header line for the indicated sequence.
 168
 169 =item header_offset($id)
 170
 171 Get the offset of the header line for the indicated sequence from the beginning
 172 of the file in which it is located. This attribute is not stored. It is
 173 calculated from offset() and headerlen().
 174
 175 =item alphabet($id)
 176
 177 Get the molecular type (alphabet) of the indicated sequence. This method handles
 178 residues according to the IUPAC convention.
 179
 180 =item file($id)
 181
 182 Get the name of the file in which the indicated sequence can be found.
 183
 184 =back
 185
 186 =head1 INTERFACE COMPLIANCE NOTES
 187
 188 Bio::DB::IndexedBase is compliant with the Bio::DB::SeqI and hence with the
 189 Bio::RandomAccessI interfaces.
 190
 191 Databases do not necessarily provide any meaningful internal primary ID for the
 192 sequences they store. However, Bio::DB::IndexedBase's internal primary IDs are
 193 the IDs of the sequences. This means that the same ID passed to get_Seq_by_id()
 194 and get_Seq_by_primary_id() will return the same sequence.
 195
 196 Since this database index has no notion of sequence version or namespace, the
 197 get_Seq_by_id(), get_Seq_by_acc() and get_Seq_by_version() are identical.
 198
 199 =head1 BUGS
 200
 201 When a sequence is deleted from one of the files, this deletion is not detected
 202 by the module and removed from the index. As a result, a "ghost" entry will
 203 remain in the index and will return garbage results if accessed.
 204
 205 Also, if you are indexing a directory, it is wise to not add or remove files
 206 from it.
 207
 208 In case you have changed the files in a directory, or the sequences in a file,
 209 you can rebuild the entire index, either by deleting it manually, or by
 210 passing -reindex=E<gt>1 to new() when initializing the module.
 211
 212 =head1 SEE ALSO
 213
 214 L<DB_File>
 215
 216 L<Bio::DB::Fasta>
 217
 218 L<Bio::DB::Qual>
 219
 220 =head1 AUTHOR
 221
 222 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
 223
 224 Copyright (c) 2001 Cold Spring Harbor Laboratory.
 225
 226 Florent Angly (for the modularization)
 227
 228 This library is free software; you can redistribute it and/or modify
 229 it under the same terms as Perl itself.  See DISCLAIMER.txt for
 230 disclaimers of warranty.
 231
 232 =head1 APPENDIX
 233
 234 The rest of the documentation details each of the object
 235 methods. Internal methods are usually preceded with a _
 236
 237 =cut
 238
 239
 240 package Bio::DB::IndexedBase;
 241
 242 BEGIN {
 243     @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
 244         if(!$INC{'AnyDBM_File.pm'});
 245     # Remove carriage returns (\r) and newlines (\n) from a string.  When
 246     # called from subseq, this can take a signficiant portion of time, in
 247     # Variant Effect Prediction. Therefore we compile the match portion.
 248
 249     eval 'require Inline::C';
 250     if ( $INC{'Inline/C.pm'} ) {
 251         # C can do _strip_crnl much faster. But this requires the
 252         # Inline::C module which we don't require people to have. So we make
 253         # this optional by wrapping the C code in an eval. If the eval works,
 254         # the Perl strip_crnl() function is overwritten.
 255         Inline->bind(
 256             C => q(
 257         /*
 258         Strip all newlines (\n) and carriage returns (\r) from the string
 259         */
 260         char* _strip_crnl(char* str) {
 261           char *s;
 262           char *s2 = str;
 263           for (s = str; *s; *s++) {
 264             if (*s != '\n' && *s != '\r') {
 265               *s2++ = *s;
 266             }
 267           }
 268           *s2 = '\0';
 269           return str;
 270         }
 271         )
 272         );
 273     } else {
 274         # "tr" is much faster than the regex, with "s"
 275         *Bio::DB::IndexedBase::_strip_crnl = sub {
 276             my $str = shift;
 277             $str =~ tr/\n\r//d;
 278             return $str;
 279         };
 280     }
 281 }
 282
 283 use strict;
 284 use warnings;
 285 use IO::File;
 286 use AnyDBM_File;
 287 use Fcntl;
 288 use File::Spec;
 289 use File::Basename qw(basename dirname);
 290 use Bio::PrimarySeq;
 291
 292 use base qw(Bio::DB::SeqI);
 293
 294 # Store offset, strlen, number of residues, linelen, headerlen, type and fileno
 295 use constant STRUCT    => 'NNNnnCa*'; # 32-bit file offset and seq length
 296 use constant STRUCTBIG => 'QQQnnCa*'; # 64-bit
 297
 298 use constant NA        => 0;
 299 use constant DNA       => 1;
 300 use constant RNA       => 2;
 301 use constant PROTEIN   => 3;
 302
 303 # You can avoid dying if you want but you may get incorrect results
 304 use constant DIE_ON_MISSMATCHED_LINES => 1;
 305
 306 =head2 new
 307
 308  Title   : new
 309  Usage   : my $db = Bio::DB::IndexedBase->new($path, -reindex => 1);
 310  Function: Initialize a new database object
 311  Returns : A Bio::DB::IndexedBase object
 312  Args    : A single file, or path to dir, or arrayref of files
 313            Optional arguments:
 314
 315  Option        Description                                         Default
 316  -----------   -----------                                         -------
 317  -glob         Glob expression to search for files in directories  *
 318  -makeid       A code subroutine for transforming IDs              None
 319  -maxopen      Maximum size of filehandle cache                    32
 320  -debug        Turn on status messages                             0
 321  -reindex      Force the index to be rebuilt                       0
 322  -dbmargs      Additional arguments to pass to the DBM routine     None
 323  -index_name   Name of the file that will hold the indices
 324  -clean        Remove the index file when finished                 0
 325
 326 The -dbmargs option can be used to control the format of the index. For example,
 327 you can pass $DB_BTREE to this argument so as to force the IDs to be sorted and
 328 retrieved alphabetically. Note that you must use the same arguments every time
 329 you open the index!
 330
 331 The -makeid option gives you a chance to modify sequence IDs during indexing.
 332 For example, you may wish to extract a portion of the gi|gb|abc|xyz nonsense
 333 that GenBank Fasta files use. The original header line can be recovered later.
 334 The option value for -makeid should be a code reference that takes a scalar
 335 argument (the full header line) and returns a scalar or an array of scalars (the
 336 ID or IDs you want to assign). For example:
 337
 338   $db = Bio::DB::IndexedBase->new('file.fa', -makeid => \&extract_gi);
 339
 340   sub extract_gi {
 341       # Extract GI from GenBank
 342       my $header = shift;
 343       my ($id) = ($header =~ /gi\|(\d+)/m);
 344       return $id || '';
 345   }
 346
 347 extract_gi() will be called with the full header line, e.g. a Fasta line would
 348 include the "E<gt>", the ID and the description:
 349
 350  >gi|352962132|ref|NG_030353.1| Homo sapiens sal-like 3 (Drosophila) (SALL3)
 351
 352 In the database, this sequence can now be retrieved by its GI instead of its
 353 complete ID:
 354
 355  my $seq = $db->get_Seq_by_id(352962132);
 356
 357 The -makeid option is ignored after the index is constructed.
 358
 359 =cut
 360
 361 sub new {
 362     my ($class, $path, %opts) = @_;
 363
 364     my $self = bless {
 365         debug       => $opts{-debug}   || 0,
 366         makeid      => $opts{-makeid},
 367         glob        => $opts{-glob}    || eval '$'.$class.'::file_glob' || '*',
 368         maxopen     => $opts{-maxopen} || 32,
 369         clean       => $opts{-clean}   || 0,
 370         dbmargs     => $opts{-dbmargs} || undef,
 371         fhcache     => {},
 372         cacheseq    => {},
 373         curopen     => 0,
 374         openseq     => 1,
 375         dirname     => undef,
 376         offsets     => undef,
 377         index_name  => $opts{-index_name},
 378         obj_class   => eval '$'.$class.'::obj_class',
 379         offset_meth => \&{$class.'::_calculate_offsets'},
 380         fileno2path => [],
 381         filepath2no => {},
 382     }, $class;
 383
 384     my ($offsets, $dirname);
 385     my $ref = ref $path || '';
 386     if ( $ref eq 'ARRAY' ) {
 387         $offsets = $self->index_files($path, $opts{-reindex});
 388         require Cwd;
 389         $dirname = Cwd::getcwd();
 390     } else {
 391   $self->{index_name} ||= $self->_default_index_name($path);
 392         if (-d $path) {
 393             # because Win32 glob() is broken with respect to long file names
 394             # that contain whitespace.
 395             $path = Win32::GetShortPathName($path)
 396                 if $^O =~ /^MSWin/i && eval 'use Win32; 1';
 397             $offsets = $self->index_dir($path, $opts{-reindex});
 398             $dirname = $path;
 399         } elsif (-f _) {
 400             $offsets = $self->index_file($path, $opts{-reindex});
 401             $dirname = dirname($path);
 402         } else {
 403             $self->throw( "No file or directory called '$path'");
 404         }
 405     }
 406     @{$self}{qw(dirname offsets)} = ($dirname, $offsets);
 407
 408     return $self;
 409 }
 410
 411
 412 =head2 newFh
 413
 414  Title   : newFh
 415  Usage   : my $fh = Bio::DB::IndexedBase->newFh('/path/to/files/', %options);
 416  Function: Index and get a new Fh for a single file, several files or a directory
 417  Returns : Filehandle object
 418  Args    : Same as new()
 419
 420 =cut
 421
 422 sub newFh {
 423     my ($class, @args) = @_;
 424     my $self = $class->new(@args);
 425     require Symbol;
 426     my $fh = Symbol::gensym;
 427     tie $$fh, 'Bio::DB::Indexed::Stream', $self
 428         or $self->throw("Could not tie filehandle: $!");
 429     return $fh;
 430 }
 431
 432
 433 =head2 dbmargs
 434
 435  Title   : dbmargs
 436  Usage   : my @args = $db->dbmargs;
 437  Function: Get stored dbm arguments
 438  Returns : Array
 439  Args    : None
 440
 441 =cut
 442
 443 sub dbmargs {
 444     my $self = shift;
 445     my $args = $self->{dbmargs} or return;
 446     return ref($args) eq 'ARRAY' ? @$args : $args;
 447 }
 448
 449
 450 =head2 glob
 451
 452  Title   : glob
 453  Usage   : my $glob = $db->glob;
 454  Function: Get the expression used to match files in directories
 455  Returns : String
 456  Args    : None
 457
 458 =cut
 459
 460 sub glob {
 461     my $self = shift;
 462     return $self->{glob};
 463 }
 464
 465
 466 =head2 index_dir
 467
 468  Title   : index_dir
 469  Usage   : $db->index_dir($dir);
 470  Function: Index the files that match -glob in the given directory
 471  Returns : Hashref of offsets
 472  Args    : Dirname
 473            Boolean to force reindexing of the directory
 474
 475 =cut
 476
 477 sub index_dir {
 478     my ($self, $dir, $force_reindex) = @_;
 479     my @files = glob( File::Spec->catfile($dir, $self->{glob}) );
 480     return if scalar @files == 0;
 481     $self->{index_name} ||= $self->_default_index_name($dir);
 482     my $offsets = $self->_index_files(\@files, $force_reindex);
 483     return $offsets;
 484 }
 485
 486
 487 =head2 get_all_primary_ids
 488
 489  Title   : get_all_primary_ids, get_all_ids, ids
 490  Usage   : my @ids = $db->get_all_primary_ids;
 491  Function: Get the IDs stored in all indexes. This is a Bio::DB::SeqI method
 492            implementation. Note that in this implementation, the internal
 493            database primary IDs are also the sequence IDs.
 494  Returns : List of ids
 495  Args    : None
 496
 497 =cut
 498
 499 sub get_all_primary_ids  {
 500     return keys %{shift->{offsets}};
 501 }
 502
 503 {
 504 no warnings 'once';
 505 *ids = *get_all_ids = \&get_all_primary_ids;
 506 }
 507
 508
 509 =head2 index_file
 510
 511  Title   : index_file
 512  Usage   : $db->index_file($filename);
 513  Function: Index the given file
 514  Returns : Hashref of offsets
 515  Args    : Filename
 516            Boolean to force reindexing the file
 517
 518 =cut
 519
 520 sub index_file {
 521     my ($self, $file, $force_reindex) = @_;
 522     $self->{index_name} ||= $self->_default_index_name($file);
 523     my $offsets = $self->_index_files([$file], $force_reindex);
 524     return $offsets;
 525 }
 526
 527 sub _default_index_name {
 528     my ($self,$path) = @_;
 529     return File::Spec->catfile($path,'directory.index') if -d $path;
 530     return "$path.index";
 531 }
 532
 533 =head2 index_files
 534
 535  Title   : index_files
 536  Usage   : $db->index_files(\@files);
 537  Function: Index the given files
 538  Returns : Hashref of offsets
 539  Args    : Arrayref of filenames
 540            Boolean to force reindexing the files
 541
 542 =cut
 543
 544 sub index_files {
 545     my ($self, $files, $force_reindex) = @_;
 546     my @paths = map { File::Spec->rel2abs($_) } @$files;
 547     require Digest::MD5;
 548     my $digest = Digest::MD5::md5_hex( join('', sort @paths) );
 549     $self->{index_name} ||= "fileset_$digest.index"; # unique name for the given files
 550     my $offsets = $self->_index_files($files, $force_reindex);
 551     return $offsets;
 552 }
 553
 554
 555 =head2 index_name
 556
 557  Title   : index_name
 558  Usage   : my $indexname = $db->index_name;
 559  Function: Get the full name of the index file
 560  Returns : String
 561  Args    : None
 562
 563 =cut
 564
 565 sub index_name {
 566     return shift->{index_name};
 567 }
 568
 569
 570 =head2 path
 571
 572  Title   : path
 573  Usage   : my $path = $db->path;
 574  Function: When a single file or a directory of files is indexed, this returns
 575            the file directory. When indexing an arbitrary list of files, the
 576            return value is the path of the current working directory.
 577  Returns : String
 578  Args    : None
 579
 580 =cut
 581
 582 sub path {
 583     return shift->{dirname};
 584 }
 585
 586
 587 =head2 get_PrimarySeq_stream
 588
 589  Title   : get_PrimarySeq_stream
 590  Usage   : my $stream = $db->get_PrimarySeq_stream();
 591  Function: Get a SeqIO-like stream of sequence objects. The stream supports a
 592            single method, next_seq(). Each call to next_seq() returns a new
 593            PrimarySeqI compliant sequence object, until no more sequences remain.
 594            This is a Bio::DB::SeqI method implementation.
 595  Returns : A Bio::DB::Indexed::Stream object
 596  Args    : None
 597
 598 =cut
 599
 600 sub get_PrimarySeq_stream {
 601     my $self = shift;
 602     return Bio::DB::Indexed::Stream->new($self);
 603 }
 604
 605
 606 =head2 get_Seq_by_id
 607
 608  Title   : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id
 609  Usage   : my $seq = $db->get_Seq_by_id($id);
 610  Function: Given an ID, fetch the corresponding sequence from the database.
 611            This is a Bio::DB::SeqI and Bio::DB::RandomAccessI method implementation.
 612  Returns : A sequence object
 613  Args    : ID
 614
 615 =cut
 616
 617 sub get_Seq_by_id {
 618     my ($self, $id) = @_;
 619     $self->throw('Need to provide a sequence ID') if not defined $id;
 620     return if not exists $self->{offsets}{$id};
 621     return $self->{obj_class}->new($self, $id);
 622 }
 623
 624 {
 625 no warnings 'once';
 626 *get_Seq_by_version = *get_Seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
 627 }
 628
 629
 630 =head2 _calculate_offsets
 631
 632  Title   : _calculate_offsets
 633  Usage   : $db->_calculate_offsets($fileno, $filename, $offsets);
 634  Function: This method calculates the sequence offsets in a file based on ID and
 635            should be implemented by classes that use Bio::DB::IndexedBase.
 636  Returns : Hash of offsets
 637  Args    : File number and name of the file to process
 638            Hashref of file offsets keyed by IDs.
 639
 640 =cut
 641
 642 sub _calculate_offsets {
 643     my $self = shift;
 644     $self->throw_not_implemented();
 645 }
 646
 647
 648 sub _index_files {
 649     # Do the indexing of the given files using the index file on record
 650     my ($self, $files, $force_reindex) = @_;
 651
 652     $self->_set_pack_method( @$files );
 653
 654     # Get name of index file
 655     my $index = $self->index_name;
 656
 657     # If caller has requested reindexing, unlink the index file.
 658     if ($force_reindex) {
 659         # Tied-hash in Strawberry Perl creates "$file.index"
 660         unlink $index if -e $index;
 661         # Tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir"
 662         unlink "$index.dir" if -e "$index.dir";
 663         unlink "$index.pag" if -e "$index.pag";
 664     }
 665
 666     # Get the modification time of the index
 667     my $indextime = (stat $index)[9] || 0;
 668
 669     # Register files and find if there has been any update
 670     my $modtime = 0;
 671     my @updated;
 672     for my $file (@$files) {
 673         # Register file
 674         $self->_path2fileno(basename($file));
 675         # Any update?
 676         my $m = (stat $file)[9] || 0;
 677         if ($m > $modtime) {
 678            $modtime = $m;
 679         }
 680         if ($m > $indextime) {
 681            push @updated, $file;
 682         }
 683     }
 684
 685     # Get termination length from first file
 686     $self->{termination_length} = $self->_calc_termination_length( $files->[0] );
 687
 688     # Reindex contents of changed files if needed
 689     my $reindex      = $force_reindex || (scalar @updated > 0);
 690     $self->{offsets} = $self->_open_index($index, $reindex) or return;
 691     if ($reindex) {
 692         $self->{indexing} = $index;
 693         for my $file (@updated) {
 694             my $fileno = $self->_path2fileno(basename($file));
 695             &{$self->{offset_meth}}($self, $fileno, $file, $self->{offsets});
 696         }
 697         delete $self->{indexing};
 698     }
 699
 700     # Closing and reopening might help corrupted index file problem on Windows
 701     $self->_close_index($self->{offsets});
 702
 703     return $self->{offsets} = $self->_open_index($index);
 704 }
 705
 706
 707 sub _open_index {
 708     # Open index file in read-only or write mode
 709     my ($self, $index_file, $write) = @_;
 710     my %offsets;
 711     my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
 712     my @dbmargs = $self->dbmargs;
 713     tie %offsets, 'AnyDBM_File', $index_file, $flags, 0644, @dbmargs
 714         or $self->throw( "Could not open index file $index_file: $!");
 715     return \%offsets;
 716 }
 717
 718
 719 sub _close_index {
 720     # Close index file
 721     my ($self, $index) = @_;
 722     untie %$index;
 723     return 1;
 724 }
 725
 726 # Compiling the below regular expression speeds up _parse_compound_id
 727 my $compound_id = qr/^ (.+?) (?:\:([\d_]+)(?:,|-|\.\.)([\d_]+))? (?:\/(.+))? $/x;
 728
 729 sub _parse_compound_id {
 730     # Handle compound IDs:
 731     #     $db->seq($id)
 732     #     $db->seq($id, $start, $stop, $strand)
 733     #     $db->seq("$id:$start,$stop")
 734     #     $db->seq("$id:$start..$stop")
 735     #     $db->seq("$id:$start-$stop")
 736     #     $db->seq("$id:$start,$stop/$strand")
 737     #     $db->seq("$id:$start..$stop/$strand")
 738     #     $db->seq("$id:$start-$stop/$strand")
 739     #     $db->seq("$id/$strand")
 740     my ($self, $id, $start, $stop, $strand) = @_;
 741
 742     if ( (not defined $start ) &&
 743          (not defined $stop  ) &&
 744          (not defined $strand) &&
 745          ($id =~ m{$compound_id}) ) {
 746         # Start, stop and strand not provided and ID looks like a compound ID
 747         ($id, $start, $stop, $strand) = ($1, $2, $3, $4);
 748     }
 749
 750     # Start, stop and strand defaults
 751     $stop   ||= $self->length($id) || 0; # 0 if sequence not found in database
 752     $start  ||= ($stop > 0) ? 1 : 0;
 753     $strand ||= 1;
 754
 755     # Convert numbers such as 1_000_000 to 1000000
 756     $start =~ s/_//g;
 757     $stop  =~ s/_//g;
 758
 759     if ($start > $stop) {
 760         # Change the strand
 761         ($start, $stop) = ($stop, $start);
 762         $strand *= -1;
 763     }
 764
 765     return $id, $start, $stop, $strand;
 766 }
 767
 768
 769 sub _guess_alphabet {
 770     # Determine the molecular type of the given sequence string:
 771     #    'dna', 'rna', 'protein' or '' (unknown/empty)
 772     my ($self, $string) = @_;
 773     # Handle IUPAC residues like PrimarySeq does
 774     my $alphabet = Bio::PrimarySeq::_guess_alphabet_from_string($self, $string, 1);
 775     return $alphabet eq 'dna' ? DNA
 776            : $alphabet eq 'rna' ? RNA
 777            : $alphabet eq 'protein' ? PROTEIN
 778            : NA;
 779 }
 780
 781
 782 sub _makeid {
 783     # Process the header line by applying any transformation given in -makeid
 784     my ($self, $header_line) = @_;
 785     return ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($header_line) : $1;
 786 }
 787
 788
 789 sub _check_linelength {
 790     # Check that the line length is valid. Generate an error otherwise.
 791     my ($self, $linelength) = @_;
 792     return if not defined $linelength;
 793     $self->throw(
 794         "Each line of the file must be less than 65,536 characters. Line ".
 795         "$. is $linelength chars."
 796     ) if $linelength > 65535;
 797 }
 798
 799
 800 sub _calc_termination_length {
 801     # Try the beginning of the file to determine termination length
 802     # Account for crlf-terminated Windows and Mac files
 803     my ($self, $file) = @_;
 804     my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
 805
 806     # In Windows, text files have '\r\n' as line separator, but when reading in
 807     # text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
 808     # "length $_" will report 4 although the line is 5 bytes in length.
 809     # We assume that all lines have the same line separator and only read current line.
 810     my $init_pos   = tell($fh);
 811     my $curr_line  = <$fh>;
 812     my $pos_diff   = tell($fh) - $init_pos;
 813     my $correction = $pos_diff - length $curr_line;
 814     close $fh;
 815
 816     $self->{termination_length} = ($curr_line =~ /\r\n$/) ? 2 : 1+$correction;
 817     return $self->{termination_length};
 818 }
 819
 820
 821 sub _calc_offset {
 822     # Get the offset of the n-th residue of the sequence with the given ID
 823     # and termination length (tl)
 824     my ($self, $id, $n) = @_;
 825     my $tl = $self->{termination_length};
 826     $n--;
 827     my ($offset, $seqlen, $linelen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,1,3];
 828     $n = 0            if $n < 0;
 829     $n = $seqlen-1 if $n >= $seqlen;
 830     return $offset + $linelen * int($n/($linelen-$tl)) + $n % ($linelen-$tl);
 831 }
 832
 833
 834 sub _fh {
 835     # Given a sequence ID, return the filehandle on which to find this sequence
 836     my ($self, $id) = @_;
 837     $self->throw('Need to provide a sequence ID') if not defined $id;
 838     my $file = $self->file($id) or return;
 839     return eval {
 840       $self->_fhcache( File::Spec->catfile($self->{dirname}, $file));
 841     } || $self->throw( "Can't open file $file" );
 842 }
 843
 844
 845 sub _fhcache {
 846     my ($self, $path) = @_;
 847     if (!$self->{fhcache}{$path}) {
 848         if ($self->{curopen} >= $self->{maxopen}) {
 849             my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};}
 850                 keys %{$self->{fhcache}};
 851             splice(@lru, $self->{maxopen} / 3);
 852             $self->{curopen} -= @lru;
 853             for (@lru) {
 854                 delete $self->{fhcache}{$_};
 855             }
 856         }
 857         $self->{fhcache}{$path} = IO::File->new($path) || return;
 858         binmode $self->{fhcache}{$path};
 859         $self->{curopen}++;
 860     }
 861     $self->{cacheseq}{$path}++;
 862     return $self->{fhcache}{$path};
 863 }
 864
 865
 866 #-------------------------------------------------------------
 867 # Methods to store and retrieve data from indexed file
 868 #
 869
 870 =head2 offset
 871
 872  Title   : offset
 873  Usage   : my $offset = $db->offset($id);
 874  Function: Get the offset of the indicated sequence from the beginning of the
 875            file in which it is located. The offset points to the beginning of
 876            the sequence, not the beginning of the header line.
 877  Returns : String
 878  Args    : ID of sequence
 879
 880 =cut
 881
 882 sub offset {
 883     my ($self, $id) = @_;
 884     $self->throw('Need to provide a sequence ID') if not defined $id;
 885     my $offset = $self->{offsets}{$id} or return;
 886     return (&{$self->{unpackmeth}}($offset))[0];
 887 }
 888
 889
 890 =head2 strlen
 891
 892  Title   : strlen
 893  Usage   : my $length = $db->strlen($id);
 894  Function: Get the number of characters in the sequence string.
 895  Returns : Integer
 896  Args    : ID of sequence
 897
 898 =cut
 899
 900 sub strlen {
 901     my ($self, $id) = @_;
 902     $self->throw('Need to provide a sequence ID') if not defined $id;
 903     my $offset = $self->{offsets}{$id} or return;
 904     return (&{$self->{unpackmeth}}($offset))[1];
 905 }
 906
 907
 908 =head2 length
 909
 910  Title   : length
 911  Usage   : my $length = $db->length($id);
 912  Function: Get the number of residues of the sequence.
 913  Returns : Integer
 914  Args    : ID of sequence
 915
 916 =cut
 917
 918 sub length {
 919     my ($self, $id) = @_;
 920     $self->throw('Need to provide a sequence ID') if not defined $id;
 921     my $offset = $self->{offsets}{$id} or return;
 922     return (&{$self->{unpackmeth}}($offset))[2];
 923 }
 924
 925
 926 =head2 linelen
 927
 928  Title   : linelen
 929  Usage   : my $linelen = $db->linelen($id);
 930  Function: Get the length of the line for this sequence.
 931  Returns : Integer
 932  Args    : ID of sequence
 933
 934 =cut
 935
 936 sub linelen {
 937     my ($self, $id) = @_;
 938     $self->throw('Need to provide a sequence ID') if not defined $id;
 939     my $offset = $self->{offsets}{$id} or return;
 940     return (&{$self->{unpackmeth}}($offset))[3];
 941 }
 942
 943
 944 =head2 headerlen
 945
 946  Title   : headerlen
 947  Usage   : my $length = $db->headerlen($id);
 948  Function: Get the length of the header line for the indicated sequence.
 949  Returns : Integer
 950  Args    : ID of sequence
 951
 952 =cut
 953
 954 sub headerlen {
 955     my ($self, $id) = @_;
 956     $self->throw('Need to provide a sequence ID') if not defined $id;
 957     my $offset = $self->{offsets}{$id} or return;
 958     return (&{$self->{unpackmeth}}($offset))[4];
 959 }
 960
 961
 962 =head2 header_offset
 963
 964  Title   : header_offset
 965  Usage   : my $offset = $db->header_offset($id);
 966  Function: Get the offset of the header line for the indicated sequence from
 967            the beginning of the file in which it is located.
 968  Returns : String
 969  Args    : ID of sequence
 970
 971 =cut
 972
 973 sub header_offset {
 974     my ($self, $id) = @_;
 975     $self->throw('Need to provide a sequence ID') if not defined $id;
 976     return if not $self->{offsets}{$id};
 977     return $self->offset($id) - $self->headerlen($id);
 978 }
 979
 980
 981 =head2 alphabet
 982
 983  Title   : alphabet
 984  Usage   : my $alphabet = $db->alphabet($id);
 985  Function: Get the molecular type of the indicated sequence: dna, rna or protein
 986  Returns : String
 987  Args    : ID of sequence
 988
 989 =cut
 990
 991 sub alphabet {
 992     my ($self, $id) = @_;
 993     $self->throw('Need to provide a sequence ID') if not defined $id;
 994     my $offset = $self->{offsets}{$id} or return;
 995     my $alphabet = (&{$self->{unpackmeth}}($offset))[5];
 996     return : $alphabet == Bio::DB::IndexedBase::DNA     ? 'dna'
 997            : $alphabet == Bio::DB::IndexedBase::RNA     ? 'rna'
 998            : $alphabet == Bio::DB::IndexedBase::PROTEIN ? 'protein'
 999            : '';
1000 }
1001
1002
1003 =head2 file
1004
1005  Title   : file
1006  Usage   : my $file = $db->file($id);
1007  Function: Get the name of the file in which the indicated sequence can be
1008            found.
1009  Returns : String
1010  Args    : ID of sequence
1011
1012 =cut
1013
sub file {
    my ($self, $id) = @_;
    defined $id
        or $self->throw('Need to provide a sequence ID');

    # Nothing to report when the index holds no (true) record for this ID
    my $record = $self->{offsets}{$id}
        or return;

    # The seventh field of the unpacked record is the file number, which
    # _fileno2path() maps back to a path
    my $unpack = $self->{unpackmeth};
    return $self->_fileno2path( ( $unpack->($record) )[6] );
}
1020
1021
# Look up the path registered under the given file number.
sub _fileno2path {
    my $self   = shift;
    my $fileno = shift;
    return $self->{fileno2path}[$fileno];
}
1026
1027
# Return the file number of $path, registering the path under the next free
# number (taken from the $self->{fileno} counter) the first time it is seen.
sub _path2fileno {
    my ($self, $path) = @_;

    # Already registered: hand back the number assigned earlier
    return $self->{filepath2no}{$path}
        if exists $self->{filepath2no}{$path};

    # New path: take the next number and record both directions of the mapping
    my $fileno = 0 + $self->{fileno}++;
    $self->{filepath2no}{$path}   = $fileno;
    $self->{fileno2path}[$fileno] = $path;
    return $fileno;
}
1037
1038
# Pack the given field values using the STRUCT template.
sub _packSmall {
    my @fields = @_;
    return pack(STRUCT, @fields);
}
1042
1043
# Pack the given field values using the STRUCTBIG template.
sub _packBig {
    my @fields = @_;
    return pack(STRUCTBIG, @fields);
}
1047
1048
# Unpack a record that was packed with the STRUCT template.
sub _unpackSmall {
    my ($record) = @_;
    return unpack(STRUCT, $record);
}
1052
1053
# Unpack a record that was packed with the STRUCTBIG template.
sub _unpackBig {
    my ($record) = @_;
    return unpack(STRUCTBIG, $record);
}
1057
1058
# Determine whether to use 32 or 64 bit integers for the given files, and
# store the matching pack / unpack routines in $self->{packmeth} and
# $self->{unpackmeth}.
#
# Args    : list of file paths
# Returns : 1
sub _set_pack_method {
    my ($self, @files) = @_;
    my $fourGB = (2 ** 32) - 1;

    # Find the maximum file size. Files whose size cannot be determined (for
    # example missing files) and empty files are skipped, so an empty or
    # partly invalid list simply falls back to the 32 bit routines.
    my $maxsize = 0;
    for my $file (@files) {
        next if not defined $file;
        my $size = -s $file;
        next if not $size;
        $maxsize = $size if $size > $maxsize;
    }

    if ($maxsize > $fourGB) {
        # At least one file exceeds 4Gb - we will need to use 64 bit ints
        $self->{packmeth}   = \&_packBig;
        $self->{unpackmeth} = \&_unpackBig;
    } else {
        $self->{packmeth}   = \&_packSmall;
        $self->{unpackmeth} = \&_unpackSmall;
    }
    return 1;
}
1076
1077
1078 #-------------------------------------------------------------
1079 # Tied hash logic
1080 #
1081
# tie() entry point: the tied object is a regular instance built by new().
sub TIEHASH {
    my ($class, @args) = @_;
    return $class->new(@args);
}
1085
1086
# Tied-hash read: the key is handed to subseq() unchanged.
sub FETCH {
    my ($self, @args) = @_;
    return $self->subseq(@args);
}
1090
1091
# Tied-hash write: always refused, the database is read-only.
sub STORE {
    my ($self) = @_;
    return $self->throw("Read-only database");
}
1095
1096
# Tied-hash delete: always refused, the database is read-only.
sub DELETE {
    my ($self) = @_;
    return $self->throw("Read-only database");
}
1100
1101
# Tied-hash clear: always refused, the database is read-only.
sub CLEAR {
    my ($self) = @_;
    return $self->throw("Read-only database");
}
1105
1106
# Tied-hash existence test: a key exists when offset() returns a defined
# value for it.
sub EXISTS {
    my ($self, @key) = @_;
    my $offset = $self->offset(@key);
    return defined $offset;
}
1110
1111
# Tied-hash iteration start: delegate to the object tied to the offsets hash.
sub FIRSTKEY {
    my ($self, @args) = @_;
    my $index = tied %{ $self->{offsets} };
    return $index->FIRSTKEY(@args);
}
1115
1116
# Tied-hash iteration step: delegate to the object tied to the offsets hash.
sub NEXTKEY {
    my ($self, @args) = @_;
    my $index = tied %{ $self->{offsets} };
    return $index->NEXTKEY(@args);
}
1120
1121
# Destructor: close the cached filehandles and the index, and delete the
# index file(s) when cleaning was requested or indexing did not finish.
sub DESTROY {
    my ($self) = @_;

    # Close filehandles that are still in the cache
    for my $fh (values %{ $self->{fhcache} }) {
        next if not defined $fh;
        $fh->close;
    }
    $self->_close_index($self->{offsets});

    # Indexing aborted or cleaning requested. Delete the index file.
    if ( $self->{clean} || $self->{indexing} ) {
        my $index = $self->{index_name};

        # Tied-hash in Strawberry Perl creates "$file.index"; tied-hash in
        # ActivePerl creates "$file.index.dir" and "$file.index.pag"
        for my $leftover ($index, "$index.dir", "$index.pag") {
            unlink $leftover if -e $leftover;
        }
    }
    return 1;
}
1145
1146
1147 #-------------------------------------------------------------
1148 # stream-based access to the database
1149 #
1150
1151 package Bio::DB::Indexed::Stream;
1152 use base qw(Tie::Handle Bio::DB::SeqI);
1153
1154
# Create a stream over $db, positioned on the first key of the database.
sub new {
    my ($class, $db) = @_;
    my $first_key = $db->FIRSTKEY;
    my %state = (
        db  => $db,
        key => $first_key,
    );
    return bless \%state, $class;
}
1163
# Return the sequence for the current key and move on to the next key.
# Returns nothing once the keys are exhausted.
sub next_seq {
    my ($self) = @_;
    my $db      = $self->{db};
    my $current = $self->{key};
    defined $current
        or return;

    my $seq = $db->get_Seq_by_id($current);
    # Remember where to continue on the next call
    $self->{key} = $db->NEXTKEY($current);
    return $seq;
}
1172
# tie() entry point for filehandles: the tied object is a stream from new().
sub TIEHANDLE {
    my $class = shift;
    my ($db)  = @_;
    return $class->new($db);
}
1177
# Tied-handle read: return the next sequence of the stream, or undef when
# next_seq() gives nothing (or a false value).
# NOTE(review): only one sequence is returned per call, also in list context.
sub READLINE {
    my ($self) = @_;
    my $seq = $self->next_seq;
    return $seq ? $seq : undef;
}
1182
1183
1184 1;