package Bio::DB::SeqFeature::Store::berkeleydb;

use strict;

use base 'Bio::DB::SeqFeature::Store';
use Bio::DB::GFF::Util::Rearrange 'rearrange';
# DB_File, IO::File, File::Spec and fileparse() are used throughout the code
# below, so pull them in here along with the other prerequisites.
use DB_File;
use IO::File;
use File::Spec;
use File::Basename 'fileparse';
use Fcntl qw(O_RDWR O_CREAT O_RDONLY :flock);
use File::Temp 'tempdir';
use File::Path 'rmtree','mkpath';
use Carp 'carp','croak';

use constant BINSIZE => 10_000;
use constant MININT  => -999_999_999_999;
use constant MAXINT  => 999_999_999_999;
=head1 NAME

Bio::DB::SeqFeature::Store::berkeleydb -- Storage and retrieval of sequence annotation data in Berkeleydb files
=head1 SYNOPSIS

  use Bio::DB::SeqFeature::Store;

  # Create a database from the feature files located in /home/fly4.3 and store
  # the database index in the same directory:
  my $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                            -dir     => '/home/fly4.3');

  # Create a database that will monitor the files in /home/fly4.3, but store
  # the indexes in /var/databases/fly4.3
  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dir     => '/home/fly4.3',
                                         -dsn     => '/var/databases/fly4.3');

  # Create a feature database from scratch
  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -create  => 1);

  # get a feature from somewhere
  my $feature = Bio::SeqFeature::Generic->new(...);

  # store it
  $db->store($feature) or die "Couldn't store!";

  # the feature's primary ID is changed to reflect its ID in the database
  my $id = $feature->primary_id;

  # get the feature back out
  my $f = $db->fetch($id);

  # change the feature and update it
  $db->update($f) or die "Couldn't update!";

  # use the GFF3 loader to do a bulk write:
  my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store => $db);
  $loader->load('/home/fly4.3/dmel-all.gff');

  # searching...
  # ...by id
  my @features = $db->fetch_many(@list_of_ids);

  # ...by name
  @features = $db->get_features_by_name('ZK909');

  # ...by alias
  @features = $db->get_features_by_alias('sma-3');

  # ...by type
  @features = $db->get_features_by_type('gene');

  # ...by location
  @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);

  # ...by attribute
  @features = $db->get_features_by_attribute({description => 'protein kinase'});

  # ...by the GFF "Note" field
  @result_list = $db->search_notes('kinase');

  # ...by arbitrary combinations of selectors
  @features = $db->features(-name       => $name,
                            -attributes => $attributes);

  # ...using an iterator
  my $iterator = $db->get_seq_stream(-name       => $name,
                                     -attributes => $attributes);

  while (my $feature = $iterator->next_seq) {
    # do something with the feature
  }

  # ...limiting the search to a particular region
  my $segment  = $db->segment('Chr1',5000=>6000);
  my @features = $segment->features(-type=>['mRNA','match']);

  # what feature types are defined in the database?
  my @types    = $db->types;

  # getting & storing sequence information
  # Warning: this returns a string, and not a PrimarySeq object
  $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
  my $sequence = $db->fetch_sequence('Chr1',5000=>6000);

  # create a new feature in the database
  my $feature = $db->new_feature(-primary_tag => 'mRNA',
                                 ... );
=head1 DESCRIPTION

Bio::DB::SeqFeature::Store::berkeleydb is the Berkeleydb adaptor for
Bio::DB::SeqFeature::Store. You will not create it directly, but
instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.

See L<Bio::DB::SeqFeature::Store> for complete usage instructions.

=head2 Using the berkeleydb adaptor

The Berkeley database consists of a series of Berkeleydb index files,
and a couple of special-purpose indexes. You can create the index
files from scratch by creating a new database and calling
new_feature() repeatedly, you can create the database and then bulk
populate it using the GFF3 loader, or you can monitor a directory of
preexisting GFF3 and FASTA files and rebuild the indexes whenever one
or more of the files changes. The last mode is probably the most
convenient. Note that the indexer will only pay attention to files
that end with .gff3, .wig and .fa.
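A minimal sketch of the first approach (an empty database populated by
repeated new_feature() calls) might look like the following; the
@parsed_rows array, its keys and the paths are placeholders, not part of
this module:

  my $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                            -dsn     => '/var/databases/demo',
                                            -create  => 1 );
  for my $row (@parsed_rows) {          # @parsed_rows produced elsewhere
      $db->new_feature( -primary_tag => $row->{type},
                        -seq_id      => $row->{seq_id},
                        -start       => $row->{start},
                        -end         => $row->{end} );
  }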
=item The new() constructor

The new() constructor method accepts all the arguments recognized by
Bio::DB::SeqFeature::Store, plus a few additional ones.

Standard arguments:

 -adaptor           The name of the Adaptor class (default DBI::mysql)

 -serializer        The name of the serializer class (default Storable)

 -index_subfeatures Whether or not to make subfeatures searchable

 -cache             Activate LRU caching feature -- size of cache

 -compress          Compresses features before storing them in database

Adaptor-specific arguments:

 -dsn               Where the index files are stored

 -dir               Where the source (GFF3, FASTA) files are stored

 -autoindex         An alias for -dir.

 -write             Pass true to open the index files for writing.

 -create            Pass true to create the index files if they don't exist

 -locking           Use advisory locking to avoid one process trying to read
                    from the database while another is updating it (may not
                    work properly over NFS).

 -temp              Pass true to create temporary index files that will
                    be deleted once the script exits.

 -verbose           Pass true to report autoindexing operations on STDERR.
To create an empty database which will be populated using calls to
store() or new_feature(), or which will be bulk-loaded using the GFF3
loader:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -create  => 1);

To open a preexisting database in read-only mode:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3');

To open a preexisting database in read/write (update) mode:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -write   => 1);

To monitor a set of GFF3 and FASTA files located in a directory and
create/update the database indexes as needed (the indexes will be
stored in a new subdirectory named "indexes"):

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dir     => '/var/databases/fly4.3');

As above, but store the source files and index files in separate directories:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -dir     => '/home/gff3_files/fly4.3');

To be indexed, files must end with one of .gff3 (GFF3 format), .fa
(FASTA format) or .wig (WIG format).
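For example, when only B<-dir> is given, a monitored source directory and
the index subdirectory that the adaptor creates inside it might look like
this (the file names are illustrative):

  /home/gff3_files/fly4.3/
      dmel-all.gff3           # feature annotations
      dmel-chromosomes.fa     # chromosome sequences
      indexes/                # created and maintained by the adaptor
          features.bdb
          names.idx
          ...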
B<-autoindex> is an alias for B<-dir>.

You should specify B<-locking> in a multiuser environment, including
the case in which the database is being used by a web server at the
same time another user might be updating it.
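For instance, both the long-running reader and the updating process might
open the database with advisory locking enabled (a sketch; the paths are
illustrative):

  # reading process, e.g. inside a web application
  my $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                            -dsn     => '/var/databases/fly4.3',
                                            -locking => 1 );

  # updating process
  my $update_db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                                   -dsn     => '/var/databases/fly4.3',
                                                   -write   => 1,
                                                   -locking => 1 );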
See L<Bio::DB::SeqFeature::Store> for all the access methods supported
by this adaptor. The various methods for storing and updating features
and sequences in the database are supported, but there is no
locking. If two processes try to update the same database
simultaneously, the database will likely become corrupted.

=cut
# object initialization
  ) = rearrange([['DSN','DB'],
                 ['TMP','TEMP','TEMPORARY'],
                 [qw(WRITE WRITABLE)],

  $verbose = 1 unless defined $verbose;

  -d $autoindex or $self->throw("Invalid directory $autoindex");
  $directory ||= "$autoindex/indexes";
  $directory ||= $is_temporary ? File::Spec->tmpdir : '.';

  my $pacname = __PACKAGE__;
  if ($^O =~ /mswin/i) {
    $pacname =~ s/:+/_/g;
  }
  $directory = tempdir($pacname.'_XXXXXX',
                       DIR => $directory) if $is_temporary;

  -d $directory or $self->throw("Invalid directory $directory");

  $create++ if $is_temporary;

  $self->throw("Can't write into the directory $directory")
    if $write && !-w $directory;
  $self->default_settings;
  $self->directory($directory);
  $self->temporary($is_temporary);
  $self->verbose($verbose);
  $self->locking($locking);
  $self->_delete_databases() if $create;
  if ($autoindex && -d $autoindex) {
    $self->auto_reindex($autoindex);
  }
  $self->lock('shared');

  # this step may rebless $self into a subclass
  # to preserve backward compatibility with older
  # databases while providing better performance for
  # newer ones
  $self->possibly_rebless($create);

  $self->_open_databases($write,$create,$autoindex);
  $self->_permissions($write,$create);
sub version { return 2.0 };
sub possibly_rebless {
  my $self   = shift;
  my $create = shift;
  my ($do_rebless,%h);
  if ($create) {   # a newly created database gets the newest format
    $do_rebless = 1;
  } else {         # probe database
    tie (%h,'DB_File',$self->_features_path,O_RDONLY,0666,$DB_HASH) or return;
    $do_rebless = $h{'.version'} >= 3.0;
  }
  return unless $do_rebless;
  eval "require Bio::DB::SeqFeature::Store::berkeleydb3";
  bless $self,'Bio::DB::SeqFeature::Store::berkeleydb3';
}
sub can_store_parentage { 1 }
sub auto_reindex {
  my $self    = shift;
  my $autodir = shift;
  my $result  = $self->needs_auto_reindexing($autodir);

  if ($result && %$result) {
    $self->flag_autoindexing(1);
    $self->lock('exclusive');
    $self->reindex_wigfiles($result->{wig},$autodir) if $result->{wig};
    $self->reindex_ffffiles($result->{fff},$autodir) if $result->{fff};
    $self->reindex_gfffiles($result->{gff},$autodir) if $result->{gff};
    $self->dna_db(Bio::DB::Fasta::Subdir->new($autodir));
    $self->flag_autoindexing(0);
  } else {
    $self->dna_db(Bio::DB::Fasta::Subdir->new($autodir));
  }
}
sub autoindex_flagfile {
  return File::Spec->catfile(shift->directory,'autoindex.pid');
}
sub auto_index_in_process {
  my $self      = shift;
  my $flag_file = $self->autoindex_flagfile;
  return unless -e $flag_file;

  # if flagfile exists, then check that PID still exists
  open my $fh, '<', $flag_file or $self->throw("Could not read file '$flag_file': $!");
  my $pid = <$fh>;
  return 1 if kill 0=>$pid;

  warn "Autoindexing seems to be running in another process, but the process has gone away. Trying to override...";
  if (unlink $flag_file) {
    warn "Successfully removed stale PID file." if $self->verbose;
    warn "Assuming partial reindexing process. Rebuilding indexes from scratch..." if $self->verbose;
    my $glob = File::Spec->catfile($self->directory,'*');
  } else {
    croak ("Cannot recover from apparent aborted autoindex process. Please remove files in ",
           $self->directory,
           " and allow the adaptor to reindex");
  }
}
sub flag_autoindexing {
  my $self = shift;
  my $flag_file = $self->autoindex_flagfile;
  open my $fh, '>', $flag_file or $self->throw("Could not write file '$flag_file': $!");
}
sub reindex_gfffiles {
  my $self  = shift;
  my $files = shift;

  warn "Reindexing GFF files...\n" if $self->verbose;
  my $exists = -e $self->_features_path;

  $self->_permissions(1,1);
  $self->_close_databases();
  $self->_open_databases(1,!$exists);
  require Bio::DB::SeqFeature::Store::GFF3Loader
    unless Bio::DB::SeqFeature::Store::GFF3Loader->can('new');
  my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store    => $self,
                                                           -sf_class => $self->seqfeature_class,
                                                           -verbose  => $self->verbose)
    or $self->throw("Couldn't create GFF3Loader");
  my %seen;
  $loader->load(grep {!$seen{$_}++} @$files);
  $self->_touch_timestamp;
}
sub reindex_ffffiles {
  my $self  = shift;
  my $files = shift;

  warn "Reindexing FFF files...\n" if $self->verbose;
  $self->_permissions(1,1);
  $self->_close_databases();
  $self->_open_databases(1,1);
  require Bio::DB::SeqFeature::Store::FeatureFileLoader
    unless Bio::DB::SeqFeature::Store::FeatureFileLoader->can('new');
  my $loader = Bio::DB::SeqFeature::Store::FeatureFileLoader->new(-store    => $self,
                                                                  -sf_class => $self->seqfeature_class,
                                                                  -verbose  => $self->verbose)
    or $self->throw("Couldn't create FeatureFileLoader");
  my %seen;
  $loader->load(grep {!$seen{$_}++} @$files);
  $self->_touch_timestamp;
}
sub reindex_wigfiles {
  my $self = shift;
  my ($files,$autodir) = @_;

  warn "Reindexing wig files...\n" if $self->verbose;

  unless (Bio::Graphics::Wiggle::Loader->can('new')) {
    eval "require Bio::Graphics::Wiggle::Loader; 1"
      or return;
  }

  for my $wig (@$files) {
    warn "Reindexing $wig...\n" if $self->verbose;

    my ($wib_name) = fileparse($wig,qr/\.[^.]*/);
    my $gff3_name  = "$wib_name.gff3";

    # unlink all wib files that share the basename
    my $wib_glob = File::Spec->catfile($self->directory,"$wib_name*.wib");
    unlink glob($wib_glob);

    my $loader = Bio::Graphics::Wiggle::Loader->new($self->directory,$wib_name);
    my $fh     = IO::File->new($wig) or die "Can't open $wig: $!";
    $loader->load($fh); # will create one or more .wib files

    my $gff3_data = $loader->featurefile('gff3','microarray_oligo',$wib_name);
    my $gff3_path = File::Spec->catfile($autodir,$gff3_name);
    $fh = IO::File->new($gff3_path,'>')
      or die "Can't open $gff3_path for writing: $!";
    $fh->print($gff3_data);

    my $conf_path = File::Spec->catfile($autodir,"$wib_name.conf");
    $fh = IO::File->new($conf_path,'>');
    $fh->print($loader->conf_stanzas('microarray_oligo',$wib_name));
  }
}
# returns the following hashref
# empty hash if nothing needs reindexing
# {fasta => 1} if DNA database needs reindexing
# {gff => [list,of,gff,paths]} if gff3 files need reindexing
# {wig => [list,of,wig,paths]} if wig files need reindexing
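# (For example, a result of { gff => ['/home/fly4.3/dmel-all.gff3'], fasta => 1 }
# would cause auto_reindex() to rebuild the GFF3 indexes and the DNA database;
# the path shown is illustrative only.)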
sub needs_auto_reindexing {
  my $self    = shift;
  my $autodir = shift;
  my $result  = {};

  # don't allow two processes to reindex simultaneously
  $self->auto_index_in_process and croak "Autoindexing in process. Try again later";

  # first interrogate the GFF3 files, using the timestamp file
  # as modification comparison
  my (@gff3,@fff,@wig,$fasta,$fasta_index_time);
  opendir (my $D,$autodir)
    or $self->throw("Couldn't open directory $autodir for reading: $!");

  my $maxtime        = 0;
  my $timestamp_time = _mtime($self->_mtime_path) || 0;
  while (defined (my $node = readdir($D))) {
    next if $node =~ /^\./;
    my $path = File::Spec->catfile($autodir,$node);
    next unless -f $path;

    if ($path =~ /\.gff\d?$/i) {
      my $mtime = _mtime(\*_); # not a typo
      $maxtime  = $mtime if $mtime > $maxtime;
      push @gff3,$path;
    }

    elsif ($path =~ /\.fff?$/i) {
      my $mtime = _mtime(\*_); # not a typo
      $maxtime  = $mtime if $mtime > $maxtime;
      push @fff,$path;
    }

    elsif ($path =~ /\.wig$/i) {
      my $wig = $path;
      (my $gff_file = $wig) =~ s/\.wig$/\.gff3/i;
      next if -e $gff_file && _mtime($gff_file) > _mtime($path);
      push @wig,$wig;
      push @gff3,$gff_file;
    }

    elsif ($path =~ /\.(fa|fasta|dna)$/i) {
      $fasta_index_time =
        _mtime(File::Spec->catfile($self->directory,'fasta.index'))||0
          unless defined $fasta_index_time;
      $fasta++ if _mtime($path) > $fasta_index_time;
    }
  }

  $result->{gff}   = \@gff3 if $maxtime > $timestamp_time;
  $result->{wig}   = \@wig  if @wig;
  $result->{fff}   = \@fff  if @fff;
  $result->{fasta}++        if $fasta;

  return $result;
}
sub verbose {
  my $self = shift;
  my $d = $self->{verbose};
  $self->{verbose} = shift if @_;
  $d;
}

sub locking {
  my $self = shift;
  my $d = $self->{locking};
  $self->{locking} = shift if @_;
  $d;
}
sub lockfile {
  my $self = shift;
  return File::Spec->catfile($self->directory,'lock');
}
sub lock {
  my $self = shift;
  my $mode = shift;
  return unless $self->locking;

  my $flag     = $mode eq 'exclusive' ? LOCK_EX : LOCK_SH;
  my $lockfile = $self->lockfile;
  my $fh       = $self->_flock_fh;
  unless ($fh) {
    my $open = -e $lockfile ? '<' : '>';
    $fh      = IO::File->new($lockfile,$open) or die "Cannot open $lockfile: $!";
  }
  flock($fh,$flag);
  $self->_flock_fh($fh);
}
sub unlock {
  my $self = shift;
  return unless $self->locking;
  my $fh = $self->_flock_fh or return;
  flock($fh,LOCK_UN);
  undef $self->{flock_fh};
}
sub _flock_fh {
  my $self = shift;
  my $d = $self->{flock_fh};
  $self->{flock_fh} = shift if @_;
  $d;
}
sub _open_databases {
  my $self = shift;
  my ($write,$create,$ignore_errors) = @_;
  return if $self->db; # already open - don't reopen

  my $directory = $self->directory;
  unless (-d $directory) { # directory does not exist
    $create or $self->throw("Directory $directory does not exist and you did not specify the -create flag");
    mkpath($directory) or $self->throw("Couldn't create database directory $directory: $!");
  }

  my $flags = O_RDONLY;
  $flags   |= O_RDWR  if $write;
  $flags   |= O_CREAT if $create;

  # Create the main database; this is a DB_HASH implementation
  my %h;
  my $result = tie (%h,'DB_File',$self->_features_path,$flags,0666,$DB_HASH);

  unless ($result) {
    return if $ignore_errors; # autoindex set, so defer this
    $self->throw("Couldn't tie: ".$self->_features_path . " $!");
  }

  $h{'.version'} = $self->version;
  $self->db(\%h);

  $self->open_index_dbs($flags,$create);
  $self->open_parentage_db($flags,$create);
  $self->open_notes_db($write,$create);
  $self->open_seq_db($flags,$create) if -e $self->_fasta_path;
}
sub open_index_dbs {
  my $self = shift;
  my ($flags,$create) = @_;

  # Create the index databases; these are DB_BTREE implementations with duplicates allowed.
  $DB_BTREE->{flags}   = R_DUP;
  $DB_BTREE->{compare} = sub { lc($_[0]) cmp lc($_[1]) };
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify("$idx.idx");
    my %db;
    my $result = tie(%db,'DB_File',$path,$flags,0666,$DB_BTREE);
    # for backward compatibility, allow a failure when trying to open the is_indexed index.
    $self->throw("Couldn't tie $path: $!") unless $result || $idx eq 'is_indexed';
    $self->index_db($idx=>\%db);
  }
}
sub open_parentage_db {
  my $self = shift;
  my ($flags,$create) = @_;

  # Create the parentage database
  my %p;
  tie (%p,'DB_File',$self->_parentage_path,$flags,0666,$DB_BTREE)
    or $self->throw("Couldn't tie: ".$self->_parentage_path . $!);
  $self->parentage_db(\%p);
}
sub open_notes_db {
  my $self = shift;
  my ($write,$create) = @_;

  my $mode = $write ? "+>>"
                    : "<";   # read-only when not writing

  my $notes_file = $self->_notes_path;
  open my $F, $mode, $notes_file or $self->throw("Could not open file '$notes_file': $!");
  $self->notes_db($F);
}
sub open_seq_db {
  my $self = shift;

  if (-e $self->_fasta_path) {
    my $dna_db = Bio::DB::Fasta::Subdir->new($self->_fasta_path)
      or $self->throw("Can't reindex sequence file: $@");
    $self->dna_db($dna_db);
  }
}
sub commit { # reindex fasta files
  my $self = shift;
  if (my $fh = $self->{fasta_fh}) {
    $fh->close;
    $self->dna_db(Bio::DB::Fasta::Subdir->new($self->{fasta_file}));
  } elsif (-d $self->directory) {
    $self->dna_db(Bio::DB::Fasta::Subdir->new($self->directory));
  }
}
sub _close_databases {
  my $self = shift;
  $self->dna_db(undef);
  $self->notes_db(undef);
  $self->parentage_db(undef);
  $self->index_db($_=>undef) foreach $self->_index_files;
}
# do nothing -- new() with -create=>1 will do the trick
sub _init_database { }
sub _delete_databases {
  my $self = shift;
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify("$idx.idx");
    unlink $path;
  }
  unlink $self->_parentage_path;
  unlink $self->_fasta_path;
  unlink $self->_features_path;
  unlink $self->_mtime_path;
}
sub _touch_timestamp {
  my $self = shift;
  my $tsf = $self->_mtime_path;
  open my $F, '>', $tsf or $self->throw("Could not write file '$tsf': $!");
  print $F scalar(localtime);
  close $F;
}
  my $is_indexed = $self->index_db('is_indexed');

    my $primary_id = $obj->primary_id;
    $self->_delete_indexes($obj,$primary_id)  if $indexed && $primary_id;
    $primary_id    = $db->{'.next_id'}++      unless defined $primary_id;
    $db->{$primary_id} = $self->freeze($obj);
    $is_indexed->{$primary_id} = $indexed     if $is_indexed;
    $obj->primary_id($primary_id);
    $self->_update_indexes($obj)              if $indexed;
sub _delete_indexes {
  my $self = shift;
  my ($obj,$id) = @_;

  # the additional "1" causes the index to be deleted
  $self->_update_name_index($obj,$id,1);
  $self->_update_type_index($obj,$id,1);
  $self->_update_location_index($obj,$id,1);
  $self->_update_attribute_index($obj,$id,1);
  $self->_update_note_index($obj,$id,1);
}
  my $obj = $self->thaw($db->{$id},$id);
sub _add_SeqFeature {
  my $self     = shift;
  my $parent   = shift;
  my @children = @_;
  my $parent_id = (ref $parent ? $parent->primary_id : $parent)
    or $self->throw("$parent should have a primary_id");
  my $p = $self->parentage_db;
  for my $child (@children) {
    my $child_id = ref $child ? $child->primary_id : $child;
    defined $child_id or $self->throw("no primary ID known for $child");
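    # DB_File's find_dup() returns 0 when the key/value pair already exists in
    # the BTREE and non-zero otherwise, so the assignment below only records
    # parent/child links that are not already present.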
    $p->{$parent_id} = $child_id if tied(%$p)->find_dup($parent_id,$child_id);
  }
  return scalar @children;
}
sub _fetch_SeqFeatures {
  my $self   = shift;
  my $parent = shift;
  my @types  = @_;
  my $parent_id = $parent->primary_id or $self->throw("$parent should have a primary_id");
  my $index     = $self->parentage_db;
  my $db        = tied %$index;

  my @children_ids = $db->get_dup($parent_id);
  my @children     = map {$self->fetch($_)} @children_ids;
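  # Types may be given either as 'primary_tag' or as 'primary_tag:source_tag'
  # strings; the fragment below turns them into a case-insensitive regexp that
  # is matched against each child's "primary_tag:source_tag".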
    my ($a,$b) = split ':',$_,2;
    $_ .= ":".quotemeta($b).'$';

  my $regexp = join '|', @types;
  return grep {($_->primary_tag.':'.$_->source_tag) =~ /^($regexp)/i} @children;
}
sub _update_indexes {
  my $self = shift;
  my $obj  = shift;
  defined (my $id = $obj->primary_id) or return;
  $self->_update_name_index($obj,$id);
  $self->_update_type_index($obj,$id);
  $self->_update_location_index($obj,$id);
  $self->_update_attribute_index($obj,$id);
  $self->_update_note_index($obj,$id);
}
sub _update_name_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('names') or $self->throw("Couldn't find 'names' index file");
  my ($names,$aliases) = $self->feature_names($obj);

  # little stinky - needs minor refactoring
  foreach (@$names) {
    my $key = lc $_;
    $self->update_or_delete($delete,$db,$key,$id);
  }

  foreach (@$aliases) {
    my $key = lc($_)."_2"; # the _2 indicates a secondary (alias) ID
    $self->update_or_delete($delete,$db,$key,$id);
  }
}
sub _update_type_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('types')
    or $self->throw("Couldn't find 'types' index file");
  my $primary_tag = $obj->primary_tag;
  my $source_tag  = $obj->source_tag || '';
  return unless defined $primary_tag;

  $primary_tag .= ":$source_tag";
  my $key       = lc $primary_tag;
  $self->update_or_delete($delete,$db,$key,$id);
}
# Note: this indexing scheme is space-inefficient because it stores the
# denormalized sequence ID followed by the bin in XXXXXX zero-leading
# format. It should be replaced with a binary numeric encoding and the
# BTREE {compare} attribute changed accordingly.
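# (Illustration: with BINSIZE 10_000, a feature on Chr1 spanning 45,000..52,000
# falls into bins 4 and 5 and is therefore stored under the keys "chr1.000004"
# and "chr1.000005", each value packing the id/start/end/strand; the
# coordinates are illustrative only.)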
sub _update_location_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('locations')
    or $self->throw("Couldn't find 'locations' index file");

  my $seq_id  = $obj->seq_id || '';
  my $start   = $obj->start  || '';
  my $end     = $obj->end    || '';
  my $strand  = $obj->strand;
  my $bin_min = int $start/BINSIZE;
  my $bin_max = int $end/BINSIZE;

  for (my $bin = $bin_min; $bin <= $bin_max; $bin++ ) {
    my $key = sprintf("%s.%06d",lc($seq_id),$bin);
    $self->update_or_delete($delete,$db,$key,pack("i4",$id,$start,$end,$strand));
  }
}
sub _update_attribute_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('attributes')
    or $self->throw("Couldn't find 'attributes' index file");

  for my $tag ($obj->get_all_tags) {
    for my $value ($obj->get_tag_values($tag)) {
      my $key = "${tag}:${value}";
      $self->update_or_delete($delete,$db,$key,$id);
    }
  }
}
sub _update_note_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  return if $delete; # we don't know how to do this

  my $fh = $self->notes_db;
  my @notes = $obj->get_tag_values('Note') if $obj->has_tag('Note');
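  # Each note is appended to the flat notes file as one line consisting of the
  # note text, a tab, and the uuencoded primary ID; the notes search later
  # splits on the tab and recovers the ID with unpack("u*",...).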
  print $fh $_,"\t",pack("u*",$id) or $self->throw("An error occurred while updating note index: $!")
    foreach @notes;
}
sub update_or_delete {
  my $self = shift;
  my ($delete,$db,$key,$id) = @_;
  if ($delete) {
    tied(%$db)->del_dup($key,$id);
  } else {
    $db->{$key} = $id;
  }
}
# these methods return pointers to....
# the database that stores serialized objects
sub db {
  my $self = shift;
  my $d = $self->setting('db');
  $self->setting(db=>shift) if @_;
  $d;
}

sub parentage_db {
  my $self = shift;
  my $d = $self->setting('parentage_db');
  $self->setting(parentage_db=>shift) if @_;
  $d;
}

# the Bio::DB::Fasta object
sub dna_db {
  my $self = shift;
  my $d = $self->setting('dna_db');
  $self->setting(dna_db=>shift) if @_;
  $d;
}

# the specialized notes database
sub notes_db {
  my $self = shift;
  my $d = $self->setting('notes_db');
  $self->setting(notes_db=>shift) if @_;
  $d;
}

sub is_indexed_db {
  my $self = shift;
  my $d = $self->setting('is_indexed_db');
  $self->setting(is_indexed_db=>shift) if @_;
  $d;
}

# The indicated index berkeley db
sub index_db {
  my $self = shift;
  my $index_name = shift;
  my $d = $self->setting($index_name);
  $self->setting($index_name=>shift) if @_;
  $d;
}
sub _mtime {
  my $file = shift;
  my @stat = stat($file);
  return $stat[9];   # modification time
}

# return names of all the indexes
sub _index_files {
  return qw(names types locations attributes is_indexed);
}
# the directory in which we store our indexes
sub directory {
  my $self = shift;
  my $d = $self->setting('directory');
  $self->setting(directory=>shift) if @_;
  $d;
}

# flag indicating that we are a temporary database
sub temporary {
  my $self = shift;
  my $d = $self->setting('temporary');
  $self->setting(temporary=>shift) if @_;
  $d;
}

sub _permissions {
  my $self = shift;
  my $d = $self->setting('permissions') or return;
  if (@_) {
    my ($write,$create) = @_;
    $self->setting(permissions=>[$write,$create]);
  }
  return @$d;
}
# file name utilities...
sub _qualify {
  my $self = shift;
  my $file = shift;
  return $self->directory .'/' . $file;
}

sub _features_path {
  shift->_qualify('features.bdb');
}

sub _parentage_path {
  shift->_qualify('parentage.bdb');
}

sub _type_path {
  shift->_qualify('types.idx');
}

sub _location_path {
  shift->_qualify('locations.idx');
}

sub _attribute_path {
  shift->_qualify('attributes.idx');
}

sub _notes_path {
  shift->_qualify('notes.idx');
}

sub _fasta_path {
  shift->_qualify('sequence.fa');
}

sub _mtime_path {
  shift->_qualify('mtime.stamp');
}
###########################################
###########################################
sub _features {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,
      $name,$class,$allow_aliases,
      $types,
      $attributes,
      $range_type,
      $iterator,
     ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
                    'NAME','CLASS','ALIASES',
                    ['TYPES','TYPE','PRIMARY_TAG'],
                    ['ATTRIBUTES','ATTRIBUTE'],
                    'RANGE_TYPE',
                    'ITERATOR',
                   ],@_);

  my (@from,@where,@args,@group);
  $range_type ||= 'overlaps';

  my @result;
  unless (defined $name or defined $seq_id or defined $types or defined $attributes) {
    my $is_indexed = $self->index_db('is_indexed');
    @result = $is_indexed ? grep {$is_indexed->{$_}} keys %{$self->db}
                          : grep { !/^\./ } keys %{$self->db};
  }

  my %found;
  my $result = 1;

  if (defined($name)) {
    # hacky backward compatibility workaround
    undef $class if $class && $class eq 'Sequence';
    $name = "$class:$name" if defined $class && length $class > 0;
    $result &&= $self->filter_by_name($name,$allow_aliases,\%found);
  }

  if (defined $seq_id) {
    $result &&= $self->filter_by_location($seq_id,$start,$end,$strand,$range_type,\%found);
  }

  if (defined $types) {
    $result &&= $self->filter_by_type($types,\%found);
  }

  if (defined $attributes) {
    $result &&= $self->filter_by_attribute($attributes,\%found);
  }

  push @result,keys %found if $result;

  return $iterator ? Bio::DB::SeqFeature::Store::berkeleydb::Iterator->new($self,\@result)
                   : map {$self->fetch($_)} @result;
}
sub filter_by_name {
  my $self = shift;
  my ($name,$allow_aliases,$filter) = @_;

  my $index = $self->index_db('names');
  my $db    = tied(%$index);

  my ($stem,$regexp) = $self->glob_match($name);
  $stem   ||= $name;
  $regexp ||= $name;
  $regexp .= "(?:_2)?" if $allow_aliases;

  my $key = $stem;
  my ($value,@results);
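  # A DB_File BTREE scan: seq() with R_CURSOR positions the cursor at the first
  # key greater than or equal to $key (here the literal stem of the search
  # term), and R_NEXT walks forward while the keys still match the pattern.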
  for (my $status = $db->seq($key,$value,R_CURSOR);
       $status == 0 and $key =~ /^$regexp$/i;
       $status = $db->seq($key,$value,R_NEXT)) {
    next if %$filter && !$filter->{$value}; # don't bother
    push @results,$value;
  }
  $self->update_filter($filter,\@results);
}
sub filter_by_type {
  my $self = shift;
  my ($types,$filter) = @_;
  my @types = ref $types eq 'ARRAY' ? @$types : $types;

  my $index = $self->index_db('types');
  my $db    = tied(%$index);

  my @results;

  for my $type (@types) {
    my ($primary_tag,$source_tag);
    if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
      $primary_tag = $type->method;
      $source_tag  = $type->source;
    } else {
      ($primary_tag,$source_tag) = split ':',$type,2;
    }
    my $match = defined $source_tag ? "^$primary_tag:$source_tag\$" : "^$primary_tag:";
    my $key   = lc "$primary_tag:$source_tag";
    my $value;

    # If filter is already provided, then it is usually faster to
    # fetch each object.
    if (%$filter) {
      for my $id (keys %$filter) {
        my $obj = $self->_fetch($id) or next;
        push @results,$id if $obj->type =~ /$match/i;
      }
    } else {
      for (my $status = $db->seq($key,$value,R_CURSOR);
           $status == 0 && $key =~ /$match/i;
           $status = $db->seq($key,$value,R_NEXT)) {
        next if %$filter && !$filter->{$value}; # don't even bother
        push @results,$value;
      }
    }
  }

  $self->update_filter($filter,\@results);
}
sub filter_by_location {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,$range_type,$filter) = @_;

  my $index    = $self->index_db('locations');
  my $db       = tied(%$index);

  my $binstart = defined $start ? sprintf("%06d",int $start/BINSIZE) : '';
  my $binend   = defined $end   ? sprintf("%06d",int $end/BINSIZE)   : 'z'; # beyond a number

  my %seenit;
  my @results;

  $start = MININT  if !defined $start;
  $end   = MAXINT  if !defined $end;
  my $version_2 = $self->db_version > 1;

  if ($range_type eq 'overlaps' or $range_type eq 'contains') {
    my $key     = $version_2 ? "\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
    my $keystop = $version_2 ? "\L$seq_id\E.$binend"   : "\L$seq_id\E$binend";
    my $value;

    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key le $keystop;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      if ($range_type eq 'overlaps') {
        next unless $fend >= $start && $fstart <= $end;
      }
      elsif ($range_type eq 'contains') {
        next unless $fstart >= $start && $fend <= $end;
      }
      next if %$filter && !$filter->{$id}; # don't bother
      push @results,$id;
    }
  }

  # for contained in, we look for features originating and terminating outside the specified range
  # this is incredibly inefficient, but fortunately the query is rare (?)
  elsif ($range_type eq 'contained_in') {
    my $key     = $version_2 ? "\L$seq_id." : "\L$seq_id";
    my $keystop = $version_2 ? "\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
    my $value;

    # do the left part of the range
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key le $keystop;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id}; # don't bother
      push @results,$id;
    }

    # do the right part of the range
    $key = "\L$seq_id\E.$binend";
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id}; # don't bother
      push @results,$id;
    }
  }

  $self->update_filter($filter,\@results);
}
  my $index = $self->index_db('attributes');
  my %a     = map {s/:.+$//; $_=> 1} keys %$index;
sub filter_by_attribute {
  my $self = shift;
  my ($attributes,$filter) = @_;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);

  my $result;

  for my $att_name (keys %$attributes) {
    my @result;
    my @search_terms = ref($attributes->{$att_name}) && ref($attributes->{$att_name}) eq 'ARRAY'
                         ? @{$attributes->{$att_name}} : $attributes->{$att_name};

    for my $v (@search_terms) {
      my ($stem,$regexp) = $self->glob_match($v);
      $stem   ||= $v;
      $regexp ||= $v;
      my $key = "\L${att_name}:${stem}\E";
      my $value;
      for (my $status = $db->seq($key,$value,R_CURSOR);
           $status == 0 && $key =~ /^$att_name:$regexp$/i;
           $status = $db->seq($key,$value,R_NEXT)) {
        next if %$filter && !$filter->{$value}; # don't bother
        push @result,$value;
      }
    }
    $result ||= $self->update_filter($filter,\@result);
  }

  $result;
}
sub _search_attributes {
  my $self = shift;
  my ($search_string,$attribute_array,$limit) = @_;
  $search_string =~ tr/*?//d;
  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);

  my (%results,%notes);

  for my $tag (@$attribute_array) {
    my $id;
    my $key = "\L$tag:\E";
    for (my $status = $db->seq($key,$id,R_CURSOR);
         $status == 0 and $key =~ /^$tag:(.*)/i;
         $status = $db->seq($key,$id,R_NEXT)) {
      my $text = $1;
      next unless $text =~ /$search/;
      for my $w (@words) {
        my @hits = $text =~ /($w)/ig or next;
        $results{$id} += @hits;
      }
      $notes{$id} .= "$text ";
    }
  }

  my @results;
  for my $id (keys %results) {
    my $hits = $results{$id};
    my $note = $notes{$id};
    my $relevance = 10 * $hits;
    my $feature   = $self->fetch($id) or next;
    my $name      = $feature->display_name or next;
    my $type      = $feature->type;
    push @results,[$name,$note,$relevance,$type,$id];
  }

  return @results;
}
sub _search_notes {
  my $self = shift;
  my ($search_string,$limit) = @_;

  $search_string =~ tr/*?//d;

  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my (%found,$found);
  my $note_index = $self->notes_db;
  seek($note_index,0,0); # back to start
  while (<$note_index>) {
    next unless /$search/;
    my ($note,$uu) = split "\t";
    $found{unpack("u*",$uu)}++;
    last if $limit && ++$found >= $limit;
  }

  my (@features, @matches);
  for my $idx (keys %found) {
    my $feature = $self->fetch($idx) or next;
    my @values  = $feature->get_tag_values('Note') if $feature->has_tag('Note');
    my $value   = "@values";

    my $hits;
    $hits++ while $value =~ /($search)/ig; # count the number of times we were hit
    push @matches,$hits;
    push @features,$feature;
  }

  my @results;
  for (my $i=0; $i<@matches; $i++) {
    my $feature = $features[$i];
    my $matches = $matches[$i];

    my $relevance = 10 * $matches;
    my $note;
    $note = join ' ',$feature->get_tag_values('Note') if $feature->has_tag('Note');
    push @results,[$feature->display_name,$note,$relevance];
  }
  @results;
}
# convert a glob expression into a (stem, regexp) pair, e.g. 'abc*' => ('abc', 'abc.*');
# returns an empty list when the term contains no wildcard
sub glob_match {
  my $self = shift;
  my $term = shift;
  return unless $term =~ /([^*?]*)(?:^|[^\\])?[*?]/;
  my $stem = $1;
  $term =~ s/(^|[^\\])([+\[\]^{}\$|\(\).])/$1\\$2/g;
  $term =~ s/(^|[^\\])\*/$1.*/g;
  $term =~ s/(^|[^\\])\?/$1./g;
  return ($stem,$term);
}
sub update_filter {
  my $self = shift;
  my ($filter,$results) = @_;
  return unless @$results;

  if (%$filter) {
    my @filtered = grep {$filter->{$_}} @$results;
    %$filter     = map {$_=>1} @filtered;
  } else {
    %$filter     = map {$_=>1} @$results;
  }

  1;
}
sub types {
  my $self = shift;
  eval "require Bio::DB::GFF::Typename"
    unless Bio::DB::GFF::Typename->can('new');

  my $index = $self->index_db('types');
  my $db    = tied(%$index);
  return map {Bio::DB::GFF::Typename->new($_)} keys %$index;
}
sub _insert_sequence {
  my $self = shift;
  my ($seqid,$seq,$offset) = @_;
  my $dna_fh = $self->private_fasta_file or return;
  if ($offset == 0) { # start of the sequence
    print $dna_fh ">$seqid\n";
  }
  print $dna_fh $seq,"\n";
}
sub _fetch_sequence {
  my $self = shift;
  my ($seqid,$start,$end) = @_;
  my $db = $self->dna_db or return;
  return $db->seq($seqid,$start,$end);
}
sub private_fasta_file {
  my $self = shift;
  return $self->{fasta_fh} if exists $self->{fasta_fh};
  $self->{fasta_file} = $self->_qualify("sequence.fa");
  return $self->{fasta_fh} = IO::File->new($self->{fasta_file},">");
}
sub finish_bulk_update {
  my $self = shift;
  if (my $fh = $self->{fasta_fh}) {
    $fh->close;
    $self->{fasta_db} = Bio::DB::Fasta::Subdir->new($self->{fasta_file});
  }
}
  return $db->{'.version'} || 1.00;
  $self->_close_databases();
  $self->private_fasta_file->close;
  rmtree($self->directory,0,1) if $self->temporary && -e $self->directory;
# TIE interface -- a little annoying because we are storing magic ".variable"
# meta-variables in the same data structure as the IDs, so these variables
# must be skipped over when iterating.
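# (For example, the bookkeeping keys '.version' and '.next_id' live in the same
# DB_HASH as the numeric primary IDs, which is why the loops below skip any key
# beginning with a dot.)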
  while ( ($key,$value) = each %{$db}) {
    last unless $key =~ /^\./;
  }

  while ( ($key,$value) = each %$db) {
    last unless $key =~ /^\./;
  }

  return exists $self->db->{$id};

  my $obj = $self->fetch($id) or return;
  $self->_delete_indexes($obj,$id);
  delete $self->db->{$id};

  $self->_close_databases();
  $self->_delete_databases();
  my ($write,$create) = $self->_permissions;
  $self->_open_databases($write,$create);

  return scalar %{$self->db};
package Bio::DB::SeqFeature::Store::berkeleydb::Iterator;

sub new {
  my $class = shift;
  my $store = shift;
  my $ids   = shift;
  return bless {store => $store,
                ids   => $ids},ref($class) || $class;
}

sub next_seq {
  my $self  = shift;
  my $store = $self->{store} or return;
  my $id    = shift @{$self->{ids}};
  defined $id or return;
  return $store->fetch($id);
}
package Bio::DB::Fasta::Subdir;

use base 'Bio::DB::Fasta';

# alter calling arguments so that the index file is placed in a subdirectory
# named "indexes"
sub new {
  my ($class, $path, %opts) = @_;
  $opts{-index_name} = File::Spec->catfile($path,'indexes','fasta.index');
  return Bio::DB::Fasta->new($path, %opts);
}

sub _calculate_offsets {
  my ($self, @args) = @_;
  return $self->SUPER::_calculate_offsets(@args);
}
=head1 BUGS

This is an early version, so there are certainly some bugs. Please
use the BioPerl bug tracking system to report bugs.

=head1 SEE ALSO

L<Bio::DB::SeqFeature>,
L<Bio::DB::SeqFeature::Store>,
L<Bio::DB::SeqFeature::GFF3Loader>,
L<Bio::DB::SeqFeature::Segment>,
L<Bio::DB::SeqFeature::Store::memory>,
L<Bio::DB::SeqFeature::Store::DBI::mysql>,

=head1 AUTHOR

Lincoln Stein E<lt>lstein@cshl.orgE<gt>.

Copyright (c) 2006 Cold Spring Harbor Laboratory.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.