package Bio::DB::GFF::Adaptor::berkeleydb;

=head1 NAME

Bio::DB::GFF::Adaptor::berkeleydb -- Bio::DB::GFF database adaptor for Berkeley DB-indexed databases

=head1 SYNOPSIS

  my $db = Bio::DB::GFF->new(-adaptor => 'berkeleydb',
                             -create  => 1,  # on initial build you need this
                             -dsn     => '/usr/local/share/gff/dmel');

  # initialize an empty database, then load GFF and FASTA files
  $db->load_gff('/home/drosophila_R3.2.gff');
  $db->load_fasta('/home/drosophila_R3.2.fa');

  my $segment  = $db->segment(Chromosome => '1R');
  my $subseg   = $segment->subseq(5000,6000);
  my @features = $subseg->features('gene');

See L<Bio::DB::GFF> for other methods.

=head1 DESCRIPTION

This adaptor implements a Berkeley DB-indexed version of Bio::DB::GFF.
It requires the DB_File and Storable modules. It can be used to store
and retrieve short to medium-length GFF files of several million
features in length.

Use Bio::DB::GFF-E<gt>new() to construct new instances of this class.
The following named arguments are recommended:

 -adaptor          Set to "berkeleydb" to create an instance of this class.

 -dsn              Path to directory where the database index files will be stored
                   (alias -db).

 -autoindex        Monitor the indicated directory path for FASTA and GFF files, and
                   update the indexes automatically if they change (alias -dir).

 -write            Set to a true value in order to update the database.

 -create           Set to a true value to create the database the first time.

 -tmp              Location of temporary directory for storing intermediate files
                   during certain queries.

 -preferred_groups Specify the grouping tag. See L<Bio::DB::GFF>.

The -dsn argument selects the directory in which to store the database
index files. If the directory does not exist it will be created
automatically, provided that the current process has sufficient
privileges. If no -dsn argument is specified, a database named "test"
will be created in your system's temporary files directory.

The -tmp argument specifies the temporary directory to use for storing
intermediate search results. If not specified, your system's temporary
files directory will be used. On Unix systems, the TMPDIR environment
variable is honored. Note that some queries can require a lot of
temporary disk space.

The -autoindex argument, if present, selects a directory to be
monitored for GFF and FASTA files (which can be compressed with the
gzip program if desired). Whenever any file in this directory is
changed, the index files will be updated. Note that the indexing can
take a long time to run: anywhere from 5 to 10 minutes for a million
features. An alias for this argument is -dir, which gives this adaptor
a similar flavor to the "memory" adaptor.

-dsn and -dir can point to the same directory. If -dir is given but
-dsn is absent the index files will be stored into the directory
containing the source files. For autoindexing to work, you must
specify the same -dir path each time you open the database.

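A minimal autoindexing setup might look like the following sketch (the
directory path is just an example):

  my $db = Bio::DB::GFF->new(-adaptor => 'berkeleydb',
                             -dir     => '/usr/local/share/gff/dmel');

The index files are rebuilt in that directory whenever the GFF or FASTA
files there are newer than the stored timestamp.
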
If you do not choose autoindexing, then you will want to load the
database using the bp_load_gff.pl command-line tool. For example:

 bp_load_gff.pl -a berkeleydb -c -d /usr/local/share/gff/dmel dna1.fa dna2.fa features.gff

See L<Bio::DB::GFF> for inherited methods.

The various get_Stream_* methods and the features() method with the
-iterator argument only return an iterator after the query runs
completely and the module has been able to generate a temporary
results file on disk. This means that iteration is not as big a win as
it is for the relational-database adaptors.

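Iterator-style access looks something like the following sketch (the
feature type is an arbitrary example):

  my $iterator = $db->features(-types => 'gene', -iterator => 1);
  while (my $feature = $iterator->next_seq) {
      # process $feature
  }

Because the temporary results file must be written first, the loop only
starts after the underlying query has finished.
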
Like the dbi::mysqlopt adaptor, this module uses a binning scheme to
speed up range-based searches. The binning scheme used here imposes a
hard-coded 1 gigabase (1000 Mbase) limit on the size of the largest
chromosome or other reference sequence.

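As an illustration of the general idea (not necessarily this module's
exact code), a feature spanning [$start,$stop] is assigned to the
smallest power-of-ten tier that contains it, and the tiers stop at one
gigabase:

  my $tier = 1_000;                                   # MIN_BIN
  $tier *= 10 until int($start/$tier) == int($stop/$tier)
                 || $tier >= 1_000_000_000;           # MAX_BIN
  my $bin = sprintf("%d.%06d", $tier, int($start/$tier));

A reference sequence longer than the largest tier could not be covered
by a single top-level bin, hence the hard-coded limit.
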
=head1 SEE ALSO

L<Bio::DB::GFF>, L<bioperl>

=head1 AUTHORS

Vsevolod (Simon) Ilyushchenko E<lt>simonf@cshl.eduE<gt>
Lincoln Stein E<lt>lstein@cshl.eduE<gt>

Copyright (c) 2005 Cold Spring Harbor Laboratory.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut

use strict;

use DB_File;
use File::Path 'mkpath';
use File::Spec;
use File::Temp 'tempfile';

use Bio::DB::Fasta;
use Bio::DB::GFF::Util::Rearrange; # for rearrange()
use Bio::DB::GFF::Util::Binning;
use Bio::DB::GFF::Adaptor::berkeleydb::iterator;
use Bio::DB::GFF::Adaptor::memory::feature_serializer; # qw(feature2string string2feature @hash2array_map);

# this is the smallest bin (1 K)
use constant MIN_BIN     => 1000;
# this is the largest that any reference sequence can be (1000 megabases)
use constant MAX_BIN     => 1_000_000_000;
use constant MAX_SEGMENT => 1_000_000_000;  # the largest a segment can get

# We have to define a limit because BerkeleyDB sorts keys in lexicographic order,
# so all the bin numbers have to have the same length.
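# (For example, as strings "9" sorts after "10"; zero-padding every number to
# MAX_NUM_LENGTH digits, as normalizeNumber() does below, restores numeric order.)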
use constant MAX_NUM_LENGTH => length(MAX_BIN);

use base 'Bio::DB::GFF::Adaptor::memory';

sub new {
  my $class = shift;
  my ($dbdir,$preferred_groups,$autoindex,$write,$create,$tmpdir) = rearrange([
      [qw(DSN DB)],
      'PREFERRED_GROUPS',
      [qw(DIR AUTOINDEX)],
      [qw(WRITE WRITABLE)],
      'CREATE',
      'TMP',
     ],@_);

  $tmpdir ||= File::Spec->tmpdir;
  $dbdir  ||= $autoindex;
  $dbdir  ||= "$tmpdir/test";

  my $self = bless {},$class;
  $self->dsn($dbdir);
  $self->tmpdir($tmpdir);
  $self->preferred_groups($preferred_groups) if defined $preferred_groups;
  $self->_autoindex($autoindex)              if $autoindex;
  $self->_open_databases($write,$create);
  return $self;
}

sub _autoindex {
  my $self    = shift;
  my $autodir = shift;

  my $dir    = $self->dsn;
  my %ignore = map {$_=>1} ($self->_index_file,$self->_data_file,
                            $self->_fasta_file,$self->_temp_file,
                            $self->_timestamp_file);

  my $maxtime   = 0;
  my $maxfatime = 0;

  opendir (my $D,$autodir) or $self->throw("Couldn't open directory $autodir for reading: $!");

  while (defined (my $node = readdir($D))) {
    next if $node =~ /^\./;
    my $path = "$dir/$node";
    next if $ignore{$path};
    next unless -f $path;
    my $mtime = _mtime(\*_);  # not a typo
    $maxtime   = $mtime if $mtime > $maxtime;
    $maxfatime = $mtime if $mtime > $maxfatime && $node =~ /\.(?:fa|fasta|dna)(?:\.gz)?$/;
  }

  my $timestamp_time  = _mtime($self->_timestamp_file) || 0;
  my $all_files_exist = -e $self->_index_file && -e $self->_data_file && (-e $self->_fasta_file || !$maxfatime);

  # to avoid rebuilding FASTA files if not changed
  my $spare_fasta = $maxfatime > 0 && $maxfatime < $timestamp_time && -e $self->_fasta_file;

  if ($maxtime > $timestamp_time || !$all_files_exist) {
    print STDERR __PACKAGE__,": Reindexing files in $dir. This may take a while....\n";
    $self->do_initialize(1,$spare_fasta);
    $self->load_gff($autodir,1);
    $self->load_fasta($autodir,1) unless $spare_fasta;
    print STDERR __PACKAGE__,": Reindexing done\n";
  }
  else {
    $self->_open_databases();
  }
}

sub _open_databases {
  my $self = shift;
  my ($write,$create) = @_;

  my $dsn = $self->dsn;
  unless (-d $dsn) {  # directory does not exist
    $create or $self->throw("Directory $dsn does not exist and you did not specify the -create flag");
    mkpath($dsn) or $self->throw("Couldn't create database directory $dsn: $!");
  }

  my %db;
  local $DB_BTREE->{flags} = R_DUP;
  $DB_BTREE->{compare}     = sub { lc($_[0]) cmp lc($_[1]) };
  my $flags = O_RDONLY;
  $flags   |= O_RDWR  if $write;
  $flags   |= O_CREAT if $create;

  tie(%db,'DB_File',$self->_index_file,$flags,0666,$DB_BTREE)
    or $self->throw("Couldn't tie ".$self->_index_file.": $!");
  $self->{db}   = \%db;
  $self->{data} = FeatureStore->new($self->_data_file,$write,$create);

  if (-e $self->_fasta_file) {
    my $dna_db = Bio::DB::Fasta->new($self->_fasta_file) or $self->throw("Can't reindex sequence file: $@");
    $self->dna_db($dna_db);
  }

  my $mode = $write ? "+>>"
                    : "<";

  my $notes_file = $self->_notes_file;
  open my $F, $mode, $notes_file or $self->throw("Could not open file '$notes_file': $!");
  $self->{notes} = $F;
}

sub _close_databases {
  my $self = shift;
  delete $self->{db};
  delete $self->{data};
  delete $self->{notes};
}

sub _delete_features {
  my $self        = shift;
  my @feature_ids = @_;

  my $last_id = $self->{data}->last_id;
  for my $id (@feature_ids) {
    next unless $id >= 0 && $id < $last_id;
    my $feat = $self->{data}->get($id) or next;
    $self->{data}->remove($id);
    $self->_bump_class_count($feat->{gclass},-1);
    my @keys = $self->_secondary_keys($feat);
    $self->db->del_dup($_,$id) foreach @keys;
  }
}

sub _secondary_keys {
  my $self = shift;
  my $feat = shift;
  return (
          "__name__".lc(join ":",$feat->{gclass},$feat->{gname}),
          "__bin__".lc("$feat->{ref}$;$feat->{bin}"),
          "__type__".join(':',$feat->{method},$feat->{source}),
          map {"__attr__".lc(join(':',$_->[0],$_->[1]))} @{$feat->{attributes}}
         );
}

sub _delete {
  my $self        = shift;
  my $delete_spec = shift;
  return $self->SUPER::_delete($delete_spec) if @{$delete_spec->{segments}} or @{$delete_spec->{types}};
  $self->throw("This operation would delete all feature data and -force not specified")
    unless $delete_spec->{force};
  my $deleted = $self->{db}{__count__};
  $self->{data} = FeatureStore->new($self->_data_file,1,1);
  return $deleted;
}

# with duplicates enabled, we cannot simply do $db->{__index__}++;
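# (With R_DUP set on the btree, assigning to an existing key adds another value
# under that key rather than replacing it, so the old __count__ entry has to be
# deleted before the new value is stored.)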
sub _bump_feature_count {
  my $self = shift;
  my $db   = $self->{db};
  if (@_) {
    delete $db->{__count__};
    return $db->{__count__} = shift;
  } else {
    my $index = $db->{__count__};
    delete $db->{__count__};
    $db->{__count__} = ($index || 0) + 1;
    return $index;
  }
}

sub _bump_class_count {
  my $self = shift;
  my ($class,$count) = @_;
  $count ||= 1;

  my $db       = $self->{db};
  my $key      = "__class__$class";
  my $newcount = ($db->{$key} || 0) + $count;
  delete $db->{$key};
  $db->{$key} = $newcount;
}

sub classes {
  my $self = shift;
  my $db   = $self->db;
  my %classes;

  my ($key,$value) = ('__class__',undef);
  for (my $status = $db->seq($key,$value,R_CURSOR);
       $status == 0;
       $status = $db->seq($key,$value,R_NEXT)) {
    my ($class) = $key =~ /^__class__(.+)/ or last;
    $classes{$class}++ if $value > 0;
  }
  my @classes = sort keys %classes;
  return @classes;
}

sub do_initialize {
  my $self        = shift;
  my $erase       = shift;
  my $spare_fasta = shift;  # used internally only!

  if ($erase) {
    $self->_close_databases;
    unlink $self->_index_file;
    unlink $self->_data_file;
    unlink $self->_notes_file;
    unless ($spare_fasta) {
      unlink $self->_fasta_file;
      unlink $self->_fasta_file.'.index';
    }
    unlink $self->_timestamp_file;
    $self->_open_databases(1,1);
  }
  1;
}

# load_sequence($fasta_filehandle,$first_sequence_id)
sub load_sequence {
  my $self = shift;
  my ($io_handle,$id) = @_;
  my $file = $self->_fasta_file;

  open my $F, '>>', $file or $self->throw("Could not append file '$file': $!");

  while (<$io_handle>) {
    print $F $_;   # assumed minimal body: copy the incoming FASTA stream to the sequence file
  }
  close $F;

  my $dna_db = Bio::DB::Fasta->new($file) or $self->throw("Can't reindex sequence file: $@");
  $self->dna_db($dna_db);
  $self->_touch_timestamp;
}

sub _mtime {
  my $file = shift;
  my @stat = stat($file);
  return $stat[9];
}

sub _index_file { return shift->dsn . "/bdb_features.btree" }
sub _data_file  { return shift->dsn . "/bdb_features.data"  }
sub _fasta_file { return shift->dsn . "/bdb_sequence.fa"    }
sub _notes_file { return shift->dsn . "/bdb_notes.idx"      }

sub _temp_file {
  my $self = shift;
  my (undef,$filename) = tempfile("bdb_temp_XXXXXX",DIR=>$self->tmpdir,OPEN=>0);
  return $filename;
}

sub _timestamp_file {
  my $self = shift;
  return $self->dsn . "/bdb_timestamp";
}

sub db {
  my $db = shift()->{db} or return;
  return tied(%$db);
}

sub dsn {
  my $self = shift;
  my $d    = $self->{dsn};
  $self->{dsn} = shift if @_;
  $d;
}

sub tmpdir {
  my $self = shift;
  my $d    = $self->{tmpdir};
  $self->{tmpdir} = shift if @_;
  $d;
}

sub load_gff_line {
  my ($self, $feat) = @_;

  $feat->{strand} = '' if $feat->{strand} && $feat->{strand} eq '.';
  $feat->{phase}  = '' if $feat->{phase}  && $feat->{phase}  eq '.';

  my $start = $feat->{start};
  my $stop  = $feat->{stop};
  my $type  = join(':',$feat->{method},$feat->{source});

  my $bin = bin($feat->{start},$feat->{stop},MIN_BIN);
  $feat->{bin} = $bin;

  my $id = $self->{data}->put($feat);
  $bin   = $self->normalizeNumber($bin);

  my $db = $self->{db};
  for my $skey ($self->_secondary_keys($feat)) {
    $db->{$skey} = $id;
  }

  # save searchable notes to separate index
  my $fh = $self->{notes};
  my @notes = map {$_->[1]} grep {lc $_->[0] eq 'note'} @{$feat->{attributes}};
  print $fh $_,"\t",pack("u*",$id) or $self->throw("An error occurred while updating indexes: $!")
    foreach @notes;

  $self->{records_loaded}++;
  $self->_bump_feature_count();
  $self->_bump_class_count($feat->{gclass});
}

  $self->{records_loaded} = 0;

  $self->db->sync && $self->throw("An error occurred while updating indexes: $!");
  $self->_touch_timestamp;
  $self->{records_loaded};

sub _touch_timestamp {
  my $self = shift;
  my $tsf  = $self->_timestamp_file;
  open my $F, '>', $tsf or $self->throw("Could not write file '$tsf': $!");
  print $F scalar(localtime);
  close $F;
}

# given sequence name, return (reference,start,stop,strand)
sub get_abscoords {
  my $self = shift;
  my ($name,$class,$refseq) = @_;

  if ($name =~ /[*?]/) {  # uh oh regexp time
    $name = quotemeta($name);
    $name =~ s/\\\*/.*/g;
    $name =~ s/\\\?/.?/g;
  }

  # Find all features that have the requested name and class.
  # Sort them by reference point.
  my @features = @{$self->retrieve_features(-table => 'name', -key=>"$class:$name")};
  if (!@features) {  # nothing matched exactly, so try aliases
    @features = @{$self->retrieve_features(-table=>'attr',-key=>"Alias:$name")};
  }

  my %refs;
  foreach my $feature (@features){
    push @{$refs{$feature->{ref}}},$feature;
  }

  # find out how many reference points we recovered
  unless (%refs) {
    $self->error("$name not found in database");
    return;
  }

  # compute min and max
  my ($ref) = keys %refs;
  my @found = @{$refs{$ref}};
  my ($strand,$start,$stop);

  my @found_segments;
  foreach my $ref (keys %refs) {
    next if defined($refseq) and $ref ne $refseq;
    my @found = @{$refs{$ref}};
    my ($strand,$start,$stop,$name);
    foreach (@found) {
      $strand ||= $_->{strand};
      $strand = '+' if $strand && $strand eq '.';
      $start  = $_->{start} if !defined($start) || $start > $_->{start};
      $stop   = $_->{stop}  if !defined($stop)  || $stop  < $_->{stop};
      $name ||= $_->{gname};
    }
    push @found_segments,[$ref,$class,$start,$stop,$strand,$name];
  }

  return \@found_segments;
}

sub get_types {
  my $self = shift;
  my ($srcseq,$class,$start,$stop,$want_count,$typelist) = @_;
  my (%obj,%result,$key,$value);
  $key = "__type__";

  if (!$srcseq) { # optimized full type list
    my $db     = $self->db;
    my $status = $db->seq($key,$value,R_CURSOR);

    while ($status == 0 && $key =~ /^__type__(.+)/) {
      my $type = $1;
      my ($method,$source) = split ':',$type;
      $obj{$type} = Bio::DB::GFF::Typename->new($method,$source);
      $result{$type}++;

      if ($want_count) {              # visit every duplicate so the counts are right
        $status = $db->seq($key,$value,R_NEXT);
      } else { # skip to next key set
        $key .= "\0";                 # smallest key greater than all duplicates of $key
        $status = $db->seq($key,$value,R_CURSOR)
      }
    }
  }

  else { # range search
    for my $feature (@{$self->_get_features_by_search_options(
                          {rangetype => 'overlaps',
                           refseq    => $srcseq,
                           refclass  => ($class || undef),
                           start     => ($start || undef),
                           stop      => ($stop  || undef),
                           types     => $typelist,
                          },
                          {}
                         )}) {
      my $type = Bio::DB::GFF::Typename->new($feature->{method},$feature->{source});
      $result{$type}++;
      $obj{$type} = $type;
    }
  }

  return $want_count ? %result : values %obj;
}

# Low level implementation of fetching a named feature.
# GFF annotations are named using the group class and name fields.
# May return zero, one, or several Bio::DB::GFF::Feature objects.

=head2 _feature_by_name

 Title   : _feature_by_name
 Usage   : $db->get_features_by_name($class,$name,$callback)
 Function: get a list of features by name and class
 Returns : count of number of features retrieved
 Args    : name of feature, class of feature, and a callback

This method is used internally. The callback arguments are those used
by make_feature().

=cut

sub _feature_by_name {
  my $self = shift;
  my ($class,$name,$location,$callback) = @_;
  $callback || $self->throw('must provide a callback argument');

  #use Devel::StackTrace;
  #warn Devel::StackTrace->new->as_string;

  my @features;
  my ($use_regexp, $use_glob,$using_alias_search);

  if ($name =~ /[*?]/) {  # uh oh regexp time

    #If there is only one trailing *, do a range search
    if ($name =~ /^([^\*]+)\*$/) {
      $name = $1;
      $use_glob++;
    }
    else {
      $name = quotemeta($name);
      $name =~ s/\\\*/.*/g;
      $name =~ s/\\\?/.?/g;
      $use_regexp++;
    }
  }

  if ($use_glob) {
    my $callback = sub {my $feat = shift; $feat->{gname} =~ /^$name/i};
    @features = @{$self->retrieve_features_range (-table    => 'name',
                                                  -start    => "$class:$name",
                                                  -do_while => $callback)};
  }
  elsif ($use_regexp) {
    my $filter = sub {my $feat = shift; $feat->{gname} =~ /$name/i};
    @features = @{$self->filter_features(-table=>'name', -filter => $filter)};
  }
  else {
    @features = @{$self->retrieve_features(-table=>'name', -key => "$class:$name")};
  }

  unless (@features) {  # nothing matched exactly, so try aliases
    $using_alias_search++;
    @features = @{$self->retrieve_features(-table=>'attr', -key=>"Alias:$name")};
  }

  foreach my $feature (@features){
    next unless $using_alias_search || $feature->{gclass} eq $class;

    if (defined $location) {
      next if $location->[0] ne $feature->{ref};
      next if $location->[1] && $location->[1] > $feature->{stop};
      next if $location->[2] && $location->[2] < $feature->{start};
    }
    $callback->(@{$feature}{@hash2array_map},0);
  }
}

#sub get_feature_by_attribute{
sub _feature_by_attribute{
  my $self = shift;
  my ($attributes,$callback) = @_;
  $callback || $self->throw('must provide a callback argument');

  my $feature_group_id = undef;

  #there could be more than one set of attributes......
  while (my ($key, $value) = each %$attributes) {

    my @features = @{$self->retrieve_features
                       (-table => "attr", -key => "$key:$value")};

    for my $feature (@features) {
      $callback->(@{$feature}{@hash2array_map},$feature_group_id);
    }
  }
}

sub search_notes {
  my $self = shift;
  my ($search_string,$limit) = @_;

  $search_string =~ tr/*?//d;

  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my (%found,$found);
  my $note_index = $self->{notes};
  seek($note_index,0,0);  # back to start

  while (<$note_index>) {
    next unless /$search/;
    my ($note,$uu) = split "\t";
    $found{unpack("u*",$uu)}++;
    last if $limit && ++$found >= $limit;
  }

  my (@features, @matches);
  for my $idx (keys %found) {
    my $feature = $self->{data}->get($idx) or next;

    my @attributes = @{$feature->{attributes}};
    my @values     = map {lc $_->[0] eq 'note' ? $_->[1] : ()} @attributes;
    my $value      = "@values";

    my $hits;
    $hits++ while $value =~ /($search)/ig;  # count the number of times we were hit
    push @matches,$hits;
    push @features,$feature;
  }

  my @results;
  for (my $i=0; $i<@matches; $i++)  {
    my $feature = $features[$i];
    my $matches = $matches[$i];

    my $relevance = 10 * $matches;
    my $featname  = Bio::DB::GFF::Featname->new($feature->{gclass}=>$feature->{gname});
    my $type      = Bio::DB::GFF::Typename->new($feature->{method}=>$feature->{source});
    my $note      = join ' ',map {$_->[1]} grep {$_->[0] eq 'Note'} @{$feature->{attributes}};
    push @results,[$featname,$note,$relevance,$type];
  }

  return @results;
}

sub _get_features_by_search_options {

  #The $data argument is not used and is preserved for superclass compatibility
  my ($self, $search,$options) = @_;

  my ($rangetype,$refseq,$class,$start,$stop,$types,$sparse,$order_by_group,$attributes,$temp_file) =
    (@{$search}{qw(rangetype refseq refclass start stop types)},
     @{$options}{qw(sparse sort_by_group ATTRIBUTES temp_file)}) ;

  $start = 0       unless defined($start);
  $stop  = MAX_BIN unless defined($stop);

  my $bin = bin($start,$stop,MIN_BIN);
  $bin    = $self->normalizeNumber($bin);

  my ($results,@features,%found,%results_table);
  if ($temp_file) {
    local $DB_BTREE->{flags} = R_DUP;
    # note: there is a race condition possible here, if someone reuses the
    # same name between the time we get the tmpfile name and the time we
    # ask DB_File to open it.
    tie(%results_table,'DB_File',$temp_file,O_RDWR|O_CREAT,0666,$DB_BTREE)
      or $self->throw("Couldn't tie temporary file ".$temp_file." for writing: $!");
    $results = \%results_table;
  } else {
    $results = \@features;
  }
  my $filter = sub {
    my $feature = shift;

    my $ref           = $feature->{ref};
    my $feature_start = $feature->{start};
    my $feature_stop  = $feature->{stop};
    my $feature_id    = $feature->{feature_id};

    return 0 if $found{$feature_id}++;

    if (defined $refseq) {
      return 0 unless lc $refseq eq lc $ref;
      $start = 0           unless defined($start);
      $stop  = MAX_SEGMENT unless defined($stop);

      if ($rangetype eq 'overlaps') {
        return 0 unless $feature_stop >= $start && $feature_start <= $stop;
      } elsif ($rangetype eq 'contains') {
        return 0 unless $feature_start >= $start && $feature_stop <= $stop;
      } elsif ($rangetype eq 'contained_in') {
        return 0 unless $feature_start <= $start && $feature_stop >= $stop;
      } else {
        return 0 unless $feature_start == $start && $feature_stop == $stop;
      }
    }

    my $feature_source = $feature->{source};
    my $feature_method = $feature->{method};

    if (defined $types && @$types){
      return 0 unless $self->_matching_typelist($feature_method,$feature_source,$types);
    }

    my $feature_attributes = $feature->{attributes};
    if (defined $attributes){
      return 0 unless $self->_matching_attributes($feature_attributes,$attributes);
    }

    return 1;
  };
  if (defined $refseq && !$sparse) {
    my $tier = MAX_BIN;
    while ($tier >= MIN_BIN) {
      my ($tier_start,$tier_stop) = (bin_bot($tier,$start),bin_top($tier,$stop));
      # warn "Using $tier_start $tier_stop\n";
      if ($tier_start == $tier_stop) {
        $self->retrieve_features(-table  => "bin",
                                 -key    => "$refseq$;$tier_start",
                                 -filter => $filter,
                                 -result => $results);
      } else {
        my $callback = sub {my $feat = shift; $feat->{bin} <= $tier_stop};
        $self->retrieve_features_range(-table    => "bin",
                                       -start    => "$refseq$;$tier_start",
                                       -do_while => $callback,
                                       -filter   => $filter,
                                       -result   => $results);
      }
      $tier /= 10;
    }
  }

  elsif (defined $types && @$types) {
    for (@$types) {
      my $type = join ':',@$_;
      $self->retrieve_features_range(-table    => 'type',
                                     -start    => $type,
                                     -do_while => sub { my $f = shift;
                                                        lc($f->{method}) eq lc($_->[0])
                                                          &&
                                                        lc($f->{source}||$_->[1]||'') eq lc($_->[1]||'')
                                                      },
                                     -filter   => $filter,
                                     -result   => $results);
    }
  }
  elsif (defined $attributes) {
    my ($attribute_name,$attribute_value) = each %$attributes; # pick first one
    $self->retrieve_features(-table  => 'attr',
                             -key    => "${attribute_name}:${attribute_value}",
                             -filter => $filter,
                             -result => $results);
  }

  else {
    $self->filter_features(-filter => $filter,-result=>$results);
  }

  return $results;
}

sub retrieve_features {
  my $self = shift;
  my ($table, $key, $filter, $result) = rearrange(['TABLE','KEY','FILTER', 'RESULT'],@_);

  my @result;
  $result ||= \@result;

  my @ids  = $self->db->get_dup("__".lc($table)."__".lc($key));
  my $data = $self->{data};
  local $^W = 0;  # because _hash_to_array() will generate lots of uninit values

  foreach my $id (@ids) {
    my $feat = $data->get($id);
    my $filter_result = $filter ? $filter->($feat) : 1;
    next unless $filter_result;
    if (ref $result eq 'HASH') {
      $result->{"$feat->{gclass}:$feat->{gname}"} = join ($;,$self->_hash_to_array($feat));
    } else {
      push @$result, $feat;
    }
    last if $filter_result == -1;
  }
  return $result;
}

sub retrieve_features_range {
  my $self = shift;
  my ($table, $start, $do_while, $filter, $result) = rearrange(['TABLE','START','DO_WHILE', 'FILTER', 'RESULT'],@_);
  local $^W = 0;  # because _hash_to_array will generate lots of uninit warnings

  my @result;
  $result ||= \@result;
  my ($id, $key, $value);

  $key   = "__".$table."__".$start;
  my $db = $self->db;

  for (my $status = $db->seq($key,$value,R_CURSOR);
       $status == 0;
       $status = $db->seq($key,$value,R_NEXT)) {

    my $feat = $self->{data}->get($value);
    last unless $do_while->($feat,$key);

    my $filter_result = $filter ? $filter->($feat) : 1;
    next unless $filter_result;

    if (ref $result eq 'HASH') {
      $result->{"$feat->{gclass}:$feat->{gname}"} = join($;,$self->_hash_to_array($feat));
    } else {
      push @$result,$feat;
    }
    last if $filter_result == -1;
  }

  return $result;
}

sub filter_features {
  my $self = shift;
  my ($filter,$result) = rearrange(['FILTER','RESULT'],@_);

  my @result;
  $result ||= \@result;

  my $data = $self->{data};
  $data->reset;
  while (my $feat = $data->next) {

    my $filter_result = $filter ? $filter->($feat) : 1;
    next unless $filter_result;

    if (ref($result) eq 'HASH') {
      $result->{"$feat->{gclass}:$feat->{gname}"} = join($;,$self->_hash_to_array($feat));
    } else {
      push @$result,$feat;
    }
    last if $filter_result == -1;
  }

  return $result;
}

sub _basic_features_by_id{
  my $self  = shift;
  my ($ids) = @_;

  $ids = [$ids] unless ref($ids) eq 'ARRAY';

  my @result;
  my $data = $self->{data};
  for my $feature_id (@$ids){
    push @result, $data->get($feature_id);
  }

  return wantarray() ? @result : $result[0];
}

sub normalizeNumber {
  my ($self, $num) = @_;
  while ((length $num) < MAX_NUM_LENGTH) {
    $num = "0" . $num;  # zero-pad so that lexicographic order matches numeric order
  }
  return $num;
}

sub get_features_iterator {
  my $self = shift;

  my ($search,$options,$callback) = @_;
  $callback || $self->throw('must provide a callback argument');
  $options->{temp_file} = $self->_temp_file;

  my $results = $self->_get_features_by_search_options($search,$options);
  return Bio::DB::GFF::Adaptor::berkeleydb::iterator->new($results,$callback,$options->{temp_file});
}

#--------------------------------------------------------------------------#

package FeatureStore;

# This is a very specialized package that stores serialized features onto a file-based
# array. The array is indexed by the physical offset to the beginning of each serialized
# feature.
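# (For illustration, inferred from put() and get() below: each record is a 16-bit
# big-endian length followed by the serialized feature, so a record at byte offset
# $off could be read back with something like
#    seek($fh, $off, SEEK_SET);
#    read($fh, my $len, 2);
#    read($fh, my $packed, unpack("n", $len));
#    my $feature = string2feature($packed); )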
use Fcntl qw(SEEK_SET SEEK_END);
use base 'Bio::Root::Root';
use Bio::DB::GFF::Adaptor::memory::feature_serializer; # qw(feature2string string2feature @hash2array_map);

sub new {
  my $class  = shift;
  my $dbname = shift or $class->throw("must provide a filepath argument");
  my ($write,$create) = @_;

  my $mode = $create ? "+>"
           : $write  ? "+<"
           :           "<";

  open my $F, $mode, $dbname or $class->throw("Could not open file '$dbname': $!");
  my $self = bless { fh => $F, next_idx => 0 }, $class;
  return $self;
}

sub put {
  my $self    = shift;
  my $feature = shift;
  my $fh = $self->{fh};
  seek($fh,0,SEEK_END);
  my $offset = tell($fh) || 0;

  $self->{last_id} = $offset;

  my $id = pack("L",$offset);
  $feature->{feature_id} = $id;
  my $value = feature2string($feature);
  print $fh pack("n/a*",$value) or $self->throw("An error occurred while updating the data file: $!");

  return $id;
}

sub get {
  my $self = shift;
  my $idx  = shift;
  my $offset = unpack("L",$idx);
  my $fh = $self->{fh};

  my ($value,$length);

  seek($fh,$offset,SEEK_SET);
  return unless read($fh,$length,2);
  return unless read($fh,$value,unpack("n",$length));
  $self->{next_idx} = tell($fh);
  return if substr($value,0,1) eq "\0";
  return string2feature($value);
}

sub next {
  my $self = shift;
  my $fh   = $self->{fh};
  my $result;
  do {
    $result = $self->get(pack("L",$self->{next_idx}));
  } until $result || eof($fh);
  $self->{next_idx} = 0 unless $result;
  return $result;
}

sub remove {
  my $self = shift;
  my $id   = shift;
  my $offset = unpack("L",$id);
  my $fh = $self->{fh};
  my ($value,$length);
  seek($fh,$offset,SEEK_SET);
  return unless read($fh,$length,2);
  print $fh "\0" x unpack("n",$length);  # null it out
}

sub _seek {
  my $self = shift;
  my $idx  = shift;
  my $offset = unpack("L",$idx);
  seek($self->{fh},$offset,SEEK_SET);
  $self->{next_idx} = tell($self->{fh});
}

sub reset {
  my $self = shift;
  $self->_seek(pack("L",0));
}

sub _feature2string {
  my $feature = shift;
  my @a = @{$feature}{@hash2array_map};
  push @a,map {@$_} @{$feature->{attributes}} if $feature->{attributes};
  return join $;,@a;
}

sub _string2feature {
  my $string = shift;
  my (%feature,@attributes);

  (@feature{@hash2array_map},@attributes) = split $;,$string;
  while (@attributes) {
    my ($key,$value) = splice(@attributes,0,2);
    push @{$feature{attributes}},[$key,$value];
  }
  $feature{group_id} = undef;
  return \%feature;
}

1;