package Bio::DB::GFF::Adaptor::berkeleydb;

=head1 NAME

Bio::DB::GFF::Adaptor::berkeleydb -- Bio::DB::GFF database adaptor for Berkeley DB-indexed databases

=head1 SYNOPSIS

  my $db = Bio::DB::GFF->new(-adaptor => 'berkeleydb',
                             -create  => 1,  # on initial build you need this
                             -dsn     => '/usr/local/share/gff/dmel');

  # initialize an empty database, then load GFF and FASTA files
  $db->load_gff('/home/drosophila_R3.2.gff');
  $db->load_fasta('/home/drosophila_R3.2.fa');

  my $segment  = $db->segment(Chromosome => '1R');
  my $subseg   = $segment->subseq(5000,6000);
  my @features = $subseg->features('gene');

See L<Bio::DB::GFF> for other methods.

=head1 DESCRIPTION

This adaptor implements a Berkeley DB-indexed version of Bio::DB::GFF.
It requires the DB_File and Storable modules. It can be used to store
and retrieve short to medium-length GFF files of several million
features in length.

Use Bio::DB::GFF-E<gt>new() to construct new instances of this class.
The following named arguments are recommended:

 -adaptor          Set to "berkeleydb" to create an instance of this class.

 -dsn              Path to directory where the database index files will be stored
                   (alias -db).

 -autoindex        Monitor the indicated directory path for FASTA and GFF files, and
                   update the indexes automatically if they change (alias -dir).

 -write            Set to a true value in order to update the database.

 -create           Set to a true value to create the database the first time.

 -tmp              Location of temporary directory for storing intermediate files
                   during certain queries.

 -preferred_groups Specify the grouping tag. See L<Bio::DB::GFF>.

The -dsn argument selects the directory in which to store the database
index files. If the directory does not exist it will be created
automatically, provided that the current process has sufficient
privileges. If no -dsn argument is specified, a database named "test"
will be created in your system's temporary files directory.

The -tmp argument specifies the temporary directory to use for storing
intermediate search results. If not specified, your system's temporary
files directory will be used. On Unix systems, the TMPDIR environment
variable is honored. Note that some queries can require a lot of
temporary disk space.

The -autoindex argument, if present, selects a directory to be
monitored for GFF and FASTA files (which can be compressed with the
gzip program if desired). Whenever any file in this directory is
changed, the index files will be updated. Note that the indexing can
take a long time to run: anywhere from 5 to 10 minutes for a million
features. An alias for this argument is -dir, which gives this adaptor
a similar flavor to the "memory" adaptor.

-dsn and -dir can point to the same directory. If -dir is given but
-dsn is absent the index files will be stored into the directory
containing the source files. For autoindexing to work, you must
specify the same -dir path each time you open the database.

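A minimal autoindexing setup might look like the following sketch (the
directory path is just an example):

  my $db = Bio::DB::GFF->new(-adaptor => 'berkeleydb',
                             -dir     => '/usr/local/share/gff/dmel');

The index files are rebuilt in that directory whenever the GFF or FASTA
files there are newer than the stored timestamp.
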
If you do not choose autoindexing, then you will want to load the
database using the bp_load_gff.pl command-line tool. For example:

 bp_load_gff.pl -a berkeleydb -c -d /usr/local/share/gff/dmel dna1.fa dna2.fa features.gff

See L<Bio::DB::GFF> for inherited methods.

The various get_Stream_* methods and the features() method with the
-iterator argument only return an iterator after the query runs
completely and the module has been able to generate a temporary
results file on disk. This means that iteration is not as big a win as
it is for the relational-database adaptors.

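Iterator-style access looks something like the following sketch (the
feature type is an arbitrary example):

  my $iterator = $db->features(-types => 'gene', -iterator => 1);
  while (my $feature = $iterator->next_seq) {
      # process $feature
  }

Because the temporary results file must be written first, the loop only
starts after the underlying query has finished.
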
Like the dbi::mysqlopt adaptor, this module uses a binning scheme to
speed up range-based searches. The binning scheme used here imposes a
hard-coded 1 gigabase (1000 Mbase) limit on the size of the largest
chromosome or other reference sequence.

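As an illustration of the general idea (not necessarily this module's
exact code), a feature spanning [$start,$stop] is assigned to the
smallest power-of-ten tier that contains it, and the tiers stop at one
gigabase:

  my $tier = 1_000;                                   # MIN_BIN
  $tier *= 10 until int($start/$tier) == int($stop/$tier)
                 || $tier >= 1_000_000_000;           # MAX_BIN
  my $bin = sprintf("%d.%06d", $tier, int($start/$tier));

A reference sequence longer than the largest tier could not be covered
by a single top-level bin, hence the hard-coded limit.
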
=head1 SEE ALSO

L<Bio::DB::GFF>, L<bioperl>

=head1 AUTHORS

Vsevolod (Simon) Ilyushchenko E<lt>simonf@cshl.eduE<gt>
Lincoln Stein E<lt>lstein@cshl.eduE<gt>

Copyright (c) 2005 Cold Spring Harbor Laboratory.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut

use strict;

use DB_File;
use File::Path 'mkpath';
use File::Spec;
use File::Temp 'tempfile';

use Bio::DB::Fasta;
use Bio::DB::GFF::Util::Rearrange; # for rearrange()
use Bio::DB::GFF::Util::Binning;
use Bio::DB::GFF::Adaptor::berkeleydb::iterator;
use Bio::DB::GFF::Adaptor::memory::feature_serializer; # qw(feature2string string2feature @hash2array_map);

# this is the smallest bin (1 K)
use constant MIN_BIN     => 1000;
# this is the largest that any reference sequence can be (1000 megabases)
use constant MAX_BIN     => 1_000_000_000;
use constant MAX_SEGMENT => 1_000_000_000;  # the largest a segment can get

# We have to define a limit because BerkeleyDB sorts keys in lexicographic order,
# so all the bin numbers have to have the same length.
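# (For example, as strings "9" sorts after "10"; zero-padding every number to
# MAX_NUM_LENGTH digits, as normalizeNumber() does below, restores numeric order.)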
use constant MAX_NUM_LENGTH => length(MAX_BIN);

use base 'Bio::DB::GFF::Adaptor::memory';

sub new {
  my $class = shift;
  my ($dbdir,$preferred_groups,$autoindex,$write,$create,$tmpdir) = rearrange([
      [qw(DSN DB)],
      'PREFERRED_GROUPS',
      [qw(DIR AUTOINDEX)],
      [qw(WRITE WRITABLE)],
      'CREATE',
      'TMP',
     ],@_);

  $tmpdir ||= File::Spec->tmpdir;
  $dbdir  ||= $autoindex;
  $dbdir  ||= "$tmpdir/test";

  my $self = bless {},$class;
  $self->dsn($dbdir);
  $self->tmpdir($tmpdir);
  $self->preferred_groups($preferred_groups) if defined $preferred_groups;
  $self->_autoindex($autoindex)              if $autoindex;
  $self->_open_databases($write,$create);
  return $self;
}

sub _autoindex {
  my $self    = shift;
  my $autodir = shift;

  my $dir    = $self->dsn;
  my %ignore = map {$_=>1} ($self->_index_file,$self->_data_file,
                            $self->_fasta_file,$self->_temp_file,
                            $self->_timestamp_file);

  my $maxtime   = 0;
  my $maxfatime = 0;

  opendir (my $D,$autodir) or $self->throw("Couldn't open directory $autodir for reading: $!");

  while (defined (my $node = readdir($D))) {
    next if $node =~ /^\./;
    my $path = "$dir/$node";
    next if $ignore{$path};
    next unless -f $path;
    my $mtime = _mtime(\*_);  # not a typo
    $maxtime   = $mtime if $mtime > $maxtime;
    $maxfatime = $mtime if $mtime > $maxfatime && $node =~ /\.(?:fa|fasta|dna)(?:\.gz)?$/;
  }

  my $timestamp_time  = _mtime($self->_timestamp_file) || 0;
  my $all_files_exist = -e $self->_index_file && -e $self->_data_file && (-e $self->_fasta_file || !$maxfatime);

  # to avoid rebuilding FASTA files if not changed
  my $spare_fasta = $maxfatime > 0 && $maxfatime < $timestamp_time && -e $self->_fasta_file;

  if ($maxtime > $timestamp_time || !$all_files_exist) {
    print STDERR __PACKAGE__,": Reindexing files in $dir. This may take a while....\n";
    $self->do_initialize(1,$spare_fasta);
    $self->load_gff($autodir,1);
    $self->load_fasta($autodir,1) unless $spare_fasta;
    print STDERR __PACKAGE__,": Reindexing done\n";
  }
  else {
    $self->_open_databases();
  }
}

sub _open_databases {
  my $self = shift;
  my ($write,$create) = @_;

  my $dsn = $self->dsn;
  unless (-d $dsn) {  # directory does not exist
    $create or $self->throw("Directory $dsn does not exist and you did not specify the -create flag");
    mkpath($dsn) or $self->throw("Couldn't create database directory $dsn: $!");
  }

  my %db;
  local $DB_BTREE->{flags} = R_DUP;
  $DB_BTREE->{compare}     = sub { lc($_[0]) cmp lc($_[1]) };
  my $flags = O_RDONLY;
  $flags   |= O_RDWR  if $write;
  $flags   |= O_CREAT if $create;

  tie(%db,'DB_File',$self->_index_file,$flags,0666,$DB_BTREE)
    or $self->throw("Couldn't tie ".$self->_index_file.": $!");
  $self->{db}   = \%db;
  $self->{data} = FeatureStore->new($self->_data_file,$write,$create);

  if (-e $self->_fasta_file) {
    my $dna_db = Bio::DB::Fasta->new($self->_fasta_file) or $self->throw("Can't reindex sequence file: $@");
    $self->dna_db($dna_db);
  }

  my $mode = $write ? "+>>"
                    : "<";

  my $notes_file = $self->_notes_file;
  open my $F, $mode, $notes_file or $self->throw("Could not open file '$notes_file': $!");
  $self->{notes} = $F;
}

sub _close_databases {
  my $self = shift;
  delete $self->{db};
  delete $self->{data};
  delete $self->{notes};
}

sub _delete_features {
  my $self        = shift;
  my @feature_ids = @_;

  my $last_id = $self->{data}->last_id;
  for my $id (@feature_ids) {
    next unless $id >= 0 && $id < $last_id;
    my $feat = $self->{data}->get($id) or next;
    $self->{data}->remove($id);
    $self->_bump_class_count($feat->{gclass},-1);
    my @keys = $self->_secondary_keys($feat);
    $self->db->del_dup($_,$id) foreach @keys;
  }
}

sub _secondary_keys {
  my $self = shift;
  my $feat = shift;
  return (
          "__name__".lc(join ":",$feat->{gclass},$feat->{gname}),
          "__bin__".lc("$feat->{ref}$;$feat->{bin}"),
          "__type__".join(':',$feat->{method},$feat->{source}),
          map {"__attr__".lc(join(':',$_->[0],$_->[1]))} @{$feat->{attributes}}
         );
}

sub _delete {
  my $self        = shift;
  my $delete_spec = shift;
  return $self->SUPER::_delete($delete_spec) if @{$delete_spec->{segments}} or @{$delete_spec->{types}};
  $self->throw("This operation would delete all feature data and -force not specified")
    unless $delete_spec->{force};
  my $deleted = $self->{db}{__count__};
  $self->{data} = FeatureStore->new($self->_data_file,1,1);
  return $deleted;
}

# with duplicates enabled, we cannot simply do $db->{__index__}++;
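# (With R_DUP set on the btree, assigning to an existing key adds another value
# under that key rather than replacing it, so the old __count__ entry has to be
# deleted before the new value is stored.)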
sub _bump_feature_count {
  my $self = shift;
  my $db   = $self->{db};
  if (@_) {
    delete $db->{__count__};
    return $db->{__count__} = shift;
  } else {
    my $index = $db->{__count__};
    delete $db->{__count__};
    $db->{__count__} = ($index || 0) + 1;
    return $index;
  }
}

sub _bump_class_count {
  my $self = shift;
  my ($class,$count) = @_;
  $count ||= 1;

  my $db       = $self->{db};
  my $key      = "__class__$class";
  my $newcount = ($db->{$key} || 0) + $count;
  delete $db->{$key};
  $db->{$key} = $newcount;
}

sub classes {
  my $self = shift;
  my $db   = $self->db;
  my %classes;

  my ($key,$value) = ('__class__',undef);
  for (my $status = $db->seq($key,$value,R_CURSOR);
       $status == 0;
       $status = $db->seq($key,$value,R_NEXT)) {
    my ($class) = $key =~ /^__class__(.+)/ or last;
    $classes{$class}++ if $value > 0;
  }
  my @classes = sort keys %classes;
  return @classes;
}

sub do_initialize {
  my $self        = shift;
  my $erase       = shift;
  my $spare_fasta = shift;  # used internally only!

  if ($erase) {
    $self->_close_databases;
    unlink $self->_index_file;
    unlink $self->_data_file;
    unlink $self->_notes_file;
    unless ($spare_fasta) {
      unlink $self->_fasta_file;
      unlink $self->_fasta_file.'.index';
    }
    unlink $self->_timestamp_file;
    $self->_open_databases(1,1);
  }
  1;
}

# load_sequence($fasta_filehandle,$first_sequence_id)
sub load_sequence {
  my $self = shift;
  my ($io_handle,$id) = @_;
  my $file = $self->_fasta_file;

  open my $F, '>>', $file or $self->throw("Could not append file '$file': $!");

  while (<$io_handle>) {
    print $F $_;   # assumed minimal body: copy the incoming FASTA stream to the sequence file
  }
  close $F;

  my $dna_db = Bio::DB::Fasta->new($file) or $self->throw("Can't reindex sequence file: $@");
  $self->dna_db($dna_db);
  $self->_touch_timestamp;
}

sub _mtime {
  my $file = shift;
  my @stat = stat($file);
  return $stat[9];
}

sub _index_file { return shift->dsn . "/bdb_features.btree" }
sub _data_file  { return shift->dsn . "/bdb_features.data"  }
sub _fasta_file { return shift->dsn . "/bdb_sequence.fa"    }
sub _notes_file { return shift->dsn . "/bdb_notes.idx"      }

sub _temp_file {
  my $self = shift;
  my (undef,$filename) = tempfile("bdb_temp_XXXXXX",DIR=>$self->tmpdir,OPEN=>0);
  return $filename;
}

sub _timestamp_file {
  my $self = shift;
  return $self->dsn . "/bdb_timestamp";
}

sub db {
  my $db = shift()->{db} or return;
  return tied(%$db);
}

sub dsn {
  my $self = shift;
  my $d    = $self->{dsn};
  $self->{dsn} = shift if @_;
  $d;
}

sub tmpdir {
  my $self = shift;
  my $d    = $self->{tmpdir};
  $self->{tmpdir} = shift if @_;
  $d;
}

sub load_gff_line {
  my ($self, $feat) = @_;

  $feat->{strand} = '' if $feat->{strand} && $feat->{strand} eq '.';
  $feat->{phase}  = '' if $feat->{phase}  && $feat->{phase}  eq '.';

  my $start = $feat->{start};
  my $stop  = $feat->{stop};
  my $type  = join(':',$feat->{method},$feat->{source});

  my $bin = bin($feat->{start},$feat->{stop},MIN_BIN);
  $feat->{bin} = $bin;

  my $id = $self->{data}->put($feat);
  $bin   = $self->normalizeNumber($bin);

  my $db = $self->{db};
  for my $skey ($self->_secondary_keys($feat)) {
    $db->{$skey} = $id;
  }

  # save searchable notes to separate index
  my $fh = $self->{notes};
  my @notes = map {$_->[1]} grep {lc $_->[0] eq 'note'} @{$feat->{attributes}};
  print $fh $_,"\t",pack("u*",$id) or $self->throw("An error occurred while updating indexes: $!")
    foreach @notes;

  $self->{records_loaded}++;
  $self->_bump_feature_count();
  $self->_bump_class_count($feat->{gclass});
}

  $self->{records_loaded} = 0;

  $self->db->sync && $self->throw("An error occurred while updating indexes: $!");
  $self->_touch_timestamp;
  $self->{records_loaded};

sub _touch_timestamp {
  my $self = shift;
  my $tsf  = $self->_timestamp_file;
  open my $F, '>', $tsf or $self->throw("Could not write file '$tsf': $!");
  print $F scalar(localtime);
  close $F;
}

# given sequence name, return (reference,start,stop,strand)
sub get_abscoords {
  my $self = shift;
  my ($name,$class,$refseq) = @_;

  if ($name =~ /[*?]/) {  # uh oh regexp time
    $name = quotemeta($name);
    $name =~ s/\\\*/.*/g;
    $name =~ s/\\\?/.?/g;
  }

  # Find all features that have the requested name and class.
  # Sort them by reference point.
  my @features = @{$self->retrieve_features(-table => 'name', -key=>"$class:$name")};
  if (!@features) {  # nothing matched exactly, so try aliases
    @features = @{$self->retrieve_features(-table=>'attr',-key=>"Alias:$name")};
  }

  my %refs;
  foreach my $feature (@features){
    push @{$refs{$feature->{ref}}},$feature;
  }

  # find out how many reference points we recovered
  unless (%refs) {
    $self->error("$name not found in database");
    return;
  }

  # compute min and max
  my ($ref) = keys %refs;
  my @found = @{$refs{$ref}};
  my ($strand,$start,$stop);

  my @found_segments;
  foreach my $ref (keys %refs) {
    next if defined($refseq) and $ref ne $refseq;
    my @found = @{$refs{$ref}};
    my ($strand,$start,$stop,$name);
    foreach (@found) {
      $strand ||= $_->{strand};
      $strand = '+' if $strand && $strand eq '.';
      $start  = $_->{start} if !defined($start) || $start > $_->{start};
      $stop   = $_->{stop}  if !defined($stop)  || $stop  < $_->{stop};
      $name ||= $_->{gname};
    }
    push @found_segments,[$ref,$class,$start,$stop,$strand,$name];
  }

  return \@found_segments;
}

sub get_types {
  my $self = shift;
  my ($srcseq,$class,$start,$stop,$want_count,$typelist) = @_;
  my (%obj,%result,$key,$value);
  $key = "__type__";

  if (!$srcseq) { # optimized full type list
    my $db     = $self->db;
    my $status = $db->seq($key,$value,R_CURSOR);

    while ($status == 0 && $key =~ /^__type__(.+)/) {
      my $type = $1;
      my ($method,$source) = split ':',$type;
      $obj{$type} = Bio::DB::GFF::Typename->new($method,$source);
      $result{$type}++;

      if ($want_count) {              # visit every duplicate so the counts are right
        $status = $db->seq($key,$value,R_NEXT);
      } else { # skip to next key set
        $key .= "\0";                 # smallest key greater than all duplicates of $key
        $status = $db->seq($key,$value,R_CURSOR)
      }
    }
  }

  else { # range search
    for my $feature (@{$self->_get_features_by_search_options(
                          {rangetype => 'overlaps',
                           refseq    => $srcseq,
                           refclass  => ($class || undef),
                           start     => ($start || undef),
                           stop      => ($stop  || undef),
                           types     => $typelist,
                          },
                          {}
                         )}) {
      my $type = Bio::DB::GFF::Typename->new($feature->{method},$feature->{source});
      $result{$type}++;
      $obj{$type} = $type;
    }
  }

  return $want_count ? %result : values %obj;
}

# Low level implementation of fetching a named feature.
# GFF annotations are named using the group class and name fields.
# May return zero, one, or several Bio::DB::GFF::Feature objects.

=head2 _feature_by_name

 Title   : _feature_by_name
 Usage   : $db->get_features_by_name($class,$name,$callback)
 Function: get a list of features by name and class
 Returns : count of number of features retrieved
 Args    : name of feature, class of feature, and a callback

This method is used internally. The callback arguments are those used
by make_feature().

=cut

sub _feature_by_name {
  my $self = shift;
  my ($class,$name,$location,$callback) = @_;
  $callback || $self->throw('must provide a callback argument');

  #use Devel::StackTrace;
  #warn Devel::StackTrace->new->as_string;

  my @features;
  my ($use_regexp, $use_glob,$using_alias_search);

  if ($name =~ /[*?]/) {  # uh oh regexp time

    #If there is only one trailing *, do a range search
    if ($name =~ /^([^\*]+)\*$/) {
      $name = $1;
      $use_glob++;
    }
    else {
      $name = quotemeta($name);
      $name =~ s/\\\*/.*/g;
      $name =~ s/\\\?/.?/g;
      $use_regexp++;
    }
  }

  if ($use_glob) {
    my $callback = sub {my $feat = shift; $feat->{gname} =~ /^$name/i};
    @features = @{$self->retrieve_features_range (-table    => 'name',
                                                  -start    => "$class:$name",
                                                  -do_while => $callback)};
  }
  elsif ($use_regexp) {
    my $filter = sub {my $feat = shift; $feat->{gname} =~ /$name/i};
    @features = @{$self->filter_features(-table=>'name', -filter => $filter)};
  }
  else {
    @features = @{$self->retrieve_features(-table=>'name', -key => "$class:$name")};
  }

  unless (@features) {  # nothing matched exactly, so try aliases
    $using_alias_search++;
    @features = @{$self->retrieve_features(-table=>'attr', -key=>"Alias:$name")};
  }

  foreach my $feature (@features){
    next unless $using_alias_search || $feature->{gclass} eq $class;

    if (defined $location) {
      next if $location->[0] ne $feature->{ref};
      next if $location->[1] && $location->[1] > $feature->{stop};
      next if $location->[2] && $location->[2] < $feature->{start};
    }
    $callback->(@{$feature}{@hash2array_map},0);
  }
}

#sub get_feature_by_attribute{
sub _feature_by_attribute{
  my $self = shift;
  my ($attributes,$callback) = @_;
  $callback || $self->throw('must provide a callback argument');

  my $feature_group_id = undef;

  #there could be more than one set of attributes......
  while (my ($key, $value) = each %$attributes) {

    my @features = @{$self->retrieve_features
                       (-table => "attr", -key => "$key:$value")};

    for my $feature (@features) {
      $callback->(@{$feature}{@hash2array_map},$feature_group_id);
    }
  }
}

sub search_notes {
  my $self = shift;
  my ($search_string,$limit) = @_;

  $search_string =~ tr/*?//d;

  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my (%found,$found);
  my $note_index = $self->{notes};
  seek($note_index,0,0);  # back to start

  while (<$note_index>) {
    next unless /$search/;
    my ($note,$uu) = split "\t";
    $found{unpack("u*",$uu)}++;
    last if $limit && ++$found >= $limit;
  }

  my (@features, @matches);
  for my $idx (keys %found) {
    my $feature = $self->{data}->get($idx) or next;

    my @attributes = @{$feature->{attributes}};
    my @values     = map {lc $_->[0] eq 'note' ? $_->[1] : ()} @attributes;
    my $value      = "@values";

    my $hits;
    $hits++ while $value =~ /($search)/ig;  # count the number of times we were hit
    push @matches,$hits;
    push @features,$feature;
  }

  my @results;
  for (my $i=0; $i<@matches; $i++)  {
    my $feature = $features[$i];
    my $matches = $matches[$i];

    my $relevance = 10 * $matches;
    my $featname  = Bio::DB::GFF::Featname->new($feature->{gclass}=>$feature->{gname});
    my $type      = Bio::DB::GFF::Typename->new($feature->{method}=>$feature->{source});
    my $note      = join ' ',map {$_->[1]} grep {$_->[0] eq 'Note'} @{$feature->{attributes}};
    push @results,[$featname,$note,$relevance,$type];
  }

  return @results;
}

sub _get_features_by_search_options {

  #The $data argument is not used and is preserved for superclass compatibility
  my ($self, $search,$options) = @_;

  my ($rangetype,$refseq,$class,$start,$stop,$types,$sparse,$order_by_group,$attributes,$temp_file) =
    (@{$search}{qw(rangetype refseq refclass start stop types)},
     @{$options}{qw(sparse sort_by_group ATTRIBUTES temp_file)}) ;

  $start = 0       unless defined($start);
  $stop  = MAX_BIN unless defined($stop);

  my $bin = bin($start,$stop,MIN_BIN);
  $bin    = $self->normalizeNumber($bin);

  my ($results,@features,%found,%results_table);
  if ($temp_file) {
    local $DB_BTREE->{flags} = R_DUP;
    # note: there is a race condition possible here, if someone reuses the
    # same name between the time we get the tmpfile name and the time we
    # ask DB_File to open it.
    tie(%results_table,'DB_File',$temp_file,O_RDWR|O_CREAT,0666,$DB_BTREE)
      or $self->throw("Couldn't tie temporary file ".$temp_file." for writing: $!");
    $results = \%results_table;
  } else {
    $results = \@features;
  }
  my $filter = sub {
    my $feature = shift;

    my $ref           = $feature->{ref};
    my $feature_start = $feature->{start};
    my $feature_stop  = $feature->{stop};
    my $feature_id    = $feature->{feature_id};

    return 0 if $found{$feature_id}++;

    if (defined $refseq) {
      return 0 unless lc $refseq eq lc $ref;
      $start = 0           unless defined($start);
      $stop  = MAX_SEGMENT unless defined($stop);

      if ($rangetype eq 'overlaps') {
        return 0 unless $feature_stop >= $start && $feature_start <= $stop;
      } elsif ($rangetype eq 'contains') {
        return 0 unless $feature_start >= $start && $feature_stop <= $stop;
      } elsif ($rangetype eq 'contained_in') {
        return 0 unless $feature_start <= $start && $feature_stop >= $stop;
      } else {
        return 0 unless $feature_start == $start && $feature_stop == $stop;
      }
    }

    my $feature_source = $feature->{source};
    my $feature_method = $feature->{method};

    if (defined $types && @$types){
      return 0 unless $self->_matching_typelist($feature_method,$feature_source,$types);
    }

    my $feature_attributes = $feature->{attributes};
    if (defined $attributes){
      return 0 unless $self->_matching_attributes($feature_attributes,$attributes);
    }

    return 1;
  };
  if (defined $refseq && !$sparse) {
    my $tier = MAX_BIN;
    while ($tier >= MIN_BIN) {
      my ($tier_start,$tier_stop) = (bin_bot($tier,$start),bin_top($tier,$stop));
      # warn "Using $tier_start $tier_stop\n";
      if ($tier_start == $tier_stop) {
        $self->retrieve_features(-table  => "bin",
                                 -key    => "$refseq$;$tier_start",
                                 -filter => $filter,
                                 -result => $results);
      } else {
        my $callback = sub {my $feat = shift; $feat->{bin} <= $tier_stop};
        $self->retrieve_features_range(-table    => "bin",
                                       -start    => "$refseq$;$tier_start",
                                       -do_while => $callback,
                                       -filter   => $filter,
                                       -result   => $results);
      }
      $tier /= 10;
    }
  }

  elsif (defined $types && @$types) {
    for (@$types) {
      my $type = join ':',@$_;
      $self->retrieve_features_range(-table    => 'type',
                                     -start    => $type,
                                     -do_while => sub { my $f = shift;
                                                        lc($f->{method}) eq lc($_->[0])
                                                          &&
                                                        lc($f->{source}||$_->[1]||'') eq lc($_->[1]||'')
                                                      },
                                     -filter   => $filter,
                                     -result   => $results);
    }
  }
  elsif (defined $attributes) {
    my ($attribute_name,$attribute_value) = each %$attributes; # pick first one
    $self->retrieve_features(-table  => 'attr',
                             -key    => "${attribute_name}:${attribute_value}",
                             -filter => $filter,
                             -result => $results);
  }

  else {
    $self->filter_features(-filter => $filter,-result=>$results);
  }

  return $results;
}

sub retrieve_features {
  my $self = shift;
  my ($table, $key, $filter, $result) = rearrange(['TABLE','KEY','FILTER', 'RESULT'],@_);

  my @result;
  $result ||= \@result;

  my @ids  = $self->db->get_dup("__".lc($table)."__".lc($key));
  my $data = $self->{data};
  local $^W = 0;  # because _hash_to_array() will generate lots of uninit values

  foreach my $id (@ids) {
    my $feat = $data->get($id);
    my $filter_result = $filter ? $filter->($feat) : 1;
    next unless $filter_result;
    if (ref $result eq 'HASH') {
      $result->{"$feat->{gclass}:$feat->{gname}"} = join ($;,$self->_hash_to_array($feat));
    } else {
      push @$result, $feat;
    }
    last if $filter_result == -1;
  }
  return $result;
}

sub retrieve_features_range {
  my $self = shift;
  my ($table, $start, $do_while, $filter, $result) = rearrange(['TABLE','START','DO_WHILE', 'FILTER', 'RESULT'],@_);
  local $^W = 0;  # because _hash_to_array will generate lots of uninit warnings

  my @result;
  $result ||= \@result;
  my ($id, $key, $value);

  $key   = "__".$table."__".$start;
  my $db = $self->db;

  for (my $status = $db->seq($key,$value,R_CURSOR);
       $status == 0;
       $status = $db->seq($key,$value,R_NEXT)) {

    my $feat = $self->{data}->get($value);
    last unless $do_while->($feat,$key);

    my $filter_result = $filter ? $filter->($feat) : 1;
    next unless $filter_result;

    if (ref $result eq 'HASH') {
      $result->{"$feat->{gclass}:$feat->{gname}"} = join($;,$self->_hash_to_array($feat));
    } else {
      push @$result,$feat;
    }
    last if $filter_result == -1;
  }

  return $result;
}

sub filter_features {
  my $self = shift;
  my ($filter,$result) = rearrange(['FILTER','RESULT'],@_);

  my @result;
  $result ||= \@result;

  my $data = $self->{data};
  $data->reset;
  while (my $feat = $data->next) {

    my $filter_result = $filter ? $filter->($feat) : 1;
    next unless $filter_result;

    if (ref($result) eq 'HASH') {
      $result->{"$feat->{gclass}:$feat->{gname}"} = join($;,$self->_hash_to_array($feat));
    } else {
      push @$result,$feat;
    }
    last if $filter_result == -1;
  }

  return $result;
}

sub _basic_features_by_id{
  my $self  = shift;
  my ($ids) = @_;

  $ids = [$ids] unless ref($ids) eq 'ARRAY';

  my @result;
  my $data = $self->{data};
  for my $feature_id (@$ids){
    push @result, $data->get($feature_id);
  }

  return wantarray() ? @result : $result[0];
}

sub normalizeNumber {
  my ($self, $num) = @_;
  while ((length $num) < MAX_NUM_LENGTH) {
    $num = "0" . $num;  # zero-pad so that lexicographic order matches numeric order
  }
  return $num;
}

sub get_features_iterator {
  my $self = shift;

  my ($search,$options,$callback) = @_;
  $callback || $self->throw('must provide a callback argument');
  $options->{temp_file} = $self->_temp_file;

  my $results = $self->_get_features_by_search_options($search,$options);
  return Bio::DB::GFF::Adaptor::berkeleydb::iterator->new($results,$callback,$options->{temp_file});
}

#--------------------------------------------------------------------------#

package FeatureStore;

# This is a very specialized package that stores serialized features onto a file-based
# array. The array is indexed by the physical offset to the beginning of each serialized
# feature.
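# (For illustration, inferred from put() and get() below: each record is a 16-bit
# big-endian length followed by the serialized feature, so a record at byte offset
# $off could be read back with something like
#    seek($fh, $off, SEEK_SET);
#    read($fh, my $len, 2);
#    read($fh, my $packed, unpack("n", $len));
#    my $feature = string2feature($packed); )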
use Fcntl qw(SEEK_SET SEEK_END);
use base 'Bio::Root::Root';
use Bio::DB::GFF::Adaptor::memory::feature_serializer; # qw(feature2string string2feature @hash2array_map);

sub new {
  my $class  = shift;
  my $dbname = shift or $class->throw("must provide a filepath argument");
  my ($write,$create) = @_;

  my $mode = $create ? "+>"
           : $write  ? "+<"
           :           "<";

  open my $F, $mode, $dbname or $class->throw("Could not open file '$dbname': $!");
  my $self = bless { fh => $F, next_idx => 0 }, $class;
  return $self;
}

sub put {
  my $self    = shift;
  my $feature = shift;
  my $fh = $self->{fh};
  seek($fh,0,SEEK_END);
  my $offset = tell($fh) || 0;

  $self->{last_id} = $offset;

  my $id = pack("L",$offset);
  $feature->{feature_id} = $id;
  my $value = feature2string($feature);
  print $fh pack("n/a*",$value) or $self->throw("An error occurred while updating the data file: $!");

  return $id;
}

sub get {
  my $self = shift;
  my $idx  = shift;
  my $offset = unpack("L",$idx);
  my $fh = $self->{fh};

  my ($value,$length);

  seek($fh,$offset,SEEK_SET);
  return unless read($fh,$length,2);
  return unless read($fh,$value,unpack("n",$length));
  $self->{next_idx} = tell($fh);
  return if substr($value,0,1) eq "\0";
  return string2feature($value);
}

sub next {
  my $self = shift;
  my $fh   = $self->{fh};
  my $result;
  do {
    $result = $self->get(pack("L",$self->{next_idx}));
  } until $result || eof($fh);
  $self->{next_idx} = 0 unless $result;
  return $result;
}

sub remove {
  my $self = shift;
  my $id   = shift;
  my $offset = unpack("L",$id);
  my $fh = $self->{fh};
  my ($value,$length);
  seek($fh,$offset,SEEK_SET);
  return unless read($fh,$length,2);
  print $fh "\0" x unpack("n",$length);  # null it out
}

sub _seek {
  my $self = shift;
  my $idx  = shift;
  my $offset = unpack("L",$idx);
  seek($self->{fh},$offset,SEEK_SET);
  $self->{next_idx} = tell($self->{fh});
}

sub reset {
  my $self = shift;
  $self->_seek(pack("L",0));
}

sub _feature2string {
  my $feature = shift;
  my @a = @{$feature}{@hash2array_map};
  push @a,map {@$_} @{$feature->{attributes}} if $feature->{attributes};
  return join $;,@a;
}

sub _string2feature {
  my $string = shift;
  my (%feature,@attributes);

  (@feature{@hash2array_map},@attributes) = split $;,$string;
  while (@attributes) {
    my ($key,$value) = splice(@attributes,0,2);
    push @{$feature{attributes}},[$key,$value];
  }
  $feature{group_id} = undef;
  return \%feature;
}

1;