package Bio::DB::SeqFeature::Store::berkeleydb;

use strict;

use base 'Bio::DB::SeqFeature::Store';
use Bio::DB::GFF::Util::Rearrange 'rearrange';
# DB_File, IO::File, File::Spec and fileparse() are used throughout the code
# below, so pull them in here along with the other prerequisites.
use DB_File;
use IO::File;
use File::Spec;
use File::Basename 'fileparse';
use Fcntl qw(O_RDWR O_CREAT O_RDONLY :flock);
use File::Temp 'tempdir';
use File::Path 'rmtree','mkpath';
use Carp 'carp','croak';

use constant BINSIZE => 10_000;
use constant MININT  => -999_999_999_999;
use constant MAXINT  => 999_999_999_999;
=head1 NAME

Bio::DB::SeqFeature::Store::berkeleydb -- Storage and retrieval of sequence annotation data in Berkeleydb files
=head1 SYNOPSIS

  use Bio::DB::SeqFeature::Store;

  # Create a database from the feature files located in /home/fly4.3 and store
  # the database index in the same directory:
  my $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                            -dir     => '/home/fly4.3');

  # Create a database that will monitor the files in /home/fly4.3, but store
  # the indexes in /var/databases/fly4.3
  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dir     => '/home/fly4.3',
                                         -dsn     => '/var/databases/fly4.3');

  # Create a feature database from scratch
  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -create  => 1);

  # get a feature from somewhere
  my $feature = Bio::SeqFeature::Generic->new(...);

  # store it
  $db->store($feature) or die "Couldn't store!";

  # the feature's primary ID is changed to reflect its ID in the database
  my $id = $feature->primary_id;

  # get the feature back out
  my $f = $db->fetch($id);

  # change the feature and update it
  $db->update($f) or die "Couldn't update!";

  # use the GFF3 loader to do a bulk write:
  my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store => $db);
  $loader->load('/home/fly4.3/dmel-all.gff');

  # searching...
  # ...by id
  my @features = $db->fetch_many(@list_of_ids);

  # ...by name
  @features = $db->get_features_by_name('ZK909');

  # ...by alias
  @features = $db->get_features_by_alias('sma-3');

  # ...by type
  @features = $db->get_features_by_type('gene');

  # ...by location
  @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);

  # ...by attribute
  @features = $db->get_features_by_attribute({description => 'protein kinase'});

  # ...by the GFF "Note" field
  @result_list = $db->search_notes('kinase');

  # ...by arbitrary combinations of selectors
  @features = $db->features(-name       => $name,
                            -attributes => $attributes);

  # ...using an iterator
  my $iterator = $db->get_seq_stream(-name       => $name,
                                     -attributes => $attributes);

  while (my $feature = $iterator->next_seq) {
    # do something with the feature
  }

  # ...limiting the search to a particular region
  my $segment  = $db->segment('Chr1',5000=>6000);
  my @features = $segment->features(-type=>['mRNA','match']);

  # what feature types are defined in the database?
  my @types    = $db->types;

  # getting & storing sequence information
  # Warning: this returns a string, and not a PrimarySeq object
  $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
  my $sequence = $db->fetch_sequence('Chr1',5000=>6000);

  # create a new feature in the database
  my $feature = $db->new_feature(-primary_tag => 'mRNA',
                                 ... );
=head1 DESCRIPTION

Bio::DB::SeqFeature::Store::berkeleydb is the Berkeleydb adaptor for
Bio::DB::SeqFeature::Store. You will not create it directly, but
instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.

See L<Bio::DB::SeqFeature::Store> for complete usage instructions.

=head2 Using the berkeleydb adaptor

The Berkeley database consists of a series of Berkeleydb index files,
and a couple of special-purpose indexes. You can create the index
files from scratch by creating a new database and calling
new_feature() repeatedly, you can create the database and then bulk
populate it using the GFF3 loader, or you can monitor a directory of
preexisting GFF3 and FASTA files and rebuild the indexes whenever one
or more of the files changes. The last mode is probably the most
convenient. Note that the indexer will only pay attention to files
that end with .gff3, .wig and .fa.
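A minimal sketch of the first approach (an empty database populated by
repeated new_feature() calls) might look like the following; the
@parsed_rows array, its keys and the paths are placeholders, not part of
this module:

  my $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                            -dsn     => '/var/databases/demo',
                                            -create  => 1 );
  for my $row (@parsed_rows) {          # @parsed_rows produced elsewhere
      $db->new_feature( -primary_tag => $row->{type},
                        -seq_id      => $row->{seq_id},
                        -start       => $row->{start},
                        -end         => $row->{end} );
  }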
=item The new() constructor

The new() constructor method accepts all the arguments recognized by
Bio::DB::SeqFeature::Store, plus a few additional ones.

Standard arguments:

 -adaptor           The name of the Adaptor class (default DBI::mysql)

 -serializer        The name of the serializer class (default Storable)

 -index_subfeatures Whether or not to make subfeatures searchable

 -cache             Activate LRU caching feature -- size of cache

 -compress          Compresses features before storing them in database

Adaptor-specific arguments:

 -dsn               Where the index files are stored

 -dir               Where the source (GFF3, FASTA) files are stored

 -autoindex         An alias for -dir.

 -write             Pass true to open the index files for writing.

 -create            Pass true to create the index files if they don't exist

 -locking           Use advisory locking to avoid one process trying to read
                    from the database while another is updating it (may not
                    work properly over NFS).

 -temp              Pass true to create temporary index files that will
                    be deleted once the script exits.

 -verbose           Pass true to report autoindexing operations on STDERR.
To create an empty database which will be populated using calls to
store() or new_feature(), or which will be bulk-loaded using the GFF3
loader:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -create  => 1);

To open a preexisting database in read-only mode:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3');

To open a preexisting database in read/write (update) mode:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -write   => 1);

To monitor a set of GFF3 and FASTA files located in a directory and
create/update the database indexes as needed (the indexes will be
stored in a new subdirectory named "indexes"):

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dir     => '/var/databases/fly4.3');

As above, but store the source files and index files in separate directories:

  $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                         -dsn     => '/var/databases/fly4.3',
                                         -dir     => '/home/gff3_files/fly4.3');

To be indexed, files must end with one of .gff3 (GFF3 format), .fa
(FASTA format) or .wig (WIG format).
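For example, when only B<-dir> is given, a monitored source directory and
the index subdirectory that the adaptor creates inside it might look like
this (the file names are illustrative):

  /home/gff3_files/fly4.3/
      dmel-all.gff3           # feature annotations
      dmel-chromosomes.fa     # chromosome sequences
      indexes/                # created and maintained by the adaptor
          features.bdb
          names.idx
          ...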
B<-autoindex> is an alias for B<-dir>.

You should specify B<-locking> in a multiuser environment, including
the case in which the database is being used by a web server at the
same time another user might be updating it.
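For instance, both the long-running reader and the updating process might
open the database with advisory locking enabled (a sketch; the paths are
illustrative):

  # reading process, e.g. inside a web application
  my $db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                            -dsn     => '/var/databases/fly4.3',
                                            -locking => 1 );

  # updating process
  my $update_db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                                   -dsn     => '/var/databases/fly4.3',
                                                   -write   => 1,
                                                   -locking => 1 );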
See L<Bio::DB::SeqFeature::Store> for all the access methods supported
by this adaptor. The various methods for storing and updating features
and sequences in the database are supported, but there is no
locking. If two processes try to update the same database
simultaneously, the database will likely become corrupted.

=cut
# object initialization
  ) = rearrange([['DSN','DB'],
                 ['TMP','TEMP','TEMPORARY'],
                 [qw(WRITE WRITABLE)],

  $verbose = 1 unless defined $verbose;

  -d $autoindex or $self->throw("Invalid directory $autoindex");
  $directory ||= "$autoindex/indexes";
  $directory ||= $is_temporary ? File::Spec->tmpdir : '.';

  my $pacname = __PACKAGE__;
  if ($^O =~ /mswin/i) {
    $pacname =~ s/:+/_/g;
  }
  $directory = tempdir($pacname.'_XXXXXX',
                       DIR => $directory) if $is_temporary;

  -d $directory or $self->throw("Invalid directory $directory");

  $create++ if $is_temporary;

  $self->throw("Can't write into the directory $directory")
    if $write && !-w $directory;
  $self->default_settings;
  $self->directory($directory);
  $self->temporary($is_temporary);
  $self->verbose($verbose);
  $self->locking($locking);
  $self->_delete_databases() if $create;
  if ($autoindex && -d $autoindex) {
    $self->auto_reindex($autoindex);
  }
  $self->lock('shared');

  # this step may rebless $self into a subclass
  # to preserve backward compatibility with older
  # databases while providing better performance for
  # newer ones
  $self->possibly_rebless($create);

  $self->_open_databases($write,$create,$autoindex);
  $self->_permissions($write,$create);
sub version { return 2.0 };
sub possibly_rebless {
  my $self   = shift;
  my $create = shift;
  my ($do_rebless,%h);
  if ($create) {   # a newly created database gets the newest format
    $do_rebless = 1;
  } else {         # probe database
    tie (%h,'DB_File',$self->_features_path,O_RDONLY,0666,$DB_HASH) or return;
    $do_rebless = $h{'.version'} >= 3.0;
  }
  return unless $do_rebless;
  eval "require Bio::DB::SeqFeature::Store::berkeleydb3";
  bless $self,'Bio::DB::SeqFeature::Store::berkeleydb3';
}
sub can_store_parentage { 1 }
sub auto_reindex {
  my $self    = shift;
  my $autodir = shift;
  my $result  = $self->needs_auto_reindexing($autodir);

  if ($result && %$result) {
    $self->flag_autoindexing(1);
    $self->lock('exclusive');
    $self->reindex_wigfiles($result->{wig},$autodir) if $result->{wig};
    $self->reindex_ffffiles($result->{fff},$autodir) if $result->{fff};
    $self->reindex_gfffiles($result->{gff},$autodir) if $result->{gff};
    $self->dna_db(Bio::DB::Fasta::Subdir->new($autodir));
    $self->flag_autoindexing(0);
  } else {
    $self->dna_db(Bio::DB::Fasta::Subdir->new($autodir));
  }
}
sub autoindex_flagfile {
  return File::Spec->catfile(shift->directory,'autoindex.pid');
}
sub auto_index_in_process {
  my $self      = shift;
  my $flag_file = $self->autoindex_flagfile;
  return unless -e $flag_file;

  # if flagfile exists, then check that PID still exists
  open my $fh, '<', $flag_file or $self->throw("Could not read file '$flag_file': $!");
  my $pid = <$fh>;
  return 1 if kill 0=>$pid;

  warn "Autoindexing seems to be running in another process, but the process has gone away. Trying to override...";
  if (unlink $flag_file) {
    warn "Successfully removed stale PID file." if $self->verbose;
    warn "Assuming partial reindexing process. Rebuilding indexes from scratch..." if $self->verbose;
    my $glob = File::Spec->catfile($self->directory,'*');
  } else {
    croak ("Cannot recover from apparent aborted autoindex process. Please remove files in ",
           $self->directory,
           " and allow the adaptor to reindex");
  }
}
sub flag_autoindexing {
  my $self = shift;
  my $flag_file = $self->autoindex_flagfile;
  open my $fh, '>', $flag_file or $self->throw("Could not write file '$flag_file': $!");
}
sub reindex_gfffiles {
  my $self  = shift;
  my $files = shift;

  warn "Reindexing GFF files...\n" if $self->verbose;
  my $exists = -e $self->_features_path;

  $self->_permissions(1,1);
  $self->_close_databases();
  $self->_open_databases(1,!$exists);
  require Bio::DB::SeqFeature::Store::GFF3Loader
    unless Bio::DB::SeqFeature::Store::GFF3Loader->can('new');
  my $loader = Bio::DB::SeqFeature::Store::GFF3Loader->new(-store    => $self,
                                                           -sf_class => $self->seqfeature_class,
                                                           -verbose  => $self->verbose)
    or $self->throw("Couldn't create GFF3Loader");
  my %seen;
  $loader->load(grep {!$seen{$_}++} @$files);
  $self->_touch_timestamp;
}
sub reindex_ffffiles {
  my $self  = shift;
  my $files = shift;

  warn "Reindexing FFF files...\n" if $self->verbose;
  $self->_permissions(1,1);
  $self->_close_databases();
  $self->_open_databases(1,1);
  require Bio::DB::SeqFeature::Store::FeatureFileLoader
    unless Bio::DB::SeqFeature::Store::FeatureFileLoader->can('new');
  my $loader = Bio::DB::SeqFeature::Store::FeatureFileLoader->new(-store    => $self,
                                                                  -sf_class => $self->seqfeature_class,
                                                                  -verbose  => $self->verbose)
    or $self->throw("Couldn't create FeatureFileLoader");
  my %seen;
  $loader->load(grep {!$seen{$_}++} @$files);
  $self->_touch_timestamp;
}
sub reindex_wigfiles {
  my $self = shift;
  my ($files,$autodir) = @_;

  warn "Reindexing wig files...\n" if $self->verbose;

  unless (Bio::Graphics::Wiggle::Loader->can('new')) {
    eval "require Bio::Graphics::Wiggle::Loader; 1"
      or return;
  }

  for my $wig (@$files) {
    warn "Reindexing $wig...\n" if $self->verbose;

    my ($wib_name) = fileparse($wig,qr/\.[^.]*/);
    my $gff3_name  = "$wib_name.gff3";

    # unlink all wib files that share the basename
    my $wib_glob = File::Spec->catfile($self->directory,"$wib_name*.wib");
    unlink glob($wib_glob);

    my $loader = Bio::Graphics::Wiggle::Loader->new($self->directory,$wib_name);
    my $fh     = IO::File->new($wig) or die "Can't open $wig: $!";
    $loader->load($fh); # will create one or more .wib files

    my $gff3_data = $loader->featurefile('gff3','microarray_oligo',$wib_name);
    my $gff3_path = File::Spec->catfile($autodir,$gff3_name);
    $fh = IO::File->new($gff3_path,'>')
      or die "Can't open $gff3_path for writing: $!";
    $fh->print($gff3_data);

    my $conf_path = File::Spec->catfile($autodir,"$wib_name.conf");
    $fh = IO::File->new($conf_path,'>');
    $fh->print($loader->conf_stanzas('microarray_oligo',$wib_name));
  }
}
# returns the following hashref
# empty hash if nothing needs reindexing
# {fasta => 1} if DNA database needs reindexing
# {gff => [list,of,gff,paths]} if gff3 files need reindexing
# {wig => [list,of,wig,paths]} if wig files need reindexing
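# (For example, a result of { gff => ['/home/fly4.3/dmel-all.gff3'], fasta => 1 }
# would cause auto_reindex() to rebuild the GFF3 indexes and the DNA database;
# the path shown is illustrative only.)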
sub needs_auto_reindexing {
  my $self    = shift;
  my $autodir = shift;
  my $result  = {};

  # don't allow two processes to reindex simultaneously
  $self->auto_index_in_process and croak "Autoindexing in process. Try again later";

  # first interrogate the GFF3 files, using the timestamp file
  # as modification comparison
  my (@gff3,@fff,@wig,$fasta,$fasta_index_time);
  opendir (my $D,$autodir)
    or $self->throw("Couldn't open directory $autodir for reading: $!");

  my $maxtime        = 0;
  my $timestamp_time = _mtime($self->_mtime_path) || 0;
  while (defined (my $node = readdir($D))) {
    next if $node =~ /^\./;
    my $path = File::Spec->catfile($autodir,$node);
    next unless -f $path;

    if ($path =~ /\.gff\d?$/i) {
      my $mtime = _mtime(\*_); # not a typo
      $maxtime  = $mtime if $mtime > $maxtime;
      push @gff3,$path;
    }

    elsif ($path =~ /\.fff?$/i) {
      my $mtime = _mtime(\*_); # not a typo
      $maxtime  = $mtime if $mtime > $maxtime;
      push @fff,$path;
    }

    elsif ($path =~ /\.wig$/i) {
      my $wig = $path;
      (my $gff_file = $wig) =~ s/\.wig$/\.gff3/i;
      next if -e $gff_file && _mtime($gff_file) > _mtime($path);
      push @wig,$wig;
      push @gff3,$gff_file;
    }

    elsif ($path =~ /\.(fa|fasta|dna)$/i) {
      $fasta_index_time =
        _mtime(File::Spec->catfile($self->directory,'fasta.index'))||0
          unless defined $fasta_index_time;
      $fasta++ if _mtime($path) > $fasta_index_time;
    }
  }

  $result->{gff}   = \@gff3 if $maxtime > $timestamp_time;
  $result->{wig}   = \@wig  if @wig;
  $result->{fff}   = \@fff  if @fff;
  $result->{fasta}++        if $fasta;

  return $result;
}
sub verbose {
  my $self = shift;
  my $d = $self->{verbose};
  $self->{verbose} = shift if @_;
  $d;
}

sub locking {
  my $self = shift;
  my $d = $self->{locking};
  $self->{locking} = shift if @_;
  $d;
}
sub lockfile {
  my $self = shift;
  return File::Spec->catfile($self->directory,'lock');
}
sub lock {
  my $self = shift;
  my $mode = shift;
  return unless $self->locking;

  my $flag     = $mode eq 'exclusive' ? LOCK_EX : LOCK_SH;
  my $lockfile = $self->lockfile;
  my $fh       = $self->_flock_fh;
  unless ($fh) {
    my $open = -e $lockfile ? '<' : '>';
    $fh      = IO::File->new($lockfile,$open) or die "Cannot open $lockfile: $!";
  }
  flock($fh,$flag);
  $self->_flock_fh($fh);
}
sub unlock {
  my $self = shift;
  return unless $self->locking;
  my $fh = $self->_flock_fh or return;
  flock($fh,LOCK_UN);
  undef $self->{flock_fh};
}
sub _flock_fh {
  my $self = shift;
  my $d = $self->{flock_fh};
  $self->{flock_fh} = shift if @_;
  $d;
}
sub _open_databases {
  my $self = shift;
  my ($write,$create,$ignore_errors) = @_;
  return if $self->db; # already open - don't reopen

  my $directory = $self->directory;
  unless (-d $directory) { # directory does not exist
    $create or $self->throw("Directory $directory does not exist and you did not specify the -create flag");
    mkpath($directory) or $self->throw("Couldn't create database directory $directory: $!");
  }

  my $flags = O_RDONLY;
  $flags   |= O_RDWR  if $write;
  $flags   |= O_CREAT if $create;

  # Create the main database; this is a DB_HASH implementation
  my %h;
  my $result = tie (%h,'DB_File',$self->_features_path,$flags,0666,$DB_HASH);

  unless ($result) {
    return if $ignore_errors; # autoindex set, so defer this
    $self->throw("Couldn't tie: ".$self->_features_path . " $!");
  }

  $h{'.version'} = $self->version;
  $self->db(\%h);

  $self->open_index_dbs($flags,$create);
  $self->open_parentage_db($flags,$create);
  $self->open_notes_db($write,$create);
  $self->open_seq_db($flags,$create) if -e $self->_fasta_path;
}
sub open_index_dbs {
  my $self = shift;
  my ($flags,$create) = @_;

  # Create the index databases; these are DB_BTREE implementations with duplicates allowed.
  $DB_BTREE->{flags}   = R_DUP;
  $DB_BTREE->{compare} = sub { lc($_[0]) cmp lc($_[1]) };
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify("$idx.idx");
    my %db;
    my $result = tie(%db,'DB_File',$path,$flags,0666,$DB_BTREE);
    # for backward compatibility, allow a failure when trying to open the is_indexed index.
    $self->throw("Couldn't tie $path: $!") unless $result || $idx eq 'is_indexed';
    $self->index_db($idx=>\%db);
  }
}
sub open_parentage_db {
  my $self = shift;
  my ($flags,$create) = @_;

  # Create the parentage database
  my %p;
  tie (%p,'DB_File',$self->_parentage_path,$flags,0666,$DB_BTREE)
    or $self->throw("Couldn't tie: ".$self->_parentage_path . $!);
  $self->parentage_db(\%p);
}
sub open_notes_db {
  my $self = shift;
  my ($write,$create) = @_;

  my $mode = $write ? "+>>"
                    : "<";   # read-only when not writing

  my $notes_file = $self->_notes_path;
  open my $F, $mode, $notes_file or $self->throw("Could not open file '$notes_file': $!");
  $self->notes_db($F);
}
sub open_seq_db {
  my $self = shift;

  if (-e $self->_fasta_path) {
    my $dna_db = Bio::DB::Fasta::Subdir->new($self->_fasta_path)
      or $self->throw("Can't reindex sequence file: $@");
    $self->dna_db($dna_db);
  }
}
sub commit { # reindex fasta files
  my $self = shift;
  if (my $fh = $self->{fasta_fh}) {
    $fh->close;
    $self->dna_db(Bio::DB::Fasta::Subdir->new($self->{fasta_file}));
  } elsif (-d $self->directory) {
    $self->dna_db(Bio::DB::Fasta::Subdir->new($self->directory));
  }
}
sub _close_databases {
  my $self = shift;
  $self->dna_db(undef);
  $self->notes_db(undef);
  $self->parentage_db(undef);
  $self->index_db($_=>undef) foreach $self->_index_files;
}
# do nothing -- new() with -create=>1 will do the trick
sub _init_database { }
sub _delete_databases {
  my $self = shift;
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify("$idx.idx");
    unlink $path;
  }
  unlink $self->_parentage_path;
  unlink $self->_fasta_path;
  unlink $self->_features_path;
  unlink $self->_mtime_path;
}
sub _touch_timestamp {
  my $self = shift;
  my $tsf = $self->_mtime_path;
  open my $F, '>', $tsf or $self->throw("Could not write file '$tsf': $!");
  print $F scalar(localtime);
  close $F;
}
  my $is_indexed = $self->index_db('is_indexed');

    my $primary_id = $obj->primary_id;
    $self->_delete_indexes($obj,$primary_id)  if $indexed && $primary_id;
    $primary_id    = $db->{'.next_id'}++      unless defined $primary_id;
    $db->{$primary_id} = $self->freeze($obj);
    $is_indexed->{$primary_id} = $indexed     if $is_indexed;
    $obj->primary_id($primary_id);
    $self->_update_indexes($obj)              if $indexed;
sub _delete_indexes {
  my $self = shift;
  my ($obj,$id) = @_;

  # the additional "1" causes the index to be deleted
  $self->_update_name_index($obj,$id,1);
  $self->_update_type_index($obj,$id,1);
  $self->_update_location_index($obj,$id,1);
  $self->_update_attribute_index($obj,$id,1);
  $self->_update_note_index($obj,$id,1);
}
  my $obj = $self->thaw($db->{$id},$id);
sub _add_SeqFeature {
  my $self     = shift;
  my $parent   = shift;
  my @children = @_;
  my $parent_id = (ref $parent ? $parent->primary_id : $parent)
    or $self->throw("$parent should have a primary_id");
  my $p = $self->parentage_db;
  for my $child (@children) {
    my $child_id = ref $child ? $child->primary_id : $child;
    defined $child_id or $self->throw("no primary ID known for $child");
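    # DB_File's find_dup() returns 0 when the key/value pair already exists in
    # the BTREE and non-zero otherwise, so the assignment below only records
    # parent/child links that are not already present.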
    $p->{$parent_id} = $child_id if tied(%$p)->find_dup($parent_id,$child_id);
  }
  return scalar @children;
}
sub _fetch_SeqFeatures {
  my $self   = shift;
  my $parent = shift;
  my @types  = @_;
  my $parent_id = $parent->primary_id or $self->throw("$parent should have a primary_id");
  my $index     = $self->parentage_db;
  my $db        = tied %$index;

  my @children_ids = $db->get_dup($parent_id);
  my @children     = map {$self->fetch($_)} @children_ids;
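  # Types may be given either as 'primary_tag' or as 'primary_tag:source_tag'
  # strings; the fragment below turns them into a case-insensitive regexp that
  # is matched against each child's "primary_tag:source_tag".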
    my ($a,$b) = split ':',$_,2;
    $_ .= ":".quotemeta($b).'$';

  my $regexp = join '|', @types;
  return grep {($_->primary_tag.':'.$_->source_tag) =~ /^($regexp)/i} @children;
}
sub _update_indexes {
  my $self = shift;
  my $obj  = shift;
  defined (my $id = $obj->primary_id) or return;
  $self->_update_name_index($obj,$id);
  $self->_update_type_index($obj,$id);
  $self->_update_location_index($obj,$id);
  $self->_update_attribute_index($obj,$id);
  $self->_update_note_index($obj,$id);
}
sub _update_name_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('names') or $self->throw("Couldn't find 'names' index file");
  my ($names,$aliases) = $self->feature_names($obj);

  # little stinky - needs minor refactoring
  foreach (@$names) {
    my $key = lc $_;
    $self->update_or_delete($delete,$db,$key,$id);
  }

  foreach (@$aliases) {
    my $key = lc($_)."_2"; # the _2 indicates a secondary (alias) ID
    $self->update_or_delete($delete,$db,$key,$id);
  }
}
sub _update_type_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('types')
    or $self->throw("Couldn't find 'types' index file");
  my $primary_tag = $obj->primary_tag;
  my $source_tag  = $obj->source_tag || '';
  return unless defined $primary_tag;

  $primary_tag .= ":$source_tag";
  my $key       = lc $primary_tag;
  $self->update_or_delete($delete,$db,$key,$id);
}
# Note: this indexing scheme is space-inefficient because it stores the
# denormalized sequence ID followed by the bin in XXXXXX zero-leading
# format. It should be replaced with a binary numeric encoding and the
# BTREE {compare} attribute changed accordingly.
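# (Illustration: with BINSIZE 10_000, a feature on Chr1 spanning 45,000..52,000
# falls into bins 4 and 5 and is therefore stored under the keys "chr1.000004"
# and "chr1.000005", each value packing the id/start/end/strand; the
# coordinates are illustrative only.)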
sub _update_location_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('locations')
    or $self->throw("Couldn't find 'locations' index file");

  my $seq_id  = $obj->seq_id || '';
  my $start   = $obj->start  || '';
  my $end     = $obj->end    || '';
  my $strand  = $obj->strand;
  my $bin_min = int $start/BINSIZE;
  my $bin_max = int $end/BINSIZE;

  for (my $bin = $bin_min; $bin <= $bin_max; $bin++ ) {
    my $key = sprintf("%s.%06d",lc($seq_id),$bin);
    $self->update_or_delete($delete,$db,$key,pack("i4",$id,$start,$end,$strand));
  }
}
sub _update_attribute_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('attributes')
    or $self->throw("Couldn't find 'attributes' index file");

  for my $tag ($obj->get_all_tags) {
    for my $value ($obj->get_tag_values($tag)) {
      my $key = "${tag}:${value}";
      $self->update_or_delete($delete,$db,$key,$id);
    }
  }
}
sub _update_note_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  return if $delete; # we don't know how to do this

  my $fh = $self->notes_db;
  my @notes = $obj->get_tag_values('Note') if $obj->has_tag('Note');
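  # Each note is appended to the flat notes file as one line consisting of the
  # note text, a tab, and the uuencoded primary ID; the notes search later
  # splits on the tab and recovers the ID with unpack("u*",...).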
  print $fh $_,"\t",pack("u*",$id) or $self->throw("An error occurred while updating note index: $!")
    foreach @notes;
}
sub update_or_delete {
  my $self = shift;
  my ($delete,$db,$key,$id) = @_;
  if ($delete) {
    tied(%$db)->del_dup($key,$id);
  } else {
    $db->{$key} = $id;
  }
}
# these methods return pointers to....
# the database that stores serialized objects
sub db {
  my $self = shift;
  my $d = $self->setting('db');
  $self->setting(db=>shift) if @_;
  $d;
}

sub parentage_db {
  my $self = shift;
  my $d = $self->setting('parentage_db');
  $self->setting(parentage_db=>shift) if @_;
  $d;
}

# the Bio::DB::Fasta object
sub dna_db {
  my $self = shift;
  my $d = $self->setting('dna_db');
  $self->setting(dna_db=>shift) if @_;
  $d;
}

# the specialized notes database
sub notes_db {
  my $self = shift;
  my $d = $self->setting('notes_db');
  $self->setting(notes_db=>shift) if @_;
  $d;
}

sub is_indexed_db {
  my $self = shift;
  my $d = $self->setting('is_indexed_db');
  $self->setting(is_indexed_db=>shift) if @_;
  $d;
}

# The indicated index berkeley db
sub index_db {
  my $self = shift;
  my $index_name = shift;
  my $d = $self->setting($index_name);
  $self->setting($index_name=>shift) if @_;
  $d;
}
sub _mtime {
  my $file = shift;
  my @stat = stat($file);
  return $stat[9];   # modification time
}

# return names of all the indexes
sub _index_files {
  return qw(names types locations attributes is_indexed);
}
# the directory in which we store our indexes
sub directory {
  my $self = shift;
  my $d = $self->setting('directory');
  $self->setting(directory=>shift) if @_;
  $d;
}

# flag indicating that we are a temporary database
sub temporary {
  my $self = shift;
  my $d = $self->setting('temporary');
  $self->setting(temporary=>shift) if @_;
  $d;
}

sub _permissions {
  my $self = shift;
  my $d = $self->setting('permissions') or return;
  if (@_) {
    my ($write,$create) = @_;
    $self->setting(permissions=>[$write,$create]);
  }
  return @$d;
}
# file name utilities...
sub _qualify {
  my $self = shift;
  my $file = shift;
  return $self->directory .'/' . $file;
}

sub _features_path {
  shift->_qualify('features.bdb');
}

sub _parentage_path {
  shift->_qualify('parentage.bdb');
}

sub _type_path {
  shift->_qualify('types.idx');
}

sub _location_path {
  shift->_qualify('locations.idx');
}

sub _attribute_path {
  shift->_qualify('attributes.idx');
}

sub _notes_path {
  shift->_qualify('notes.idx');
}

sub _fasta_path {
  shift->_qualify('sequence.fa');
}

sub _mtime_path {
  shift->_qualify('mtime.stamp');
}
###########################################
###########################################
sub _features {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,
      $name,$class,$allow_aliases,
      $types,
      $attributes,
      $range_type,
      $iterator,
     ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
                    'NAME','CLASS','ALIASES',
                    ['TYPES','TYPE','PRIMARY_TAG'],
                    ['ATTRIBUTES','ATTRIBUTE'],
                    'RANGE_TYPE',
                    'ITERATOR',
                   ],@_);

  my (@from,@where,@args,@group);
  $range_type ||= 'overlaps';

  my @result;
  unless (defined $name or defined $seq_id or defined $types or defined $attributes) {
    my $is_indexed = $self->index_db('is_indexed');
    @result = $is_indexed ? grep {$is_indexed->{$_}} keys %{$self->db}
                          : grep { !/^\./ } keys %{$self->db};
  }

  my %found;
  my $result = 1;

  if (defined($name)) {
    # hacky backward compatibility workaround
    undef $class if $class && $class eq 'Sequence';
    $name = "$class:$name" if defined $class && length $class > 0;
    $result &&= $self->filter_by_name($name,$allow_aliases,\%found);
  }

  if (defined $seq_id) {
    $result &&= $self->filter_by_location($seq_id,$start,$end,$strand,$range_type,\%found);
  }

  if (defined $types) {
    $result &&= $self->filter_by_type($types,\%found);
  }

  if (defined $attributes) {
    $result &&= $self->filter_by_attribute($attributes,\%found);
  }

  push @result,keys %found if $result;

  return $iterator ? Bio::DB::SeqFeature::Store::berkeleydb::Iterator->new($self,\@result)
                   : map {$self->fetch($_)} @result;
}
sub filter_by_name {
  my $self = shift;
  my ($name,$allow_aliases,$filter) = @_;

  my $index = $self->index_db('names');
  my $db    = tied(%$index);

  my ($stem,$regexp) = $self->glob_match($name);
  $stem   ||= $name;
  $regexp ||= $name;
  $regexp .= "(?:_2)?" if $allow_aliases;

  my $key = $stem;
  my ($value,@results);
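  # A DB_File BTREE scan: seq() with R_CURSOR positions the cursor at the first
  # key greater than or equal to $key (here the literal stem of the search
  # term), and R_NEXT walks forward while the keys still match the pattern.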
  for (my $status = $db->seq($key,$value,R_CURSOR);
       $status == 0 and $key =~ /^$regexp$/i;
       $status = $db->seq($key,$value,R_NEXT)) {
    next if %$filter && !$filter->{$value}; # don't bother
    push @results,$value;
  }
  $self->update_filter($filter,\@results);
}
sub filter_by_type {
  my $self = shift;
  my ($types,$filter) = @_;
  my @types = ref $types eq 'ARRAY' ? @$types : $types;

  my $index = $self->index_db('types');
  my $db    = tied(%$index);

  my @results;

  for my $type (@types) {
    my ($primary_tag,$source_tag);
    if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
      $primary_tag = $type->method;
      $source_tag  = $type->source;
    } else {
      ($primary_tag,$source_tag) = split ':',$type,2;
    }
    my $match = defined $source_tag ? "^$primary_tag:$source_tag\$" : "^$primary_tag:";
    my $key   = lc "$primary_tag:$source_tag";
    my $value;

    # If filter is already provided, then it is usually faster to
    # fetch each object.
    if (%$filter) {
      for my $id (keys %$filter) {
        my $obj = $self->_fetch($id) or next;
        push @results,$id if $obj->type =~ /$match/i;
      }
    } else {
      for (my $status = $db->seq($key,$value,R_CURSOR);
           $status == 0 && $key =~ /$match/i;
           $status = $db->seq($key,$value,R_NEXT)) {
        next if %$filter && !$filter->{$value}; # don't even bother
        push @results,$value;
      }
    }
  }

  $self->update_filter($filter,\@results);
}
sub filter_by_location {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,$range_type,$filter) = @_;

  my $index    = $self->index_db('locations');
  my $db       = tied(%$index);

  my $binstart = defined $start ? sprintf("%06d",int $start/BINSIZE) : '';
  my $binend   = defined $end   ? sprintf("%06d",int $end/BINSIZE)   : 'z'; # beyond a number

  my %seenit;
  my @results;

  $start = MININT  if !defined $start;
  $end   = MAXINT  if !defined $end;
  my $version_2 = $self->db_version > 1;

  if ($range_type eq 'overlaps' or $range_type eq 'contains') {
    my $key     = $version_2 ? "\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
    my $keystop = $version_2 ? "\L$seq_id\E.$binend"   : "\L$seq_id\E$binend";
    my $value;

    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key le $keystop;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      if ($range_type eq 'overlaps') {
        next unless $fend >= $start && $fstart <= $end;
      }
      elsif ($range_type eq 'contains') {
        next unless $fstart >= $start && $fend <= $end;
      }
      next if %$filter && !$filter->{$id}; # don't bother
      push @results,$id;
    }
  }

  # for contained in, we look for features originating and terminating outside the specified range
  # this is incredibly inefficient, but fortunately the query is rare (?)
  elsif ($range_type eq 'contained_in') {
    my $key     = $version_2 ? "\L$seq_id." : "\L$seq_id";
    my $keystop = $version_2 ? "\L$seq_id\E.$binstart" : "\L$seq_id\E$binstart";
    my $value;

    # do the left part of the range
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0 && $key le $keystop;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id}; # don't bother
      push @results,$id;
    }

    # do the right part of the range
    $key = "\L$seq_id\E.$binend";
    for (my $status = $db->seq($key,$value,R_CURSOR);
         $status == 0;
         $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id}; # don't bother
      push @results,$id;
    }
  }

  $self->update_filter($filter,\@results);
}
  my $index = $self->index_db('attributes');
  my %a     = map {s/:.+$//; $_=> 1} keys %$index;
sub filter_by_attribute {
  my $self = shift;
  my ($attributes,$filter) = @_;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);

  my $result;

  for my $att_name (keys %$attributes) {
    my @result;
    my @search_terms = ref($attributes->{$att_name}) && ref($attributes->{$att_name}) eq 'ARRAY'
                         ? @{$attributes->{$att_name}} : $attributes->{$att_name};

    for my $v (@search_terms) {
      my ($stem,$regexp) = $self->glob_match($v);
      $stem   ||= $v;
      $regexp ||= $v;
      my $key = "\L${att_name}:${stem}\E";
      my $value;
      for (my $status = $db->seq($key,$value,R_CURSOR);
           $status == 0 && $key =~ /^$att_name:$regexp$/i;
           $status = $db->seq($key,$value,R_NEXT)) {
        next if %$filter && !$filter->{$value}; # don't bother
        push @result,$value;
      }
    }
    $result ||= $self->update_filter($filter,\@result);
  }

  $result;
}
sub _search_attributes {
  my $self = shift;
  my ($search_string,$attribute_array,$limit) = @_;
  $search_string =~ tr/*?//d;
  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);

  my (%results,%notes);

  for my $tag (@$attribute_array) {
    my $id;
    my $key = "\L$tag:\E";
    for (my $status = $db->seq($key,$id,R_CURSOR);
         $status == 0 and $key =~ /^$tag:(.*)/i;
         $status = $db->seq($key,$id,R_NEXT)) {
      my $text = $1;
      next unless $text =~ /$search/;
      for my $w (@words) {
        my @hits = $text =~ /($w)/ig or next;
        $results{$id} += @hits;
      }
      $notes{$id} .= "$text ";
    }
  }

  my @results;
  for my $id (keys %results) {
    my $hits = $results{$id};
    my $note = $notes{$id};
    my $relevance = 10 * $hits;
    my $feature   = $self->fetch($id) or next;
    my $name      = $feature->display_name or next;
    my $type      = $feature->type;
    push @results,[$name,$note,$relevance,$type,$id];
  }

  return @results;
}
sub _search_notes {
  my $self = shift;
  my ($search_string,$limit) = @_;

  $search_string =~ tr/*?//d;

  my @words  = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|',@words;

  my (%found,$found);
  my $note_index = $self->notes_db;
  seek($note_index,0,0); # back to start
  while (<$note_index>) {
    next unless /$search/;
    my ($note,$uu) = split "\t";
    $found{unpack("u*",$uu)}++;
    last if $limit && ++$found >= $limit;
  }

  my (@features, @matches);
  for my $idx (keys %found) {
    my $feature = $self->fetch($idx) or next;
    my @values  = $feature->get_tag_values('Note') if $feature->has_tag('Note');
    my $value   = "@values";

    my $hits;
    $hits++ while $value =~ /($search)/ig; # count the number of times we were hit
    push @matches,$hits;
    push @features,$feature;
  }

  my @results;
  for (my $i=0; $i<@matches; $i++) {
    my $feature = $features[$i];
    my $matches = $matches[$i];

    my $relevance = 10 * $matches;
    my $note;
    $note = join ' ',$feature->get_tag_values('Note') if $feature->has_tag('Note');
    push @results,[$feature->display_name,$note,$relevance];
  }
  @results;
}
# convert a glob expression into a (stem, regexp) pair, e.g. 'abc*' => ('abc', 'abc.*');
# returns an empty list when the term contains no wildcard
sub glob_match {
  my $self = shift;
  my $term = shift;
  return unless $term =~ /([^*?]*)(?:^|[^\\])?[*?]/;
  my $stem = $1;
  $term =~ s/(^|[^\\])([+\[\]^{}\$|\(\).])/$1\\$2/g;
  $term =~ s/(^|[^\\])\*/$1.*/g;
  $term =~ s/(^|[^\\])\?/$1./g;
  return ($stem,$term);
}
sub update_filter {
  my $self = shift;
  my ($filter,$results) = @_;
  return unless @$results;

  if (%$filter) {
    my @filtered = grep {$filter->{$_}} @$results;
    %$filter     = map {$_=>1} @filtered;
  } else {
    %$filter     = map {$_=>1} @$results;
  }

  1;
}
sub types {
  my $self = shift;
  eval "require Bio::DB::GFF::Typename"
    unless Bio::DB::GFF::Typename->can('new');

  my $index = $self->index_db('types');
  my $db    = tied(%$index);
  return map {Bio::DB::GFF::Typename->new($_)} keys %$index;
}
sub _insert_sequence {
  my $self = shift;
  my ($seqid,$seq,$offset) = @_;
  my $dna_fh = $self->private_fasta_file or return;
  if ($offset == 0) { # start of the sequence
    print $dna_fh ">$seqid\n";
  }
  print $dna_fh $seq,"\n";
}
sub _fetch_sequence {
  my $self = shift;
  my ($seqid,$start,$end) = @_;
  my $db = $self->dna_db or return;
  return $db->seq($seqid,$start,$end);
}
sub private_fasta_file {
  my $self = shift;
  return $self->{fasta_fh} if exists $self->{fasta_fh};
  $self->{fasta_file} = $self->_qualify("sequence.fa");
  return $self->{fasta_fh} = IO::File->new($self->{fasta_file},">");
}
sub finish_bulk_update {
  my $self = shift;
  if (my $fh = $self->{fasta_fh}) {
    $fh->close;
    $self->{fasta_db} = Bio::DB::Fasta::Subdir->new($self->{fasta_file});
  }
}
  return $db->{'.version'} || 1.00;
  $self->_close_databases();
  $self->private_fasta_file->close;
  rmtree($self->directory,0,1) if $self->temporary && -e $self->directory;
# TIE interface -- a little annoying because we are storing magic ".variable"
# meta-variables in the same data structure as the IDs, so these variables
# must be skipped over when iterating.
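# (For example, the bookkeeping keys '.version' and '.next_id' live in the same
# DB_HASH as the numeric primary IDs, which is why the loops below skip any key
# beginning with a dot.)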
  while ( ($key,$value) = each %{$db}) {
    last unless $key =~ /^\./;
  }

  while ( ($key,$value) = each %$db) {
    last unless $key =~ /^\./;
  }

  return exists $self->db->{$id};

  my $obj = $self->fetch($id) or return;
  $self->_delete_indexes($obj,$id);
  delete $self->db->{$id};

  $self->_close_databases();
  $self->_delete_databases();
  my ($write,$create) = $self->_permissions;
  $self->_open_databases($write,$create);

  return scalar %{$self->db};
package Bio::DB::SeqFeature::Store::berkeleydb::Iterator;

sub new {
  my $class = shift;
  my $store = shift;
  my $ids   = shift;
  return bless {store => $store,
                ids   => $ids},ref($class) || $class;
}

sub next_seq {
  my $self  = shift;
  my $store = $self->{store} or return;
  my $id    = shift @{$self->{ids}};
  defined $id or return;
  return $store->fetch($id);
}
package Bio::DB::Fasta::Subdir;

use base 'Bio::DB::Fasta';

# alter calling arguments so that the index file is placed in a subdirectory
# named "indexes"
sub new {
  my ($class, $path, %opts) = @_;
  $opts{-index_name} = File::Spec->catfile($path,'indexes','fasta.index');
  return Bio::DB::Fasta->new($path, %opts);
}

sub _calculate_offsets {
  my ($self, @args) = @_;
  return $self->SUPER::_calculate_offsets(@args);
}
=head1 BUGS

This is an early version, so there are certainly some bugs. Please
use the BioPerl bug tracking system to report bugs.

=head1 SEE ALSO

L<Bio::DB::SeqFeature>,
L<Bio::DB::SeqFeature::Store>,
L<Bio::DB::SeqFeature::GFF3Loader>,
L<Bio::DB::SeqFeature::Segment>,
L<Bio::DB::SeqFeature::Store::memory>,
L<Bio::DB::SeqFeature::Store::DBI::mysql>,

=head1 AUTHOR

Lincoln Stein E<lt>lstein@cshl.orgE<gt>.

Copyright (c) 2006 Cold Spring Harbor Laboratory.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.