maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / DB / SeqFeature / Store / DBI / SQLite.pm
blob2d0d070cdab8b54aee0c225763639262c7a79cad
1 package Bio::DB::SeqFeature::Store::DBI::SQLite;
3 #$Id$
5 =head1 NAME
7 Bio::DB::SeqFeature::Store::DBI::SQLite -- SQLite implementation of Bio::DB::SeqFeature::Store
9 =head1 SYNOPSIS
11 use Bio::DB::SeqFeature::Store;
13 # Open the sequence database
14 my $db = Bio::DB::SeqFeature::Store->new(-adaptor => 'DBI::SQLite',
15 -dsn => '/path/to/database.db');
17 # get a feature from somewhere
18 my $feature = Bio::SeqFeature::Generic->new(...);
20 # store it
21 $db->store($feature) or die "Couldn't store!";
23 # primary ID of the feature is changed to indicate its primary ID
24 # in the database...
25 my $id = $feature->primary_id;
27 # get the feature back out
28 my $f = $db->fetch($id);
30 # change the feature and update it
31 $f->start(100);
32 $db->update($f) or die "Couldn't update!";
34 # searching...
35 # ...by id
36 my @features = $db->fetch_many(@list_of_ids);
38 # ...by name
39 @features = $db->get_features_by_name('ZK909');
41 # ...by alias
42 @features = $db->get_features_by_alias('sma-3');
44 # ...by type
45 @features = $db->get_features_by_name('gene');
47 # ...by location
48 @features = $db->get_features_by_location(-seq_id=>'Chr1',-start=>4000,-end=>600000);
50 # ...by attribute
51 @features = $db->get_features_by_attribute({description => 'protein kinase'})
53 # ...by the GFF "Note" field
54 @result_list = $db->search_notes('kinase');
56 # ...by arbitrary combinations of selectors
57 @features = $db->features(-name => $name,
58 -type => $types,
59 -seq_id => $seqid,
60 -start => $start,
61 -end => $end,
62 -attributes => $attributes);
64 # ...using an iterator
65 my $iterator = $db->get_seq_stream(-name => $name,
66 -type => $types,
67 -seq_id => $seqid,
68 -start => $start,
69 -end => $end,
70 -attributes => $attributes);
72 while (my $feature = $iterator->next_seq) {
73 # do something with the feature
76 # ...limiting the search to a particular region
77 my $segment = $db->segment('Chr1',5000=>6000);
78 my @features = $segment->features(-type=>['mRNA','match']);
80 # getting & storing sequence information
81 # Warning: this returns a string, and not a PrimarySeq object
82 $db->insert_sequence('Chr1','GATCCCCCGGGATTCCAAAA...');
83 my $sequence = $db->fetch_sequence('Chr1',5000=>6000);
85 # what feature types are defined in the database?
86 my @types = $db->types;
88 # create a new feature in the database
89 my $feature = $db->new_feature(-primary_tag => 'mRNA',
90 -seq_id => 'chr3',
91 -start => 10000,
92 -end => 11000);
94 =head1 DESCRIPTION
96 Bio::DB::SeqFeature::Store::SQLite is the SQLite adaptor for
97 Bio::DB::SeqFeature::Store. You will not create it directly, but
98 instead use Bio::DB::SeqFeature::Store-E<gt>new() to do so.
100 See L<Bio::DB::SeqFeature::Store> for complete usage instructions.
102 =head2 Using the SQLite adaptor
104 To establish a connection to the database, call
105 Bio::DB::SeqFeature::Store-E<gt>new(-adaptor=E<gt>'DBI::SQLite',@more_args). The
106 additional arguments are as follows:
108 Argument name Description
109 ------------- -----------
111 -dsn The path to the SQLite database file.
113 -namespace A prefix to attach to each table. This allows you
114 to have several virtual databases in the same
115 physical database.
117 -temp Boolean flag. If true, a temporary database
118 will be created and destroyed as soon as
119 the Store object goes out of scope. (synonym -temporary)
121 -autoindex Boolean flag. If true, features in the database will be
122 reindexed every time they change. This is the default.
124 -fts Boolean flag. If true, when the -create flag is true, the
125 attribute table will be created and indexed index for
126 full-text search using the most recent FTS extension
127 supported by DBD::SQLite.
129 -tmpdir Directory in which to place temporary files during "fast" loading.
130 Defaults to File::Spec->tmpdir(). (synonyms -dump_dir, -dumpdir, -tmp)
132 -dbi_options A hashref to pass to DBI->connect's 4th argument, the "attributes."
133 (synonyms -options, -dbi_attr)
135 -write Pass true to open database for writing or updating.
137 If successful, a new instance of
138 Bio::DB::SeqFeature::Store::DBI::SQLite will be returned.
140 In addition to the standard methods supported by all well-behaved
141 Bio::DB::SeqFeature::Store databases, several following
142 adaptor-specific methods are provided. These are described in the next
143 sections.
145 =cut
147 use strict;
149 use base 'Bio::DB::SeqFeature::Store::DBI::mysql';
150 use Bio::DB::SeqFeature::Store::DBI::Iterator;
151 use DBD::SQLite;
152 use DBI qw(:sql_types);
153 use Memoize;
154 use Cwd qw(abs_path getcwd);
155 use Bio::DB::GFF::Util::Rearrange 'rearrange';
156 use Bio::SeqFeature::Lite;
157 use File::Spec;
158 use constant DEBUG=>0;
159 use constant EXPERIMENTAL_COVERAGE=>1;
161 # Using same limits as MySQL adaptor so I don't have to make something up.
162 use constant MAX_INT => 2_147_483_647;
163 use constant MIN_INT => -2_147_483_648;
164 use constant SUMMARY_BIN_SIZE => 1000; # we checkpoint coverage this often, about 20 meg overhead per feature type on hg
165 use constant USE_SPATIAL=>0;
167 # The binning scheme places each feature into a bin.
168 # Bins are variably sized as powers of two. For example,
169 # there are 585 bins of size 2**17 (131072 bases)
170 my (@BINS,%BINS);
172 @BINS = map {2**$_} (17, 20, 23, 26, 29); # TO DO: experiment with different bin sizes
173 my $start=0;
174 for my $b (sort {$b<=>$a} @BINS) {
175 $BINS{$b} = $start;
176 $start += $BINS[-1]/$b;
181 # my %BINS = (
182 # 2**11 => 37449,
183 # 2**14 => 4681,
184 # 2**17 => 585,
185 # 2**20 => 73,
186 # 2**23 => 9,
187 # 2**26 => 1,
188 # 2**29 => 0
189 # );
190 # my @BINS = sort {$a<=>$b} keys %BINS;
192 sub calculate_bin {
193 my $self = shift;
194 my ($start,$end) = @_;
196 my $len = $end - $start;
197 for my $bin (@BINS) {
198 next if $len > $bin;
199 # possibly fits here
200 my $binstart = int $start/$bin;
201 my $binend = int $end/$bin;
202 return $binstart+$BINS{$bin} if $binstart == $binend;
205 die "unreasonable coordinates ",$start+1,"..$end";
208 sub search_bins {
209 my $self = shift;
210 my ($start,$end) = @_;
211 my @results;
213 for my $bin (@BINS) {
214 my $binstart = int $start/$bin;
215 my $binend = int $end/$bin;
216 push @results,$binstart+$BINS{$bin}..$binend+$BINS{$bin};
218 return @results;
223 # object initialization
225 sub init {
226 my $self = shift;
227 my ($dsn,
228 $is_temporary,
229 $autoindex,
230 $namespace,
231 $dump_dir,
232 $user,
233 $pass,
234 $dbi_options,
235 $writeable,
236 $fts,
237 $create,
238 ) = rearrange(['DSN',
239 ['TEMP','TEMPORARY'],
240 'AUTOINDEX',
241 'NAMESPACE',
242 ['DUMP_DIR','DUMPDIR','TMP','TMPDIR'],
243 'USER',
244 ['PASS','PASSWD','PASSWORD'],
245 ['OPTIONS','DBI_OPTIONS','DBI_ATTR'],
246 ['WRITE','WRITEABLE'],
247 'FTS',
248 'CREATE',
249 ],@_);
250 $dbi_options ||= {};
251 $writeable = 1 if $is_temporary or $dump_dir;
253 $dsn or $self->throw("Usage: ".__PACKAGE__."->init(-dsn => \$dbh || \$dsn)");
255 my $dbh;
256 if (ref $dsn) {
257 $dbh = $dsn;
258 } else {
259 $dsn = "dbi:SQLite:$dsn" unless $dsn =~ /^dbi:/;
260 $dbh = DBI->connect($dsn,$user,$pass,$dbi_options) or $self->throw($DBI::errstr);
261 $dbh->do("PRAGMA synchronous = OFF;"); # makes writes much faster
262 $dbh->do("PRAGMA temp_store = MEMORY;"); # less disk I/O; some speedup
263 $dbh->do("PRAGMA cache_size = 20000;"); # less disk I/O; some speedup
264 # Keep track of database file location
265 my $cwd = getcwd;
266 my ($db_file) = ($dsn =~ m/(?:db(?:name)?|database)=(.+)$/);
267 $self->{dbh_file} = "$cwd/$db_file";
269 $self->{dbh} = $dbh;
270 $self->{fts} = $fts;
271 $self->{is_temp} = $is_temporary;
272 $self->{namespace} = $namespace;
273 $self->{writeable} = $writeable;
275 $self->default_settings;
276 $self->autoindex($autoindex) if defined $autoindex;
277 $self->dumpdir($dump_dir) if $dump_dir;
278 if ($self->is_temp) {
279 $self->init_tmp_database();
280 } elsif ($create) {
281 $self->init_database('erase');
285 sub table_definitions {
286 my $self = shift;
287 my $defs =
289 feature => <<END,
291 id integer primary key autoincrement,
292 typeid integer not null,
293 strand integer default 0,
294 "indexed" integer default 1,
295 object blob not null
299 locationlist => <<END,
301 id integer primary key autoincrement,
302 seqname text not null
304 create index index_locationlist on locationlist (seqname);
307 typelist => <<END,
309 id integer primary key autoincrement,
310 tag text not null collate nocase
312 create index index_typelist on typelist (tag);
314 name => <<END,
316 id integer not null,
317 name text not null collate nocase,
318 display_name integer default 0
320 create index index_name_id on name(id);
321 create index index_name_name on name(name);
324 attribute => <<END,
326 id integer not null,
327 attribute_id integer not null,
328 attribute_value text collate nocase
330 create index index_attribute_id on attribute(attribute_id);
331 create index index_attribute_value on attribute(attribute_value);
334 attributelist => <<END,
336 id integer primary key autoincrement,
337 tag text not null
339 create index index_attributelist_tag on attributelist(tag);
341 parent2child => <<END,
343 id integer,
344 child integer,
345 primary key(id, child)
346 ) without rowid;
349 meta => <<END,
351 name text primary key,
352 value text not null
355 sequence => <<END,
357 id integer not null,
358 offset integer not null,
359 sequence blob,
360 primary key(id,offset)
365 if ($self->{'fts'}) {
366 delete($defs->{attribute});
369 unless ($self->_has_spatial_index) {
370 $defs->{feature_location} = <<END;
372 id int(10) primary key,
373 seqid int(10),
374 bin int,
375 start int,
376 end int
378 create index index_feature_location on feature_location(seqid,bin,start,end);
383 if (EXPERIMENTAL_COVERAGE) {
384 $defs->{interval_stats} = <<END;
386 typeid integer not null,
387 seqid integer not null,
388 bin integer not null,
389 cum_count integer not null,
390 primary key (typeid,seqid,bin)
391 ) without rowid;
394 return $defs;
397 sub _init_database {
398 my $self = shift;
400 # must do this first before calling table_definitions
401 $self->_create_spatial_index;
402 $self->_create_attribute_fts;
403 $self->SUPER::_init_database(@_);
406 # FIXME: ensure this works with _create_attribute_fts...
407 sub init_tmp_database {
408 my $self = shift;
409 my $erase = shift;
410 $self->_create_spatial_index;
411 $self->_create_attribute_fts;
412 $self->SUPER::init_tmp_database(@_);
415 sub _create_spatial_index{
416 my $self = shift;
417 my $dbh = $self->dbh;
418 local $dbh->{PrintError} = 0;
419 $dbh->do("DROP TABLE IF EXISTS feature_index"); # spatial index
420 if (USE_SPATIAL) {
421 $dbh->do("CREATE VIRTUAL TABLE feature_index USING RTREE(id,seqid,bin,start,end)");
425 sub _create_attribute_fts{
426 my $self = shift;
427 my $dbh = $self->dbh;
428 if ($self->{'fts'}) {
429 my @fts_versions;
430 for (@fts_versions = grep(/^ENABLE_FTS[0-9]+$/, DBD::SQLite::compile_options)) { s/ENABLE_// }
431 # use the latest supported FTS version.
432 # DBD::SQLite::compile_options appears to be sorted
433 # alphabetically, so this should work through version FTS9.
434 die 'fts not supported by this version of DBD::SQLite' if (!@fts_versions);
435 $dbh->do("DROP TABLE IF EXISTS attribute");
436 $dbh->do("CREATE VIRTUAL TABLE "
437 . $self->_attribute_table
438 . " USING " . $fts_versions[-1]
439 . "(id, attribute_id, attribute_value)");
444 # return 1 if an existing attribute table in the connected database is an FTS
445 # table, else 0
447 sub _has_fts {
448 my $self = shift;
449 if (!defined($self->{'has_fts'})) {
450 # If the attribute table is a virtual table, assume it is an FTS
451 # table. Per http://www.sqlite.org/fileformat2.html:
452 # For (sqlite_master) rows that define views, triggers, and virtual
453 # tables, the rootpage column is 0 or NULL.
454 ($self->{'has_fts'}) = $self->dbh->selectrow_array("select count(*) from sqlite_master where type = 'table' and name = '"
455 . $self->_attribute_table
456 . "' and (rootpage = 0 or rootpage is null);");
458 return $self->{'has_fts'};
461 sub _has_spatial_index {
462 my $self = shift;
463 return $self->{'_has_spatial_index'} if exists $self->{'_has_spatial_index'};
464 my $dbh = $self->dbh;
465 my ($count) = $dbh->selectrow_array("select count(*) from sqlite_master where name='feature_index'");
466 return $self->{'_has_spatial_index'} = $count;
469 sub _finish_bulk_update {
470 my $self = shift;
471 my $dbh = $self->dbh;
472 my $dir = $self->{dumpdir} || '.';
474 $self->begin_work; # making this a transaction greatly improves performance
476 for my $table ('feature', $self->index_tables) {
477 my $fh = $self->dump_filehandle($table);
478 my $path = $self->dump_path($table);
479 $fh->close;
481 open $fh, '<', $path or $self->throw("Could not read file '$path': $!");
482 my $qualified_table = $self->_qualify($table);
484 my $sth;
485 if ($table =~ /feature$/) {
486 $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?,?,?,?)");
488 while (<$fh>) {
489 chomp();
490 my ($id,$typeid,$strand,$indexed,$obj) = split(/\t/);
491 $sth->bind_param(1, $id);
492 $sth->bind_param(2, $typeid);
493 $sth->bind_param(3, $strand);
494 $sth->bind_param(4, $indexed);
495 $sth->bind_param(5, pack('H*',$obj), {TYPE => SQL_BLOB});
496 $sth->execute();
498 } else {
499 my $feature_index = $self->_feature_index_table;
500 if ($table =~ /parent2child$/) {
501 $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?)");
502 } elsif ($table =~ /$feature_index$/) {
503 $sth = $dbh->prepare(
504 $self->_has_spatial_index ?"REPLACE INTO $qualified_table VALUES (?,?,?,?,?)"
505 :"REPLACE INTO $qualified_table (id,seqid,bin,start,end) VALUES (?,?,?,?,?)"
507 } else { # attribute or name
508 $sth = $dbh->prepare("REPLACE INTO $qualified_table VALUES (?,?,?)");
511 while (<$fh>) {
512 chomp();
513 $sth->execute(split(/\t/));
516 $fh->close();
517 unlink $path;
519 $self->commit; # commit the transaction
520 delete $self->{bulk_update_in_progress};
521 delete $self->{filehandles};
524 sub index_tables {
525 my $self = shift;
526 my @t = $self->SUPER::index_tables;
527 return (@t,$self->_feature_index_table);
530 sub _enable_keys { } # nullop
531 sub _disable_keys { } # nullop
533 sub _fetch_indexed_features_sql {
534 my $self = shift;
535 my $location_table = $self->_qualify('feature_location');
536 my $feature_table = $self->_qualify('feature');
537 return <<END;
538 SELECT typeid,seqid,start-1,end
539 FROM $location_table as l,$feature_table as f
540 WHERE l.id=f.id AND f.\"indexed\"=1
541 ORDER BY typeid,seqid,start
546 # get primary sequence between start and end
548 sub _fetch_sequence {
549 my $self = shift;
550 my ($seqid,$start,$end) = @_;
552 # backward compatibility to the old days when I liked reverse complementing
553 # dna by specifying $start > $end
554 my $reversed;
555 if (defined $start && defined $end && $start > $end) {
556 $reversed++;
557 ($start,$end) = ($end,$start);
559 $start-- if defined $start;
560 $end-- if defined $end;
562 my $offset1 = $self->_offset_boundary($seqid,$start || 'left');
563 my $offset2 = $self->_offset_boundary($seqid,$end || 'right');
564 my $sequence_table = $self->_sequence_table;
565 my $locationlist_table = $self->_locationlist_table;
567 # CROSS JOIN gives a hint to the SQLite query optimizer -- mucho speedup!
568 my $sth = $self->_prepare(<<END);
569 SELECT sequence,offset
570 FROM $locationlist_table as ll CROSS JOIN $sequence_table as s
571 WHERE ll.id=s.id
572 AND ll.seqname= ?
573 AND offset >= ?
574 AND offset <= ?
575 ORDER BY offset
578 my $seq = '';
579 $sth->execute($seqid,$offset1,$offset2) or $self->throw($sth->errstr);
581 while (my($frag,$offset) = $sth->fetchrow_array) {
582 substr($frag,0,$start-$offset) = '' if defined $start && $start > $offset;
583 $seq .= $frag;
585 substr($seq,$end-$start+1) = '' if defined $end && $end-$start+1 < length($seq);
586 if ($reversed) {
587 $seq = reverse $seq;
588 $seq =~ tr/gatcGATC/ctagCTAG/;
590 $sth->finish;
591 $seq;
594 sub _offset_boundary {
595 my $self = shift;
596 my ($seqid,$position) = @_;
598 my $sequence_table = $self->_sequence_table;
599 my $locationlist_table = $self->_locationlist_table;
601 my $sql;
602 # use "CROSS JOIN" to give a hint to the SQLite query optimizer.
603 $sql = $position eq 'left' ? "SELECT min(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=?"
604 :$position eq 'right' ? "SELECT max(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=?"
605 :"SELECT max(offset) FROM $locationlist_table as ll CROSS JOIN $sequence_table as s ON ll.id=s.id WHERE ll.seqname=? AND offset<=?";
606 my $sth = $self->_prepare($sql);
607 my @args = $position =~ /^-?\d+$/ ? ($seqid,$position) : ($seqid);
608 $sth->execute(@args) or $self->throw($sth->errstr);
609 my $boundary = $sth->fetchall_arrayref->[0][0];
610 $sth->finish;
611 return $boundary;
615 # Efficiently fetch a series of IDs from the database
616 # Can pass an array or an array ref
618 sub _fetch_many {
619 my $self = shift;
620 @_ or $self->throw('usage: fetch_many($id1,$id2,$id3...)');
621 my $ids = join ',',map {ref($_) ? @$_ : $_} @_ or return;
622 my $features = $self->_feature_table;
624 my $sth = $self->_prepare(<<END);
625 SELECT id,object FROM $features WHERE id IN ($ids)
627 $sth->execute() or $self->throw($sth->errstr);
628 return $self->_sth2objs($sth);
631 sub _features {
632 my $self = shift;
633 my ($seq_id,$start,$end,$strand,
634 $name,$class,$allow_aliases,
635 $types,
636 $attributes,
637 $range_type,
638 $fromtable,
639 $iterator,
640 $sources
641 ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
642 'NAME','CLASS','ALIASES',
643 ['TYPES','TYPE','PRIMARY_TAG'],
644 ['ATTRIBUTES','ATTRIBUTE'],
645 'RANGE_TYPE',
646 'FROM_TABLE',
647 'ITERATOR',
648 ['SOURCE','SOURCES']
649 ],@_);
651 my (@from,@where,@args,@group);
652 $range_type ||= 'overlaps';
654 my $feature_table = $self->_feature_table;
655 @from = "$feature_table as f";
657 if (defined $name) {
658 # hacky backward compatibility workaround
659 undef $class if $class && $class eq 'Sequence';
660 $name = "$class:$name" if defined $class && length $class > 0;
661 # last argument is the join field
662 my ($from,$where,$group,@a) = $self->_name_sql($name,$allow_aliases,'f.id');
663 push @from,$from if $from;
664 push @where,$where if $where;
665 push @group,$group if $group;
666 push @args,@a;
669 if (defined $seq_id) {
670 # last argument is the name of the features table
671 my ($from,$where,$group,@a) = $self->_location_sql($seq_id,$start,$end,$range_type,$strand,'f');
672 push @from,$from if $from;
673 push @where,$where if $where;
674 push @group,$group if $group;
675 push @args,@a;
678 if (defined($sources)) {
679 my @sources = ref($sources) eq 'ARRAY' ? @{$sources} : ($sources);
680 if (defined($types)) {
681 my @types = ref($types) eq 'ARRAY' ? @{$types} : ($types);
682 my @final_types;
683 foreach my $type (@types) {
684 # *** not sure what to do if user supplies both -source and -type
685 # where the type includes a source!
686 if ($type =~ /:/) {
687 push(@final_types, $type);
689 else {
690 foreach my $source (@sources) {
691 push(@final_types, $type.':'.$source);
695 $types = \@final_types;
697 else {
698 $types = [map { ':'.$_ } @sources];
701 if (defined($types)) {
702 # last argument is the name of the features table
703 my ($from,$where,$group,@a) = $self->_types_sql($types,'f');
704 push @from,$from if $from;
705 push @where,$where if $where;
706 push @group,$group if $group;
707 push @args,@a;
710 if (defined $attributes) {
711 # last argument is the join field
712 my ($from,$where,$group,@a) = $self->_attributes_sql($attributes,'f.id');
713 push @from,$from if $from;
714 push @where,$where if $where;
715 push @group,$group if $group;
716 push @args,@a;
719 if (defined $fromtable) {
720 # last argument is the join field
721 my ($from,$where,$group,@a) = $self->_from_table_sql($fromtable,'f.id');
722 push @from,$from if $from;
723 push @where,$where if $where;
724 push @group,$group if $group;
725 push @args,@a;
728 # if no other criteria are specified, then
729 # only fetch indexed (i.e. top level objects)
730 @where = '"indexed"=1' unless @where;
732 my $from = join ', ',@from;
733 my $where = join ' AND ',map {"($_)"} @where;
734 my $group = join ', ',@group;
735 $group = "GROUP BY $group" if @group;
737 my $query = <<END;
738 SELECT f.id,f.object
739 FROM $from
740 WHERE $where
741 $group
744 $self->_print_query($query,@args) if DEBUG || $self->debug;
746 my $sth = $self->_prepare($query);
747 $sth->execute(@args) or $self->throw($sth->errstr);
748 return $iterator ? Bio::DB::SeqFeature::Store::DBI::Iterator->new($sth,$self) : $self->_sth2objs($sth);
751 sub _make_attribute_group {
752 my $self = shift;
753 my ($table_name,$attributes) = @_;
754 my $key_count = keys %$attributes or return;
755 my $count = $key_count-1;
756 return "f.id HAVING count(f.id)>$count";
759 sub _location_sql {
760 my $self = shift;
761 my ($seq_id,$start,$end,$range_type,$strand,$location) = @_;
763 # the additional join on the location_list table badly impacts performance
764 # so we build a copy of the table in memory
765 my $seqid = $self->_locationid_nocreate($seq_id) || 0; # zero is an invalid primary ID, so will return empty
767 my $feature_index = $self->_feature_index_table;
768 my $from = "$feature_index as fi";
770 my ($bin_where,@bin_args);
771 if (defined $start && defined $end && !$self->_has_spatial_index) {
772 my @bins = $self->search_bins($start,$end);
773 $bin_where = ' AND bin in ('.join(',',@bins).')';
776 $start = MIN_INT unless defined $start;
777 $end = MAX_INT unless defined $end;
779 my ($range,@range_args);
780 if ($range_type eq 'overlaps') {
781 $range = "fi.end>=? AND fi.start<=?".$bin_where;
782 @range_args = ($start,$end,@bin_args);
783 } elsif ($range_type eq 'contains') {
784 $range = "fi.start>=? AND fi.end<=?".$bin_where;
785 @range_args = ($start,$end,@bin_args);
786 } elsif ($range_type eq 'contained_in') {
787 $range = "fi.start<=? AND fi.end>=?";
788 @range_args = ($start,$end);
789 } else {
790 $self->throw("range_type must be one of 'overlaps', 'contains' or 'contained_in'");
793 if (defined $strand) {
794 $range .= " AND strand=?";
795 push @range_args,$strand;
798 my $where = <<END;
799 fi.seqid=?
800 AND $location.id=fi.id
801 AND $range
804 my $group = '';
806 my @args = ($seqid,@range_args);
807 return ($from,$where,$group,@args);
810 sub _feature_index_table {
811 my $self = shift;
812 return $self->_has_spatial_index ? $self->_qualify('feature_index')
813 : $self->_qualify('feature_location') }
815 # Do a case-insensitive search a la the PostgreSQL adaptor
816 sub _name_sql {
817 my $self = shift;
818 my ($name,$allow_aliases,$join) = @_;
819 my $name_table = $self->_name_table;
821 my $from = "$name_table as n";
822 my ($match,$string) = $self->_match_sql($name);
824 my $where = "n.id=$join AND n.name $match COLLATE NOCASE";
825 $where .= " AND n.display_name>0" unless $allow_aliases;
826 return ($from,$where,'',$string);
829 sub _search_attributes {
830 my $self = shift;
831 my ($search_string,$attribute_names,$limit) = @_;
832 my @words = map {quotemeta($_)} split /\s+/,$search_string;
834 my $name_table = $self->_name_table;
835 my $attribute_table = $self->_attribute_table;
836 my $attributelist_table = $self->_attributelist_table;
837 my $type_table = $self->_type_table;
838 my $typelist_table = $self->_typelist_table;
839 my $has_fts = $self->_has_fts;
841 my @tags = @$attribute_names;
842 my $tag_sql = join ' OR ',("al.tag=?") x @tags;
844 my $perl_regexp = join '|',@words;
846 my $sql_regexp;
847 my @wild_card_words;
848 if ($has_fts) {
849 $sql_regexp = "a.attribute_value MATCH ?";
850 @wild_card_words = join(' OR ', @words);
851 } else {
852 $sql_regexp = join ' OR ',("a.attribute_value LIKE ?") x @words;
853 @wild_card_words = map { "%$_%" } @words;
855 # CROSS JOIN hinders performance with FTS attribute table for DBD::SQLite 1.42
856 my $sql = <<END;
857 SELECT name,attribute_value,tl.tag,n.id
858 FROM $attributelist_table AS al
859 JOIN $attribute_table AS a ON al.id = a.attribute_id
860 JOIN $name_table AS n ON n.id = a.id
861 JOIN $type_table AS t ON t.id = n.id
862 JOIN $typelist_table AS tl ON tl.id = t.typeid
863 WHERE ($tag_sql)
864 AND ($sql_regexp)
865 AND n.display_name=1
867 $sql .= "LIMIT $limit" if defined $limit;
868 $self->_print_query($sql,@tags,@wild_card_words) if DEBUG || $self->debug;
869 my $sth = $self->_prepare($sql);
870 $sth->execute(@tags, @wild_card_words) or $self->throw($sth->errstr);
872 my @results;
873 while (my($name,$value,$type,$id) = $sth->fetchrow_array) {
874 my (@hits) = $value =~ /$perl_regexp/ig;
875 my @words_in_row = split /\b/,$value;
876 my $score = int(@hits*100/@words/@words_in_row);
877 push @results,[$name,$value,$score,$type,$id];
879 $sth->finish;
880 @results = sort {$b->[2]<=>$a->[2]} @results;
881 return @results;
884 sub _match_sql {
885 my $self = shift;
886 my $name = shift;
888 my ($match,$string);
889 if ($name =~ /(?:^|[^\\])[*?]/) {
890 $name =~ s/(^|[^\\])([%_])/$1\\$2/g;
891 $name =~ s/(^|[^\\])\*/$1%/g;
892 $name =~ s/(^|[^\\])\?/$1_/g;
893 $match = "LIKE ?";
894 $string = $name;
895 } else {
896 $match = "= ? COLLATE NOCASE";
897 $string = $name;
899 return ($match,$string);
902 sub _attributes_sql {
903 my $self = shift;
904 my ($attributes,$join) = @_;
906 my ($wf,@bind_args) = $self->_make_attribute_where('a','al',$attributes);
907 my ($group_by,@group_args)= $self->_make_attribute_group('a',$attributes);
909 my $attribute_table = $self->_attribute_table;
910 my $attributelist_table = $self->_attributelist_table;
912 my $from = "$attribute_table AS a" . ($self->_has_fts
913 ? ''
914 : " INDEXED BY index_attribute_id") . ", $attributelist_table AS al";
915 my $a_al_join = $self->_has_fts ? 'a.attribute_id MATCH al.id' : 'a.attribute_id=al.id';
917 my $where = <<END;
918 a.id=$join
919 AND $a_al_join
920 AND ($wf)
923 my $group = $group_by;
925 my @args = (@bind_args,@group_args);
926 return ($from,$where,$group,@args);
929 # overridden because of case-sensitivity of matches
930 sub _types_sql {
931 my $self = shift;
932 my ($types,$type_table) = @_;
933 my ($primary_tag,$source_tag);
935 my @types = ref $types eq 'ARRAY' ? @$types : $types;
937 my $typelist = $self->_typelist_table;
938 my $from = "$typelist AS tl";
940 my (@matches,@args);
942 for my $type (@types) {
944 if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
945 $primary_tag = $type->method;
946 $source_tag = $type->source;
947 } else {
948 ($primary_tag,$source_tag) = split ':',$type,2;
951 if (length $source_tag) {
952 push @matches,"tl.tag=? COLLATE NOCASE";
953 push @args,"$primary_tag:$source_tag";
954 } else {
955 push @matches,"tl.tag LIKE ?";
956 push @args,"$primary_tag:%";
959 my $matches = join ' OR ',@matches;
961 my $where = <<END;
962 tl.id=$type_table.typeid
963 AND ($matches)
966 return ($from,$where,'',@args);
969 sub optimize {
970 my $self = shift;
971 $self->dbh->do("ANALYZE $_") foreach $self->index_tables;
975 # Replace Bio::SeqFeatureI into database.
977 sub replace {
978 my $self = shift;
979 my $object = shift;
980 my $index_flag = shift || undef;
982 # ?? shouldn't need to do this
983 # $self->_load_class($object);
984 my $id = $object->primary_id;
985 my $features = $self->_feature_table;
987 my $sth = $self->_prepare(<<END);
988 REPLACE INTO $features (id,object,"indexed",strand,typeid) VALUES (?,?,?,?,?)
991 my ($seqid,$start,$end,$strand,$bin) = $index_flag ? $self->_get_location_and_bin($object) : (undef)x6;
993 my $primary_tag = $object->primary_tag;
994 my $source_tag = $object->source_tag || '';
995 $primary_tag .= ":$source_tag";
996 my $typeid = $self->_typeid($primary_tag,1);
998 my $frozen = $self->no_blobs() ? 0 : $self->freeze($object);
1000 $sth->bind_param(1, $id);
1001 $sth->bind_param(2, $frozen, {TYPE => SQL_BLOB});
1002 $sth->bind_param(3, $index_flag||0);
1003 $sth->bind_param(4, $strand);
1004 $sth->bind_param(5, $typeid);
1006 $sth->execute() or $self->throw($sth->errstr);
1008 my $dbh = $self->dbh;
1009 $object->primary_id($dbh->func('last_insert_rowid')) unless defined $id;
1011 $self->flag_for_indexing($dbh->func('last_insert_rowid')) if $self->{bulk_update_in_progress};
1014 # doesn't work with this schema, since we have to update name and attribute
1015 # tables which need object ids, which we can only know by replacing feats in
1016 # the feature table one by one
1017 sub bulk_replace {
1018 my $self = shift;
1019 my $index_flag = shift || undef;
1020 my @objects = @_;
1022 my $features = $self->_feature_table;
1024 my @insert_values;
1025 foreach my $object (@objects) {
1026 my $id = $object->primary_id;
1027 my (undef,undef,undef,$strand) = $index_flag ? $self->_get_location_and_bin($object) : (undef)x4;
1028 my $primary_tag = $object->primary_tag;
1029 my $source_tag = $object->source_tag || '';
1030 $primary_tag .= ":$source_tag";
1031 my $typeid = $self->_typeid($primary_tag,1);
1033 push(@insert_values, ($id,0,$index_flag||0,$strand,$typeid));
1036 my @value_blocks;
1037 for (1..@objects) {
1038 push(@value_blocks, '(?,?,?,?,?)');
1040 my $value_blocks = join(',', @value_blocks);
1041 my $sql = qq{REPLACE INTO $features (id,object,"indexed",strand,typeid) VALUES $value_blocks};
1043 my $sth = $self->_prepare($sql);
1044 $sth->execute(@insert_values) or $self->throw($sth->errstr);
1047 sub _get_location_and_bin {
1048 my $self = shift;
1049 my $obj = shift;
1050 my $seqid = $self->_locationid($obj->seq_id||'');
1051 my $start = $obj->start;
1052 my $end = $obj->end;
1053 my $strand = $obj->strand;
1054 return ($seqid,$start,$end,$strand,$self->calculate_bin($start,$end));
1060 # Insert one Bio::SeqFeatureI into database. primary_id must be undef
1062 sub insert {
1063 my $self = shift;
1064 my $object = shift;
1065 my $index_flag = shift || 0;
1067 $self->_load_class($object);
1068 defined $object->primary_id and $self->throw("$object already has a primary id");
1070 my $features = $self->_feature_table;
1071 my $sth = $self->_prepare(<<END);
1072 INSERT INTO $features (id,object,"indexed") VALUES (?,?,?)
1074 $sth->execute(undef,$self->freeze($object),$index_flag) or $self->throw($sth->errstr);
1075 my $dbh = $self->dbh;
1076 $object->primary_id($dbh->func('last_insert_rowid'));
1077 $self->flag_for_indexing($dbh->func('last_insert_rowid')) if $self->{bulk_update_in_progress};
1080 =head2 toplevel_types
1082 Title : toplevel_types
1083 Usage : @type_list = $db->toplevel_types
1084 Function: Get the toplevel types in the database
1085 Returns : array of Bio::DB::GFF::Typename objects
1086 Args : none
1087 Status : public
1089 This is similar to types() but only returns the types of
1090 INDEXED (toplevel) features.
1092 =cut
1094 sub toplevel_types {
1095 my $self = shift;
1096 eval "require Bio::DB::GFF::Typename"
1097 unless Bio::DB::GFF::Typename->can('new');
1098 my $typelist_table = $self->_typelist_table;
1099 my $feature_table = $self->_feature_table;
1100 my $sql = <<END;
1101 SELECT distinct(tag) from $typelist_table as tl,$feature_table as f
1102 WHERE tl.id=f.typeid
1103 AND f."indexed"=1
1106 $self->_print_query($sql) if DEBUG || $self->debug;
1107 my $sth = $self->_prepare($sql);
1108 $sth->execute() or $self->throw($sth->errstr);
1110 my @results;
1111 while (my($tag) = $sth->fetchrow_array) {
1112 push @results,Bio::DB::GFF::Typename->new($tag);
1114 $sth->finish;
1115 return @results;
1118 sub _genericid {
1119 my $self = shift;
1120 my ($table,$namefield,$name,$add_if_missing) = @_;
1121 my $qualified_table = $self->_qualify($table);
1122 my $sth = $self->_prepare(<<END);
1123 SELECT id FROM $qualified_table WHERE $namefield=? COLLATE NOCASE
1125 $sth->execute($name) or die $sth->errstr;
1126 my ($id) = $sth->fetchrow_array;
1127 $sth->finish;
1128 return $id if defined $id;
1129 return unless $add_if_missing;
1131 $sth = $self->_prepare(<<END);
1132 INSERT INTO $qualified_table ($namefield) VALUES (?)
1134 $sth->execute($name) or die $sth->errstr;
1135 my $dbh = $self->dbh;
1136 return $dbh->func('last_insert_rowid');
1139 # special-purpose store for bulk loading - write to a file rather than to the db
1141 sub _dump_store {
1142 my $self = shift;
1143 my $indexed = shift;
1145 my $count = 0;
1146 my $store_fh = $self->dump_filehandle('feature');
1147 my $dbh = $self->dbh;
1149 my $autoindex = $self->autoindex;
1151 for my $obj (@_) {
1152 my $id = $self->next_id;
1153 my ($seqid,$start,$end,$strand) = $indexed ? $self->_get_location_and_bin($obj) : (undef)x4;
1154 my $primary_tag = $obj->primary_tag;
1155 my $source_tag = $obj->source_tag || '';
1156 $primary_tag .= ":$source_tag";
1157 my $typeid = $self->_typeid($primary_tag,1);
1159 # Encode BLOB in hex so we can more easily import it into SQLite
1160 print $store_fh
1161 join("\t",$id,$typeid,$strand,$indexed,
1162 unpack('H*', $self->freeze($obj))),"\n";
1163 $obj->primary_id($id);
1164 $self->_update_indexes($obj) if $indexed && $autoindex;
1165 $count++;
1168 # remember whether we are have ever stored a non-indexed feature
1169 unless ($indexed or $self->{indexed_flag}++) {
1170 $self->subfeatures_are_indexed(0);
1172 $count;
1175 sub _dump_update_name_index {
1176 my $self = shift;
1177 my ($obj,$id) = @_;
1178 my $fh = $self->dump_filehandle('name');
1179 my $dbh = $self->dbh;
1180 my ($names,$aliases) = $self->feature_names($obj);
1181 # unlike DBI::mysql, don't quote, as quotes will be quoted when loaded
1182 print $fh join("\t",$id,$_,1),"\n" foreach @$names;
1183 print $fh join("\t",$id,$_,0),"\n" foreach @$aliases;
1186 sub _update_name_index {
1187 my $self = shift;
1188 my ($obj,$id) = @_;
1189 my $name = $self->_name_table;
1190 my $primary_id = $obj->primary_id;
1192 $self->_delete_index($name,$id);
1193 my ($names,$aliases) = $self->feature_names($obj);
1195 my $sth = $self->_prepare("INSERT INTO $name (id,name,display_name) VALUES (?,?,?)");
1197 $sth->execute($id,$_,1) or $self->throw($sth->errstr) foreach @$names;
1198 $sth->execute($id,$_,0) or $self->throw($sth->errstr) foreach @$aliases;
1199 $sth->finish;
1203 sub _dump_update_attribute_index {
1204 my $self = shift;
1205 my ($obj,$id) = @_;
1206 my $fh = $self->dump_filehandle('attribute');
1207 my $dbh = $self->dbh;
1208 for my $tag ($obj->all_tags) {
1209 my $tagid = $self->_attributeid($tag);
1210 for my $value ($obj->each_tag_value($tag)) {
1211 # unlike DBI::mysql, don't quote, as quotes will be quoted when loaded
1212 print $fh join("\t",$id,$tagid,$value),"\n";
1217 sub _update_indexes {
1218 my $self = shift;
1219 my $obj = shift;
1220 defined (my $id = $obj->primary_id) or return;
1221 $self->SUPER::_update_indexes($obj);
1223 if ($self->{bulk_update_in_progress}) {
1224 $self->_dump_update_location_index($obj,$id);
1225 } else {
1226 $self->_update_location_index($obj,$id);
1230 sub _update_location_index {
1231 my $self = shift;
1232 my ($obj,$id) = @_;
1233 my ($seqid,$start,$end,$strand,$bin) = $self->_get_location_and_bin($obj);
1235 my $table = $self->_feature_index_table;
1236 $self->_delete_index($table,$id);
1238 my ($sql,@args);
1240 if ($self->_has_spatial_index) {
1241 $sql = "INSERT INTO $table (id,seqid,bin,start,end) values (?,?,?,?,?)";
1242 @args = ($id,$seqid,$bin,$start,$end);
1243 } else {
1244 $sql = "INSERT INTO $table (id,seqid,bin,start,end) values (?,?,?,?,?)";
1245 @args = ($id,$seqid,$bin,$start,$end);
1248 my $sth = $self->_prepare($sql);
1249 $sth->execute(@args);
1250 $sth->finish;
1253 sub _dump_update_location_index {
1254 my $self = shift;
1255 my ($obj,$id) = @_;
1256 my $table = $self->_feature_index_table;
1257 my $fh = $self->dump_filehandle($table);
1258 my $dbh = $self->dbh;
1259 my ($seqid,$start,$end,$strand,$bin) = $self->_get_location_and_bin($obj);
1260 my @args = $self->_has_spatial_index ? ($id,$seqid,$bin,$start,$end)
1261 : ($id,$seqid,$bin,$start,$end);
1262 print $fh join("\t",@args),"\n";
1265 sub DESTROY {
1266 my $self = shift;
1267 # Remove filehandles, so temporal files can be properly deleted
1268 if (%DBI::installed_drh) {
1269 DBI->disconnect_all;
1270 %DBI::installed_drh = ();
1272 undef $self->{dbh};
1277 =head1 AUTHOR
1279 Nathan Weeks - Nathan.Weeks@ars.usda.gov
1281 Copyright (c) 2009 Nathan Weeks
1283 Modified 2010 to support cumulative statistics by Lincoln Stein
1284 <lincoln.stein@gmail.com>.
1286 This library is free software; you can redistribute it and/or modify
1287 it under the same terms as Perl itself. See the Bioperl license for
1288 more details.
1290 =cut