2 # BioPerl module for Bio::DB::Taxonomy::flatfile
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Chris Fields <cjfields-at-cpan-dot-org>
8 # Copyright Chris Fields
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::DB::Taxonomy::sqlite - SQLite-based implementation of Bio::DB::Taxonomy::flatfile
20 use Bio::DB::Taxonomy;
22 my $db = Bio::DB::Taxonomy->new(-source => 'sqlite',
23 -db => 'mytax.db' # default 'taxonomy.sqlite'
24 -nodesfile => 'nodes.dmp',
25 -namesfile => 'names.dmp');
29 This is an implementation of Bio::DB::Taxonomy which stores and accesses the
30 NCBI taxonomy using a simple SQLite3 database stored locally on disk.
32 With this implementation, one can do the same basic searches as with the 'flatfile'
33 database. A test lookup of 1000 NCBI TaxIDs with full lineage information took
34 about 2 seconds on my older MacBook Pro laptop with an on-disk implementation.
36 A few key differences:
40 =item * You can use typical SQL syntax to run a query search; for instance, if you want you can run:
42 @ids = sort $db->get_taxonids('Chloroflexi%');
44 =item * In-memory database is allowed
46 my $db = Bio::DB::Taxonomy->new(-source => 'sqlite',
48 -nodesfile => 'nodes.dmp',
49 -namesfile => 'names.dmp');
53 The required database files, nodes.dmp and names.dmp can be obtained from
54 ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
60 =item * Small optimizations, such as optimizing name lookups
62 =item * Possibly use L<recursive CTE|http://www.sqlite.org/lang_with.html> to do lineage lookups
64 =item * Clean up SQL (still kind of a mess right now)
66 =item * Check compat. with other NCBI-specific L<Bio::DB::Taxonomy> implementations
68 =item * Plan out feasibility of allowing other backends (Neo4J, other DBI, etc)
70 =item * Optionally calculate left/right ID values for TaxID nodes
74 Beyond completing the implementation and optimization, this will
75 likely be rolled into a more flexible backend at some future point.
81 User feedback is an integral part of the evolution of this and other
82 Bioperl modules. Send your comments and suggestions preferably to
83 the Bioperl mailing list. Your participation is much appreciated.
85 bioperl-l@bioperl.org - General discussion
86 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
90 Please direct usage questions or support issues to the mailing list:
92 I<bioperl-l@bioperl.org>
94 rather than to the module maintainer directly. Many experienced and
95 reponsive experts will be able look at the problem and quickly
96 address it. Please include a thorough description of the problem
97 with code and data examples if at all possible.
101 Report bugs to the Bioperl bug tracking system to help us keep track
102 of the bugs and their resolution. Bug reports can be submitted via
105 https://github.com/bioperl/bioperl-live/issues
107 =head1 AUTHOR - Chris Fields
109 Email cjfields-at-cpan-dot-org
113 The rest of the documentation details each of the object methods.
114 Internal methods are usually preceded with a _
118 # Let the code begin...
120 package Bio
::DB
::Taxonomy
::sqlite
;
126 use File
::Spec
::Functions
;
130 use constant SEPARATOR
=> ':';
132 our $DEFAULT_INDEX_DIR = $Bio::Root
::IO
::TEMPDIR
; # /tmp
133 our $DEFAULT_CACHE_SIZE = 0; # /tmp
134 our $DEFAULT_DB_NAME = 'taxonomy.sqlite';
138 [qw(INV Invertebrates)],
141 [qw(PLN Plants)], # (and fungi)
145 [qw(UNA Unassigned)],
147 [qw(VRT Vertebrates)],
148 [qw(ENV 'Environmental samples')]
151 use base
qw(Bio::DB::Taxonomy);
156 Usage : my $obj = Bio::DB::Taxonomy::flatfile->new();
157 Function: Builds a new Bio::DB::Taxonomy::flatfile object
158 Returns : an instance of Bio::DB::Taxonomy::flatfile
159 Args : -directory => name of directory where index files should be created
160 -nodesfile => name of file containing nodes (nodes.dmp from NCBI)
161 -namesfile => name of the file containing names(names.dmp from NCBI)
162 -force => 1 to replace current indexes even if they exist
166 # TODO: get rid of globals!
168 my ( $class, @args ) = @_;
170 my $self = $class->SUPER::new
(@args);
172 my ( $dir, $nodesfile, $namesfile, $db, $force, $cs ) =
173 $self->_rearrange( [qw(DIRECTORY NODESFILE NAMESFILE DB FORCE CACHE_SIZE)], @args );
175 $self->index_directory( $dir || $DEFAULT_INDEX_DIR );
177 $self->db_name( $db || $DEFAULT_DB_NAME );
179 $self->cache_size($cs // $DEFAULT_CACHE_SIZE);
182 $self->_build_index( $nodesfile, $namesfile, $force );
189 =head2 Bio::DB::Taxonomy interface implementation
194 Usage : my $num = $db->get_num_taxa();
195 Function: Get the number of taxa stored in the database.
204 my $ct = $self->_dbh_fetch(<<SQL);
205 SELECT COUNT(*) FROM taxon
214 Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
215 Function: Get a Bio::Taxon object from the database.
216 Returns : Bio::Taxon object
217 Args : just a single value which is the database id, OR named args:
218 -taxonid => taxonomy id (to query by taxonid)
220 -name => string (to query by a taxonomy name: common name,
221 scientific name, etc)
227 my ( $taxonid, $name );
230 ( $taxonid, $name ) = $self->_rearrange( [qw(TAXONID NAME)], @_ );
232 ( $taxonid, my @others ) = $self->get_taxonids($name);
234 "There were multiple ids ($taxonid @others) matching '$name', using '$taxonid'"
242 return unless $taxonid;
244 $taxonid =~ /^\d+$/ || $self->throw("TaxID must be integer, got [$taxonid]");
246 my ( $parent_id, $rank, $code, $divid, $gen_code, $mito, $nm, $uniq, $class );
247 # single join or two calls?
248 my $sth = $self->_prepare_cached(<<SQL);
249 SELECT tax.parent_id, tax.rank, tax.code, tax.division_id, tax.gencode_id, tax.mito_id, names.name, names.uniq_name, names.class
250 FROM taxon as tax, names
254 names.taxon_id = tax.taxon_id
257 $sth->bind_columns(\
$parent_id, \
$rank, \
$code, \
$divid, \
$gen_code, \
$mito, \
$nm, \
$uniq, \
$class);
259 $sth->execute($taxonid) or $self->throw($sth->errstr);
261 my ($sci_name, @common_names);
263 while ($sth->fetch) {
264 if ($class eq 'scientific name') {
267 push @common_names, $nm;
271 my $taxon = Bio
::Taxon
->new(
273 -common_names
=> [@common_names],
274 -ncbi_taxid
=> $taxonid,
275 -parent_id
=> $parent_id,
277 -division
=> $DIVISIONS[$divid]->[1],
278 -genetic_code
=> $gen_code,
279 -mito_genetic_code
=> $mito
282 # we can't use -dbh or the db_handle() method ourselves or we'll go
283 # infinite on the merge attempt
284 $taxon->{'db_handle'} = $self;
286 $self->_handle_internal_id($taxon);
291 *get_Taxonomy_Node
= \
&get_taxon
;
296 Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
297 Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
298 string. Note that multiple taxonids can match to the same supplied
300 Returns : array of integer ids in list context, one of these in scalar context
301 Args : string representing taxon's name
306 my ( $self, $query ) = @_;
308 # TODO: note we're not cleaning the query here, so you could technically
309 # have a fuzzy match (or Bobby Tables someone)
311 # TODO: OR'd match seems poor optimally
312 my $taxids = $self->{dbh
}->selectcol_arrayref(<<SQL);
313 SELECT DISTINCT taxon_id FROM names
317 uniq_name LIKE "$query"
320 return wantarray() ? @
{$taxids} : @
{$taxids}[0];
323 *get_taxonid
= \
&get_taxonids
;
325 =head2 get_Children_Taxids
327 Title : get_Children_Taxids
328 Usage : my @childrenids = $db->get_Children_Taxids
329 Function: Get the ids of the children of a node in the taxonomy
330 Returns : Array of Ids
331 Args : Bio::Taxon or a taxon_id
332 Status : deprecated (use each_Descendent())
336 sub get_Children_Taxids
{
337 my ( $self, $node ) = @_;
338 $self->deprecated(); # ?
340 # "get_Children_Taxids is deprecated, use each_Descendent instead");
343 # if ( $node->can('object_id') ) {
344 # $id = $node->object_id;
346 # elsif ( $node->can('ncbi_taxid') ) {
347 # $id = $node->ncbi_taxid;
351 # "Don't know how to extract a taxon id from the object of type "
357 #else { $id = $node }
358 #my @vals = $self->{'_parentbtree'}->get_dup($id);
365 Usage : my $ancestor_taxon = $db->ancestor($taxon)
366 Function: Retrieve the full ancestor taxon of a supplied Taxon from the
369 Args : Bio::Taxon (that was retrieved from this database)
374 my ( $self, $taxon ) = @_;
375 $self->throw("Must supply a Bio::Taxon")
376 unless ref($taxon) && $taxon->isa('Bio::Taxon');
377 $self->throw("The supplied Taxon must belong to this database")
378 unless $taxon->db_handle && $taxon->db_handle eq $self;
380 $taxon->id || $self->throw("The supplied Taxon is missing its id!");
383 # Note here we explicitly set the parent ID, but use a separate method to
384 # check whether it is defined. Mixing back-end databases, even if from the
385 # same source, should still work (since a different backend wouldn't
386 # explicitly set the parent_id)
388 if ($taxon->trusted_parent_id) {
389 # this is the failsafe when we hit the root node
390 if ($taxon->parent_id eq $id) {
393 return $self->get_taxon(-taxonid
=> $taxon->parent_id);
395 # TODO: would there be any other option?
400 # TODO: this may act as a drop-in for a recursive CTE lookup
405 # Usage : my @ancestor_taxa = $db->ancestors($taxon)
406 # Function: Retrieve the full ancestor taxon of a supplied Taxon from the
408 # Returns : List of Bio::Taxon
409 # Args : Bio::Taxon (that was retrieved from this database)
413 #sub ancestors { ... }
415 =head2 each_Descendent
417 Title : each_Descendent
418 Usage : my @taxa = $db->each_Descendent($taxon);
419 Function: Get all the descendents of the supplied Taxon (but not their
420 descendents, ie. not a recursive fetchall).
421 Returns : Array of Bio::Taxon objects
422 Args : Bio::Taxon (that was retrieved from this database)
426 sub each_Descendent
{
427 my ( $self, $taxon ) = @_;
428 $self->throw("Must supply a Bio::Taxon")
429 unless ref($taxon) && $taxon->isa('Bio::Taxon');
430 $self->throw("The supplied Taxon must belong to this database")
431 unless $taxon->db_handle && $taxon->db_handle eq $self; # yikes
434 $taxon->id || $self->throw("The supplied Taxon is missing its id!");
436 #my ( $parent_id, $rank, $code, $divid, $gen_code, $mito, $nm, $uniq, $class );
437 # single join or two calls?
439 # probably not optimal, maybe set up as a cached statement with bindings?
440 my $desc_ids = $self->{dbh
}->selectcol_arrayref(<<SQL) or $self->throw($self->{dbh}->errstr);
447 return unless ref $desc_ids eq 'ARRAY';
450 foreach my $desc_id (@
$desc_ids) {
451 push( @descs, $self->get_taxon($desc_id) || next );
456 =head2 Helper methods
460 =head2 index_directory
462 Title : index_directory
463 Function : Get/set the location that index files are stored. (this module
464 will index the supplied database)
465 Usage : $obj->index_directory($newval)
466 Returns : value of index_directory (a scalar)
467 Args : on set, new value (a scalar or undef, optional)
468 Note : kept for backwards compatibility with older DB_File implementation
472 sub index_directory
{
474 return $self->{'index_directory'} = shift if @_;
475 return $self->{'index_directory'};
481 Function : Get/set the name of the SQLite3 database where data is stored
482 Usage : $obj->db_name($newval)
483 Returns : value of db_name (a scalar)
484 Args : on set, new value (a scalar or undef, optional)
488 # TODO: this may need some disambiguation w/ index_directory above; for now we
489 # assume this doesn't have a full path name (though I see no reason why this
490 # shouldn't allow that)
494 return $self->{'db_name'} = shift if @_;
495 return $self->{'db_name'};
501 Function : Get/set the cachesize used for loading the SQLite3 database
502 Usage : $obj->cache_size($newval)
503 Returns : value of cache_size (a scalar)
504 Args : on set, new value (a scalar or undef, optional)
505 Note : we do no checking on whether this value is an integer (SQLite does this for use)
511 return $self->{'cache_size'} = shift if defined($_[0]);
512 return $self->{'cache_size'};
515 # internal method which does the indexing
517 my ( $self, $nodesfile, $namesfile, $force ) = @_;
519 # TODO: need to disambiguate using index_directory here since we only have
520 # one file. Mayeb ignore it in favor of having full path for db_name?
521 my ($dir, $db_name) = ($self->index_directory, $self->db_name);
522 if (! -e
$db_name || $force) {
524 # TODO: we're ignoring index_directory for now, may add support for this
526 my $dbh = DBI
->connect("dbi:SQLite:dbname=$db_name","","") or die $!;
528 $self->debug("Running SQLite version:".$dbh->{sqlite_version
}."\n");
530 #$dbh->do('PRAGMA synchronous = 0'); # Non transaction safe!!!
532 if ($self->cache_size) {
533 my $cs = $self->cache_size;
534 $self->debug("Setting cache size $cs\n");
535 $dbh->do("PRAGMA cache_size = $cs")
538 $self->debug("Loading taxon table data\n");
539 $self->_init_db($dbh);
540 open my $NODES, '<', $nodesfile
541 or $self->throw("Could not read node file '$nodesfile': $!");
543 # TODO: this has the really unnecessary 'OR IGNORE' option added,
544 # apparently b.c the test data expects to handle cases where the TaxID
545 # is repeated in this table (which should never happen in this table). I
546 # will likely change this to throw under those circumstances
548 my $sth = $dbh->prepare_cached(<<SQL);
549 INSERT OR IGNORE INTO taxon (taxon_id, parent_id, rank, code, division_id, gencode_id, mito_id) VALUES (?,?,?,?,?,?,?)
555 my ($taxid,$parent,$rank,$code,$divid,undef,$gen_code,undef,$mito) = split(/\t\|\t/,$_);
561 $sth->execute($taxid, $parent, $rank, $code, $divid, $gen_code, $mito) or die $sth->errstr.": TaxID $taxid";
563 $dbh->do("COMMIT") or $self->throw($dbh->errstr);
567 $self->debug("Loading name table data\n");
568 open my $NAMES, '<', $namesfile
569 or $self->throw("Could not read names file '$namesfile': $!");
571 $sth = $dbh->prepare_cached(<<SQL) or $self->throw($dbh->errstr);
572 INSERT INTO names (taxon_id, name, uniq_name, class) VALUES (?,?,?,?)
578 my ($taxid, $name, $unique_name, $class) = split(/\t\|\t/,$_);
580 # don't include the fake root node 'root' or 'all' with id 1
583 $class =~ s/\s+\|\s*$//;
585 #if ($name =~ /\(class\)$/) { # it seems that only rank of class is ever used in this situation
586 # $name =~ s/\s+\(class\)$//;
589 $sth->execute($taxid, $name, $unique_name, $class) or $self->throw($sth->errstr);
595 $self->debug("Creating taxon index\n");
596 $dbh->do("CREATE INDEX parent_idx ON taxon (parent_id)") or $self->throw($dbh->errstr);
597 $self->debug("Creating name index\n");
598 $dbh->do("CREATE INDEX name_idx ON names (name)") or $self->throw($dbh->errstr);
599 $self->debug("Creating taxon name table index\n");
600 $dbh->do("CREATE INDEX taxon_name_idx ON names (taxon_id)") or $self->throw($dbh->errstr);
602 $dbh->do("PRAGMA foreign_keys = ON");
604 #$dbh->do('PRAGMA synchronous = 1');
606 $self->{'_initialized'} = 1;
611 # connect the internal db handle
614 return if $self->{'_initialized'};
616 my ($dir, $db_name) = ($self->index_directory, $self->db_name);
618 # TODO: we're ignoring index_directory for now, may add support for this
620 my $dbh = DBI
->connect("dbi:SQLite:dbname=$db_name","","") or die $!;
621 $dbh->do("PRAGMA foreign_keys = ON");
622 if ($self->cache_size) {
623 my $cs = $self->cache_size;
624 $self->debug("Setting cache size $cs\n");
625 $dbh->do("PRAGMA cache_size = $cs")
629 $self->{'_initialized'} = 1;
633 my ($self, $dbh) = @_;
634 my $schema = $self->taxon_schema();
635 # TODO: set up handler parameters here
636 for my $table (sort keys %$schema) {
637 $dbh->do("DROP TABLE IF EXISTS $table") or $self->throw($dbh->errstr);
638 $dbh->do("CREATE TABLE $table ".$schema->{$table}) or $self->throw($dbh->errstr);
644 my ($self, $sql) = @_;
645 # TODO: more sanity checks
646 my $rows = $self->{dbh
}->selectrow_arrayref($sql) or $self->throw( $self->{dbh
}->errstr );
650 sub _prepare_cached
{
651 my ($self, $sql) = @_;
652 # TODO: more sanity checks
653 my $sth = $self->{dbh
}->prepare_cached($sql) or $self->throw( $self->{dbh
}->errstr );
658 # TODO: check data size, this is a ballpark estimate (could be reduced)
664 taxon_id INTEGER UNIQUE PRIMARY KEY NOT NULL,
673 FOREIGN KEY(parent_id) REFERENCES taxon(taxon_id)
679 name_id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
682 uniq_name VARCHAR(50),
684 FOREIGN KEY(taxon_id) REFERENCES taxon(taxon_id)