2 # BioPerl module for Bio::DB::Taxonomy::flatfile
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::DB::Taxonomy::flatfile - Use the NCBI taxonomy from local indexed flat files
20 use Bio::DB::Taxonomy;
22 my $db = Bio::DB::Taxonomy->new(-source => 'flatfile' ,
23 -nodesfile => 'nodes.dmp',
24 -namesfile => 'names.dmp');
28 This is an implementation of Bio::DB::Taxonomy which stores and accesses the
29 NCBI taxonomy using flat files stored locally on disk and indexed using the
30 DB_File module RECNO data structure for fast retrieval.
32 The required database files, nodes.dmp and names.dmp can be obtained from
33 ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to
41 the Bioperl mailing list. Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
48 Please direct usage questions or support issues to the mailing list:
50 I<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 reponsive experts will be able look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 of the bugs and their resolution. Bug reports can be submitted via
63 https://github.com/bioperl/bioperl-live/issues
65 =head1 AUTHOR - Jason Stajich
67 Email jason-at-bioperl-dot-org
71 Sendu Bala: bix@sendu.me.uk
75 The rest of the documentation details each of the object methods.
76 Internal methods are usually preceded with a _
80 # Let the code begin...
82 package Bio
::DB
::Taxonomy
::flatfile
;
84 use vars
qw($DEFAULT_INDEX_DIR $DEFAULT_NODE_INDEX $DEFAULT_NAME2ID_INDEX
85 $DEFAULT_ID2NAME_INDEX $DEFAULT_PARENT_INDEX @DIVISIONS);
90 use File::Spec::Functions;
92 use constant SEPARATOR => ':';
94 $DEFAULT_INDEX_DIR = $Bio::Root::IO::TEMPDIR; # /tmp
95 $DEFAULT_NODE_INDEX = 'nodes';
96 $DEFAULT_NAME2ID_INDEX = 'names2id';
97 $DEFAULT_ID2NAME_INDEX = 'id2names';
98 $DEFAULT_PARENT_INDEX = 'parents';
100 $DB_BTREE->{'flags'} = R_DUP; # allow duplicate values in DB_File BTREEs
102 # 8192 bytes; this seems to work to keep OS X from complaining
103 $DB_HASH->{'bsize'} = 0x2000;
105 @DIVISIONS = ([qw(BCT Bacteria)],
106 [qw(INV Invertebrates)],
109 [qw(PLN Plants)], # (and fungi)
113 [qw(UNA Unassigned)],
115 [qw(VRT Vertebrates)],
116 [qw(ENV 'Environmental samples')]);
118 use base
qw(Bio::DB::Taxonomy);
123 Usage : my $obj = Bio::DB::Taxonomy::flatfile->new();
124 Function: Builds a new Bio::DB::Taxonomy::flatfile object
125 Returns : an instance of Bio::DB::Taxonomy::flatfile
126 Args : -directory => name of directory where index files should be created
127 -nodesfile => name of file containing nodes (nodes.dmp from NCBI)
128 -namesfile => name of the file containing names(names.dmp from NCBI)
129 -force => 1 to replace current indexes even if they exist
134 my($class, @args) = @_;
136 my $self = $class->SUPER::new
(@args);
137 my ($dir,$nodesfile,$namesfile,$force) =
138 $self->_rearrange([qw(DIRECTORY NODESFILE NAMESFILE FORCE)], @args);
140 $self->index_directory($dir || $DEFAULT_INDEX_DIR);
142 $self->_build_index($nodesfile,$namesfile,$force);
150 =head2 Bio::DB::Taxonomy interface implementation
155 Usage : my $num = $db->get_num_taxa();
156 Function: Get the number of taxa stored in the database.
164 if (not exists $self->{_num_taxa
}) {
166 while ( my ($parent, undef) = each %{$self->{_parent2children
}} ) {
169 $self->{_num_taxa
} = $num;
171 return $self->{_num_taxa
};
178 Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
179 Function: Get a Bio::Taxon object from the database.
180 Returns : Bio::Taxon object
181 Args : just a single value which is the database id, OR named args:
182 -taxonid => taxonomy id (to query by taxonid)
184 -name => string (to query by a taxonomy name: common name,
185 scientific name, etc)
191 my ($taxonid, $name);
194 ($taxonid, $name) = $self->_rearrange([qw(TAXONID NAME)],@_);
196 ($taxonid, my @others) = $self->get_taxonids($name);
197 $self->warn("There were multiple ids ($taxonid @others) matching '$name', using '$taxonid'") if @others > 0;
204 return unless $taxonid;
206 $taxonid =~ /^\d+$/ || return;
207 my $node = $self->{'_nodes'}->[$taxonid] || return;
208 length($node) || return;
209 my ($taxid, undef, $rank, $code, $divid, $gen_code, $mito) = split(SEPARATOR
,$node);
210 last unless defined $taxid;
211 my ($taxon_names) = $self->{'_id2name'}->[$taxid];
212 my ($sci_name, @common_names) = split(SEPARATOR
, $taxon_names);
214 my $taxon = Bio
::Taxon
->new(
216 -common_names
=> [@common_names],
217 -ncbi_taxid
=> $taxid, # since this is a real ncbi taxid, explicitly set it as one
219 -division
=> $DIVISIONS[$divid]->[1],
220 -genetic_code
=> $gen_code,
221 -mito_genetic_code
=> $mito );
222 # we can't use -dbh or the db_handle() method ourselves or we'll go
223 # infinite on the merge attempt
224 $taxon->{'db_handle'} = $self;
226 $self->_handle_internal_id($taxon);
231 *get_Taxonomy_Node
= \
&get_taxon
;
237 Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
238 Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
239 string. Note that multiple taxonids can match to the same supplied
241 Returns : array of integer ids in list context, one of these in scalar context
242 Args : string representing taxon's name
247 my ($self, $query) = @_;
248 my $ids = $self->{'_name2id'}->{lc($query)};
251 # try again converting underscores to spaces
253 $ids = $self->{'_name2id'}->{lc($query)};
257 my @ids = split(SEPARATOR
, $ids);
258 return wantarray() ?
@ids : shift @ids;
261 *get_taxonid
= \
&get_taxonids
;
264 =head2 get_Children_Taxids
266 Title : get_Children_Taxids
267 Usage : my @childrenids = $db->get_Children_Taxids
268 Function: Get the ids of the children of a node in the taxonomy
269 Returns : Array of Ids
270 Args : Bio::Taxon or a taxon_id
271 Status : deprecated (use each_Descendent())
275 sub get_Children_Taxids
{
276 my ($self, $node) = @_;
277 $self->warn("get_Children_Taxids is deprecated, use each_Descendent instead");
280 if( $node->can('object_id') ) {
281 $id = $node->object_id;
282 } elsif( $node->can('ncbi_taxid') ) {
283 $id = $node->ncbi_taxid;
285 $self->warn("Don't know how to extract a taxon id from the object of type ".ref($node)."\n");
288 } else { $id = $node }
289 my @vals = $self->{'_parentbtree'}->get_dup($id);
297 Usage : my $ancestor_taxon = $db->ancestor($taxon)
298 Function: Retrieve the full ancestor taxon of a supplied Taxon from the
301 Args : Bio::Taxon (that was retrieved from this database)
306 my ($self, $taxon) = @_;
307 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
308 $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
309 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
311 my $node = $self->{'_nodes'}->[$id];
313 my (undef, $parent_id) = split(SEPARATOR
,$node);
314 $parent_id || return;
315 $parent_id eq $id && return; # one of the roots
316 return $self->get_taxon($parent_id);
322 =head2 each_Descendent
324 Title : each_Descendent
325 Usage : my @taxa = $db->each_Descendent($taxon);
326 Function: Get all the descendents of the supplied Taxon (but not their
327 descendents, ie. not a recursive fetchall).
328 Returns : Array of Bio::Taxon objects
329 Args : Bio::Taxon (that was retrieved from this database)
333 sub each_Descendent
{
334 my ($self, $taxon) = @_;
335 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
336 $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
337 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
339 my @desc_ids = $self->{'_parentbtree'}->get_dup($id);
341 foreach my $desc_id (@desc_ids) {
342 push(@descs, $self->get_taxon($desc_id) || next);
348 =head2 Helper methods
352 # internal method which does the indexing
354 my ($self, $nodesfile, $namesfile, $force) = @_;
356 my $dir = $self->index_directory;
357 my $nodeindex = catfile
($dir, $DEFAULT_NODE_INDEX);
358 my $name2idindex = catfile
($dir, $DEFAULT_NAME2ID_INDEX);
359 my $id2nameindex = catfile
($dir, $DEFAULT_ID2NAME_INDEX);
360 my $parent2childindex = catfile
($dir, $DEFAULT_PARENT_INDEX);
361 $self->{'_nodes'} = [];
362 $self->{'_id2name'} = [];
363 $self->{'_name2id'} = {};
364 $self->{'_parent2children'} = {};
366 if (! -e
$nodeindex || $force) {
367 my (%parent2children,@nodes);
368 open my $NODES, '<', $nodesfile
369 or $self->throw("Could not read node file '$nodesfile': $!");
372 unlink $parent2childindex;
373 my $nh = tie
( @nodes, 'DB_File', $nodeindex, O_RDWR
|O_CREAT
, 0644, $DB_RECNO) ||
374 $self->throw("Cannot open file '$nodeindex': $!");
375 my $btree = tie
( %parent2children, 'DB_File', $parent2childindex, O_RDWR
|O_CREAT
, 0644, $DB_BTREE) ||
376 $self->throw("Cannot tie to file '$parent2childindex': $!");
381 my ($taxid,$parent,$rank,$code,$divid,undef,$gen_code,undef,$mito) = split(/\t\|\t/,$_);
382 # don't include the fake root node 'root' with id 1; we essentially have multiple roots here
388 # keep this stringified
389 $nodes[$taxid] = join(SEPARATOR
, ($taxid,$parent,$rank,$code,$divid,$gen_code,$mito));
390 $btree->put($parent,$taxid);
394 $nh = $btree = undef;
396 untie %parent2children;
399 if ((! -e
$name2idindex || -z
$name2idindex) || (! -e
$id2nameindex || -z
$id2nameindex) || $force) {
400 open my $NAMES, '<', $namesfile
401 or $self->throw("Could not read names file '$namesfile': $!");
403 unlink $name2idindex;
404 unlink $id2nameindex;
405 my (@id2name,%name2id);
406 my $idh = tie
(@id2name, 'DB_File', $id2nameindex, O_RDWR
|O_CREAT
, 0644, $DB_RECNO) ||
407 $self->throw("Cannot tie to file '$id2nameindex': $!");
408 my $nameh = tie
( %name2id, 'DB_File', $name2idindex, O_RDWR
|O_CREAT
, 0644, $DB_HASH) ||
409 $self->throw("Cannot tie to file '$name2idindex': $!");
414 my ($taxid, $name, $unique_name, $class) = split(/\t\|\t/,$_);
415 # don't include the fake root node 'root' or 'all' with id 1
418 $class =~ s/\s+\|\s*$//;
419 my $lc_name = lc($name);
420 my $orig_name = $name;
422 # unique names aren't always in the correct column, sometimes they
423 # are uniqued by adding bracketed rank names to the normal name;
424 # store the uniqued version then fix the name for normal use
425 if ($lc_name =~ /\(class\)$/) { # it seems that only rank of class is ever used in this situation
426 $name2id{$lc_name} = $taxid;
427 $name =~ s/\s+\(class\)$//;
428 $lc_name = lc($name);
431 # handle normal names which aren't necessarily unique
432 my $taxids = $name2id{$lc_name} || '';
433 my %taxids = map { $_ => 1 } split(SEPARATOR
, $taxids);
434 unless (exists $taxids{$taxid}) {
436 $name2id{$lc_name} = join(SEPARATOR
, keys %taxids);
439 # store unique names in name2id
441 $name2id{lc($unique_name)} = $taxid;
444 # store all names in id2name array
445 my $names = $id2name[$taxid] || '';
446 my @names = split(SEPARATOR
, $names);
447 if ($class && $class eq 'scientific name') {
448 # the scientific name should be the first name stored
449 unshift(@names, $name);
450 push(@names, $orig_name) if ($orig_name ne $name);
451 push(@names, $unique_name) if $unique_name;
454 # all other ('common' in this simplification) names get added after
456 push(@names, $orig_name) if ($orig_name ne $name);
457 push(@names, $unique_name) if $unique_name;
459 $id2name[$taxid] = join(SEPARATOR
, @names);
463 $idh = $nameh = undef;
470 # connect the internal db handle
473 return if $self->{'_initialized'};
475 my $dir = $self->index_directory;
476 my $nodeindex = catfile
($dir, $DEFAULT_NODE_INDEX);
477 my $name2idindex = catfile
($dir, $DEFAULT_NAME2ID_INDEX);
478 my $id2nameindex = catfile
($dir, $DEFAULT_ID2NAME_INDEX);
479 my $parent2childindex = catfile
($dir, $DEFAULT_PARENT_INDEX);
480 $self->{'_nodes'} = [];
481 $self->{'_id2name'} = [];
482 $self->{'_name2id'} = {};
483 $self->{'_parent2children'} = {};
485 if( ! -e
$nodeindex ||
486 ! -e
$name2idindex ||
487 ! -e
$id2nameindex ) {
488 $self->warn("Index files have not been created");
491 tie
( @
{$self->{'_nodes'}}, 'DB_File', $nodeindex, O_RDONLY
,undef, $DB_RECNO)
492 || $self->throw("$! $nodeindex");
493 tie
(@
{$self->{'_id2name'}}, 'DB_File', $id2nameindex,O_RDONLY
, undef,
494 $DB_RECNO) || $self->throw("$! $id2nameindex");
496 tie
( %{$self->{'_name2id'}}, 'DB_File', $name2idindex, O_RDONLY
,undef,
497 $DB_HASH) || $self->throw("$! $name2idindex");
498 $self->{'_parentbtree'} = tie
( %{$self->{'_parent2children'}},
499 'DB_File', $parent2childindex,
500 O_RDONLY
, 0644, $DB_BTREE);
502 $self->{'_initialized'} = 1;
506 =head2 index_directory
508 Title : index_directory
509 Function : Get/set the location that index files are stored. (this module
510 will index the supplied database)
511 Usage : $obj->index_directory($newval)
512 Returns : value of index_directory (a scalar)
513 Args : on set, new value (a scalar or undef, optional)
518 sub index_directory
{
520 return $self->{'index_directory'} = shift if @_;
521 return $self->{'index_directory'};
527 # Destroy all filehandle references
528 # to be able to remove temporary files
529 undef $self->{_id2name
};
530 undef $self->{_name2id
};
531 undef $self->{_nodes
};
532 undef $self->{_parent2children
};
533 undef $self->{_parentbtree
};
535 # Treat index files as temporary and delete them now if
536 # 'index_directory' match $DEFAULT_INDEX_DIR (which means
537 # that no "-directory" was specified or is an explicit
539 my $default_temp = quotemeta $DEFAULT_INDEX_DIR;
540 if ($self->{index_directory
} =~ m/^$default_temp/) {
541 unlink catfile
($self->{index_directory
},'id2names');
542 unlink catfile
($self->{index_directory
},'names2id');
543 unlink catfile
($self->{index_directory
},'nodes');
544 unlink catfile
($self->{index_directory
},'parents');