Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / DB / Taxonomy / flatfile.pm
blob677b039f5ab643ee9363d5eb8d52a77e3131504f
2 # BioPerl module for Bio::DB::Taxonomy::flatfile
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::DB::Taxonomy::flatfile - Use the NCBI taxonomy from local indexed flat files
18 =head1 SYNOPSIS
20 use Bio::DB::Taxonomy;
22 my $db = Bio::DB::Taxonomy->new(-source => 'flatfile' ,
23 -nodesfile => 'nodes.dmp',
24 -namesfile => 'names.dmp');
26 =head1 DESCRIPTION
28 This is an implementation of Bio::DB::Taxonomy which stores and accesses the
29 NCBI taxonomy using flat files stored locally on disk and indexed using the
30 DB_File module RECNO data structure for fast retrieval.
32 The required database files, nodes.dmp and names.dmp can be obtained from
33 ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
35 =head1 FEEDBACK
37 =head2 Mailing Lists
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to
41 the Bioperl mailing list. Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
46 =head2 Support
48 Please direct usage questions or support issues to the mailing list:
50 I<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 reponsive experts will be able look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
57 =head2 Reporting Bugs
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 of the bugs and their resolution. Bug reports can be submitted via
61 the web:
63 https://github.com/bioperl/bioperl-live/issues
65 =head1 AUTHOR - Jason Stajich
67 Email jason-at-bioperl-dot-org
69 =head1 CONTRIBUTORS
71 Sendu Bala: bix@sendu.me.uk
73 =head1 APPENDIX
75 The rest of the documentation details each of the object methods.
76 Internal methods are usually preceded with a _
78 =cut
80 # Let the code begin...
82 package Bio::DB::Taxonomy::flatfile;
84 use vars qw($DEFAULT_INDEX_DIR $DEFAULT_NODE_INDEX $DEFAULT_NAME2ID_INDEX
85 $DEFAULT_ID2NAME_INDEX $DEFAULT_PARENT_INDEX @DIVISIONS);
87 use strict;
88 use DB_File;
89 use Bio::Taxon;
90 use File::Spec::Functions;
92 use constant SEPARATOR => ':';
94 $DEFAULT_INDEX_DIR = $Bio::Root::IO::TEMPDIR; # /tmp
95 $DEFAULT_NODE_INDEX = 'nodes';
96 $DEFAULT_NAME2ID_INDEX = 'names2id';
97 $DEFAULT_ID2NAME_INDEX = 'id2names';
98 $DEFAULT_PARENT_INDEX = 'parents';
100 $DB_BTREE->{'flags'} = R_DUP; # allow duplicate values in DB_File BTREEs
102 # 8192 bytes; this seems to work to keep OS X from complaining
103 $DB_HASH->{'bsize'} = 0x2000;
105 @DIVISIONS = ([qw(BCT Bacteria)],
106 [qw(INV Invertebrates)],
107 [qw(MAM Mammals)],
108 [qw(PHG Phages)],
109 [qw(PLN Plants)], # (and fungi)
110 [qw(PRI Primates)],
111 [qw(ROD Rodents)],
112 [qw(SYN Synthetic)],
113 [qw(UNA Unassigned)],
114 [qw(VRL Viruses)],
115 [qw(VRT Vertebrates)],
116 [qw(ENV 'Environmental samples')]);
118 use base qw(Bio::DB::Taxonomy);
120 =head2 new
122 Title : new
123 Usage : my $obj = Bio::DB::Taxonomy::flatfile->new();
124 Function: Builds a new Bio::DB::Taxonomy::flatfile object
125 Returns : an instance of Bio::DB::Taxonomy::flatfile
126 Args : -directory => name of directory where index files should be created
127 -nodesfile => name of file containing nodes (nodes.dmp from NCBI)
128 -namesfile => name of the file containing names(names.dmp from NCBI)
129 -force => 1 to replace current indexes even if they exist
131 =cut
133 sub new {
134 my($class, @args) = @_;
136 my $self = $class->SUPER::new(@args);
137 my ($dir,$nodesfile,$namesfile,$force) =
138 $self->_rearrange([qw(DIRECTORY NODESFILE NAMESFILE FORCE)], @args);
140 $self->index_directory($dir || $DEFAULT_INDEX_DIR);
141 if ( $nodesfile ) {
142 $self->_build_index($nodesfile,$namesfile,$force);
145 $self->_db_connect;
146 return $self;
150 =head2 Bio::DB::Taxonomy interface implementation
152 =head2 get_num_taxa
154 Title : get_num_taxa
155 Usage : my $num = $db->get_num_taxa();
156 Function: Get the number of taxa stored in the database.
157 Returns : A number
158 Args : None
160 =cut
162 sub get_num_taxa {
163 my ($self) = @_;
164 if (not exists $self->{_num_taxa}) {
165 my $num = 0;
166 while ( my ($parent, undef) = each %{$self->{_parent2children}} ) {
167 $num++;
169 $self->{_num_taxa} = $num;
171 return $self->{_num_taxa};
175 =head2 get_taxon
177 Title : get_taxon
178 Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
179 Function: Get a Bio::Taxon object from the database.
180 Returns : Bio::Taxon object
181 Args : just a single value which is the database id, OR named args:
182 -taxonid => taxonomy id (to query by taxonid)
184 -name => string (to query by a taxonomy name: common name,
185 scientific name, etc)
187 =cut
189 sub get_taxon {
190 my ($self) = shift;
191 my ($taxonid, $name);
193 if (@_ > 1) {
194 ($taxonid, $name) = $self->_rearrange([qw(TAXONID NAME)],@_);
195 if ($name) {
196 ($taxonid, my @others) = $self->get_taxonids($name);
197 $self->warn("There were multiple ids ($taxonid @others) matching '$name', using '$taxonid'") if @others > 0;
200 else {
201 $taxonid = shift;
204 return unless $taxonid;
206 $taxonid =~ /^\d+$/ || return;
207 my $node = $self->{'_nodes'}->[$taxonid] || return;
208 length($node) || return;
209 my ($taxid, undef, $rank, $code, $divid, $gen_code, $mito) = split(SEPARATOR,$node);
210 last unless defined $taxid;
211 my ($taxon_names) = $self->{'_id2name'}->[$taxid];
212 my ($sci_name, @common_names) = split(SEPARATOR, $taxon_names);
214 my $taxon = Bio::Taxon->new(
215 -name => $sci_name,
216 -common_names => [@common_names],
217 -ncbi_taxid => $taxid, # since this is a real ncbi taxid, explicitly set it as one
218 -rank => $rank,
219 -division => $DIVISIONS[$divid]->[1],
220 -genetic_code => $gen_code,
221 -mito_genetic_code => $mito );
222 # we can't use -dbh or the db_handle() method ourselves or we'll go
223 # infinite on the merge attempt
224 $taxon->{'db_handle'} = $self;
226 $self->_handle_internal_id($taxon);
228 return $taxon;
231 *get_Taxonomy_Node = \&get_taxon;
234 =head2 get_taxonids
236 Title : get_taxonids
237 Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
238 Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
239 string. Note that multiple taxonids can match to the same supplied
240 name.
241 Returns : array of integer ids in list context, one of these in scalar context
242 Args : string representing taxon's name
244 =cut
246 sub get_taxonids {
247 my ($self, $query) = @_;
248 my $ids = $self->{'_name2id'}->{lc($query)};
249 unless ($ids) {
250 if ($query =~ /_/) {
251 # try again converting underscores to spaces
252 $query =~ s/_/ /g;
253 $ids = $self->{'_name2id'}->{lc($query)};
255 $ids || return;
257 my @ids = split(SEPARATOR, $ids);
258 return wantarray() ? @ids : shift @ids;
261 *get_taxonid = \&get_taxonids;
264 =head2 get_Children_Taxids
266 Title : get_Children_Taxids
267 Usage : my @childrenids = $db->get_Children_Taxids
268 Function: Get the ids of the children of a node in the taxonomy
269 Returns : Array of Ids
270 Args : Bio::Taxon or a taxon_id
271 Status : deprecated (use each_Descendent())
273 =cut
275 sub get_Children_Taxids {
276 my ($self, $node) = @_;
277 $self->warn("get_Children_Taxids is deprecated, use each_Descendent instead");
278 my $id;
279 if( ref($node) ) {
280 if( $node->can('object_id') ) {
281 $id = $node->object_id;
282 } elsif( $node->can('ncbi_taxid') ) {
283 $id = $node->ncbi_taxid;
284 } else {
285 $self->warn("Don't know how to extract a taxon id from the object of type ".ref($node)."\n");
286 return;
288 } else { $id = $node }
289 my @vals = $self->{'_parentbtree'}->get_dup($id);
290 return @vals;
294 =head2 ancestor
296 Title : ancestor
297 Usage : my $ancestor_taxon = $db->ancestor($taxon)
298 Function: Retrieve the full ancestor taxon of a supplied Taxon from the
299 database.
300 Returns : Bio::Taxon
301 Args : Bio::Taxon (that was retrieved from this database)
303 =cut
305 sub ancestor {
306 my ($self, $taxon) = @_;
307 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
308 $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
309 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
311 my $node = $self->{'_nodes'}->[$id];
312 if (length($node)) {
313 my (undef, $parent_id) = split(SEPARATOR,$node);
314 $parent_id || return;
315 $parent_id eq $id && return; # one of the roots
316 return $self->get_taxon($parent_id);
318 return;
322 =head2 each_Descendent
324 Title : each_Descendent
325 Usage : my @taxa = $db->each_Descendent($taxon);
326 Function: Get all the descendents of the supplied Taxon (but not their
327 descendents, ie. not a recursive fetchall).
328 Returns : Array of Bio::Taxon objects
329 Args : Bio::Taxon (that was retrieved from this database)
331 =cut
333 sub each_Descendent {
334 my ($self, $taxon) = @_;
335 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
336 $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
337 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
339 my @desc_ids = $self->{'_parentbtree'}->get_dup($id);
340 my @descs;
341 foreach my $desc_id (@desc_ids) {
342 push(@descs, $self->get_taxon($desc_id) || next);
344 return @descs;
348 =head2 Helper methods
350 =cut
352 # internal method which does the indexing
353 sub _build_index {
354 my ($self, $nodesfile, $namesfile, $force) = @_;
356 my $dir = $self->index_directory;
357 my $nodeindex = catfile($dir, $DEFAULT_NODE_INDEX);
358 my $name2idindex = catfile($dir, $DEFAULT_NAME2ID_INDEX);
359 my $id2nameindex = catfile($dir, $DEFAULT_ID2NAME_INDEX);
360 my $parent2childindex = catfile($dir, $DEFAULT_PARENT_INDEX);
361 $self->{'_nodes'} = [];
362 $self->{'_id2name'} = [];
363 $self->{'_name2id'} = {};
364 $self->{'_parent2children'} = {};
366 if (! -e $nodeindex || $force) {
367 my (%parent2children,@nodes);
368 open my $NODES, '<', $nodesfile
369 or $self->throw("Could not read node file '$nodesfile': $!");
371 unlink $nodeindex;
372 unlink $parent2childindex;
373 my $nh = tie ( @nodes, 'DB_File', $nodeindex, O_RDWR|O_CREAT, 0644, $DB_RECNO) ||
374 $self->throw("Cannot open file '$nodeindex': $!");
375 my $btree = tie( %parent2children, 'DB_File', $parent2childindex, O_RDWR|O_CREAT, 0644, $DB_BTREE) ||
376 $self->throw("Cannot tie to file '$parent2childindex': $!");
378 while (<$NODES>) {
379 next if /^$/;
380 chomp;
381 my ($taxid,$parent,$rank,$code,$divid,undef,$gen_code,undef,$mito) = split(/\t\|\t/,$_);
382 # don't include the fake root node 'root' with id 1; we essentially have multiple roots here
383 next if $taxid == 1;
384 if ($parent == 1) {
385 $parent = $taxid;
388 # keep this stringified
389 $nodes[$taxid] = join(SEPARATOR, ($taxid,$parent,$rank,$code,$divid,$gen_code,$mito));
390 $btree->put($parent,$taxid);
392 close $NODES;
394 $nh = $btree = undef;
395 untie @nodes ;
396 untie %parent2children;
399 if ((! -e $name2idindex || -z $name2idindex) || (! -e $id2nameindex || -z $id2nameindex) || $force) {
400 open my $NAMES, '<', $namesfile
401 or $self->throw("Could not read names file '$namesfile': $!");
403 unlink $name2idindex;
404 unlink $id2nameindex;
405 my (@id2name,%name2id);
406 my $idh = tie (@id2name, 'DB_File', $id2nameindex, O_RDWR|O_CREAT, 0644, $DB_RECNO) ||
407 $self->throw("Cannot tie to file '$id2nameindex': $!");
408 my $nameh = tie ( %name2id, 'DB_File', $name2idindex, O_RDWR|O_CREAT, 0644, $DB_HASH) ||
409 $self->throw("Cannot tie to file '$name2idindex': $!");
411 while (<$NAMES>) {
412 next if /^$/;
413 chomp;
414 my ($taxid, $name, $unique_name, $class) = split(/\t\|\t/,$_);
415 # don't include the fake root node 'root' or 'all' with id 1
416 next if $taxid == 1;
418 $class =~ s/\s+\|\s*$//;
419 my $lc_name = lc($name);
420 my $orig_name = $name;
422 # unique names aren't always in the correct column, sometimes they
423 # are uniqued by adding bracketed rank names to the normal name;
424 # store the uniqued version then fix the name for normal use
425 if ($lc_name =~ /\(class\)$/) { # it seems that only rank of class is ever used in this situation
426 $name2id{$lc_name} = $taxid;
427 $name =~ s/\s+\(class\)$//;
428 $lc_name = lc($name);
431 # handle normal names which aren't necessarily unique
432 my $taxids = $name2id{$lc_name} || '';
433 my %taxids = map { $_ => 1 } split(SEPARATOR, $taxids);
434 unless (exists $taxids{$taxid}) {
435 $taxids{$taxid} = 1;
436 $name2id{$lc_name} = join(SEPARATOR, keys %taxids);
439 # store unique names in name2id
440 if ($unique_name) {
441 $name2id{lc($unique_name)} = $taxid;
444 # store all names in id2name array
445 my $names = $id2name[$taxid] || '';
446 my @names = split(SEPARATOR, $names);
447 if ($class && $class eq 'scientific name') {
448 # the scientific name should be the first name stored
449 unshift(@names, $name);
450 push(@names, $orig_name) if ($orig_name ne $name);
451 push(@names, $unique_name) if $unique_name;
453 else {
454 # all other ('common' in this simplification) names get added after
455 push(@names, $name);
456 push(@names, $orig_name) if ($orig_name ne $name);
457 push(@names, $unique_name) if $unique_name;
459 $id2name[$taxid] = join(SEPARATOR, @names);
461 close $NAMES;
463 $idh = $nameh = undef;
464 untie( %name2id);
465 untie( @id2name);
470 # connect the internal db handle
471 sub _db_connect {
472 my $self = shift;
473 return if $self->{'_initialized'};
475 my $dir = $self->index_directory;
476 my $nodeindex = catfile($dir, $DEFAULT_NODE_INDEX);
477 my $name2idindex = catfile($dir, $DEFAULT_NAME2ID_INDEX);
478 my $id2nameindex = catfile($dir, $DEFAULT_ID2NAME_INDEX);
479 my $parent2childindex = catfile($dir, $DEFAULT_PARENT_INDEX);
480 $self->{'_nodes'} = [];
481 $self->{'_id2name'} = [];
482 $self->{'_name2id'} = {};
483 $self->{'_parent2children'} = {};
485 if( ! -e $nodeindex ||
486 ! -e $name2idindex ||
487 ! -e $id2nameindex ) {
488 $self->warn("Index files have not been created");
489 return 0;
491 tie ( @{$self->{'_nodes'}}, 'DB_File', $nodeindex, O_RDONLY,undef, $DB_RECNO)
492 || $self->throw("$! $nodeindex");
493 tie (@{$self->{'_id2name'}}, 'DB_File', $id2nameindex,O_RDONLY, undef,
494 $DB_RECNO) || $self->throw("$! $id2nameindex");
496 tie ( %{$self->{'_name2id'}}, 'DB_File', $name2idindex, O_RDONLY,undef,
497 $DB_HASH) || $self->throw("$! $name2idindex");
498 $self->{'_parentbtree'} = tie( %{$self->{'_parent2children'}},
499 'DB_File', $parent2childindex,
500 O_RDONLY, 0644, $DB_BTREE);
502 $self->{'_initialized'} = 1;
506 =head2 index_directory
508 Title : index_directory
509 Function : Get/set the location that index files are stored. (this module
510 will index the supplied database)
511 Usage : $obj->index_directory($newval)
512 Returns : value of index_directory (a scalar)
513 Args : on set, new value (a scalar or undef, optional)
515 =cut
518 sub index_directory {
519 my $self = shift;
520 return $self->{'index_directory'} = shift if @_;
521 return $self->{'index_directory'};
525 sub DESTROY {
526 my $self = shift;
527 # Destroy all filehandle references
528 # to be able to remove temporary files
529 undef $self->{_id2name};
530 undef $self->{_name2id};
531 undef $self->{_nodes};
532 undef $self->{_parent2children};
533 undef $self->{_parentbtree};
535 # Treat index files as temporary and delete them now if
536 # 'index_directory' match $DEFAULT_INDEX_DIR (which means
537 # that no "-directory" was specified or is an explicit
538 # temporary file)
539 my $default_temp = quotemeta $DEFAULT_INDEX_DIR;
540 if ($self->{index_directory} =~ m/^$default_temp/) {
541 unlink catfile($self->{index_directory},'id2names');
542 unlink catfile($self->{index_directory},'names2id');
543 unlink catfile($self->{index_directory},'nodes');
544 unlink catfile($self->{index_directory},'parents');