Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / DB / Taxonomy.pm
blobfad3cc5728b75ba6bef23958f2d110d956ff9c03
2 # BioPerl module for Bio::DB::Taxonomy
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::DB::Taxonomy - Access to a taxonomy database
18 =head1 SYNOPSIS
20 use Bio::DB::Taxonomy;
21 my $db = Bio::DB::Taxonomy->new(-source => 'entrez');
22 # use NCBI Entrez over HTTP
23 my $taxonid = $db->get_taxonid('Homo sapiens');
25 # get a taxon
26 my $taxon = $db->get_taxon(-taxonid => $taxonid);
28 =head1 DESCRIPTION
30 This is a front end module for access to a taxonomy database.
32 =head1 FEEDBACK
34 =head2 Mailing Lists
36 User feedback is an integral part of the evolution of this and other
37 Bioperl modules. Send your comments and suggestions preferably to
38 the Bioperl mailing list. Your participation is much appreciated.
40 bioperl-l@bioperl.org - General discussion
41 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43 =head2 Support
45 Please direct usage questions or support issues to the mailing list:
47 I<bioperl-l@bioperl.org>
49 rather than to the module maintainer directly. Many experienced and
50 reponsive experts will be able look at the problem and quickly
51 address it. Please include a thorough description of the problem
52 with code and data examples if at all possible.
54 =head2 Reporting Bugs
56 Report bugs to the Bioperl bug tracking system to help us keep track
57 of the bugs and their resolution. Bug reports can be submitted via
58 the web:
60 https://github.com/bioperl/bioperl-live/issues
62 =head1 AUTHOR - Jason Stajich
64 Email jason-at-bioperl.org
66 =head1 CONTRIBUTORS
68 Sendu Bala: bix@sendu.me.uk
70 =head1 APPENDIX
72 The rest of the documentation details each of the object methods.
73 Internal methods are usually preceded with a _
75 =cut
77 # Let the code begin...
79 package Bio::DB::Taxonomy;
80 use vars qw($DefaultSource $TAXON_IIDS);
81 use strict;
82 use Bio::Tree::Tree;
84 use base qw(Bio::Root::Root);
86 $DefaultSource = 'entrez';
87 $TAXON_IIDS = {};
90 =head2 new
92 Title : new
93 Usage : my $obj = Bio::DB::Taxonomy->new(-source => 'entrez');
94 Function: Builds a new Bio::DB::Taxonomy object.
95 Returns : an instance of Bio::DB::Taxonomy
96 Args : -source => which database source 'entrez' (NCBI taxonomy online),
97 'flatfile' (local NCBI taxonomy), 'greengenes' (local
98 GreenGenes taxonomy), 'silva' (local Silva taxonomy), or
99 'list' (Do-It-Yourself taxonomy)
101 =cut
103 sub new {
104 my($class,@args) = @_;
106 if( $class =~ /Bio::DB::Taxonomy::(\S+)/ ) {
107 my ($self) = $class->SUPER::new(@args);
108 $self->_initialize(@args);
109 return $self;
110 } else {
111 my %param = @args;
112 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
113 my $source = $param{'-source'} || $DefaultSource;
115 $source = "\L$source"; # normalize capitalization to lower case
117 # normalize capitalization
118 return unless( $class->_load_tax_module($source) );
119 return "Bio::DB::Taxonomy::$source"->new(@args);
124 # empty for now
125 sub _initialize { }
128 =head2 get_num_taxa
130 Title : get_num_taxa
131 Usage : my $num = $db->get_num_taxa();
132 Function: Get the number of taxa stored in the database.
133 Returns : A number
134 Args : None
136 =cut
138 sub get_num_taxa {
139 shift->throw_not_implemented();
143 =head2 get_taxon
145 Title : get_taxon
146 Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid);
147 Function: Get a Bio::Taxon object from the database.
148 Returns : Bio::Taxon object
149 Args : just a single value which is the database id, OR named args:
150 -taxonid => taxonomy id (to query by taxonid)
152 -name => string (to query by a taxonomy name: common name,
153 scientific name, etc)
155 =cut
157 sub get_taxon {
158 shift->throw_not_implemented();
161 *get_Taxonomy_Node = \&get_taxon;
164 =head2 get_taxonids
166 Title : get_taxonids
167 Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
168 Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
169 string. Note that multiple taxonids can match to the same supplied
170 name.
171 Returns : array of integer ids in list context, one of these in scalar context
172 Args : string representing the taxon's name
174 =cut
176 sub get_taxonids {
177 shift->throw_not_implemented();
180 *get_taxonid = \&get_taxonids;
181 *get_taxaid = \&get_taxonids;
184 =head2 get_tree
186 Title : get_tree
187 Usage : my $tree = $db->get_tree(@species_names);
188 Function: Generate a tree comprised of the full lineages of all the supplied
189 species names. The nodes for the requested species are given
190 name('supplied') values corresponding to the supplied name, such that
191 they can be identified if the real species name in the database
192 (stored under node_name()) is different. The nodes are also given an
193 arbitrary branch length of 1.
194 Returns : Bio::Tree::Tree
195 Args : A list of species names (strings) to include in the tree.
197 =cut
199 sub get_tree {
200 my ($self, @species_names) = @_;
202 # the full lineages of the species are merged into a single tree
203 my $tree;
204 for my $name (@species_names) {
205 my @ids = $self->get_taxonids($name);
206 if (not scalar @ids) {
207 $self->throw("Could not find species $name in the taxonomy");
209 for my $id (@ids) {
210 my $node = $self->get_taxon(-taxonid => $id);
211 $node->name('supplied', $name);
212 if ($tree) {
213 $tree->merge_lineage($node);
214 } else {
215 $tree = Bio::Tree::Tree->new(-verbose => $self->verbose, -node => $node);
220 # add arbitrary branch length
221 for my $node ($tree->get_nodes) {
222 $node->branch_length(1);
225 return $tree;
229 =head2 ancestor
231 Title : ancestor
232 Usage : my $ancestor_taxon = $db->ancestor($taxon);
233 Function: Retrieve the full ancestor taxon of a supplied Taxon from the
234 database.
235 Returns : Bio::Taxon
236 Args : Bio::Taxon (that was retrieved from this database)
238 =cut
240 sub ancestor {
241 shift->throw_not_implemented();
245 =head2 each_Descendent
247 Title : each_Descendent
248 Usage : my @taxa = $db->each_Descendent($taxon);
249 Function: Get all the descendents of the supplied Taxon (but not their
250 descendents, ie. not a recursive fetchall).
251 Returns : Array of Bio::Taxon objects
252 Args : Bio::Taxon (that was retrieved from this database)
254 =cut
256 sub each_Descendent {
257 shift->throw_not_implemented();
261 =head2 get_all_Descendents
263 Title : get_all_Descendents
264 Usage : my @taxa = $db->get_all_Descendents($taxon);
265 Function: Like each_Descendent(), but do a recursive fetchall
266 Returns : Array of Bio::Taxon objects
267 Args : Bio::Taxon (that was retrieved from this database)
269 =cut
271 sub get_all_Descendents {
272 my ($self, $taxon) = @_;
273 my @taxa;
274 foreach my $desc_taxon ($self->each_Descendent($taxon)) {
275 push @taxa, ($desc_taxon, $self->get_all_Descendents($desc_taxon));
277 return @taxa;
281 =head2 _load_tax_module
283 Title : _load_tax_module
284 Usage : *INTERNAL Bio::DB::Taxonomy stuff*
285 Function: Loads up (like use) a module at run time on demand
287 =cut
289 sub _load_tax_module {
290 my ($self, $source) = @_;
291 my $module = "Bio::DB::Taxonomy::" . $source;
292 my $ok;
294 eval { $ok = $self->_load_module($module) };
295 if ( $@ ) {
296 print STDERR $@;
297 print STDERR <<END;
298 $self: $source cannot be found
299 Exception $@
300 For more information about the Bio::DB::Taxonomy system please see
301 the Bio::DB::Taxonomy docs. This includes ways of checking for
302 formats at compile time, not run time.
306 return $ok;
310 =head2 _handle_internal_id
312 Title : _handle_internal_id
313 Usage : *INTERNAL Bio::DB::Taxonomy stuff*
314 Function: Add an internal ID to a taxon object, ensuring that the taxon gets
315 the same internal ID, regardless of which database it is retrieved
316 from.
317 Returns : The assigned internal ID
318 Args : * A Bio::Taxon
319 * An optional boolean to decide whether or not to try and do the job
320 using scientific name & rank in addition to taxon ID. This is
321 useful if your IDs are not comparable to that of other databases,
322 e.g. if they are arbitrary, as in the case of Bio::DB::Taxonomy::list.
323 CAVEAT: will handle ambiguous names within a database fine, but not
324 across multiple databases.
326 =cut
328 sub _handle_internal_id {
329 my ($self, $taxon, $try_name) = @_;
330 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
332 my $taxid = $taxon->id || return;
333 my $name = $taxon->scientific_name || '';
334 my $rank = $taxon->rank || 'no rank';
335 my $dbh = $try_name ? $taxon->db_handle : 'any';
337 my $iid = $TAXON_IIDS->{taxids}->{$dbh}->{$taxid};
338 if ( (not defined $iid) && $try_name && $name && exists $TAXON_IIDS->{names}->{$name}) {
339 # Search for a suitable IID based on species name and ranks
340 my %test_ranks = map {$_ => undef} ($rank, 'no rank');
341 SEARCH: while (my ($test_rank, undef) = each %test_ranks) {
342 # Search at the specified rank first, then with 'no rank'
343 while ( my ($test_iid, $test_info) = each %{$TAXON_IIDS->{names}->{$name}->{$rank}} ) {
344 while (my ($test_db, $test_taxid) = each %$test_info) {
345 if ( ($test_db eq $dbh) && not($test_taxid eq $taxid) ) {
346 # Taxa are different (same database, different taxid)
347 next;
349 # IID is acceptable since taxa are from different databases,
350 # or from the same database but have the same taxid
351 $iid = $test_iid;
352 $TAXON_IIDS->{taxids}->{$dbh}->{$taxid} = $iid;
353 last SEARCH;
359 if (defined $iid) {
360 # Assign Bio::DB::Taxonomy IID with risky Bio::Tree::Node internal method
361 $taxon->_creation_id($iid);
362 } else {
363 # Register new IID in Bio::DB::Taxonomy
364 $iid = $taxon->internal_id;
365 $TAXON_IIDS->{taxids}->{$dbh}->{$taxid} = $iid;
366 if ($name) {
367 $TAXON_IIDS->{names}->{$name}->{$rank}->{$iid}->{$taxon->db_handle} = $taxid
371 return $iid;