t/AlignIO/AlignIO.t: fix number of tests in plan (fixup c523e6bed866)
[bioperl-live.git] / Bio / DB / Taxonomy / entrez.pm
blobb4644d1b2a74b0cf361316b57015df551a1d3450
2 # BioPerl module for Bio::DB::Taxonomy::entrez
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::DB::Taxonomy::entrez - Taxonomy Entrez driver
18 =head1 SYNOPSIS
20 # Do not use this object directly, rather through the Bio::DB::Taxonomy
21 # interface
23 use Bio::DB::Taxonomy;
25 my $db = Bio::DB::Taxonomy->new(-source => 'entrez');
27 my $taxonid = $db->get_taxonid('Homo sapiens');
28 my $node = $db->get_Taxonomy_Node(-taxonid => $taxonid);
30 my $gi = 71836523;
31 my $node = $db->get_Taxonomy_Node(-gi => $gi, -db => 'protein');
32 print $node->binomial, "\n";
33 my ($species,$genus,$family) = $node->classification;
34 print "family is $family\n";
36 # Can also go up 4 levels
37 my $p = $node;
38 for ( 1..4 ) {
39 $p = $db->get_Taxonomy_Node(-taxonid => $p->parent_id);
41 print $p->rank, " ", ($p->classification)[0], "\n";
43 # could then classify a set of BLAST hits based on their GI numbers
44 # into taxonomic categories.
47 It is not currently possibly to query a node for its children so we
48 cannot completely replace the advantage of the flatfile
49 Bio::DB::Taxonomy::flatfile module.
52 =head1 DESCRIPTION
54 A driver for querying NCBI Entrez Taxonomy database.
56 =head1 FEEDBACK
58 =head2 Mailing Lists
60 User feedback is an integral part of the evolution of this and other
61 Bioperl modules. Send your comments and suggestions preferably to
62 the Bioperl mailing list. Your participation is much appreciated.
64 bioperl-l@bioperl.org - General discussion
65 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
67 =head2 Support
69 Please direct usage questions or support issues to the mailing list:
71 I<bioperl-l@bioperl.org>
73 rather than to the module maintainer directly. Many experienced and
74 reponsive experts will be able look at the problem and quickly
75 address it. Please include a thorough description of the problem
76 with code and data examples if at all possible.
78 =head2 Reporting Bugs
80 Report bugs to the Bioperl bug tracking system to help us keep track
81 of the bugs and their resolution. Bug reports can be submitted via
82 the web:
84 https://github.com/bioperl/bioperl-live/issues
86 =head1 AUTHOR - Jason Stajich
88 Email jason-at-bioperl.org
90 =head1 CONTRIBUTORS
92 Sendu Bala: bix@sendu.me.uk
94 =head1 APPENDIX
96 The rest of the documentation details each of the object methods.
97 Internal methods are usually preceded with a _
99 =cut
101 # Let the code begin...
103 package Bio::DB::Taxonomy::entrez;
104 use vars qw($EntrezLocation $UrlParamSeparatorValue %EntrezParams
105 $EntrezGet $EntrezSummary $EntrezFetch %SequenceParams
106 $XMLTWIG $DATA_CACHE $RELATIONS);
107 use strict;
108 use Bio::Taxon;
109 use Bio::DB::NCBIHelper;
111 eval {
112 require XML::Twig;
113 $XMLTWIG = 1;
115 if( $@ ) {
116 $XMLTWIG = 0;
119 use base qw(Bio::WebAgent Bio::DB::Taxonomy);
121 $EntrezLocation = $Bio::DB::NCBIHelper::HOSTBASE . '/entrez/eutils/';
122 $EntrezGet = 'esearch.fcgi';
123 $EntrezFetch = 'efetch.fcgi';
124 $EntrezSummary = 'esummary.fcgi';
126 $DATA_CACHE = {};
127 $RELATIONS = {};
129 %EntrezParams = ( 'db' => 'taxonomy',
130 'report' => 'xml',
131 'retmode'=> 'xml',
132 'tool' => 'Bioperl');
134 %SequenceParams = ( 'db' => 'nucleotide', # or protein
135 'retmode' => 'xml',
136 'tool' => 'Bioperl');
138 $UrlParamSeparatorValue = '&';
141 =head2 new
143 Title : new
144 Usage : my $obj = Bio::DB::Taxonomy::entrez->new();
145 Function: Builds a new Bio::DB::Taxonomy::entrez object
146 Returns : an instance of Bio::DB::Taxonomy::entrez
147 Args : -location => URL to Entrez (if you want to override the default)
148 -params => Hashref of URL params if you want to override the
149 default
151 =cut
153 sub new {
154 my ($class, @args) = @_;
156 # need to initialise Bio::WebAgent...
157 my ($self) = $class->SUPER::new(@args);
159 # ... as well as our normal Bio::DB::Taxonomy selves:
160 $self->_initialize(@args);
161 return $self;
165 sub _initialize {
166 my($self) = shift;
168 $self->SUPER::_initialize(@_);
170 my ($location,$params) = $self->_rearrange([qw(LOCATION PARAMS)],@_);
172 if( $params ) {
173 if( ref($params) !~ /HASH/i ) {
174 $self->warn("Must have provided a valid HASHref for -params");
175 $params = \%EntrezParams;
177 } else {
178 $params = \%EntrezParams;
180 $self->entrez_params($params);
181 $self->entrez_url($location || $EntrezLocation );
185 =head2 get_num_taxa
187 Title : get_num_taxa
188 Usage : my $num = $db->get_num_taxa();
189 Function: Get the number of taxa stored in the database.
190 Returns : A number
191 Args : None
193 =cut
195 sub get_num_taxa {
196 my ($self) = @_;
197 # Use this URL query to get the ID of all the taxa in the NCBI Taxonomy database:
198 # https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=taxonomy&term=all[Filter]
199 # Only the first 20 taxa IDs are returned (good because the list is long),
200 # and the total number is reported as well (which is what we are interested
201 # in).
202 my %p = $self->entrez_params;
203 $p{'term'} = 'all[Filter]';
204 my $twig = $self->_run_query($self->_build_url($EntrezGet, \%p));
205 my $count = $twig->root->first_child('Count')->first_child->text;
206 return $count;
210 =head2 get_taxon
212 Title : get_taxon
213 Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
214 Function: Get a Bio::Taxon object from the database.
215 Returns : Bio::Taxon object
216 Args : just a single value which is the database id, OR named args:
217 -taxonid => taxonomy id (to query by taxonid)
219 -name => string (to query by a taxonomy name: common name,
220 scientific name, etc)
222 To retrieve a taxonomy node for a GI number provide the -gi option
223 with the gi number and -db with either 'nucleotide' or 'protein' to
224 define the db.
225 AND optionally,
226 -full => 1 (to force retrieval of full information - sometimes
227 minimal information about your taxon may have been
228 cached, which is normally used to save database
229 accesses)
231 =cut
233 sub get_taxon {
234 my $self = shift;
235 if (! $XMLTWIG) {
236 eval { require XML::Twig };
237 $self->throw("Could not load XML::Twig for get_taxon(): $@") if $@;
240 my %p = $self->entrez_params;
242 # convert input request to one or more ids
243 my (@taxonids, $taxonid, $want_full);
244 if (@_ > 1) {
245 my %params = @_;
246 if ($params{'-taxonid'}) {
247 $taxonid = $params{'-taxonid'};
249 elsif ($params{'-gi'}) {
250 my $db = $params{'-db'};
251 # we're going to do all the work here and then redirect
252 # the call based on the TaxId
253 my %p = %SequenceParams;
254 my %items;
255 if( ref($params{'-gi'}) =~ /ARRAY/i ) {
256 $p{'id'} = join(',', @{$params{'-gi'}});
257 } else {
258 $p{'id'} = $params{'-gi'};
260 $p{'db'} = $db if defined $db;
261 my $url = $self->_build_url($EntrezSummary, \%p);
262 my @ids;
263 if (exists $DATA_CACHE->{gi_to_ids}->{$url}) {
264 @ids = @{$DATA_CACHE->{gi_to_ids}->{$url}};
266 else {
267 my $twig = $self->_run_query($url);
268 my $root = $twig->root;
269 for my $topnode ( $root->children('DocSum') ) {
270 for my $child ( $topnode->children('Item') ) {
271 if( uc($child->{att}->{'Name'}) eq 'TAXID' ) {
272 push @ids, $child->text;
277 $DATA_CACHE->{gi_to_ids}->{$url} = \@ids;
280 return $self->get_taxon(-taxonid => \@ids);
282 elsif ($params{'-name'}) {
283 @taxonids = $self->get_taxonid($params{'-name'});
285 else {
286 $self->warn("Need to have provided either a -taxonid or -name value to get_taxon");
289 if ($params{'-full'}) {
290 $want_full = 1;
293 else {
294 $taxonid = shift;
297 if (ref($taxonid) =~ /ARRAY/i ) {
298 @taxonids = @{$taxonid};
300 else {
301 push(@taxonids, $taxonid) if $taxonid;
304 # return answer(s) from the cache if possible
305 my @results;
306 my @uncached;
307 foreach my $taxonid (@taxonids) {
308 $taxonid || $self->throw("In taxonids list one was undef! '@taxonids'\n");
309 if (defined $DATA_CACHE->{full_info}->{$taxonid}) {
310 push(@results, $self->_make_taxon($DATA_CACHE->{full_info}->{$taxonid}));
312 elsif (! $want_full && defined $DATA_CACHE->{minimal_info}->{$taxonid}) {
313 push(@results, $self->_make_taxon($DATA_CACHE->{minimal_info}->{$taxonid}));
315 else {
316 push(@uncached, $taxonid);
320 if (@uncached > 0) {
321 $taxonid = join(',', @uncached);
323 $p{'id'} = $taxonid;
325 my $twig = $self->_run_query($self->_build_url($EntrezFetch, \%p));
326 my $root = $twig->root;
327 for my $taxon ( $root->children('Taxon') ) {
328 my $taxid = $taxon->first_child_text('TaxId');
329 $self->throw("Got a result with no TaxId!") unless $taxid;
331 my $data = {};
332 if (exists $DATA_CACHE->{minimal_info}->{$taxid}) {
333 $data = $DATA_CACHE->{minimal_info}->{$taxid};
336 $data->{id} = $taxid;
337 $data->{rank} = $taxon->first_child_text('Rank');
339 my $other_names = $taxon->first_child('OtherNames');
340 my @other_names = $other_names->children_text() if $other_names;
341 my $sci_name = $taxon->first_child_text('ScientificName');
342 my $orig_sci_name = $sci_name;
343 $sci_name =~ s/ \(class\)$//;
344 push(@other_names, $orig_sci_name) if $orig_sci_name ne $sci_name;
345 $data->{scientific_name} = $sci_name;
346 $data->{common_names} = \@other_names;
348 $data->{division} = $taxon->first_child_text('Division');
349 $data->{genetic_code} = $taxon->first_child('GeneticCode')->first_child_text('GCId');
350 $data->{mitochondrial_genetic_code} = $taxon->first_child('MitoGeneticCode')->first_child_text('MGCId');
351 $data->{create_date} = $taxon->first_child_text('CreateDate');
352 $data->{update_date} = $taxon->first_child_text('UpdateDate');
353 $data->{pub_date} = $taxon->first_child_text('PubDate');
355 # since we have some information about all the ancestors of our
356 # requested node, we may as well cache data for the ancestors to
357 # reduce the number of accesses to website in future
358 my $lineage_ex = $taxon->first_child('LineageEx');
359 if (defined $lineage_ex) {
360 my ($ancestor, $lineage_data, @taxa);
361 foreach my $lineage_taxon ($lineage_ex->children) {
362 my $lineage_taxid = $lineage_taxon->first_child_text('TaxId');
364 if (exists $DATA_CACHE->{minimal_info}->{$lineage_taxid} || exists $DATA_CACHE->{full_info}->{$lineage_taxid}) {
365 $lineage_data = $DATA_CACHE->{minimal_info}->{$lineage_taxid} || $DATA_CACHE->{full_info}->{$lineage_taxid};
366 next;
368 else {
369 $lineage_data = {};
372 $lineage_data->{id} = $lineage_taxid;
373 $lineage_data->{scientific_name} = $lineage_taxon->first_child_text('ScientificName');
374 $lineage_data->{rank} = $lineage_taxon->first_child_text('Rank');
376 $RELATIONS->{ancestors}->{$lineage_taxid} = $ancestor->{id} if $ancestor;
378 $DATA_CACHE->{minimal_info}->{$lineage_taxid} = $lineage_data;
379 } continue { $ancestor = $lineage_data; unshift(@taxa, $lineage_data); }
381 $RELATIONS->{ancestors}->{$taxid} = $ancestor->{id} if $ancestor;
383 # go through the lineage in reverse so we can remember the children
384 my $child = $data;
385 foreach my $lineage_data (@taxa) {
386 $RELATIONS->{children}->{$lineage_data->{id}}->{$child->{id}} = 1;
387 } continue { $child = $lineage_data; }
390 delete $DATA_CACHE->{minimal_info}->{$taxid};
391 $DATA_CACHE->{full_info}->{$taxid} = $data;
392 push(@results, $self->_make_taxon($data));
396 wantarray() ? @results : shift @results;
399 *get_Taxonomy_Node = \&get_taxon;
402 =head2 get_taxonids
404 Title : get_taxonids
405 Usage : my $taxonid = $db->get_taxonids('Homo sapiens');
406 Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
407 string. Note that multiple taxonids can match to the same supplied
408 name.
409 Returns : array of integer ids in list context, one of these in scalar context
410 Args : string representing taxon's name
412 =cut
414 sub get_taxonids {
415 my ($self,$query) = @_;
416 my %p = $self->entrez_params;
418 # queries don't work correctly with special characters, so get rid of them.
419 if ($query =~ /<.+>/) {
420 # queries with <something> will fail, so workaround by removing, doing
421 # the query, getting multiple taxonids, then picking the one id that
422 # has a parent node with a scientific_name() or common_names()
423 # case-insensitive matching to the word(s) within <>
424 $query =~ s/ <(.+?)>//;
425 my $desired_parent_name = lc($1);
427 ID: for my $start_id ($self->get_taxonids($query)) {
428 my $node = $self->get_taxon($start_id) || next ID;
430 # walk up the parents until we hit a node with a named rank
431 while (1) {
432 my $parent_node = $self->ancestor($node) || next ID;
433 my $parent_sci_name = $parent_node->scientific_name || next ID;
434 my @parent_common_names = $parent_node->common_names;
435 unless (@parent_common_names) {
436 # ensure we're not using a minimal-info cached version
437 $parent_node = $self->get_taxon(-taxonid => $parent_node->id, -full => 1);
438 @parent_common_names = $parent_node->common_names;
441 for my $name ($parent_sci_name, @parent_common_names) {
442 if (lc($name) eq $desired_parent_name) {
443 return wantarray() ? ($start_id) : $start_id;
447 my $parent_rank = $parent_node->rank || 'no rank';
448 if ($parent_rank ne 'no rank') {
449 last;
451 else {
452 $node = $parent_node;
456 return;
458 $query =~ s/[\"\(\)]//g; # not an exhaustive list; these are just the ones we know cause problems
459 $query =~ s/\s/+/g;
461 my @data;
462 if (defined $DATA_CACHE->{name_to_id}->{$query}) {
463 @data = @{$DATA_CACHE->{name_to_id}->{$query}};
465 else {
466 $p{'term'} = $query;
468 my $twig = $self->_run_query($self->_build_url($EntrezGet, \%p));
469 my $root = $twig->root;
471 # Check that all words in the query are found, because we do not want to
472 # match just 1 word if there are multiple words, e.g. if we query with
473 # "Homo sapiens" both "homo" and "sapiens" must be found
474 my $errorlist = $root->first_child('ErrorList');
475 if ( $errorlist ) {
476 my @notfound = map { $_->text } $errorlist->children('PhraseNotFound');
477 if ( @notfound ) {
478 for my $term ( @notfound ) {
479 return "No hit" if $query =~ /$term/;
484 my $list = $root->first_child('IdList');
485 @data = map { $_->text } $list->children('Id');
487 $DATA_CACHE->{name_to_id}->{$query} = [@data];
490 return wantarray() ? @data : shift @data;
493 *get_taxonid = \&get_taxonids;
496 =head2 ancestor
498 Title : ancestor
499 Usage : my $ancestor_taxon = $db->ancestor($taxon)
500 Function: Retrieve the ancestor taxon of a supplied Taxon from the database.
502 Note that unless the ancestor has previously been directly
503 requested with get_taxon(), the returned Taxon object will only have
504 a minimal amount of information.
506 Returns : Bio::Taxon
507 Args : Bio::Taxon (that was retrieved from this database)
509 =cut
511 sub ancestor {
512 my ($self, $taxon) = @_;
513 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
514 $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
515 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
517 my $ancestor_id = $RELATIONS->{ancestors}->{$id} || return;
518 return $self->_make_taxon($DATA_CACHE->{full_info}->{$ancestor_id} || $DATA_CACHE->{minimal_info}->{$ancestor_id});
522 =head2 each_Descendent
524 Title : each_Descendent
525 Usage : my @taxa = $db->each_Descendent($taxon);
526 Function: Get all the descendents of the supplied Taxon (but not their
527 descendents, ie. not a recursive fetchall).
529 Note that this implementation is unable to return a taxon that
530 hasn't previously been directly fetched with get_taxon(), or wasn't
531 an ancestor of such a fetch.
533 Returns : Array of Bio::Taxon objects
534 Args : Bio::Taxon (that was retrieved from this database)
536 =cut
538 sub each_Descendent {
539 my ($self, $taxon) = @_;
540 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
541 $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
542 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
544 my @children_ids = keys %{$RELATIONS->{children}->{$id} || {}};
545 my @children;
546 foreach my $child_id (@children_ids) {
547 push(@children, $self->_make_taxon($DATA_CACHE->{full_info}->{$child_id} || $DATA_CACHE->{minimal_info}->{$child_id}));
550 return @children;
554 =head2 Some Get/Setter methods
556 =head2 entrez_url
558 Title : entrez_url
559 Usage : $obj->entrez_url($newval)
560 Function: Get/set entrez URL
561 Returns : value of entrez url (a scalar)
562 Args : on set, new value (a scalar or undef, optional)
564 =cut
566 sub entrez_url{
567 my $self = shift;
569 return $self->{'_entrez_url'} = shift if @_;
570 return $self->{'_entrez_url'};
574 =head2 entrez_params
576 Title : entrez_params
577 Usage : $obj->entrez_params($newval)
578 Function: Get/set entrez params
579 Returns : value of entrez_params (a hashref)
580 Args : on set, new value Hashref
582 =cut
584 sub entrez_params{
585 my $self = shift;
586 my $f;
587 if( @_ ) {
588 $f = $self->{'_entrez_params'} = shift;
589 } else {
590 $f = $self->{'_entrez_params'};
592 return %$f;
596 =head2 Bio::DB::WebBase methods
598 =head2 proxy_string
600 Title : proxy_string
601 Usage : my $proxy_string = $self->proxy_string($protocol)
602 Function: Get the proxy string (plus user/pass )
603 Returns : string
604 Args : protocol ('http' or 'ftp'), default 'http'
606 =head2 proxy
608 Title : proxy
609 Usage : $httpproxy = $db->proxy('http') or
610 $db->proxy(['http','ftp'], 'http://myproxy' )
611 Function: Get/Set a proxy for use of proxy
612 Returns : a string indicating the proxy
613 Args : $protocol : an array ref of the protocol(s) to set/get
614 $proxyurl : url of the proxy to use for the specified protocol
615 $username : username (if proxy requires authentication)
616 $password : password (if proxy requires authentication)
618 =head2 authentication
620 Title : authentication
621 Usage : $db->authentication($user,$pass)
622 Function: Get/Set authentication credentials
623 Returns : Array of user/pass
624 Args : Array or user/pass
626 =cut
629 # make a Taxon object from data hash ref
630 sub _make_taxon {
631 my ($self, $data) = @_;
633 my $taxon = Bio::Taxon->new();
635 my $taxid;
636 while (my ($method, $value) = each %{$data}) {
637 if ($method eq 'id') {
638 $method = 'ncbi_taxid'; # since this is a real ncbi taxid, explicitly set it as one
639 $taxid = $value;
641 $taxon->$method(ref($value) eq 'ARRAY' ? @{$value} : $value);
644 # we can't use -dbh or the db_handle() method ourselves or we'll go
645 # infinite on the merge attempt
646 $taxon->{'db_handle'} = $self;
648 $self->_handle_internal_id($taxon);
650 return $taxon;
654 sub _build_url {
655 # Given a eutility (esearch.fcgi, efetch.fcgi or esummary.fcgi) and a
656 # hashref or parameters, build a url suitable for eutil query
657 my ($self, $eutility, $p) = @_;
658 my $params = join($UrlParamSeparatorValue, map { $_.'='.$p->{$_} } keys %$p);
659 my $url = $self->entrez_url.$eutility.'?'.$params;
660 $self->debug("url is $url\n");
661 return $url;
665 sub _run_query {
666 # Given an eutil url, run the eutil query and parse the response into an
667 # XML Twig object
668 my ($self, $url) = @_;
669 my $response = $self->get($url);
670 if ($response->is_success) {
671 $response = $response->content;
672 }else {
673 $self->throw("Can't query website: ".$response->status_line);
675 $self->debug("response is $response\n");
676 my $twig = XML::Twig->new;
677 $twig->parse($response);
678 return $twig;