2 # BioPerl module for Bio::DB::Taxonomy::entrez
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::DB::Taxonomy::entrez - Taxonomy Entrez driver
20 # Do not use this object directly, rather through the Bio::DB::Taxonomy
23 use Bio::DB::Taxonomy;
25 my $db = Bio::DB::Taxonomy->new(-source => 'entrez');
27 my $taxonid = $db->get_taxonid('Homo sapiens');
28 my $node = $db->get_Taxonomy_Node(-taxonid => $taxonid);
31 my $node = $db->get_Taxonomy_Node(-gi => $gi, -db => 'protein');
32 print $node->binomial, "\n";
33 my ($species,$genus,$family) = $node->classification;
34 print "family is $family\n";
36 # Can also go up 4 levels
39 $p = $db->get_Taxonomy_Node(-taxonid => $p->parent_id);
41 print $p->rank, " ", ($p->classification)[0], "\n";
43 # could then classify a set of BLAST hits based on their GI numbers
44 # into taxonomic categories.
47 It is not currently possibly to query a node for its children so we
48 cannot completely replace the advantage of the flatfile
49 Bio::DB::Taxonomy::flatfile module.
54 A driver for querying NCBI Entrez Taxonomy database.
60 User feedback is an integral part of the evolution of this and other
61 Bioperl modules. Send your comments and suggestions preferably to
62 the Bioperl mailing list. Your participation is much appreciated.
64 bioperl-l@bioperl.org - General discussion
65 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
69 Please direct usage questions or support issues to the mailing list:
71 I<bioperl-l@bioperl.org>
73 rather than to the module maintainer directly. Many experienced and
74 reponsive experts will be able look at the problem and quickly
75 address it. Please include a thorough description of the problem
76 with code and data examples if at all possible.
80 Report bugs to the Bioperl bug tracking system to help us keep track
81 of the bugs and their resolution. Bug reports can be submitted via
84 https://github.com/bioperl/bioperl-live/issues
86 =head1 AUTHOR - Jason Stajich
88 Email jason-at-bioperl.org
92 Sendu Bala: bix@sendu.me.uk
96 The rest of the documentation details each of the object methods.
97 Internal methods are usually preceded with a _
101 # Let the code begin...
103 package Bio
::DB
::Taxonomy
::entrez
;
104 use vars
qw($EntrezLocation $UrlParamSeparatorValue %EntrezParams
105 $EntrezGet $EntrezSummary $EntrezFetch %SequenceParams
106 $XMLTWIG $DATA_CACHE $RELATIONS);
109 use Bio::DB::NCBIHelper;
119 use base qw(Bio::WebAgent Bio::DB::Taxonomy);
121 $EntrezLocation = $Bio::DB
::NCBIHelper
::HOSTBASE
. '/entrez/eutils/';
122 $EntrezGet = 'esearch.fcgi';
123 $EntrezFetch = 'efetch.fcgi';
124 $EntrezSummary = 'esummary.fcgi';
129 %EntrezParams = ( 'db' => 'taxonomy',
132 'tool' => 'Bioperl');
134 %SequenceParams = ( 'db' => 'nucleotide', # or protein
136 'tool' => 'Bioperl');
138 $UrlParamSeparatorValue = '&';
144 Usage : my $obj = Bio::DB::Taxonomy::entrez->new();
145 Function: Builds a new Bio::DB::Taxonomy::entrez object
146 Returns : an instance of Bio::DB::Taxonomy::entrez
147 Args : -location => URL to Entrez (if you want to override the default)
148 -params => Hashref of URL params if you want to override the
154 my ($class, @args) = @_;
156 # need to initialise Bio::WebAgent...
157 my ($self) = $class->SUPER::new
(@args);
159 # ... as well as our normal Bio::DB::Taxonomy selves:
160 $self->_initialize(@args);
168 $self->SUPER::_initialize
(@_);
170 my ($location,$params) = $self->_rearrange([qw(LOCATION PARAMS)],@_);
173 if( ref($params) !~ /HASH/i ) {
174 $self->warn("Must have provided a valid HASHref for -params");
175 $params = \
%EntrezParams;
178 $params = \
%EntrezParams;
180 $self->entrez_params($params);
181 $self->entrez_url($location || $EntrezLocation );
188 Usage : my $num = $db->get_num_taxa();
189 Function: Get the number of taxa stored in the database.
197 # Use this URL query to get the ID of all the taxa in the NCBI Taxonomy database:
198 # https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=taxonomy&term=all[Filter]
199 # Only the first 20 taxa IDs are returned (good because the list is long),
200 # and the total number is reported as well (which is what we are interested
202 my %p = $self->entrez_params;
203 $p{'term'} = 'all[Filter]';
204 my $twig = $self->_run_query($self->_build_url($EntrezGet, \
%p));
205 my $count = $twig->root->first_child('Count')->first_child->text;
213 Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
214 Function: Get a Bio::Taxon object from the database.
215 Returns : Bio::Taxon object
216 Args : just a single value which is the database id, OR named args:
217 -taxonid => taxonomy id (to query by taxonid)
219 -name => string (to query by a taxonomy name: common name,
220 scientific name, etc)
222 To retrieve a taxonomy node for a GI number provide the -gi option
223 with the gi number and -db with either 'nucleotide' or 'protein' to
226 -full => 1 (to force retrieval of full information - sometimes
227 minimal information about your taxon may have been
228 cached, which is normally used to save database
236 eval { require XML
::Twig
};
237 $self->throw("Could not load XML::Twig for get_taxon(): $@") if $@
;
240 my %p = $self->entrez_params;
242 # convert input request to one or more ids
243 my (@taxonids, $taxonid, $want_full);
246 if ($params{'-taxonid'}) {
247 $taxonid = $params{'-taxonid'};
249 elsif ($params{'-gi'}) {
250 my $db = $params{'-db'};
251 # we're going to do all the work here and then redirect
252 # the call based on the TaxId
253 my %p = %SequenceParams;
255 if( ref($params{'-gi'}) =~ /ARRAY/i ) {
256 $p{'id'} = join(',', @
{$params{'-gi'}});
258 $p{'id'} = $params{'-gi'};
260 $p{'db'} = $db if defined $db;
261 my $url = $self->_build_url($EntrezSummary, \
%p);
263 if (exists $DATA_CACHE->{gi_to_ids
}->{$url}) {
264 @ids = @
{$DATA_CACHE->{gi_to_ids
}->{$url}};
267 my $twig = $self->_run_query($url);
268 my $root = $twig->root;
269 for my $topnode ( $root->children('DocSum') ) {
270 for my $child ( $topnode->children('Item') ) {
271 if( uc($child->{att
}->{'Name'}) eq 'TAXID' ) {
272 push @ids, $child->text;
277 $DATA_CACHE->{gi_to_ids
}->{$url} = \
@ids;
280 return $self->get_taxon(-taxonid
=> \
@ids);
282 elsif ($params{'-name'}) {
283 @taxonids = $self->get_taxonid($params{'-name'});
286 $self->warn("Need to have provided either a -taxonid or -name value to get_taxon");
289 if ($params{'-full'}) {
297 if (ref($taxonid) =~ /ARRAY/i ) {
298 @taxonids = @
{$taxonid};
301 push(@taxonids, $taxonid) if $taxonid;
304 # return answer(s) from the cache if possible
307 foreach my $taxonid (@taxonids) {
308 $taxonid || $self->throw("In taxonids list one was undef! '@taxonids'\n");
309 if (defined $DATA_CACHE->{full_info
}->{$taxonid}) {
310 push(@results, $self->_make_taxon($DATA_CACHE->{full_info
}->{$taxonid}));
312 elsif (! $want_full && defined $DATA_CACHE->{minimal_info
}->{$taxonid}) {
313 push(@results, $self->_make_taxon($DATA_CACHE->{minimal_info
}->{$taxonid}));
316 push(@uncached, $taxonid);
321 $taxonid = join(',', @uncached);
325 my $twig = $self->_run_query($self->_build_url($EntrezFetch, \
%p));
326 my $root = $twig->root;
327 for my $taxon ( $root->children('Taxon') ) {
328 my $taxid = $taxon->first_child_text('TaxId');
329 $self->throw("Got a result with no TaxId!") unless $taxid;
332 if (exists $DATA_CACHE->{minimal_info
}->{$taxid}) {
333 $data = $DATA_CACHE->{minimal_info
}->{$taxid};
336 $data->{id
} = $taxid;
337 $data->{rank
} = $taxon->first_child_text('Rank');
339 my $other_names = $taxon->first_child('OtherNames');
340 my @other_names = $other_names->children_text() if $other_names;
341 my $sci_name = $taxon->first_child_text('ScientificName');
342 my $orig_sci_name = $sci_name;
343 $sci_name =~ s/ \(class\)$//;
344 push(@other_names, $orig_sci_name) if $orig_sci_name ne $sci_name;
345 $data->{scientific_name
} = $sci_name;
346 $data->{common_names
} = \
@other_names;
348 $data->{division
} = $taxon->first_child_text('Division');
349 $data->{genetic_code
} = $taxon->first_child('GeneticCode')->first_child_text('GCId');
350 $data->{mitochondrial_genetic_code
} = $taxon->first_child('MitoGeneticCode')->first_child_text('MGCId');
351 $data->{create_date
} = $taxon->first_child_text('CreateDate');
352 $data->{update_date
} = $taxon->first_child_text('UpdateDate');
353 $data->{pub_date
} = $taxon->first_child_text('PubDate');
355 # since we have some information about all the ancestors of our
356 # requested node, we may as well cache data for the ancestors to
357 # reduce the number of accesses to website in future
358 my $lineage_ex = $taxon->first_child('LineageEx');
359 if (defined $lineage_ex) {
360 my ($ancestor, $lineage_data, @taxa);
361 foreach my $lineage_taxon ($lineage_ex->children) {
362 my $lineage_taxid = $lineage_taxon->first_child_text('TaxId');
364 if (exists $DATA_CACHE->{minimal_info
}->{$lineage_taxid} || exists $DATA_CACHE->{full_info
}->{$lineage_taxid}) {
365 $lineage_data = $DATA_CACHE->{minimal_info
}->{$lineage_taxid} || $DATA_CACHE->{full_info
}->{$lineage_taxid};
372 $lineage_data->{id
} = $lineage_taxid;
373 $lineage_data->{scientific_name
} = $lineage_taxon->first_child_text('ScientificName');
374 $lineage_data->{rank
} = $lineage_taxon->first_child_text('Rank');
376 $RELATIONS->{ancestors
}->{$lineage_taxid} = $ancestor->{id
} if $ancestor;
378 $DATA_CACHE->{minimal_info
}->{$lineage_taxid} = $lineage_data;
379 } continue { $ancestor = $lineage_data; unshift(@taxa, $lineage_data); }
381 $RELATIONS->{ancestors
}->{$taxid} = $ancestor->{id
} if $ancestor;
383 # go through the lineage in reverse so we can remember the children
385 foreach my $lineage_data (@taxa) {
386 $RELATIONS->{children
}->{$lineage_data->{id
}}->{$child->{id
}} = 1;
387 } continue { $child = $lineage_data; }
390 delete $DATA_CACHE->{minimal_info
}->{$taxid};
391 $DATA_CACHE->{full_info
}->{$taxid} = $data;
392 push(@results, $self->_make_taxon($data));
396 wantarray() ?
@results : shift @results;
399 *get_Taxonomy_Node
= \
&get_taxon
;
405 Usage : my $taxonid = $db->get_taxonids('Homo sapiens');
406 Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
407 string. Note that multiple taxonids can match to the same supplied
409 Returns : array of integer ids in list context, one of these in scalar context
410 Args : string representing taxon's name
415 my ($self,$query) = @_;
416 my %p = $self->entrez_params;
418 # queries don't work correctly with special characters, so get rid of them.
419 if ($query =~ /<.+>/) {
420 # queries with <something> will fail, so workaround by removing, doing
421 # the query, getting multiple taxonids, then picking the one id that
422 # has a parent node with a scientific_name() or common_names()
423 # case-insensitive matching to the word(s) within <>
424 $query =~ s/ <(.+?)>//;
425 my $desired_parent_name = lc($1);
427 ID
: for my $start_id ($self->get_taxonids($query)) {
428 my $node = $self->get_taxon($start_id) || next ID
;
430 # walk up the parents until we hit a node with a named rank
432 my $parent_node = $self->ancestor($node) || next ID
;
433 my $parent_sci_name = $parent_node->scientific_name || next ID
;
434 my @parent_common_names = $parent_node->common_names;
435 unless (@parent_common_names) {
436 # ensure we're not using a minimal-info cached version
437 $parent_node = $self->get_taxon(-taxonid
=> $parent_node->id, -full
=> 1);
438 @parent_common_names = $parent_node->common_names;
441 for my $name ($parent_sci_name, @parent_common_names) {
442 if (lc($name) eq $desired_parent_name) {
443 return wantarray() ?
($start_id) : $start_id;
447 my $parent_rank = $parent_node->rank || 'no rank';
448 if ($parent_rank ne 'no rank') {
452 $node = $parent_node;
458 $query =~ s/[\"\(\)]//g; # not an exhaustive list; these are just the ones we know cause problems
462 if (defined $DATA_CACHE->{name_to_id
}->{$query}) {
463 @data = @
{$DATA_CACHE->{name_to_id
}->{$query}};
468 my $twig = $self->_run_query($self->_build_url($EntrezGet, \
%p));
469 my $root = $twig->root;
471 # Check that all words in the query are found, because we do not want to
472 # match just 1 word if there are multiple words, e.g. if we query with
473 # "Homo sapiens" both "homo" and "sapiens" must be found
474 my $errorlist = $root->first_child('ErrorList');
476 my @notfound = map { $_->text } $errorlist->children('PhraseNotFound');
478 for my $term ( @notfound ) {
479 return "No hit" if $query =~ /$term/;
484 my $list = $root->first_child('IdList');
485 @data = map { $_->text } $list->children('Id');
487 $DATA_CACHE->{name_to_id
}->{$query} = [@data];
490 return wantarray() ?
@data : shift @data;
493 *get_taxonid
= \
&get_taxonids
;
499 Usage : my $ancestor_taxon = $db->ancestor($taxon)
500 Function: Retrieve the ancestor taxon of a supplied Taxon from the database.
502 Note that unless the ancestor has previously been directly
503 requested with get_taxon(), the returned Taxon object will only have
504 a minimal amount of information.
507 Args : Bio::Taxon (that was retrieved from this database)
512 my ($self, $taxon) = @_;
513 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
514 $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
515 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
517 my $ancestor_id = $RELATIONS->{ancestors
}->{$id} || return;
518 return $self->_make_taxon($DATA_CACHE->{full_info
}->{$ancestor_id} || $DATA_CACHE->{minimal_info
}->{$ancestor_id});
522 =head2 each_Descendent
524 Title : each_Descendent
525 Usage : my @taxa = $db->each_Descendent($taxon);
526 Function: Get all the descendents of the supplied Taxon (but not their
527 descendents, ie. not a recursive fetchall).
529 Note that this implementation is unable to return a taxon that
530 hasn't previously been directly fetched with get_taxon(), or wasn't
531 an ancestor of such a fetch.
533 Returns : Array of Bio::Taxon objects
534 Args : Bio::Taxon (that was retrieved from this database)
538 sub each_Descendent
{
539 my ($self, $taxon) = @_;
540 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
541 $self->throw("The supplied Taxon must belong to this database") unless $taxon->db_handle && $taxon->db_handle eq $self;
542 my $id = $taxon->id || $self->throw("The supplied Taxon is missing its id!");
544 my @children_ids = keys %{$RELATIONS->{children
}->{$id} || {}};
546 foreach my $child_id (@children_ids) {
547 push(@children, $self->_make_taxon($DATA_CACHE->{full_info
}->{$child_id} || $DATA_CACHE->{minimal_info
}->{$child_id}));
554 =head2 Some Get/Setter methods
559 Usage : $obj->entrez_url($newval)
560 Function: Get/set entrez URL
561 Returns : value of entrez url (a scalar)
562 Args : on set, new value (a scalar or undef, optional)
569 return $self->{'_entrez_url'} = shift if @_;
570 return $self->{'_entrez_url'};
576 Title : entrez_params
577 Usage : $obj->entrez_params($newval)
578 Function: Get/set entrez params
579 Returns : value of entrez_params (a hashref)
580 Args : on set, new value Hashref
588 $f = $self->{'_entrez_params'} = shift;
590 $f = $self->{'_entrez_params'};
596 =head2 Bio::DB::WebBase methods
601 Usage : my $proxy_string = $self->proxy_string($protocol)
602 Function: Get the proxy string (plus user/pass )
604 Args : protocol ('http' or 'ftp'), default 'http'
609 Usage : $httpproxy = $db->proxy('http') or
610 $db->proxy(['http','ftp'], 'http://myproxy' )
611 Function: Get/Set a proxy for use of proxy
612 Returns : a string indicating the proxy
613 Args : $protocol : an array ref of the protocol(s) to set/get
614 $proxyurl : url of the proxy to use for the specified protocol
615 $username : username (if proxy requires authentication)
616 $password : password (if proxy requires authentication)
618 =head2 authentication
620 Title : authentication
621 Usage : $db->authentication($user,$pass)
622 Function: Get/Set authentication credentials
623 Returns : Array of user/pass
624 Args : Array or user/pass
629 # make a Taxon object from data hash ref
631 my ($self, $data) = @_;
633 my $taxon = Bio
::Taxon
->new();
636 while (my ($method, $value) = each %{$data}) {
637 if ($method eq 'id') {
638 $method = 'ncbi_taxid'; # since this is a real ncbi taxid, explicitly set it as one
641 $taxon->$method(ref($value) eq 'ARRAY' ? @
{$value} : $value);
644 # we can't use -dbh or the db_handle() method ourselves or we'll go
645 # infinite on the merge attempt
646 $taxon->{'db_handle'} = $self;
648 $self->_handle_internal_id($taxon);
655 # Given a eutility (esearch.fcgi, efetch.fcgi or esummary.fcgi) and a
656 # hashref or parameters, build a url suitable for eutil query
657 my ($self, $eutility, $p) = @_;
658 my $params = join($UrlParamSeparatorValue, map { $_.'='.$p->{$_} } keys %$p);
659 my $url = $self->entrez_url.$eutility.'?'.$params;
660 $self->debug("url is $url\n");
666 # Given an eutil url, run the eutil query and parse the response into an
668 my ($self, $url) = @_;
669 my $response = $self->get($url);
670 if ($response->is_success) {
671 $response = $response->content;
673 $self->throw("Can't query website: ".$response->status_line);
675 $self->debug("response is $response\n");
676 my $twig = XML
::Twig
->new;
677 $twig->parse($response);