Bio::Tools::CodonTable::is_start_codon: check in case of ambiguous codons (#266)
[bioperl-live.git] / lib / Bio / Species.pm
blob33c40e9816de6eb77a566c48359452b567738cbb
2 # BioPerl module for Bio::Species
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by James Gilbert <jgrg@sanger.ac.uk>
7 # Reimplemented by Sendu Bala <bix@sendu.me.uk>
8 # Re-reimplemented by Chris Fields <cjfields - at - bioperl dot org>
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Species - Generic species object.
18 =head1 SYNOPSIS
20 $species = Bio::Species->new(-classification => [@classification]);
21 # Can also pass classification
22 # array to new as below
24 $species->classification(qw( sapiens Homo Hominidae
25 Catarrhini Primates Eutheria
26 Mammalia Vertebrata Chordata
27 Metazoa Eukaryota ));
29 $genus = $species->genus();
31 $bi = $species->binomial(); # $bi is now "Homo sapiens"
33 # For storing common name
34 $species->common_name("human");
36 # For storing subspecies
37 $species->sub_species("accountant");
39 =head1 DESCRIPTION
41 B<NOTE: This class is planned for deprecation in favor of the simpler Bio::Taxon.
42 Please use that class instead.>
44 Provides a very simple object for storing phylogenetic information. The
45 classification is stored in an array, which is a list of nodes in a phylogenetic
46 tree. Access to getting and setting species and genus is provided, but not to
47 any of the other node types (eg: "phylum", "class", "order", "family"). There's
48 plenty of scope for making the model more sophisticated, if this is ever needed.
50 A methods are also provided for storing common names, and subspecies.
52 =head1 FEEDBACK
54 =head2 Mailing Lists
56 User feedback is an integral part of the evolution of this and other
57 Bioperl modules. Send your comments and suggestions preferably to
58 the Bioperl mailing list. Your participation is much appreciated.
60 bioperl-l@bioperl.org - General discussion
61 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
63 =head2 Support
65 Please direct usage questions or support issues to the mailing list:
67 I<bioperl-l@bioperl.org>
69 rather than to the module maintainer directly. Many experienced and
70 reponsive experts will be able look at the problem and quickly
71 address it. Please include a thorough description of the problem
72 with code and data examples if at all possible.
74 =head2 Reporting Bugs
76 Report bugs to the Bioperl bug tracking system to help us keep track
77 of the bugs and their resolution. Bug reports can be submitted via the
78 web:
80 https://github.com/bioperl/bioperl-live/issues
82 =head1 AUTHOR
84 James Gilbert email B<jgrg@sanger.ac.uk>
86 =head1 CONTRIBUTORS
88 Sendu Bala, bix@sendu.me.uk
89 Chris Fields, cjfields at bioperl dot org
91 =head1 APPENDIX
93 The rest of the documentation details each of the object
94 methods. Internal methods are usually preceded with a _
96 =cut
98 #' Let the code begin...
100 package Bio::Species;
102 use strict;
103 use warnings;
105 use Bio::DB::Taxonomy;
106 use Bio::Tree::Tree;
107 use Bio::Taxon;
108 use base qw(Bio::Root::Root Bio::Tree::NodeI);
110 =head2 new
112 Title : new
113 Usage : my $obj = Bio::Species->new(-classification => \@class)
114 Function: Build a new Species object
115 Returns : Bio::Species object
116 Args : -ncbi_taxid => NCBI taxonomic ID (optional)
117 -classification => arrayref of classification
119 =cut
121 sub new {
122 my($class, @args) = @_;
124 my $self = $class->SUPER::new(@args);
126 # Bio::Species is now just a proxy object that just observes the NodeI
127 # interface methods but delegates them to the proper classes (Bio::Taxon and
128 # Bio::Tree::Tree). This will be surplanted by the much simpler
129 # Bio::Taxon/Bio::DB::Taxonomy modules in the future.
131 # Using a proxy allows proper GC w/o using weaken(). This just wraps the
132 # older instances, which have no reciprocal refs (thus no circular refs).
133 # This can then run proper cleanup
135 $self->taxon(Bio::Taxon->new(@args));
137 my ($org, $sp, $var, $classification) =
138 $self->_rearrange([qw(ORGANELLE
139 SUB_SPECIES
140 VARIANT
141 CLASSIFICATION)], @args);
143 if (defined $classification && ref($classification) eq "ARRAY" && @{$classification}) {
144 $self->classification(@$classification);
146 else {
147 $self->tree(Bio::Tree::Tree->new());
150 defined $org && $self->organelle($org);
151 defined $sp && $self->sub_species($sp);
152 defined $var && $self->variant($var);
154 return $self;
157 =head2 classification
159 Title : classification
160 Usage : $self->classification(@class_array);
161 @classification = $self->classification();
162 Function: Get/set the lineage of this species. The array provided must be in
163 the order ... ---> SPECIES, GENUS ---> KINGDOM ---> etc.
164 Example : $obj->classification(qw( 'Homo sapiens' Homo Hominidae
165 Catarrhini Primates Eutheria Mammalia Vertebrata
166 Chordata Metazoa Eukaryota));
167 Returns : Classification array
168 Args : Classification array
170 A reference to the classification array. In the latter case
171 if there is a second argument and it evaluates to true,
172 names will not be validated. NB: in any case, names are never
173 validated anyway.
175 =cut
177 sub classification {
178 my ($self, @vals) = @_;
180 my $taxon = $self->taxon;
182 if (@vals) {
183 if (ref($vals[0]) eq 'ARRAY') {
184 @vals = @{$vals[0]};
187 $vals[1] ||= '';
188 # make sure the lineage contains us as first or second element
189 # (lineage may have subspecies, species, genus ...)
190 my $name = $taxon->node_name;
191 my ($genus, $species) = (quotemeta($vals[1]), quotemeta($vals[0]));
192 if ($name &&
193 ($name !~ m{$species}i && $name !~ m{$genus}i) &&
194 $name !~ m{$genus $species}i) {
195 if ($name =~ /^$genus $species\s*(.+)/) {
196 # just assume the problem is someone tried to make a Bio::Species starting at subspecies
197 #*** no idea if this is appropriate! just a possible fix related to bug 2092
198 $self->sub_species($1);
199 $name = $taxon->node_name("$vals[1] $vals[0]");
201 else {
202 $self->warn("The supplied lineage does not start near '$name' (I was supplied '".join(" | ", @vals)."')");
206 # create a lineage for ourselves
207 my $db = Bio::DB::Taxonomy->new(-source => 'list', -names => [reverse @vals]);
208 unless ($taxon->scientific_name) {
209 # assume we're supposed to be the leaf of the supplied lineage
210 $self->taxon->scientific_name($vals[0]);
212 unless ($taxon->rank) {
213 # and that we are rank species
214 $taxon->rank('species');
217 $taxon->db_handle($db);
219 $self->tree(Bio::Tree::Tree->new(-node => $taxon));
222 @vals = ();
223 foreach my $node ($self->tree->get_lineage_nodes($taxon), $taxon) {
224 unshift(@vals, $node->scientific_name || next);
226 return @vals;
229 =head2 ncbi_taxid
231 Title : ncbi_taxid
232 Usage : $obj->ncbi_taxid($newval)
233 Function: Get/set the NCBI Taxon ID
234 Returns : the NCBI Taxon ID as a string
235 Args : newvalue to set or undef to unset (optional)
237 =cut
239 =head2 common_name
241 Title : common_name
242 Usage : $self->common_name( $common_name );
243 $common_name = $self->common_name();
244 Function: Get or set the common name of the species
245 Example : $self->common_name('human')
246 Returns : The common name in a string
247 Args : String, which is the common name (optional)
249 =cut
251 =head2 division
253 Title : division
254 Usage : $obj->division($newval)
255 Function: Genbank Division for a species
256 Returns : value of division (a scalar)
257 Args : value of division (a scalar)
259 =cut
261 =head2 species
263 Title : species
264 Usage : $self->species( $species );
265 $species = $self->species();
266 Function: Get or set the species name.
267 Note that this is NOT genus and species
268 -- use $self->binomial() for that.
269 Example : $self->species('sapiens');
270 Returns : species name as string (NOT genus and species)
271 Args : species name as string (NOT genus and species)
273 =cut
275 sub species {
276 my ($self, $species) = @_;
278 if ($species) {
279 $self->{_species} = $species;
282 unless (defined $self->{_species}) {
283 # work it out from our nodes
284 my $species_taxon = $self->tree->find_node(-rank => 'species');
285 unless ($species_taxon) {
286 # just assume we are rank species
287 $species_taxon = $self->taxon;
290 $species = $species_taxon->scientific_name;
293 # munge it like the Bio::SeqIO modules used to do
294 # (more or less copy/pasted from old Bio::SeqIO::genbank, hence comments
295 # referring to 'ORGANISM' etc.)
298 my $root = $self->tree->get_root_node;
299 unless ($root) {
300 $self->tree(Bio::Tree::Tree->new(-node => $species_taxon));
301 $root = $self->tree->get_root_node;
304 my @spflds = split(' ', $species);
305 if (@spflds > 1 && $root->node_name ne 'Viruses') {
306 $species = undef;
308 # does the next term start with uppercase?
309 # yes: valid genus; no then unconventional
310 # e.g. leaf litter basidiomycete sp. Collb2-39
311 my $genus;
312 if ($spflds[0] =~ m/^[A-Z]/) {
313 $genus = shift(@spflds);
315 else {
316 undef $genus;
319 my $sub_species;
320 if (@spflds) {
321 while (my $fld = shift @spflds) {
322 $species .= "$fld ";
323 # does it have subspecies or varieties?
324 last if ($fld =~ m/(sp\.|var\.)/);
326 chop $species; # last space
327 $sub_species = join ' ',@spflds if(@spflds);
329 else {
330 $species = 'sp.';
333 # does ORGANISM start with any words which make its genus undefined?
334 # these are in @unkn_genus
335 # this in case species starts with uppercase so isn't caught above.
336 # alter common name if required
337 my $unconv = 0; # is it unconventional species name?
338 my @unkn_genus = ('unknown','unclassified','uncultured','unidentified');
339 foreach (@unkn_genus) {
340 if ($genus && $genus =~ m/$_/i) {
341 $species = $genus . " " . $species;
342 undef $genus;
343 $unconv = 1;
344 last;
346 elsif ($species =~ m/$_/i) {
347 $unconv = 1;
348 last;
351 if (!$unconv && !$sub_species && $species =~ s/^(\w+)\s(\w+)$/$1/) {
352 # need to extract subspecies from conventional ORGANISM format.
353 # Will the 'word' in a two element species name
354 # e.g. $species = 'thummi thummi' => $species='thummi' &
355 # $sub_species='thummi'
356 $sub_species = $2;
359 $self->genus($genus) if $genus;
360 $self->sub_species($sub_species) if $sub_species;
363 $self->{_species} = $species;
365 return $self->{_species};
368 =head2 genus
370 Title : genus
371 Usage : $self->genus( $genus );
372 $genus = $self->genus();
373 Function: Get or set the scientific genus name.
374 Example : $self->genus('Homo');
375 Returns : Scientific genus name as string
376 Args : Scientific genus name as string
378 =cut
380 sub genus {
381 my ($self, $genus) = @_;
383 # TODO: instead of caching the raw name, cache the actual node instance.
384 if ($genus) {
385 $self->{_genus} = $genus;
387 unless (defined $self->{_genus}) {
388 my $genus_taxon = $self->tree->find_node(-rank => 'genus');
389 unless ($genus_taxon) {
390 # just assume our ancestor is rank genus
391 $genus_taxon = $self->taxon->ancestor;
394 $self->{_genus} = $genus_taxon->scientific_name if $genus_taxon;
397 return $self->{_genus};
400 =head2 sub_species
402 Title : sub_species
403 Usage : $obj->sub_species($newval)
404 Function: Get or set the scientific subspecies name.
405 Returns : value of sub_species
406 Args : newvalue (optional)
408 =cut
410 sub sub_species {
411 my ($self, $sub) = @_;
413 # TODO: instead of caching the raw name, cache the actual node instance.
414 if (!defined $self->{'_sub_species'}) {
415 my $ss_taxon = $self->tree->find_node(-rank => 'subspecies');
416 if ($ss_taxon) {
417 if ($sub) {
418 $ss_taxon->scientific_name($sub);
420 # *** weakening ref to our root node in species() to solve a
421 # memory leak means that we have a subspecies taxon to set
422 # during the first call to species(), but it has vanished by
423 # the time a user subsequently calls sub_species() to get the
424 # value. So we 'cheat' and just store the subspecies name in
425 # our self hash, instead of the tree. Is this a problem for
426 # a Species object? Can't decide --sendu
428 # This can now be changed to deal with this information on the
429 # fly. For now, the caching remains, but maybe we should just
430 # let these things deal with mutable data as needed? -- cjfields
432 $self->{'_sub_species'} = $sub;
434 return $ss_taxon->scientific_name;
436 else {
437 # should we create a node here to be added to the tree?
441 # fall back to direct storage on self
442 $self->{'_sub_species'} = $sub if $sub;
443 return $self->{'_sub_species'};
446 =head2 variant
448 Title : variant
449 Usage : $obj->variant($newval)
450 Function: Get/set variant information for this species object (strain,
451 isolate, etc).
452 Example :
453 Returns : value of variant (a scalar)
454 Args : new value (a scalar or undef, optional)
456 =cut
458 sub variant{
459 my ($self, $var) = @_;
461 # TODO: instead of caching the raw name, cache the actual node instance.
462 if (!defined $self->{'_variant'}) {
463 my $var_taxon = $self->tree->find_node(-rank => 'variant');
464 if ($var_taxon) {
465 if ($var) {
466 $var_taxon->scientific_name($var);
468 return $var_taxon->scientific_name;
470 else {
471 # should we create a node here to be added to the tree?
475 # fall back to direct storage on self
476 $self->{'_variant'} = $var if $var;
477 return $self->{'_variant'};
480 =head2 binomial
482 Title : binomial
483 Usage : $binomial = $self->binomial();
484 $binomial = $self->binomial('FULL');
485 Function: Returns a string "Genus species", or "Genus species subspecies",
486 if the first argument is 'FULL' (and the species has a subspecies).
487 Args : Optionally the string 'FULL' to get the full name including
488 the subspecies.
489 Note : This is just munged from the taxon() name
491 =cut
493 sub binomial {
494 my ($self, $full) = @_;
495 my $rank = $self->taxon->rank || 'no rank';
497 my ($species, $genus) = ($self->species, $self->genus);
498 unless (defined $species) {
499 $species = 'sp.';
500 $self->warn("requested binomial but classification was not set");
502 $genus = '' unless( defined $genus);
504 $species =~ s/$genus\s+//;
506 my $bi = "$genus $species";
507 if (defined($full) && $full =~ /full/i) {
508 my $ssp = $self->sub_species;
509 if ($ssp) {
510 $ssp =~ s/$bi\s+//;
511 $ssp =~ s/$species\s+//;
512 $bi .= " $ssp";
515 return $bi;
518 =head2 validate_species_name
520 Title : validate_species_name
521 Usage : $result = $self->validate_species_name($string);
522 Function: Validate the species portion of the binomial
523 Args : string
524 Notes : The string following the "genus name" in the NCBI binomial is so
525 variable that it's not clear that this is a useful function. Consider
526 the binomials "Simian 11 rotavirus (serotype 3 / strain
527 SA11-Patton)", or "St. Thomas 3 rotavirus", straight from GenBank.
528 This is particularly problematic in microbes and viruses. As such,
529 this isn't actually used automatically by any Bio::Species method.
531 =cut
533 sub validate_species_name {
534 my( $self, $string ) = @_;
536 return 1 if $string eq "sp.";
537 return 1 if $string =~ /strain/;
538 return 1 if $string =~ /^[a-z][\w\s-]+$/i;
539 $self->throw("Invalid species name '$string'");
542 sub validate_name {
543 return 1;
546 =head2 organelle
548 Title : organelle
549 Usage : $self->organelle( $organelle );
550 $organelle = $self->organelle();
551 Function: Get or set the organelle name
552 Example : $self->organelle('Chloroplast')
553 Returns : The organelle name in a string
554 Args : String, which is the organelle name
555 Note : TODO: We currently do not know where the organelle definition will
556 eventually go. This is stored in the source seqfeature, though,
557 so the information isn't lost.
559 =cut
561 sub organelle {
562 my($self) = shift;
563 return $self->{'_organelle'} = shift if @_;
564 return $self->{'_organelle'};
567 =head2 Delegation
569 The following methods delegate to the internal Bio::Taxon instance. This is
570 mainly to allow code continue using older methods, with the mind to migrate to
571 using Bio::Taxon and related methods when this class is deprecated.
573 =cut
575 sub node_name {shift->taxon->node_name(@_)}
576 sub scientific_name {shift->taxon->node_name(@_)}
578 sub id {shift->taxon->id(@_)}
579 sub object_id {shift->taxon->id(@_)}
580 sub ncbi_taxid {shift->taxon->ncbi_taxid(@_)}
581 sub rank {shift->taxon->rank(@_)}
582 sub division {shift->taxon->division(@_)}
584 sub common_names {shift->taxon->common_names(@_)}
585 sub common_name {shift->taxon->common_names(@_)}
587 sub genetic_code {shift->taxon->genetic_code(@_)}
588 sub mitochondrial_genetic_code {shift->taxon->mitochondrial_genetic_code(@_)}
590 sub create_date { shift->taxon->create_date(@_)}
591 sub pub_date { shift->taxon->pub_date(@_)}
592 sub update_date { shift->taxon->update_date(@_)}
594 sub db_handle { shift->taxon->db_handle(@_)}
596 sub parent_id { shift->taxon->parent_id(@_)}
597 sub parent_taxon_id { shift->taxon->parent_id(@_)}
599 sub version { shift->taxon->version(@_)}
600 sub authority { shift->taxon->authority(@_)}
601 sub namespace { shift->taxon->namespace(@_)}
603 sub ancestor { shift->taxon->ancestor(@_)}
604 sub get_Parent_Node { shift->taxon->get_Parent_Node(@_)}
605 sub each_Descendent { shift->taxon->each_Descendent(@_)}
606 sub get_Children_Nodes { shift->taxon->get_Children_Nodes(@_)}
607 sub remove_Descendant { shift->taxon->remove_Descendant(@_)}
609 sub name { shift->taxon->name(@_)}
611 =head2 taxon
613 Title : taxon
614 Usage : $obj->taxon
615 Function : retrieve the internal Bio::Taxon instance
616 Returns : A Bio::Taxon. If one is not previously set,
617 an instance is created lazily
618 Args : Bio::Taxon (optional)
620 =cut
622 sub taxon {
623 my ($self, $taxon) = @_;
624 if (!$self->{taxon} || $taxon) {
625 $taxon ||= Bio::Taxon->new();
626 $self->{taxon} = $taxon;
628 $self->{taxon};
631 =head2 tree
633 Title : tree
634 Usage : $obj->tree
635 Function : Returns a Bio::Tree::Tree object
636 Returns : A Bio::Tree::Tree. If one is not previously set,
637 an instance is created lazily
638 Args : Bio::Tree::Tree (optional)
640 =cut
642 sub tree {
643 my ($self, $tree) = @_;
644 if (!$self->{tree} || $tree) {
645 $tree ||= Bio::Tree::Tree->new();
646 delete $tree->{_root_cleanup_methods};
647 $self->{tree} = $tree;
649 $self->{tree};
652 sub DESTROY {
653 my $self = shift;
654 $self->tree->cleanup_tree;
655 delete $self->{tree};
656 $self->taxon->node_cleanup;