2 # BioPerl module for Bio::Tree::Statistics
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tree::Statistics - Calculate certain statistics for a Tree
20 use Bio::Tree::Statistics;
24 This should be where Tree statistics are calculated. It was
25 previously where statistics from a Coalescent simulation.
27 It now contains several methods for calculating L<Tree-Trait
34 User feedback is an integral part of the evolution of this and other
35 Bioperl modules. Send your comments and suggestions preferably to
36 the Bioperl mailing list. Your participation is much appreciated.
38 bioperl-l@bioperl.org - General discussion
39 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43 Please direct usage questions or support issues to the mailing list:
45 I<bioperl-l@bioperl.org>
47 rather than to the module maintainer directly. Many experienced and
48 reponsive experts will be able look at the problem and quickly
49 address it. Please include a thorough description of the problem
50 with code and data examples if at all possible.
54 Report bugs to the Bioperl bug tracking system to help us keep track
55 of the bugs and their resolution. Bug reports can be submitted via
58 https://github.com/bioperl/bioperl-live/issues
60 =head1 AUTHOR - Jason Stajich
62 Email jason AT bioperl.org
66 Heikki Lehvaslaiho, heikki at bioperl dot org
70 The rest of the documentation details each of the object methods.
71 Internal methods are usually preceded with a _
76 # Let the code begin...
79 package Bio
::Tree
::Statistics
;
83 use base
qw(Bio::Root::Root);
88 Usage : my $obj = Bio::Tree::Statistics->new();
89 Function: Builds a new Bio::Tree::Statistics object
90 Returns : Bio::Tree::Statistics
93 =head2 assess_bootstrap
95 Title : assess_bootstrap
96 Usage : my $tree_with_bs = $stats->assess_bootstrap(\@bs_trees,$guide_tree);
97 Function: Calculates the bootstrap for internal nodes based on the percentage
98 of times \@bs_trees agree with each internal node
99 Returns : L<Bio::Tree::TreeI>
100 Args : Arrayref of L<Bio::Tree::TreeI>s
101 Guide tree, L<Bio::Tree::TreeI>s
105 sub assess_bootstrap
{
106 my ($self,$bs_trees,$guide_tree) = @_;
109 if(!defined($bs_trees) || ref($bs_trees) ne 'ARRAY'){
110 die "ERROR: second parameter in assess_bootstrap() must be a list";
112 my $num_bs_trees = scalar(@
$bs_trees);
113 if($num_bs_trees < 1){
114 die "ERROR: no bootstrap trees were passed to assess_bootstrap()";
117 # internal nodes are defined by their children
119 my (%lookup,%internal);
121 for my $tree ( $guide_tree, @
$bs_trees ) {
122 # Do this as a top down approach, can probably be
123 # improved by caching internal node states, but not going
124 # to worry about it right now.
126 my @allnodes = $tree->get_nodes;
127 my @internalnodes = grep { ! $_->is_Leaf } @allnodes;
128 for my $node ( @internalnodes ) {
129 my @tips = sort map { $_->id }
130 grep { $_->is_Leaf() } $node->get_all_Descendents;
131 my $id = "(".join(",", @tips).")";
133 $internal{$id} = $node->internal_id;
140 #my @save; # unsure why this variable is needed
141 for my $l ( keys %lookup ) {
142 if( defined $internal{$l} ) {#&& $lookup{$l} > $min_seen ) {
143 my $intnode = $guide_tree->find_node(-internal_id
=> $internal{$l});
144 $intnode->bootstrap(sprintf("%d",100 * $lookup{$l} / $num_bs_trees));
153 Example : cherries($tree, $node);
154 Description: Count number of paired leaf nodes
158 Args : 1. Bio::Tree::TreeI object
159 2. Bio::Tree::NodeI object within the tree, optional
161 Commonly used statistics assume a binary tree, but this methods
162 returns a value even for trees with polytomies.
169 my $node = shift || $tree->get_root_node;
172 my @descs = $node->each_Descendent;
174 if ($descs[0]->is_Leaf and $descs[1]->is_Leaf) {
175 if ($descs[3]) { #polytomy at leaf level
182 foreach my $desc (@descs) {
183 $cherries += $self->cherries($tree, $desc);
190 =head2 Tree-Trait statistics
192 The following methods produce descriptors of trait distribution among
193 leaf nodes within the trees. They require that a trait has been set
194 for each leaf node. The tag methods of Bio::Tree::Node are used to
195 store them as key/value pairs. In this way, one tree can store more
198 Trees have method add_traits() to set trait values from a file. See the
199 add_trait() method in L<Bio::Tree::TreeFunctionsI>.
203 Example : fitch($tree, $key, $node);
204 Description: Calculates Parsimony Score (PS) and internal trait
205 values using the Fitch 1971 parsimony algorithm for
206 the subtree a defined by the (internal) node.
207 Node defaults to the root.
208 Returns : true on success
209 Exceptions : leaf nodes have to have the trait defined
210 Args : 1. Bio::Tree::TreeI object
212 3. Bio::Tree::NodeI object within the tree, optional
214 Runs first L<fitch_up> that calculates parsimony scores and then
215 L<fitch_down> that should resolve most of the trait/character state
218 Fitch, W.M., 1971. Toward defining the course of evolution: minimal
219 change for a specific tree topology. Syst. Zool. 20, 406-416.
221 You can access calculated parsimony values using:
223 $score = $node->->get_tag_values('ps_score');
225 and the trait value with:
227 $traitvalue = $node->->get_tag_values('ps_trait'); # only the first
228 @traitvalues = $node->->get_tag_values('ps_trait');
230 Note that there can be more that one trait value, especially for the
238 my $key = shift || $self->throw("Trait name is needed");
239 my $node = shift || $tree->get_root_node;
241 $self->fitch_up($tree, $key, $node);
242 $self->fitch_down($tree, $node);
248 Example : ps($tree, $key, $node);
249 Description: Calculates Parsimony Score (PS) from Fitch 1971
250 parsimony algorithm for the subtree as defined
251 by the (internal) node.
252 Node defaults to the root.
253 Returns : integer, 1 < PS < n, where n is number of branches
254 Exceptions : leaf nodes have to have the trait defined
255 Args : 1. Bio::Tree::TreeI object
257 3. Bio::Tree::NodeI object within the tree, optional
259 This is the first half of the Fitch algorithm that is enough for
260 calculating the resolved parsimony values. The trait/chararacter
261 states are commonly left in ambiguous state. To resolve them, run
266 sub ps
{ shift->fitch_up(@_) }
271 Example : fitch_up($tree, $key, $node);
272 Description: Calculates Parsimony Score (PS) from the Fitch 1971
273 parsimony algorithm for the subtree as defined
274 by the (internal) node.
275 Node defaults to the root.
276 Returns : integer, 1< PS < n, where n is number of branches
277 Exceptions : leaf nodes have to have the trait defined
278 Args : 1. Bio::Tree::TreeI object
280 3. Bio::Tree::NodeI object within the tree, optional
282 This is a more generic name for L<ps> and indicates that it performs
283 the first bottom-up tree traversal that calculates the parsimony score
284 but usually leaves trait/character states ambiguous. If you are
285 interested in internal trait states, running L<fitch_down> should
286 resolve most of the ambiguities.
293 my $key = shift || $self->throw("Trait name is needed");
294 my $node = shift || $tree->get_root_node;
296 if ($node->is_Leaf) {
297 $self->throw ("ERROR: ". $node->internal_id. " needs a value for trait $key")
298 unless $node->has_tag($key);
299 $node->set_tag_value('ps_trait', $node->get_tag_values($key) );
300 $node->set_tag_value('ps_score', 0 );
301 return; # end of recursion
304 foreach my $child ($node->each_Descendent) {
305 $self->fitch_up($tree, $key, $child);
312 foreach my $child ($node->each_Descendent) {
313 foreach my $trait ($child->get_tag_values('ps_trait') ) {
314 $intersection{$trait}++ if $union{$trait};
317 $score += $child->get_tag_values('ps_score');
320 if (keys %intersection) {
321 $node->set_tag_value('ps_trait', keys %intersection);
322 $node->set_tag_value('ps_score', $score);
324 $node->set_tag_value('ps_trait', keys %union);
325 $node->set_tag_value('ps_score', $score+1);
328 if ($self->verbose) {
329 print "-- node --------------------------\n";
330 print "iID: ", $node->internal_id, " (", $node->id, ")\n";
331 print "Trait: ", join (', ', $node->get_tag_values('ps_trait') ), "\n";
332 print "length :", scalar($node->get_tag_values('ps_score')) , "\n";
334 return scalar $node->get_tag_values('ps_score');
340 Example : fitch_down($tree, $node);
341 Description: Runs the second pass from Fitch 1971
342 parsimony algorithm to resolve ambiguous
343 trait states left by first pass.
344 by the (internal) node.
345 Node defaults to the root.
347 Exceptions : dies unless the trait is defined in all nodes
348 Args : 1. Bio::Tree::TreeI object
349 2. Bio::Tree::NodeI object within the tree, optional
351 Before running this method you should have ran L<fitch_up> (alias to
352 L<ps> ). Note that it is not guaranteed that all states are completely
360 my $node = shift || $tree->get_root_node;
362 my $key = 'ps_trait';
363 $self->throw ("ERROR: ". $node->internal_id. " needs a value for $key")
364 unless $node->has_tag($key);
367 foreach my $trait ($node->get_tag_values($key) ) {
371 foreach my $child ($node->each_Descendent) {
372 next if $child->is_Leaf; # end of recursion
375 foreach my $trait ($child->get_tag_values($key) ) {
376 $intersection->{$trait}++ if $nodev->{$trait};
379 $self->fitch_down($tree, $child);
380 $child->set_tag_value($key, keys %$intersection);
388 Example : persistence($tree, $node);
389 Description: Calculates the persistence
390 for node in the subtree defined by the (internal)
391 node. Node defaults to the root.
392 Returns : int, number of generations trait value has to remain same
393 Exceptions : all the nodes need to have the trait defined
394 Args : 1. Bio::Tree::TreeI object
395 2. Bio::Tree::NodeI object within the tree, optional
397 Persistence measures the stability that the trait value has in a
398 tree. It expresses the number of generations the trait value remains
399 the same. All the decendants of the root in the same generation have
400 to share the same value.
402 Depends on Fitch's parsimony score (PS).
410 my $value = shift || $self->throw("Value is needed");
413 my $key = 'ps_trait';
415 $self->throw("Node is needed") unless $node->isa('Bio::Tree::NodeI');
417 return 0 unless $node->get_tag_values($key) eq $value; # wrong value
418 return 1 if $node->is_Leaf; # end of recursion
420 my $persistence = 10000000; # an arbitrarily large number
421 foreach my $child ($node->each_Descendent) {
422 my $pers = $self->_persistence($tree, $child, $value);
423 $persistence = $pers if $pers < $persistence;
425 return $persistence + 1;
431 my $node = shift || $tree->get_root_node;
432 $self->throw("Node is needed") unless $node->isa('Bio::Tree::NodeI');
434 my $key = 'ps_trait';
435 my $value = $node->get_tag_values($key);
438 my $persistence = $self->_persistence($tree, $node, $value);
439 $node->set_tag_value('persistance', $persistence);
444 =head2 count_subclusters
446 Example : count_clusters($tree, $node);
447 Description: Calculates the number of sub-clusters
448 in the subtree defined by the (internal)
449 node. Node defaults to the root.
451 Exceptions : all the nodes need to have the trait defined
452 Args : 1. Bio::Tree::TreeI object
453 2. Bio::Tree::NodeI object within the tree, optional
455 Depends on Fitch's parsimony score (PS).
459 sub _count_subclusters
{
463 my $value = shift || $self->throw("Value is needed");
465 my $key = 'ps_trait';
467 $self->throw ("ERROR: ". $node->internal_id. " needs a value for trait $key")
468 unless $node->has_tag($key);
470 if ($node->get_tag_values($key) eq $value) {
471 if ($node->get_tag_values('ps_score') == 0) {
475 foreach my $child ($node->each_Descendent) {
476 $count += $self->_count_subclusters($tree, $child, $value);
484 sub count_subclusters
{
487 my $node = shift || $tree->get_root_node;
488 $self->throw("Node is needed") unless $node->isa('Bio::Tree::NodeI');
490 my $key = 'ps_trait';
491 my $value = $node->get_tag_values($key);
493 return $self->_count_subclusters($tree, $node, $value);
499 Example : count_leaves($tree, $node);
500 Description: Calculates the number of leaves with same trait
501 value as root in the subtree defined by the (internal)
502 node. Requires an unbroken line of identical trait values.
503 Node defaults to the root.
504 Returns : int, number of leaves with this trait value
505 Exceptions : all the nodes need to have the trait defined
506 Args : 1. Bio::Tree::TreeI object
507 2. Bio::Tree::NodeI object within the tree, optional
509 Depends on Fitch's parsimony score (PS).
516 my $node = shift || $tree->get_root_node;
519 my $key = 'ps_trait';
521 $self->throw ("ERROR: ". $node->internal_id. " needs a value for trait $key")
522 unless $node->has_tag($key);
524 if ($node->get_tag_values($key) eq $value) {
525 #print $node->id, ": ", $node->get_tag_values($key), "\n";
526 return 1 if $node->is_Leaf; # end of recursion
529 foreach my $child ($node->each_Descendent) {
530 $count += $self->_count_leaves($tree, $child, $value);
540 my $node = shift || $tree->get_root_node;
541 $self->throw("Node is needed") unless $node->isa('Bio::Tree::NodeI');
543 my $key = 'ps_trait';
544 my $value = $node->get_tag_values($key);
546 return $self->_count_leaves($tree, $node, $value);
550 =head2 phylotype_length
552 Example : phylotype_length($tree, $node);
553 Description: Sums up the branch lengths within phylotype
554 exluding the subclusters where the trait values
556 Returns : float, length
557 Exceptions : all the nodes need to have the trait defined
558 Args : 1. Bio::Tree::TreeI object
559 2. Bio::Tree::NodeI object within the tree, optional
561 Depends on Fitch's parsimony score (PS).
565 sub _phylotype_length
{
571 my $key = 'ps_trait';
573 $self->throw ("ERROR: ". $node->internal_id. " needs a value for trait $key")
574 unless $node->has_tag($key);
576 return 0 if $node->get_tag_values($key) ne $value;
577 return $node->branch_length if $node->is_Leaf; # end of recursion
580 foreach my $child ($node->each_Descendent) {
581 my $sub_len = $self->_phylotype_length($tree, $child, $value);
583 $length += $child->branch_length if not $child->is_Leaf and $sub_len;
588 sub phylotype_length
{
591 my $node = shift || $tree->get_root_node;
593 my $key = 'ps_trait';
594 my $value = $node->get_tag_values($key);
596 return $self->_phylotype_length($tree, $node, $value);
600 =head2 sum_of_leaf_distances
602 Example : sum_of_leaf_distances($tree, $node);
603 Description: Sums up the branch lengths from root to leaf
604 exluding the subclusters where the trait values
606 Returns : float, length
607 Exceptions : all the nodes need to have the trait defined
608 Args : 1. Bio::Tree::TreeI object
609 2. Bio::Tree::NodeI object within the tree, optional
611 Depends on Fitch's parsimony score (PS).
615 sub _sum_of_leaf_distances
{
621 my $key = 'ps_trait';
623 $self->throw ("ERROR: ". $node->internal_id. " needs a value for trait $key")
624 unless $node->has_tag($key);
625 return 0 if $node->get_tag_values($key) ne $value;
626 #return $node->branch_length if $node->is_Leaf; # end of recursion
627 return 0 if $node->is_Leaf; # end of recursion
630 foreach my $child ($node->each_Descendent) {
631 $length += $self->_count_leaves($tree, $child, $value) * $child->branch_length +
632 $self->_sum_of_leaf_distances($tree, $child, $value);
637 sub sum_of_leaf_distances
{
640 my $node = shift || $tree->get_root_node;
642 my $key = 'ps_trait';
643 my $value = $node->get_tag_values($key);
645 return $self->_sum_of_leaf_distances($tree, $node, $value);
649 =head2 genetic_diversity
651 Example : genetic_diversity($tree, $node);
652 Description: Diversity is the sum of root to leaf distances
653 within the phylotype normalised by number of leaf
655 Returns : float, value of genetic diversity
656 Exceptions : all the nodes need to have the trait defined
657 Args : 1. Bio::Tree::TreeI object
658 2. Bio::Tree::NodeI object within the tree, optional
660 Depends on Fitch's parsimony score (PS).
664 sub genetic_diversity
{
667 my $node = shift || $tree->get_root_node;
669 return $self->sum_of_leaf_distances($tree, $node) /
670 $self->count_leaves($tree, $node);
676 Example : statratio($tree, $node);
677 Description: Ratio of the stem length and the genetic diversity of the
678 phylotype L<genetic_diversity>
679 Returns : float, separation score
680 Exceptions : all the nodes need to have the trait defined
681 Args : 1. Bio::Tree::TreeI object
682 2. Bio::Tree::NodeI object within the tree, optional
684 Statratio gives a measure of separation and variability within the phylotype.
685 Larger values identify more rapidly evolving and recent phylotypes.
687 Depends on Fitch's parsimony score (PS).
694 my $node = shift || $tree->get_root_node;
696 my $div = $self->genetic_diversity($tree, $node);
697 return 0 if $div == 0;
698 return $node->branch_length / $div;
705 Example : ai($tree, $key, $node);
706 Description: Calculates the Association Index (AI) of Whang et
707 al. 2001 for the subtree defined by the (internal)
708 node. Node defaults to the root.
710 Exceptions : leaf nodes have to have the trait defined
711 Args : 1. Bio::Tree::TreeI object
713 3. Bio::Tree::NodeI object within the tree, optional
715 Association index (AI) gives a more fine grained results than PS since
716 the result is a real number. ~0 E<lt>= AI.
718 Wang, T.H., Donaldson, Y.K., Brettle, R.P., Bell, J.E., Simmonds, P.,
719 2001. Identification of shared populations of human immunodeficiency
720 Virus Type 1 infecting microglia and tissue macrophages outside the
721 central nervous system. J. Virol. 75 (23), 11686-11699.
732 for my $desc ( $node->get_all_Descendents ) {
733 next unless $desc->is_Leaf;
735 $self->throw ("Node ". $desc->id. " needs a value for trait [$key]")
736 unless $desc->has_tag($key);
737 my $trait = $desc->get_tag_values($key);
741 foreach ( keys %$traits) {
742 $most_common = $traits->{$_} if $traits->{$_} > $most_common;
744 return sprintf "%1.6f", (1 - ($most_common/$leaf_count) ) / (2**($leaf_count-1) );
750 my $key = shift || $self->throw("Trait name is needed");
751 my $start_node = shift || $tree->get_root_node;
752 return unless $start_node;
755 for my $node ( $start_node->get_all_Descendents ) {
756 next if $node->is_Leaf;
757 $sum += $self->_node_ai($node, $key);
765 Example : mc($tree, $key, $node);
766 Description: Calculates the Monophyletic Clade (MC) size statistics
767 for the subtree a defined by the (internal) node.
768 Node defaults to the root;
769 Returns : hashref with trait values as keys
770 Exceptions : leaf nodes have to have the trait defined
771 Args : 1. Bio::Tree::TreeI object
773 3. Bio::Tree::NodeI object within the tree, optional
775 Monophyletic Clade (MC) size statistics by Salemi at al 2005. It is
776 calculated for each trait value. 1 E<lt>= MC E<lt>= nx, where nx is the
777 number of tips with value x:
779 pick the internal node with maximim value for
780 number of of tips with only trait x
782 MC was defined by Parker et al 2008.
784 Salemi, M., Lamers, S.L., Yu, S., de Oliveira, T., Fitch, W.M., McGrath, M.S.,
785 2005. Phylodynamic analysis of Human Immunodeficiency Virus Type 1 in
786 distinct brain compartments provides a model for the neuropathogenesis of
787 AIDS. J. Virol. 79 (17), 11343-11352.
789 Parker, J., Rambaut A., Pybus O., 2008. Correlating viral phenotypes
790 with phylogeny: Accounting for phylogenetic uncertainty Infection,
791 Genetics and Evolution 8 (2008), 239-246.
802 for my $node2 ( $node->get_all_Descendents ) {
803 next unless $node2->is_Leaf;
805 my $trait = $node2->get_tag_values($key);
814 my $key = shift || die "Trait name is needed";
815 my $start_node = shift || $tree->get_root_node;
816 return unless $start_node;
818 my $sum; # hashref, keys are trait values
819 my $keys; # hashref, keys are trait values
820 foreach my $node ( $start_node->get_all_Descendents ) {
821 next if $node->is_Leaf;
822 my $traits = $self->_node_mc($node, $key);
823 if (scalar keys %$traits == 1) {
824 my ($value) = keys %$traits;
826 $sum->{$value} = $traits->{$value}
827 if $sum->{$value} < $traits->{$value};
829 map { $keys->{$_} = 1 } keys %$traits;
832 # check for cases where there are no clusters
833 foreach my $value (keys %$keys) {
834 $sum->{$value} = 1 unless defined $sum->{$value};