Bio::Tools::CodonTable::is_start_codon: check in case of ambiguous codons (#266)
[bioperl-live.git] / lib / Bio / Tree / Statistics.pm
bloba78fe6786cf9d66b432094d6d1b81bfe43fc9983
2 # BioPerl module for Bio::Tree::Statistics
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason@bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tree::Statistics - Calculate certain statistics for a Tree
18 =head1 SYNOPSIS
20 use Bio::Tree::Statistics;
22 =head1 DESCRIPTION
24 This should be where Tree statistics are calculated. It was
25 previously where statistics from a Coalescent simulation.
27 It now contains several methods for calculating L<Tree-Trait
28 statistics>.
30 =head1 FEEDBACK
32 =head2 Mailing Lists
34 User feedback is an integral part of the evolution of this and other
35 Bioperl modules. Send your comments and suggestions preferably to
36 the Bioperl mailing list. Your participation is much appreciated.
38 bioperl-l@bioperl.org - General discussion
39 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
41 =head2 Support
43 Please direct usage questions or support issues to the mailing list:
45 I<bioperl-l@bioperl.org>
47 rather than to the module maintainer directly. Many experienced and
48 reponsive experts will be able look at the problem and quickly
49 address it. Please include a thorough description of the problem
50 with code and data examples if at all possible.
52 =head2 Reporting Bugs
54 Report bugs to the Bioperl bug tracking system to help us keep track
55 of the bugs and their resolution. Bug reports can be submitted via
56 the web:
58 https://github.com/bioperl/bioperl-live/issues
60 =head1 AUTHOR - Jason Stajich
62 Email jason AT bioperl.org
64 =head1 CONTRIBUTORS
66 Heikki Lehvaslaiho, heikki at bioperl dot org
68 =head1 APPENDIX
70 The rest of the documentation details each of the object methods.
71 Internal methods are usually preceded with a _
73 =cut
76 # Let the code begin...
79 package Bio::Tree::Statistics;
81 use strict;
84 use base qw(Bio::Root::Root);
86 =head2 new
88 Title : new
89 Usage : my $obj = Bio::Tree::Statistics->new();
90 Function: Builds a new Bio::Tree::Statistics object
91 Returns : Bio::Tree::Statistics
92 Args :
94 =head2 assess_bootstrap
96 Title : assess_bootstrap
97 Usage : my $tree_with_bs = $stats->assess_bootstrap(\@bs_trees,$guide_tree);
98 Function: Calculates the bootstrap for internal nodes based on the percentage
99 of times \@bs_trees agree with each internal node
100 Returns : L<Bio::Tree::TreeI>
101 Args : Arrayref of L<Bio::Tree::TreeI>s
102 Guide tree, L<Bio::Tree::TreeI>s
104 =cut
106 sub assess_bootstrap{
107 my ($self,$bs_trees,$guide_tree) = @_;
108 my @consensus;
110 if(!defined($bs_trees) || ref($bs_trees) ne 'ARRAY'){
111 die "ERROR: second parameter in assess_bootstrap() must be a list";
113 my $num_bs_trees = scalar(@$bs_trees);
114 if($num_bs_trees < 1){
115 die "ERROR: no bootstrap trees were passed to assess_bootstrap()";
118 # internal nodes are defined by their children
120 my (%lookup,%internal);
121 my $i = 0;
122 for my $tree ( $guide_tree, @$bs_trees ) {
123 # Do this as a top down approach, can probably be
124 # improved by caching internal node states, but not going
125 # to worry about it right now.
127 my @allnodes = $tree->get_nodes;
128 my @internalnodes = grep { ! $_->is_Leaf } @allnodes;
129 for my $node ( @internalnodes ) {
130 my @tips = sort map { $_->id }
131 grep { $_->is_Leaf() } $node->get_all_Descendents;
132 my $id = "(".join(",", @tips).")";
133 if( $i == 0 ) {
134 $internal{$id} = $node->internal_id;
135 } else {
136 $lookup{$id}++;
139 $i++;
141 #my @save; # unsure why this variable is needed
142 for my $l ( keys %lookup ) {
143 if( defined $internal{$l} ) {#&& $lookup{$l} > $min_seen ) {
144 my $intnode = $guide_tree->find_node(-internal_id => $internal{$l});
145 $intnode->bootstrap(sprintf("%d",100 * $lookup{$l} / $num_bs_trees));
148 return $guide_tree;
151 =head2 transfer_bootstrap_expectation
153 Title : transfer_bootstrap_expectation
154 Usage : my $tree_with_bs = $stats->transfer_bootstrap_expectation(\@bs_trees,$guide_tree);
155 Function: Calculates the Transfer Bootstrap Expectation (TBE) for internal nodes based on
156 the methods outlined in Lemoine et al, Nature, 2018.
157 Currently experimental.
158 Returns : L<Bio::Tree::TreeI>
159 Args : Arrayref of L<Bio::Tree::TreeI>s
160 Guide tree, L<Bio::Tree::TreeI>s
162 =cut
164 sub transfer_bootstrap_expectation{
165 my ($self,$bs_trees,$guide_tree) = @_;
167 if(!defined($bs_trees) || ref($bs_trees) ne 'ARRAY'){
168 die "ERROR: second parameter in assess_bootstrap() must be a list";
170 my $num_bs_trees = scalar(@$bs_trees);
171 if($num_bs_trees < 1){
172 die "ERROR: no bootstrap trees were passed to ".(caller(0))[3];
175 # internal nodes are defined by their children
176 my %internal = ();
177 my %leafNameId = ();
178 my @idLookup = ();
179 my @internalLookup = ();
180 my @tree = ($guide_tree, @$bs_trees);
181 my $numTrees = scalar(@tree);
182 for(my $i = 0; $i < $numTrees; $i++){ # guide tree's index is $i==0
183 # Do this as a top down approach, can probably be
184 # improved by caching internal node states, but not going
185 # to worry about it right now.
187 my @allnodes = $tree[$i]->get_nodes;
188 my @internalnodes = grep { ! $_->is_Leaf } @allnodes;
189 for my $node ( @internalnodes ) {
190 my @tips = sort map { $_->id }
191 grep { $_->is_Leaf() } $node->get_all_Descendents;
192 my $id = join(",", @tips);
193 # Map the concatenated-leaf ID to the internal ID on the guide tree
194 if( $i == 0 ) {
195 $internal{$id} = $node->internal_id;
196 $leafNameId{$node->internal_id} = $id;
199 # Record the tips for each tree's internal node
200 # ID lookup (concatenated string of leaf names)
201 $idLookup[$i]{$id} = \@tips;
202 # Internal ID lookup
203 $internalLookup[$i]{$internal{$id}} = \@tips;
207 # Find the average distance from branch b to all
208 # bootstrap trees' branches b*
209 my @id = sort keys %internal;
210 my $numIds = @id;
211 # Loop through all internal nodes of the guide tree
212 for(my $j=0; $j<$numIds; $j++){
213 my $refNode = $guide_tree->find_node(-internal_id => $internal{$id[$j]});
214 my $refNodeId = $refNode->internal_id;
215 my $refJoinId = $leafNameId{$refNodeId};
216 my $refLeaves = $idLookup[0]{$refJoinId};
217 my %refLeafIndex = map{$_=>1} @$refLeaves;
218 #next if(!defined($refLeaves));
220 # For each internal node, start calculating for
221 # an average TBE distance.
222 my $nodeTbeTotal = 0;
224 # Loop through all bootstrap trees, skipping the 0th
225 # tree which is the guide tree.
226 for(my $i=1;$i<$numTrees;$i++){
228 # Find the right branch to bootstrap with. The right
229 # branch will be the one that has the smallest
230 # TBE distance.
231 my @bsNode = grep {!$_->is_Leaf} $tree[$i]->get_nodes;
232 my $numBsIds = scalar(@bsNode);
233 my $minDistance = ~0; # large int
234 for(my $k=0;$k<$numBsIds;$k++){
235 my @queryLeaves = sort map { $_->id }
236 grep { $_->is_Leaf() } $bsNode[$k]->get_all_Descendents;
238 my %queryLeafIndex = map{$_=>1} @queryLeaves;
240 # How many moves does it take to go from query to ref?
241 my $dist=0;
242 for my $queryLeaf(@queryLeaves){
243 if(!$refLeafIndex{$queryLeaf}){
244 $dist++;
247 for my $refLeaf(@$refLeaves){
248 if(!$queryLeafIndex{$refLeaf}){
249 $dist++;
253 if($dist < $minDistance){
254 $minDistance = $dist;
257 $nodeTbeTotal += $minDistance;
259 my $avgTbe = $nodeTbeTotal / $numTrees;
261 # Calculate the average of all b to b* distances
262 # But it is also 1 - average.
263 my $numRefLeaves = scalar(@$refLeaves);
264 my $nodeTbe = 1 - $avgTbe/$numRefLeaves;
265 # Round to an integer
266 $refNode->bootstrap(sprintf("%0.0f",100 * $nodeTbe));
269 return $guide_tree;
273 =head2 cherries
275 Example : cherries($tree, $node);
276 Description: Count number of paired leaf nodes
277 in a binary tree
278 Returns : integer
279 Exceptions :
280 Args : 1. Bio::Tree::TreeI object
281 2. Bio::Tree::NodeI object within the tree, optional
283 Commonly used statistics assume a binary tree, but this methods
284 returns a value even for trees with polytomies.
286 =cut
288 sub cherries ($;$) {
289 my $self = shift;
290 my $tree = shift;
291 my $node = shift || $tree->get_root_node;
293 my $cherries = 0;
294 my @descs = $node->each_Descendent;
296 if ($descs[0]->is_Leaf and $descs[1]->is_Leaf) {
297 if ($descs[3]) { #polytomy at leaf level
298 $cherries = 0;
299 } else {
300 $cherries = 1;
302 } else {
303 # recurse
304 foreach my $desc (@descs) {
305 $cherries += $self->cherries($tree, $desc);
308 return $cherries;
312 =head2 Tree-Trait statistics
314 The following methods produce descriptors of trait distribution among
315 leaf nodes within the trees. They require that a trait has been set
316 for each leaf node. The tag methods of Bio::Tree::Node are used to
317 store them as key/value pairs. In this way, one tree can store more
318 than one trait.
320 Trees have method add_traits() to set trait values from a file. See the
321 add_trait() method in L<Bio::Tree::TreeFunctionsI>.
323 =head2 fitch
325 Example : fitch($tree, $key, $node);
326 Description: Calculates Parsimony Score (PS) and internal trait
327 values using the Fitch 1971 parsimony algorithm for
328 the subtree a defined by the (internal) node.
329 Node defaults to the root.
330 Returns : true on success
331 Exceptions : leaf nodes have to have the trait defined
332 Args : 1. Bio::Tree::TreeI object
333 2. trait name string
334 3. Bio::Tree::NodeI object within the tree, optional
336 Runs first L<fitch_up> that calculates parsimony scores and then
337 L<fitch_down> that should resolve most of the trait/character state
338 ambiguities.
340 Fitch, W.M., 1971. Toward defining the course of evolution: minimal
341 change for a specific tree topology. Syst. Zool. 20, 406-416.
343 You can access calculated parsimony values using:
345 $score = $node->->get_tag_values('ps_score');
347 and the trait value with:
349 $traitvalue = $node->->get_tag_values('ps_trait'); # only the first
350 @traitvalues = $node->->get_tag_values('ps_trait');
352 Note that there can be more that one trait value, especially for the
353 root node.
355 =cut
357 sub fitch {
358 my $self = shift;
359 my $tree = shift;
360 my $key = shift || $self->throw("Trait name is needed");
361 my $node = shift || $tree->get_root_node;
363 $self->fitch_up($tree, $key, $node);
364 $self->fitch_down($tree, $node);
368 =head2 ps
370 Example : ps($tree, $key, $node);
371 Description: Calculates Parsimony Score (PS) from Fitch 1971
372 parsimony algorithm for the subtree as defined
373 by the (internal) node.
374 Node defaults to the root.
375 Returns : integer, 1 < PS < n, where n is number of branches
376 Exceptions : leaf nodes have to have the trait defined
377 Args : 1. Bio::Tree::TreeI object
378 2. trait name string
379 3. Bio::Tree::NodeI object within the tree, optional
381 This is the first half of the Fitch algorithm that is enough for
382 calculating the resolved parsimony values. The trait/chararacter
383 states are commonly left in ambiguous state. To resolve them, run
384 L<fitch_down>.
386 =cut
388 sub ps { shift->fitch_up(@_) }
391 =head2 fitch_up
393 Example : fitch_up($tree, $key, $node);
394 Description: Calculates Parsimony Score (PS) from the Fitch 1971
395 parsimony algorithm for the subtree as defined
396 by the (internal) node.
397 Node defaults to the root.
398 Returns : integer, 1< PS < n, where n is number of branches
399 Exceptions : leaf nodes have to have the trait defined
400 Args : 1. Bio::Tree::TreeI object
401 2. trait name string
402 3. Bio::Tree::NodeI object within the tree, optional
404 This is a more generic name for L<ps> and indicates that it performs
405 the first bottom-up tree traversal that calculates the parsimony score
406 but usually leaves trait/character states ambiguous. If you are
407 interested in internal trait states, running L<fitch_down> should
408 resolve most of the ambiguities.
410 =cut
412 sub fitch_up {
413 my $self = shift;
414 my $tree = shift;
415 my $key = shift || $self->throw("Trait name is needed");
416 my $node = shift || $tree->get_root_node;
418 if ($node->is_Leaf) {
419 $self->throw ("ERROR: ". $node->internal_id. " needs a value for trait $key")
420 unless $node->has_tag($key);
421 $node->set_tag_value('ps_trait', $node->get_tag_values($key) );
422 $node->set_tag_value('ps_score', 0 );
423 return; # end of recursion
426 foreach my $child ($node->each_Descendent) {
427 $self->fitch_up($tree, $key, $child);
430 my %intersection;
431 my %union;
432 my $score;
434 foreach my $child ($node->each_Descendent) {
435 foreach my $trait ($child->get_tag_values('ps_trait') ) {
436 $intersection{$trait}++ if $union{$trait};
437 $union{$trait}++;
439 $score += $child->get_tag_values('ps_score');
442 if (keys %intersection) {
443 $node->set_tag_value('ps_trait', keys %intersection);
444 $node->set_tag_value('ps_score', $score);
445 } else {
446 $node->set_tag_value('ps_trait', keys %union);
447 $node->set_tag_value('ps_score', $score+1);
450 if ($self->verbose) {
451 print "-- node --------------------------\n";
452 print "iID: ", $node->internal_id, " (", $node->id, ")\n";
453 print "Trait: ", join (', ', $node->get_tag_values('ps_trait') ), "\n";
454 print "length :", scalar($node->get_tag_values('ps_score')) , "\n";
456 return scalar $node->get_tag_values('ps_score');
460 =head2 fitch_down
462 Example : fitch_down($tree, $node);
463 Description: Runs the second pass from Fitch 1971
464 parsimony algorithm to resolve ambiguous
465 trait states left by first pass.
466 by the (internal) node.
467 Node defaults to the root.
468 Returns : true
469 Exceptions : dies unless the trait is defined in all nodes
470 Args : 1. Bio::Tree::TreeI object
471 2. Bio::Tree::NodeI object within the tree, optional
473 Before running this method you should have ran L<fitch_up> (alias to
474 L<ps> ). Note that it is not guaranteed that all states are completely
475 resolved.
477 =cut
479 sub fitch_down {
480 my $self = shift;
481 my $tree = shift;
482 my $node = shift || $tree->get_root_node;
484 my $key = 'ps_trait';
485 $self->throw ("ERROR: ". $node->internal_id. " needs a value for $key")
486 unless $node->has_tag($key);
488 my $nodev;
489 foreach my $trait ($node->get_tag_values($key) ) {
490 $nodev->{$trait}++;
493 foreach my $child ($node->each_Descendent) {
494 next if $child->is_Leaf; # end of recursion
496 my $intersection;
497 foreach my $trait ($child->get_tag_values($key) ) {
498 $intersection->{$trait}++ if $nodev->{$trait};
501 $self->fitch_down($tree, $child);
502 $child->set_tag_value($key, keys %$intersection);
504 return 1; # success
508 =head2 persistence
510 Example : persistence($tree, $node);
511 Description: Calculates the persistence
512 for node in the subtree defined by the (internal)
513 node. Node defaults to the root.
514 Returns : int, number of generations trait value has to remain same
515 Exceptions : all the nodes need to have the trait defined
516 Args : 1. Bio::Tree::TreeI object
517 2. Bio::Tree::NodeI object within the tree, optional
519 Persistence measures the stability that the trait value has in a
520 tree. It expresses the number of generations the trait value remains
521 the same. All the decendants of the root in the same generation have
522 to share the same value.
524 Depends on Fitch's parsimony score (PS).
526 =cut
528 sub _persistence {
529 my $self = shift;
530 my $tree = shift;
531 my $node = shift;
532 my $value = shift || $self->throw("Value is needed");
535 my $key = 'ps_trait';
537 $self->throw("Node is needed") unless $node->isa('Bio::Tree::NodeI');
539 return 0 unless $node->get_tag_values($key) eq $value; # wrong value
540 return 1 if $node->is_Leaf; # end of recursion
542 my $persistence = 10000000; # an arbitrarily large number
543 foreach my $child ($node->each_Descendent) {
544 my $pers = $self->_persistence($tree, $child, $value);
545 $persistence = $pers if $pers < $persistence;
547 return $persistence + 1;
550 sub persistence {
551 my $self = shift;
552 my $tree = shift;
553 my $node = shift || $tree->get_root_node;
554 $self->throw("Node is needed") unless $node->isa('Bio::Tree::NodeI');
556 my $key = 'ps_trait';
557 my $value = $node->get_tag_values($key);
559 #calculate
560 my $persistence = $self->_persistence($tree, $node, $value);
561 $node->set_tag_value('persistance', $persistence);
562 return $persistence;
566 =head2 count_subclusters
568 Example : count_clusters($tree, $node);
569 Description: Calculates the number of sub-clusters
570 in the subtree defined by the (internal)
571 node. Node defaults to the root.
572 Returns : int, count
573 Exceptions : all the nodes need to have the trait defined
574 Args : 1. Bio::Tree::TreeI object
575 2. Bio::Tree::NodeI object within the tree, optional
577 Depends on Fitch's parsimony score (PS).
579 =cut
581 sub _count_subclusters {
582 my $self = shift;
583 my $tree = shift;
584 my $node = shift;
585 my $value = shift || $self->throw("Value is needed");
587 my $key = 'ps_trait';
589 $self->throw ("ERROR: ". $node->internal_id. " needs a value for trait $key")
590 unless $node->has_tag($key);
592 if ($node->get_tag_values($key) eq $value) {
593 if ($node->get_tag_values('ps_score') == 0) {
594 return 0;
595 } else {
596 my $count = 0;
597 foreach my $child ($node->each_Descendent) {
598 $count += $self->_count_subclusters($tree, $child, $value);
600 return $count;
603 return 1;
606 sub count_subclusters {
607 my $self = shift;
608 my $tree = shift;
609 my $node = shift || $tree->get_root_node;
610 $self->throw("Node is needed") unless $node->isa('Bio::Tree::NodeI');
612 my $key = 'ps_trait';
613 my $value = $node->get_tag_values($key);
615 return $self->_count_subclusters($tree, $node, $value);
619 =head2 count_leaves
621 Example : count_leaves($tree, $node);
622 Description: Calculates the number of leaves with same trait
623 value as root in the subtree defined by the (internal)
624 node. Requires an unbroken line of identical trait values.
625 Node defaults to the root.
626 Returns : int, number of leaves with this trait value
627 Exceptions : all the nodes need to have the trait defined
628 Args : 1. Bio::Tree::TreeI object
629 2. Bio::Tree::NodeI object within the tree, optional
631 Depends on Fitch's parsimony score (PS).
633 =cut
635 sub _count_leaves {
636 my $self = shift;
637 my $tree = shift;
638 my $node = shift || $tree->get_root_node;
639 my $value = shift;
641 my $key = 'ps_trait';
643 $self->throw ("ERROR: ". $node->internal_id. " needs a value for trait $key")
644 unless $node->has_tag($key);
646 if ($node->get_tag_values($key) eq $value) {
647 #print $node->id, ": ", $node->get_tag_values($key), "\n";
648 return 1 if $node->is_Leaf; # end of recursion
650 my $count = 0;
651 foreach my $child ($node->each_Descendent) {
652 $count += $self->_count_leaves($tree, $child, $value);
654 return $count;
656 return 0;
659 sub count_leaves {
660 my $self = shift;
661 my $tree = shift;
662 my $node = shift || $tree->get_root_node;
663 $self->throw("Node is needed") unless $node->isa('Bio::Tree::NodeI');
665 my $key = 'ps_trait';
666 my $value = $node->get_tag_values($key);
668 return $self->_count_leaves($tree, $node, $value);
672 =head2 phylotype_length
674 Example : phylotype_length($tree, $node);
675 Description: Sums up the branch lengths within phylotype
676 excluding the subclusters where the trait values
677 are different
678 Returns : float, length
679 Exceptions : all the nodes need to have the trait defined
680 Args : 1. Bio::Tree::TreeI object
681 2. Bio::Tree::NodeI object within the tree, optional
683 Depends on Fitch's parsimony score (PS).
685 =cut
687 sub _phylotype_length {
688 my $self = shift;
689 my $tree = shift;
690 my $node = shift;
691 my $value = shift;
693 my $key = 'ps_trait';
695 $self->throw ("ERROR: ". $node->internal_id. " needs a value for trait $key")
696 unless $node->has_tag($key);
698 return 0 if $node->get_tag_values($key) ne $value;
699 return $node->branch_length if $node->is_Leaf; # end of recursion
701 my $length = 0;
702 foreach my $child ($node->each_Descendent) {
703 my $sub_len = $self->_phylotype_length($tree, $child, $value);
704 $length += $sub_len;
705 $length += $child->branch_length if not $child->is_Leaf and $sub_len;
707 return $length;
710 sub phylotype_length {
711 my $self = shift;
712 my $tree = shift;
713 my $node = shift || $tree->get_root_node;
715 my $key = 'ps_trait';
716 my $value = $node->get_tag_values($key);
718 return $self->_phylotype_length($tree, $node, $value);
722 =head2 sum_of_leaf_distances
724 Example : sum_of_leaf_distances($tree, $node);
725 Description: Sums up the branch lengths from root to leaf
726 excluding the subclusters where the trait values
727 are different
728 Returns : float, length
729 Exceptions : all the nodes need to have the trait defined
730 Args : 1. Bio::Tree::TreeI object
731 2. Bio::Tree::NodeI object within the tree, optional
733 Depends on Fitch's parsimony score (PS).
735 =cut
737 sub _sum_of_leaf_distances {
738 my $self = shift;
739 my $tree = shift;
740 my $node = shift;
741 my $value = shift;
743 my $key = 'ps_trait';
745 $self->throw ("ERROR: ". $node->internal_id. " needs a value for trait $key")
746 unless $node->has_tag($key);
747 return 0 if $node->get_tag_values($key) ne $value;
748 #return $node->branch_length if $node->is_Leaf; # end of recursion
749 return 0 if $node->is_Leaf; # end of recursion
751 my $length = 0;
752 foreach my $child ($node->each_Descendent) {
753 $length += $self->_count_leaves($tree, $child, $value) * $child->branch_length +
754 $self->_sum_of_leaf_distances($tree, $child, $value);
756 return $length;
759 sub sum_of_leaf_distances {
760 my $self = shift;
761 my $tree = shift;
762 my $node = shift || $tree->get_root_node;
764 my $key = 'ps_trait';
765 my $value = $node->get_tag_values($key);
767 return $self->_sum_of_leaf_distances($tree, $node, $value);
771 =head2 genetic_diversity
773 Example : genetic_diversity($tree, $node);
774 Description: Diversity is the sum of root to leaf distances
775 within the phylotype normalised by number of leaf
776 nodes
777 Returns : float, value of genetic diversity
778 Exceptions : all the nodes need to have the trait defined
779 Args : 1. Bio::Tree::TreeI object
780 2. Bio::Tree::NodeI object within the tree, optional
782 Depends on Fitch's parsimony score (PS).
784 =cut
786 sub genetic_diversity {
787 my $self = shift;
788 my $tree = shift;
789 my $node = shift || $tree->get_root_node;
791 return $self->sum_of_leaf_distances($tree, $node) /
792 $self->count_leaves($tree, $node);
796 =head2 statratio
798 Example : statratio($tree, $node);
799 Description: Ratio of the stem length and the genetic diversity of the
800 phylotype L<genetic_diversity>
801 Returns : float, separation score
802 Exceptions : all the nodes need to have the trait defined
803 Args : 1. Bio::Tree::TreeI object
804 2. Bio::Tree::NodeI object within the tree, optional
806 Statratio gives a measure of separation and variability within the phylotype.
807 Larger values identify more rapidly evolving and recent phylotypes.
809 Depends on Fitch's parsimony score (PS).
811 =cut
813 sub statratio {
814 my $self = shift;
815 my $tree = shift;
816 my $node = shift || $tree->get_root_node;
818 my $div = $self->genetic_diversity($tree, $node);
819 return 0 if $div == 0;
820 return $node->branch_length / $div;
825 =head2 ai
827 Example : ai($tree, $key, $node);
828 Description: Calculates the Association Index (AI) of Whang et
829 al. 2001 for the subtree defined by the (internal)
830 node. Node defaults to the root.
831 Returns : real
832 Exceptions : leaf nodes have to have the trait defined
833 Args : 1. Bio::Tree::TreeI object
834 2. trait name string
835 3. Bio::Tree::NodeI object within the tree, optional
837 Association index (AI) gives a more fine grained results than PS since
838 the result is a real number. ~0 E<lt>= AI.
840 Wang, T.H., Donaldson, Y.K., Brettle, R.P., Bell, J.E., Simmonds, P.,
841 2001. Identification of shared populations of human immunodeficiency
842 Virus Type 1 infecting microglia and tissue macrophages outside the
843 central nervous system. J. Virol. 75 (23), 11686-11699.
845 =cut
847 sub _node_ai {
848 my $self = shift;
849 my $node = shift;
850 my $key = shift;
852 my $traits;
853 my $leaf_count = 0;
854 for my $desc ( $node->get_all_Descendents ) {
855 next unless $desc->is_Leaf;
856 $leaf_count++;
857 $self->throw ("Node ". $desc->id. " needs a value for trait [$key]")
858 unless $desc->has_tag($key);
859 my $trait = $desc->get_tag_values($key);
860 $traits->{$trait}++;
862 my $most_common = 0;
863 foreach ( keys %$traits) {
864 $most_common = $traits->{$_} if $traits->{$_} > $most_common;
866 return sprintf "%1.6f", (1 - ($most_common/$leaf_count) ) / (2**($leaf_count-1) );
869 sub ai {
870 my $self = shift;
871 my $tree = shift;
872 my $key = shift || $self->throw("Trait name is needed");
873 my $start_node = shift || $tree->get_root_node;
874 return unless $start_node;
876 my $sum = 0;
877 for my $node ( $start_node->get_all_Descendents ) {
878 next if $node->is_Leaf;
879 $sum += $self->_node_ai($node, $key);
881 return $sum;
885 =head2 mc
887 Example : mc($tree, $key, $node);
888 Description: Calculates the Monophyletic Clade (MC) size statistics
889 for the subtree a defined by the (internal) node.
890 Node defaults to the root;
891 Returns : hashref with trait values as keys
892 Exceptions : leaf nodes have to have the trait defined
893 Args : 1. Bio::Tree::TreeI object
894 2. trait name string
895 3. Bio::Tree::NodeI object within the tree, optional
897 Monophyletic Clade (MC) size statistics by Salemi at al 2005. It is
898 calculated for each trait value. 1 E<lt>= MC E<lt>= nx, where nx is the
899 number of tips with value x:
901 pick the internal node with maximim value for
902 number of of tips with only trait x
904 MC was defined by Parker et al 2008.
906 Salemi, M., Lamers, S.L., Yu, S., de Oliveira, T., Fitch, W.M., McGrath, M.S.,
907 2005. Phylodynamic analysis of Human Immunodeficiency Virus Type 1 in
908 distinct brain compartments provides a model for the neuropathogenesis of
909 AIDS. J. Virol. 79 (17), 11343-11352.
911 Parker, J., Rambaut A., Pybus O., 2008. Correlating viral phenotypes
912 with phylogeny: Accounting for phylogenetic uncertainty Infection,
913 Genetics and Evolution 8 (2008), 239-246.
915 =cut
917 sub _node_mc {
918 my $self = shift;
919 my $node = shift;
920 my $key = shift;
922 my $traits;
923 my $leaf_count = 0;
924 for my $node2 ( $node->get_all_Descendents ) {
925 next unless $node2->is_Leaf;
926 $leaf_count++;
927 my $trait = $node2->get_tag_values($key);
928 $traits->{$trait}++;
930 return $traits;
933 sub mc {
934 my $self = shift;
935 my $tree = shift;
936 my $key = shift || die "Trait name is needed";
937 my $start_node = shift || $tree->get_root_node;
938 return unless $start_node;
940 my $sum; # hashref, keys are trait values
941 my $keys; # hashref, keys are trait values
942 foreach my $node ( $start_node->get_all_Descendents ) {
943 next if $node->is_Leaf;
944 my $traits = $self->_node_mc($node, $key);
945 if (scalar keys %$traits == 1) {
946 my ($value) = keys %$traits;
947 no warnings;
948 $sum->{$value} = $traits->{$value}
949 if $sum->{$value} < $traits->{$value};
950 } else {
951 map { $keys->{$_} = 1 } keys %$traits;
954 # check for cases where there are no clusters
955 foreach my $value (keys %$keys) {
956 $sum->{$value} = 1 unless defined $sum->{$value};
958 return $sum;