Bio::SeqIO::strider: move into its own distribution
[bioperl-live.git] / lib / Bio / Tree / Compatible.pm
blob9983373111b7f334999c37816c041d89e47f6eb8
2 # BioPerl module for Bio::Tree::Compatible
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Gabriel Valiente <valiente@lsi.upc.edu>
8 # Copyright Gabriel Valiente
10 # You may distribute this module under the same terms as Perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tree::Compatible - Testing compatibility of phylogenetic trees
17 with nested taxa.
19 =head1 SYNOPSIS
21 use Bio::Tree::Compatible;
22 use Bio::TreeIO;
23 my $input = Bio::TreeIO->new('-format' => 'newick',
24 '-file' => 'input.tre');
25 my $t1 = $input->next_tree;
26 my $t2 = $input->next_tree;
28 my ($incompat, $ilabels, $inodes) = Bio::Tree::Compatible::is_compatible($t1,$t2);
29 if ($incompat) {
30 my %cluster1 = %{ Bio::Tree::Compatible::cluster_representation($t1) };
31 my %cluster2 = %{ Bio::Tree::Compatible::cluster_representation($t2) };
32 print "incompatible trees\n";
33 if (scalar(@$ilabels)) {
34 foreach my $label (@$ilabels) {
35 my $node1 = $t1->find_node(-id => $label);
36 my $node2 = $t2->find_node(-id => $label);
37 my @c1 = sort @{ $cluster1{$node1} };
38 my @c2 = sort @{ $cluster2{$node2} };
39 print "label $label";
40 print " cluster"; map { print " ",$_ } @c1;
41 print " cluster"; map { print " ",$_ } @c2; print "\n";
44 if (scalar(@$inodes)) {
45 while (@$inodes) {
46 my $node1 = shift @$inodes;
47 my $node2 = shift @$inodes;
48 my @c1 = sort @{ $cluster1{$node1} };
49 my @c2 = sort @{ $cluster2{$node2} };
50 print "cluster"; map { print " ",$_ } @c1;
51 print " properly intersects cluster";
52 map { print " ",$_ } @c2; print "\n";
55 } else {
56 print "compatible trees\n";
59 =head1 DESCRIPTION
61 NB: This module has exclusively class methods that work on Bio::Tree::TreeI
62 objects. An instance of Bio::Tree::Compatible cannot itself represent a tree,
63 and so typically there is no need to create one.
65 Bio::Tree::Compatible is a Perl tool for testing compatibility of
66 phylogenetic trees with nested taxa represented as Bio::Tree::Tree
67 objects. It is based on a recent characterization of ancestral
68 compatibility of semi-labeled trees in terms of their cluster
69 representations.
71 A semi-labeled tree is a phylogenetic tree with some of its internal
72 nodes labeled, and it can represent a classification tree as well as a
73 phylogenetic tree with nested taxa, with labeled internal nodes
74 corresponding to taxa at a higher level of aggregation or nesting than
75 that of their descendents.
77 Two semi-labeled trees are compatible if their topological
78 restrictions to the common labels are such that for each node label,
79 the smallest clusters containing it in each of the trees coincide and,
80 furthermore, no cluster in one of the trees properly intersects a
81 cluster of the other tree.
83 Future extensions of Bio::Tree::Compatible include a
84 Bio::Tree::Supertree module for combining compatible phylogenetic
85 trees with nested taxa into a common supertree.
87 =head1 FEEDBACK
89 =head2 Mailing Lists
91 User feedback is an integral part of the evolution of this and other
92 Bioperl modules. Send your comments and suggestions preferably to the
93 Bioperl mailing list. Your participation is much appreciated.
95 bioperl-l@bioperl.org - General discussion
96 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
98 =head2 Support
100 Please direct usage questions or support issues to the mailing list:
102 I<bioperl-l@bioperl.org>
104 rather than to the module maintainer directly. Many experienced and
105 reponsive experts will be able look at the problem and quickly
106 address it. Please include a thorough description of the problem
107 with code and data examples if at all possible.
109 =head2 Reporting Bugs
111 Report bugs to the Bioperl bug tracking system to help us keep track
112 of the bugs and their resolution. Bug reports can be submitted via the
113 web:
115 https://github.com/bioperl/bioperl-live/issues
117 =head1 SEE ALSO
119 =over
121 =item * Philip Daniel and Charles Semple. Supertree Algorithms for
122 Nested Taxa. In: Olaf R. P. Bininda-Emonds (ed.) Phylogenetic
123 Supertrees: Combining Information to Reveal the Tree of Life,
124 I<Computational Biology>, vol. 4, chap. 7, pp. 151-171. Kluwer (2004).
126 =item * Charles Semple, Philip Daniel, Wim Hordijk, Roderic
127 D. M. Page, and Mike Steel: Supertree Algorithms for Ancestral
128 Divergence Dates and Nested Taxa. Bioinformatics B<20>(15), 2355-2360
129 (2004).
131 =item * Merce Llabres, Jairo Rocha, Francesc Rossello, and Gabriel
132 Valiente: On the Ancestral Compatibility of Two Phylogenetic Trees
133 with Nested Taxa. J. Math. Biol. B<53>(3), 340-364 (2006).
135 =back
137 =head1 AUTHOR - Gabriel Valiente
139 Email valiente@lsi.upc.edu
141 =head1 APPENDIX
143 The rest of the documentation details each of the object methods.
145 =cut
147 package Bio::Tree::Compatible;
148 use strict;
150 # Object preamble - inherits from Bio::Root::Root
152 use Set::Scalar;
154 use base qw(Bio::Root::Root);
156 =head2 postorder_traversal
158 Title : postorder_traversal
159 Usage : my @nodes = @{ $tree->postorder_traversal }
160 Function: Return list of nodes in postorder
161 Returns : reference to array of Bio::Tree::Node
162 Args : none
164 For example, the postorder traversal of the tree
165 C<(((A,B)C,D),(E,F,G));> is a reference to an array of nodes with
166 internal_id 0 through 9, because the Newick standard representation
167 for phylogenetic trees is based on a postorder traversal.
169 +---A +---0
171 +---+---C +---4---2
172 | | | | | |
173 | | +---B | | +---1
174 | | | |
175 + +-------D 9 +-------3
177 | +-----E | +-----5
178 | | | |
179 +-----+-----F +-----8-----6
181 +-----G +-----7
183 =cut
185 sub postorder_traversal {
186 my($self) = @_;
187 my @stack;
188 my @queue;
189 push @stack, $self->get_root_node;
190 while (@stack) {
191 my $node = pop @stack;
192 push @queue, $node;
193 foreach my $child ($node->each_Descendent(-sortby => 'internal_id')) {
194 push @stack, $child;
197 my @postorder = reverse @queue;
198 return \@postorder;
201 =head2 cluster_representation
203 Title : cluster_representation
204 Usage : my %cluster = %{ $tree->cluster_representation }
205 Function: Compute the cluster representation of a tree
206 Returns : reference to hash of array of string indexed by
207 Bio::Tree::Node
208 Args : none
210 For example, the cluster representation of the tree
211 C<(((A,B)C,D),(E,F,G));> is a reference to a hash associating an array
212 of string (descendent labels) to each node, as follows:
214 0 --> [A]
215 1 --> [B]
216 2 --> [A,B,C]
217 3 --> [D]
218 4 --> [A,B,C,D]
219 5 --> [E]
220 6 --> [F]
221 7 --> [G]
222 8 --> [E,F,G]
223 9 --> [A,B,C,D,E,F,G]
225 =cut
227 sub cluster_representation {
228 my ($tree) = @_;
229 my %cluster;
230 my @postorder = @{ postorder_traversal($tree) };
231 foreach my $node ( @postorder ) {
232 my @labeled = map { $_->id } grep { $_->id } $node->get_Descendents;
233 push @labeled, $node->id if $node->id;
234 $cluster{$node} = \@labeled;
236 return \%cluster;
239 =head2 common_labels
241 Title : common_labels
242 Usage : my $labels = $tree1->common_labels($tree2);
243 Function: Return set of common node labels
244 Returns : Set::Scalar
245 Args : Bio::Tree::Tree
247 For example, the common labels of the tree C<(((A,B)C,D),(E,F,G));>
248 and the tree C<((A,B)H,E,(J,(K)G)I);> are: C<[A,B,E,G]>.
250 +---A +---A
252 +---+---C +-------H
253 | | | | |
254 | | +---B | +---B
255 | | |
256 + +-------D +-----------E
258 | +-----E | +-------J
259 | | | |
260 +-----+-----F +---I
262 +-----G +---G---K
264 =cut
266 sub common_labels {
267 my($self,$arg) = @_;
268 my @labels1 = map { $_->id } grep { $_->id } $self->get_nodes;
269 my $common = Set::Scalar->new( @labels1 );
270 my @labels2 = map { $_->id } grep { $_->id } $arg->get_nodes;
271 my $temp = Set::Scalar->new( @labels2 );
272 return $common->intersection($temp);
275 =head2 topological_restriction
277 Title : topological_restriction
278 Usage : $tree->topological_restriction($labels)
279 Function: Compute the topological restriction of a tree to a subset
280 of node labels
281 Returns : Bio::Tree::Tree
282 Args : Set::Scalar
284 For example, the topological restrictions of each of the trees
285 C<(((A,B)C,D),(E,F,G));> and C<((A,B)H,E,(J,(K)G)I);> to the labels
286 C<[A,B,E,G]> are as follows:
288 +---A +---A
290 +---+---+ +---+
291 | | | |
292 | +---B | +---B
294 | +---E +-------E
295 | | |
296 +-------+ +---+---G
298 +---G
300 =cut
302 sub topological_restriction {
303 my ($tree, $labels) = @_;
304 for my $node ( @{ postorder_traversal($tree) } ) {
305 unless (ref($node)) { # skip $node if already removed
306 my @cluster = map { $_->id } grep { $_->id } $node->get_Descendents;
307 push @cluster, $node->id if $node->id;
308 my $cluster = Set::Scalar->new(@cluster);
309 if ($cluster->is_disjoint($labels)) {
310 $tree->remove_Node($node);
311 } else {
312 if ($node->id and not $labels->has($node->id)) {
313 $node->{'_id'} = undef;
320 =head2 is_compatible
322 Title : is_compatible
323 Usage : $tree1->is_compatible($tree2)
324 Function: Test compatibility of two trees
325 Returns : boolean
326 Args : Bio::Tree::Tree
328 For example, the topological restrictions of the trees
329 C<(((A,B)C,D),(E,F,G));> and C<((A,B)H,E,(J,(K)G)I);> to their common
330 labels, C<[A,B,E,G]>, are compatible. The respective cluster
331 representations are as follows:
333 [A] [A]
334 [B] [B]
335 [E] [E]
336 [G] [G]
337 [A,B] [A,B]
338 [E,G] [A,B,E,G]
339 [A,B,E,G]
341 As a second example, the trees C<(A,B);> and C<((B)A);> are
342 incompatible. Their respective cluster representations are as follows:
344 [A] [B]
345 [B] [A,B]
346 [A,B]
348 The reason is, the smallest cluster containing label C<A> is C<[A]> in
349 the first tree but C<[A,B]> in the second tree.
351 +---A A---B
355 +---B
357 As a second example, the trees C<(((B,A),C),D);> and C<((A,(D,B)),C);>
358 are also incompatible. Their respective cluster representations are as
359 follows:
361 [A] [A]
362 [B] [B]
363 [C] [C]
364 [D] [D]
365 [A,B] [B,D]
366 [A,B,C] [A,B,D]
367 [A,B,C,D] [A,B,C,D]
369 The reason is, cluster C<[A,B]> properly intersects cluster
370 C<[B,D]>. There are further incompatibilities between these trees:
371 C<[A,B,C]> properly intersects both C<[B,D]> and C<[A,B,D]>.
373 +---B +-------A
375 +---+ +---+ +---D
376 | | | | |
377 +---+ +---A | +---+
378 | | + |
379 + +-------C | +---B
381 +-----------D +-----------C
383 =cut
385 sub is_compatible {
386 my ($tree1, $tree2) = @_;
387 my $common = $tree1->Bio::Tree::Compatible::common_labels($tree2);
388 $tree1->Bio::Tree::Compatible::topological_restriction($common);
389 $tree2->Bio::Tree::Compatible::topological_restriction($common);
390 my @postorder1 = @{ postorder_traversal($tree1) };
391 my @postorder2 = @{ postorder_traversal($tree2) };
392 my %cluster1 = %{ cluster_representation($tree1) };
393 my %cluster2 = %{ cluster_representation($tree2) };
394 my $incompat = 0; # false
395 my @labels;
396 foreach my $label ( $common->elements ) {
397 my $node1 = $tree1->find_node(-id => $label);
398 my @labels1 = @{ $cluster1{$node1} };
399 my $cluster1 = Set::Scalar->new(@labels1);
400 my $node2 = $tree2->find_node(-id => $label);
401 my @labels2 = @{ $cluster2{$node2} };
402 my $cluster2 = Set::Scalar->new(@labels2);
403 unless ( $cluster1->is_equal($cluster2) ) {
404 $incompat = 1; # true
405 push @labels, $label;
408 my @nodes;
409 foreach my $node1 ( @postorder1 ) {
410 my @labels1 = @{ $cluster1{$node1} };
411 my $cluster1 = Set::Scalar->new(@labels1);
412 foreach my $node2 ( @postorder2 ) {
413 my @labels2 = @{$cluster2{$node2} };
414 my $cluster2 = Set::Scalar->new(@labels2);
415 if ($cluster1->is_properly_intersecting($cluster2)) {
416 $incompat = 1; # true
417 push @nodes, $node1, $node2;
421 return ($incompat, \@labels, \@nodes);