2 # BioPerl module for Bio::Tree::DistanceFactory
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Tree::DistanceFactory - Construct a tree using distance based methods
20 use Bio::Tree::DistanceFactory;
22 use Bio::Align::DNAStatistics;
23 my $tfactory = Bio::Tree::DistanceFactory->new(-method => "NJ");
24 my $stats = Bio::Align::DNAStatistics->new();
26 my $alnin = Bio::AlignIO->new(-format => 'clustalw',
28 my $aln = $alnin->next_aln;
29 # Of course matrix can come from a different place
30 # like PHYLIP if you prefer, Bio::Matrix::IO should be able
31 # to parse many things
32 my $jcmatrix = $stats->distance(-align => $aln,
33 -method => 'Jukes-Cantor');
34 my $tree = $tfactory->make_tree($jcmatrix);
39 This is a factory which will construct a phylogenetic tree based on
40 the pairwise sequence distances for a set of sequences. Currently
41 UPGMA (Sokal and Michener 1958) and NJ (Saitou and Nei 1987) tree
42 construction methods are implemented.
46 Eddy SR, Durbin R, Krogh A, Mitchison G, (1998) "Biological Sequence Analysis",
47 Cambridge Univ Press, Cambridge, UK.
49 Howe K, Bateman A, Durbin R, (2002) "QuickTree: building huge
50 Neighbour-Joining trees of protein sequences." Bioinformatics
53 Saitou N and Nei M, (1987) "The neighbor-joining method: a new method
54 for reconstructing phylogenetic trees." Mol Biol Evol 4(4):406-25.
60 User feedback is an integral part of the evolution of this and other
61 Bioperl modules. Send your comments and suggestions preferably to
62 the Bioperl mailing list. Your participation is much appreciated.
64 bioperl-l@bioperl.org - General discussion
65 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
69 Please direct usage questions or support issues to the mailing list:
71 I<bioperl-l@bioperl.org>
73 rather than to the module maintainer directly. Many experienced and
74 reponsive experts will be able look at the problem and quickly
75 address it. Please include a thorough description of the problem
76 with code and data examples if at all possible.
80 Report bugs to the Bioperl bug tracking system to help us keep track
81 of the bugs and their resolution. Bug reports can be submitted the web:
83 https://github.com/bioperl/bioperl-live/issues
85 =head1 AUTHOR - Jason Stajich
87 Email jason-at-bioperl.org
91 The rest of the documentation details each of the object methods.
92 Internal methods are usually preceded with a _
96 package Bio
::Tree
::DistanceFactory
;
97 use vars
qw($DefaultMethod $Precision);
101 $DefaultMethod = 'UPGMA';
107 use base qw(Bio::Root::Root);
112 Usage : my $obj = Bio::Tree::DistanceFactory->new();
113 Function: Builds a new Bio::Tree::DistanceFactory object
114 Returns : an instance of Bio::Tree::DistanceFactory
115 Args : -method => 'NJ' or 'UPGMA'
121 my($class,@args) = @_;
122 my $self = $class->SUPER::new
(@args);
124 my ($method) = $self->_rearrange([qw(METHOD)],
126 $self->method($method || $DefaultMethod);
133 Usage : my $tree = $disttreefact->make_tree($matrix);
134 Function: Build a Tree based on a distance matrix
135 Returns : L<Bio::Tree::TreeI>
136 Args : L<Bio::Matrix::MatrixI> object
142 my ($self,$matrix) = @_;
143 if( ! defined $matrix || !ref($matrix) ||
144 ! $matrix->isa('Bio::Matrix::MatrixI') ) {
145 $self->warn("Need to provide a valid Bio::Matrix::MatrixI object to make_tree");
149 my $method = uc ($self->method);
150 if( $method =~ /NJ/i ) {
151 return $self->_nj($matrix);
152 } elsif( $method =~ /UPGMA/i ) {
153 return $self->_upgma($matrix);
155 $self->warn("Unknown tree construction method '$method'. Cannot run.");
165 Usage : my $tree = $disttreefact->_nj($matrix);
166 Function: Construct a tree based on distance matrix using the
167 Neighbor Joining algorithm (Saitou and Nei, 1987)
168 Implementation based on Kevin Howe's Quicktree implementation
169 and uses his tricks (some based on Bill Bruno's work) to eliminate
170 negative branch lengths
171 Returns : L<Bio::Tree::TreeI>
172 Args : L<Bio::Matrix::MatrixI> object
177 my ($self,$distmat) = @_;
179 # we assume type checking of $aln has already been done
180 # client shouldn't be calling this directly anyways, using the
181 # make_tree method is preferred
183 # so that we can trim the number of digits shown as the branch length
184 my $precisionstr = "%.$Precision"."f";
186 my @names = $distmat->column_names;
187 my $N = scalar @names;
188 my ($i,$j,$m,@nodes,$mat,@r);
192 $self->warn("Can only perform NJ treebuilding on sets of 2 or more species\n");
196 my $d = sprintf($precisionstr,
197 $distmat->get_entry($names[0],$names[1]) / 2);
198 my $root = Bio
::Tree
::Node
->new();
199 for my $nm ( @names ) {
200 $root->add_Descendent( Bio
::Tree
::Node
->new(-id
=> $nm,
201 -branch_length
=> $d));
203 return Bio
::Tree
::Tree
->new(-root
=> $root);
207 for ( $i = 0; $i < $N; $i++ ) {
208 push @nodes, Bio
::Tree
::Node
->new(-id
=> $names[$i]);
210 for( $j = 0; $j < $N; $j++ ) {
211 $mat->[$i][$j] = $distmat->get_entry($names[$i],$names[$j]);
212 $ri += $mat->[$i][$j];
214 $r[$i] = $ri / ($L -2);
217 for( my $nodecount = 0; $nodecount < $N-3; $nodecount++) {
218 my ($mini,$minj,$min);
219 for($i = 0; $i < $N; $i++ ) {
220 next unless defined $nodes[$i];
221 for( $j = 0; $j < $i; $j++ ) {
222 next unless defined $nodes[$j];
223 my $dist = $mat->[$i][$j] - ($r[$i] + $r[$j]);
224 if( ! defined $min ||
226 ($mini,$minj,$min) = ($i,$j,$dist);
230 my $dij = $mat->[$mini][$minj];
231 my $dist_i = ($dij + $r[$mini] - $r[$minj]) / 2;
232 my $dist_j = $dij - $dist_i;
234 # deal with negative branch lengths
235 # per code in K.Howe's quicktree
239 $dist_j = 0 if( $dist_j < 0 );
240 } elsif( $dist_j < 0 ) {
243 $dist_i = 0 if( $dist_i < 0 );
246 $nodes[$mini]->branch_length(sprintf($precisionstr,$dist_i));
247 $nodes[$minj]->branch_length(sprintf($precisionstr,$dist_j));
249 my $newnode = Bio
::Tree
::Node
->new(-descendents
=> [ $nodes[$mini],
252 $nodes[$mini] = $newnode;
253 delete $nodes[$minj];
255 # update the distance matrix
258 for( $m = 0; $m < $N; $m++ ) {
259 next unless defined $nodes[$m];
261 $dmj = $mat->[$m][$minj];
264 ($row,$col) = ($m,$mini);
265 $dmi = $mat->[$row][$col];
267 # from K.Howe's notes in quicktree
268 # we can actually adjust r[m] here, by using the form:
269 # rm = ((rm * numseqs) - dmi - dmj + dmk) / (numseqs-1)
271 # Note: in Bill Bruno's method for negative branch
272 # elimination, then if either dist_i is positive and
273 # dist_j is 0, or dist_i is zero and dist_j is positive
274 # (after adjustment) then the matrix entry is formed
275 # from the distance to the node in question (m) to the
276 # node with the zero branch length (whichever it was).
277 # I think my code already has the same effect; this is
278 # certainly true if dij is equal to dist_i + dist_j,
279 # which it should have been fixed to
281 my $dmk = $mat->[$row][$col] = $mat->[$col][$row] =
282 ($dmi + $dmj - $dij) / 2;
284 # If we don't want to try and correct negative brlens
285 # this is essentially what is in Edddy et al, BSA book.
286 # $r[$m] = (($r[$m] * $L) - $dmi - $dmj + $dmk) / ($L-1);
288 $r[$m] = (($r[$m] * ($L - 2)) - $dmi - $dmj +
289 $mat->[$row][$col]) / ( $L - 3);
297 # should be 3 nodes left
298 my (@leftovernodes,@leftovers);
299 for( my $k = 0; $k < $N; $k++ ) {
300 if( defined $nodes[$k] ) {
302 push @leftovernodes, $nodes[$k];
305 my ($l_0,$l_1,$l_2) = @leftovers;
307 my $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0] -
308 $mat->[$l_2][$l_1] ) / 2;
310 my $dist_j = ( $mat->[$l_1][$l_0] - $dist_i);
311 my $dist_k = ( $mat->[$l_2][$l_0] - $dist_i);
313 # This is Kev's code to get rid of negative branch lengths
316 $dist_j = $mat->[$l_1][$l_0];
317 $dist_k = $mat->[$l_2][$l_0];
320 $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1] ) / 2;
321 $dist_k = 0 if( $dist_k < 0 );
322 } elsif( $dist_k < 0 ) {
324 $dist_j = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_1]) / 2;
325 $dist_j = 0 if( $dist_j < 0 );
327 } elsif( $dist_j < 0 ) {
329 $dist_i = $mat->[$l_1][$l_0];
330 $dist_k = $mat->[$l_2][$l_1];
333 $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1]) / 2;
334 $dist_k = 0 if( $dist_k < 0 );
335 } elsif( $dist_k < 0 ) {
337 $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2;
338 $dist_i = 0 if( $dist_i < 0 );
340 } elsif( $dist_k < 0 ) {
342 $dist_i = $mat->[$l_2][$l_0];
343 $dist_j = $mat->[$l_2][$l_1];
346 $dist_j = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_1] ) / 2;
347 $dist_j = 0 if $dist_j < 0;
348 } elsif( $dist_j < 0 ) {
350 $dist_i = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2;
351 $dist_i = 0 if $dist_i < 0;
354 $leftovernodes[0]->branch_length(sprintf($precisionstr,$dist_i));
355 $leftovernodes[1]->branch_length(sprintf($precisionstr,$dist_j));
356 $leftovernodes[2]->branch_length(sprintf($precisionstr,$dist_k));
358 Bio
::Tree
::Tree
->new(-root
=> Bio
::Tree
::Node
->new
359 (-descendents
=> \
@leftovernodes));
365 Usage : my $tree = $disttreefact->_upgma($matrix);
366 Function: Construct a tree based on alignment using UPGMA
367 Returns : L<Bio::Tree::TreeI>
368 Args : L<Bio::Matrix::MatrixI> object
374 my ($self,$distmat) = @_;
375 # we assume type checking of $matrix has already been done
376 # client shouldn't be calling this directly anyways, using the
377 # make_tree method is preferred
379 # algorithm, from Eddy, Durbin, Krogh, Mitchison, 1998
380 # originally by Sokal and Michener 1956
382 my $precisionstr = "%.$Precision"."f";
384 my ($i,$j,$x,$y,@dmat,@orig,@nodes);
386 my @names = $distmat->column_names;
389 my $r = { 'id' => $c,
397 my $K = scalar @clusters;
399 for ( $i = 0; $i < $K; $i++ ) {
400 for( $j = $i+1; $j < $K; $j++ ) {
401 my $d = $distmat->get_entry($names[$i],$names[$j]);
402 # get Min here on first time around, save 1 cycle
403 $dmat[$j][$i] = $dmat[$i][$j] = $d;
404 $orig[$i][$j] = $orig[$j][$i] = $d;
405 if ( ! defined $min || $d <= $min ) {
406 if( defined $min && $min == $d ) {
415 # distance between each cluster is avg distance
416 # between pairs of sequences from each cluster
418 # fencepost - we already have found the $min
419 # so very first time loop is executed we can skip checking
420 unless( defined $min ) {
421 for($i = 0; $i < $K; $i++ ) {
422 for( $j = $i+1; $j < $K; $j++ ) {
423 my $dij = $dmat[$i][$j];
424 if( ! defined $min ||
437 # randomly break ties
438 ($x,$y) = @
{ $mins[int(rand(scalar @mins))] };
440 # now we are going to join clusters x and y, make a new cluster
442 my $node = Bio
::Tree
::Node
->new();
444 for my $cid ( $x,$y ) {
445 my $nid = $clusters[$cid]->{'id'};
446 if( ! defined $nodes[$nid] ) {
447 $nodes[$nid] = Bio
::Tree
::Node
->new(-id
=> $names[$nid]);
449 $nodes[$nid]->branch_length
450 (sprintf($precisionstr,$min/2 - $clusters[$cid]->{'height'}));
451 $node->add_Descendent($nodes[$nid]);
452 push @subids, @
{ $clusters[$cid]->{'contains'} };
454 my $cluster = { 'id' => $c++,
455 'height' => $min / 2,
456 'contains' => [@subids],
459 $K--; # we are going to drop the last node so go ahead and decrement K
460 $nodes[$cluster->{'id'}] = $node;
462 $clusters[$y] = $clusters[$K];
463 $dmat[$y] = $dmat[$K];
464 for ( $i = 0; $i < $K; $i++ ) {
465 $dmat[$i][$y] = $dmat[$y][$i];
468 delete $clusters[$K];
469 $clusters[$x] = $cluster;
470 # now recalculate @dmat
471 for( $i = 0; $i < $K; $i++ ) {
473 $dmat[$i][$x] = $dmat[$x][$i] =
474 &_upgma_distance
($clusters[$i],$clusters[$x],\
@orig);
479 # reset so next loop iteration
480 # we will find minimum distance
484 Bio
::Tree
::Tree
->new(-root
=> $nodes[-1]);
487 # calculate avg distance between clusters - be they
488 # single sequences or the combination of multiple seqences
489 # $cluster_i and $cluster_j are the clusters to operate on
490 # and $distances is a matrix (arrayref of arrayrefs) of pairwise
491 # differences indexed on the sequence ids -
492 # so $distances->[0][1] is the distance between sequences 0 and 1
494 sub _upgma_distance
{
495 my ($cluster_i, $cluster_j, $distances) = @_;
496 my $ilen = scalar @
{ $cluster_i->{'contains'} };
497 my $jlen = scalar @
{ $cluster_j->{'contains'} };
499 for( my $i = 0; $i < $ilen; $i++ ) {
500 my $i_id = $cluster_i->{'contains'}->[$i];
501 for( my $j = 0; $j < $jlen; $j++) {
502 my $j_id = $cluster_j->{'contains'}->[$j];
503 if( ! defined $distances->[$i_id][$j_id] ) {
504 warn("no value for $i_id $j_id\n");
506 $d += $distances->[$i_id][$j_id];
517 Usage : $obj->method($newval)
520 Returns : value of method (a scalar)
521 Args : on set, new value (a scalar or undef, optional)
528 return $self->{'_method'} = shift if @_;
529 return $self->{'_method'};
533 =head2 check_additivity
535 Title : check_additivity
536 Usage : if( $distance->check_additivity($matrix) ) {
538 Function : See if matrix obeys additivity principal
540 Args : Bio::Matrix::MatrixI
541 References: Based on a Java implementation by
542 Peter Sestoft, sestoft@dina.kvl.dk 1999-12-07 version 0.3
543 http://www.dina.kvl.dk/~sestoft/bsa.html
544 which in turn is based on algorithms described in
545 R. Durbin, S. Eddy, A. Krogh, G. Mitchison.
546 Biological Sequence Analysis CUP 1998, Chapter 7.
550 sub check_additivity
{
551 my ($self,$matrix) = @_;
552 my @names = $matrix->column_names;
553 my $len = scalar @names;
554 return unless $len >= 4;
555 # look at all sets of 4
556 for( my $i = 0; $i < $len; $i++ ) {
557 for( my $j = $i+1; $j< $len; $j++) {
558 for( my $k = $j+1; $k < $len; $k ++ ) {
559 for( my $m = $k +1; $m < $len; $m++ ) {
560 my $DijDkm = $matrix->get_entry($names[$i],$names[$j]) +
561 $matrix->get_entry($names[$k],$names[$m]);
562 my $DikDjm = $matrix->get_entry($names[$i],$names[$k]) +
563 $matrix->get_entry($names[$j],$names[$m]);
564 my $DimDjk = $matrix->get_entry($names[$i],$names[$m]) +
565 $matrix->get_entry($names[$j],$names[$k]);
566 if( !( ( $DijDkm == $DikDjm && $DijDkm >= $DimDjk)
567 || ( $DijDkm == $DimDjk && $DijDkm >= $DikDjm)
568 || ( $DikDjm == $DimDjk && $DikDjm >= $DijDkm) )) {