t/AlignIO/AlignIO.t: fix number of tests in plan (fixup c523e6bed866)
[bioperl-live.git] / Bio / Tree / DistanceFactory.pm
blob4a88a5cffe285de0c5d61cd4a6e0390d07fbbd4e
2 # BioPerl module for Bio::Tree::DistanceFactory
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Tree::DistanceFactory - Construct a tree using distance based methods
18 =head1 SYNOPSIS
20 use Bio::Tree::DistanceFactory;
21 use Bio::AlignIO;
22 use Bio::Align::DNAStatistics;
23 my $tfactory = Bio::Tree::DistanceFactory->new(-method => "NJ");
24 my $stats = Bio::Align::DNAStatistics->new();
26 my $alnin = Bio::AlignIO->new(-format => 'clustalw',
27 -file => 'file.aln');
28 my $aln = $alnin->next_aln;
29 # Of course matrix can come from a different place
30 # like PHYLIP if you prefer, Bio::Matrix::IO should be able
31 # to parse many things
32 my $jcmatrix = $stats->distance(-align => $aln,
33 -method => 'Jukes-Cantor');
34 my $tree = $tfactory->make_tree($jcmatrix);
37 =head1 DESCRIPTION
39 This is a factory which will construct a phylogenetic tree based on
40 the pairwise sequence distances for a set of sequences. Currently
41 UPGMA (Sokal and Michener 1958) and NJ (Saitou and Nei 1987) tree
42 construction methods are implemented.
44 =head1 REFERENCES
46 Eddy SR, Durbin R, Krogh A, Mitchison G, (1998) "Biological Sequence Analysis",
47 Cambridge Univ Press, Cambridge, UK.
49 Howe K, Bateman A, Durbin R, (2002) "QuickTree: building huge
50 Neighbour-Joining trees of protein sequences." Bioinformatics
51 18(11):1546-1547.
53 Saitou N and Nei M, (1987) "The neighbor-joining method: a new method
54 for reconstructing phylogenetic trees." Mol Biol Evol 4(4):406-25.
56 =head1 FEEDBACK
58 =head2 Mailing Lists
60 User feedback is an integral part of the evolution of this and other
61 Bioperl modules. Send your comments and suggestions preferably to
62 the Bioperl mailing list. Your participation is much appreciated.
64 bioperl-l@bioperl.org - General discussion
65 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
67 =head2 Support
69 Please direct usage questions or support issues to the mailing list:
71 I<bioperl-l@bioperl.org>
73 rather than to the module maintainer directly. Many experienced and
74 reponsive experts will be able look at the problem and quickly
75 address it. Please include a thorough description of the problem
76 with code and data examples if at all possible.
78 =head2 Reporting Bugs
80 Report bugs to the Bioperl bug tracking system to help us keep track
81 of the bugs and their resolution. Bug reports can be submitted the web:
83 https://github.com/bioperl/bioperl-live/issues
85 =head1 AUTHOR - Jason Stajich
87 Email jason-at-bioperl.org
89 =head1 APPENDIX
91 The rest of the documentation details each of the object methods.
92 Internal methods are usually preceded with a _
94 =cut
96 package Bio::Tree::DistanceFactory;
97 use vars qw($DefaultMethod $Precision);
98 use strict;
100 # some defaults
101 $DefaultMethod = 'UPGMA';
102 $Precision = 5;
104 use Bio::Tree::Node;
105 use Bio::Tree::Tree;
107 use base qw(Bio::Root::Root);
109 =head2 new
111 Title : new
112 Usage : my $obj = Bio::Tree::DistanceFactory->new();
113 Function: Builds a new Bio::Tree::DistanceFactory object
114 Returns : an instance of Bio::Tree::DistanceFactory
115 Args : -method => 'NJ' or 'UPGMA'
118 =cut
120 sub new {
121 my($class,@args) = @_;
122 my $self = $class->SUPER::new(@args);
124 my ($method) = $self->_rearrange([qw(METHOD)],
125 @args);
126 $self->method($method || $DefaultMethod);
127 return $self;
130 =head2 make_tree
132 Title : make_tree
133 Usage : my $tree = $disttreefact->make_tree($matrix);
134 Function: Build a Tree based on a distance matrix
135 Returns : L<Bio::Tree::TreeI>
136 Args : L<Bio::Matrix::MatrixI> object
139 =cut
141 sub make_tree{
142 my ($self,$matrix) = @_;
143 if( ! defined $matrix || !ref($matrix) ||
144 ! $matrix->isa('Bio::Matrix::MatrixI') ) {
145 $self->warn("Need to provide a valid Bio::Matrix::MatrixI object to make_tree");
146 return;
149 my $method = uc ($self->method);
150 if( $method =~ /NJ/i ) {
151 return $self->_nj($matrix);
152 } elsif( $method =~ /UPGMA/i ) {
153 return $self->_upgma($matrix);
154 } else {
155 $self->warn("Unknown tree construction method '$method'. Cannot run.");
156 return;
162 =head2 _nj
164 Title : _nj
165 Usage : my $tree = $disttreefact->_nj($matrix);
166 Function: Construct a tree based on distance matrix using the
167 Neighbor Joining algorithm (Saitou and Nei, 1987)
168 Implementation based on Kevin Howe's Quicktree implementation
169 and uses his tricks (some based on Bill Bruno's work) to eliminate
170 negative branch lengths
171 Returns : L<Bio::Tree::TreeI>
172 Args : L<Bio::Matrix::MatrixI> object
174 =cut
176 sub _nj {
177 my ($self,$distmat) = @_;
179 # we assume type checking of $aln has already been done
180 # client shouldn't be calling this directly anyways, using the
181 # make_tree method is preferred
183 # so that we can trim the number of digits shown as the branch length
184 my $precisionstr = "%.$Precision"."f";
186 my @names = $distmat->column_names;
187 my $N = scalar @names;
188 my ($i,$j,$m,@nodes,$mat,@r);
189 my $L = $N;
191 if( $N < 2 ) {
192 $self->warn("Can only perform NJ treebuilding on sets of 2 or more species\n");
193 return;
194 } elsif( $N == 2 ) {
195 $i = 0;
196 my $d = sprintf($precisionstr,
197 $distmat->get_entry($names[0],$names[1]) / 2);
198 my $root = Bio::Tree::Node->new();
199 for my $nm ( @names ) {
200 $root->add_Descendent( Bio::Tree::Node->new(-id => $nm,
201 -branch_length => $d));
203 return Bio::Tree::Tree->new(-root => $root);
205 my $c = 0;
207 for ( $i = 0; $i < $N; $i++ ) {
208 push @nodes, Bio::Tree::Node->new(-id => $names[$i]);
209 my $ri = 0;
210 for( $j = 0; $j < $N; $j++ ) {
211 $mat->[$i][$j] = $distmat->get_entry($names[$i],$names[$j]);
212 $ri += $mat->[$i][$j];
214 $r[$i] = $ri / ($L -2);
217 for( my $nodecount = 0; $nodecount < $N-3; $nodecount++) {
218 my ($mini,$minj,$min);
219 for($i = 0; $i < $N; $i++ ) {
220 next unless defined $nodes[$i];
221 for( $j = 0; $j < $i; $j++ ) {
222 next unless defined $nodes[$j];
223 my $dist = $mat->[$i][$j] - ($r[$i] + $r[$j]);
224 if( ! defined $min ||
225 $dist <= $min) {
226 ($mini,$minj,$min) = ($i,$j,$dist);
230 my $dij = $mat->[$mini][$minj];
231 my $dist_i = ($dij + $r[$mini] - $r[$minj]) / 2;
232 my $dist_j = $dij - $dist_i;
234 # deal with negative branch lengths
235 # per code in K.Howe's quicktree
236 if( $dist_i < 0 ) {
237 $dist_i = 0;
238 $dist_j = $dij;
239 $dist_j = 0 if( $dist_j < 0 );
240 } elsif( $dist_j < 0 ) {
241 $dist_j = 0;
242 $dist_i = $dij;
243 $dist_i = 0 if( $dist_i < 0 );
246 $nodes[$mini]->branch_length(sprintf($precisionstr,$dist_i));
247 $nodes[$minj]->branch_length(sprintf($precisionstr,$dist_j));
249 my $newnode = Bio::Tree::Node->new(-descendents => [ $nodes[$mini],
250 $nodes[$minj] ]);
252 $nodes[$mini] = $newnode;
253 delete $nodes[$minj];
255 # update the distance matrix
256 $r[$mini] = 0;
257 my ($dmi,$dmj);
258 for( $m = 0; $m < $N; $m++ ) {
259 next unless defined $nodes[$m];
260 if( $m != $mini ) {
261 $dmj = $mat->[$m][$minj];
263 my ($row,$col);
264 ($row,$col) = ($m,$mini);
265 $dmi = $mat->[$row][$col];
267 # from K.Howe's notes in quicktree
268 # we can actually adjust r[m] here, by using the form:
269 # rm = ((rm * numseqs) - dmi - dmj + dmk) / (numseqs-1)
271 # Note: in Bill Bruno's method for negative branch
272 # elimination, then if either dist_i is positive and
273 # dist_j is 0, or dist_i is zero and dist_j is positive
274 # (after adjustment) then the matrix entry is formed
275 # from the distance to the node in question (m) to the
276 # node with the zero branch length (whichever it was).
277 # I think my code already has the same effect; this is
278 # certainly true if dij is equal to dist_i + dist_j,
279 # which it should have been fixed to
281 my $dmk = $mat->[$row][$col] = $mat->[$col][$row] =
282 ($dmi + $dmj - $dij) / 2;
284 # If we don't want to try and correct negative brlens
285 # this is essentially what is in Edddy et al, BSA book.
286 # $r[$m] = (($r[$m] * $L) - $dmi - $dmj + $dmk) / ($L-1);
288 $r[$m] = (($r[$m] * ($L - 2)) - $dmi - $dmj +
289 $mat->[$row][$col]) / ( $L - 3);
290 $r[$mini] += $dmk;
293 $L--;
294 $r[$mini] /= $L - 2;
297 # should be 3 nodes left
298 my (@leftovernodes,@leftovers);
299 for( my $k = 0; $k < $N; $k++ ) {
300 if( defined $nodes[$k] ) {
301 push @leftovers, $k;
302 push @leftovernodes, $nodes[$k];
305 my ($l_0,$l_1,$l_2) = @leftovers;
307 my $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0] -
308 $mat->[$l_2][$l_1] ) / 2;
310 my $dist_j = ( $mat->[$l_1][$l_0] - $dist_i);
311 my $dist_k = ( $mat->[$l_2][$l_0] - $dist_i);
313 # This is Kev's code to get rid of negative branch lengths
314 if( $dist_i < 0 ) {
315 $dist_i = 0;
316 $dist_j = $mat->[$l_1][$l_0];
317 $dist_k = $mat->[$l_2][$l_0];
318 if( $dist_j < 0 ) {
319 $dist_j = 0;
320 $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1] ) / 2;
321 $dist_k = 0 if( $dist_k < 0 );
322 } elsif( $dist_k < 0 ) {
323 $dist_k = 0;
324 $dist_j = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_1]) / 2;
325 $dist_j = 0 if( $dist_j < 0 );
327 } elsif( $dist_j < 0 ) {
328 $dist_j = 0;
329 $dist_i = $mat->[$l_1][$l_0];
330 $dist_k = $mat->[$l_2][$l_1];
331 if( $dist_i < 0 ) {
332 $dist_i = 0;
333 $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1]) / 2;
334 $dist_k = 0 if( $dist_k < 0 );
335 } elsif( $dist_k < 0 ) {
336 $dist_k = 0;
337 $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2;
338 $dist_i = 0 if( $dist_i < 0 );
340 } elsif( $dist_k < 0 ) {
341 $dist_k = 0;
342 $dist_i = $mat->[$l_2][$l_0];
343 $dist_j = $mat->[$l_2][$l_1];
344 if( $dist_i < 0 ) {
345 $dist_i = 0;
346 $dist_j = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_1] ) / 2;
347 $dist_j = 0 if $dist_j < 0;
348 } elsif( $dist_j < 0 ) {
349 $dist_j = 0;
350 $dist_i = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2;
351 $dist_i = 0 if $dist_i < 0;
354 $leftovernodes[0]->branch_length(sprintf($precisionstr,$dist_i));
355 $leftovernodes[1]->branch_length(sprintf($precisionstr,$dist_j));
356 $leftovernodes[2]->branch_length(sprintf($precisionstr,$dist_k));
358 Bio::Tree::Tree->new(-root => Bio::Tree::Node->new
359 (-descendents => \@leftovernodes));
362 =head2 _upgma
364 Title : _upgma
365 Usage : my $tree = $disttreefact->_upgma($matrix);
366 Function: Construct a tree based on alignment using UPGMA
367 Returns : L<Bio::Tree::TreeI>
368 Args : L<Bio::Matrix::MatrixI> object
371 =cut
373 sub _upgma{
374 my ($self,$distmat) = @_;
375 # we assume type checking of $matrix has already been done
376 # client shouldn't be calling this directly anyways, using the
377 # make_tree method is preferred
379 # algorithm, from Eddy, Durbin, Krogh, Mitchison, 1998
380 # originally by Sokal and Michener 1956
382 my $precisionstr = "%.$Precision"."f";
384 my ($i,$j,$x,$y,@dmat,@orig,@nodes);
386 my @names = $distmat->column_names;
387 my $c = 0;
388 my @clusters = map {
389 my $r = { 'id' => $c,
390 'height' => 0,
391 'contains' => [$c],
393 $c++;
395 } @names;
397 my $K = scalar @clusters;
398 my (@mins,$min);
399 for ( $i = 0; $i < $K; $i++ ) {
400 for( $j = $i+1; $j < $K; $j++ ) {
401 my $d = $distmat->get_entry($names[$i],$names[$j]);
402 # get Min here on first time around, save 1 cycle
403 $dmat[$j][$i] = $dmat[$i][$j] = $d;
404 $orig[$i][$j] = $orig[$j][$i] = $d;
405 if ( ! defined $min || $d <= $min ) {
406 if( defined $min && $min == $d ) {
407 push @mins, [$i,$j];
408 } else {
409 @mins = [$i,$j];
410 $min = $d;
415 # distance between each cluster is avg distance
416 # between pairs of sequences from each cluster
417 while( $K > 1 ) {
418 # fencepost - we already have found the $min
419 # so very first time loop is executed we can skip checking
420 unless( defined $min ) {
421 for($i = 0; $i < $K; $i++ ) {
422 for( $j = $i+1; $j < $K; $j++ ) {
423 my $dij = $dmat[$i][$j];
424 if( ! defined $min ||
425 $dij <= $min) {
426 if( defined $min &&
427 $min == $dij ) {
428 push @mins, [$i,$j];
429 } else {
430 @mins = [ $i,$j ];
431 $min = $dij;
437 # randomly break ties
438 ($x,$y) = @{ $mins[int(rand(scalar @mins))] };
440 # now we are going to join clusters x and y, make a new cluster
442 my $node = Bio::Tree::Node->new();
443 my @subids;
444 for my $cid ( $x,$y ) {
445 my $nid = $clusters[$cid]->{'id'};
446 if( ! defined $nodes[$nid] ) {
447 $nodes[$nid] = Bio::Tree::Node->new(-id => $names[$nid]);
449 $nodes[$nid]->branch_length
450 (sprintf($precisionstr,$min/2 - $clusters[$cid]->{'height'}));
451 $node->add_Descendent($nodes[$nid]);
452 push @subids, @{ $clusters[$cid]->{'contains'} };
454 my $cluster = { 'id' => $c++,
455 'height' => $min / 2,
456 'contains' => [@subids],
459 $K--; # we are going to drop the last node so go ahead and decrement K
460 $nodes[$cluster->{'id'}] = $node;
461 if ( $y != $K ) {
462 $clusters[$y] = $clusters[$K];
463 $dmat[$y] = $dmat[$K];
464 for ( $i = 0; $i < $K; $i++ ) {
465 $dmat[$i][$y] = $dmat[$y][$i];
468 delete $clusters[$K];
469 $clusters[$x] = $cluster;
470 # now recalculate @dmat
471 for( $i = 0; $i < $K; $i++ ) {
472 if( $i != $x) {
473 $dmat[$i][$x] = $dmat[$x][$i] =
474 &_upgma_distance($clusters[$i],$clusters[$x],\@orig);
475 } else {
476 $dmat[$i][$i] = 0;
479 # reset so next loop iteration
480 # we will find minimum distance
481 @mins = ();
482 $min = undef;
484 Bio::Tree::Tree->new(-root => $nodes[-1]);
487 # calculate avg distance between clusters - be they
488 # single sequences or the combination of multiple seqences
489 # $cluster_i and $cluster_j are the clusters to operate on
490 # and $distances is a matrix (arrayref of arrayrefs) of pairwise
491 # differences indexed on the sequence ids -
492 # so $distances->[0][1] is the distance between sequences 0 and 1
494 sub _upgma_distance {
495 my ($cluster_i, $cluster_j, $distances) = @_;
496 my $ilen = scalar @{ $cluster_i->{'contains'} };
497 my $jlen = scalar @{ $cluster_j->{'contains'} };
498 my ($d,$count);
499 for( my $i = 0; $i < $ilen; $i++ ) {
500 my $i_id = $cluster_i->{'contains'}->[$i];
501 for( my $j = 0; $j < $jlen; $j++) {
502 my $j_id = $cluster_j->{'contains'}->[$j];
503 if( ! defined $distances->[$i_id][$j_id] ) {
504 warn("no value for $i_id $j_id\n");
505 } else {
506 $d += $distances->[$i_id][$j_id];
508 $count++;
511 return $d / $count;
514 =head2 method
516 Title : method
517 Usage : $obj->method($newval)
518 Function:
519 Example :
520 Returns : value of method (a scalar)
521 Args : on set, new value (a scalar or undef, optional)
524 =cut
526 sub method{
527 my $self = shift;
528 return $self->{'_method'} = shift if @_;
529 return $self->{'_method'};
533 =head2 check_additivity
535 Title : check_additivity
536 Usage : if( $distance->check_additivity($matrix) ) {
538 Function : See if matrix obeys additivity principal
539 Returns : boolean
540 Args : Bio::Matrix::MatrixI
541 References: Based on a Java implementation by
542 Peter Sestoft, sestoft@dina.kvl.dk 1999-12-07 version 0.3
543 http://www.dina.kvl.dk/~sestoft/bsa.html
544 which in turn is based on algorithms described in
545 R. Durbin, S. Eddy, A. Krogh, G. Mitchison.
546 Biological Sequence Analysis CUP 1998, Chapter 7.
548 =cut
550 sub check_additivity{
551 my ($self,$matrix) = @_;
552 my @names = $matrix->column_names;
553 my $len = scalar @names;
554 return unless $len >= 4;
555 # look at all sets of 4
556 for( my $i = 0; $i < $len; $i++ ) {
557 for( my $j = $i+1; $j< $len; $j++) {
558 for( my $k = $j+1; $k < $len; $k ++ ) {
559 for( my $m = $k +1; $m < $len; $m++ ) {
560 my $DijDkm = $matrix->get_entry($names[$i],$names[$j]) +
561 $matrix->get_entry($names[$k],$names[$m]);
562 my $DikDjm = $matrix->get_entry($names[$i],$names[$k]) +
563 $matrix->get_entry($names[$j],$names[$m]);
564 my $DimDjk = $matrix->get_entry($names[$i],$names[$m]) +
565 $matrix->get_entry($names[$j],$names[$k]);
566 if( !( ( $DijDkm == $DikDjm && $DijDkm >= $DimDjk)
567 || ( $DijDkm == $DimDjk && $DijDkm >= $DikDjm)
568 || ( $DikDjm == $DimDjk && $DikDjm >= $DijDkm) )) {
569 return 0;
575 return 1;