Bio/Tree/DistanceFactory.pm

   1 #
   2 # BioPerl module for Bio::Tree::DistanceFactory
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Jason Stajich <jason-at-bioperl.org>
   7 #
   8 # Copyright Jason Stajich
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Tree::DistanceFactory - Construct a tree using distance based methods
  17
  18 =head1 SYNOPSIS
  19
  20   use Bio::Tree::DistanceFactory;
  21   use Bio::AlignIO;
  22   use Bio::Align::DNAStatistics;
  23   my $tfactory = Bio::Tree::DistanceFactory->new(-method => "NJ");
  24   my $stats    = Bio::Align::DNAStatistics->new();
  25
  26   my $alnin    = Bio::AlignIO->new(-format => 'clustalw',
  27                                    -file   => 'file.aln');
  28   my $aln = $alnin->next_aln;
  29   # Of course matrix can come from a different place
  30   # like PHYLIP if you prefer, Bio::Matrix::IO should be able
  31   # to parse many things
  32   my $jcmatrix = $stats->distance(-align => $aln,
  33                                   -method => 'Jukes-Cantor');
  34   my $tree = $tfactory->make_tree($jcmatrix);
  35
  36
  37 =head1 DESCRIPTION
  38
  39 This is a factory which will construct a phylogenetic tree based on
  40 the pairwise sequence distances for a set of sequences.  Currently
  41 UPGMA (Sokal and Michener 1958) and NJ (Saitou and Nei 1987) tree
  42 construction methods are implemented.
  43
  44 =head1 REFERENCES
  45
  46 Eddy SR, Durbin R, Krogh A, Mitchison G, (1998) "Biological Sequence Analysis",
  47 Cambridge Univ Press, Cambridge, UK.
  48
  49 Howe K, Bateman A, Durbin R, (2002) "QuickTree: building huge
  50 Neighbour-Joining trees of protein sequences." Bioinformatics
  51 18(11):1546-1547.
  52
  53 Saitou N and Nei M, (1987) "The neighbor-joining method: a new method
  54 for reconstructing phylogenetic trees." Mol Biol Evol 4(4):406-25.
  55
  56 =head1 FEEDBACK
  57
  58 =head2 Mailing Lists
  59
  60 User feedback is an integral part of the evolution of this and other
  61 Bioperl modules. Send your comments and suggestions preferably to
  62 the Bioperl mailing list.  Your participation is much appreciated.
  63
  64   bioperl-l@bioperl.org                  - General discussion
  65   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  66
  67 =head2 Support
  68
  69 Please direct usage questions or support issues to the mailing list:
  70
  71 I<bioperl-l@bioperl.org>
  72
  73 rather than to the module maintainer directly. Many experienced and
  74 reponsive experts will be able look at the problem and quickly
  75 address it. Please include a thorough description of the problem
  76 with code and data examples if at all possible.
  77
  78 =head2 Reporting Bugs
  79
  80 Report bugs to the Bioperl bug tracking system to help us keep track
  81 of the bugs and their resolution. Bug reports can be submitted the web:
  82
  83   https://github.com/bioperl/bioperl-live/issues
  84
  85 =head1 AUTHOR - Jason Stajich
  86
  87 Email jason-at-bioperl.org
  88
  89 =head1 APPENDIX
  90
  91 The rest of the documentation details each of the object methods.
  92 Internal methods are usually preceded with a _
  93
  94 =cut
  95
  96 package Bio::Tree::DistanceFactory;
  97 use vars qw($DefaultMethod $Precision);
  98 use strict;
  99
 100 # some defaults
 101 $DefaultMethod = 'UPGMA';
 102 $Precision = 5;
 103
 104 use Bio::Tree::Node;
 105 use Bio::Tree::Tree;
 106
 107 use base qw(Bio::Root::Root);
 108
 109 =head2 new
 110
 111  Title   : new
 112  Usage   : my $obj = Bio::Tree::DistanceFactory->new();
 113  Function: Builds a new Bio::Tree::DistanceFactory object
 114  Returns : an instance of Bio::Tree::DistanceFactory
 115  Args    : -method => 'NJ' or 'UPGMA'
 116
 117
 118 =cut
 119
 120 sub new {
 121   my($class,@args) = @_;
 122   my $self = $class->SUPER::new(@args);
 123
 124   my ($method) = $self->_rearrange([qw(METHOD)],
 125                                    @args);
 126   $self->method($method || $DefaultMethod);
 127   return $self;
 128 }
 129
 130 =head2 make_tree
 131
 132  Title   : make_tree
 133  Usage   : my $tree = $disttreefact->make_tree($matrix);
 134  Function: Build a Tree based on a distance matrix
 135  Returns : L<Bio::Tree::TreeI>
 136  Args    : L<Bio::Matrix::MatrixI> object
 137
 138
 139 =cut
 140
 141 sub make_tree{
 142    my ($self,$matrix) = @_;
 143    if( ! defined $matrix || !ref($matrix) ||
 144        ! $matrix->isa('Bio::Matrix::MatrixI') ) {
 145        $self->warn("Need to provide a valid Bio::Matrix::MatrixI object to make_tree");
 146        return;
 147    }
 148
 149    my $method = uc ($self->method);
 150    if( $method =~ /NJ/i ) {
 151        return $self->_nj($matrix);
 152    } elsif( $method =~ /UPGMA/i ) {
 153        return $self->_upgma($matrix);
 154    } else {
 155        $self->warn("Unknown tree construction method '$method'.  Cannot run.");
 156        return;
 157    }
 158
 159 }
 160
 161
 162 =head2 _nj
 163
 164  Title   : _nj
 165  Usage   : my $tree = $disttreefact->_nj($matrix);
 166  Function: Construct a tree based on distance matrix using the
 167            Neighbor Joining algorithm (Saitou and Nei, 1987)
 168            Implementation based on Kevin Howe's Quicktree implementation
 169            and uses his tricks (some based on Bill Bruno's work) to eliminate
 170            negative branch lengths
 171  Returns : L<Bio::Tree::TreeI>
 172  Args    : L<Bio::Matrix::MatrixI> object
 173
 174 =cut
 175
 176 sub _nj {
 177    my ($self,$distmat) = @_;
 178
 179    # we assume type checking of $aln has already been done
 180    # client shouldn't be calling this directly anyways, using the
 181    # make_tree method is preferred
 182
 183    # so that we can trim the number of digits shown as the branch length
 184    my $precisionstr = "%.$Precision"."f";
 185
 186    my @names =  $distmat->column_names;
 187    my $N = scalar @names;
 188    my ($i,$j,$m,@nodes,$mat,@r);
 189    my $L = $N;
 190
 191    if( $N < 2 ) {
 192        $self->warn("Can only perform NJ treebuilding on sets of 2 or more species\n");
 193        return;
 194    } elsif( $N == 2 ) {
 195        $i = 0;
 196        my $d = sprintf($precisionstr,
 197                        $distmat->get_entry($names[0],$names[1]) / 2);
 198        my $root = Bio::Tree::Node->new();
 199        for my $nm ( @names ) {
 200            $root->add_Descendent( Bio::Tree::Node->new(-id => $nm,
 201                                                         -branch_length => $d));
 202        }
 203        return Bio::Tree::Tree->new(-root => $root);
 204    }
 205    my $c = 0;
 206
 207    for ( $i = 0; $i < $N; $i++ ) {
 208        push @nodes, Bio::Tree::Node->new(-id => $names[$i]);
 209        my $ri = 0;
 210        for( $j = 0; $j < $N; $j++ ) {
 211            $mat->[$i][$j] = $distmat->get_entry($names[$i],$names[$j]);
 212            $ri += $mat->[$i][$j];
 213        }
 214        $r[$i] = $ri / ($L -2);
 215    }
 216
 217    for( my $nodecount = 0; $nodecount < $N-3; $nodecount++) {
 218        my ($mini,$minj,$min);
 219        for($i = 0; $i < $N; $i++ ) {
 220            next unless defined $nodes[$i];
 221            for( $j = 0; $j < $i; $j++ ) {
 222                next unless defined $nodes[$j];
 223                my $dist = $mat->[$i][$j] - ($r[$i] + $r[$j]);
 224                if( ! defined $min ||
 225                    $dist <= $min) {
 226                    ($mini,$minj,$min) = ($i,$j,$dist);
 227                }
 228            }
 229        }
 230        my $dij    = $mat->[$mini][$minj];
 231        my $dist_i = ($dij + $r[$mini] - $r[$minj]) / 2;
 232        my $dist_j = $dij - $dist_i;
 233
 234        # deal with negative branch lengths
 235        # per code in K.Howe's quicktree
 236        if( $dist_i < 0 ) {
 237            $dist_i = 0;
 238            $dist_j = $dij;
 239            $dist_j = 0 if( $dist_j < 0 );
 240        } elsif( $dist_j < 0 ) {
 241            $dist_j = 0;
 242            $dist_i = $dij;
 243            $dist_i = 0 if( $dist_i < 0 );
 244        }
 245
 246        $nodes[$mini]->branch_length(sprintf($precisionstr,$dist_i));
 247        $nodes[$minj]->branch_length(sprintf($precisionstr,$dist_j));
 248
 249        my $newnode = Bio::Tree::Node->new(-descendents => [ $nodes[$mini],
 250                                                             $nodes[$minj] ]);
 251
 252        $nodes[$mini] = $newnode;
 253        delete $nodes[$minj];
 254
 255        # update the distance matrix
 256        $r[$mini] = 0;
 257        my ($dmi,$dmj);
 258        for( $m = 0; $m < $N; $m++ ) {
 259            next unless defined $nodes[$m];
 260            if( $m != $mini ) {
 261                $dmj = $mat->[$m][$minj];
 262
 263                my ($row,$col);
 264                ($row,$col) = ($m,$mini);
 265                $dmi = $mat->[$row][$col];
 266
 267                # from K.Howe's notes in quicktree
 268                # we can actually adjust r[m] here, by using the form:
 269                # rm = ((rm * numseqs) - dmi - dmj + dmk) / (numseqs-1)
 270
 271                # Note: in Bill Bruno's method for negative branch
 272                # elimination, then if either dist_i is positive and
 273                # dist_j is 0, or dist_i is zero and dist_j is positive
 274                # (after adjustment) then the matrix entry is formed
 275                # from the distance to the node in question (m) to the
 276                # node with the zero branch length (whichever it was).
 277                # I think my code already has the same effect; this is
 278                # certainly true if dij is equal to dist_i + dist_j,
 279                # which it should have been fixed to
 280
 281                my $dmk = $mat->[$row][$col] = $mat->[$col][$row] =
 282                    ($dmi + $dmj - $dij) / 2;
 283
 284                # If we don't want to try and correct negative brlens
 285                # this is essentially what is in Edddy et al, BSA book.
 286                # $r[$m] = (($r[$m] * $L) - $dmi - $dmj + $dmk) / ($L-1);
 287                #
 288                $r[$m] = (($r[$m] * ($L - 2)) - $dmi - $dmj +
 289                          $mat->[$row][$col]) / ( $L - 3);
 290                $r[$mini] += $dmk;
 291            }
 292        }
 293        $L--;
 294        $r[$mini] /= $L - 2;
 295    }
 296
 297    # should be 3 nodes left
 298    my (@leftovernodes,@leftovers);
 299    for( my $k = 0; $k < $N; $k++ ) {
 300        if( defined $nodes[$k] ) {
 301            push @leftovers, $k;
 302            push @leftovernodes, $nodes[$k];
 303        }
 304    }
 305    my ($l_0,$l_1,$l_2) = @leftovers;
 306
 307    my $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0] -
 308                   $mat->[$l_2][$l_1] ) / 2;
 309
 310    my $dist_j = ( $mat->[$l_1][$l_0] - $dist_i);
 311    my $dist_k = ( $mat->[$l_2][$l_0] - $dist_i);
 312
 313    # This is Kev's code to get rid of negative branch lengths
 314    if( $dist_i < 0 ) {
 315        $dist_i = 0;
 316        $dist_j = $mat->[$l_1][$l_0];
 317        $dist_k = $mat->[$l_2][$l_0];
 318        if( $dist_j < 0 ) {
 319            $dist_j = 0;
 320            $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1] ) / 2;
 321            $dist_k = 0 if( $dist_k < 0 );
 322        } elsif( $dist_k < 0 ) {
 323            $dist_k = 0;
 324            $dist_j = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_1]) / 2;
 325            $dist_j = 0 if( $dist_j < 0 );
 326        }
 327    } elsif( $dist_j < 0 ) {
 328        $dist_j = 0;
 329        $dist_i = $mat->[$l_1][$l_0];
 330        $dist_k = $mat->[$l_2][$l_1];
 331        if( $dist_i < 0 ) {
 332            $dist_i = 0;
 333            $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1]) / 2;
 334            $dist_k = 0 if( $dist_k  < 0 );
 335        } elsif( $dist_k < 0 ) {
 336            $dist_k = 0;
 337            $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2;
 338            $dist_i = 0 if( $dist_i < 0 );
 339        }
 340    } elsif( $dist_k < 0 ) {
 341        $dist_k = 0;
 342        $dist_i = $mat->[$l_2][$l_0];
 343        $dist_j = $mat->[$l_2][$l_1];
 344        if( $dist_i < 0 ) {
 345            $dist_i = 0;
 346            $dist_j = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_1] ) / 2;
 347            $dist_j = 0 if $dist_j < 0;
 348        } elsif( $dist_j < 0  ) {
 349            $dist_j = 0;
 350            $dist_i = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2;
 351            $dist_i = 0 if $dist_i < 0;
 352        }
 353    }
 354    $leftovernodes[0]->branch_length(sprintf($precisionstr,$dist_i));
 355    $leftovernodes[1]->branch_length(sprintf($precisionstr,$dist_j));
 356    $leftovernodes[2]->branch_length(sprintf($precisionstr,$dist_k));
 357
 358    Bio::Tree::Tree->new(-root => Bio::Tree::Node->new
 359                         (-descendents => \@leftovernodes));
 360 }
 361
 362 =head2 _upgma
 363
 364  Title   : _upgma
 365  Usage   : my $tree = $disttreefact->_upgma($matrix);
 366  Function: Construct a tree based on alignment using UPGMA
 367  Returns : L<Bio::Tree::TreeI>
 368  Args    : L<Bio::Matrix::MatrixI> object
 369
 370
 371 =cut
 372
 373 sub _upgma{
 374    my ($self,$distmat) = @_;
 375    # we assume type checking of $matrix has already been done
 376    # client shouldn't be calling this directly anyways, using the
 377    # make_tree method is preferred
 378
 379    # algorithm, from Eddy, Durbin, Krogh, Mitchison, 1998
 380    # originally by Sokal and Michener 1956
 381
 382    my $precisionstr = "%.$Precision"."f";
 383
 384    my ($i,$j,$x,$y,@dmat,@orig,@nodes);
 385
 386    my @names = $distmat->column_names;
 387    my $c = 0;
 388    my @clusters = map {
 389        my $r = { 'id'        => $c,
 390                  'height'    => 0,
 391                  'contains'  => [$c],
 392              };
 393        $c++;
 394        $r;
 395    } @names;
 396
 397    my $K = scalar @clusters;
 398    my (@mins,$min);
 399    for ( $i = 0; $i < $K; $i++ ) {
 400        for( $j = $i+1; $j < $K; $j++ ) {
 401            my $d =  $distmat->get_entry($names[$i],$names[$j]);
 402            # get Min here on first time around, save 1 cycle
 403            $dmat[$j][$i] = $dmat[$i][$j] = $d;
 404            $orig[$i][$j] = $orig[$j][$i] = $d;
 405            if ( ! defined $min || $d <= $min ) {
 406                if( defined $min && $min == $d ) {
 407                    push @mins, [$i,$j];
 408                } else {
 409                    @mins = [$i,$j];
 410                    $min  = $d;
 411                }
 412            }
 413        }
 414    }
 415    # distance between each cluster is avg distance
 416    # between pairs of sequences from each cluster
 417    while( $K > 1 ) {
 418        # fencepost - we already have found the $min
 419        # so very first time loop is executed we can skip checking
 420        unless( defined $min ) {
 421            for($i = 0; $i < $K; $i++ ) {
 422                for( $j = $i+1; $j < $K; $j++ ) {
 423                    my $dij = $dmat[$i][$j];
 424                    if( ! defined $min ||
 425                        $dij <= $min) {
 426                        if( defined $min &&
 427                            $min == $dij ) {
 428                            push @mins, [$i,$j];
 429                        } else {
 430                            @mins = [ $i,$j ];
 431                            $min = $dij;
 432                        }
 433                    }
 434                }
 435            }
 436        }
 437        # randomly break ties
 438        ($x,$y) = @{ $mins[int(rand(scalar @mins))] };
 439
 440        # now we are going to join clusters x and y, make a new cluster
 441
 442        my $node = Bio::Tree::Node->new();
 443        my @subids;
 444        for my $cid ( $x,$y ) {
 445            my $nid = $clusters[$cid]->{'id'};
 446            if( ! defined $nodes[$nid] ) {
 447                $nodes[$nid] = Bio::Tree::Node->new(-id => $names[$nid]);
 448            }
 449            $nodes[$nid]->branch_length
 450                (sprintf($precisionstr,$min/2 - $clusters[$cid]->{'height'}));
 451            $node->add_Descendent($nodes[$nid]);
 452            push @subids, @{ $clusters[$cid]->{'contains'} };
 453        }
 454        my $cluster = { 'id'       => $c++,
 455                        'height'   => $min / 2,
 456                        'contains' => [@subids],
 457                    };
 458
 459        $K--; # we are going to drop the last node so go ahead and decrement K
 460        $nodes[$cluster->{'id'}] = $node;
 461        if ( $y != $K ) {
 462            $clusters[$y] = $clusters[$K];
 463            $dmat[$y] = $dmat[$K];
 464            for ( $i = 0; $i < $K; $i++ ) {
 465                $dmat[$i][$y] = $dmat[$y][$i];
 466            }
 467        }
 468        delete $clusters[$K];
 469        $clusters[$x] = $cluster;
 470        # now recalculate @dmat
 471        for( $i = 0; $i < $K; $i++ ) {
 472            if( $i != $x) {
 473                $dmat[$i][$x] = $dmat[$x][$i] =
 474                    &_upgma_distance($clusters[$i],$clusters[$x],\@orig);
 475            } else {
 476                $dmat[$i][$i] = 0;
 477            }
 478        }
 479        # reset so next loop iteration
 480        # we will find minimum distance
 481        @mins = ();
 482        $min = undef;
 483    }
 484    Bio::Tree::Tree->new(-root => $nodes[-1]);
 485 }
 486
 487 # calculate avg distance between clusters - be they
 488 # single sequences or the combination of multiple seqences
 489 # $cluster_i and $cluster_j are the clusters to operate on
 490 # and $distances is a matrix (arrayref of arrayrefs) of pairwise
 491 # differences indexed on the sequence ids -
 492 # so $distances->[0][1] is the distance between sequences 0 and 1
 493
 494 sub _upgma_distance {
 495     my ($cluster_i, $cluster_j, $distances) = @_;
 496     my $ilen = scalar @{ $cluster_i->{'contains'} };
 497     my $jlen = scalar @{ $cluster_j->{'contains'} };
 498     my ($d,$count);
 499     for( my $i = 0; $i < $ilen; $i++ ) {
 500         my $i_id = $cluster_i->{'contains'}->[$i];
 501         for( my $j = 0; $j < $jlen; $j++) {
 502             my $j_id = $cluster_j->{'contains'}->[$j];
 503             if( ! defined $distances->[$i_id][$j_id] ) {
 504                 warn("no value for $i_id $j_id\n");
 505             } else {
 506                 $d += $distances->[$i_id][$j_id];
 507             }
 508             $count++;
 509         }
 510     }
 511     return $d / $count;
 512 }
 513
 514 =head2 method
 515
 516  Title   : method
 517  Usage   : $obj->method($newval)
 518  Function:
 519  Example :
 520  Returns : value of method (a scalar)
 521  Args    : on set, new value (a scalar or undef, optional)
 522
 523
 524 =cut
 525
 526 sub method{
 527     my $self = shift;
 528     return $self->{'_method'} = shift if @_;
 529     return $self->{'_method'};
 530 }
 531
 532
 533 =head2 check_additivity
 534
 535  Title     : check_additivity
 536  Usage     : if( $distance->check_additivity($matrix) ) {
 537              }
 538  Function  : See if matrix obeys additivity principal
 539  Returns   : boolean
 540  Args      : Bio::Matrix::MatrixI
 541  References: Based on a Java implementation by
 542              Peter Sestoft, sestoft@dina.kvl.dk 1999-12-07 version 0.3
 543              http://www.dina.kvl.dk/~sestoft/bsa.html
 544              which in turn is based on algorithms described in
 545              R. Durbin, S. Eddy, A. Krogh, G. Mitchison.
 546              Biological Sequence Analysis CUP 1998, Chapter 7.
 547
 548 =cut
 549
 550 sub check_additivity{
 551    my ($self,$matrix) = @_;
 552    my @names = $matrix->column_names;
 553    my $len = scalar @names;
 554    return unless $len >= 4;
 555    # look at all sets of 4
 556    for( my $i = 0; $i < $len; $i++ ) {
 557        for( my $j = $i+1; $j< $len; $j++) {
 558            for( my $k = $j+1; $k < $len; $k ++ ) {
 559                for( my $m = $k +1; $m < $len; $m++ ) {
 560                    my $DijDkm = $matrix->get_entry($names[$i],$names[$j]) +
 561                        $matrix->get_entry($names[$k],$names[$m]);
 562                    my $DikDjm = $matrix->get_entry($names[$i],$names[$k]) +
 563                        $matrix->get_entry($names[$j],$names[$m]);
 564                    my $DimDjk = $matrix->get_entry($names[$i],$names[$m]) +
 565                        $matrix->get_entry($names[$j],$names[$k]);
 566                    if( !( ( $DijDkm == $DikDjm && $DijDkm >= $DimDjk)
 567                           || ( $DijDkm == $DimDjk && $DijDkm >= $DikDjm)
 568                           || ( $DikDjm == $DimDjk && $DikDjm >= $DijDkm) )) {
 569                        return 0;
 570                    }
 571                }
 572            }
 573        }
 574    }
 575    return 1;
 576 }
 577
 578 1;