Bio/Align/DNAStatistics.pm

   1 #
   2 # BioPerl module for Bio::Align::DNAStatistics
   3 #
   4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
   5 #
   6 # Cared for by Jason Stajich <jason-AT-bioperl.org>
   7 #
   8 # Copyright Jason Stajich
   9 #
  10 # You may distribute this module under the same terms as perl itself
  11
  12 # POD documentation - main docs before the code
  13
  14 =head1 NAME
  15
  16 Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment
  17
  18 =head1 SYNOPSIS
  19
  20   use Bio::AlignIO;
  21   use Bio::Align::DNAStatistics;
  22
  23   my $stats = Bio::Align::DNAStatistics->new();
  24   my $alignin = Bio::AlignIO->new(-format => 'emboss',
  25                                  -file   => 't/data/insulin.water');
  26   my $aln = $alignin->next_aln;
  27   my $jcmatrix = $stats->distance(-align => $aln,
  28                                   -method => 'Jukes-Cantor');
  29
  30   print $jcmatrix->print_matrix;
  31   ## and for measurements of synonymous /nonsynonymous substitutions ##
  32
  33   my $in = Bio::AlignIO->new(-format => 'fasta',
  34                             -file   => 't/data/nei_gojobori_test.aln');
  35   my $alnobj = $in->next_aln;
  36   my ($seq1id,$seq2id) = map { $_->display_id } $alnobj->each_seq;
  37   my $results = $stats->calc_KaKs_pair($alnobj, $seq1id, $seq2id);
  38   print "comparing ".$results->[0]{'Seq1'}." and ".$results->[0]{'Seq2'}."\n";
  39   for (sort keys %{$results->[0]} ){
  40       next if /Seq/;
  41       printf("%-9s %.4f \n",$_ , $results->[0]{$_});
  42   }
  43
  44   my $results2 = $stats->calc_all_KaKs_pairs($alnobj);
  45   for my $an (@$results2){
  46       print "comparing ". $an->{'Seq1'}." and ". $an->{'Seq2'}. " \n";
  47       for (sort keys %$an ){
  48           next if /Seq/;
  49           printf("%-9s %.4f \n",$_ , $an->{$_});
  50       }
  51       print "\n\n";
  52   }
  53
  54   my $result3 = $stats->calc_average_KaKs($alnobj, 1000);
  55   for (sort keys %$result3 ){
  56       next if /Seq/;
  57       printf("%-9s %.4f \n",$_ , $result3->{$_});
  58   }
  59
  60 =head1 DESCRIPTION
  61
  62 This object contains routines for calculating various statistics and
  63 distances for DNA alignments.  The routines are not well tested and do
  64 contain errors at this point.  Work is underway to correct them, but
  65 do not expect this code to give you the right answer currently!  Use
  66 dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the
  67 distances.
  68
  69
  70 Several different distance method calculations are supported.  Listed
  71 in brackets are the pattern which will match
  72
  73 =over 3
  74
  75 =item *
  76
  77 JukesCantor [jc|jukes|jukescantor|jukes-cantor]
  78
  79 =item *
  80
  81 Uncorrected [jcuncor|uncorrected]
  82
  83 =item *
  84
  85 F81 [f81|felsenstein]
  86
  87 =item *
  88
  89 Kimura [k2|k2p|k80|kimura]
  90
  91 =item *
  92
  93 Tamura [t92|tamura|tamura92]
  94
  95 =item *
  96
  97 F84 [f84|felsenstein84]
  98
  99 =item *
 100
 101 TajimaNei [tajimanei|tajima\-nei]
 102
 103 =item *
 104
 105 JinNei [jinnei|jin\-nei] (not implemented)
 106
 107 =back
 108
 109 There are also three methods to calculate the ratio of synonymous to
 110 non-synonymous mutations.  All are implementations of the Nei-Gojobori
 111 evolutionary pathway method and use the Jukes-Cantor method of
 112 nucleotide substitution. This method works well so long as the
 113 nucleotide frequencies are roughly equal and there is no significant
 114 transition/transversion bias.  In order to use these methods there are
 115 several pre-requisites for the alignment.
 116
 117 =over 3
 118
 119 =item 1
 120
 121 DNA alignment must be based on protein alignment. Use the subroutine
 122 L<Bio::Align::Utilities/aa_to_dna_aln> to achieve this.
 123
 124 =item 2
 125
 126 Therefore alignment gaps must be in multiples of 3 (representing an aa
 127 deletion/insertion) and at present must be indicated by a '-' symbol.
 128
 129 =item 3
 130
 131 Alignment must be solely of coding region and be in reading frame 0 to
 132 achieve meaningful results
 133
 134 =item 4
 135
 136 Alignment must therefore be a multiple of 3 nucleotides long.
 137
 138 =item 5
 139
 140 All sequences must be the same length (including gaps). This should be
 141 the case anyway if the sequences have been automatically aligned using
 142 a program like Clustal.
 143
 144 =item 6
 145
 146 Only the standard codon alphabet is supported at present.
 147
 148 =back
 149
 150 calc_KaKs_pair() calculates a number of statistics for a named pair of
 151 sequences in the alignment.
 152
 153 calc_all_KaKs_pairs() calculates these statistics for all pairwise
 154 comparisons in an MSA.  The statistics returned are:
 155
 156 =over 3
 157
 158 =item *
 159
 160 S_d - Number of synonymous mutations between the 2 sequences.
 161
 162 =item *
 163
 164 N_d - Number of non-synonymous mutations between the 2 sequences.
 165
 166 =item *
 167
 168 S -  Mean number of  synonymous sites in both sequences.
 169
 170 =item *
 171
 172 N -  mean number of  synonymous sites in both sequences.
 173
 174 =item *
 175
 176 P_s - proportion of synonymous differences in both sequences given by
 177 P_s = S_d/S.
 178
 179 =item *
 180
 181 P_n - proportion of non-synonymous differences in both sequences given
 182 by P_n = S_n/S.
 183
 184 =item *
 185
 186 D_s - estimation of synonymous mutations per synonymous site (by
 187 Jukes-Cantor).
 188
 189 =item *
 190
 191 D_n - estimation of non-synonymous mutations per non-synonymous site (by
 192 Jukes-Cantor).
 193
 194 =item *
 195
 196 D_n_var - estimation of variance of D_n .
 197
 198 =item *
 199
 200 D_s_var - estimation of variance of S_n.
 201
 202 =item *
 203
 204 z_value - calculation of z value.Positive value indicates D_n E<gt> D_s,
 205 negative value indicates D_s E<gt> D_n.
 206
 207 =back
 208
 209 The statistics returned by calc_average_KaKs are:
 210
 211 =over 3
 212
 213 =item *
 214
 215 D_s - Average number of synonymous mutations/synonymous site.
 216
 217 =item *
 218
 219 D_n - Average number of non-synonymous mutations/non-synonymous site.
 220
 221 =item *
 222
 223 D_s_var - Estimated variance of Ds from bootstrapped alignments.
 224
 225 =item *
 226
 227 D_n_var - Estimated variance of Dn from bootstrapped alignments.
 228
 229 =item *
 230
 231 z_score - calculation of z value. Positive value indicates D_n E<gt>D_s,
 232 negative values vice versa.
 233
 234 =back
 235
 236 The design of the code is based around the explanation of the
 237 Nei-Gojobori algorithm in the excellent book "Molecular Evolution and
 238 Phylogenetics" by Nei and Kumar, published by Oxford University
 239 Press. The methods have been tested using the worked example 4.1 in
 240 the book, and reproduce those results. If people like having this sort
 241 of analysis in BioPerl other methods for estimating Ds and Dn can be
 242 provided later.
 243
 244 Much of the DNA distance code is based on implementations in EMBOSS
 245 (Rice et al, www.emboss.org) [distmat.c] and PHYLIP (J. Felsenstein et
 246 al) [dnadist.c].  Insight also gained from Eddy, Durbin, Krogh, &
 247 Mitchison.
 248
 249 =head1 REFERENCES
 250
 251 =over 3
 252
 253 =item *
 254
 255 D_JukesCantor
 256
 257 "Phylogenetic Inference", Swoffrod, Olsen, Waddell and Hillis, in
 258 Mol. Systematics, 2nd ed, 1996, Ch 11.  Derived from "Evolution of
 259 Protein Molecules", Jukes & Cantor, in Mammalian Prot. Metab., III,
 260 1969, pp. 21-132.
 261
 262 =item *
 263
 264 D_Tamura
 265
 266 K Tamura, Mol. Biol. Evol. 1992, 9, 678.
 267
 268 =item *
 269
 270 D_Kimura
 271
 272 M Kimura, J. Mol. Evol., 1980, 16, 111.
 273
 274 =item *
 275
 276 JinNei
 277
 278 Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
 279
 280 =item *
 281
 282 D_TajimaNei
 283
 284 Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
 285
 286 =back
 287
 288 =head1 FEEDBACK
 289
 290 =head2 Mailing Lists
 291
 292 User feedback is an integral part of the evolution of this and other
 293 Bioperl modules. Send your comments and suggestions preferably to
 294 the Bioperl mailing list.  Your participation is much appreciated.
 295
 296   bioperl-l@bioperl.org                  - General discussion
 297   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 298
 299 =head2 Support
 300
 301 Please direct usage questions or support issues to the mailing list:
 302
 303 I<bioperl-l@bioperl.org>
 304
 305 rather than to the module maintainer directly. Many experienced and
 306 reponsive experts will be able look at the problem and quickly
 307 address it. Please include a thorough description of the problem
 308 with code and data examples if at all possible.
 309
 310 =head2 Reporting Bugs
 311
 312 Report bugs to the Bioperl bug tracking system to help us keep track
 313 of the bugs and their resolution. Bug reports can be submitted via the
 314 web:
 315
 316   https://github.com/bioperl/bioperl-live/issues
 317
 318 =head1 AUTHOR - Jason Stajich
 319
 320 Email jason-AT-bioperl.org
 321
 322 =head1 CONTRIBUTORS
 323
 324 Richard Adams, richard.adams@ed.ac.uk
 325
 326 =head1 APPENDIX
 327
 328 The rest of the documentation details each of the object methods.
 329 Internal methods are usually preceded with a _
 330
 331 =cut
 332
 333
 334 # Let the code begin...
 335
 336
 337 package Bio::Align::DNAStatistics;
 338 use vars qw(%DNAChanges @Nucleotides %NucleotideIndexes
 339             $GapChars $SeqCount $DefaultGapPenalty %DistanceMethods
 340             $CODONS %synchanges $synsites $Precision $GCChhars);
 341 use strict;
 342 use Bio::Align::PairwiseStatistics;
 343 use Bio::Matrix::PhylipDist;
 344 use Bio::Tools::IUPAC;
 345
 346 BEGIN {
 347     $GapChars = '[\.\-]';
 348     $GCChhars = '[GCS]';
 349     @Nucleotides = qw(A G T C);
 350     $SeqCount = 2;
 351     $Precision = 5;
 352
 353     # these values come from EMBOSS distmat implementation
 354     %NucleotideIndexes = ( 'A' => 0,
 355                            'T' => 1,
 356                            'C' => 2,
 357                            'G' => 3,
 358
 359                            'AT' => 0,
 360                            'AC' => 1,
 361                            'AG' => 2,
 362                            'CT' => 3,
 363                            'GT' => 4,
 364                            'CG' => 5,
 365
 366 # these are wrong now
 367 #                          'S' => [ 1, 3],
 368 #                          'W' => [ 0, 4],
 369 #                          'Y' => [ 2, 3],
 370 #                          'R' => [ 0, 1],
 371 #                          'M' => [ 0, 3],
 372 #                          'K' => [ 1, 2],
 373 #                          'B' => [ 1, 2, 3],
 374 #                          'H' => [ 0, 2, 3],
 375 #                          'V' => [ 0, 1, 3],
 376 #                          'D' => [ 0, 1, 2],
 377                            );
 378
 379     $DefaultGapPenalty = 0;
 380     # could put ambiguities here?
 381     %DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'],
 382                                          'T' => [ 'A', 'G'],
 383                                          'C' => [ 'A', 'G'],
 384                                          'G' => [ 'C', 'T'],
 385                                      },
 386                     'Transitions'   => { 'A' => [ 'G' ],
 387                                          'G' => [ 'A' ],
 388                                          'C' => [ 'T' ],
 389                                          'T' => [ 'C' ],
 390                                      },
 391                     );
 392     %DistanceMethods = ( 'jc|jukes|jukescantor|jukes\-cantor' => 'JukesCantor',
 393                          'jcuncor|uncorrected'   => 'Uncorrected',
 394                          'f81|felsenstein81'     => 'F81',
 395                          'k2|k2p|k80|kimura'     => 'Kimura',
 396                          't92|tamura|tamura92'   => 'Tamura',
 397                          'f84|felsenstein84'     => 'F84',
 398                          'tajimanei|tajima\-nei' => 'TajimaNei',
 399                          'jinnei|jin\-nei'       => 'JinNei');
 400
 401 }
 402 use base qw(Bio::Root::Root Bio::Align::StatisticsI);
 403
 404 ## generate look up hashes for Nei_Gojobori methods##
 405 $CODONS = get_codons();
 406 my @t = split '', "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
 407 #create look up hash of number of possible synonymous mutations per codon
 408 $synsites = get_syn_sites();
 409 #create reference look up hash of single basechanges in codons
 410 %synchanges = get_syn_changes();
 411
 412
 413
 414 =head2 new
 415
 416  Title   : new
 417  Usage   : my $obj = Bio::Align::DNAStatistics->new();
 418  Function: Builds a new Bio::Align::DNAStatistics object
 419  Returns : Bio::Align::DNAStatistics
 420  Args    : none
 421
 422
 423 =cut
 424
 425 sub new {
 426     my ($class,@args) = @_;
 427     my $self = $class->SUPER::new(@args);
 428
 429     $self->pairwise_stats( Bio::Align::PairwiseStatistics->new());
 430
 431     return $self;
 432 }
 433
 434
 435 =head2 distance
 436
 437  Title   : distance
 438  Usage   : my $distance_mat = $stats->distance(-align  => $aln,
 439                                                -method => $method);
 440  Function: Calculates a distance matrix for all pairwise distances of
 441            sequences in an alignment.
 442  Returns : L<Bio::Matrix::PhylipDist> object
 443  Args    : -align  => Bio::Align::AlignI object
 444            -method => String specifying specific distance method
 445                       (implementing class may assume a default)
 446 See also: L<Bio::Matrix::PhylipDist>
 447
 448 =cut
 449
 450 sub distance{
 451    my ($self,@args) = @_;
 452    my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
 453    if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
 454        $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
 455    }
 456    $method ||= 'JukesCantor';
 457    foreach my $m ( keys %DistanceMethods ) {
 458        if(defined $m &&  $method =~ /$m/i ) {
 459            my $mtd = "D_$DistanceMethods{$m}";
 460            return $self->$mtd($aln);
 461        }
 462    }
 463    $self->warn("Unrecognized distance method $method must be one of [".
 464                join(',',$self->available_distance_methods())."]");
 465    return;
 466 }
 467
 468 =head2 available_distance_methods
 469
 470  Title   : available_distance_methods
 471  Usage   : my @methods = $stats->available_distance_methods();
 472  Function: Enumerates the possible distance methods
 473  Returns : Array of strings
 474  Args    : none
 475
 476
 477 =cut
 478
 479 sub available_distance_methods{
 480    my ($self,@args) = @_;
 481    return values %DistanceMethods;
 482 }
 483
 484 =head2 D - distance methods
 485
 486
 487 =cut
 488
 489
 490 =head2 D_JukesCantor
 491
 492  Title   : D_JukesCantor
 493  Usage   : my $d = $stat->D_JukesCantor($aln)
 494  Function: Calculates D (pairwise distance) between 2 sequences in an
 495            alignment using the Jukes-Cantor 1 parameter model.
 496  Returns : L<Bio::Matrix::PhylipDist>
 497  Args    : L<Bio::Align::AlignI> of DNA sequences
 498            double - gap penalty
 499
 500
 501 =cut
 502
 503 sub D_JukesCantor{
 504    my ($self,$aln,$gappenalty) = @_;
 505    return 0 unless $self->_check_arg($aln);
 506    $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
 507    # ambiguities ignored at this point
 508    my (@seqs,@names,@values,%dist);
 509    my $seqct = 0;
 510    foreach my $seq ( $aln->each_seq) {
 511        push @names, $seq->display_id;
 512        push @seqs, uc $seq->seq();
 513        $seqct++;
 514    }
 515    my $precisionstr = "%.$Precision"."f";
 516    for(my $i = 0; $i < $seqct-1; $i++ ) {
 517        # (diagonals) distance is 0 for same sequence
 518        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 519        $values[$i][$i] = sprintf($precisionstr,0);
 520
 521        for( my $j = $i+1; $j < $seqct; $j++ ) {
 522            my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
 523                                                                $seqs[$j]);
 524            # just want diagonals
 525            my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
 526                      $matrix->[2]->[2] + $matrix->[3]->[3] );
 527            my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
 528            my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
 529            # fwd and rev lookup
 530            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 531            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 532            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
 533            # (diagonals) distance is 0 for same sequence
 534            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 535            $values[$j][$j] = sprintf($precisionstr,0);
 536        }
 537    }
 538    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 539                                        -matrix  => \%dist,
 540                                        -names   => \@names,
 541                                        -values  => \@values);
 542 }
 543
 544 =head2 D_F81
 545
 546  Title   : D_F81
 547  Usage   : my $d = $stat->D_F81($aln)
 548  Function: Calculates D (pairwise distance) between 2 sequences in an
 549            alignment using the Felsenstein 1981 distance model.
 550            Relaxes the assumption of equal base frequencies that is
 551            in JC.
 552  Returns : L<Bio::Matrix::PhylipDist>
 553  Args    : L<Bio::Align::AlignI> of DNA sequences
 554
 555
 556 =cut
 557
 558 sub D_F81{
 559    my ($self,$aln,$gappenalty) = @_;
 560    return 0 unless $self->_check_arg($aln);
 561    $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
 562    # ambiguities ignored at this point
 563    my (@seqs,@names,@values,%dist);
 564    my $seqct = 0;
 565    foreach my $seq ( $aln->each_seq) {
 566        push @names, $seq->display_id;;
 567        push @seqs, uc $seq->seq();
 568        $seqct++;
 569    }
 570    my $precisionstr = "%.$Precision"."f";
 571    for(my $i = 0; $i < $seqct-1; $i++ ) {
 572        # (diagonals) distance is 0 for same sequence
 573        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 574        $values[$i][$i] = sprintf($precisionstr,0);
 575
 576        for( my $j = $i+1; $j < $seqct; $j++ ) {
 577
 578            my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
 579                                                                $seqs[$j]);
 580            # just want diagonals
 581            my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
 582                      $matrix->[2]->[2] + $matrix->[3]->[3] );
 583            my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
 584            my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
 585            # fwd and rev lookup
 586            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 587            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 588            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
 589            # (diagonals) distance is 0 for same sequence
 590            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 591            $values[$j][$j] = sprintf($precisionstr,0);
 592        }
 593    }
 594    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 595                                        -matrix  => \%dist,
 596                                        -names   => \@names,
 597                                        -values  => \@values);
 598 }
 599
 600 =head2 D_Uncorrected
 601
 602  Title   : D_Uncorrected
 603  Usage   : my $d = $stats->D_Uncorrected($aln)
 604  Function: Calculate a distance D, no correction for multiple substitutions
 605            is used.  In rare cases where sequences may not overlap, 'NA' is
 606            substituted for the distance.
 607  Returns : L<Bio::Matrix::PhylipDist>
 608  Args    : L<Bio::Align::AlignI> (DNA Alignment)
 609            [optional] gap penalty
 610
 611 =cut
 612
 613 sub D_Uncorrected {
 614    my ($self,$aln,$gappenalty) = @_;
 615    $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
 616    return 0 unless $self->_check_arg($aln);
 617    # ambiguities ignored at this point
 618    my (@seqs,@names,@values,%dist);
 619    my $seqct = 0;
 620    foreach my $seq ( $aln->each_seq) {
 621        push @names, $seq->display_id;
 622        push @seqs, uc $seq->seq();
 623        $seqct++;
 624    }
 625
 626    my $precisionstr = "%.$Precision"."f";
 627    my $len = $aln->length;
 628    for( my $i = 0; $i < $seqct-1; $i++ ) {
 629        # (diagonals) distance is 0 for same sequence
 630        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 631        $values[$i][$i] = sprintf($precisionstr,0);
 632
 633        for( my $j = $i+1; $j < $seqct; $j++ ) {
 634            my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
 635                                                                $seqs[$j]);
 636            my $m = ( $matrix->[0]->[0] +
 637                      $matrix->[1]->[1] +
 638                      $matrix->[2]->[2] +
 639                      $matrix->[3]->[3] );
 640        my $denom = ( $len - $gaps + ( $gaps * $gappenalty));
 641
 642        $self->warn("No distance calculated between $names[$i] and $names[$j], inserting -1")
 643             unless $denom;
 644
 645            my $D = $denom ? 1 - ( $m / $denom) : -1;
 646            # fwd and rev lookup
 647            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 648            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 649            $values[$j][$i] = $values[$i][$j] = $denom ? sprintf($precisionstr,$D)
 650                                                   : sprintf("%-*s", $Precision + 2, $D);
 651            # (diagonals) distance is 0 for same sequence
 652            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 653            $values[$j][$j] = sprintf($precisionstr,0);
 654        }
 655    }
 656    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 657                                        -matrix  => \%dist,
 658                                        -names   => \@names,
 659                                        -values  => \@values);
 660 }
 661
 662
 663 # M Kimura, J. Mol. Evol., 1980, 16, 111.
 664
 665 =head2 D_Kimura
 666
 667  Title   : D_Kimura
 668  Usage   : my $d = $stat->D_Kimura($aln)
 669  Function: Calculates D (pairwise distance) between all pairs of sequences
 670            in an alignment using the Kimura 2 parameter model.
 671  Returns : L<Bio::Matrix::PhylipDist>
 672  Args    : L<Bio::Align::AlignI> of DNA sequences
 673
 674
 675 =cut
 676
 677 sub D_Kimura {
 678    my ($self,$aln) = @_;
 679    return 0 unless $self->_check_arg($aln);
 680    # ambiguities ignored at this point
 681    my (@names,@values,%dist);
 682    my $seqct = 0;
 683    foreach my $seq ( $aln->each_seq) {
 684        push @names, $seq->display_id;
 685        $seqct++;
 686    }
 687
 688    my $precisionstr = "%.$Precision"."f";
 689
 690    for( my $i = 0; $i < $seqct-1; $i++ ) {
 691        # (diagonals) distance is 0 for same sequence
 692        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 693        $values[$i][$i] = sprintf($precisionstr,0);
 694
 695        for( my $j = $i+1; $j < $seqct; $j++ ) {
 696            my $pairwise = $aln->select_noncont($i+1,$j+1);
 697            my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
 698            unless( $L ) {
 699                $L = 1;
 700            }
 701            my $P = $self->transitions($pairwise) / $L;
 702            my $Q = $self->transversions($pairwise) / $L;
 703            my $K = 0;
 704            my $denom = ( 1 - (2 * $P) - $Q);
 705            if( $denom == 0 ) {
 706                $self->throw("cannot find distance for ",$i+1,
 707                             ",",$j+1," $P, $Q\n");
 708            }
 709            my $a = 1 / ( 1 - (2 * $P) - $Q);
 710            my $b = 1 / ( 1 - 2 * $Q );
 711            if( $a < 0 || $b < 0 ) {
 712                $K = -1;
 713            } else{
 714                $K = (1/2) * log ( $a ) + (1/4) * log($b);
 715            }
 716            # fwd and rev lookup
 717            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 718            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 719            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
 720            # (diagonals) distance is 0 for same sequence
 721            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 722            $values[$j][$j] = sprintf($precisionstr,0);
 723        }
 724    }
 725    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 726                                        -matrix  => \%dist,
 727                                        -names   => \@names,
 728                                        -values  => \@values);
 729 }
 730
 731
 732 =head2 D_Kimura_variance
 733
 734  Title   : D_Kimura
 735  Usage   : my $d = $stat->D_Kimura_variance($aln)
 736  Function: Calculates D (pairwise distance) between all pairs of sequences
 737            in an alignment using the Kimura 2 parameter model.
 738  Returns : array of 2 L<Bio::Matrix::PhylipDist>,
 739            the first is the Kimura distance and the second is
 740            a matrix of variance V(K)
 741  Args    : L<Bio::Align::AlignI> of DNA sequences
 742
 743
 744 =cut
 745
 746 sub D_Kimura_variance {
 747    my ($self,$aln) = @_;
 748    return 0 unless $self->_check_arg($aln);
 749    # ambiguities ignored at this point
 750    my (@names,@values,%dist,@var);
 751    my $seqct = 0;
 752    foreach my $seq ( $aln->each_seq) {
 753        push @names, $seq->display_id;
 754        $seqct++;
 755    }
 756
 757    my $precisionstr = "%.$Precision"."f";
 758
 759    for( my $i = 0; $i < $seqct-1; $i++ ) {
 760        # (diagonals) distance is 0 for same sequence
 761        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 762        $values[$i][$i] = sprintf($precisionstr,0);
 763
 764        for( my $j = $i+1; $j < $seqct; $j++ ) {
 765            my $pairwise = $aln->select_noncont($i+1,$j+1);
 766            my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
 767            unless( $L ) {
 768                $L = 1;
 769            }
 770            my $P = $self->transitions($pairwise) / $L;
 771            my $Q = $self->transversions($pairwise) / $L;
 772            my ($a,$b,$K,$var_k);
 773            my $a_denom = ( 1 - (2 * $P) - $Q);
 774            my $b_denom = 1 - 2 * $Q;
 775            unless( $a_denom > 0 && $b_denom > 0 ) {
 776                $a = 1;
 777                $b = 1;
 778                $K = -1;
 779                $var_k = -1;
 780            } else {
 781                $a = 1 / $a_denom;
 782                $b = 1 / $b_denom;
 783                $K = (1/2) * log ( $a ) + (1/4) * log($b);
 784                # from Wu and Li 1985 which in turn is from Kimura 1980
 785                my $c = ( $a - $b ) / 2;
 786                my $d = ( $a + $b ) / 2;
 787                $var_k = ( $a**2 * $P + $d**2 * $Q - ( $a * $P + $d * $Q)**2 ) / $L;
 788            }
 789
 790            # fwd and rev lookup
 791            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 792            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 793            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
 794            # (diagonals) distance is 0 for same sequence
 795            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 796            $values[$j]->[$j] = sprintf($precisionstr,0);
 797
 798            $var[$j]->[$i] = $var[$i]->[$j] = sprintf($precisionstr,$var_k);
 799            $var[$j]->[$j] = $values[$j]->[$j];
 800        }
 801    }
 802    return ( Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 803                                          -matrix  => \%dist,
 804                                          -names   => \@names,
 805                                          -values  => \@values),
 806             Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 807                                          -matrix  => \%dist,
 808                                          -names   => \@names,
 809                                          -values  => \@var)
 810             );
 811 }
 812
 813
 814 #  K Tamura, Mol. Biol. Evol. 1992, 9, 678.
 815
 816 =head2 D_Tamura
 817
 818  Title   : D_Tamura
 819  Usage   : Calculates D (pairwise distance) between 2 sequences in an
 820            alignment using Tamura 1992 distance model.
 821  Returns : L<Bio::Matrix::PhylipDist>
 822  Args    : L<Bio::Align::AlignI> of DNA sequences
 823
 824
 825 =cut
 826
 827 sub D_Tamura {
 828    my ($self,$aln) = @_;
 829    return 0 unless $self->_check_arg($aln);
 830    # ambiguities ignored at this point
 831    my (@seqs,@names,@values,%dist,$i,$j);
 832    my $seqct = 0;
 833    my $length = $aln->length;
 834    foreach my $seq ( $aln->each_seq) {
 835        push @names, $seq->display_id;;
 836        push @seqs, uc $seq->seq();
 837        $seqct++;
 838    }
 839
 840    my $precisionstr = "%.$Precision"."f";
 841    my (@gap,@gc,@trans,@tranv,@score);
 842    $i = 0;
 843    for my $t1 ( @seqs ) {
 844        $j = 0;
 845        for my $t2 ( @seqs ) {
 846            $gap[$i][$j] = 0;
 847            for( my $k = 0; $k < $length; $k++ ) {
 848                my ($c1,$c2) = ( substr($seqs[$i],$k,1),
 849                                 substr($seqs[$j],$k,1) );
 850                if( $c1 =~ /^$GapChars$/ ||
 851                    $c2 =~ /^$GapChars$/ ) {
 852                    $gap[$i][$j]++;
 853                } elsif( $c2 =~ /^$GCChhars$/i ) {
 854                    $gc[$i][$j]++;
 855                }
 856            }
 857            $gc[$i][$j] = ( $gc[$i][$j] /
 858                            ($length - $gap[$i][$j]) );
 859            $j++;
 860        }
 861        $i++;
 862    }
 863
 864    for( $i = 0; $i < $seqct-1; $i++ ) {
 865        # (diagonals) distance is 0 for same sequence
 866        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 867        $values[$i][$i] = sprintf($precisionstr,0);
 868
 869        for( $j = $i+1; $j < $seqct; $j++ ) {
 870
 871            my $pairwise = $aln->select_noncont($i+1,$j+1);
 872            my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
 873            my $P = $self->transitions($pairwise) / $L;
 874            my $Q = $self->transversions($pairwise) / $L;
 875            my $C = $gc[$i][$j] + $gc[$j][$i]-
 876                ( 2 * $gc[$i][$j] * $gc[$j][$i] );
 877            if( $P ) {
 878                $P = $P / $C;
 879            }
 880            my $d = -($C * log(1- $P - $Q)) -(0.5* ( 1 - $C) * log(1 - 2 * $Q));
 881            # fwd and rev lookup
 882            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
 883            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
 884            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
 885            # (diagonals) distance is 0 for same sequence
 886            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
 887            $values[$j][$j] = sprintf($precisionstr,0);
 888        }
 889    }
 890    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
 891                                        -matrix  => \%dist,
 892                                        -names   => \@names,
 893                                        -values  => \@values);
 894
 895 }
 896
 897 =head2 D_F84
 898
 899  Title   : D_F84
 900  Usage   : my $d = $stat->D_F84($aln)
 901  Function: Calculates D (pairwise distance) between 2 sequences in an
 902            alignment using the Felsenstein 1984 distance model.
 903  Returns : L<Bio::Matrix::PhylipDist>
 904  Args    : L<Bio::Align::AlignI> of DNA sequences
 905            [optional] double - gap penalty
 906
 907 =cut
 908
 909 sub D_F84 {
 910    my ($self,$aln,$gappenalty) = @_;
 911    return 0 unless $self->_check_arg($aln);
 912    $self->throw_not_implemented();
 913    # ambiguities ignored at this point
 914    my (@seqs,@names,@values,%dist);
 915    my $seqct = 0;
 916    foreach my $seq ( $aln->each_seq) {
 917        # if there is no name,
 918        my $id = $seq->display_id;
 919        if( ! length($id) ||       # deal with empty names
 920            $id =~ /^\s+$/ ) {
 921            $id = $seqct+1;
 922        }
 923        push @names, $id;
 924        push @seqs, uc $seq->seq();
 925        $seqct++;
 926    }
 927
 928    my $precisionstr = "%.$Precision"."f";
 929
 930    for( my $i = 0; $i < $seqct-1; $i++ ) {
 931        # (diagonals) distance is 0 for same sequence
 932        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 933        $values[$i][$i] = sprintf($precisionstr,0);
 934
 935        for( my $j = $i+1; $j < $seqct; $j++ ) {
 936        }
 937    }
 938 }
 939
 940 # Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
 941 #  Tajima-Nei correction used for multiple substitutions in the calc
 942 # of the distance matrix. Nucleic acids only.
 943 #
 944 #  D = p-distance = 1 - (matches/(posns_scored + gaps)
 945 #
 946 #  distance = -b * ln(1-D/b)
 947 #
 948
 949 =head2 D_TajimaNei
 950
 951  Title   : D_TajimaNei
 952  Usage   : my $d = $stat->D_TajimaNei($aln)
 953  Function: Calculates D (pairwise distance) between 2 sequences in an
 954            alignment using the TajimaNei 1984 distance model.
 955  Returns : L<Bio::Matrix::PhylipDist>
 956  Args    : Bio::Align::AlignI of DNA sequences
 957
 958
 959 =cut
 960
 961 sub D_TajimaNei{
 962    my ($self,$aln) = @_;
 963    return 0 unless $self->_check_arg($aln);
 964    # ambiguities ignored at this point
 965    my (@seqs,@names,@values,%dist);
 966    my $seqct = 0;
 967    foreach my $seq ( $aln->each_seq) {
 968        # if there is no name,
 969        push @names, $seq->display_id;
 970        push @seqs, uc $seq->seq();
 971        $seqct++;
 972    }
 973    my $precisionstr = "%.$Precision"."f";
 974    my ($i,$j,$bs);
 975    # pairwise
 976    for( $i =0; $i < $seqct -1; $i++ ) {
 977        $dist{$names[$i]}->{$names[$i]} = [$i,$i];
 978        $values[$i][$i] = sprintf($precisionstr,0);
 979
 980        for ( $j = $i+1; $j <$seqct;$j++ ) {
 981            my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
 982                                                                $seqs[$j]);
 983            my $pairwise = $aln->select_noncont($i+1,$j+1);
 984            my $slen = $self->pairwise_stats->number_of_comparable_bases($pairwise);
 985            my $fij2 = 0;
 986            for( $bs = 0; $bs < 4; $bs++ ) {
 987                my $fi = 0;
 988                map {$fi += $matrix->[$bs]->[$_] } 0..3;
 989                my $fj = 0;
 990                # summation
 991                map { $fj += $matrix->[$_]->[$bs] } 0..3;
 992                my $fij = ( $fi && $fj ) ? ($fi + $fj) /( 2 * $slen) : 0;
 993                $fij2 += $fij**2;
 994            }
 995
 996            my ($pair,$h) = (0,0);
 997            for( $bs = 0; $bs < 3; $bs++ ) {
 998                for(my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) {
 999                    my $fij = $pfreq->[$pair++] / $slen;
1000                    if( $fij ) {
1001
1002                        my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0);
1003
1004                        map { $ci1 += $matrix->[$_]->[$bs] } 0..3;
1005                        map { $cj1 += $matrix->[$bs]->[$_] } 0..3;
1006                        map { $ci2 += $matrix->[$_]->[$bs1] } 0..3;
1007                        map { $cj2 += $matrix->[$bs1]->[$_] } 0..3;
1008
1009                        if( $fij ) {
1010                            $h += ( ($fij**2) / 2 ) /
1011                                (  ( ( $ci1 + $cj1 ) / (2 * $slen) ) *
1012                                   ( ( $ci2 + $cj2 ) / (2 * $slen) )
1013                                   );
1014                        }
1015                        $self->debug( "slen is $slen h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n");
1016                    }
1017                }
1018            }
1019            # just want diagonals which are matches (A matched A, C -> C)
1020
1021            my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
1022                      $matrix->[2]->[2] + $matrix->[3]->[3] );
1023            my $D = 1 - ( $m / $slen);
1024            my $d;
1025            if( $h == 0 ) {
1026                $d = -1;
1027            } else {
1028                my $b = (1 - $fij2 + (($D**2)/$h)) / 2;
1029                my $c = 1- $D/ $b;
1030
1031                if( $c < 0 ) {
1032                    $d = -1;
1033                } else {
1034                    $d = (-1 * $b) * log ( $c);
1035                }
1036            }
1037            # fwd and rev lookup
1038            $dist{$names[$i]}->{$names[$j]} = [$i,$j];
1039            $dist{$names[$j]}->{$names[$i]} = [$i,$j];
1040            $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
1041
1042            # (diagonals) distance is 0 for same sequence
1043            $dist{$names[$j]}->{$names[$j]} = [$j,$j];
1044            $values[$j][$j] = sprintf($precisionstr,0);
1045        }
1046    }
1047    return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
1048                                        -matrix  => \%dist,
1049                                        -names   => \@names,
1050                                        -values  => \@values);
1051
1052 }
1053
1054 # Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
1055
1056 =head2 D_JinNei
1057
1058  Title   : D_JinNei
1059  Usage   : my $d = $stat->D_JinNei($aln)
1060  Function: Calculates D (pairwise distance) between 2 sequences in an
1061            alignment using the Jin-Nei 1990 distance model.
1062  Returns : L<Bio::Matrix::PhylipDist>
1063  Args    : L<Bio::Align::AlignI> of DNA sequences
1064
1065
1066 =cut
1067
1068 sub D_JinNei{
1069    my ($self,@args) = @_;
1070    $self->warn("JinNei implementation not completed");
1071    return;
1072 }
1073
1074 =head2 transversions
1075
1076  Title   : transversions
1077  Usage   : my $transversions = $stats->transversion($aln);
1078  Function: Calculates the number of transversions between two sequences in
1079            an alignment
1080  Returns : integer
1081  Args    : Bio::Align::AlignI
1082
1083
1084 =cut
1085
1086 sub transversions{
1087    my ($self,$aln) = @_;
1088    return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'});
1089 }
1090
1091 =head2 transitions
1092
1093  Title   : transitions
1094  Usage   : my $transitions = Bio::Align::DNAStatistics->transitions($aln);
1095  Function: Calculates the number of transitions in a given DNA alignment
1096  Returns : integer representing the number of transitions
1097  Args    : Bio::Align::AlignI object
1098
1099
1100 =cut
1101
1102 sub transitions{
1103    my ($self,$aln) = @_;
1104    return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'});
1105 }
1106
1107
1108 sub _trans_count_helper {
1109     my ($self,$aln,$type) = @_;
1110     return 0 unless( $self->_check_arg($aln) );
1111     if( ! $aln->is_flush ) { $self->throw("must be flush") }
1112     my (@tcount);
1113     my ($first,$second) = ( uc $aln->get_seq_by_pos(1)->seq(),
1114                             uc $aln->get_seq_by_pos(2)->seq() );
1115     my $alen = $aln->length;
1116     for (my $i = 0;$i<$alen; $i++ ) {
1117         my ($c1,$c2) = ( substr($first,$i,1),
1118                          substr($second,$i,1) );
1119         if( $c1 ne $c2 ) {
1120             foreach my $nt ( @{$type->{$c1}} ) {
1121                 if( $nt eq $c2) {
1122                    $tcount[$i]++;
1123                }
1124             }
1125         }
1126     }
1127     my $sum = 0;
1128     map { if( $_) { $sum += $_} } @tcount;
1129     return $sum;
1130 }
1131
1132 # this will generate a matrix which records across the row, the number
1133 # of DNA subst
1134 #
1135 sub _build_nt_matrix {
1136     my ($self,$seqa,$seqb) = @_;
1137
1138
1139     my $basect_matrix = [ [ qw(0 0 0 0) ],  # number of bases that match
1140                           [ qw(0 0 0 0) ],
1141                           [ qw(0 0 0 0) ],
1142                           [ qw(0 0 0 0) ] ];
1143     my $gaps = 0;                           # number of gaps
1144     my $pfreq = [ qw( 0 0 0 0 0 0)];        # matrix for pair frequency
1145     my $len_a = length($seqa);
1146     for( my $i = 0; $i < $len_a; $i++) {
1147         my ($ti,$tj) = (substr($seqa,$i,1),substr($seqb,$i,1));
1148         $ti =~ tr/U/T/;
1149         $tj =~ tr/U/T/;
1150
1151         if( $ti =~ /^$GapChars$/) { $gaps++; next; }
1152         if( $tj =~ /^$GapChars$/) { $gaps++; next }
1153
1154         my $ti_index = $NucleotideIndexes{$ti};
1155         my $tj_index = $NucleotideIndexes{$tj};
1156
1157         if( ! defined $ti_index ) {
1158             $self->warn("ti_index not defined for $ti\n");
1159             next;
1160         }
1161
1162         $basect_matrix->[$ti_index]->[$tj_index]++;
1163
1164         if( $ti ne $tj ) {
1165             $pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++;
1166         }
1167     }
1168     return ($basect_matrix,$pfreq,$gaps);
1169 }
1170
1171 sub _check_ambiguity_nucleotide {
1172     my ($base1,$base2) = @_;
1173     my %iub = Bio::Tools::IUPAC->iupac_iub();
1174     my @amb1 = @{ $iub{uc($base1)} };
1175     my @amb2 = @{ $iub{uc($base2)} };
1176     my ($pmatch) = (0);
1177     for my $amb ( @amb1 ) {
1178         if( grep { $amb eq $_ } @amb2 ) {
1179             $pmatch = 1;
1180             last;
1181         }
1182     }
1183     if( $pmatch ) {
1184         return (1 / scalar @amb1) * (1 / scalar @amb2);
1185     } else {
1186         return 0;
1187     }
1188 }
1189
1190
1191 sub _check_arg {
1192     my($self,$aln ) = @_;
1193     if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
1194         $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
1195         return 0;
1196     } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) {
1197         $self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
1198         return 0;
1199     }
1200     return 1;
1201 }
1202
1203 =head2 Data Methods
1204
1205 =cut
1206
1207 =head2 pairwise_stats
1208
1209  Title   : pairwise_stats
1210  Usage   : $obj->pairwise_stats($newval)
1211  Function:
1212  Returns : value of pairwise_stats
1213  Args    : newvalue (optional)
1214
1215
1216 =cut
1217
1218 sub pairwise_stats{
1219    my ($self,$value) = @_;
1220    if( defined $value) {
1221       $self->{'_pairwise_stats'} = $value;
1222     }
1223     return $self->{'_pairwise_stats'};
1224
1225 }
1226
1227 =head2 calc_KaKs_pair
1228
1229  Title    : calc_KaKs_pair
1230  Useage   : my $results = $stats->calc_KaKs_pair($alnobj,
1231             $name1, $name2).
1232  Function : calculates Nei-Gojobori statistics for pairwise
1233             comparison.
1234  Args     : A Bio::Align::AlignI compliant object such as a
1235             Bio::SimpleAlign object, and 2 sequence name strings.
1236  Returns  : a reference to a hash of statistics with keys as
1237             listed in Description.
1238
1239 =cut
1240
1241 sub calc_KaKs_pair {
1242     my ( $self, $aln, $seq1_id, $seq2_id) = @_;
1243     $self->throw("Needs 3 arguments - an alignment object, and 2 sequence ids")
1244         if @_!= 4;
1245     $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1246     my @seqs = (
1247                 #{id => $seq1_id, seq =>($aln->each_seq_with_id($seq1_id))[0]->seq},
1248                 #{id => $seq2_id, seq =>($aln->each_seq_with_id($seq2_id))[0]->seq}
1249                 {id => $seq1_id, seq => uc(($aln->each_seq_with_id($seq1_id))[0]->seq)},
1250                 {id => $seq2_id, seq => uc(($aln->each_seq_with_id($seq2_id))[0]->seq)}
1251                ) ;
1252     if (length($seqs[0]{'seq'}) != length($seqs[1]{'seq'})) {
1253         $self->throw(" aligned sequences must be of equal length!");
1254     }
1255     my $results = [];
1256     $self->_get_av_ds_dn(\@seqs, $results);
1257     return $results;
1258
1259 }
1260
1261 =head2 calc_all_KaKs_pairs
1262
1263  Title    : calc_all_KaKs_pairs
1264  Useage   : my $results2 = $stats->calc_KaKs_pair($alnobj).
1265  Function : Calculates Nei_gojobori statistics for all pairwise
1266             combinations in sequence.
1267  Arguments: A Bio::Align::ALignI compliant object such as
1268             a Bio::SimpleAlign object.
1269  Returns  : A reference to an array of hashes of statistics of
1270             all pairwise comparisons in the alignment.
1271
1272 =cut
1273
1274
1275
1276 sub calc_all_KaKs_pairs {
1277 #returns a multi_element_array with all pairwise comparisons
1278         my ($self,$aln) = @_;
1279         $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1280         my @seqs;
1281         for my $seq ($aln->each_seq) {
1282                 push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1283                 }
1284         my $results ;
1285         $results = $self->_get_av_ds_dn(\@seqs, $results);
1286         return $results;
1287 }
1288
1289 =head2 calc_average_KaKs
1290
1291  Title    : calc_average_KaKs.
1292  Useage   : my $res= $stats->calc_average_KaKs($alnobj, 1000).
1293  Function : calculates Nei_Gojobori stats for average of all
1294             sequences in the alignment.
1295  Args     : A Bio::Align::AlignI compliant object such as a
1296             Bio::SimpleAlign object, number of bootstrap iterations
1297             (default 1000).
1298  Returns  : A reference to a hash of statistics as listed in Description.
1299
1300 =cut
1301
1302 sub calc_average_KaKs {
1303 #calculates global value for sequences in alignment using bootstrapping
1304 #this is quite slow (~10 seconds per  3 X 200nt seqs);
1305     my ($self, $aln, $bootstrap_rpt) = @_;
1306     $bootstrap_rpt ||= 1000;
1307     $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1308     my @seqs;
1309     for my $seq ($aln->each_seq) {
1310         push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1311     }
1312     my $results ;
1313     my ($ds_orig, $dn_orig) = $self->_get_av_ds_dn(\@seqs);
1314     #print "ds = $ds_orig, dn = $dn_orig\n";
1315     $results = {D_s => $ds_orig, D_n => $dn_orig};
1316     $self->_run_bootstrap(\@seqs, $results, $bootstrap_rpt);
1317     return $results;
1318 }
1319
1320 ############## primary internal subs for alignment comparisons ########################
1321
1322 sub _run_bootstrap {
1323     ### generates sampled sequences, calculates Ds and Dn values,
1324     ### then calculates variance of sampled sequences and add results to results hash
1325     ###
1326     my ($self,$seq_ref, $results, $bootstrap_rpt) = @_;
1327     my @seqs = @$seq_ref;
1328     my @btstrp_aoa; # to hold array of array of nucleotides for resampling
1329     my %bootstrap_values = (ds => [], dn =>[]); # to hold list of av values
1330
1331     #1st make alternative array of codons;
1332     my $c = 0;
1333     while ($c < length $seqs[0]{'seq'}) {
1334         for (0..$#seqs) {
1335             push @{$btstrp_aoa[$_]}, substr ($seqs[$_]{'seq'}, $c, 3);
1336         }
1337         $c+=3;
1338     }
1339
1340     for (1..$bootstrap_rpt) {
1341         my $sampled = _resample (\@btstrp_aoa);
1342         my ($ds, $dn) = $self->_get_av_ds_dn ($sampled) ; # is array ref
1343         push @{$bootstrap_values{'ds'}}, $ds;
1344         push @{$bootstrap_values{'dn'}}, $dn;
1345     }
1346
1347     $results->{'D_s_var'} = sampling_variance($bootstrap_values{'ds'});
1348     $results->{'D_n_var'} = sampling_variance($bootstrap_values{'dn'});
1349     $results->{'z_score'} =     ($results->{'D_n'} - $results->{'D_s'}) /
1350         sqrt($results->{'D_s_var'} + $results->{'D_n_var'} );
1351     #print "bootstrapped var_syn =      $results->{'D_s_var'} \n" ;
1352     #print "bootstrapped var_nc =       $results->{'D_n_var'} \n";
1353     #print "z is $results->{'z_score'}\n";      ### end of global set up of/perm look up data
1354 }
1355
1356 sub _resample {
1357     my $ref = shift;
1358     my $codon_num = scalar (@{$ref->[0]});
1359     my @altered;
1360     for (0..$codon_num -1) {    #for each codon
1361         my $rand = int (rand ($codon_num));
1362         for (0..$#$ref) {
1363             push @{$altered[$_]}, $ref->[$_][$rand];
1364         }
1365     }
1366     my @stringed = map {join '', @$_}@altered;
1367     my @return;
1368     #now out in random name to keep other subs happy
1369     for (@stringed) {
1370         push @return, {id=>'1', seq=> $_};
1371     }
1372     return \@return;
1373 }
1374
1375 sub _get_av_ds_dn {
1376     # takes array of hashes of sequence strings and ids   #
1377     my $self = shift;
1378     my $seq_ref = shift;
1379     my $result = shift if @_;
1380     my @caller = caller(1);
1381     my @seqarray = @$seq_ref;
1382     my $bootstrap_score_list;
1383     #for a multiple alignment considers all pairwise combinations#
1384     my %dsfor_average = (ds => [], dn => []);
1385     for (my $i = 0; $i < scalar @seqarray; $i++) {
1386         for (my $j = $i +1; $j<scalar @seqarray; $j++ ){
1387 #                       print "comparing $i and $j\n";
1388             if (length($seqarray[$i]{'seq'}) != length($seqarray[$j]{'seq'})) {
1389                 $self->warn(" aligned sequences must be of equal length!");
1390                 next;
1391             }
1392
1393             my $syn_site_count = count_syn_sites($seqarray[$i]{'seq'}, $synsites);
1394             my $syn_site_count2 = count_syn_sites($seqarray[$j]{'seq'}, $synsites);
1395 #                       print "syn 1 is $syn_site_count , syn2 is $syn_site_count2\n";
1396             my ($syn_count, $non_syn_count, $gap_cnt) = analyse_mutations($seqarray[$i]{'seq'}, $seqarray[$j]{'seq'});
1397             #get averages
1398             my $av_s_site = ($syn_site_count + $syn_site_count2)/2;
1399             my $av_ns_syn_site = length($seqarray[$i]{'seq'}) - $gap_cnt- $av_s_site ;
1400
1401             #calculate ps and pn  (p54)
1402             my $syn_prop = $syn_count / $av_s_site;
1403             my $nc_prop = $non_syn_count / $av_ns_syn_site      ;
1404
1405             #now use jukes/cantor to calculate D_s and D_n, would alter here if needed a different method
1406             my $d_syn = $self->jk($syn_prop);
1407             my $d_nc = $self->jk($nc_prop);
1408
1409             #JK calculation must succeed for continuation of calculation
1410             #ret_value = -1 if error
1411             next unless $d_nc >=0 && $d_syn >=0;
1412
1413
1414             push @{$dsfor_average{'ds'}}, $d_syn;
1415             push @{$dsfor_average{'dn'}}, $d_nc;
1416
1417             #if not doing bootstrap, calculate the pairwise comparisin stats
1418             if ($caller[3] =~ /calc_KaKs_pair/ || $caller[3] =~ /calc_all_KaKs_pairs/) {
1419                                 #now calculate variances assuming large sample
1420                 my $d_syn_var =  jk_var($syn_prop, length($seqarray[$i]{'seq'})  - $gap_cnt );
1421                 my $d_nc_var =  jk_var($nc_prop, length ($seqarray[$i]{'seq'}) - $gap_cnt);
1422                 #now calculate z_value
1423                 #print "d_syn_var is  $d_syn_var,and d_nc_var is $d_nc_var\n";
1424                 #my $z = ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var);
1425                 my $z = ($d_syn_var + $d_nc_var) ?
1426                   ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var) : 0;
1427                 #       print "z is $z\n";
1428                 push @$result , {S => $av_s_site, N=>$av_ns_syn_site,
1429                                  S_d => $syn_count, N_d =>$non_syn_count,
1430                                  P_s => $syn_prop, P_n=>$nc_prop,
1431                                  D_s => @{$dsfor_average{'ds'}}[-1],
1432                                  D_n => @{$dsfor_average{'dn'}}[-1],
1433                                  D_n_var =>$d_nc_var, D_s_var => $d_syn_var,
1434                                  Seq1 => $seqarray[$i]{'id'},
1435                                  Seq2 => $seqarray[$j]{'id'},
1436                                  z_score => $z,
1437                              };
1438                 $self->warn (" number of mutations too small to justify normal test for  $seqarray[$i]{'id'} and $seqarray[$j]{'id'}\n- use Fisher's exact, or bootstrap a MSA")
1439                     if ($syn_count < 10 || $non_syn_count < 10 ) && $self->verbose > -1 ;
1440             }#endif
1441             }
1442     }
1443
1444     #warn of failure if no results hashes are present
1445     #will fail if Jukes Cantor has failed for all pairwise combinations
1446     #$self->warn("calculation failed!") if scalar @$result ==0;
1447
1448     #return results unless bootstrapping
1449     return $result if $caller[3]=~ /calc_all_KaKs/ || $caller[3] =~ /calc_KaKs_pair/;
1450     #else if getting average for bootstrap
1451     return( mean ($dsfor_average{'ds'}),mean ($dsfor_average{'dn'})) ;
1452 }
1453
1454
1455 sub jk {
1456     my ($self, $p) = @_;
1457     if ($p > 0.75) {
1458         $self->warn( " Jukes Cantor won't  work -too divergent!");
1459         return -1;
1460     }
1461     return -1 * (3/4) * (log(1 - (4/3) * $p));
1462 }
1463
1464 #works for large value of n (50?100?)
1465 sub jk_var {
1466     my ($p, $n) = @_;
1467     return (9 * $p * (1 -$p))/(((3 - 4 *$p) **2) * $n);
1468 }
1469
1470
1471 # compares 2 sequences to find the number of synonymous/non
1472 # synonymous mutations between them
1473
1474 sub analyse_mutations {
1475     my ($seq1, $seq2) = @_;
1476     my %mutator = ( 2=> {0=>[[1,2],  # codon positions to be altered
1477                              [2,1]], # depend on which is the same
1478                          1=>[[0,2],
1479                              [2,0]],
1480                          2=>[[0,1],
1481                              [1,0]],
1482                      },
1483                     3=> [ [0,1,2],  # all need to be altered
1484                           [1,0,2],
1485                           [0,2,1],
1486                           [1,2,0],
1487                           [2,0,1],
1488                           [2,1,0] ],
1489                     );
1490     my $TOTAL   = 0;    # total synonymous changes
1491     my $TOTAL_n = 0;    # total non-synonymous changes
1492     my $gap_cnt = 0;
1493
1494     my %input;
1495     my $seqlen = length($seq1);
1496     for (my $j=0; $j< $seqlen; $j+=3) {
1497         $input{'cod1'} = substr($seq1, $j,3);
1498         $input{'cod2'} = substr($seq2, $j,3);
1499
1500         #ignore codon if beeing compared with gaps!
1501         if ($input{'cod1'} =~ /\-/ || $input{'cod2'} =~ /\-/){
1502             $gap_cnt += 3; #just increments once if there is a pair of gaps
1503             next;
1504         }
1505
1506         my ($diff_cnt, $same) = count_diffs(\%input);
1507
1508         #ignore if codons are identical
1509         next if $diff_cnt == 0 ;
1510         if ($diff_cnt == 1) {
1511             $TOTAL += $synchanges{$input{'cod1'}}{$input{'cod2'}};
1512             $TOTAL_n += 1 - $synchanges{$input{'cod1'}}{$input{'cod2'}};
1513             #print " \nfordiff is 1 , total now $TOTAL, total n now $TOTAL_n\n\n"
1514         }
1515         elsif ($diff_cnt ==2) {
1516             my $s_cnt = 0;
1517             my $n_cnt = 0;
1518             my $tot_muts = 4;
1519             #will stay 4 unless there are stop codons at intervening point
1520           OUTER:for my $perm (@{$mutator{'2'}{$same}}) {
1521               my $altered = $input{'cod1'};
1522               my $prev= $altered;
1523               #         print "$prev -> (", $t[$CODONS->{$altered}], ")";
1524               for       my $mut_i (@$perm) { #index of codon mutated
1525                   substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1526                   if ($t[$CODONS->{$altered}] eq '*') {
1527                       $tot_muts -=2;
1528                       #print "changes to stop codon!!\n";
1529                       next OUTER;
1530                   }
1531                   else {
1532                       $s_cnt += $synchanges{$prev}{$altered};
1533                       #                                 print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1534                   }
1535                   $prev = $altered;
1536               }
1537               #         print "\n";
1538           }
1539             if ($tot_muts != 0) {
1540                 $TOTAL += ($s_cnt/($tot_muts/2));
1541                 $TOTAL_n += ($tot_muts - $s_cnt)/ ($tot_muts / 2);
1542             }
1543
1544         }
1545         elsif ($diff_cnt ==3 ) {
1546             my $s_cnt = 0;
1547             my $n_cnt = 0;
1548             my $tot_muts = 18;  #potential number  of mutations
1549           OUTER: for my $perm (@{$mutator{'3'}}) {
1550               my $altered = $input{'cod1'};
1551               my $prev= $altered;
1552               # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1553               for my $mut_i (@$perm) { #index of codon mutated
1554                   substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1555                   if ($t[$CODONS->{$altered}] eq '*') {
1556                       $tot_muts -=3;
1557                       # print "changes to stop codon!!\n";
1558                       next OUTER;
1559
1560                   }
1561                   else {
1562                       $s_cnt += $synchanges{$prev}{$altered};
1563                       #                 print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1564                   }
1565                   $prev = $altered;
1566               }
1567               # print "\n";
1568
1569           }#end OUTER loop
1570               #calculate number of synonymous/non synonymous mutations for that codon
1571               # and add to total
1572               if ($tot_muts != 0) {
1573                   $TOTAL += ($s_cnt / ($tot_muts /3));
1574                   $TOTAL_n += 3 - ($s_cnt / ($tot_muts /3));
1575               }
1576         }                       #endif $diffcnt = 3
1577     }                           #end of sequencetraversal
1578     return ($TOTAL, $TOTAL_n, $gap_cnt);
1579 }
1580
1581
1582 sub count_diffs {
1583     #counts the number of nucleotide differences between 2 codons
1584     # returns this value plus the codon index of which nucleotide is the same when 2
1585     #nucleotides are different. This is so analyse_mutations() knows which nucleotides
1586     # to change.
1587     my $ref = shift;
1588     my $cnt = 0;
1589     my $same= undef;
1590     #just for 2 differences
1591     for (0..2) {
1592         if (substr($ref->{'cod1'}, $_,1) ne substr($ref->{'cod2'}, $_, 1)){
1593             $cnt++;
1594         } else {
1595             $same = $_;
1596         }
1597     }
1598     return ($cnt, $same);
1599 }
1600
1601 =head2 get_syn_changes
1602
1603  Title   : get_syn_changes
1604  Usage   : Bio::Align::DNAStatitics->get_syn_changes
1605  Function: Generate a hashref of all pairwise combinations of codns
1606            differing by 1
1607  Returns : Symetic matrix using hashes
1608            First key is codon
1609            and each codon points to a hashref of codons
1610            the values of which describe type of change.
1611            my $type = $hash{$codon1}->{$codon2};
1612            values are :
1613              1   synonymous
1614              0   non-syn
1615             -1   either codon is a stop codon
1616  Args    : none
1617
1618 =cut
1619
1620 sub get_syn_changes {
1621 #hash of all pairwise combinations of codons differing by 1
1622 # 1 = syn, 0 = non-syn, -1 = stop
1623     my %results;
1624     my @codons = _make_codons ();
1625     my $arr_len = scalar @codons;
1626     for (my $i = 0; $i < $arr_len -1; $i++) {
1627         my $cod1 = $codons[$i];
1628         for (my $j = $i +1; $j < $arr_len; $j++) {
1629             my $diff_cnt = 0;
1630             for my $pos(0..2) {
1631                 $diff_cnt++ if substr($cod1, $pos, 1) ne substr($codons[$j], $pos, 1);
1632             }
1633             next if $diff_cnt !=1;
1634
1635             #synon change
1636             if($t[$CODONS->{$cod1}] eq $t[$CODONS->{$codons[$j]}]) {
1637                 $results{$cod1}{$codons[$j]} =1;
1638                 $results{$codons[$j]}{$cod1} = 1;
1639             }
1640             #stop codon
1641             elsif ($t[$CODONS->{$cod1}] eq '*' or $t[$CODONS->{$codons[$j]}] eq '*') {
1642                 $results{$cod1}{$codons[$j]} = -1;
1643                 $results{$codons[$j]}{$cod1} = -1;
1644             }
1645             # nc change
1646             else {
1647                 $results{$cod1}{$codons[$j]} = 0;
1648                 $results{$codons[$j]}{$cod1} = 0;
1649             }
1650         }
1651     }
1652     return %results;
1653 }
1654
1655 =head2 dnds_pattern_number
1656
1657  Title   : dnds_pattern_number
1658  Usage   : my $patterns = $stats->dnds_pattern_number($alnobj);
1659  Function: Counts the number of codons with no gaps in the MSA
1660  Returns : Number of codons with no gaps ('patterns' in PAML notation)
1661  Args    : A Bio::Align::AlignI compliant object such as a
1662             Bio::SimpleAlign object.
1663
1664 =cut
1665
1666 sub dnds_pattern_number{
1667     my ($self, $aln) = @_;
1668     return ($aln->remove_gaps->length)/3;
1669 }
1670
1671 sub count_syn_sites {
1672     #counts the number of possible synonymous changes for sequence
1673     my ($seq, $synsite) = @_;
1674     __PACKAGE__->throw("not integral number of codons") if length($seq) % 3 != 0;
1675     my $S = 0;
1676     for (my $i = 0; $i< length($seq); $i+=3) {
1677         my $cod = substr($seq, $i, 3);
1678         next if $cod =~ /\-/;   #deal with alignment gaps
1679         $S +=  $synsite->{$cod}{'s'};
1680     }
1681     #print "S is $S\n";
1682     return $S;
1683 }
1684
1685
1686
1687 sub get_syn_sites {
1688     #sub to generate lookup hash for the number of synonymous changes per codon
1689     my @nucs = qw(T C A G);
1690     my %raw_results;
1691     for my $i (@nucs) {
1692         for my $j (@nucs) {
1693             for my $k (@nucs) {
1694                 # for each possible codon
1695                 my $cod = "$i$j$k";
1696                 my $aa = $t[$CODONS->{$cod}];
1697                 #calculate number of synonymous mutations vs non syn mutations
1698                 for my $i (qw(0 1 2)){
1699                     my $s = 0;
1700                     my $n = 3;
1701                     for my $nuc (qw(A T C G)) {
1702                         next if substr ($cod, $i,1) eq $nuc;
1703                         my $test = $cod;
1704                         substr($test, $i, 1) = $nuc ;
1705                         if ($t[$CODONS->{$test}] eq $aa) {
1706                             $s++;
1707                         }
1708                         if ($t[$CODONS->{$test}] eq '*') {
1709                             $n--;
1710                         }
1711                     }
1712                     $raw_results{$cod}[$i] = {'s' => $s ,
1713                                               'n' => $n };
1714                 }
1715
1716             } #end analysis of single codon
1717         }
1718     } #end analysis of all codons
1719     my %final_results;
1720
1721     for my $cod (sort keys %raw_results) {
1722         my $t = 0;
1723         map{$t += ($_->{'s'} /$_->{'n'})} @{$raw_results{$cod}};
1724         $final_results{$cod} = { 's'=>$t, 'n' => 3 -$t};
1725     }
1726     return \%final_results;
1727 }
1728
1729 sub _make_codons {
1730 #makes all codon combinations, returns array of them
1731     my @nucs = qw(T C A G);
1732     my @codons;
1733     for my $i (@nucs) {
1734         for my $j (@nucs) {
1735             for my $k (@nucs) {
1736                 push @codons, "$i$j$k";
1737             }
1738         }
1739     }
1740     return @codons;
1741 }
1742
1743 sub get_codons {
1744  #generates codon translation look up table#
1745  my $x = 0;
1746  my  $CODONS = {};
1747  for my $codon (_make_codons) {
1748      $CODONS->{$codon} = $x;
1749      $x++;
1750  }
1751  return $CODONS;
1752 }
1753
1754 #########stats subs, can go in another module? Here for speed. ###
1755 sub mean {
1756     my $ref = shift;
1757     my $el_num = scalar @$ref;
1758     my $tot = 0;
1759     map{$tot += $_}@$ref;
1760     return ($tot/$el_num);
1761 }
1762
1763 sub variance {
1764     my $ref = shift;
1765     my $mean = mean($ref);
1766     my $sum_of_squares = 0;
1767     map{$sum_of_squares += ($_ - $mean) **2}@$ref;
1768     return $sum_of_squares;
1769 }
1770
1771 sub sampling_variance {
1772     my $ref = shift;
1773     return variance($ref) / (scalar @$ref -1);
1774 }
1775
1776 1;