2 # BioPerl module for Bio::Align::DNAStatistics
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-AT-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
16 Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment
21 use Bio::Align::DNAStatistics;
23 my $stats = Bio::Align::DNAStatistics->new();
24 my $alignin = Bio::AlignIO->new(-format => 'emboss',
25 -file => 't/data/insulin.water');
26 my $aln = $alignin->next_aln;
27 my $jcmatrix = $stats->distance(-align => $aln,
28 -method => 'Jukes-Cantor');
30 print $jcmatrix->print_matrix;
31 ## and for measurements of synonymous /nonsynonymous substitutions ##
33 my $in = Bio::AlignIO->new(-format => 'fasta',
34 -file => 't/data/nei_gojobori_test.aln');
35 my $alnobj = $in->next_aln;
36 my ($seq1id,$seq2id) = map { $_->display_id } $alnobj->each_seq;
37 my $results = $stats->calc_KaKs_pair($alnobj, $seq1id, $seq2id);
38 print "comparing ".$results->[0]{'Seq1'}." and ".$results->[0]{'Seq2'}."\n";
39 for (sort keys %{$results->[0]} ){
41 printf("%-9s %.4f \n",$_ , $results->[0]{$_});
44 my $results2 = $stats->calc_all_KaKs_pairs($alnobj);
45 for my $an (@$results2){
46 print "comparing ". $an->{'Seq1'}." and ". $an->{'Seq2'}. " \n";
47 for (sort keys %$an ){
49 printf("%-9s %.4f \n",$_ , $an->{$_});
54 my $result3 = $stats->calc_average_KaKs($alnobj, 1000);
55 for (sort keys %$result3 ){
57 printf("%-9s %.4f \n",$_ , $result3->{$_});
62 This object contains routines for calculating various statistics and
63 distances for DNA alignments. The routines are not well tested and do
64 contain errors at this point. Work is underway to correct them, but
65 do not expect this code to give you the right answer currently! Use
66 dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the
70 Several different distance method calculations are supported. Listed
71 in brackets are the pattern which will match
77 JukesCantor [jc|jukes|jukescantor|jukes-cantor]
81 Uncorrected [jcuncor|uncorrected]
89 Kimura [k2|k2p|k80|kimura]
93 Tamura [t92|tamura|tamura92]
97 F84 [f84|felsenstein84]
101 TajimaNei [tajimanei|tajima\-nei]
105 JinNei [jinnei|jin\-nei] (not implemented)
109 There are also three methods to calculate the ratio of synonymous to
110 non-synonymous mutations. All are implementations of the Nei-Gojobori
111 evolutionary pathway method and use the Jukes-Cantor method of
112 nucleotide substitution. This method works well so long as the
113 nucleotide frequencies are roughly equal and there is no significant
114 transition/transversion bias. In order to use these methods there are
115 several pre-requisites for the alignment.
121 DNA alignment must be based on protein alignment. Use the subroutine
122 L<Bio::Align::Utilities/aa_to_dna_aln> to achieve this.
126 Therefore alignment gaps must be in multiples of 3 (representing an aa
127 deletion/insertion) and at present must be indicated by a '-' symbol.
131 Alignment must be solely of coding region and be in reading frame 0 to
132 achieve meaningful results
136 Alignment must therefore be a multiple of 3 nucleotides long.
140 All sequences must be the same length (including gaps). This should be
141 the case anyway if the sequences have been automatically aligned using
142 a program like Clustal.
146 Only the standard codon alphabet is supported at present.
150 calc_KaKs_pair() calculates a number of statistics for a named pair of
151 sequences in the alignment.
153 calc_all_KaKs_pairs() calculates these statistics for all pairwise
154 comparisons in an MSA. The statistics returned are:
160 S_d - Number of synonymous mutations between the 2 sequences.
164 N_d - Number of non-synonymous mutations between the 2 sequences.
168 S - Mean number of synonymous sites in both sequences.
172 N - mean number of synonymous sites in both sequences.
176 P_s - proportion of synonymous differences in both sequences given by
181 P_n - proportion of non-synonymous differences in both sequences given
186 D_s - estimation of synonymous mutations per synonymous site (by
191 D_n - estimation of non-synonymous mutations per non-synonymous site (by
196 D_n_var - estimation of variance of D_n .
200 D_s_var - estimation of variance of S_n.
204 z_value - calculation of z value.Positive value indicates D_n E<gt> D_s,
205 negative value indicates D_s E<gt> D_n.
209 The statistics returned by calc_average_KaKs are:
215 D_s - Average number of synonymous mutations/synonymous site.
219 D_n - Average number of non-synonymous mutations/non-synonymous site.
223 D_s_var - Estimated variance of Ds from bootstrapped alignments.
227 D_n_var - Estimated variance of Dn from bootstrapped alignments.
231 z_score - calculation of z value. Positive value indicates D_n E<gt>D_s,
232 negative values vice versa.
236 The design of the code is based around the explanation of the
237 Nei-Gojobori algorithm in the excellent book "Molecular Evolution and
238 Phylogenetics" by Nei and Kumar, published by Oxford University
239 Press. The methods have been tested using the worked example 4.1 in
240 the book, and reproduce those results. If people like having this sort
241 of analysis in BioPerl other methods for estimating Ds and Dn can be
244 Much of the DNA distance code is based on implementations in EMBOSS
245 (Rice et al, www.emboss.org) [distmat.c] and PHYLIP (J. Felsenstein et
246 al) [dnadist.c]. Insight also gained from Eddy, Durbin, Krogh, &
257 "Phylogenetic Inference", Swoffrod, Olsen, Waddell and Hillis, in
258 Mol. Systematics, 2nd ed, 1996, Ch 11. Derived from "Evolution of
259 Protein Molecules", Jukes & Cantor, in Mammalian Prot. Metab., III,
266 K Tamura, Mol. Biol. Evol. 1992, 9, 678.
272 M Kimura, J. Mol. Evol., 1980, 16, 111.
278 Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
284 Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
292 User feedback is an integral part of the evolution of this and other
293 Bioperl modules. Send your comments and suggestions preferably to
294 the Bioperl mailing list. Your participation is much appreciated.
296 bioperl-l@bioperl.org - General discussion
297 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
301 Please direct usage questions or support issues to the mailing list:
303 I<bioperl-l@bioperl.org>
305 rather than to the module maintainer directly. Many experienced and
306 reponsive experts will be able look at the problem and quickly
307 address it. Please include a thorough description of the problem
308 with code and data examples if at all possible.
310 =head2 Reporting Bugs
312 Report bugs to the Bioperl bug tracking system to help us keep track
313 of the bugs and their resolution. Bug reports can be submitted via the
316 https://github.com/bioperl/bioperl-live/issues
318 =head1 AUTHOR - Jason Stajich
320 Email jason-AT-bioperl.org
324 Richard Adams, richard.adams@ed.ac.uk
328 The rest of the documentation details each of the object methods.
329 Internal methods are usually preceded with a _
334 # Let the code begin...
337 package Bio
::Align
::DNAStatistics
;
338 use vars
qw(%DNAChanges @Nucleotides %NucleotideIndexes
339 $GapChars $SeqCount $DefaultGapPenalty %DistanceMethods
340 $CODONS %synchanges $synsites $Precision $GCChhars);
342 use Bio::Align::PairwiseStatistics;
343 use Bio::Matrix::PhylipDist;
344 use Bio::Tools::IUPAC;
347 $GapChars = '[\.\-]';
349 @Nucleotides = qw(A G T C);
353 # these values come from EMBOSS distmat implementation
354 %NucleotideIndexes = ( 'A' => 0,
366 # these are wrong now
379 $DefaultGapPenalty = 0;
380 # could put ambiguities here?
381 %DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'],
386 'Transitions' => { 'A' => [ 'G' ],
392 %DistanceMethods = ( 'jc|jukes|jukescantor|jukes\-cantor' => 'JukesCantor',
393 'jcuncor|uncorrected' => 'Uncorrected',
394 'f81|felsenstein81' => 'F81',
395 'k2|k2p|k80|kimura' => 'Kimura',
396 't92|tamura|tamura92' => 'Tamura',
397 'f84|felsenstein84' => 'F84',
398 'tajimanei|tajima\-nei' => 'TajimaNei',
399 'jinnei|jin\-nei' => 'JinNei');
402 use base
qw(Bio::Root::Root Bio::Align::StatisticsI);
404 ## generate look up hashes for Nei_Gojobori methods##
405 $CODONS = get_codons
();
406 my @t = split '', "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
407 #create look up hash of number of possible synonymous mutations per codon
408 $synsites = get_syn_sites
();
409 #create reference look up hash of single basechanges in codons
410 %synchanges = get_syn_changes
();
417 Usage : my $obj = Bio::Align::DNAStatistics->new();
418 Function: Builds a new Bio::Align::DNAStatistics object
419 Returns : Bio::Align::DNAStatistics
426 my ($class,@args) = @_;
427 my $self = $class->SUPER::new
(@args);
429 $self->pairwise_stats( Bio
::Align
::PairwiseStatistics
->new());
438 Usage : my $distance_mat = $stats->distance(-align => $aln,
440 Function: Calculates a distance matrix for all pairwise distances of
441 sequences in an alignment.
442 Returns : L<Bio::Matrix::PhylipDist> object
443 Args : -align => Bio::Align::AlignI object
444 -method => String specifying specific distance method
445 (implementing class may assume a default)
446 See also: L<Bio::Matrix::PhylipDist>
451 my ($self,@args) = @_;
452 my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
453 if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
454 $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
456 $method ||= 'JukesCantor';
457 foreach my $m ( keys %DistanceMethods ) {
458 if(defined $m && $method =~ /$m/i ) {
459 my $mtd = "D_$DistanceMethods{$m}";
460 return $self->$mtd($aln);
463 $self->warn("Unrecognized distance method $method must be one of [".
464 join(',',$self->available_distance_methods())."]");
468 =head2 available_distance_methods
470 Title : available_distance_methods
471 Usage : my @methods = $stats->available_distance_methods();
472 Function: Enumerates the possible distance methods
473 Returns : Array of strings
479 sub available_distance_methods
{
480 my ($self,@args) = @_;
481 return values %DistanceMethods;
484 =head2 D - distance methods
492 Title : D_JukesCantor
493 Usage : my $d = $stat->D_JukesCantor($aln)
494 Function: Calculates D (pairwise distance) between 2 sequences in an
495 alignment using the Jukes-Cantor 1 parameter model.
496 Returns : L<Bio::Matrix::PhylipDist>
497 Args : L<Bio::Align::AlignI> of DNA sequences
504 my ($self,$aln,$gappenalty) = @_;
505 return 0 unless $self->_check_arg($aln);
506 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
507 # ambiguities ignored at this point
508 my (@seqs,@names,@values,%dist);
510 foreach my $seq ( $aln->each_seq) {
511 push @names, $seq->display_id;
512 push @seqs, uc $seq->seq();
515 my $precisionstr = "%.$Precision"."f";
516 for(my $i = 0; $i < $seqct-1; $i++ ) {
517 # (diagonals) distance is 0 for same sequence
518 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
519 $values[$i][$i] = sprintf($precisionstr,0);
521 for( my $j = $i+1; $j < $seqct; $j++ ) {
522 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
524 # just want diagonals
525 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
526 $matrix->[2]->[2] + $matrix->[3]->[3] );
527 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
528 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
530 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
531 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
532 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
533 # (diagonals) distance is 0 for same sequence
534 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
535 $values[$j][$j] = sprintf($precisionstr,0);
538 return Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
541 -values => \
@values);
547 Usage : my $d = $stat->D_F81($aln)
548 Function: Calculates D (pairwise distance) between 2 sequences in an
549 alignment using the Felsenstein 1981 distance model.
550 Relaxes the assumption of equal base frequencies that is
552 Returns : L<Bio::Matrix::PhylipDist>
553 Args : L<Bio::Align::AlignI> of DNA sequences
559 my ($self,$aln,$gappenalty) = @_;
560 return 0 unless $self->_check_arg($aln);
561 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
562 # ambiguities ignored at this point
563 my (@seqs,@names,@values,%dist);
565 foreach my $seq ( $aln->each_seq) {
566 push @names, $seq->display_id;;
567 push @seqs, uc $seq->seq();
570 my $precisionstr = "%.$Precision"."f";
571 for(my $i = 0; $i < $seqct-1; $i++ ) {
572 # (diagonals) distance is 0 for same sequence
573 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
574 $values[$i][$i] = sprintf($precisionstr,0);
576 for( my $j = $i+1; $j < $seqct; $j++ ) {
578 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
580 # just want diagonals
581 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
582 $matrix->[2]->[2] + $matrix->[3]->[3] );
583 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
584 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
586 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
587 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
588 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
589 # (diagonals) distance is 0 for same sequence
590 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
591 $values[$j][$j] = sprintf($precisionstr,0);
594 return Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
597 -values => \
@values);
602 Title : D_Uncorrected
603 Usage : my $d = $stats->D_Uncorrected($aln)
604 Function: Calculate a distance D, no correction for multiple substitutions
605 is used. In rare cases where sequences may not overlap, 'NA' is
606 substituted for the distance.
607 Returns : L<Bio::Matrix::PhylipDist>
608 Args : L<Bio::Align::AlignI> (DNA Alignment)
609 [optional] gap penalty
614 my ($self,$aln,$gappenalty) = @_;
615 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
616 return 0 unless $self->_check_arg($aln);
617 # ambiguities ignored at this point
618 my (@seqs,@names,@values,%dist);
620 foreach my $seq ( $aln->each_seq) {
621 push @names, $seq->display_id;
622 push @seqs, uc $seq->seq();
626 my $precisionstr = "%.$Precision"."f";
627 my $len = $aln->length;
628 for( my $i = 0; $i < $seqct-1; $i++ ) {
629 # (diagonals) distance is 0 for same sequence
630 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
631 $values[$i][$i] = sprintf($precisionstr,0);
633 for( my $j = $i+1; $j < $seqct; $j++ ) {
634 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
636 my $m = ( $matrix->[0]->[0] +
640 my $denom = ( $len - $gaps + ( $gaps * $gappenalty));
642 $self->warn("No distance calculated between $names[$i] and $names[$j], inserting -1")
645 my $D = $denom ?
1 - ( $m / $denom) : -1;
647 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
648 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
649 $values[$j][$i] = $values[$i][$j] = $denom ?
sprintf($precisionstr,$D)
650 : sprintf("%-*s", $Precision + 2, $D);
651 # (diagonals) distance is 0 for same sequence
652 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
653 $values[$j][$j] = sprintf($precisionstr,0);
656 return Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
659 -values => \
@values);
663 # M Kimura, J. Mol. Evol., 1980, 16, 111.
668 Usage : my $d = $stat->D_Kimura($aln)
669 Function: Calculates D (pairwise distance) between all pairs of sequences
670 in an alignment using the Kimura 2 parameter model.
671 Returns : L<Bio::Matrix::PhylipDist>
672 Args : L<Bio::Align::AlignI> of DNA sequences
678 my ($self,$aln) = @_;
679 return 0 unless $self->_check_arg($aln);
680 # ambiguities ignored at this point
681 my (@names,@values,%dist);
683 foreach my $seq ( $aln->each_seq) {
684 push @names, $seq->display_id;
688 my $precisionstr = "%.$Precision"."f";
690 for( my $i = 0; $i < $seqct-1; $i++ ) {
691 # (diagonals) distance is 0 for same sequence
692 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
693 $values[$i][$i] = sprintf($precisionstr,0);
695 for( my $j = $i+1; $j < $seqct; $j++ ) {
696 my $pairwise = $aln->select_noncont($i+1,$j+1);
697 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
701 my $P = $self->transitions($pairwise) / $L;
702 my $Q = $self->transversions($pairwise) / $L;
704 my $denom = ( 1 - (2 * $P) - $Q);
706 $self->throw("cannot find distance for ",$i+1,
707 ",",$j+1," $P, $Q\n");
709 my $a = 1 / ( 1 - (2 * $P) - $Q);
710 my $b = 1 / ( 1 - 2 * $Q );
711 if( $a < 0 || $b < 0 ) {
714 $K = (1/2) * log ( $a ) + (1/4) * log($b);
717 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
718 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
719 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
720 # (diagonals) distance is 0 for same sequence
721 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
722 $values[$j][$j] = sprintf($precisionstr,0);
725 return Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
728 -values => \
@values);
732 =head2 D_Kimura_variance
735 Usage : my $d = $stat->D_Kimura_variance($aln)
736 Function: Calculates D (pairwise distance) between all pairs of sequences
737 in an alignment using the Kimura 2 parameter model.
738 Returns : array of 2 L<Bio::Matrix::PhylipDist>,
739 the first is the Kimura distance and the second is
740 a matrix of variance V(K)
741 Args : L<Bio::Align::AlignI> of DNA sequences
746 sub D_Kimura_variance
{
747 my ($self,$aln) = @_;
748 return 0 unless $self->_check_arg($aln);
749 # ambiguities ignored at this point
750 my (@names,@values,%dist,@var);
752 foreach my $seq ( $aln->each_seq) {
753 push @names, $seq->display_id;
757 my $precisionstr = "%.$Precision"."f";
759 for( my $i = 0; $i < $seqct-1; $i++ ) {
760 # (diagonals) distance is 0 for same sequence
761 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
762 $values[$i][$i] = sprintf($precisionstr,0);
764 for( my $j = $i+1; $j < $seqct; $j++ ) {
765 my $pairwise = $aln->select_noncont($i+1,$j+1);
766 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
770 my $P = $self->transitions($pairwise) / $L;
771 my $Q = $self->transversions($pairwise) / $L;
772 my ($a,$b,$K,$var_k);
773 my $a_denom = ( 1 - (2 * $P) - $Q);
774 my $b_denom = 1 - 2 * $Q;
775 unless( $a_denom > 0 && $b_denom > 0 ) {
783 $K = (1/2) * log ( $a ) + (1/4) * log($b);
784 # from Wu and Li 1985 which in turn is from Kimura 1980
785 my $c = ( $a - $b ) / 2;
786 my $d = ( $a + $b ) / 2;
787 $var_k = ( $a**2 * $P + $d**2 * $Q - ( $a * $P + $d * $Q)**2 ) / $L;
791 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
792 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
793 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
794 # (diagonals) distance is 0 for same sequence
795 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
796 $values[$j]->[$j] = sprintf($precisionstr,0);
798 $var[$j]->[$i] = $var[$i]->[$j] = sprintf($precisionstr,$var_k);
799 $var[$j]->[$j] = $values[$j]->[$j];
802 return ( Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
805 -values => \
@values),
806 Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
814 # K Tamura, Mol. Biol. Evol. 1992, 9, 678.
819 Usage : Calculates D (pairwise distance) between 2 sequences in an
820 alignment using Tamura 1992 distance model.
821 Returns : L<Bio::Matrix::PhylipDist>
822 Args : L<Bio::Align::AlignI> of DNA sequences
828 my ($self,$aln) = @_;
829 return 0 unless $self->_check_arg($aln);
830 # ambiguities ignored at this point
831 my (@seqs,@names,@values,%dist,$i,$j);
833 my $length = $aln->length;
834 foreach my $seq ( $aln->each_seq) {
835 push @names, $seq->display_id;;
836 push @seqs, uc $seq->seq();
840 my $precisionstr = "%.$Precision"."f";
841 my (@gap,@gc,@trans,@tranv,@score);
843 for my $t1 ( @seqs ) {
845 for my $t2 ( @seqs ) {
847 for( my $k = 0; $k < $length; $k++ ) {
848 my ($c1,$c2) = ( substr($seqs[$i],$k,1),
849 substr($seqs[$j],$k,1) );
850 if( $c1 =~ /^$GapChars$/ ||
851 $c2 =~ /^$GapChars$/ ) {
853 } elsif( $c2 =~ /^$GCChhars$/i ) {
857 $gc[$i][$j] = ( $gc[$i][$j] /
858 ($length - $gap[$i][$j]) );
864 for( $i = 0; $i < $seqct-1; $i++ ) {
865 # (diagonals) distance is 0 for same sequence
866 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
867 $values[$i][$i] = sprintf($precisionstr,0);
869 for( $j = $i+1; $j < $seqct; $j++ ) {
871 my $pairwise = $aln->select_noncont($i+1,$j+1);
872 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
873 my $P = $self->transitions($pairwise) / $L;
874 my $Q = $self->transversions($pairwise) / $L;
875 my $C = $gc[$i][$j] + $gc[$j][$i]-
876 ( 2 * $gc[$i][$j] * $gc[$j][$i] );
880 my $d = -($C * log(1- $P - $Q)) -(0.5* ( 1 - $C) * log(1 - 2 * $Q));
882 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
883 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
884 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
885 # (diagonals) distance is 0 for same sequence
886 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
887 $values[$j][$j] = sprintf($precisionstr,0);
890 return Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
893 -values => \
@values);
900 Usage : my $d = $stat->D_F84($aln)
901 Function: Calculates D (pairwise distance) between 2 sequences in an
902 alignment using the Felsenstein 1984 distance model.
903 Returns : L<Bio::Matrix::PhylipDist>
904 Args : L<Bio::Align::AlignI> of DNA sequences
905 [optional] double - gap penalty
910 my ($self,$aln,$gappenalty) = @_;
911 return 0 unless $self->_check_arg($aln);
912 $self->throw_not_implemented();
913 # ambiguities ignored at this point
914 my (@seqs,@names,@values,%dist);
916 foreach my $seq ( $aln->each_seq) {
917 # if there is no name,
918 my $id = $seq->display_id;
919 if( ! length($id) || # deal with empty names
924 push @seqs, uc $seq->seq();
928 my $precisionstr = "%.$Precision"."f";
930 for( my $i = 0; $i < $seqct-1; $i++ ) {
931 # (diagonals) distance is 0 for same sequence
932 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
933 $values[$i][$i] = sprintf($precisionstr,0);
935 for( my $j = $i+1; $j < $seqct; $j++ ) {
940 # Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
941 # Tajima-Nei correction used for multiple substitutions in the calc
942 # of the distance matrix. Nucleic acids only.
944 # D = p-distance = 1 - (matches/(posns_scored + gaps)
946 # distance = -b * ln(1-D/b)
952 Usage : my $d = $stat->D_TajimaNei($aln)
953 Function: Calculates D (pairwise distance) between 2 sequences in an
954 alignment using the TajimaNei 1984 distance model.
955 Returns : L<Bio::Matrix::PhylipDist>
956 Args : Bio::Align::AlignI of DNA sequences
962 my ($self,$aln) = @_;
963 return 0 unless $self->_check_arg($aln);
964 # ambiguities ignored at this point
965 my (@seqs,@names,@values,%dist);
967 foreach my $seq ( $aln->each_seq) {
968 # if there is no name,
969 push @names, $seq->display_id;
970 push @seqs, uc $seq->seq();
973 my $precisionstr = "%.$Precision"."f";
976 for( $i =0; $i < $seqct -1; $i++ ) {
977 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
978 $values[$i][$i] = sprintf($precisionstr,0);
980 for ( $j = $i+1; $j <$seqct;$j++ ) {
981 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
983 my $pairwise = $aln->select_noncont($i+1,$j+1);
984 my $slen = $self->pairwise_stats->number_of_comparable_bases($pairwise);
986 for( $bs = 0; $bs < 4; $bs++ ) {
988 map {$fi += $matrix->[$bs]->[$_] } 0..3;
991 map { $fj += $matrix->[$_]->[$bs] } 0..3;
992 my $fij = ( $fi && $fj ) ?
($fi + $fj) /( 2 * $slen) : 0;
996 my ($pair,$h) = (0,0);
997 for( $bs = 0; $bs < 3; $bs++ ) {
998 for(my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) {
999 my $fij = $pfreq->[$pair++] / $slen;
1002 my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0);
1004 map { $ci1 += $matrix->[$_]->[$bs] } 0..3;
1005 map { $cj1 += $matrix->[$bs]->[$_] } 0..3;
1006 map { $ci2 += $matrix->[$_]->[$bs1] } 0..3;
1007 map { $cj2 += $matrix->[$bs1]->[$_] } 0..3;
1010 $h += ( ($fij**2) / 2 ) /
1011 ( ( ( $ci1 + $cj1 ) / (2 * $slen) ) *
1012 ( ( $ci2 + $cj2 ) / (2 * $slen) )
1015 $self->debug( "slen is $slen h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n");
1019 # just want diagonals which are matches (A matched A, C -> C)
1021 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
1022 $matrix->[2]->[2] + $matrix->[3]->[3] );
1023 my $D = 1 - ( $m / $slen);
1028 my $b = (1 - $fij2 + (($D**2)/$h)) / 2;
1034 $d = (-1 * $b) * log ( $c);
1037 # fwd and rev lookup
1038 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
1039 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
1040 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
1042 # (diagonals) distance is 0 for same sequence
1043 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
1044 $values[$j][$j] = sprintf($precisionstr,0);
1047 return Bio
::Matrix
::PhylipDist
->new(-program
=> 'bioperl_DNAstats',
1050 -values => \
@values);
1054 # Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
1059 Usage : my $d = $stat->D_JinNei($aln)
1060 Function: Calculates D (pairwise distance) between 2 sequences in an
1061 alignment using the Jin-Nei 1990 distance model.
1062 Returns : L<Bio::Matrix::PhylipDist>
1063 Args : L<Bio::Align::AlignI> of DNA sequences
1069 my ($self,@args) = @_;
1070 $self->warn("JinNei implementation not completed");
1074 =head2 transversions
1076 Title : transversions
1077 Usage : my $transversions = $stats->transversion($aln);
1078 Function: Calculates the number of transversions between two sequences in
1081 Args : Bio::Align::AlignI
1087 my ($self,$aln) = @_;
1088 return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'});
1094 Usage : my $transitions = Bio::Align::DNAStatistics->transitions($aln);
1095 Function: Calculates the number of transitions in a given DNA alignment
1096 Returns : integer representing the number of transitions
1097 Args : Bio::Align::AlignI object
1103 my ($self,$aln) = @_;
1104 return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'});
1108 sub _trans_count_helper
{
1109 my ($self,$aln,$type) = @_;
1110 return 0 unless( $self->_check_arg($aln) );
1111 if( ! $aln->is_flush ) { $self->throw("must be flush") }
1113 my ($first,$second) = ( uc $aln->get_seq_by_pos(1)->seq(),
1114 uc $aln->get_seq_by_pos(2)->seq() );
1115 my $alen = $aln->length;
1116 for (my $i = 0;$i<$alen; $i++ ) {
1117 my ($c1,$c2) = ( substr($first,$i,1),
1118 substr($second,$i,1) );
1120 foreach my $nt ( @
{$type->{$c1}} ) {
1128 map { if( $_) { $sum += $_} } @tcount;
1132 # this will generate a matrix which records across the row, the number
1135 sub _build_nt_matrix
{
1136 my ($self,$seqa,$seqb) = @_;
1139 my $basect_matrix = [ [ qw(0 0 0 0) ], # number of bases that match
1143 my $gaps = 0; # number of gaps
1144 my $pfreq = [ qw( 0 0 0 0 0 0)]; # matrix for pair frequency
1145 my $len_a = length($seqa);
1146 for( my $i = 0; $i < $len_a; $i++) {
1147 my ($ti,$tj) = (substr($seqa,$i,1),substr($seqb,$i,1));
1151 if( $ti =~ /^$GapChars$/) { $gaps++; next; }
1152 if( $tj =~ /^$GapChars$/) { $gaps++; next }
1154 my $ti_index = $NucleotideIndexes{$ti};
1155 my $tj_index = $NucleotideIndexes{$tj};
1157 if( ! defined $ti_index ) {
1158 $self->warn("ti_index not defined for $ti\n");
1162 $basect_matrix->[$ti_index]->[$tj_index]++;
1165 $pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++;
1168 return ($basect_matrix,$pfreq,$gaps);
1171 sub _check_ambiguity_nucleotide
{
1172 my ($base1,$base2) = @_;
1173 my %iub = Bio
::Tools
::IUPAC
->iupac_iub();
1174 my @amb1 = @
{ $iub{uc($base1)} };
1175 my @amb2 = @
{ $iub{uc($base2)} };
1177 for my $amb ( @amb1 ) {
1178 if( grep { $amb eq $_ } @amb2 ) {
1184 return (1 / scalar @amb1) * (1 / scalar @amb2);
1192 my($self,$aln ) = @_;
1193 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
1194 $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
1196 } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) {
1197 $self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
1207 =head2 pairwise_stats
1209 Title : pairwise_stats
1210 Usage : $obj->pairwise_stats($newval)
1212 Returns : value of pairwise_stats
1213 Args : newvalue (optional)
1219 my ($self,$value) = @_;
1220 if( defined $value) {
1221 $self->{'_pairwise_stats'} = $value;
1223 return $self->{'_pairwise_stats'};
1227 =head2 calc_KaKs_pair
1229 Title : calc_KaKs_pair
1230 Useage : my $results = $stats->calc_KaKs_pair($alnobj,
1232 Function : calculates Nei-Gojobori statistics for pairwise
1234 Args : A Bio::Align::AlignI compliant object such as a
1235 Bio::SimpleAlign object, and 2 sequence name strings.
1236 Returns : a reference to a hash of statistics with keys as
1237 listed in Description.
1241 sub calc_KaKs_pair
{
1242 my ( $self, $aln, $seq1_id, $seq2_id) = @_;
1243 $self->throw("Needs 3 arguments - an alignment object, and 2 sequence ids")
1245 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1247 #{id => $seq1_id, seq =>($aln->each_seq_with_id($seq1_id))[0]->seq},
1248 #{id => $seq2_id, seq =>($aln->each_seq_with_id($seq2_id))[0]->seq}
1249 {id
=> $seq1_id, seq
=> uc(($aln->each_seq_with_id($seq1_id))[0]->seq)},
1250 {id
=> $seq2_id, seq
=> uc(($aln->each_seq_with_id($seq2_id))[0]->seq)}
1252 if (length($seqs[0]{'seq'}) != length($seqs[1]{'seq'})) {
1253 $self->throw(" aligned sequences must be of equal length!");
1256 $self->_get_av_ds_dn(\
@seqs, $results);
1261 =head2 calc_all_KaKs_pairs
1263 Title : calc_all_KaKs_pairs
1264 Useage : my $results2 = $stats->calc_KaKs_pair($alnobj).
1265 Function : Calculates Nei_gojobori statistics for all pairwise
1266 combinations in sequence.
1267 Arguments: A Bio::Align::ALignI compliant object such as
1268 a Bio::SimpleAlign object.
1269 Returns : A reference to an array of hashes of statistics of
1270 all pairwise comparisons in the alignment.
1276 sub calc_all_KaKs_pairs
{
1277 #returns a multi_element_array with all pairwise comparisons
1278 my ($self,$aln) = @_;
1279 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1281 for my $seq ($aln->each_seq) {
1282 push @seqs, {id
=> $seq->display_id, seq
=>$seq->seq};
1285 $results = $self->_get_av_ds_dn(\
@seqs, $results);
1289 =head2 calc_average_KaKs
1291 Title : calc_average_KaKs.
1292 Useage : my $res= $stats->calc_average_KaKs($alnobj, 1000).
1293 Function : calculates Nei_Gojobori stats for average of all
1294 sequences in the alignment.
1295 Args : A Bio::Align::AlignI compliant object such as a
1296 Bio::SimpleAlign object, number of bootstrap iterations
1298 Returns : A reference to a hash of statistics as listed in Description.
1302 sub calc_average_KaKs
{
1303 #calculates global value for sequences in alignment using bootstrapping
1304 #this is quite slow (~10 seconds per 3 X 200nt seqs);
1305 my ($self, $aln, $bootstrap_rpt) = @_;
1306 $bootstrap_rpt ||= 1000;
1307 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1309 for my $seq ($aln->each_seq) {
1310 push @seqs, {id
=> $seq->display_id, seq
=>$seq->seq};
1313 my ($ds_orig, $dn_orig) = $self->_get_av_ds_dn(\
@seqs);
1314 #print "ds = $ds_orig, dn = $dn_orig\n";
1315 $results = {D_s
=> $ds_orig, D_n
=> $dn_orig};
1316 $self->_run_bootstrap(\
@seqs, $results, $bootstrap_rpt);
1320 ############## primary internal subs for alignment comparisons ########################
1322 sub _run_bootstrap
{
1323 ### generates sampled sequences, calculates Ds and Dn values,
1324 ### then calculates variance of sampled sequences and add results to results hash
1326 my ($self,$seq_ref, $results, $bootstrap_rpt) = @_;
1327 my @seqs = @
$seq_ref;
1328 my @btstrp_aoa; # to hold array of array of nucleotides for resampling
1329 my %bootstrap_values = (ds
=> [], dn
=>[]); # to hold list of av values
1331 #1st make alternative array of codons;
1333 while ($c < length $seqs[0]{'seq'}) {
1335 push @
{$btstrp_aoa[$_]}, substr ($seqs[$_]{'seq'}, $c, 3);
1340 for (1..$bootstrap_rpt) {
1341 my $sampled = _resample
(\
@btstrp_aoa);
1342 my ($ds, $dn) = $self->_get_av_ds_dn ($sampled) ; # is array ref
1343 push @
{$bootstrap_values{'ds'}}, $ds;
1344 push @
{$bootstrap_values{'dn'}}, $dn;
1347 $results->{'D_s_var'} = sampling_variance
($bootstrap_values{'ds'});
1348 $results->{'D_n_var'} = sampling_variance
($bootstrap_values{'dn'});
1349 $results->{'z_score'} = ($results->{'D_n'} - $results->{'D_s'}) /
1350 sqrt($results->{'D_s_var'} + $results->{'D_n_var'} );
1351 #print "bootstrapped var_syn = $results->{'D_s_var'} \n" ;
1352 #print "bootstrapped var_nc = $results->{'D_n_var'} \n";
1353 #print "z is $results->{'z_score'}\n"; ### end of global set up of/perm look up data
1358 my $codon_num = scalar (@
{$ref->[0]});
1360 for (0..$codon_num -1) { #for each codon
1361 my $rand = int (rand ($codon_num));
1363 push @
{$altered[$_]}, $ref->[$_][$rand];
1366 my @stringed = map {join '', @
$_}@altered;
1368 #now out in random name to keep other subs happy
1370 push @return, {id
=>'1', seq
=> $_};
1376 # takes array of hashes of sequence strings and ids #
1378 my $seq_ref = shift;
1379 my $result = shift if @_;
1380 my @caller = caller(1);
1381 my @seqarray = @
$seq_ref;
1382 my $bootstrap_score_list;
1383 #for a multiple alignment considers all pairwise combinations#
1384 my %dsfor_average = (ds
=> [], dn
=> []);
1385 for (my $i = 0; $i < scalar @seqarray; $i++) {
1386 for (my $j = $i +1; $j<scalar @seqarray; $j++ ){
1387 # print "comparing $i and $j\n";
1388 if (length($seqarray[$i]{'seq'}) != length($seqarray[$j]{'seq'})) {
1389 $self->warn(" aligned sequences must be of equal length!");
1393 my $syn_site_count = count_syn_sites
($seqarray[$i]{'seq'}, $synsites);
1394 my $syn_site_count2 = count_syn_sites
($seqarray[$j]{'seq'}, $synsites);
1395 # print "syn 1 is $syn_site_count , syn2 is $syn_site_count2\n";
1396 my ($syn_count, $non_syn_count, $gap_cnt) = analyse_mutations
($seqarray[$i]{'seq'}, $seqarray[$j]{'seq'});
1398 my $av_s_site = ($syn_site_count + $syn_site_count2)/2;
1399 my $av_ns_syn_site = length($seqarray[$i]{'seq'}) - $gap_cnt- $av_s_site ;
1401 #calculate ps and pn (p54)
1402 my $syn_prop = $syn_count / $av_s_site;
1403 my $nc_prop = $non_syn_count / $av_ns_syn_site ;
1405 #now use jukes/cantor to calculate D_s and D_n, would alter here if needed a different method
1406 my $d_syn = $self->jk($syn_prop);
1407 my $d_nc = $self->jk($nc_prop);
1409 #JK calculation must succeed for continuation of calculation
1410 #ret_value = -1 if error
1411 next unless $d_nc >=0 && $d_syn >=0;
1414 push @
{$dsfor_average{'ds'}}, $d_syn;
1415 push @
{$dsfor_average{'dn'}}, $d_nc;
1417 #if not doing bootstrap, calculate the pairwise comparisin stats
1418 if ($caller[3] =~ /calc_KaKs_pair/ || $caller[3] =~ /calc_all_KaKs_pairs/) {
1419 #now calculate variances assuming large sample
1420 my $d_syn_var = jk_var
($syn_prop, length($seqarray[$i]{'seq'}) - $gap_cnt );
1421 my $d_nc_var = jk_var
($nc_prop, length ($seqarray[$i]{'seq'}) - $gap_cnt);
1422 #now calculate z_value
1423 #print "d_syn_var is $d_syn_var,and d_nc_var is $d_nc_var\n";
1424 #my $z = ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var);
1425 my $z = ($d_syn_var + $d_nc_var) ?
1426 ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var) : 0;
1427 # print "z is $z\n";
1428 push @
$result , {S
=> $av_s_site, N
=>$av_ns_syn_site,
1429 S_d
=> $syn_count, N_d
=>$non_syn_count,
1430 P_s
=> $syn_prop, P_n
=>$nc_prop,
1431 D_s
=> @
{$dsfor_average{'ds'}}[-1],
1432 D_n
=> @
{$dsfor_average{'dn'}}[-1],
1433 D_n_var
=>$d_nc_var, D_s_var
=> $d_syn_var,
1434 Seq1
=> $seqarray[$i]{'id'},
1435 Seq2
=> $seqarray[$j]{'id'},
1438 $self->warn (" number of mutations too small to justify normal test for $seqarray[$i]{'id'} and $seqarray[$j]{'id'}\n- use Fisher's exact, or bootstrap a MSA")
1439 if ($syn_count < 10 || $non_syn_count < 10 ) && $self->verbose > -1 ;
1444 #warn of failure if no results hashes are present
1445 #will fail if Jukes Cantor has failed for all pairwise combinations
1446 #$self->warn("calculation failed!") if scalar @$result ==0;
1448 #return results unless bootstrapping
1449 return $result if $caller[3]=~ /calc_all_KaKs/ || $caller[3] =~ /calc_KaKs_pair/;
1450 #else if getting average for bootstrap
1451 return( mean
($dsfor_average{'ds'}),mean
($dsfor_average{'dn'})) ;
1456 my ($self, $p) = @_;
1458 $self->warn( " Jukes Cantor won't work -too divergent!");
1461 return -1 * (3/4) * (log(1 - (4/3) * $p));
1464 #works for large value of n (50?100?)
1467 return (9 * $p * (1 -$p))/(((3 - 4 *$p) **2) * $n);
1471 # compares 2 sequences to find the number of synonymous/non
1472 # synonymous mutations between them
1474 sub analyse_mutations
{
1475 my ($seq1, $seq2) = @_;
1476 my %mutator = ( 2=> {0=>[[1,2], # codon positions to be altered
1477 [2,1]], # depend on which is the same
1483 3=> [ [0,1,2], # all need to be altered
1490 my $TOTAL = 0; # total synonymous changes
1491 my $TOTAL_n = 0; # total non-synonymous changes
1495 my $seqlen = length($seq1);
1496 for (my $j=0; $j< $seqlen; $j+=3) {
1497 $input{'cod1'} = substr($seq1, $j,3);
1498 $input{'cod2'} = substr($seq2, $j,3);
1500 #ignore codon if beeing compared with gaps!
1501 if ($input{'cod1'} =~ /\-/ || $input{'cod2'} =~ /\-/){
1502 $gap_cnt += 3; #just increments once if there is a pair of gaps
1506 my ($diff_cnt, $same) = count_diffs
(\
%input);
1508 #ignore if codons are identical
1509 next if $diff_cnt == 0 ;
1510 if ($diff_cnt == 1) {
1511 $TOTAL += $synchanges{$input{'cod1'}}{$input{'cod2'}};
1512 $TOTAL_n += 1 - $synchanges{$input{'cod1'}}{$input{'cod2'}};
1513 #print " \nfordiff is 1 , total now $TOTAL, total n now $TOTAL_n\n\n"
1515 elsif ($diff_cnt ==2) {
1519 #will stay 4 unless there are stop codons at intervening point
1520 OUTER
:for my $perm (@
{$mutator{'2'}{$same}}) {
1521 my $altered = $input{'cod1'};
1523 # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1524 for my $mut_i (@
$perm) { #index of codon mutated
1525 substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1526 if ($t[$CODONS->{$altered}] eq '*') {
1528 #print "changes to stop codon!!\n";
1532 $s_cnt += $synchanges{$prev}{$altered};
1533 # print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1539 if ($tot_muts != 0) {
1540 $TOTAL += ($s_cnt/($tot_muts/2));
1541 $TOTAL_n += ($tot_muts - $s_cnt)/ ($tot_muts / 2);
1545 elsif ($diff_cnt ==3 ) {
1548 my $tot_muts = 18; #potential number of mutations
1549 OUTER
: for my $perm (@
{$mutator{'3'}}) {
1550 my $altered = $input{'cod1'};
1552 # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1553 for my $mut_i (@
$perm) { #index of codon mutated
1554 substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1555 if ($t[$CODONS->{$altered}] eq '*') {
1557 # print "changes to stop codon!!\n";
1562 $s_cnt += $synchanges{$prev}{$altered};
1563 # print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1570 #calculate number of synonymous/non synonymous mutations for that codon
1572 if ($tot_muts != 0) {
1573 $TOTAL += ($s_cnt / ($tot_muts /3));
1574 $TOTAL_n += 3 - ($s_cnt / ($tot_muts /3));
1576 } #endif $diffcnt = 3
1577 } #end of sequencetraversal
1578 return ($TOTAL, $TOTAL_n, $gap_cnt);
1583 #counts the number of nucleotide differences between 2 codons
1584 # returns this value plus the codon index of which nucleotide is the same when 2
1585 #nucleotides are different. This is so analyse_mutations() knows which nucleotides
1590 #just for 2 differences
1592 if (substr($ref->{'cod1'}, $_,1) ne substr($ref->{'cod2'}, $_, 1)){
1598 return ($cnt, $same);
1601 =head2 get_syn_changes
1603 Title : get_syn_changes
1604 Usage : Bio::Align::DNAStatitics->get_syn_changes
1605 Function: Generate a hashref of all pairwise combinations of codns
1607 Returns : Symetic matrix using hashes
1609 and each codon points to a hashref of codons
1610 the values of which describe type of change.
1611 my $type = $hash{$codon1}->{$codon2};
1615 -1 either codon is a stop codon
1620 sub get_syn_changes
{
1621 #hash of all pairwise combinations of codons differing by 1
1622 # 1 = syn, 0 = non-syn, -1 = stop
1624 my @codons = _make_codons
();
1625 my $arr_len = scalar @codons;
1626 for (my $i = 0; $i < $arr_len -1; $i++) {
1627 my $cod1 = $codons[$i];
1628 for (my $j = $i +1; $j < $arr_len; $j++) {
1631 $diff_cnt++ if substr($cod1, $pos, 1) ne substr($codons[$j], $pos, 1);
1633 next if $diff_cnt !=1;
1636 if($t[$CODONS->{$cod1}] eq $t[$CODONS->{$codons[$j]}]) {
1637 $results{$cod1}{$codons[$j]} =1;
1638 $results{$codons[$j]}{$cod1} = 1;
1641 elsif ($t[$CODONS->{$cod1}] eq '*' or $t[$CODONS->{$codons[$j]}] eq '*') {
1642 $results{$cod1}{$codons[$j]} = -1;
1643 $results{$codons[$j]}{$cod1} = -1;
1647 $results{$cod1}{$codons[$j]} = 0;
1648 $results{$codons[$j]}{$cod1} = 0;
1655 =head2 dnds_pattern_number
1657 Title : dnds_pattern_number
1658 Usage : my $patterns = $stats->dnds_pattern_number($alnobj);
1659 Function: Counts the number of codons with no gaps in the MSA
1660 Returns : Number of codons with no gaps ('patterns' in PAML notation)
1661 Args : A Bio::Align::AlignI compliant object such as a
1662 Bio::SimpleAlign object.
1666 sub dnds_pattern_number
{
1667 my ($self, $aln) = @_;
1668 return ($aln->remove_gaps->length)/3;
1671 sub count_syn_sites
{
1672 #counts the number of possible synonymous changes for sequence
1673 my ($seq, $synsite) = @_;
1674 __PACKAGE__
->throw("not integral number of codons") if length($seq) % 3 != 0;
1676 for (my $i = 0; $i< length($seq); $i+=3) {
1677 my $cod = substr($seq, $i, 3);
1678 next if $cod =~ /\-/; #deal with alignment gaps
1679 $S += $synsite->{$cod}{'s'};
1688 #sub to generate lookup hash for the number of synonymous changes per codon
1689 my @nucs = qw(T C A G);
1694 # for each possible codon
1696 my $aa = $t[$CODONS->{$cod}];
1697 #calculate number of synonymous mutations vs non syn mutations
1698 for my $i (qw(0 1 2)){
1701 for my $nuc (qw(A T C G)) {
1702 next if substr ($cod, $i,1) eq $nuc;
1704 substr($test, $i, 1) = $nuc ;
1705 if ($t[$CODONS->{$test}] eq $aa) {
1708 if ($t[$CODONS->{$test}] eq '*') {
1712 $raw_results{$cod}[$i] = {'s' => $s ,
1716 } #end analysis of single codon
1718 } #end analysis of all codons
1721 for my $cod (sort keys %raw_results) {
1723 map{$t += ($_->{'s'} /$_->{'n'})} @
{$raw_results{$cod}};
1724 $final_results{$cod} = { 's'=>$t, 'n' => 3 -$t};
1726 return \
%final_results;
1730 #makes all codon combinations, returns array of them
1731 my @nucs = qw(T C A G);
1736 push @codons, "$i$j$k";
1744 #generates codon translation look up table#
1747 for my $codon (_make_codons
) {
1748 $CODONS->{$codon} = $x;
1754 #########stats subs, can go in another module? Here for speed. ###
1757 my $el_num = scalar @
$ref;
1759 map{$tot += $_}@
$ref;
1760 return ($tot/$el_num);
1765 my $mean = mean
($ref);
1766 my $sum_of_squares = 0;
1767 map{$sum_of_squares += ($_ - $mean) **2}@
$ref;
1768 return $sum_of_squares;
1771 sub sampling_variance
{
1773 return variance
($ref) / (scalar @
$ref -1);