Move HMMER related modules, tests, and programs to new distribution.
[bioperl-live.git] / Bio / Align / DNAStatistics.pm
blob3d48594eadb9ae31216c5c79cfeeaf9263f058ac
2 # BioPerl module for Bio::Align::DNAStatistics
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-AT-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::Align::DNAStatistics - Calculate some statistics for a DNA alignment
18 =head1 SYNOPSIS
20 use Bio::AlignIO;
21 use Bio::Align::DNAStatistics;
23 my $stats = Bio::Align::DNAStatistics->new();
24 my $alignin = Bio::AlignIO->new(-format => 'emboss',
25 -file => 't/data/insulin.water');
26 my $aln = $alignin->next_aln;
27 my $jcmatrix = $stats->distance(-align => $aln,
28 -method => 'Jukes-Cantor');
30 print $jcmatrix->print_matrix;
31 ## and for measurements of synonymous /nonsynonymous substitutions ##
33 my $in = Bio::AlignIO->new(-format => 'fasta',
34 -file => 't/data/nei_gojobori_test.aln');
35 my $alnobj = $in->next_aln;
36 my ($seq1id,$seq2id) = map { $_->display_id } $alnobj->each_seq;
37 my $results = $stats->calc_KaKs_pair($alnobj, $seq1id, $seq2id);
38 print "comparing ".$results->[0]{'Seq1'}." and ".$results->[0]{'Seq2'}."\n";
39 for (sort keys %{$results->[0]} ){
40 next if /Seq/;
41 printf("%-9s %.4f \n",$_ , $results->[0]{$_});
44 my $results2 = $stats->calc_all_KaKs_pairs($alnobj);
45 for my $an (@$results2){
46 print "comparing ". $an->{'Seq1'}." and ". $an->{'Seq2'}. " \n";
47 for (sort keys %$an ){
48 next if /Seq/;
49 printf("%-9s %.4f \n",$_ , $an->{$_});
51 print "\n\n";
54 my $result3 = $stats->calc_average_KaKs($alnobj, 1000);
55 for (sort keys %$result3 ){
56 next if /Seq/;
57 printf("%-9s %.4f \n",$_ , $result3->{$_});
60 =head1 DESCRIPTION
62 This object contains routines for calculating various statistics and
63 distances for DNA alignments. The routines are not well tested and do
64 contain errors at this point. Work is underway to correct them, but
65 do not expect this code to give you the right answer currently! Use
66 dnadist/distmat in the PHLYIP or EMBOSS packages to calculate the
67 distances.
70 Several different distance method calculations are supported. Listed
71 in brackets are the pattern which will match
73 =over 3
75 =item *
77 JukesCantor [jc|jukes|jukescantor|jukes-cantor]
79 =item *
81 Uncorrected [jcuncor|uncorrected]
83 =item *
85 F81 [f81|felsenstein]
87 =item *
89 Kimura [k2|k2p|k80|kimura]
91 =item *
93 Tamura [t92|tamura|tamura92]
95 =item *
97 F84 [f84|felsenstein84]
99 =item *
101 TajimaNei [tajimanei|tajima\-nei]
103 =item *
105 JinNei [jinnei|jin\-nei] (not implemented)
107 =back
109 There are also three methods to calculate the ratio of synonymous to
110 non-synonymous mutations. All are implementations of the Nei-Gojobori
111 evolutionary pathway method and use the Jukes-Cantor method of
112 nucleotide substitution. This method works well so long as the
113 nucleotide frequencies are roughly equal and there is no significant
114 transition/transversion bias. In order to use these methods there are
115 several pre-requisites for the alignment.
117 =over 3
119 =item 1
121 DNA alignment must be based on protein alignment. Use the subroutine
122 L<Bio::Align::Utilities/aa_to_dna_aln> to achieve this.
124 =item 2
126 Therefore alignment gaps must be in multiples of 3 (representing an aa
127 deletion/insertion) and at present must be indicated by a '-' symbol.
129 =item 3
131 Alignment must be solely of coding region and be in reading frame 0 to
132 achieve meaningful results
134 =item 4
136 Alignment must therefore be a multiple of 3 nucleotides long.
138 =item 5
140 All sequences must be the same length (including gaps). This should be
141 the case anyway if the sequences have been automatically aligned using
142 a program like Clustal.
144 =item 6
146 Only the standard codon alphabet is supported at present.
148 =back
150 calc_KaKs_pair() calculates a number of statistics for a named pair of
151 sequences in the alignment.
153 calc_all_KaKs_pairs() calculates these statistics for all pairwise
154 comparisons in an MSA. The statistics returned are:
156 =over 3
158 =item *
160 S_d - Number of synonymous mutations between the 2 sequences.
162 =item *
164 N_d - Number of non-synonymous mutations between the 2 sequences.
166 =item *
168 S - Mean number of synonymous sites in both sequences.
170 =item *
172 N - mean number of synonymous sites in both sequences.
174 =item *
176 P_s - proportion of synonymous differences in both sequences given by
177 P_s = S_d/S.
179 =item *
181 P_n - proportion of non-synonymous differences in both sequences given
182 by P_n = S_n/S.
184 =item *
186 D_s - estimation of synonymous mutations per synonymous site (by
187 Jukes-Cantor).
189 =item *
191 D_n - estimation of non-synonymous mutations per non-synonymous site (by
192 Jukes-Cantor).
194 =item *
196 D_n_var - estimation of variance of D_n .
198 =item *
200 D_s_var - estimation of variance of S_n.
202 =item *
204 z_value - calculation of z value.Positive value indicates D_n E<gt> D_s,
205 negative value indicates D_s E<gt> D_n.
207 =back
209 The statistics returned by calc_average_KaKs are:
211 =over 3
213 =item *
215 D_s - Average number of synonymous mutations/synonymous site.
217 =item *
219 D_n - Average number of non-synonymous mutations/non-synonymous site.
221 =item *
223 D_s_var - Estimated variance of Ds from bootstrapped alignments.
225 =item *
227 D_n_var - Estimated variance of Dn from bootstrapped alignments.
229 =item *
231 z_score - calculation of z value. Positive value indicates D_n E<gt>D_s,
232 negative values vice versa.
234 =back
236 The design of the code is based around the explanation of the
237 Nei-Gojobori algorithm in the excellent book "Molecular Evolution and
238 Phylogenetics" by Nei and Kumar, published by Oxford University
239 Press. The methods have been tested using the worked example 4.1 in
240 the book, and reproduce those results. If people like having this sort
241 of analysis in BioPerl other methods for estimating Ds and Dn can be
242 provided later.
244 Much of the DNA distance code is based on implementations in EMBOSS
245 (Rice et al, www.emboss.org) [distmat.c] and PHYLIP (J. Felsenstein et
246 al) [dnadist.c]. Insight also gained from Eddy, Durbin, Krogh, &
247 Mitchison.
249 =head1 REFERENCES
251 =over 3
253 =item *
255 D_JukesCantor
257 "Phylogenetic Inference", Swoffrod, Olsen, Waddell and Hillis, in
258 Mol. Systematics, 2nd ed, 1996, Ch 11. Derived from "Evolution of
259 Protein Molecules", Jukes & Cantor, in Mammalian Prot. Metab., III,
260 1969, pp. 21-132.
262 =item *
264 D_Tamura
266 K Tamura, Mol. Biol. Evol. 1992, 9, 678.
268 =item *
270 D_Kimura
272 M Kimura, J. Mol. Evol., 1980, 16, 111.
274 =item *
276 JinNei
278 Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
280 =item *
282 D_TajimaNei
284 Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
286 =back
288 =head1 FEEDBACK
290 =head2 Mailing Lists
292 User feedback is an integral part of the evolution of this and other
293 Bioperl modules. Send your comments and suggestions preferably to
294 the Bioperl mailing list. Your participation is much appreciated.
296 bioperl-l@bioperl.org - General discussion
297 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
299 =head2 Support
301 Please direct usage questions or support issues to the mailing list:
303 I<bioperl-l@bioperl.org>
305 rather than to the module maintainer directly. Many experienced and
306 reponsive experts will be able look at the problem and quickly
307 address it. Please include a thorough description of the problem
308 with code and data examples if at all possible.
310 =head2 Reporting Bugs
312 Report bugs to the Bioperl bug tracking system to help us keep track
313 of the bugs and their resolution. Bug reports can be submitted via the
314 web:
316 https://github.com/bioperl/bioperl-live/issues
318 =head1 AUTHOR - Jason Stajich
320 Email jason-AT-bioperl.org
322 =head1 CONTRIBUTORS
324 Richard Adams, richard.adams@ed.ac.uk
326 =head1 APPENDIX
328 The rest of the documentation details each of the object methods.
329 Internal methods are usually preceded with a _
331 =cut
334 # Let the code begin...
337 package Bio::Align::DNAStatistics;
338 use vars qw(%DNAChanges @Nucleotides %NucleotideIndexes
339 $GapChars $SeqCount $DefaultGapPenalty %DistanceMethods
340 $CODONS %synchanges $synsites $Precision $GCChhars);
341 use strict;
342 use Bio::Align::PairwiseStatistics;
343 use Bio::Matrix::PhylipDist;
344 use Bio::Tools::IUPAC;
346 BEGIN {
347 $GapChars = '[\.\-]';
348 $GCChhars = '[GCS]';
349 @Nucleotides = qw(A G T C);
350 $SeqCount = 2;
351 $Precision = 5;
353 # these values come from EMBOSS distmat implementation
354 %NucleotideIndexes = ( 'A' => 0,
355 'T' => 1,
356 'C' => 2,
357 'G' => 3,
359 'AT' => 0,
360 'AC' => 1,
361 'AG' => 2,
362 'CT' => 3,
363 'GT' => 4,
364 'CG' => 5,
366 # these are wrong now
367 # 'S' => [ 1, 3],
368 # 'W' => [ 0, 4],
369 # 'Y' => [ 2, 3],
370 # 'R' => [ 0, 1],
371 # 'M' => [ 0, 3],
372 # 'K' => [ 1, 2],
373 # 'B' => [ 1, 2, 3],
374 # 'H' => [ 0, 2, 3],
375 # 'V' => [ 0, 1, 3],
376 # 'D' => [ 0, 1, 2],
379 $DefaultGapPenalty = 0;
380 # could put ambiguities here?
381 %DNAChanges = ( 'Transversions' => { 'A' => [ 'T', 'C'],
382 'T' => [ 'A', 'G'],
383 'C' => [ 'A', 'G'],
384 'G' => [ 'C', 'T'],
386 'Transitions' => { 'A' => [ 'G' ],
387 'G' => [ 'A' ],
388 'C' => [ 'T' ],
389 'T' => [ 'C' ],
392 %DistanceMethods = ( 'jc|jukes|jukescantor|jukes\-cantor' => 'JukesCantor',
393 'jcuncor|uncorrected' => 'Uncorrected',
394 'f81|felsenstein81' => 'F81',
395 'k2|k2p|k80|kimura' => 'Kimura',
396 't92|tamura|tamura92' => 'Tamura',
397 'f84|felsenstein84' => 'F84',
398 'tajimanei|tajima\-nei' => 'TajimaNei',
399 'jinnei|jin\-nei' => 'JinNei');
402 use base qw(Bio::Root::Root Bio::Align::StatisticsI);
404 ## generate look up hashes for Nei_Gojobori methods##
405 $CODONS = get_codons();
406 my @t = split '', "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
407 #create look up hash of number of possible synonymous mutations per codon
408 $synsites = get_syn_sites();
409 #create reference look up hash of single basechanges in codons
410 %synchanges = get_syn_changes();
414 =head2 new
416 Title : new
417 Usage : my $obj = Bio::Align::DNAStatistics->new();
418 Function: Builds a new Bio::Align::DNAStatistics object
419 Returns : Bio::Align::DNAStatistics
420 Args : none
423 =cut
425 sub new {
426 my ($class,@args) = @_;
427 my $self = $class->SUPER::new(@args);
429 $self->pairwise_stats( Bio::Align::PairwiseStatistics->new());
431 return $self;
435 =head2 distance
437 Title : distance
438 Usage : my $distance_mat = $stats->distance(-align => $aln,
439 -method => $method);
440 Function: Calculates a distance matrix for all pairwise distances of
441 sequences in an alignment.
442 Returns : L<Bio::Matrix::PhylipDist> object
443 Args : -align => Bio::Align::AlignI object
444 -method => String specifying specific distance method
445 (implementing class may assume a default)
446 See also: L<Bio::Matrix::PhylipDist>
448 =cut
450 sub distance{
451 my ($self,@args) = @_;
452 my ($aln,$method) = $self->_rearrange([qw(ALIGN METHOD)],@args);
453 if( ! defined $aln || ! ref ($aln) || ! $aln->isa('Bio::Align::AlignI') ) {
454 $self->throw("Must supply a valid Bio::Align::AlignI for the -align parameter in distance");
456 $method ||= 'JukesCantor';
457 foreach my $m ( keys %DistanceMethods ) {
458 if(defined $m && $method =~ /$m/i ) {
459 my $mtd = "D_$DistanceMethods{$m}";
460 return $self->$mtd($aln);
463 $self->warn("Unrecognized distance method $method must be one of [".
464 join(',',$self->available_distance_methods())."]");
465 return;
468 =head2 available_distance_methods
470 Title : available_distance_methods
471 Usage : my @methods = $stats->available_distance_methods();
472 Function: Enumerates the possible distance methods
473 Returns : Array of strings
474 Args : none
477 =cut
479 sub available_distance_methods{
480 my ($self,@args) = @_;
481 return values %DistanceMethods;
484 =head2 D - distance methods
487 =cut
490 =head2 D_JukesCantor
492 Title : D_JukesCantor
493 Usage : my $d = $stat->D_JukesCantor($aln)
494 Function: Calculates D (pairwise distance) between 2 sequences in an
495 alignment using the Jukes-Cantor 1 parameter model.
496 Returns : L<Bio::Matrix::PhylipDist>
497 Args : L<Bio::Align::AlignI> of DNA sequences
498 double - gap penalty
501 =cut
503 sub D_JukesCantor{
504 my ($self,$aln,$gappenalty) = @_;
505 return 0 unless $self->_check_arg($aln);
506 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
507 # ambiguities ignored at this point
508 my (@seqs,@names,@values,%dist);
509 my $seqct = 0;
510 foreach my $seq ( $aln->each_seq) {
511 push @names, $seq->display_id;
512 push @seqs, uc $seq->seq();
513 $seqct++;
515 my $precisionstr = "%.$Precision"."f";
516 for(my $i = 0; $i < $seqct-1; $i++ ) {
517 # (diagonals) distance is 0 for same sequence
518 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
519 $values[$i][$i] = sprintf($precisionstr,0);
521 for( my $j = $i+1; $j < $seqct; $j++ ) {
522 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
523 $seqs[$j]);
524 # just want diagonals
525 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
526 $matrix->[2]->[2] + $matrix->[3]->[3] );
527 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
528 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
529 # fwd and rev lookup
530 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
531 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
532 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
533 # (diagonals) distance is 0 for same sequence
534 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
535 $values[$j][$j] = sprintf($precisionstr,0);
538 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
539 -matrix => \%dist,
540 -names => \@names,
541 -values => \@values);
544 =head2 D_F81
546 Title : D_F81
547 Usage : my $d = $stat->D_F81($aln)
548 Function: Calculates D (pairwise distance) between 2 sequences in an
549 alignment using the Felsenstein 1981 distance model.
550 Relaxes the assumption of equal base frequencies that is
551 in JC.
552 Returns : L<Bio::Matrix::PhylipDist>
553 Args : L<Bio::Align::AlignI> of DNA sequences
556 =cut
558 sub D_F81{
559 my ($self,$aln,$gappenalty) = @_;
560 return 0 unless $self->_check_arg($aln);
561 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
562 # ambiguities ignored at this point
563 my (@seqs,@names,@values,%dist);
564 my $seqct = 0;
565 foreach my $seq ( $aln->each_seq) {
566 push @names, $seq->display_id;;
567 push @seqs, uc $seq->seq();
568 $seqct++;
570 my $precisionstr = "%.$Precision"."f";
571 for(my $i = 0; $i < $seqct-1; $i++ ) {
572 # (diagonals) distance is 0 for same sequence
573 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
574 $values[$i][$i] = sprintf($precisionstr,0);
576 for( my $j = $i+1; $j < $seqct; $j++ ) {
578 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
579 $seqs[$j]);
580 # just want diagonals
581 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
582 $matrix->[2]->[2] + $matrix->[3]->[3] );
583 my $D = 1 - ( $m / ($aln->length - $gaps + ( $gaps * $gappenalty)));
584 my $d = (- 3 / 4) * log ( 1 - (4 * $D/ 3));
585 # fwd and rev lookup
586 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
587 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
588 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
589 # (diagonals) distance is 0 for same sequence
590 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
591 $values[$j][$j] = sprintf($precisionstr,0);
594 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
595 -matrix => \%dist,
596 -names => \@names,
597 -values => \@values);
600 =head2 D_Uncorrected
602 Title : D_Uncorrected
603 Usage : my $d = $stats->D_Uncorrected($aln)
604 Function: Calculate a distance D, no correction for multiple substitutions
605 is used. In rare cases where sequences may not overlap, 'NA' is
606 substituted for the distance.
607 Returns : L<Bio::Matrix::PhylipDist>
608 Args : L<Bio::Align::AlignI> (DNA Alignment)
609 [optional] gap penalty
611 =cut
613 sub D_Uncorrected {
614 my ($self,$aln,$gappenalty) = @_;
615 $gappenalty = $DefaultGapPenalty unless defined $gappenalty;
616 return 0 unless $self->_check_arg($aln);
617 # ambiguities ignored at this point
618 my (@seqs,@names,@values,%dist);
619 my $seqct = 0;
620 foreach my $seq ( $aln->each_seq) {
621 push @names, $seq->display_id;
622 push @seqs, uc $seq->seq();
623 $seqct++;
626 my $precisionstr = "%.$Precision"."f";
627 my $len = $aln->length;
628 for( my $i = 0; $i < $seqct-1; $i++ ) {
629 # (diagonals) distance is 0 for same sequence
630 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
631 $values[$i][$i] = sprintf($precisionstr,0);
633 for( my $j = $i+1; $j < $seqct; $j++ ) {
634 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
635 $seqs[$j]);
636 my $m = ( $matrix->[0]->[0] +
637 $matrix->[1]->[1] +
638 $matrix->[2]->[2] +
639 $matrix->[3]->[3] );
640 my $denom = ( $len - $gaps + ( $gaps * $gappenalty));
642 $self->warn("No distance calculated between $names[$i] and $names[$j], inserting -1")
643 unless $denom;
645 my $D = $denom ? 1 - ( $m / $denom) : -1;
646 # fwd and rev lookup
647 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
648 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
649 $values[$j][$i] = $values[$i][$j] = $denom ? sprintf($precisionstr,$D)
650 : sprintf("%-*s", $Precision + 2, $D);
651 # (diagonals) distance is 0 for same sequence
652 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
653 $values[$j][$j] = sprintf($precisionstr,0);
656 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
657 -matrix => \%dist,
658 -names => \@names,
659 -values => \@values);
663 # M Kimura, J. Mol. Evol., 1980, 16, 111.
665 =head2 D_Kimura
667 Title : D_Kimura
668 Usage : my $d = $stat->D_Kimura($aln)
669 Function: Calculates D (pairwise distance) between all pairs of sequences
670 in an alignment using the Kimura 2 parameter model.
671 Returns : L<Bio::Matrix::PhylipDist>
672 Args : L<Bio::Align::AlignI> of DNA sequences
675 =cut
677 sub D_Kimura {
678 my ($self,$aln) = @_;
679 return 0 unless $self->_check_arg($aln);
680 # ambiguities ignored at this point
681 my (@names,@values,%dist);
682 my $seqct = 0;
683 foreach my $seq ( $aln->each_seq) {
684 push @names, $seq->display_id;
685 $seqct++;
688 my $precisionstr = "%.$Precision"."f";
690 for( my $i = 0; $i < $seqct-1; $i++ ) {
691 # (diagonals) distance is 0 for same sequence
692 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
693 $values[$i][$i] = sprintf($precisionstr,0);
695 for( my $j = $i+1; $j < $seqct; $j++ ) {
696 my $pairwise = $aln->select_noncont($i+1,$j+1);
697 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
698 unless( $L ) {
699 $L = 1;
701 my $P = $self->transitions($pairwise) / $L;
702 my $Q = $self->transversions($pairwise) / $L;
703 my $K = 0;
704 my $denom = ( 1 - (2 * $P) - $Q);
705 if( $denom == 0 ) {
706 $self->throw("cannot find distance for ",$i+1,
707 ",",$j+1," $P, $Q\n");
709 my $a = 1 / ( 1 - (2 * $P) - $Q);
710 my $b = 1 / ( 1 - 2 * $Q );
711 if( $a < 0 || $b < 0 ) {
712 $K = -1;
713 } else{
714 $K = (1/2) * log ( $a ) + (1/4) * log($b);
716 # fwd and rev lookup
717 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
718 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
719 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
720 # (diagonals) distance is 0 for same sequence
721 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
722 $values[$j][$j] = sprintf($precisionstr,0);
725 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
726 -matrix => \%dist,
727 -names => \@names,
728 -values => \@values);
732 =head2 D_Kimura_variance
734 Title : D_Kimura
735 Usage : my $d = $stat->D_Kimura_variance($aln)
736 Function: Calculates D (pairwise distance) between all pairs of sequences
737 in an alignment using the Kimura 2 parameter model.
738 Returns : array of 2 L<Bio::Matrix::PhylipDist>,
739 the first is the Kimura distance and the second is
740 a matrix of variance V(K)
741 Args : L<Bio::Align::AlignI> of DNA sequences
744 =cut
746 sub D_Kimura_variance {
747 my ($self,$aln) = @_;
748 return 0 unless $self->_check_arg($aln);
749 # ambiguities ignored at this point
750 my (@names,@values,%dist,@var);
751 my $seqct = 0;
752 foreach my $seq ( $aln->each_seq) {
753 push @names, $seq->display_id;
754 $seqct++;
757 my $precisionstr = "%.$Precision"."f";
759 for( my $i = 0; $i < $seqct-1; $i++ ) {
760 # (diagonals) distance is 0 for same sequence
761 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
762 $values[$i][$i] = sprintf($precisionstr,0);
764 for( my $j = $i+1; $j < $seqct; $j++ ) {
765 my $pairwise = $aln->select_noncont($i+1,$j+1);
766 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
767 unless( $L ) {
768 $L = 1;
770 my $P = $self->transitions($pairwise) / $L;
771 my $Q = $self->transversions($pairwise) / $L;
772 my ($a,$b,$K,$var_k);
773 my $a_denom = ( 1 - (2 * $P) - $Q);
774 my $b_denom = 1 - 2 * $Q;
775 unless( $a_denom > 0 && $b_denom > 0 ) {
776 $a = 1;
777 $b = 1;
778 $K = -1;
779 $var_k = -1;
780 } else {
781 $a = 1 / $a_denom;
782 $b = 1 / $b_denom;
783 $K = (1/2) * log ( $a ) + (1/4) * log($b);
784 # from Wu and Li 1985 which in turn is from Kimura 1980
785 my $c = ( $a - $b ) / 2;
786 my $d = ( $a + $b ) / 2;
787 $var_k = ( $a**2 * $P + $d**2 * $Q - ( $a * $P + $d * $Q)**2 ) / $L;
790 # fwd and rev lookup
791 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
792 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
793 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$K);
794 # (diagonals) distance is 0 for same sequence
795 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
796 $values[$j]->[$j] = sprintf($precisionstr,0);
798 $var[$j]->[$i] = $var[$i]->[$j] = sprintf($precisionstr,$var_k);
799 $var[$j]->[$j] = $values[$j]->[$j];
802 return ( Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
803 -matrix => \%dist,
804 -names => \@names,
805 -values => \@values),
806 Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
807 -matrix => \%dist,
808 -names => \@names,
809 -values => \@var)
814 # K Tamura, Mol. Biol. Evol. 1992, 9, 678.
816 =head2 D_Tamura
818 Title : D_Tamura
819 Usage : Calculates D (pairwise distance) between 2 sequences in an
820 alignment using Tamura 1992 distance model.
821 Returns : L<Bio::Matrix::PhylipDist>
822 Args : L<Bio::Align::AlignI> of DNA sequences
825 =cut
827 sub D_Tamura {
828 my ($self,$aln) = @_;
829 return 0 unless $self->_check_arg($aln);
830 # ambiguities ignored at this point
831 my (@seqs,@names,@values,%dist,$i,$j);
832 my $seqct = 0;
833 my $length = $aln->length;
834 foreach my $seq ( $aln->each_seq) {
835 push @names, $seq->display_id;;
836 push @seqs, uc $seq->seq();
837 $seqct++;
840 my $precisionstr = "%.$Precision"."f";
841 my (@gap,@gc,@trans,@tranv,@score);
842 $i = 0;
843 for my $t1 ( @seqs ) {
844 $j = 0;
845 for my $t2 ( @seqs ) {
846 $gap[$i][$j] = 0;
847 for( my $k = 0; $k < $length; $k++ ) {
848 my ($c1,$c2) = ( substr($seqs[$i],$k,1),
849 substr($seqs[$j],$k,1) );
850 if( $c1 =~ /^$GapChars$/ ||
851 $c2 =~ /^$GapChars$/ ) {
852 $gap[$i][$j]++;
853 } elsif( $c2 =~ /^$GCChhars$/i ) {
854 $gc[$i][$j]++;
857 $gc[$i][$j] = ( $gc[$i][$j] /
858 ($length - $gap[$i][$j]) );
859 $j++;
861 $i++;
864 for( $i = 0; $i < $seqct-1; $i++ ) {
865 # (diagonals) distance is 0 for same sequence
866 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
867 $values[$i][$i] = sprintf($precisionstr,0);
869 for( $j = $i+1; $j < $seqct; $j++ ) {
871 my $pairwise = $aln->select_noncont($i+1,$j+1);
872 my $L = $self->pairwise_stats->number_of_comparable_bases($pairwise);
873 my $P = $self->transitions($pairwise) / $L;
874 my $Q = $self->transversions($pairwise) / $L;
875 my $C = $gc[$i][$j] + $gc[$j][$i]-
876 ( 2 * $gc[$i][$j] * $gc[$j][$i] );
877 if( $P ) {
878 $P = $P / $C;
880 my $d = -($C * log(1- $P - $Q)) -(0.5* ( 1 - $C) * log(1 - 2 * $Q));
881 # fwd and rev lookup
882 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
883 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
884 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
885 # (diagonals) distance is 0 for same sequence
886 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
887 $values[$j][$j] = sprintf($precisionstr,0);
890 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
891 -matrix => \%dist,
892 -names => \@names,
893 -values => \@values);
897 =head2 D_F84
899 Title : D_F84
900 Usage : my $d = $stat->D_F84($aln)
901 Function: Calculates D (pairwise distance) between 2 sequences in an
902 alignment using the Felsenstein 1984 distance model.
903 Returns : L<Bio::Matrix::PhylipDist>
904 Args : L<Bio::Align::AlignI> of DNA sequences
905 [optional] double - gap penalty
907 =cut
909 sub D_F84 {
910 my ($self,$aln,$gappenalty) = @_;
911 return 0 unless $self->_check_arg($aln);
912 $self->throw_not_implemented();
913 # ambiguities ignored at this point
914 my (@seqs,@names,@values,%dist);
915 my $seqct = 0;
916 foreach my $seq ( $aln->each_seq) {
917 # if there is no name,
918 my $id = $seq->display_id;
919 if( ! length($id) || # deal with empty names
920 $id =~ /^\s+$/ ) {
921 $id = $seqct+1;
923 push @names, $id;
924 push @seqs, uc $seq->seq();
925 $seqct++;
928 my $precisionstr = "%.$Precision"."f";
930 for( my $i = 0; $i < $seqct-1; $i++ ) {
931 # (diagonals) distance is 0 for same sequence
932 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
933 $values[$i][$i] = sprintf($precisionstr,0);
935 for( my $j = $i+1; $j < $seqct; $j++ ) {
940 # Tajima and Nei, Mol. Biol. Evol. 1984, 1, 269.
941 # Tajima-Nei correction used for multiple substitutions in the calc
942 # of the distance matrix. Nucleic acids only.
944 # D = p-distance = 1 - (matches/(posns_scored + gaps)
946 # distance = -b * ln(1-D/b)
949 =head2 D_TajimaNei
951 Title : D_TajimaNei
952 Usage : my $d = $stat->D_TajimaNei($aln)
953 Function: Calculates D (pairwise distance) between 2 sequences in an
954 alignment using the TajimaNei 1984 distance model.
955 Returns : L<Bio::Matrix::PhylipDist>
956 Args : Bio::Align::AlignI of DNA sequences
959 =cut
961 sub D_TajimaNei{
962 my ($self,$aln) = @_;
963 return 0 unless $self->_check_arg($aln);
964 # ambiguities ignored at this point
965 my (@seqs,@names,@values,%dist);
966 my $seqct = 0;
967 foreach my $seq ( $aln->each_seq) {
968 # if there is no name,
969 push @names, $seq->display_id;
970 push @seqs, uc $seq->seq();
971 $seqct++;
973 my $precisionstr = "%.$Precision"."f";
974 my ($i,$j,$bs);
975 # pairwise
976 for( $i =0; $i < $seqct -1; $i++ ) {
977 $dist{$names[$i]}->{$names[$i]} = [$i,$i];
978 $values[$i][$i] = sprintf($precisionstr,0);
980 for ( $j = $i+1; $j <$seqct;$j++ ) {
981 my ($matrix,$pfreq,$gaps) = $self->_build_nt_matrix($seqs[$i],
982 $seqs[$j]);
983 my $pairwise = $aln->select_noncont($i+1,$j+1);
984 my $slen = $self->pairwise_stats->number_of_comparable_bases($pairwise);
985 my $fij2 = 0;
986 for( $bs = 0; $bs < 4; $bs++ ) {
987 my $fi = 0;
988 map {$fi += $matrix->[$bs]->[$_] } 0..3;
989 my $fj = 0;
990 # summation
991 map { $fj += $matrix->[$_]->[$bs] } 0..3;
992 my $fij = ( $fi && $fj ) ? ($fi + $fj) /( 2 * $slen) : 0;
993 $fij2 += $fij**2;
996 my ($pair,$h) = (0,0);
997 for( $bs = 0; $bs < 3; $bs++ ) {
998 for(my $bs1 = $bs+1; $bs1 <= 3; $bs1++ ) {
999 my $fij = $pfreq->[$pair++] / $slen;
1000 if( $fij ) {
1002 my ($ci1,$ci2,$cj1,$cj2) = (0,0,0,0);
1004 map { $ci1 += $matrix->[$_]->[$bs] } 0..3;
1005 map { $cj1 += $matrix->[$bs]->[$_] } 0..3;
1006 map { $ci2 += $matrix->[$_]->[$bs1] } 0..3;
1007 map { $cj2 += $matrix->[$bs1]->[$_] } 0..3;
1009 if( $fij ) {
1010 $h += ( ($fij**2) / 2 ) /
1011 ( ( ( $ci1 + $cj1 ) / (2 * $slen) ) *
1012 ( ( $ci2 + $cj2 ) / (2 * $slen) )
1015 $self->debug( "slen is $slen h is $h fij = $fij ci1 =$ci1 cj1=$cj1 ci2=$ci2 cj2=$cj2\n");
1019 # just want diagonals which are matches (A matched A, C -> C)
1021 my $m = ( $matrix->[0]->[0] + $matrix->[1]->[1] +
1022 $matrix->[2]->[2] + $matrix->[3]->[3] );
1023 my $D = 1 - ( $m / $slen);
1024 my $d;
1025 if( $h == 0 ) {
1026 $d = -1;
1027 } else {
1028 my $b = (1 - $fij2 + (($D**2)/$h)) / 2;
1029 my $c = 1- $D/ $b;
1031 if( $c < 0 ) {
1032 $d = -1;
1033 } else {
1034 $d = (-1 * $b) * log ( $c);
1037 # fwd and rev lookup
1038 $dist{$names[$i]}->{$names[$j]} = [$i,$j];
1039 $dist{$names[$j]}->{$names[$i]} = [$i,$j];
1040 $values[$j][$i] = $values[$i][$j] = sprintf($precisionstr,$d);
1042 # (diagonals) distance is 0 for same sequence
1043 $dist{$names[$j]}->{$names[$j]} = [$j,$j];
1044 $values[$j][$j] = sprintf($precisionstr,0);
1047 return Bio::Matrix::PhylipDist->new(-program => 'bioperl_DNAstats',
1048 -matrix => \%dist,
1049 -names => \@names,
1050 -values => \@values);
1054 # Jin and Nei, Mol. Biol. Evol. 82, 7, 1990.
1056 =head2 D_JinNei
1058 Title : D_JinNei
1059 Usage : my $d = $stat->D_JinNei($aln)
1060 Function: Calculates D (pairwise distance) between 2 sequences in an
1061 alignment using the Jin-Nei 1990 distance model.
1062 Returns : L<Bio::Matrix::PhylipDist>
1063 Args : L<Bio::Align::AlignI> of DNA sequences
1066 =cut
1068 sub D_JinNei{
1069 my ($self,@args) = @_;
1070 $self->warn("JinNei implementation not completed");
1071 return;
1074 =head2 transversions
1076 Title : transversions
1077 Usage : my $transversions = $stats->transversion($aln);
1078 Function: Calculates the number of transversions between two sequences in
1079 an alignment
1080 Returns : integer
1081 Args : Bio::Align::AlignI
1084 =cut
1086 sub transversions{
1087 my ($self,$aln) = @_;
1088 return $self->_trans_count_helper($aln, $DNAChanges{'Transversions'});
1091 =head2 transitions
1093 Title : transitions
1094 Usage : my $transitions = Bio::Align::DNAStatistics->transitions($aln);
1095 Function: Calculates the number of transitions in a given DNA alignment
1096 Returns : integer representing the number of transitions
1097 Args : Bio::Align::AlignI object
1100 =cut
1102 sub transitions{
1103 my ($self,$aln) = @_;
1104 return $self->_trans_count_helper($aln, $DNAChanges{'Transitions'});
1108 sub _trans_count_helper {
1109 my ($self,$aln,$type) = @_;
1110 return 0 unless( $self->_check_arg($aln) );
1111 if( ! $aln->is_flush ) { $self->throw("must be flush") }
1112 my (@tcount);
1113 my ($first,$second) = ( uc $aln->get_seq_by_pos(1)->seq(),
1114 uc $aln->get_seq_by_pos(2)->seq() );
1115 my $alen = $aln->length;
1116 for (my $i = 0;$i<$alen; $i++ ) {
1117 my ($c1,$c2) = ( substr($first,$i,1),
1118 substr($second,$i,1) );
1119 if( $c1 ne $c2 ) {
1120 foreach my $nt ( @{$type->{$c1}} ) {
1121 if( $nt eq $c2) {
1122 $tcount[$i]++;
1127 my $sum = 0;
1128 map { if( $_) { $sum += $_} } @tcount;
1129 return $sum;
1132 # this will generate a matrix which records across the row, the number
1133 # of DNA subst
1135 sub _build_nt_matrix {
1136 my ($self,$seqa,$seqb) = @_;
1139 my $basect_matrix = [ [ qw(0 0 0 0) ], # number of bases that match
1140 [ qw(0 0 0 0) ],
1141 [ qw(0 0 0 0) ],
1142 [ qw(0 0 0 0) ] ];
1143 my $gaps = 0; # number of gaps
1144 my $pfreq = [ qw( 0 0 0 0 0 0)]; # matrix for pair frequency
1145 my $len_a = length($seqa);
1146 for( my $i = 0; $i < $len_a; $i++) {
1147 my ($ti,$tj) = (substr($seqa,$i,1),substr($seqb,$i,1));
1148 $ti =~ tr/U/T/;
1149 $tj =~ tr/U/T/;
1151 if( $ti =~ /^$GapChars$/) { $gaps++; next; }
1152 if( $tj =~ /^$GapChars$/) { $gaps++; next }
1154 my $ti_index = $NucleotideIndexes{$ti};
1155 my $tj_index = $NucleotideIndexes{$tj};
1157 if( ! defined $ti_index ) {
1158 $self->warn("ti_index not defined for $ti\n");
1159 next;
1162 $basect_matrix->[$ti_index]->[$tj_index]++;
1164 if( $ti ne $tj ) {
1165 $pfreq->[$NucleotideIndexes{join('',sort ($ti,$tj))}]++;
1168 return ($basect_matrix,$pfreq,$gaps);
1171 sub _check_ambiguity_nucleotide {
1172 my ($base1,$base2) = @_;
1173 my %iub = Bio::Tools::IUPAC->iupac_iub();
1174 my @amb1 = @{ $iub{uc($base1)} };
1175 my @amb2 = @{ $iub{uc($base2)} };
1176 my ($pmatch) = (0);
1177 for my $amb ( @amb1 ) {
1178 if( grep { $amb eq $_ } @amb2 ) {
1179 $pmatch = 1;
1180 last;
1183 if( $pmatch ) {
1184 return (1 / scalar @amb1) * (1 / scalar @amb2);
1185 } else {
1186 return 0;
1191 sub _check_arg {
1192 my($self,$aln ) = @_;
1193 if( ! defined $aln || ! $aln->isa('Bio::Align::AlignI') ) {
1194 $self->warn("Must provide a Bio::Align::AlignI compliant object to Bio::Align::DNAStatistics");
1195 return 0;
1196 } elsif( $aln->get_seq_by_pos(1)->alphabet ne 'dna' ) {
1197 $self->warn("Must provide a DNA alignment to Bio::Align::DNAStatistics, you provided a " . $aln->get_seq_by_pos(1)->alphabet);
1198 return 0;
1200 return 1;
1203 =head2 Data Methods
1205 =cut
1207 =head2 pairwise_stats
1209 Title : pairwise_stats
1210 Usage : $obj->pairwise_stats($newval)
1211 Function:
1212 Returns : value of pairwise_stats
1213 Args : newvalue (optional)
1216 =cut
1218 sub pairwise_stats{
1219 my ($self,$value) = @_;
1220 if( defined $value) {
1221 $self->{'_pairwise_stats'} = $value;
1223 return $self->{'_pairwise_stats'};
1227 =head2 calc_KaKs_pair
1229 Title : calc_KaKs_pair
1230 Useage : my $results = $stats->calc_KaKs_pair($alnobj,
1231 $name1, $name2).
1232 Function : calculates Nei-Gojobori statistics for pairwise
1233 comparison.
1234 Args : A Bio::Align::AlignI compliant object such as a
1235 Bio::SimpleAlign object, and 2 sequence name strings.
1236 Returns : a reference to a hash of statistics with keys as
1237 listed in Description.
1239 =cut
1241 sub calc_KaKs_pair {
1242 my ( $self, $aln, $seq1_id, $seq2_id) = @_;
1243 $self->throw("Needs 3 arguments - an alignment object, and 2 sequence ids")
1244 if @_!= 4;
1245 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1246 my @seqs = (
1247 #{id => $seq1_id, seq =>($aln->each_seq_with_id($seq1_id))[0]->seq},
1248 #{id => $seq2_id, seq =>($aln->each_seq_with_id($seq2_id))[0]->seq}
1249 {id => $seq1_id, seq => uc(($aln->each_seq_with_id($seq1_id))[0]->seq)},
1250 {id => $seq2_id, seq => uc(($aln->each_seq_with_id($seq2_id))[0]->seq)}
1252 if (length($seqs[0]{'seq'}) != length($seqs[1]{'seq'})) {
1253 $self->throw(" aligned sequences must be of equal length!");
1255 my $results = [];
1256 $self->_get_av_ds_dn(\@seqs, $results);
1257 return $results;
1261 =head2 calc_all_KaKs_pairs
1263 Title : calc_all_KaKs_pairs
1264 Useage : my $results2 = $stats->calc_KaKs_pair($alnobj).
1265 Function : Calculates Nei_gojobori statistics for all pairwise
1266 combinations in sequence.
1267 Arguments: A Bio::Align::ALignI compliant object such as
1268 a Bio::SimpleAlign object.
1269 Returns : A reference to an array of hashes of statistics of
1270 all pairwise comparisons in the alignment.
1272 =cut
1276 sub calc_all_KaKs_pairs {
1277 #returns a multi_element_array with all pairwise comparisons
1278 my ($self,$aln) = @_;
1279 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1280 my @seqs;
1281 for my $seq ($aln->each_seq) {
1282 push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1284 my $results ;
1285 $results = $self->_get_av_ds_dn(\@seqs, $results);
1286 return $results;
1289 =head2 calc_average_KaKs
1291 Title : calc_average_KaKs.
1292 Useage : my $res= $stats->calc_average_KaKs($alnobj, 1000).
1293 Function : calculates Nei_Gojobori stats for average of all
1294 sequences in the alignment.
1295 Args : A Bio::Align::AlignI compliant object such as a
1296 Bio::SimpleAlign object, number of bootstrap iterations
1297 (default 1000).
1298 Returns : A reference to a hash of statistics as listed in Description.
1300 =cut
1302 sub calc_average_KaKs {
1303 #calculates global value for sequences in alignment using bootstrapping
1304 #this is quite slow (~10 seconds per 3 X 200nt seqs);
1305 my ($self, $aln, $bootstrap_rpt) = @_;
1306 $bootstrap_rpt ||= 1000;
1307 $self->throw ("This calculation needs a Bio::Align::AlignI compatible object, not a [ " . ref($aln) . " ]object") unless $aln->isa('Bio::Align::AlignI');
1308 my @seqs;
1309 for my $seq ($aln->each_seq) {
1310 push @seqs, {id => $seq->display_id, seq=>$seq->seq};
1312 my $results ;
1313 my ($ds_orig, $dn_orig) = $self->_get_av_ds_dn(\@seqs);
1314 #print "ds = $ds_orig, dn = $dn_orig\n";
1315 $results = {D_s => $ds_orig, D_n => $dn_orig};
1316 $self->_run_bootstrap(\@seqs, $results, $bootstrap_rpt);
1317 return $results;
1320 ############## primary internal subs for alignment comparisons ########################
1322 sub _run_bootstrap {
1323 ### generates sampled sequences, calculates Ds and Dn values,
1324 ### then calculates variance of sampled sequences and add results to results hash
1325 ###
1326 my ($self,$seq_ref, $results, $bootstrap_rpt) = @_;
1327 my @seqs = @$seq_ref;
1328 my @btstrp_aoa; # to hold array of array of nucleotides for resampling
1329 my %bootstrap_values = (ds => [], dn =>[]); # to hold list of av values
1331 #1st make alternative array of codons;
1332 my $c = 0;
1333 while ($c < length $seqs[0]{'seq'}) {
1334 for (0..$#seqs) {
1335 push @{$btstrp_aoa[$_]}, substr ($seqs[$_]{'seq'}, $c, 3);
1337 $c+=3;
1340 for (1..$bootstrap_rpt) {
1341 my $sampled = _resample (\@btstrp_aoa);
1342 my ($ds, $dn) = $self->_get_av_ds_dn ($sampled) ; # is array ref
1343 push @{$bootstrap_values{'ds'}}, $ds;
1344 push @{$bootstrap_values{'dn'}}, $dn;
1347 $results->{'D_s_var'} = sampling_variance($bootstrap_values{'ds'});
1348 $results->{'D_n_var'} = sampling_variance($bootstrap_values{'dn'});
1349 $results->{'z_score'} = ($results->{'D_n'} - $results->{'D_s'}) /
1350 sqrt($results->{'D_s_var'} + $results->{'D_n_var'} );
1351 #print "bootstrapped var_syn = $results->{'D_s_var'} \n" ;
1352 #print "bootstrapped var_nc = $results->{'D_n_var'} \n";
1353 #print "z is $results->{'z_score'}\n"; ### end of global set up of/perm look up data
1356 sub _resample {
1357 my $ref = shift;
1358 my $codon_num = scalar (@{$ref->[0]});
1359 my @altered;
1360 for (0..$codon_num -1) { #for each codon
1361 my $rand = int (rand ($codon_num));
1362 for (0..$#$ref) {
1363 push @{$altered[$_]}, $ref->[$_][$rand];
1366 my @stringed = map {join '', @$_}@altered;
1367 my @return;
1368 #now out in random name to keep other subs happy
1369 for (@stringed) {
1370 push @return, {id=>'1', seq=> $_};
1372 return \@return;
1375 sub _get_av_ds_dn {
1376 # takes array of hashes of sequence strings and ids #
1377 my $self = shift;
1378 my $seq_ref = shift;
1379 my $result = shift if @_;
1380 my @caller = caller(1);
1381 my @seqarray = @$seq_ref;
1382 my $bootstrap_score_list;
1383 #for a multiple alignment considers all pairwise combinations#
1384 my %dsfor_average = (ds => [], dn => []);
1385 for (my $i = 0; $i < scalar @seqarray; $i++) {
1386 for (my $j = $i +1; $j<scalar @seqarray; $j++ ){
1387 # print "comparing $i and $j\n";
1388 if (length($seqarray[$i]{'seq'}) != length($seqarray[$j]{'seq'})) {
1389 $self->warn(" aligned sequences must be of equal length!");
1390 next;
1393 my $syn_site_count = count_syn_sites($seqarray[$i]{'seq'}, $synsites);
1394 my $syn_site_count2 = count_syn_sites($seqarray[$j]{'seq'}, $synsites);
1395 # print "syn 1 is $syn_site_count , syn2 is $syn_site_count2\n";
1396 my ($syn_count, $non_syn_count, $gap_cnt) = analyse_mutations($seqarray[$i]{'seq'}, $seqarray[$j]{'seq'});
1397 #get averages
1398 my $av_s_site = ($syn_site_count + $syn_site_count2)/2;
1399 my $av_ns_syn_site = length($seqarray[$i]{'seq'}) - $gap_cnt- $av_s_site ;
1401 #calculate ps and pn (p54)
1402 my $syn_prop = $syn_count / $av_s_site;
1403 my $nc_prop = $non_syn_count / $av_ns_syn_site ;
1405 #now use jukes/cantor to calculate D_s and D_n, would alter here if needed a different method
1406 my $d_syn = $self->jk($syn_prop);
1407 my $d_nc = $self->jk($nc_prop);
1409 #JK calculation must succeed for continuation of calculation
1410 #ret_value = -1 if error
1411 next unless $d_nc >=0 && $d_syn >=0;
1414 push @{$dsfor_average{'ds'}}, $d_syn;
1415 push @{$dsfor_average{'dn'}}, $d_nc;
1417 #if not doing bootstrap, calculate the pairwise comparisin stats
1418 if ($caller[3] =~ /calc_KaKs_pair/ || $caller[3] =~ /calc_all_KaKs_pairs/) {
1419 #now calculate variances assuming large sample
1420 my $d_syn_var = jk_var($syn_prop, length($seqarray[$i]{'seq'}) - $gap_cnt );
1421 my $d_nc_var = jk_var($nc_prop, length ($seqarray[$i]{'seq'}) - $gap_cnt);
1422 #now calculate z_value
1423 #print "d_syn_var is $d_syn_var,and d_nc_var is $d_nc_var\n";
1424 #my $z = ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var);
1425 my $z = ($d_syn_var + $d_nc_var) ?
1426 ($d_nc - $d_syn) / sqrt($d_syn_var + $d_nc_var) : 0;
1427 # print "z is $z\n";
1428 push @$result , {S => $av_s_site, N=>$av_ns_syn_site,
1429 S_d => $syn_count, N_d =>$non_syn_count,
1430 P_s => $syn_prop, P_n=>$nc_prop,
1431 D_s => @{$dsfor_average{'ds'}}[-1],
1432 D_n => @{$dsfor_average{'dn'}}[-1],
1433 D_n_var =>$d_nc_var, D_s_var => $d_syn_var,
1434 Seq1 => $seqarray[$i]{'id'},
1435 Seq2 => $seqarray[$j]{'id'},
1436 z_score => $z,
1438 $self->warn (" number of mutations too small to justify normal test for $seqarray[$i]{'id'} and $seqarray[$j]{'id'}\n- use Fisher's exact, or bootstrap a MSA")
1439 if ($syn_count < 10 || $non_syn_count < 10 ) && $self->verbose > -1 ;
1440 }#endif
1444 #warn of failure if no results hashes are present
1445 #will fail if Jukes Cantor has failed for all pairwise combinations
1446 #$self->warn("calculation failed!") if scalar @$result ==0;
1448 #return results unless bootstrapping
1449 return $result if $caller[3]=~ /calc_all_KaKs/ || $caller[3] =~ /calc_KaKs_pair/;
1450 #else if getting average for bootstrap
1451 return( mean ($dsfor_average{'ds'}),mean ($dsfor_average{'dn'})) ;
1455 sub jk {
1456 my ($self, $p) = @_;
1457 if ($p > 0.75) {
1458 $self->warn( " Jukes Cantor won't work -too divergent!");
1459 return -1;
1461 return -1 * (3/4) * (log(1 - (4/3) * $p));
1464 #works for large value of n (50?100?)
1465 sub jk_var {
1466 my ($p, $n) = @_;
1467 return (9 * $p * (1 -$p))/(((3 - 4 *$p) **2) * $n);
1471 # compares 2 sequences to find the number of synonymous/non
1472 # synonymous mutations between them
1474 sub analyse_mutations {
1475 my ($seq1, $seq2) = @_;
1476 my %mutator = ( 2=> {0=>[[1,2], # codon positions to be altered
1477 [2,1]], # depend on which is the same
1478 1=>[[0,2],
1479 [2,0]],
1480 2=>[[0,1],
1481 [1,0]],
1483 3=> [ [0,1,2], # all need to be altered
1484 [1,0,2],
1485 [0,2,1],
1486 [1,2,0],
1487 [2,0,1],
1488 [2,1,0] ],
1490 my $TOTAL = 0; # total synonymous changes
1491 my $TOTAL_n = 0; # total non-synonymous changes
1492 my $gap_cnt = 0;
1494 my %input;
1495 my $seqlen = length($seq1);
1496 for (my $j=0; $j< $seqlen; $j+=3) {
1497 $input{'cod1'} = substr($seq1, $j,3);
1498 $input{'cod2'} = substr($seq2, $j,3);
1500 #ignore codon if beeing compared with gaps!
1501 if ($input{'cod1'} =~ /\-/ || $input{'cod2'} =~ /\-/){
1502 $gap_cnt += 3; #just increments once if there is a pair of gaps
1503 next;
1506 my ($diff_cnt, $same) = count_diffs(\%input);
1508 #ignore if codons are identical
1509 next if $diff_cnt == 0 ;
1510 if ($diff_cnt == 1) {
1511 $TOTAL += $synchanges{$input{'cod1'}}{$input{'cod2'}};
1512 $TOTAL_n += 1 - $synchanges{$input{'cod1'}}{$input{'cod2'}};
1513 #print " \nfordiff is 1 , total now $TOTAL, total n now $TOTAL_n\n\n"
1515 elsif ($diff_cnt ==2) {
1516 my $s_cnt = 0;
1517 my $n_cnt = 0;
1518 my $tot_muts = 4;
1519 #will stay 4 unless there are stop codons at intervening point
1520 OUTER:for my $perm (@{$mutator{'2'}{$same}}) {
1521 my $altered = $input{'cod1'};
1522 my $prev= $altered;
1523 # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1524 for my $mut_i (@$perm) { #index of codon mutated
1525 substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1526 if ($t[$CODONS->{$altered}] eq '*') {
1527 $tot_muts -=2;
1528 #print "changes to stop codon!!\n";
1529 next OUTER;
1531 else {
1532 $s_cnt += $synchanges{$prev}{$altered};
1533 # print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1535 $prev = $altered;
1537 # print "\n";
1539 if ($tot_muts != 0) {
1540 $TOTAL += ($s_cnt/($tot_muts/2));
1541 $TOTAL_n += ($tot_muts - $s_cnt)/ ($tot_muts / 2);
1545 elsif ($diff_cnt ==3 ) {
1546 my $s_cnt = 0;
1547 my $n_cnt = 0;
1548 my $tot_muts = 18; #potential number of mutations
1549 OUTER: for my $perm (@{$mutator{'3'}}) {
1550 my $altered = $input{'cod1'};
1551 my $prev= $altered;
1552 # print "$prev -> (", $t[$CODONS->{$altered}], ")";
1553 for my $mut_i (@$perm) { #index of codon mutated
1554 substr($altered, $mut_i,1) = substr($input{'cod2'}, $mut_i, 1);
1555 if ($t[$CODONS->{$altered}] eq '*') {
1556 $tot_muts -=3;
1557 # print "changes to stop codon!!\n";
1558 next OUTER;
1561 else {
1562 $s_cnt += $synchanges{$prev}{$altered};
1563 # print "$altered ->(", $t[$CODONS->{$altered}], ") ";
1565 $prev = $altered;
1567 # print "\n";
1569 }#end OUTER loop
1570 #calculate number of synonymous/non synonymous mutations for that codon
1571 # and add to total
1572 if ($tot_muts != 0) {
1573 $TOTAL += ($s_cnt / ($tot_muts /3));
1574 $TOTAL_n += 3 - ($s_cnt / ($tot_muts /3));
1576 } #endif $diffcnt = 3
1577 } #end of sequencetraversal
1578 return ($TOTAL, $TOTAL_n, $gap_cnt);
1582 sub count_diffs {
1583 #counts the number of nucleotide differences between 2 codons
1584 # returns this value plus the codon index of which nucleotide is the same when 2
1585 #nucleotides are different. This is so analyse_mutations() knows which nucleotides
1586 # to change.
1587 my $ref = shift;
1588 my $cnt = 0;
1589 my $same= undef;
1590 #just for 2 differences
1591 for (0..2) {
1592 if (substr($ref->{'cod1'}, $_,1) ne substr($ref->{'cod2'}, $_, 1)){
1593 $cnt++;
1594 } else {
1595 $same = $_;
1598 return ($cnt, $same);
1601 =head2 get_syn_changes
1603 Title : get_syn_changes
1604 Usage : Bio::Align::DNAStatitics->get_syn_changes
1605 Function: Generate a hashref of all pairwise combinations of codns
1606 differing by 1
1607 Returns : Symetic matrix using hashes
1608 First key is codon
1609 and each codon points to a hashref of codons
1610 the values of which describe type of change.
1611 my $type = $hash{$codon1}->{$codon2};
1612 values are :
1613 1 synonymous
1614 0 non-syn
1615 -1 either codon is a stop codon
1616 Args : none
1618 =cut
1620 sub get_syn_changes {
1621 #hash of all pairwise combinations of codons differing by 1
1622 # 1 = syn, 0 = non-syn, -1 = stop
1623 my %results;
1624 my @codons = _make_codons ();
1625 my $arr_len = scalar @codons;
1626 for (my $i = 0; $i < $arr_len -1; $i++) {
1627 my $cod1 = $codons[$i];
1628 for (my $j = $i +1; $j < $arr_len; $j++) {
1629 my $diff_cnt = 0;
1630 for my $pos(0..2) {
1631 $diff_cnt++ if substr($cod1, $pos, 1) ne substr($codons[$j], $pos, 1);
1633 next if $diff_cnt !=1;
1635 #synon change
1636 if($t[$CODONS->{$cod1}] eq $t[$CODONS->{$codons[$j]}]) {
1637 $results{$cod1}{$codons[$j]} =1;
1638 $results{$codons[$j]}{$cod1} = 1;
1640 #stop codon
1641 elsif ($t[$CODONS->{$cod1}] eq '*' or $t[$CODONS->{$codons[$j]}] eq '*') {
1642 $results{$cod1}{$codons[$j]} = -1;
1643 $results{$codons[$j]}{$cod1} = -1;
1645 # nc change
1646 else {
1647 $results{$cod1}{$codons[$j]} = 0;
1648 $results{$codons[$j]}{$cod1} = 0;
1652 return %results;
1655 =head2 dnds_pattern_number
1657 Title : dnds_pattern_number
1658 Usage : my $patterns = $stats->dnds_pattern_number($alnobj);
1659 Function: Counts the number of codons with no gaps in the MSA
1660 Returns : Number of codons with no gaps ('patterns' in PAML notation)
1661 Args : A Bio::Align::AlignI compliant object such as a
1662 Bio::SimpleAlign object.
1664 =cut
1666 sub dnds_pattern_number{
1667 my ($self, $aln) = @_;
1668 return ($aln->remove_gaps->length)/3;
1671 sub count_syn_sites {
1672 #counts the number of possible synonymous changes for sequence
1673 my ($seq, $synsite) = @_;
1674 __PACKAGE__->throw("not integral number of codons") if length($seq) % 3 != 0;
1675 my $S = 0;
1676 for (my $i = 0; $i< length($seq); $i+=3) {
1677 my $cod = substr($seq, $i, 3);
1678 next if $cod =~ /\-/; #deal with alignment gaps
1679 $S += $synsite->{$cod}{'s'};
1681 #print "S is $S\n";
1682 return $S;
1687 sub get_syn_sites {
1688 #sub to generate lookup hash for the number of synonymous changes per codon
1689 my @nucs = qw(T C A G);
1690 my %raw_results;
1691 for my $i (@nucs) {
1692 for my $j (@nucs) {
1693 for my $k (@nucs) {
1694 # for each possible codon
1695 my $cod = "$i$j$k";
1696 my $aa = $t[$CODONS->{$cod}];
1697 #calculate number of synonymous mutations vs non syn mutations
1698 for my $i (qw(0 1 2)){
1699 my $s = 0;
1700 my $n = 3;
1701 for my $nuc (qw(A T C G)) {
1702 next if substr ($cod, $i,1) eq $nuc;
1703 my $test = $cod;
1704 substr($test, $i, 1) = $nuc ;
1705 if ($t[$CODONS->{$test}] eq $aa) {
1706 $s++;
1708 if ($t[$CODONS->{$test}] eq '*') {
1709 $n--;
1712 $raw_results{$cod}[$i] = {'s' => $s ,
1713 'n' => $n };
1716 } #end analysis of single codon
1718 } #end analysis of all codons
1719 my %final_results;
1721 for my $cod (sort keys %raw_results) {
1722 my $t = 0;
1723 map{$t += ($_->{'s'} /$_->{'n'})} @{$raw_results{$cod}};
1724 $final_results{$cod} = { 's'=>$t, 'n' => 3 -$t};
1726 return \%final_results;
1729 sub _make_codons {
1730 #makes all codon combinations, returns array of them
1731 my @nucs = qw(T C A G);
1732 my @codons;
1733 for my $i (@nucs) {
1734 for my $j (@nucs) {
1735 for my $k (@nucs) {
1736 push @codons, "$i$j$k";
1740 return @codons;
1743 sub get_codons {
1744 #generates codon translation look up table#
1745 my $x = 0;
1746 my $CODONS = {};
1747 for my $codon (_make_codons) {
1748 $CODONS->{$codon} = $x;
1749 $x++;
1751 return $CODONS;
1754 #########stats subs, can go in another module? Here for speed. ###
1755 sub mean {
1756 my $ref = shift;
1757 my $el_num = scalar @$ref;
1758 my $tot = 0;
1759 map{$tot += $_}@$ref;
1760 return ($tot/$el_num);
1763 sub variance {
1764 my $ref = shift;
1765 my $mean = mean($ref);
1766 my $sum_of_squares = 0;
1767 map{$sum_of_squares += ($_ - $mean) **2}@$ref;
1768 return $sum_of_squares;
1771 sub sampling_variance {
1772 my $ref = shift;
1773 return variance($ref) / (scalar @$ref -1);