1 /************************************************************
2 * Copyright (C) 2005, BGI of Chinese Academy of Sciences
6 * Abstract: Defination of YN00 class.
9 * Author: Zhang Zhang (zhanghzhang@genomics.org.cn)
16 Note: Source codes are adapted from yn00.c in PAML.
19 Yang Z, Nielsen R (2000) Estimating Synonymous and
20 Nonsynonymous Substitution Rates Under Realistic
21 Evolutionary Models. Mol Biol Evol 17:32-43.
22 *************************************************************/
30 initArray(f12pos
, 12);
35 void YN00::getFreqency(const string seq1
, const string seq2
) {
40 //Get A,C,G,T frequency at three positions
41 for(i
=0; i
<seq1
.length(); i
++) {
42 f12pos
[(i
%3)*4+convertChar(seq1
[i
])]++;
43 f12pos
[(i
%3)*4+convertChar(seq2
[i
])]++;
45 for(i
=0; i
<CODONFREQ
; i
++)
46 f12pos
[i
]/=(seq1
.length()+seq2
.length())/3;
48 //Get 64 amino acid probability
49 for(i
=0; i
<CODON
; i
++) {
50 pi
[i
] = f12pos
[i
/16] * f12pos
[4+(i
%16)/4] * f12pos
[8+i
%4];
51 if (getAminoAcid(i
)=='!') {
56 //Scale the sum of pi[] to 1
57 for(i
=0; i
<CODON
; i
++)
60 if (fabs(1-sumArray(pi
,CODON
))>1e-6)
61 cout
<<"Warning: error in get codon freqency."<<endl
;
63 for(i
=0,npi0
=0; i
<CODON
; i
++)
65 pi_sqrt
[npi0
++]=sqrt(pi
[i
]);
72 /* Estimate kappa using the fourfold degenerate sites at third codon positions and nondegenerate sites */
73 int YN00::GetKappa(const string seq1
, const string seq2
) {
75 int i
,j
,k
,h
,pos
,c
[2],aa
[2],b
[2][3],nondeg
,fourdeg
,by
[3]={16,4,1};
76 double ka
[2], F
[2][XSIZE
],S
[2],wk
[2], T
,V
, pi4
[4];
77 double kdefault
=2, nullValue
=NULL
, t
;
83 for(h
=0; h
<seq1
.length(); h
+=3) {
85 //c[]: amino acid(0--63)
86 c
[0]=getID(seq1
.substr(h
,3));
87 c
[1]=getID(seq2
.substr(h
,3));
89 aa
[0]=getAminoAcid(c
[0]);
90 aa
[1]=getAminoAcid(c
[1]);
93 b
[0][j
] = convertChar(seq1
[h
+j
]);
94 b
[1][j
] = convertChar(seq2
[h
+j
]);
97 //Find non-degenerate sites
98 for(pos
=0; pos
<3; pos
++) {
99 for(k
=0,nondeg
=0; k
<2; k
++) {
102 if (getAminoAcid(c
[k
]+(i
-b
[k
][pos
])*by
[pos
])==aa
[k
])
110 F
[0][b
[0][pos
]*4+b
[1][pos
]]+=.5;
111 F
[0][b
[1][pos
]*4+b
[0][pos
]]+=.5;
115 //Find 4-fold degenerate sites at 3rd position
116 for(k
=0,fourdeg
=0;k
<2;k
++) {
117 for(j
=0,i
=c
[k
]-b
[k
][2]; j
<4; j
++)
118 if(j
!=b
[k
][2] && getAminoAcid(i
+j
)!=aa
[k
])
120 if(aa
[0]==aa
[1] && j
==4)
125 F
[1][b
[0][2]*4+b
[1][2]]+=.5;
126 F
[1][b
[1][2]*4+b
[0][2]]+=.5;
131 for(k
=0; k
<2; k
++) { /* two kinds of sites */
133 S
[k
] = sumArray(F
[k
],16);
142 T
= (F
[k
][0*4+1]+F
[k
][2*4+3])*2;
144 V
= 1 - T
- (F
[k
][0*4+0]+F
[k
][1*4+1]+F
[k
][2*4+2]+F
[k
][3*4+3]);
146 //pi[]: the sum probabilty of T, C, A, G, respectively
148 pi4
[j
] = sumArray(F
[k
]+j
*4,4);
152 DistanceF84(S
[k
],T
,V
,pi4
, ka
[k
], t
, nullValue
);
153 wk
[k
]=(ka
[k
]>0?S
[k
]:0);
160 kappa
=(ka
[0]*wk
[0]+ka
[1]*wk
[1])/(wk
[0]+wk
[1]);
163 KAPPA
[0] = KAPPA
[1] = kappa
;
169 /* Correct for multiple substitutions */
170 int YN00::DistanceF84(double n
, double P
, double Q
, double pi4
[], double &k_HKY
, double &t
, double &SEt
) {
172 int failF84
=0,failK80
=0,failJC69
=0;
173 double tc
,ag
, Y
,R
, a
=0,b
=0, A
,B
,C
, k_F84
;
174 double Qsmall
=min(1e-10,0.1/n
), maxkappa
=2,maxt
=99;
189 if (P
<-1e-10 || Q
<-1e-10 || fabs(Y
+R
-1)>1e-8) {
196 else if (Y
<=0 || R
<=0 || (tc
<=0 && ag
<=0))
199 A
=tc
/Y
+ag
/R
; B
=tc
+ag
; C
=Y
*R
;
200 a
=(2*B
+2*(tc
*R
/Y
+ag
*Y
/R
)*(1-Q
/(2*C
)) - P
) / (2*A
);
207 a
=-.5*log(a
); b
=-.5*log(b
);
212 t
= 4*b
*(tc
*(1+ k_F84
/Y
)+ag
*(1+ k_F84
/R
)+C
);
213 k_HKY
= (B
+ (tc
/Y
+ag
/R
)* k_F84
)/B
; /* k_F84=>k_HKY85 */
216 a
= A
*C
/(A
*C
-C
*P
/2-(A
-B
)*Q
/2);
217 b
= A
*(A
-B
)/(A
*C
-C
*P
/2-(A
-B
)*Q
/2) - (A
-B
-C
)/(C
-Q
/2);
218 SEt
= sqrt((a
*a
*P
+b
*b
*Q
-square(a
*P
+b
*Q
))/n
);
223 if(failF84
&& !failK80
) { /* try K80 */
228 a
=-log(a
); b
=-log(b
);
232 k_HKY
=(.5*a
-.25*b
)/(.25*b
);
236 a
=1/(1-2*P
-Q
); b
=(a
+1/(1-2*Q
))/2;
237 SEt
= sqrt((a
*a
*P
+b
*b
*Q
-square(a
*P
+b
*Q
))/n
);
242 if(failK80
) {/* try JC69 */
247 t
= -.75*log(1-P
*4/3.);
252 SEt
= sqrt(9*P
*(1-P
)/n
) / (3-4*P
);
259 return(failF84
+ failK80
+ failJC69
);
262 int YN00::DistanceYN00(const string seq1
, const string seq2
, double &dS
,double &dN
, double &SEKs
, double &SEKa
) {
264 int j
,k
,ir
,nround
=10, status
=0;
265 double fbS
[4], fbN
[4], fbSt
[4], fbNt
[4], St
, Nt
, Sdts
, Sdtv
, Ndts
, Ndtv
, k_HKY
;
266 double w0
=0, dS0
=0, dN0
=0, accu
=5e-4, minomega
=1e-5, maxomega
=99;
267 double PMatrix
[CODON
*CODON
];
276 //Count sites of sequence 1
278 CountSites(seq1
, St
, Nt
, fbSt
, fbNt
);
285 //Count sites of sequence 2
286 CountSites(seq2
, St
, Nt
, fbSt
, fbNt
);
296 if (omega
<0.01 || omega
>5)
300 for (ir
=0; ir
<(iteration
?nround
:1); ir
++) { /* iteration */
302 GetPMatCodon(PMatrix
,kappa
,omega
);
304 for(j
=0; j
<CODON
*CODON
; j
++) {
308 CountDiffs(seq1
,seq2
, Sdts
, Sdtv
, Ndts
, Ndtv
, PMatrix
);
314 DistanceF84(S
, Sdts
/S
, Sdtv
/S
, fbS
, k_HKY
, dS
, SEKs
);
316 DistanceF84(N
, Ndts
/N
, Ndtv
/N
, fbN
, k_HKY
, dN
, SEKa
);
323 omega
= max(minomega
, dN
/dS
);
326 t
= dS
* 3 * S
/(S
+ N
) + dN
* 3 * N
/(S
+ N
);
328 if ( fabs(dS
-dS0
)<accu
&& fabs(dN
-dN0
)<accu
&& fabs(omega
-w0
)<accu
)
335 } //end of for(ir) */
343 //Count differences between two compared codons
344 int YN00::CountDiffs(const string seq1
, const string seq2
, double &Sdts
,double &Sdtv
,double &Ndts
, double &Ndtv
,double PMatrix
[]) {
345 int h
,i1
,i2
,i
,k
, transi
, c
[2],ct
[2], by
[3]={16,4,1};
347 int dmark
[3], step
[3], b
[2][3], bt1
[3], bt2
[3];
348 int ndiff
, npath
, nstop
, stspath
[6],stvpath
[6],ntspath
[6],ntvpath
[6];
349 double sts
,stv
,nts
,ntv
; /* syn ts & tv, nonsyn ts & tv for 2 codons */
350 double ppath
[6], sump
,p
;
353 for (h
=0,Sdts
=Sdtv
=Ndts
=Ndtv
=0; h
<seq1
.length(); h
+=3) {
355 c
[0]=getID(seq1
.substr(h
,3));
356 c
[1]=getID(seq2
.substr(h
,3));
365 aa
[i
]=getAminoAcid(c
[i
]);
368 //ndiff: differences of two codons
371 //dmark[]: position of different codon
374 if (b
[0][k
]!=b
[1][k
])
382 npath
=(ndiff
==2)?2:6;
385 transi
=b
[0][dmark
[0]]+b
[1][dmark
[0]];
386 transi
=(transi
==1 || transi
==5);
400 else { /* ndiff=2 or 3 */
403 for(k
=0; k
<npath
; k
++) {
406 for(i1
=0; i1
<3; i1
++)
415 if (step
[0]<=step
[1])
417 step
[2]=3-step
[0]-step
[1];
418 }//end of set the step[]
420 for(i1
=0; i1
<3; i1
++)
421 bt1
[i1
]=bt2
[i1
]=b
[0][i1
];
423 stspath
[k
]=stvpath
[k
]=ntspath
[k
]=ntvpath
[k
]=0;
425 //ppath[]: probabilty of each path
426 for (i1
=0,ppath
[k
]=1; i1
<ndiff
; i1
++) {
427 bt2
[step
[i1
]] = b
[1][step
[i1
]];
429 //ct[]: mutated codon's ID(0--63)
430 for (i2
=0,ct
[0]=ct
[1]=0; i2
<3; i2
++) {
431 ct
[0]+=bt1
[i2
]*by
[i2
];
432 ct
[1]+=bt2
[i2
]*by
[i2
];
434 //ppath[k]: probabilty of path k
435 ppath
[k
]*=PMatrix
[ct
[0]*CODON
+ct
[1]];
436 for(i2
=0; i2
<2; i2
++)
437 aa
[i2
]=getAminoAcid(ct
[i2
]);
445 transi
=b
[0][step
[i1
]]+b
[1][step
[i1
]];
446 transi
=(transi
==1 || transi
==5); /* transition? */
448 //ts & tr when syn & nonsyn in path k
462 for(i2
=0; i2
<3; i2
++)
467 if (npath
==nstop
) { /* all paths through stop codons */
478 //sum probabilty of all path
479 sump
=sumArray(ppath
,npath
);
481 for(k
=0;k
<npath
;k
++) { //p: the probabilty of path k
483 sts
+=stspath
[k
]*p
; stv
+=stvpath
[k
]*p
;
484 nts
+=ntspath
[k
]*p
; ntv
+=ntvpath
[k
]*p
;
500 /* Calculate transition probability matrix(64*64) */
501 int YN00::GetPMatCodon(double PMatrix
[], double kappa
, double omega
)
503 /* Get PMat=exp(Q*t) for weighting pathways
505 int i
,j
,k
, ndiff
,pos
=0,from
[3],to
[3];
508 double U
[CODON
*CODON
], V
[CODON
*CODON
], Root
[CODON
*CODON
];
510 initArray(PMatrix
, CODON
*CODON
);
511 initArray(U
, CODON
*CODON
);
512 initArray(V
, CODON
*CODON
);
513 initArray(Root
, CODON
*CODON
);
515 for(i
=0; i
<CODON
; i
++) {
519 from
[0]=i
/16; from
[1]=(i
/4)%4; from
[2]=i
%4;
521 to
[0]=j
/16; to
[1]=(j
/4)%4; to
[2]=j
%4;
522 //amino acid of 'from' and 'to'
523 c
[0]=getAminoAcid(i
);
524 c
[1]=getAminoAcid(j
);
526 if (c
[0]=='!' || c
[1]=='!')
529 //whether two codons only have one difference
530 for (k
=0,ndiff
=0; k
<3; k
++) {
531 if (from
[k
]!=to
[k
]) {
537 //only have one difference
538 PMatrix
[i
*CODON
+j
]=1;
540 if ((from
[pos
]+to
[pos
]-1)*(from
[pos
]+to
[pos
]-5)==0)
541 PMatrix
[i
*CODON
+j
]*=kappa
;
545 PMatrix
[i
*CODON
+j
]*=omega
;
547 //diagonal element is equal
548 PMatrix
[j
*CODON
+i
]=PMatrix
[i
*CODON
+j
];
553 //PMatrix[](*Q): transition probability matrix
554 for(i
=0; i
<CODON
; i
++)
555 for(j
=0; j
<CODON
; j
++)
556 PMatrix
[i
*CODON
+j
]*=pi
[j
];
558 //scale the sum of PMat[][j](j=0-63) to zero
559 for (i
=0,mr
=0; i
<CODON
; i
++) {
560 PMatrix
[i
*CODON
+i
]=-sumArray(PMatrix
+i
*CODON
,CODON
);
561 //The sum of transition probability of main diagnoal elements
562 mr
-=pi
[i
]*PMatrix
[i
*CODON
+i
];
565 //calculate exp(PMatrix*t)
566 eigenQREV(PMatrix
, pi
, pi_sqrt
, CODON
, npi0
, Root
, U
, V
);
567 for(i
=0; i
<CODON
; i
++)
569 PMatUVRoot(PMatrix
, t
, CODON
, U
, V
, Root
);
576 /* P(t) = U * exp{Root*t} * V */
577 int YN00::PMatUVRoot (double P
[], double t
, int n
, double U
[], double V
[], double Root
[]) {
580 double expt
, uexpt
, *pP
;
584 for (k
=0,initArray(P
,n
*n
); k
<n
; k
++)
585 for (i
=0,pP
=P
,expt
=exp(t
*Root
[k
]); i
<n
; i
++)
586 for (j
=0,uexpt
=U
[i
*n
+k
]*expt
; j
<n
; j
++)
587 *pP
++ += uexpt
*V
[k
*n
+j
];
597 /* Count the synonymous and nonsynonymous sites of two sequences */
598 int YN00::CountSites(const string seq
, double &Stot
, double &Ntot
,double fbS
[],double fbN
[]) {
599 int h
,i
,j
,k
, c
[2],aa
[2], b
[3], by
[3]={16,4,1};
606 for (h
=0; h
<seq
.length(); h
+=3) {
608 //Get codon id and amino acid
609 c
[0]=getID(seq
.substr(h
,3));
610 aa
[0]=getAminoAcid(c
[0]);
612 b
[i
]=convertChar(seq
[h
+i
]);
615 for (j
=0,S
=N
=0; j
<3; j
++) {
616 for(k
=0; k
<4; k
++) { /* b[j] changes to k */
619 //c[0] change at position j
620 c
[1] = c
[0]+(k
-b
[j
])*by
[j
];
621 aa
[1] = getAminoAcid(c
[1]);
627 if (k
+b
[j
]==1 || k
+b
[j
]==5) //transition
630 if (aa
[0]==aa
[1]) { //synonymous
632 fbS
[b
[j
]]+=r
; //syn probability of A,C,G,T
634 else { //nonsynonymous
636 fbN
[b
[j
]]+=r
; //nonsyn probability of A,C,G,T
644 //Scale Stot+Ntot to seq.length()
645 r
=seq
.length()/(Stot
+Ntot
);
649 //get probablity of syn of four nul.
654 //get probablity of nonsyn of four nul.
663 string
YN00::Run(string seq1
, string seq2
) {
670 getFreqency(seq1
, seq2
);
671 GetKappa(seq1
, seq2
);
672 DistanceYN00(seq1
, seq2
, Ks
, Ka
, SEKs
, SEKa
);
674 t
= (S
*Ks
+N
*Ka
)/(S
+N
);
676 return parseOutput();
681 //The following functions are used to calculate PMatrix by Taylor.
683 int YN00::eigenQREV (double Q
[], double pi
[], double pi_sqrt
[], int n
, int npi0
,
684 double Root
[], double U
[], double V
[])
687 This finds the eigen solution of the rate matrix Q for a time-reversible
688 Markov process, using the algorithm for a real symmetric matrix.
689 Rate matrix Q = S * diag{pi} = U * diag{Root} * V,
690 where S is symmetrical, all elements of pi are positive, and U*V = I.
691 pi_sqrt[n-npi0] has to be calculated before calling this routine.
693 [U 0] [Q_0 0] [U^-1 0] [Root 0]
694 [0 I] [0 0] [0 I] = [0 0]
696 Ziheng Yang, 25 December 2001 (ref is CME/eigenQ.pdf)
698 int i
,j
, inew
,jnew
, nnew
=n
-npi0
, status
;
700 //npi0 is the number of stop codons in selected genetic table
702 if(npi0
==0) { //seldom occur
706 for(j
=0,U
[i
*n
+i
] = Q
[i
*n
+i
]; j
<i
; j
++)
707 U
[i
*n
+j
] = U
[j
*n
+i
] = (Q
[i
*n
+j
] * pi_sqrt
[i
]/pi_sqrt
[j
]);
711 status
=eigenRealSym(U
, n
, Root
, V
);
715 V
[i
*n
+j
] = U
[j
*n
+i
] * pi_sqrt
[j
];
719 U
[i
*n
+j
] /= pi_sqrt
[i
];
722 for(i
=0,inew
=0; i
<n
; i
++) {
724 for(j
=0,jnew
=0; j
<i
; j
++)
726 U
[inew
*nnew
+jnew
] = U
[jnew
*nnew
+inew
] = Q
[i
*n
+j
] * pi_sqrt
[inew
]/pi_sqrt
[jnew
];
729 U
[inew
*nnew
+inew
] = Q
[i
*n
+i
];
733 status
=eigenRealSym(U
, nnew
, Root
, V
);
735 for(i
=n
-1,inew
=nnew
-1; i
>=0; i
--) /* construct Root */
736 Root
[i
] = (pi
[i
] ? Root
[inew
--] : 0);
738 for(i
=n
-1,inew
=nnew
-1; i
>=0; i
--) { /* construct V */
740 for(j
=n
-1,jnew
=nnew
-1; j
>=0; j
--)
742 V
[i
*n
+j
] = U
[jnew
*nnew
+inew
]*pi_sqrt
[jnew
];
754 for(i
=n
-1,inew
=nnew
-1; i
>=0; i
--) { /* construct U */
756 for(j
=n
-1,jnew
=nnew
-1;j
>=0;j
--)
758 U
[i
*n
+j
] = U
[inew
*nnew
+jnew
]/pi_sqrt
[inew
];
774 void YN00::HouseholderRealSym(double a
[], int n
, double d
[], double e
[]) {
775 /* This uses HouseholderRealSym transformation to reduce a real symmetrical matrix
776 a[n*n] into a tridiagonal matrix represented by d and e.
777 d[] is the diagonal (eigends), and e[] the off-diagonal.
780 double scale
,hh
,h
,g
,f
;
782 for (i
=n
-1;i
>=1;i
--) {
787 scale
+= fabs(a
[i
*n
+k
]);
793 h
+= a
[i
*n
+k
]*a
[i
*n
+k
];
796 g
=(f
>= 0 ? -sqrt(h
) : sqrt(h
));
805 g
+= a
[j
*n
+k
]*a
[i
*n
+k
];
807 g
+= a
[k
*n
+j
]*a
[i
*n
+k
];
816 a
[j
*n
+k
] -= (f
*e
[k
]+g
*a
[i
*n
+k
]);
826 /* Get eigenvectors */
833 g
+= a
[i
*n
+k
]*a
[k
*n
+j
];
835 a
[k
*n
+j
] -= g
*a
[k
*n
+i
];
840 for (j
=0;j
<=m
;j
++) a
[j
*n
+i
]=a
[i
*n
+j
]=0;
846 int YN00::EigenTridagQLImplicit(double d
[], double e
[], int n
, double z
[])
848 /* This finds the eigen solution of a tridiagonal matrix represented by d and e.
849 d[] is the diagonal (eigenvalues), e[] is the off-diagonal
850 z[n*n]: as input should have the identity matrix to get the eigen solution of the
851 tridiagonal matrix, or the output from HouseholderRealSym() to get the
852 eigen solution to the original real symmetric matrix.
853 z[n*n]: has the orthogonal matrix as output
855 Adapted from routine tqli in Numerical Recipes in C, with reference to
857 Ziheng Yang, May 2001
859 int m
,j
,iter
,niter
=30, status
=0, i
,k
;
860 double s
,r
,p
,g
,f
,dd
,c
,b
, aa
,bb
;
862 for (i
=1;i
<n
;i
++) e
[i
-1]=e
[i
]; e
[n
-1]=0;
866 for (m
=j
;m
<n
-1;m
++) {
867 dd
=fabs(d
[m
])+fabs(d
[m
+1]);
868 if (fabs(e
[m
])+dd
== dd
) break; /* ??? */
871 if (iter
++ == niter
) {
875 g
=(d
[j
+1]-d
[j
])/(2*e
[j
]);
879 if((aa
=fabs(g
))>1) r
=aa
*sqrt(1+1/(g
*g
));
882 g
=d
[m
]-d
[j
]+e
[j
]/(g
+SIGN(r
,g
));
885 for (i
=m
-1;i
>=j
;i
--) {
890 aa
=fabs(f
); bb
=fabs(g
);
891 if(aa
>bb
) { bb
/=aa
; r
=aa
*sqrt(1+bb
*bb
); }
893 else { aa
/=bb
; r
=bb
*sqrt(1+aa
*aa
); }
909 z
[k
*n
+i
+1]=s
*z
[k
*n
+i
]+c
*f
;
910 z
[k
*n
+i
]=c
*z
[k
*n
+i
]-s
*f
;
913 if (r
== 0 && i
>= j
) continue;
914 d
[j
]-=p
; e
[j
]=g
; e
[m
]=0;
921 void YN00::EigenSort(double d
[], double U
[], int n
)
923 /* this sorts the eigen values d[] and rearrange the (right) eigen vectors U[]
928 for (i
=0;i
<n
-1;i
++) {
931 if (d
[j
] >= p
) p
=d
[k
=j
];
944 int YN00::eigenRealSym(double A
[], int n
, double Root
[], double work
[])
946 /* This finds the eigen solution of a real symmetrical matrix A[n*n]. In return,
947 A has the right vectors and Root has the eigenvalues. work[n] is the working space.
948 The matrix is first reduced to a tridiagonal matrix using HouseholderRealSym(),
949 and then using the QL algorithm with implicit shifts.
951 Adapted from routine tqli in Numerical Recipes in C, with reference to LAPACK
952 Ziheng Yang, 23 May 2001
955 HouseholderRealSym(A
, n
, Root
, work
);
956 status
=EigenTridagQLImplicit(Root
, work
, n
, A
);
957 EigenSort(Root
, A
, n
);