1 /*********************************************************
2 * Copyright (C) 2005, BGI of Chinese Academy of Sciences
6 * Abstract: Definition of Modified YN00 (MYN) class.
9 * Author: Zhang Zhang (zhanghzhang@genomics.org.cn)
13 Zhang Zhang, Jun Li, Jun Yu. (2006) Computing Ka and Ks
14 with a consideration of unequal transitional substitutions.
15 BMC Evolutionary Biology, 6:44.
16 **********************************************************/
24 /* Get the two kappas between purines and between pyrimidines */
25 int MYN::GetKappa(const string seq1
, const string seq2
) {
27 int i
,j
,k
,h
,pos
,c
[2],aa
[2],b
[2][3],nondeg
,fourdeg
,by
[3]={16,4,1};
28 double kappatc_TN
[2], kappaag_TN
[2], kappa_TN
[2];
29 double F
[2][XSIZE
], S
[2], wk
[2], pi4
[4];
30 double T1
, T2
, V
;//proportions of transitional differences between purines and between
31 double kdefault
=2, nullValue
=NULL
;
37 for(h
=0; h
<seq1
.length(); h
+=3) {
39 //c[]: amino acid(0--63)
40 c
[0]=getID(seq1
.substr(h
,3));
41 c
[1]=getID(seq2
.substr(h
,3));
43 aa
[0]=getAminoAcid(c
[0]);
44 aa
[1]=getAminoAcid(c
[1]);
47 b
[0][j
] = convertChar(seq1
[h
+j
]);
48 b
[1][j
] = convertChar(seq2
[h
+j
]);
51 //Find non-degenerate sites
52 for(pos
=0; pos
<3; pos
++) {
53 for(k
=0,nondeg
=0; k
<2; k
++) {
56 if (getAminoAcid(c
[k
]+(i
-b
[k
][pos
])*by
[pos
])==aa
[k
])
64 F
[0][b
[0][pos
]*4+b
[1][pos
]]+=.5;
65 F
[0][b
[1][pos
]*4+b
[0][pos
]]+=.5;
69 //Find 4-fold degenerate sites at 3rd position
70 for(k
=0,fourdeg
=0;k
<2;k
++) {
71 for(j
=0,i
=c
[k
]-b
[k
][2]; j
<4; j
++)
72 if(j
!=b
[k
][2] && getAminoAcid(i
+j
)!=aa
[k
])
74 if(aa
[0]==aa
[1] && j
==4)
79 F
[1][b
[0][2]*4+b
[1][2]]+=.5;
80 F
[1][b
[1][2]*4+b
[0][2]]+=.5;
86 for(k
=0; k
<2; k
++) { /* two kinds of sites */
88 S
[k
] = sumArray(F
[k
],16);
97 //Transitions between purines
98 T1
= 2*F
[k
][2*DNASIZE
+3];
99 //Transitions between pyrimidines
100 T2
= 2*F
[k
][0*DNASIZE
+1];
102 V
= 1 - T1
- T2
- (F
[k
][0*DNASIZE
+0]+F
[k
][1*DNASIZE
+1]+F
[k
][2*DNASIZE
+2]+F
[k
][3*DNASIZE
+3]);
104 //pi[]: the sum probabilty of T, C, A, G, respectively
106 pi4
[j
] = sumArray(F
[k
]+j
*4,4);
109 CorrectKappaTN93(S
[k
], T1
, T2
, V
, pi4
, kappatc_TN
[k
], kappaag_TN
[k
]);
110 wk
[k
]=((kappatc_TN
[k
]>0 && kappaag_TN
[k
]>0)?S
[k
]:0);
112 //R = (¦ÐT¦ÐC¦Ê1 + ¦ÐA¦ÐG¦Ê2)/(¦ÐY¦ÐR), kappa = 2R in PAML's DOC
113 kappa_TN
[k
] = 2*(kappatc_TN
[k
]*pi4
[0]*pi4
[1] + kappaag_TN
[k
]*pi4
[2]*pi4
[3])/((pi4
[0]+pi4
[1])*(pi4
[2]+pi4
[3]));
118 kappatc
= kappaag
= kappa
= kdefault
;
121 kappatc
= (kappatc_TN
[0]*wk
[0] + kappatc_TN
[1]*wk
[1])/(wk
[0] + wk
[1]);
122 kappaag
= (kappaag_TN
[0]*wk
[0] + kappaag_TN
[1]*wk
[1])/(wk
[0] + wk
[1]);
123 kappa
= (kappa_TN
[0]*wk
[0] + kappa_TN
[1]*wk
[1])/(wk
[0] + wk
[1]);
133 /* Calculate transition probability matrix(64*64) */
134 int MYN::GetPMatCodon(double PMatrix
[], double kappa
, double omega
) {
136 int i
,j
,k
, ndiff
,pos
=0,from
[3],to
[3];
139 double U
[CODON
*CODON
], V
[CODON
*CODON
], Root
[CODON
*CODON
];
141 initArray(PMatrix
, CODON
*CODON
);
142 initArray(U
, CODON
*CODON
);
143 initArray(V
, CODON
*CODON
);
144 initArray(Root
, CODON
*CODON
);
146 for(i
=0; i
<CODON
; i
++) {
150 from
[0]=i
/16; from
[1]=(i
/4)%4; from
[2]=i
%4;
152 to
[0]=j
/16; to
[1]=(j
/4)%4; to
[2]=j
%4;
153 //amino acid of 'from' and 'to'
154 c
[0]=getAminoAcid(i
);
155 c
[1]=getAminoAcid(j
);
157 if (c
[0]=='!' || c
[1]=='!')
160 //whether two codons only have one difference
161 for (k
=0,ndiff
=0; k
<3; k
++) {
162 if (from
[k
]!=to
[k
]) {
168 //only have one difference
169 PMatrix
[i
*CODON
+j
]=1;
171 if ((from
[pos
]+to
[pos
]-1)*(from
[pos
]+to
[pos
]-5)==0) {
172 if (from
[pos
]+to
[pos
]==1)
173 PMatrix
[i
*CODON
+j
]*=kappatc
;//T<->C
175 PMatrix
[i
*CODON
+j
]*=kappaag
;//A<->G
180 PMatrix
[i
*CODON
+j
]*=omega
;
182 //diagonal element is equal
183 PMatrix
[j
*CODON
+i
]=PMatrix
[i
*CODON
+j
];
188 //PMatrix[](*Q): transition probability matrix
189 for(i
=0; i
<CODON
; i
++)
190 for(j
=0; j
<CODON
; j
++)
191 PMatrix
[i
*CODON
+j
]*=pi
[j
];
193 //scale the sum of PMat[][j](j=0-63) to zero
194 for (i
=0,mr
=0; i
<CODON
; i
++) {
195 PMatrix
[i
*CODON
+i
] =- sumArray(PMatrix
+i
*CODON
,CODON
);
196 //The sum of transition probability of main diagnoal elements
197 mr
-= pi
[i
]*PMatrix
[i
*CODON
+i
];
200 //calculate exp(PMatrix*t)
201 eigenQREV(PMatrix
, pi
, pi_sqrt
, CODON
, npi0
, Root
, U
, V
);
202 for(i
=0; i
<CODON
; i
++)
204 PMatUVRoot(PMatrix
,t
,64,U
,V
,Root
);
210 int MYN::CorrectKappaTN93(double n
, double P1
, double P2
, double Q
, double pi4
[], double &kappatc_TN93
, double &kappaag_TN93
) {
213 double tc
,ag
, Y
,R
, a1
=0, a2
=0,b
=0, A
,B
,C
;
214 double Qsmall
=min(1e-10,0.1/n
), default_kappa
= 2, maxkappa
= 99;
216 kappatc_TN93
= kappaag_TN93
= -1;
227 if ((P1
+P2
+Q
)>1 || (P1
+P2
)<-1e-10 || Q
<-1e-10 || fabs(Y
+R
-1)>1e-8) {
233 else if (Y
<=0 || R
<=0 || (tc
<=0 && ag
<=0))
235 else { //TN93 for multiple substitutions
236 A
=tc
/Y
+ag
/R
; B
=tc
+ag
; C
=Y
*R
;
237 a1
= 1 - R
*P1
/(2*ag
) - Q
/(2*R
);
238 a2
= 1 - Y
*P2
/(2*tc
) - Q
/(2*Y
);
240 if(a1
<0 || a2
<0 || b
<0) {
248 kappaag_TN93
= (Y
*b
- a1
)/(-R
*b
);
249 kappatc_TN93
= (R
*b
- a2
)/(-Y
*b
);
253 if(failTN93
) { //Fail to correct kappa
254 kappatc_TN93
= kappaag_TN93
= default_kappa
;
260 /* Correct Ka and Ks */
261 int MYN::CorrectKaksTN93(double n
, double P1
, double P2
, double Q
, double pi4
[], double &kaks
, double &SEkaks
) {
264 double tc
,ag
, Y
,R
, a1
, a2
, b
, A
, B
, C
;
267 a1
= a2
= b
= failTN93
= 0;
275 if (P1
+P2
+Q
>1 || fabs(Y
+R
-1)>Qsmall
|| Y
<=0 || R
<=0 || (tc
<=0 && ag
<=0)) {
278 else { //TN93 for multiple substitutions
279 A
=tc
/Y
+ag
/R
; B
=tc
+ag
; C
=Y
*R
;
280 a1
= 1 - R
*P1
/(2*ag
) - Q
/(2*R
);
281 a2
= 1 - Y
*P2
/(2*tc
) - Q
/(2*Y
);
283 if(a1
<0 || a2
<0 || b
<0) {
291 kaks
= (-2*ag
*a1
/R
) + (-2*tc
*a2
/Y
) + (-2*(C
-ag
*Y
/R
-tc
*R
/Y
)*b
);
293 double cc1
= 2*ag
*R
/ (2*ag
*R
-R
*R
*P1
-ag
*Q
);
294 double cc2
= 2*tc
*Y
/ (2*tc
*Y
-Y
*Y
*P2
-tc
*Q
);
295 double cc3
= 2*ag
*ag
/ (R
*(2*ag
*R
-R
*R
*P1
-ag
*Q
));
296 cc3
+= 2*tc
*tc
/ (Y
*(2*tc
*Y
-Y
*Y
*P2
-tc
*Q
));
297 cc3
+= (R
*R
*(Y
*Y
-2*tc
) + Y
*Y
*(R
*R
-2*ag
)) / (2*R
*R
*Y
*Y
-R
*Y
*Q
);
298 SEkaks
= (square(cc1
)*P1
+ square(cc2
)*P2
+ square(cc3
)*Q
- square(cc1
*P1
+cc2
*P2
+cc3
*Q
))/n
;
302 if (failTN93
==1) { //Use YN00's correction for Ka, Ks
303 DistanceF84(n
, P1
+P2
, Q
, pi4
, Qsmall
, kaks
, SEkaks
);
309 /* Count differences, considering different transitional pathways between purines and between pyrimidines */
310 int MYN::CountDiffs(const string seq1
, const string seq2
, double &Sdts1
, double &Sdts2
, double &Sdtv
,double &Ndts1
, double &Ndts2
, double &Ndtv
,double PMatrix
[]) {
311 int h
,i1
,i2
,i
,k
, transi
, c
[2],ct
[2], by
[3]={16,4,1};
313 int dmark
[3], step
[3], b
[2][3], bt1
[3], bt2
[3];
314 int ndiff
, npath
, nstop
, sts1path
[6], sts2path
[6], stvpath
[6],nts1path
[6], nts2path
[6], ntvpath
[6];
315 double sts1
, sts2
, stv
, nts1
, nts2
, ntv
; /* syn ts & tv, nonsyn ts & tv for 2 codons */
316 double ppath
[6], sump
,p
;
318 Sdts1
=Sdts2
=Sdtv
=Ndts1
=Ndts2
=Ndtv
=snp
=0;
319 for (h
=0; h
<seq1
.length(); h
+=3) {
321 c
[0]=getID(seq1
.substr(h
,3));
322 c
[1]=getID(seq2
.substr(h
,3));
331 aa
[i
]=getAminoAcid(c
[i
]);
334 //ndiff: differences of two codons
336 sts1
=sts2
=stv
=nts1
=nts2
=ntv
=0;
337 //dmark[]: position of different codon
340 if (b
[0][k
]!=b
[1][k
])
348 npath
=(ndiff
==2)?2:6;
351 transi
=b
[0][dmark
[0]]+b
[1][dmark
[0]];
352 //transi=(transi==1 || transi==5);
370 else { /* ndiff=2 or 3 */
373 for(k
=0; k
<npath
; k
++) {
376 for(i1
=0; i1
<3; i1
++)
385 if (step
[0]<=step
[1])
387 step
[2]=3-step
[0]-step
[1];
388 }//end of set the step[]
390 for(i1
=0; i1
<3; i1
++)
391 bt1
[i1
]=bt2
[i1
]=b
[0][i1
];
393 sts1path
[k
]=sts2path
[k
]=stvpath
[k
]=nts1path
[k
]=nts2path
[k
]=ntvpath
[k
]=0;
395 //ppath[]: probabilty of each path
396 for (i1
=0,ppath
[k
]=1; i1
<ndiff
; i1
++) {
397 bt2
[step
[i1
]] = b
[1][step
[i1
]];
399 //ct[]: mutated codon's ID(0--63)
400 for (i2
=0,ct
[0]=ct
[1]=0; i2
<3; i2
++) {
401 ct
[0]+=bt1
[i2
]*by
[i2
];
402 ct
[1]+=bt2
[i2
]*by
[i2
];
404 //ppath[k]: probabilty of path k
405 ppath
[k
]*=PMatrix
[ct
[0]*CODON
+ct
[1]];
406 for(i2
=0; i2
<2; i2
++)
407 aa
[i2
]=getAminoAcid(ct
[i2
]);
415 transi
=b
[0][step
[i1
]]+b
[1][step
[i1
]];
417 //ts & tr when syn & nonsyn in path k
435 for(i2
=0; i2
<3; i2
++)
440 if (npath
==nstop
) { /* all paths through stop codons */
453 //sum probabilty of all path
454 sump
=sumArray(ppath
,npath
);
456 for(k
=0;k
<npath
;k
++) { //p: the probabilty of path k
458 sts1
+=sts1path
[k
]*p
; sts2
+=sts2path
[k
]*p
; stv
+=stvpath
[k
]*p
;
459 nts1
+=nts1path
[k
]*p
; nts2
+=nts2path
[k
]*p
; ntv
+=ntvpath
[k
]*p
;
466 Sdts1
+=sts1
; Sdts2
+=sts2
; Sdtv
+=stv
;
467 Ndts1
+=nts1
; Ndts2
+=nts2
; Ndtv
+=ntv
;
473 int MYN::DistanceYN00(const string seq1
, const string seq2
, double &dS
,double &dN
, double &SEdS
, double &SEdN
) {
475 int j
,ir
,nround
=100, status
=1;
476 double fbS
[4], fbN
[4], fbSt
[4], fbNt
[4];
477 double St
, Nt
, Sdts1
, Sdts2
, Sdtv
, Ndts1
, Ndts2
, Ndtv
;
478 double w0
=0, S0
=0, N0
=0, dS0
=0, dN0
=0, accu
=5e-8, minomega
=1e-5, maxomega
=99;
479 double PMatrix
[CODON
*CODON
];
481 //initial values for t and omega(Ka/Ks)
488 //Count sites of sequence 1
489 CountSites(seq1
, St
, Nt
, fbSt
, fbNt
);
497 //Count sites of sequence 2
498 CountSites(seq2
, St
, Nt
, fbSt
, fbNt
);
507 for (ir
=0; ir
<nround
; ir
++) { /* iteration */
509 //Get transition probability matrix from one codon to another
510 GetPMatCodon(PMatrix
,kappa
,omega
);
513 CountDiffs(seq1
, seq2
, Sdts1
, Sdts2
, Sdtv
, Ndts1
, Ndts2
, Ndtv
, PMatrix
);
515 //Synonymous(Sd) and nonsynonymous(Nd) differences
516 Sd
= Sdts1
+ Sdts2
+ Sdtv
;
517 Nd
= Ndts1
+ Ndts2
+ Ndtv
;
532 CorrectKaksTN93(S
, Sdts1
/S
, Sdts2
/S
, Sdtv
/S
, fbS
, dS
, SEdS
);
534 CorrectKaksTN93(N
, Ndts1
/N
, Ndts2
/N
, Ndtv
/N
, fbN
, dN
, SEdN
);
544 omega
= max(minomega
, dN
/dS
);
547 t
= dS
* 3 * S
/(S
+ N
) + dN
* 3 * N
/(S
+ N
);
549 if (fabs(dS
-dS0
)<accu
&& fabs(dN
-dN0
)<accu
&& fabs(omega
-w0
)<accu
)
556 } //end of for(ir) */
565 /* Count the synonymous and nonsynonymous sites of two sequences */
566 int MYN::CountSites(const string seq
, double &Stot
, double &Ntot
,double fbS
[],double fbN
[]) {
567 int h
,i
,j
,k
, c
[2],aa
[2], b
[3], by
[3]={16,4,1};
574 for (h
=0; h
<seq
.length(); h
+=3) {
576 //Get codon id and amino acid
577 c
[0]=getID(seq
.substr(h
,3));
578 aa
[0]=getAminoAcid(c
[0]);
580 b
[i
]=convertChar(seq
[h
+i
]);
583 for (j
=0,S
=N
=0; j
<3; j
++) {
584 for(k
=0; k
<4; k
++) { /* b[j] changes to k */
587 //c[0] change at position j
588 c
[1] = c
[0]+(k
-b
[j
])*by
[j
];
589 aa
[1] = getAminoAcid(c
[1]);
595 if (k
+b
[j
]==1 || k
+b
[j
]==5) {//transition
596 if (k
+b
[j
]==1) r
*=kappatc
;
597 else r
*=kappaag
; //(k+b[j]==5)
600 if (aa
[0]==aa
[1]) { //synonymous
602 fbS
[b
[j
]]+=r
; //syn probability of A,C,G,T
604 else { //nonsynonymous
606 fbN
[b
[j
]]+=r
; //nonsyn probability of A,C,G,T
614 //Scale Stot+Ntot to seq.length()
615 r
=seq
.length()/(Stot
+Ntot
);
619 //get probablity of syn of four nul.
624 //get probablity of nonsyn of four nul.