modified: myjupyterlab.sh
[GalaxyCodeBases.git] / c_cpp / etc / KaKs_Calculator / src / MYN.cpp
blobdffd0e6603fc65520bd1f0c3bee1edcff185671a
1 /*********************************************************
2 * Copyright (C) 2005, BGI of Chinese Academy of Sciences
3 * All rights reserved.
5 * Filename: MYN.cpp
6 * Abstract: Definition of Modified YN00 (MYN) class.
8 * Version: 1.0
9 * Author: Zhang Zhang (zhanghzhang@genomics.org.cn)
10 * Date: Dec.30, 2005
12 References:
13 Zhang Zhang, Jun Li, Jun Yu. (2006) Computing Ka and Ks
14 with a consideration of unequal transitional substitutions.
15 BMC Evolutionary Biology, 6:44.
16 **********************************************************/
18 #include "MYN.h"
20 MYN::MYN() {
21 name = "MYN";
24 /* Get the two kappas between purines and between pyrimidines */
25 int MYN::GetKappa(const string seq1, const string seq2) {
27 int i,j,k,h,pos,c[2],aa[2],b[2][3],nondeg,fourdeg,by[3]={16,4,1};
28 double kappatc_TN[2], kappaag_TN[2], kappa_TN[2];
29 double F[2][XSIZE], S[2], wk[2], pi4[4];
30 double T1, T2, V;//proportions of transitional differences between purines and between
31 double kdefault=2, nullValue=NULL;
33 for(k=0; k<2; k++)
34 initArray(F[k],16);
36 //Get Pi[] of A,C,G,T
37 for(h=0; h<seq1.length(); h+=3) {
39 //c[]: amino acid(0--63)
40 c[0]=getID(seq1.substr(h,3));
41 c[1]=getID(seq2.substr(h,3));
42 //aa[ ]: amino acid
43 aa[0]=getAminoAcid(c[0]);
44 aa[1]=getAminoAcid(c[1]);
45 //b[][]: 0--3
46 for(j=0; j<3; j++) {
47 b[0][j] = convertChar(seq1[h+j]);
48 b[1][j] = convertChar(seq2[h+j]);
51 //Find non-degenerate sites
52 for(pos=0; pos<3; pos++) {
53 for(k=0,nondeg=0; k<2; k++) {
54 for(i=0; i<4; i++) {
55 if(i!=b[k][pos])
56 if (getAminoAcid(c[k]+(i-b[k][pos])*by[pos])==aa[k])
57 break;
59 if (i==4)
60 nondeg++;
62 //F[0][]: 0-fold
63 if(nondeg==2) {
64 F[0][b[0][pos]*4+b[1][pos]]+=.5;
65 F[0][b[1][pos]*4+b[0][pos]]+=.5;
69 //Find 4-fold degenerate sites at 3rd position
70 for(k=0,fourdeg=0;k<2;k++) {
71 for(j=0,i=c[k]-b[k][2]; j<4; j++)
72 if(j!=b[k][2] && getAminoAcid(i+j)!=aa[k])
73 break;
74 if(aa[0]==aa[1] && j==4)
75 fourdeg++;
77 //F[1][]: 4-fold
78 if (fourdeg==2) {
79 F[1][b[0][2]*4+b[1][2]]+=.5;
80 F[1][b[1][2]*4+b[0][2]]+=.5;
83 }//end of for(h)
86 for(k=0; k<2; k++) { /* two kinds of sites */
88 S[k] = sumArray(F[k],16);
89 if(S[k]<=0) {
90 wk[k] = 0;
91 continue;
93 for(j=0; j<16; j++) {
94 F[k][j]/=S[k];
97 //Transitions between purines
98 T1 = 2*F[k][2*DNASIZE+3];
99 //Transitions between pyrimidines
100 T2 = 2*F[k][0*DNASIZE+1];
101 //Tranversions
102 V = 1 - T1 - T2 - (F[k][0*DNASIZE+0]+F[k][1*DNASIZE+1]+F[k][2*DNASIZE+2]+F[k][3*DNASIZE+3]);
104 //pi[]: the sum probabilty of T, C, A, G, respectively
105 for(j=0; j<4; j++) {
106 pi4[j] = sumArray(F[k]+j*4,4);
109 CorrectKappaTN93(S[k], T1, T2, V, pi4, kappatc_TN[k], kappaag_TN[k]);
110 wk[k]=((kappatc_TN[k]>0 && kappaag_TN[k]>0)?S[k]:0);
112 //R = (¦ÐT¦ÐC¦Ê1 + ¦ÐA¦ÐG¦Ê2)/(¦ÐY¦ÐR), kappa = 2R in PAML's DOC
113 kappa_TN[k] = 2*(kappatc_TN[k]*pi4[0]*pi4[1] + kappaag_TN[k]*pi4[2]*pi4[3])/((pi4[0]+pi4[1])*(pi4[2]+pi4[3]));
117 if(wk[0]+wk[1]==0) {
118 kappatc = kappaag = kappa = kdefault;
120 else {
121 kappatc = (kappatc_TN[0]*wk[0] + kappatc_TN[1]*wk[1])/(wk[0] + wk[1]);
122 kappaag = (kappaag_TN[0]*wk[0] + kappaag_TN[1]*wk[1])/(wk[0] + wk[1]);
123 kappa = (kappa_TN[0]*wk[0] + kappa_TN[1]*wk[1])/(wk[0] + wk[1]);
126 KAPPA[0] = kappatc;
127 KAPPA[1] = kappaag;
129 return 0;
133 /* Calculate transition probability matrix(64*64) */
134 int MYN::GetPMatCodon(double PMatrix[], double kappa, double omega) {
136 int i,j,k, ndiff,pos=0,from[3],to[3];
137 double mr;
138 char c[2];
139 double U[CODON*CODON], V[CODON*CODON], Root[CODON*CODON];
141 initArray(PMatrix, CODON*CODON);
142 initArray(U, CODON*CODON);
143 initArray(V, CODON*CODON);
144 initArray(Root, CODON*CODON);
146 for(i=0; i<CODON; i++) {
147 for(j=0; j<i; j++) {
149 //codon 'from'
150 from[0]=i/16; from[1]=(i/4)%4; from[2]=i%4;
151 //codon 'to'
152 to[0]=j/16; to[1]=(j/4)%4; to[2]=j%4;
153 //amino acid of 'from' and 'to'
154 c[0]=getAminoAcid(i);
155 c[1]=getAminoAcid(j);
156 //stop codon
157 if (c[0]=='!' || c[1]=='!')
158 continue;
160 //whether two codons only have one difference
161 for (k=0,ndiff=0; k<3; k++) {
162 if (from[k]!=to[k]) {
163 ndiff++;
164 pos=k;
167 if (ndiff==1) {
168 //only have one difference
169 PMatrix[i*CODON+j]=1;
170 //transition
171 if ((from[pos]+to[pos]-1)*(from[pos]+to[pos]-5)==0) {
172 if (from[pos]+to[pos]==1)
173 PMatrix[i*CODON+j]*=kappatc;//T<->C
174 else
175 PMatrix[i*CODON+j]*=kappaag;//A<->G
178 //nonsynonymous
179 if(c[0]!=c[1])
180 PMatrix[i*CODON+j]*=omega;
182 //diagonal element is equal
183 PMatrix[j*CODON+i]=PMatrix[i*CODON+j];
188 //PMatrix[](*Q): transition probability matrix
189 for(i=0; i<CODON; i++)
190 for(j=0; j<CODON; j++)
191 PMatrix[i*CODON+j]*=pi[j];
193 //scale the sum of PMat[][j](j=0-63) to zero
194 for (i=0,mr=0; i<CODON; i++) {
195 PMatrix[i*CODON+i] =- sumArray(PMatrix+i*CODON,CODON);
196 //The sum of transition probability of main diagnoal elements
197 mr -= pi[i]*PMatrix[i*CODON+i];
200 //calculate exp(PMatrix*t)
201 eigenQREV(PMatrix, pi, pi_sqrt, CODON, npi0, Root, U, V);
202 for(i=0; i<CODON; i++)
203 Root[i]/=mr;
204 PMatUVRoot(PMatrix,t,64,U,V,Root);
206 return 0;
209 /* Correct kappas */
210 int MYN::CorrectKappaTN93(double n, double P1, double P2, double Q, double pi4[], double &kappatc_TN93, double &kappaag_TN93) {
212 int failTN93;
213 double tc,ag, Y,R, a1=0, a2=0,b=0, A,B,C;
214 double Qsmall=min(1e-10,0.1/n), default_kappa = 2, maxkappa = 99;
216 kappatc_TN93 = kappaag_TN93 = -1;
218 failTN93 = 0;
220 Y=pi4[0]+pi4[1];
221 R=pi4[2]+pi4[3];
223 tc=pi4[0]*pi4[1];
224 ag=pi4[2]*pi4[3];
227 if ((P1+P2+Q)>1 || (P1+P2)<-1e-10 || Q<-1e-10 || fabs(Y+R-1)>1e-8) {
228 return 0;
231 if(Q<Qsmall)
232 failTN93 =1;
233 else if (Y<=0 || R<=0 || (tc<=0 && ag<=0))
234 failTN93 = 1;
235 else { //TN93 for multiple substitutions
236 A=tc/Y+ag/R; B=tc+ag; C=Y*R;
237 a1 = 1 - R*P1/(2*ag) - Q/(2*R);
238 a2 = 1 - Y*P2/(2*tc) - Q/(2*Y);
239 b = 1 - Q/(2*C);
240 if(a1<0 || a2<0 || b<0) {
241 failTN93 = 1;
243 else {
244 a1 = log(a1);
245 a2 = log(a2);
246 b = log(b);
247 //Kappa
248 kappaag_TN93 = (Y*b - a1)/(-R*b);
249 kappatc_TN93 = (R*b - a2)/(-Y*b);
253 if(failTN93) { //Fail to correct kappa
254 kappatc_TN93 = kappaag_TN93 = default_kappa;
257 return 1;
260 /* Correct Ka and Ks */
261 int MYN::CorrectKaksTN93(double n, double P1, double P2, double Q, double pi4[], double &kaks, double &SEkaks) {
263 int failTN93;
264 double tc,ag, Y,R, a1, a2, b, A, B, C;
265 double Qsmall=1e-10;
267 a1 = a2 = b = failTN93 = 0;
269 Y=pi4[0]+pi4[1];
270 R=pi4[2]+pi4[3];
272 tc=pi4[0]*pi4[1];
273 ag=pi4[2]*pi4[3];
275 if (P1+P2+Q>1 || fabs(Y+R-1)>Qsmall || Y<=0 || R<=0 || (tc<=0 && ag<=0)) {
276 failTN93 = 1;
278 else { //TN93 for multiple substitutions
279 A=tc/Y+ag/R; B=tc+ag; C=Y*R;
280 a1 = 1 - R*P1/(2*ag) - Q/(2*R);
281 a2 = 1 - Y*P2/(2*tc) - Q/(2*Y);
282 b = 1-Q/(2*C);
283 if(a1<0 || a2<0 || b<0) {
284 failTN93 = 1;
286 else {
287 a1 = log(a1);
288 a2 = log(a2);
289 b = log(b);
290 //Ka or Ks
291 kaks = (-2*ag*a1/R) + (-2*tc*a2/Y) + (-2*(C-ag*Y/R-tc*R/Y)*b);
293 double cc1 = 2*ag*R / (2*ag*R-R*R*P1-ag*Q);
294 double cc2 = 2*tc*Y / (2*tc*Y-Y*Y*P2-tc*Q);
295 double cc3 = 2*ag*ag / (R*(2*ag*R-R*R*P1-ag*Q));
296 cc3 += 2*tc*tc / (Y*(2*tc*Y-Y*Y*P2-tc*Q));
297 cc3 += (R*R*(Y*Y-2*tc) + Y*Y*(R*R-2*ag)) / (2*R*R*Y*Y-R*Y*Q);
298 SEkaks = (square(cc1)*P1 + square(cc2)*P2 + square(cc3)*Q - square(cc1*P1+cc2*P2+cc3*Q))/n;
302 if (failTN93==1) { //Use YN00's correction for Ka, Ks
303 DistanceF84(n, P1+P2, Q, pi4, Qsmall, kaks, SEkaks);
306 return 1;
309 /* Count differences, considering different transitional pathways between purines and between pyrimidines */
310 int MYN::CountDiffs(const string seq1, const string seq2, double &Sdts1, double &Sdts2, double &Sdtv,double &Ndts1, double &Ndts2, double &Ndtv,double PMatrix[]) {
311 int h,i1,i2,i,k, transi, c[2],ct[2], by[3]={16,4,1};
312 char aa[2];
313 int dmark[3], step[3], b[2][3], bt1[3], bt2[3];
314 int ndiff, npath, nstop, sts1path[6], sts2path[6], stvpath[6],nts1path[6], nts2path[6], ntvpath[6];
315 double sts1, sts2, stv, nts1, nts2, ntv; /* syn ts & tv, nonsyn ts & tv for 2 codons */
316 double ppath[6], sump,p;
318 Sdts1=Sdts2=Sdtv=Ndts1=Ndts2=Ndtv=snp=0;
319 for (h=0; h<seq1.length(); h+=3) {
321 c[0]=getID(seq1.substr(h,3));
322 c[1]=getID(seq2.substr(h,3));
323 //Difference?
324 if (c[0]==c[1])
325 continue;
327 for(i=0; i<2; i++) {
328 b[i][0]=c[i]/16;
329 b[i][1]=(c[i]%16)/4;
330 b[i][2]=c[i]%4;
331 aa[i]=getAminoAcid(c[i]);
334 //ndiff: differences of two codons
335 ndiff=0;
336 sts1=sts2=stv=nts1=nts2=ntv=0;
337 //dmark[]: position of different codon
338 for(k=0; k<3; k++) {
339 dmark[k] = -1;
340 if (b[0][k]!=b[1][k])
341 dmark[ndiff++]=k;
344 snp+=ndiff;
346 npath=1;
347 if(ndiff>1)
348 npath=(ndiff==2)?2:6;
350 if (ndiff==1) {
351 transi=b[0][dmark[0]]+b[1][dmark[0]];
352 //transi=(transi==1 || transi==5);
353 if (aa[0]==aa[1]) {
354 if (transi==5)
355 sts1++;
356 else if (transi==1)
357 sts2++;
358 else
359 stv++;
361 else {
362 if (transi==5)
363 nts1++;
364 else if (transi==1)
365 nts2++;
366 else
367 ntv++;
370 else { /* ndiff=2 or 3 */
372 nstop=0;
373 for(k=0; k<npath; k++) {
375 //set the step[]
376 for(i1=0; i1<3; i1++)
377 step[i1]=-1;
378 if (ndiff==2) {
379 step[0]=dmark[k];
380 step[1]=dmark[1-k];
382 else {
383 step[0]=k/2;
384 step[1]=k%2;
385 if (step[0]<=step[1])
386 step[1]++;
387 step[2]=3-step[0]-step[1];
388 }//end of set the step[]
390 for(i1=0; i1<3; i1++)
391 bt1[i1]=bt2[i1]=b[0][i1];
393 sts1path[k]=sts2path[k]=stvpath[k]=nts1path[k]=nts2path[k]=ntvpath[k]=0;
395 //ppath[]: probabilty of each path
396 for (i1=0,ppath[k]=1; i1<ndiff; i1++) {
397 bt2[step[i1]] = b[1][step[i1]];
399 //ct[]: mutated codon's ID(0--63)
400 for (i2=0,ct[0]=ct[1]=0; i2<3; i2++) {
401 ct[0]+=bt1[i2]*by[i2];
402 ct[1]+=bt2[i2]*by[i2];
404 //ppath[k]: probabilty of path k
405 ppath[k]*=PMatrix[ct[0]*CODON+ct[1]];
406 for(i2=0; i2<2; i2++)
407 aa[i2]=getAminoAcid(ct[i2]);
409 if (aa[1]=='!') {
410 nstop++;
411 ppath[k]=0;
412 break;
415 transi=b[0][step[i1]]+b[1][step[i1]];
417 //ts & tr when syn & nonsyn in path k
418 if(aa[0]==aa[1]) {
419 if(transi==5)
420 sts1path[k]++;
421 else if(transi==1)
422 sts2path[k]++;
423 else
424 stvpath[k]++;
426 else {
427 if(transi==5)
428 nts1path[k]++;
429 else if(transi==1)
430 nts2path[k]++;
431 else
432 ntvpath[k]++;
435 for(i2=0; i2<3; i2++)
436 bt1[i2]=bt2[i2];
439 } /* for(k,npath) */
440 if (npath==nstop) { /* all paths through stop codons */
441 if (ndiff==2) {
442 nts1 = 0.25;
443 nts2 = 0.25;
444 ntv = 1.5;
446 else {
447 nts1 = 0.25;
448 nts2 = 0.25;
449 ntv = 2.5;
452 else {
453 //sum probabilty of all path
454 sump=sumArray(ppath,npath);
455 if(sump>1e-20) {
456 for(k=0;k<npath;k++) { //p: the probabilty of path k
457 p=ppath[k]/sump;
458 sts1+=sts1path[k]*p; sts2+=sts2path[k]*p; stv+=stvpath[k]*p;
459 nts1+=nts1path[k]*p; nts2+=nts2path[k]*p; ntv+=ntvpath[k]*p;
464 }//end of if(ndiff)
466 Sdts1+=sts1; Sdts2+=sts2; Sdtv+=stv;
467 Ndts1+=nts1; Ndts2+=nts2; Ndtv+=ntv;
468 }//end of for(h)
470 return (0);
473 int MYN::DistanceYN00(const string seq1, const string seq2, double &dS,double &dN, double &SEdS, double &SEdN) {
475 int j,ir,nround=100, status=1;
476 double fbS[4], fbN[4], fbSt[4], fbNt[4];
477 double St, Nt, Sdts1, Sdts2, Sdtv, Ndts1, Ndts2, Ndtv;
478 double w0=0, S0=0, N0=0, dS0=0, dN0=0, accu=5e-8, minomega=1e-5, maxomega=99;
479 double PMatrix[CODON*CODON];
481 //initial values for t and omega(Ka/Ks)
482 t=0.09;
483 omega=.5;
484 S = N = 0.0;
485 initArray(fbS, 4);
486 initArray(fbN, 4);
488 //Count sites of sequence 1
489 CountSites(seq1, St, Nt, fbSt, fbNt);
490 S+=St/2;
491 N+=Nt/2;
492 for(j=0; j<4; j++) {
493 fbS[j]+=fbSt[j]/2;
494 fbN[j]+=fbNt[j]/2;
497 //Count sites of sequence 2
498 CountSites(seq2, St, Nt, fbSt, fbNt);
499 S+=St/2;
500 N+=Nt/2;
501 for(j=0; j<4; j++) {
502 fbS[j]+=fbSt[j]/2;
503 fbN[j]+=fbNt[j]/2;
506 //Iterative loop
507 for (ir=0; ir<nround; ir++) { /* iteration */
509 //Get transition probability matrix from one codon to another
510 GetPMatCodon(PMatrix,kappa,omega);
512 //Count differences
513 CountDiffs(seq1, seq2, Sdts1, Sdts2, Sdtv, Ndts1, Ndts2, Ndtv, PMatrix);
515 //Synonymous(Sd) and nonsynonymous(Nd) differences
516 Sd = Sdts1 + Sdts2 + Sdtv;
517 Nd = Ndts1 + Ndts2 + Ndtv;
519 //Seldom happen
520 if (Sd>S) {
521 Sdts1 *= (S/Sd);
522 Sdts2 *= (S/Sd);
523 Sdtv *= (S/Sd);
525 if (Nd>N) {
526 Ndts1 *= (N/Nd);
527 Ndts2 *= (N/Nd);
528 Ndtv *= (N/Nd);
531 //Ks
532 CorrectKaksTN93(S, Sdts1/S, Sdts2/S, Sdtv/S, fbS, dS, SEdS);
533 //Ka
534 CorrectKaksTN93(N, Ndts1/N, Ndts2/N, Ndtv/N, fbN, dN, SEdN);
537 status=-1;
539 if(dS<1e-9) {
540 status=-1;
541 omega=maxomega;
543 else {
544 omega= max(minomega, dN/dS);
547 t = dS * 3 * S/(S + N) + dN * 3 * N/(S + N);
549 if (fabs(dS-dS0)<accu && fabs(dN-dN0)<accu && fabs(omega-w0)<accu)
550 break;
552 dS0=dS;
553 dN0=dN;
554 w0=omega;
556 } //end of for(ir) */
558 if(ir==nround)
559 status=-2;
561 return status;
565 /* Count the synonymous and nonsynonymous sites of two sequences */
566 int MYN::CountSites(const string seq, double &Stot, double &Ntot,double fbS[],double fbN[]) {
567 int h,i,j,k, c[2],aa[2], b[3], by[3]={16,4,1};
568 double r, S,N;
570 Stot=Ntot=0;
571 initArray(fbS, 4);
572 initArray(fbN, 4);
574 for (h=0; h<seq.length(); h+=3) {
576 //Get codon id and amino acid
577 c[0]=getID(seq.substr(h,3));
578 aa[0]=getAminoAcid(c[0]);
579 for(i=0; i<3; i++) {
580 b[i]=convertChar(seq[h+i]);
583 for (j=0,S=N=0; j<3; j++) {
584 for(k=0; k<4; k++) { /* b[j] changes to k */
585 if (k==b[j])
586 continue;
587 //c[0] change at position j
588 c[1] = c[0]+(k-b[j])*by[j];
589 aa[1] = getAminoAcid(c[1]);
591 if(aa[1]=='!')
592 continue;
594 r=pi[c[1]];
595 if (k+b[j]==1 || k+b[j]==5) {//transition
596 if (k+b[j]==1) r*=kappatc;
597 else r*=kappaag; //(k+b[j]==5)
600 if (aa[0]==aa[1]) { //synonymous
601 S+=r;
602 fbS[b[j]]+=r; //syn probability of A,C,G,T
604 else { //nonsynonymous
605 N+=r;
606 fbN[b[j]]+=r; //nonsyn probability of A,C,G,T
610 Stot+=S;
611 Ntot+=N;
614 //Scale Stot+Ntot to seq.length()
615 r=seq.length()/(Stot+Ntot);
616 Stot*=r;
617 Ntot*=r;
619 //get probablity of syn of four nul.
620 r=sumArray(fbS,4);
621 for(k=0; k<4; k++)
622 fbS[k]/=r;
624 //get probablity of nonsyn of four nul.
625 r=sumArray(fbN,4);
626 for(k=0; k<4; k++)
627 fbN[k]/=r;
629 return 0;