1 /************************************************************
2 * Copyright (c) 2005, BGI of Chinese Academy of Sciences
6 * Abstract: Definition of GY94 class.
9 * Author: Zhang Zhang (zhanghzhang@genomics.org.cn)
12 * Note: Source codes are taken from codeml.c in PAML.
15 Goldman N, Yang Z (1994) A codon-based model of nucleotide
16 substitution for protein-coding DNA sequences. Mol. Biol.
18 *************************************************************/
23 int GeneticCode
[][64] =
24 {{13,13,10,10,15,15,15,15,18,18,-1,-1, 4, 4,-1,17,
25 10,10,10,10,14,14,14,14, 8, 8, 5, 5, 1, 1, 1, 1,
26 9, 9, 9,12,16,16,16,16, 2, 2,11,11,15,15, 1, 1,
27 19,19,19,19, 0, 0, 0, 0, 3, 3, 6, 6, 7, 7, 7, 7}, /* 0:universal */
29 {13,13,10,10,15,15,15,15,18,18,-1,-1, 4, 4,17,17,
30 10,10,10,10,14,14,14,14, 8, 8, 5, 5, 1, 1, 1, 1,
31 9, 9,12,12,16,16,16,16, 2, 2,11,11,15,15,-1,-1,
32 19,19,19,19, 0, 0, 0, 0, 3, 3, 6, 6, 7, 7, 7, 7}, /* 1:vertebrate mt.*/
34 {13,13,10,10,15,15,15,15,18,18,-1,-1, 4, 4,17,17,
35 16,16,16,16,14,14,14,14, 8, 8, 5, 5, 1, 1, 1, 1,
36 9, 9,12,12,16,16,16,16, 2, 2,11,11,15,15, 1, 1,
37 19,19,19,19, 0, 0, 0, 0, 3, 3, 6, 6, 7, 7, 7, 7}, /* 2:yeast mt. */
39 {13,13,10,10,15,15,15,15,18,18,-1,-1, 4, 4,17,17,
40 10,10,10,10,14,14,14,14, 8, 8, 5, 5, 1, 1, 1, 1,
41 9, 9, 9,12,16,16,16,16, 2, 2,11,11,15,15, 1, 1,
42 19,19,19,19, 0, 0, 0, 0, 3, 3, 6, 6, 7, 7, 7, 7}, /* 3:mold mt. */
44 {13,13,10,10,15,15,15,15,18,18,-1,-1, 4, 4,17,17,
45 10,10,10,10,14,14,14,14, 8, 8, 5, 5, 1, 1, 1, 1,
46 9, 9,12,12,16,16,16,16, 2, 2,11,11,15,15,15,15,
47 19,19,19,19, 0, 0, 0, 0, 3, 3, 6, 6, 7, 7, 7, 7}, /* 4:invertebrate mt. */
49 {13,13,10,10,15,15,15,15,18,18, 5, 5, 4, 4,-1,17,
50 10,10,10,10,14,14,14,14, 8, 8, 5, 5, 1, 1, 1, 1,
51 9, 9, 9,12,16,16,16,16, 2, 2,11,11,15,15, 1, 1,
52 19,19,19,19, 0, 0, 0, 0, 3, 3, 6, 6, 7, 7, 7, 7}, /* 5:ciliate nuclear*/
54 {13,13,10,10,15,15,15,15,18,18,-1,-1, 4, 4,17,17,
55 10,10,10,10,14,14,14,14, 8, 8, 5, 5, 1, 1, 1, 1,
56 9, 9, 9,12,16,16,16,16, 2, 2, 2,11,15,15,15,15,
57 19,19,19,19, 0, 0, 0, 0, 3, 3, 6, 6, 7, 7, 7, 7}, /* 6:echinoderm mt.*/
59 {13,13,10,10,15,15,15,15,18,18,-1,-1, 4, 4, 4,17,
60 10,10,10,10,14,14,14,14, 8, 8, 5, 5, 1, 1, 1, 1,
61 9, 9, 9,12,16,16,16,16, 2, 2,11,11,15,15, 1, 1,
62 19,19,19,19, 0, 0, 0, 0, 3, 3, 6, 6, 7, 7, 7, 7}, /* 7:euplotid mt. */
64 {13,13,10,10,15,15,15,15,18,18,-1,-1, 4, 4,-1,17,
65 10,10,10,15,14,14,14,14, 8, 8, 5, 5, 1, 1, 1, 1,
66 9, 9, 9,12,16,16,16,16, 2, 2,11,11,15,15, 1, 1,
67 19,19,19,19, 0, 0, 0, 0, 3, 3, 6, 6, 7, 7, 7, 7},
68 /* 8:alternative yeast nu.*/
70 {13,13,10,10,15,15,15,15,18,18,-1,-1, 4, 4,17,17,
71 10,10,10,10,14,14,14,14, 8, 8, 5, 5, 1, 1, 1, 1,
72 9, 9,12,12,16,16,16,16, 2, 2,11,11,15,15, 7, 7,
73 19,19,19,19, 0, 0, 0, 0, 3, 3, 6, 6, 7, 7, 7, 7}, /* 9:ascidian mt. */
75 {13,13,10,10,15,15,15,15,18,18,-1, 5, 4, 4,-1,17,
76 10,10,10,10,14,14,14,14, 8, 8, 5, 5, 1, 1, 1, 1,
77 9, 9, 9,12,16,16,16,16, 2, 2,11,11,15,15, 1, 1,
78 19,19,19,19, 0, 0, 0, 0, 3, 3, 6, 6, 7, 7, 7, 7}, /* 10:blepharisma nu.*/
80 { 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4,
81 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8,
82 9, 9, 9, 9,10,10,10,10,11,11,11,11,12,12,12,12,
83 13,13,13,13,14,14,14,14,15,15,15,15,16,16,16,16} /* 11:Ziheng's regular code */
84 }; /* GeneticCode[icode][#codon] */
87 /********************************************
88 * Function: construction function
89 * Input Parameter: a candidate model
93 * Note: (Equal, Unequal: substitution rates)
94 JC, F81: rTC==rAG =rTA==rCG==rTG==rCA
95 K2P, HKY: rTC==rAG!=rTA==rCG==rTG==rCA
96 TNEF,TN: rTC!=rAG!=rTA==rCG==rTG==rCA
97 K3P, K3PUF: rTC==rAG!=rTA==rCG!=rTG==rCA
98 TIMEF, TIM: rTC!=rAG!=rTA==rCG!=rTG==rCA
99 TVMEF, TVM: rTC==rAG!=rTA!=rCG!=rTG!=rCA
100 SYM, GTR: rTC!=rAG!=rTA!=rCG!=rTG!=rCA
101 *********************************************/
107 GY94::GY94(string NulModel
) {
109 name
= "GY-"+NulModel
;
117 com
.icode
= genetic_code
-1;
118 if (com
.icode
>11) com
.icode
= 0;
119 Nsensecodon
= getNumNonsense(com
.icode
);
120 com
.ncode
= 64 - Nsensecodon
;
122 //NulModel is set among a set of candidate models.
125 if (model
=="JC" || model
=="F81") com
.nkappa
= 0;
126 else if (model
=="K2P" || model
=="HKY") com
.nkappa
= 1;
127 else if (model
=="TNEF"|| model
=="TN") com
.nkappa
= 2;
128 else if (model
=="K3P" || model
=="K3PUF") com
.nkappa
= 2;
129 else if (model
=="TIMEF" || model
=="TIM") com
.nkappa
= 3;
130 else if (model
=="TVMEF" || model
=="TVM") com
.nkappa
= 4;
131 else if (model
=="SYM" || model
=="GTR") com
.nkappa
= 5;
133 //parameters' number: plus another two parameters (Ka/Ks and t)
134 com
.np
= 2 + com
.nkappa
;
143 double GY94::rndu(void) {
144 w_rndu
= w_rndu
*69069+1;
145 return ldexp((double)w_rndu
, -32);
149 /* x[i]=x0[i] + t*p[i] */
150 double GY94::fun_ls (double t
, double x0
[], double p
[], double x
[], int n
) {
152 for (i
=0; i
<n
; i
++) x
[i
]=x0
[i
] + t
*p
[i
];
153 return( lfun2dSdN(x
,n
) );
158 int GY94::PatternWeight () {
160 int *fpatti
, lst
=com
.ls
,h
,ht
,j
,k
=-1;
163 char *zt
[2], b1
[2]; /* b[][] data at site h */
166 //fpatti=(int*)malloc(lst*sizeof(int));
167 fpatti
= new int[lst
];
169 for(h
=0; h
<lst
; h
++) fpatti
[h
]=0;
171 for(j
=0;j
<com
.ns
;j
++) {
172 zt
[j
]=(char*)malloc(lst
*sizeof(char));
173 //zt[j]= new char[lst];
178 for(h
=0; h
<com
.ls
; h
++) {
183 //cout<<(char)b1[1]<<endl;
185 for(ht
=0; ht
<com
.npatt
; ht
++) {
186 if(b1
[0]==zt
[0][ht
] && b1
[1]==zt
[1][ht
])
190 zt
[0][com
.npatt
]=b1
[0];
191 zt
[1][com
.npatt
]=b1
[1];
200 //com.fpatt=(double*)malloc(com.npatt*sizeof(double));
202 for(h
=0;h
<com
.npatt
;h
++) {
203 com
.fpatt
[h
]=fpatti
[h
];
209 for(j
=0; j
<com
.ns
; j
++) {
210 com
.z
[j
]=(char*)realloc(zt
[j
],com
.npatt
*sizeof(char));
216 //Preprocess in preparation for estimation
217 int GY94::preProcess(const char* seq1
, const char* seq2
) {
222 com
.kappa
=2; com
.omega
= 0.4;
226 com
.ls
= strlen(seq1
);
228 for(i
=0, snp
=0; i
<com
.ls
; i
++)
229 if (seq1
[i
]!=seq2
[i
]) snp
++;
231 for(i
=0; i
<com
.ns
; i
++) {
232 com
.z
[i
]=new char[com
.ls
+1];
235 strcpy(com
.z
[0], seq1
);
236 strcpy(com
.z
[1], seq2
);
246 int GY94::setmark_61_64 (void) {
248 int i
, *code
=GeneticCode
[com
.icode
];
249 //int c[3],aa0,aa1, by[3]={16,4,1};
250 //double nSilent, nStop, nRepl;
253 for (i
=0; i
<64; i
++) {
258 FROM61
[Nsensecodon
] = i
;
259 FROM64
[i
] = Nsensecodon
++;
263 com
.ncode
=Nsensecodon
;
268 //Convert T,C,A,G to 0,1,2,3
269 int GY94::transform(char *z
, int ls
) {
274 for (i
=0,p
=z
; i
<ls
; i
++,p
++) {
275 *p
=(char)convertChar(*p
);
281 void GY94::EncodeSeqs(void) {
286 for(j
=0; j
<com
.ns
; j
++) {
287 transform(com
.z
[j
],com
.ls
*3);
290 //encode to codon 0-64
291 for(j
=0; j
<com
.ns
; j
++) {
292 for(h
=0; h
<com
.ls
; h
++) {
293 b
[0]=com
.z
[j
][h
*3]; b
[1]=com
.z
[j
][h
*3+1]; b
[2]=com
.z
[j
][h
*3+2];
294 k
=b
[0]*16+b
[1]*4+b
[2];
295 com
.z
[j
][h
]=(char)FROM64
[k
];
300 int GY94::GetCodonFreqs(double pi
[]) {
301 int n
=com
.ncode
, i
,j
,ic
,b
[3];
302 double fb3x4
[12], fb4
[4];
305 for(i
=0, initArray(flag
, 64); i
<str1
.length(); i
+=3) {
306 j
= getID(str1
.substr(i
,3));
307 if (getAminoAcid(j
)!='!') flag
[j
] = 1;
309 j
= getID(str2
.substr(i
,3));
310 if (getAminoAcid(j
)!='!') flag
[j
] = 1;
313 //Whether sequences are long enough
314 if (sumArray(flag
, CODON
)==com
.ncode
) {
318 //Codon frequency is estimated from nucleotide frequency(fb3x4) at the three positions
319 for(i
=0,initArray(fb3x4
,12),initArray(fb4
,4); i
<n
; i
++) {
325 fb3x4
[j
*4+b
[j
]] += pi
[i
];
326 fb4
[b
[j
]] += pi
[i
]/3.;
330 //use nul frequencies to get codon frequency
331 for (i
=0; i
<n
; i
++) {
332 ic
=FROM61
[i
]; b
[0]=ic
/16; b
[1]=(ic
/4)%4; b
[2]=ic
%4;
333 pi
[i
]=fb3x4
[b
[0]]*fb3x4
[4+b
[1]]*fb3x4
[8+b
[2]];
336 scaleArray(1./sumArray(pi
,n
), pi
, n
);
341 int GY94::gradientB(int n
, double x
[], double f0
, double g
[], double space
[], int xmark
[]) {
342 /* f0=fun(x) is always provided.
343 xmark=0: central; 1: upper; -1: down
346 double *x0
=space
, *x1
=space
+n
, eh0
=Small_Diff
, eh
; /* eh0=1e-6 || 1e-7 */
349 eh
=eh0
*(fabs(x
[i
])+1);
350 if (xmark
[i
]==0 && SIZEp
<1) { //central
351 for (j
=0; j
<n
; j
++) x0
[j
]=x1
[j
]=x
[j
];
352 eh
=pow(eh
,.67); x0
[i
]-=eh
; x1
[i
]+=eh
;
353 g
[i
]=(lfun2dSdN(x1
,n
)-(lfun2dSdN
)(x0
,n
))/(eh
*2.0);
355 else {//forward or backward
356 for(j
=0; j
<n
; j
++) x1
[j
]=x
[j
];
357 if (xmark
[i
]) eh
*=-xmark
[i
];
359 g
[i
]=(lfun2dSdN(x1
,n
)-f0
)/eh
;
366 double GY94::LineSearch2 (double *f
, double x0
[], double p
[], double step
, double limit
, double e
, double space
[], int n
) {
367 /* linear search using quadratic interpolation
368 from x0[] in the direction of p[],
369 x = x0 + a*p a ~(0,limit)
370 returns (a). *f: f(x0) for input and f(x) for output
372 x0[n] x[n] p[n] space[n]
374 adapted from Wolfe M. A. 1978. Numerical methods for unconstrained
375 optimization: An introduction. Van Nostrand Reinhold Company, New York.
377 step is used to find the bracket and is increased or reduced as necessary,
378 and is not terribly important.
380 int ii
=0, maxround
=10, status
, nsymb
=0;
381 double *x
=space
, factor
=4, small
=1e-6, smallgapa
=0.2;
382 double a0
,a1
,a2
,a3
,a4
=-1,a5
,a6
, f0
,f1
,f2
,f3
,f4
=-1,f5
,f6
;
386 f2
=fun_ls(a2
, x0
,p
,x
,n
);
390 if (step
<small
) return (0);
392 a2
=a0
+step
; f2
=fun_ls(a2
, x0
,p
,x
,n
);
399 if (step
>limit
) step
=limit
;
400 a3
=a0
+step
; f3
=fun_ls(a3
, x0
,p
,x
,n
);
403 if (f3
>=f2
) { //a1<a2<a3 and f1>f2<f3
407 a1
=a2
; f1
=f2
; a2
=a3
; f2
=f3
;
414 /* iteration by quadratic interpolation, fig 2.2.9-10 (pp 71-71) */
415 for (ii
=0; ii
<maxround
; ii
++) { //a4 is the minimum from the parabola over (a1,a2,a3)
418 a4
= (a2
-a3
)*f1
+(a3
-a1
)*f2
+(a1
-a2
)*f3
;
419 if(fabs(a4
)>1e-100) {
421 a4
= ((a2
*a2
-a3
*a3
)*f1
+(a3
*a3
-a1
*a1
)*f2
+(a1
*a1
-a2
*a2
)*f3
)/(2*a4
);
423 if (a4
>a3
|| a4
<a1
) {//out of range: whether a1<a4<a3
427 if((a4
<=a2
&& a2
-a4
>smallgapa
*(a2
-a1
)) || (a4
>a2
&& a4
-a2
>smallgapa
*(a3
-a2
)))
432 f4
= fun_ls(a4
, x0
,p
,x
,n
);
434 if (fabs(f2
-f4
)<e
*(1+fabs(f2
))) {
438 if (a4
<=a2
) { /* fig 2.2.10 */
439 if (a2
-a4
>smallgapa
*(a2
-a1
)) {
440 if (f4
<=f2
) { a3
=a2
; a2
=a4
; f3
=f2
; f2
=f4
; }
441 else { a1
=a4
; f1
=f4
; }
445 a5
=(a2
+a3
)/2; f5
=fun_ls(a5
, x0
,p
,x
,n
);
446 if (f5
>f2
) { a1
=a4
; a3
=a5
; f1
=f4
; f3
=f5
; }
447 else { a1
=a2
; a2
=a5
; f1
=f2
; f2
=f5
; }
450 a5
=(a1
+a4
)/2; f5
=fun_ls(a5
, x0
,p
,x
,n
);
452 { a3
=a2
; a2
=a4
; a1
=a5
; f3
=f2
; f2
=f4
; f1
=f5
; }
454 a6
=(a1
+a5
)/2; f6
=fun_ls(a6
, x0
,p
,x
,n
);
456 { a1
=a6
; a2
=a5
; a3
=a4
; f1
=f6
; f2
=f5
; f3
=f4
; }
457 else { a2
=a6
; a3
=a5
; f2
=f6
; f3
=f5
; }
462 else { /* fig 2.2.9 */
463 if (a4
-a2
>smallgapa
*(a3
-a2
)) {
464 if (f2
>=f4
) { a1
=a2
; a2
=a4
; f1
=f2
; f2
=f4
; }
465 else { a3
=a4
; f3
=f4
; }
469 a5
=(a1
+a2
)/2; f5
=fun_ls(a5
, x0
,p
,x
,n
);
470 if (f5
>f2
) { a1
=a5
; a3
=a4
; f1
=f5
; f3
=f4
; }
471 else { a3
=a2
; a2
=a5
; f3
=f2
; f2
=f5
; }
474 a5
=(a3
+a4
)/2; f5
=fun_ls(a5
, x0
,p
,x
,n
);
476 { a1
=a2
; a2
=a4
; a3
=a5
; f1
=f2
; f2
=f4
; f3
=f5
; }
478 a6
=(a3
+a5
)/2; f6
=fun_ls(a6
, x0
,p
,x
,n
);
480 { a1
=a4
; a2
=a5
; a3
=a6
; f1
=f4
; f2
=f5
; f3
=f6
; }
481 else { a1
=a5
; a2
=a6
; f1
=f5
; f2
=f6
; }
488 if (f2
>f0
&& f4
>f0
) a4
=0;
489 if (f2
<=f4
) { *f
=f2
; a4
=a2
; }
495 double GY94::distance(double x
[], double y
[], int n
) {
500 for (i
=0; i
<n
; i
++) t
+= square(x
[i
]-y
[i
]);
505 int GY94::H_end (double x0
[], double x1
[], double f0
, double f1
, double e1
, double e2
, int n
) {
506 /* Himmelblau termination rule. return 1 for stop, 0 otherwise.
510 if((r
=norm(x0
,n
))<e2
) r
=1;
512 if(distance(x1
,x0
,n
)>=r
)
524 int GY94::ming2 (double *f
, double x
[], double xb
[][2], double space
[], double e
, int n
) {
525 /* n-variate minimization with bounds using the BFGS algorithm
526 g0[n] g[n] p[n] x0[n] y[n] s[n] z[n] H[n*n] C[n*n] tv[2*n]
528 Size of space should be (check carefully?)
529 #define spaceming2(n) ((n)*((n)*2+9+2)*sizeof(double))
530 nfree: # free variables
531 xmark[i]=0 for inside space; -1 for lower boundary; 1 for upper boundary.
532 x[] has initial values at input and returns the estimates in return.
533 ix[i] specifies the i-th free parameter
535 int i
,j
, i1
,i2
,it
, maxround
=2000, fail
=0, *xmark
, *ix
, nfree
;
536 int Ngoodtimes
=2, goodtimes
=0;
537 double small
=1e-6, sizep0
=0;
538 double f0
, *g0
, *g
, *p
, *x0
, *y
, *s
, *z
, *H
, *C
, *tv
;
539 double w
,v
, alpha
, am
, h
, maxstep
=8;
543 g0
=space
; g
=g0
+n
; p
=g
+n
; x0
=p
+n
;
544 y
=x0
+n
; s
=y
+n
; z
=s
+n
; H
=z
+n
; C
=H
+n
*n
; tv
=C
+n
*n
;
546 xmark
= (int*)(tv
+2*n
);
554 for(i
=0,nfree
=0;i
<n
;i
++) {
569 f0
=*f
=lfun2dSdN(x
,n
);
574 gradientB (n
, x0
, f0
, g0
, tv
, xmark
);
576 initIdentityMatrix(H
,nfree
);
578 for(Iround
=0; Iround
<maxround
; Iround
++) {
579 for(i
=0,initArray(p
,n
); i
<nfree
; i
++) {
580 for(j
=0; j
<nfree
; j
++) p
[ix
[i
]] -= H
[i
*nfree
+j
]*g0
[ix
[j
]];
584 SIZEp
=norm(p
,n
); /* check this */
586 for (i
=0,am
=maxstep
; i
<n
; i
++) { /* max step length */
587 if (p
[i
]>0 && (xb
[i
][1]-x0
[i
])/p
[i
]<am
)
588 am
=(xb
[i
][1]-x0
[i
])/p
[i
];
589 else if (p
[i
]<0 && (xb
[i
][0]-x0
[i
])/p
[i
]<am
)
590 am
=(xb
[i
][0]-x0
[i
])/p
[i
];
594 h
=fabs(2*f0
*.01/innerp(g0
,p
,n
)); /* check this?? */
598 h
=norm(s
,nfree
)/SIZEp
;
601 h
=max2(h
,1e-5); h
=min2(h
,am
/5);
603 alpha
= LineSearch2(f
,x0
,p
,h
,am
, min2(1e-3,e
), tv
,n
); /* n or nfree? */
606 for(i
=0; i
<n
; i
++) x
[i
]=x0
[i
]+alpha
*p
[i
];
612 if(Iround
==0 || SIZEp
<sizep0
|| (SIZEp
<.001 && sizep0
<.001))
616 if((n
==1||goodtimes
>=Ngoodtimes
) && SIZEp
<(e
>1e-5?.05:.001) && (f0
- *f
<0.001) && H_end(x0
,x
,f0
,*f
,e
,e
,n
))
619 gradientB (n
, x
, *f
, g
, tv
, xmark
);
621 /* modify the working set */
622 for (i
=0; i
<n
; i
++) { /* add constraints, reduce H */
627 if (fabs(x
[i
]-xb
[i
][0])<1e-6 && -g
[i
]<0)
629 else if (fabs(x
[i
]-xb
[i
][1])<1e-6 && -g
[i
]>0)
635 copyArray (H
, C
, nfree
*nfree
);
636 for (it
=0; it
<nfree
; it
++)
639 for (i1
=it
; i1
<nfree
-1; i1
++)
641 for (i1
=0,nfree
--; i1
<nfree
; i1
++)
642 for (i2
=0; i2
<nfree
; i2
++)
643 H
[i1
*nfree
+i2
]=C
[(i1
+(i1
>=it
))*(nfree
+1) + i2
+(i2
>=it
)];
646 for (i
=0,it
=0,w
=0; i
<n
; i
++) { /* delete a constraint, enlarge H */
647 if (xmark
[i
]==-1 && -g
[i
]>w
) {
650 else if (xmark
[i
]==1 && -g
[i
]<-w
) {
655 if (w
>10*SIZEp
/nfree
) {
656 copyArray (H
, C
, nfree
*nfree
);
658 for (i1
=0; i1
<nfree
; i1
++) {
659 for (i2
=0; i2
<nfree
; i2
++) {
660 H
[i1
*(nfree
+1)+i2
]=C
[i1
*nfree
+i2
];
664 for(i1
=0; i1
<nfree
+1; i1
++) {
665 H
[i1
*(nfree
+1)+nfree
]=H
[nfree
*(nfree
+1)+i1
]=0;
668 H
[(nfree
+1)*(nfree
+1)-1]=1;
673 for(i
=0,f0
=*f
; i
<nfree
; i
++) {
674 y
[i
]=g
[ix
[i
]]-g0
[ix
[i
]];
675 s
[i
]=x
[ix
[i
]]-x0
[ix
[i
]];
677 for (i
=0; i
<n
; i
++) {
683 for (i
=0,w
=v
=0.; i
<nfree
; i
++) {
684 for (j
=0,z
[i
]=0.; j
<nfree
; j
++) {
685 z
[i
]+=H
[i
*nfree
+j
]*y
[j
];
691 initIdentityMatrix(H
,nfree
);
695 for(i
=0; i
<nfree
; i
++) {
696 for(j
=0; j
<nfree
; j
++) {
697 H
[i
*nfree
+j
] += ((1+w
/v
)*s
[i
]*s
[j
]-z
[i
]*s
[j
]-s
[i
]*z
[j
])/v
;
701 }//end of for(Iround...)
706 copyArray(H
, space
, n
*n
);
713 void GY94::HouseholderRealSym(double a
[], int n
, double d
[], double e
[]) {
716 double scale
,hh
,h
,g
,f
;
718 for (i
=n
-1;i
>=1;i
--) {
723 scale
+= fabs(a
[i
*n
+k
]);
729 h
+= a
[i
*n
+k
]*a
[i
*n
+k
];
732 g
=(f
>= 0 ? -sqrt(h
) : sqrt(h
));
741 g
+= a
[j
*n
+k
]*a
[i
*n
+k
];
743 g
+= a
[k
*n
+j
]*a
[i
*n
+k
];
752 a
[j
*n
+k
] -= (f
*e
[k
]+g
*a
[i
*n
+k
]);
762 /* Get eigenvectors */
769 g
+= a
[i
*n
+k
]*a
[k
*n
+j
];
771 a
[k
*n
+j
] -= g
*a
[k
*n
+i
];
776 for (j
=0;j
<=m
;j
++) a
[j
*n
+i
]=a
[i
*n
+j
]=0;
780 int GY94::EigenTridagQLImplicit(double d
[], double e
[], int n
, double z
[]) {
782 int m
,j
,iter
,niter
=30, status
=0, i
,k
;
783 double s
,r
,p
,g
,f
,dd
,c
,b
, aa
,bb
;
785 for (i
=1;i
<n
;i
++) e
[i
-1]=e
[i
]; e
[n
-1]=0;
789 for (m
=j
;m
<n
-1;m
++) {
790 dd
=fabs(d
[m
])+fabs(d
[m
+1]);
791 if (fabs(e
[m
])+dd
== dd
) break; /* ??? */
794 if (iter
++ == niter
) {
798 g
=(d
[j
+1]-d
[j
])/(2*e
[j
]);
802 if((aa
=fabs(g
))>1) r
=aa
*sqrt(1+1/(g
*g
));
805 g
=d
[m
]-d
[j
]+e
[j
]/(g
+SIGN(r
,g
));
808 for (i
=m
-1;i
>=j
;i
--) {
813 aa
=fabs(f
); bb
=fabs(g
);
814 if(aa
>bb
) { bb
/=aa
; r
=aa
*sqrt(1+bb
*bb
); }
816 else { aa
/=bb
; r
=bb
*sqrt(1+aa
*aa
); }
832 z
[k
*n
+i
+1]=s
*z
[k
*n
+i
]+c
*f
;
833 z
[k
*n
+i
]=c
*z
[k
*n
+i
]-s
*f
;
836 if (r
== 0 && i
>= j
) continue;
837 d
[j
]-=p
; e
[j
]=g
; e
[m
]=0;
844 void GY94::EigenSort(double d
[], double U
[], int n
) {
849 for (i
=0;i
<n
-1;i
++) {
851 for (j
=i
+1;j
<n
;j
++) {
852 if (d
[j
] >= p
) p
=d
[k
=j
];
866 int GY94::eigenRealSym(double A
[], int n
, double Root
[], double work
[]) {
869 HouseholderRealSym(A
, n
, Root
, work
);
870 status
=EigenTridagQLImplicit(Root
, work
, n
, A
);
871 EigenSort(Root
, A
, n
);
877 int GY94::eigenQREV (double Q
[], double pi
[], int n
, double Root
[], double U
[], double V
[], double spacesqrtpi
[]) {
879 int i
,j
, inew
, jnew
, nnew
, status
;
880 double *pi_sqrt
=spacesqrtpi
, small
=1e-6;
882 for(j
=0,nnew
=0; j
<n
; j
++)
884 pi_sqrt
[nnew
++]=sqrt(pi
[j
]);
886 /* store in U the symmetrical matrix S = sqrt(D) * Q * sqrt(-D) */
890 for(j
=0,U
[i
*n
+i
] = Q
[i
*n
+i
]; j
<i
; j
++)
891 U
[i
*n
+j
] = U
[j
*n
+i
] = (Q
[i
*n
+j
] * pi_sqrt
[i
]/pi_sqrt
[j
]);
893 status
=eigenRealSym(U
, n
, Root
, V
);
894 for(i
=0;i
<n
;i
++) for(j
=0;j
<n
;j
++) V
[i
*n
+j
] = U
[j
*n
+i
] * pi_sqrt
[j
];
895 for(i
=0;i
<n
;i
++) for(j
=0;j
<n
;j
++) U
[i
*n
+j
] /= pi_sqrt
[i
];
898 for(i
=0,inew
=0; i
<n
; i
++) {
900 for(j
=0,jnew
=0; j
<i
; j
++)
902 U
[inew
*nnew
+jnew
] = U
[jnew
*nnew
+inew
]
903 = Q
[i
*n
+j
] * pi_sqrt
[inew
]/pi_sqrt
[jnew
];
906 U
[inew
*nnew
+inew
] = Q
[i
*n
+i
];
911 status
=eigenRealSym(U
, nnew
, Root
, V
);
913 for(i
=n
-1,inew
=nnew
-1; i
>=0; i
--) /* construct Root */
914 Root
[i
] = (pi
[i
]>small
? Root
[inew
--] : 0);
915 for(i
=n
-1,inew
=nnew
-1; i
>=0; i
--) { /* construct V */
917 for(j
=n
-1,jnew
=nnew
-1; j
>=0; j
--)
919 V
[i
*n
+j
] = U
[jnew
*nnew
+inew
]*pi_sqrt
[jnew
];
927 for(j
=0; j
<n
; j
++) V
[i
*n
+j
] = (i
==j
);
929 for(i
=n
-1,inew
=nnew
-1; i
>=0; i
--) { /* construct U */
931 for(j
=n
-1,jnew
=nnew
-1;j
>=0;j
--)
933 U
[i
*n
+j
] = U
[inew
*nnew
+jnew
]/pi_sqrt
[inew
];
949 /********************************************
950 * Function: parseSubRates
951 * Input Parameter: string, array of double
952 * Output: Parse substitution rates according to the given model
954 *********************************************/
955 int GY94::parseSubRates(string model
, double kappa
[]) {
958 JC, F81: rTC==rAG =rTA==rCG==rTG==rCA
959 K2P, HKY: rTC==rAG!=rTA==rCG==rTG==rCA
960 TNEF,TN: rTC!=rAG!=rTA==rCG==rTG==rCA
961 K3P, K3PUF: rTC==rAG!=rTA==rCG!=rTG==rCA
962 TIMEF, TIM: rTC!=rAG!=rTA==rCG!=rTG==rCA
963 TVMEF, TVM: rTC==rAG!=rTA!=rCG!=rTG!=rCA
964 SYM, GTR: rTC!=rAG!=rTA!=rCG!=rTG!=rCA
967 kappa
[5]=1.0; //Substitution rate between C and A
969 if(model
=="JC" || model
=="F81") {
970 //Q[i*n+j] = kappa[0] = 1;
971 kappa
[0]=kappa
[1]=kappa
[2]=kappa
[3]=kappa
[4]=kappa
[5];
973 else if(model
=="K2P" || model
=="HKY") {//one para.
974 //if ((b1+b2)==1 || (b1+b2)==5) Q[i*n+j] = kappa[0];
976 kappa
[2]=kappa
[3]=kappa
[4]=kappa
[5];
978 else if(model
=="TNEF" || model
=="TN") {//two para.
979 //if ((b1+b2)==1) Q[i*n+j] = kappa[0];
980 //else if ((b1+b2)==5) Q[i*n+j] = kappa[1];
981 kappa
[2]=kappa
[3]=kappa
[4]=kappa
[5];
983 else if(model
=="K3P" || model
=="K3PUF") {//two para.
984 //if ((b1+b2)==1 || (b1+b2)==5) Q[i*n+j] = kappa[0];
985 //else if ((b1+b2)==2 || (b1+b2)==4) Q[i*n+j] = kappa[1];
987 kappa
[2]=kappa
[3]=kappa
[1];
990 else if(model
=="TIMEF" || model
=="TIM") {//three para.
991 //if ((b1+b2)==1) Q[i*n+j] = kappa[0];
992 //else if ((b1+b2)==5) Q[i*n+j] = kappa[1];
993 //else if ((b1+b2)==2 || (b1+b2)==4) Q[i*n+j] = kappa[2];
997 else if(model
=="TVMEF" || model
=="TVM") {//four para.
998 //if ((b1+b2)==1 || (b1+b2)==5) Q[i*n+j] = kappa[0];
999 //else if ((b1+b2)==2) Q[i*n+j] = kappa[1];
1000 //else if ((b1+b2)==4) Q[i*n+j] = kappa[2];
1001 //else if ((b1==0) && (b2==3)) Q[i*n+j] = kappa[3];
1003 //TVMEF, TVM: rTC==rAG!=rTA!=rCG!=rTG!=rCA
1004 //SYM, GTR: rTC!=rAG!=rTA!=rCG!=rTG!=rCA
1010 else if(model
=="SYM" || model
=="GTR") {//five para.
1019 /* Construct Q: transition probability matrix 64*64 */
1020 int GY94::EigenQc (int getstats
, double blength
, double *S
, double *dS
, double *dN
, double Root
[], double U
[], double V
[], double kappa
[], double omega
, double Q
[]) {
1021 /* This contructs the rate matrix Q for codon substitution and get the eigen
1022 values and vectors if getstats==0, or get statistics (dS & dN) if
1024 The routine is also called by Qcodon2aa for mechanistic amino acid
1025 substitution models.
1026 Input parameters are kappa, omega and com.pi (or com.fb61).
1028 Statistics calculated include S, dS & dN.
1029 c0[0,1,2] and c[0,1,2] are rates for the 3 codon positions before and after
1030 selection. c4 is for 4-fold rates. ts[3] and tv[3] are transition/
1031 transversion rates, not calculated.
1033 Under NSsites or other site-class models, this function does not scale
1034 Q but calculates the Qfactor_NS.
1035 DetailOutput() uses this function to calculate dS & dN; for each omega
1036 component, dS and dN are not scaled here but are scaled in DetailOutput()
1037 after Qfactor_NS is calculated.
1039 aaDist=FIT1 & FIT2: ap,p*,av,v*, (and w0 for FIT2)
1040 The argument omega is used only if the model assumes one omega. For
1041 AAClasses, com.pomega is used instead.
1043 int n
=Nsensecodon
, i
,j
,k
, ic1
,ic2
,aa1
,aa2
;
1044 int ndiff
,pos
=0,from
[3],to
[3];
1045 double mr
, rs0
,ra0
,rs
,ra
; /* rho's */
1046 double d4
=0, d0
[3],d
[3],ts
[3],tv
[3]; /* rates at positions and 4-fold sites */
1047 double *pi
=com
.pi
, w
=-1, pijQij
;
1048 double space_pisqrt
[CODON
];
1050 for(i
=0; i
<3; d
[i
]=d0
[i
]=ts
[i
]=tv
[i
]=0, i
++);
1053 //Equal codon frequency
1054 if(model
=="JC" || model
=="K2P" || model
=="TNEF" || model
=="K3P"||
1055 model
=="TIMEF" ||model
=="TVMEF"|| model
=="SYM") {
1056 initArray(com
.pi
, 64, 1.0/(64-getNumNonsense(genetic_code
)));
1059 //Parse substitution rates according to the given model
1060 parseSubRates(model
, kappa
);
1062 //Construct Q: transition probability matrix 64*64
1063 for (i
=0, rs0
=ra0
=rs
=ra
=0; i
<n
; i
++) {
1066 ic1
=FROM61
[i
]; from
[0]=ic1
/16; from
[1]=(ic1
/4)%4; from
[2]=ic1
%4;
1069 for(j
=0; j
<i
; j
++) {
1070 ic2
=FROM61
[j
]; to
[0]=ic2
/16; to
[1]=(ic2
/4)%4; to
[2]=ic2
%4;
1071 for(k
=0,ndiff
=0; k
<3; k
++) {
1072 if(from
[k
]!=to
[k
]) {
1077 //consider only one difference between two codons
1078 if(ndiff
!=1) continue;
1080 int b1
= min2(from
[pos
],to
[pos
]);
1081 int b2
= max2(from
[pos
],to
[pos
]);
1083 /* 01 23 02 13 03 12
1084 JC, F81: rTC==rAG =rTA==rCG==rTG==rCA
1085 K2P, HKY: rTC==rAG!=rTA==rCG==rTG==rCA
1086 TNEF,TN: rTC!=rAG!=rTA==rCG==rTG==rCA
1087 K3P, K3PUF: rTC==rAG!=rTA==rCG!=rTG==rCA
1088 TIMEF, TIM: rTC!=rAG!=rTA==rCG!=rTG==rCA
1089 TVMEF, TVM: rTC==rAG!=rTA!=rCG!=rTG!=rCA
1090 SYM, GTR: rTC!=rAG!=rTA!=rCG!=rTG!=rCA
1092 if (b1
==0 && b2
==1) Q
[i
*n
+j
]=kappa
[0]; /* TC */
1093 else if(b1
==2 && b2
==3) Q
[i
*n
+j
]=kappa
[1]; /* AG */
1094 else if(b1
==0 && b2
==2) Q
[i
*n
+j
]=kappa
[2]; /* TA */
1095 else if(b1
==1 && b2
==3) Q
[i
*n
+j
]=kappa
[3]; /* CG */
1096 else if(b1
==0 && b2
==3) Q
[i
*n
+j
]=kappa
[4]; /* TG */
1097 else if(b1
==1 && b2
==2) Q
[i
*n
+j
]=kappa
[5]; /* CA=1 */
1100 Q
[i
*n
+j
]*=com
.pi
[j
];
1101 Q
[j
*n
+i
]*=com
.pi
[i
];
1104 pijQij
=2*pi
[i
]*Q
[i
*n
+j
];
1106 aa1
=GeneticCode
[com
.icode
][ic1
];
1107 aa2
=GeneticCode
[com
.icode
][ic2
];
1108 if(aa1
==aa2
) {//synonymous
1111 else {//nonsynonymous
1115 Q
[i
*n
+j
]*=w
; Q
[j
*n
+i
]*=w
;
1125 w
=(rs0
+ra0
); rs0
/=w
; ra0
/=w
; *S
=rs0
*3*com
.ls
;
1126 if(blength
>=0) { /* calculates dS & dN */
1127 if(blength
==0) *dS
= *dN
= 0;
1130 *dS
=blength
*rs
/(3*rs0
);
1131 *dN
=blength
*ra
/(3*ra0
);
1136 for (i
=0; i
<n
; i
++) Q
[i
*n
+i
]=-sumArray(Q
+i
*n
,n
);
1138 eigenQREV(Q
, com
.pi
, n
, Root
, U
, V
, space_pisqrt
);
1140 for(i
=0; i
<n
; i
++) Root
[i
]/=mr
;
1146 /* Return maximum-likelihood score */
1147 double GY94::lfun2dSdN(double x
[], int np
) {
1148 /* likelihood function for calculating dS and dN between 2 sequences,
1149 com.z[0] & com.z[1]:
1150 prob(i,j) = PI_i * p(i,j,t)
1152 Data are clean and coded.
1153 Transition probability pijt is calculated for observed patterns only.
1155 int n
=com
.ncode
, h
,k
, ik
, z0
,z1
;
1156 double fh
,expt
[CODON
], lnL1
=0;
1157 double *pkappa
=com
.KAPPA
;
1159 //cout<<name.c_str()<<": "<<com.ncode<<"\t"<<Nsensecodon<<endl;
1163 for (ik
=0; ik
<com
.nkappa
; ik
++)
1166 com
.omega
=x
[1+com
.nkappa
];
1168 EigenQc(0,-1,NULL
,NULL
,NULL
, Root
, U
, V
, pkappa
, com
.omega
, PMat
);
1171 for(k
=0; k
<n
; k
++) {
1172 expt
[k
] = exp(x
[0]*Root
[k
]);
1175 //com.npatt = number of patterns
1176 for (h
=0; h
<com
.npatt
; h
++) {
1178 if(com
.fpatt
[h
]<Small_Diff
) {
1185 for(k
=0,fh
=0;k
<n
;k
++) {
1186 fh
+= U
[z0
*n
+k
]*expt
[k
]*V
[k
*n
+z1
];
1191 lnL1
-=log(fh
)*com
.fpatt
[h
];
1197 /* Main function to calculate Ka and Ks, called by "Run" */
1198 int GY94::PairwiseCodon(double space
[]) {
1201 int npatt0
=com
.npatt
;
1202 double *fpatt0
, ls0
=com
.ls
;
1203 double fp
[CODON
*CODON
];
1204 int n
=com
.ncode
, j
,k
,h
;
1205 double x
[10]={.3,1,.5,.5,.5,.5,.3}, xb
[10][2]={{1e-6,3.}};
1206 double kappab
[2]={.01,30}, omegab
[2]={0.001, 50.};
1207 double e
=1e-6, dS
,dN
;
1208 double *pkappa
=com
.KAPPA
;
1210 //fpatt0=(double*)malloc(npatt0*3*sizeof(double));
1211 fpatt0
= new double[npatt0
*3];
1212 for(k
=0; k
<com
.ns
; k
++) pz0
[k
]=com
.z
[k
];
1213 com
.z
[0]=(char*)(fpatt0
+npatt0
);
1214 com
.z
[1]=com
.z
[0]+npatt0
;
1216 copyArray(com
.fpatt
, fpatt0
, npatt0
);
1219 if ((snp
/length
) > 1e-6) xb
[0][0] = 3*snp
/length
;
1222 for(j
=0; j
<com
.nkappa
; j
++) {
1223 xb
[1+j
][0]=kappab
[0];
1224 xb
[1+j
][1]=kappab
[1];
1228 xb
[k
][0] = omegab
[0];
1229 xb
[k
][1] = omegab
[1];
1231 //fp: codon -> codon frequency
1232 initArray(fp
, CODON
*CODON
);
1233 for(h
=0; h
<npatt0
; h
++) {
1234 j
= max2(pz0
[0][h
], pz0
[1][h
]);
1235 k
= min2(pz0
[0][h
], pz0
[1][h
]);
1236 fp
[j
*n
+k
] += fpatt0
[h
];
1240 for(j
=0,com
.npatt
=0;j
<n
;j
++) {
1243 com
.z
[0][com
.npatt
]=(char)j
;
1244 com
.z
[1][com
.npatt
]=(char)k
;
1245 com
.fpatt
[com
.npatt
++]=fp
[j
*n
+k
];
1249 //com.pi: sense codon's frequencies
1250 for(j
=0,initArray(com
.pi
,n
); j
<com
.npatt
; j
++) {
1251 com
.pi
[(int)com
.z
[0][j
]]+=com
.fpatt
[j
]/(2.*com
.ls
);
1252 com
.pi
[(int)com
.z
[1][j
]]+=com
.fpatt
[j
]/(2.*com
.ls
);
1255 GetCodonFreqs(com
.pi
);
1257 /* initial values and bounds */
1266 for(j
=0; j
<com
.nkappa
; j
++)
1267 x
[1+j
]=.2+.4*rndu();
1271 x
[k
]=(3*x
[k
]+0.6*rndu())/4;
1272 x
[k
]=max2(x
[k
],0.01);
1275 if ((snp
/length
) > 1e-6) x
[0] = 3.0*snp
/length
;
1277 ming2(&lnL
, x
, xb
, space
, e
, com
.np
);
1279 EigenQc(1, x
[0], &S
, &dS
, &dN
, NULL
, NULL
, NULL
, pkappa
, com
.omega
, PMat
);
1286 double scale
=(Sd
+Nd
)/snp
;
1291 //AICc = -2log(lnL) + 2K + 2K(K+1)/(n-K-1), K=parameters' number, n=sample size
1292 AICc
= -2*lnL
+ 2.*(com
.nkappa
+2)*(length
/3)/((length
/3)-(com
.nkappa
+2)-1.);
1295 //kappa = com.kappa;
1297 parseSubRates(model
, pkappa
);
1298 copyArray(pkappa
, KAPPA
, NUMBER_OF_RATES
);
1303 for(k
=0; k
<com
.ns
; k
++) com
.z
[k
]=pz0
[k
];
1306 for(h
=0; h
<npatt0
; h
++) com
.fpatt
[h
]=fpatt0
[h
];
1313 void GY94::FreeMemPUVR(void){
1317 /* Main fuction for GY method */
1318 string
GY94::Run(const char *seq1
, const char *seq2
) {
1325 preProcess(seq1
, seq2
);
1327 com
.sspace
= max2(800000, 3*com
.ncode
*com
.ncode
*(int)sizeof(double));
1328 //com.space = (double*)realloc(com.space, com.sspace);
1329 com
.space
= new double[com
.sspace
];
1331 PairwiseCodon(com
.space
);
1335 for(i
=0; i
<com
.ns
; i
++) delete []com
.z
[i
];
1341 return parseOutput();