1 /************************************************************
2 * Copyright (C) 2005, BGI of Chinese Academy of Sciences
6 * Abstract: Definition of base class for KaKs methods.
9 * Author: Zhang Zhang (zhanghzhang@genomics.org.cn)
12 *************************************************************/
17 /******** Global variables ********/
21 http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
22 Last update of the Genetic Codes: October 05, 2000 */
23 int genetic_code
=1; //from 1 to 23
24 /* Genetic standard codon table, !=stop codon */
25 const char* transl_table
[] = {
26 "FFLLSSSSYY!!CC!WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "1-Standard Code",
27 "FFLLSSSSYY!!CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS!!VVVVAAAADDEEGGGG", "2-Vertebrate Mitochondrial Code",
28 "FFLLSSSSYY!!CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "3-Yeast Mitochondrial Code",
29 "FFLLSSSSYY!!CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "4-Mold Mitochondrial Code",
30 "FFLLSSSSYY!!CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG", "5-Invertebrate Mitochondrial Code",
31 "FFLLSSSSYYQQCC!WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "6-Ciliate, Dasycladacean and Hexamita Code",
34 "FFLLSSSSYY!!CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", "9-Echinoderm and Flatworm Mitochondrial Code",
35 "FFLLSSSSYY!!CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "10-Euplotid Nuclear Code",
36 "FFLLSSSSYY!!CC!WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "11-Bacterial and Plant Plastid Code",
37 "FFLLSSSSYY!!CC!WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "12-Alternative Yeast Nuclear Code",
38 "FFLLSSSSYY!!CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG", "13-Ascidian Mitochondrial Code",
39 "FFLLSSSSYYY!CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", "14-Alternative Flatworm Mitochondrial Code",
40 "FFLLSSSSYY!QCC!WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "15-Blepharisma Nuclear Code",
41 "FFLLSSSSYY!LCC!WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "16-Chlorophycean Mitochondrial Code",
46 "FFLLSSSSYY!!CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG", "21-Trematode Mitochondrial Code",
47 "FFLLSS!SYY!LCC!WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "22-Scenedesmus obliquus mitochondrial Code",
48 "FF!LSSSSYY!!CC!WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", "23-Thraustochytrium Mitochondrial Code"
51 string seq_name
; //sequences' name
52 unsigned long length
; //sequences' length
53 double GC
[4]; //GC Content
54 //********End of Global variables**********
56 //Constructor function
61 Si
[i
] = Vi
[i
] = L
[i
] = NULL
;
63 for (i
=0; i
<NUMBER_OF_RATES
; i
++) {
67 SEKa
= SEKs
= AICc
= lnL
= AkaikeWeight
= NA
;
68 Ka
= Ks
= Sd
= Nd
= S
= N
= snp
= t
= kappa
= NULL
;
73 /********************************************
75 * Input Parameter: string, string, string
76 * Output: result = result + str + flag
78 * Note: flag = "\t" (default) or "\n"
79 *********************************************/
80 void Base::addString(string
&result
, string str
, string flag
) {
85 /**********************************************************************
86 * Function: getAminoAcid
87 * Input Parameter: codon or codon's id
88 * Output: Calculate the amino acid according to codon or codon's id.
90 ***********************************************************************/
91 char Base::getAminoAcid(string codon
) {
92 return transl_table
[2*(genetic_code
-1)][getID(codon
)];
94 char Base::getAminoAcid(int id
) {
95 return transl_table
[2*(genetic_code
-1)][id
];
98 /**********************************
99 * Function: getNumNonsense
100 * Input Parameter: int
101 * Output: get the number of nonsense codons
103 ***********************************/
104 int Base::getNumNonsense(int genetic_code
) {
107 for(num
=i
=0; i
<CODON
; i
++) {
108 if(getAminoAcid(i
)=='!') num
++;
114 /********************************************
116 * Input Parameter: codon
117 * Output: Get codon's id in array of codon_table.
119 *********************************************/
120 int Base::getID(string codon
) {
121 return (convertChar(codon
[0])*XSIZE
+ convertChar(codon
[1])*DNASIZE
+ convertChar(codon
[2]));
124 /********************************************
126 * Input Parameter: int
127 * Output: Get the codon according to id;
128 a reverse funtion of getID.
129 * Return Value: string
130 *********************************************/
131 string
Base::getCodon(int IDcodon
) {
133 string codon
= "TTT";
135 if (IDcodon
>=0 && IDcodon
<64) {
136 codon
[0]=convertInt(IDcodon
/16);
137 codon
[1]=convertInt((IDcodon
%16)/4);
138 codon
[2]=convertInt(IDcodon
%4);
144 /*********************************************
145 * Function: convertChar
146 * Input Parameter: ch as char
147 * Output: Convert a char-T,C,A,G into a digit
148 * 0,1,2,3, respectively.
150 **********************************************/
151 int Base::convertChar(char ch
) {
170 /********************************************
171 * Function: convertInt
172 * Input Parameter: int
173 * Output: Convert a digit- 0,1,2,3 into a
174 * char-T,C,A,G, respectively.
176 *********************************************/
177 char Base::convertInt(int i
) {
196 /********************************************
197 * Function: stringtoUpper
198 * Input Parameter: string
199 * Output: upper string
200 * Return Value: string
201 *********************************************/
202 string
Base::stringtoUpper(string str
) {
204 for(j
=0; str
[j
] = toupper(str
[j
]), j
<str
.length(); j
++);
208 /********************************************
209 * Function: getRandom
210 * Input Parameter: void
211 * Output: Generate a radnom integer
213 *********************************************/
214 int Base::getRandom() {
215 srand((unsigned)time(NULL
));
219 /********************************************
220 * Function: initArray
221 * Input Parameter: array of int/double, int, int/double(default=0)
222 * Output: Init the array x[0...n-1]=value
224 *********************************************/
225 int Base::initArray(double x
[], int n
, double value
) {
227 for(i
=0; i
<n
; i
++) x
[i
] = value
;
231 int Base::initArray(int x
[], int n
, int value
) {
233 for(i
=0; i
<n
; i
++) x
[i
] = value
;
237 /********************************************
239 * Input Parameter: double/int, int, int(default=0)
240 * Output: Sum of array x[]
241 * Return Value: double/int
242 *********************************************/
243 double Base::sumArray(double x
[], int end
, int begin
) {
246 for(i
=begin
; i
<end
; sum
+= x
[i
], i
++);
250 int Base::sumArray(int x
[], int end
, int begin
) {
252 for(i
=begin
; i
<end
; sum
+= x
[i
], i
++);
256 /********************************************
258 * Input Parameter: array of double, int
259 * Output: Sqrt of the sum of the elements' square
260 sqrt(x0*x0 + x1*x1 + ...)
261 * Return Value: double
262 *********************************************/
263 double Base::norm(double x
[], int n
) {
267 for(i
=0; i
<n
; t
+= square(x
[i
]), i
++);
272 /********************************************
273 * Function: scaleArray
274 * Input Parameter: double, array of double, int
275 * Output: Elements in array are mutipled by scale
277 *********************************************/
278 int Base::scaleArray(double scale
, double x
[], int n
) {
280 for (i
=0; i
<n
; i
++) x
[i
] *= scale
;
285 /********************************************
286 * Function: copyArray
287 * Input Parameter: array, array, int
288 * Output: Copy array's values one by one: to[] = from[]
290 *********************************************/
291 int Base::copyArray(double from
[], double to
[], int n
) {
293 for(i
=0; i
<n
; i
++) to
[i
] = from
[i
];
298 /********************************************
300 * Input Parameter: array, array, int
301 * Output: Sum of 'n' products multiplied by
302 two elements in x[], y[].
304 *********************************************/
305 double Base::innerp(double x
[], double y
[], int n
) {
310 for(i
=0; i
<n
; t
+= x
[i
]*y
[i
], i
++);
315 /********************************************
316 * Function: initIdentityMatrix
317 * Input Parameter: array of double, int
318 * Output: Set x[i,j]=0 when x!=j and
321 *********************************************/
322 int Base::initIdentityMatrix(double x
[], int n
) {
326 for (i
=0; i
<n
; i
++) {
327 for(j
=0; j
<n
; x
[i
*n
+j
]=0, j
++);
335 /************************************************
336 * Function: writeFile
337 * Input Parameter: string, string
338 * Output: Write content into the given file.
339 * Return Value: True if succeed, otherwise false.
340 *************************************************/
341 bool Base::writeFile(string output_filename
, const char* result
) {
346 if (output_filename
!="" && output_filename
.length()>0) {
347 ofstream
os(output_filename
.c_str());
348 if (!os
.is_open()) throw 1;
355 cout
<<"Error in writing to file..."<<endl
;
363 /*****************************************************
364 * Function: parseOutput
365 * Input Parameter: void
366 * Output: Parse estimated results for outputing
367 * Return Value: string
369 Order: "Sequence", "Method", "Ka", "Ks", "Ka/Ks",
370 "P-Value(Fisher)", "Length", "S-Sites", "N-Sites", "Fold-Sites(0:2:4)",
371 "Substitutions", "S-Substitutions", "N-Substitutions", "Fold-S-Substitutions(0:2:4)", "Fold-N-Substitutions(0:2:4)",
372 "Divergence-Time", "Substitution-Rate-Ratio(rTC:rAG:rTA:rCG:rTG:rCA/rCA)", "GC(1:2:3)", "ML-Score", "AICc",
374 ******************************************************/
375 string
Base::parseOutput() {
378 string result
= "", tmp
;
381 addString(result
, seq_name
);
383 addString(result
, name
);
390 tmp
= CONVERT
<string
>(Ka
);
392 addString(result
, tmp
);
399 tmp
= CONVERT
<string
>(Ks
);
401 addString(result
, tmp
);
404 if(Ks
<SMALLVALUE
|| Ks
==NA
|| Ka
==NA
) {
408 tmp
= CONVERT
<string
>(Ka
/Ks
);
410 addString(result
, tmp
);
412 //Fisher's test: p_value
413 if (Sd
<SMALLVALUE
|| Nd
<SMALLVALUE
|| S
<SMALLVALUE
|| N
<SMALLVALUE
)
416 tmp
= CONVERT
<string
>(fisher(Sd
,Nd
,S
-Sd
,N
-Nd
));
418 addString(result
, tmp
);
420 //Length of compared pairwise sequences
421 addString(result
, CONVERT
<string
>(length
));
423 //Synonymous(S) sites
428 tmp
= CONVERT
<string
>(S
);
430 addString(result
, tmp
);
432 //Nonsynonymous(N) sites
437 tmp
= CONVERT
<string
>(N
);
439 addString(result
, tmp
);
441 //L[0], L[2], L[4] only for Prof.Li's series(LWL85, LPB93...)
442 if (L
[0]<SMALLVALUE
&& L
[2]<SMALLVALUE
&& L
[4]<SMALLVALUE
) {
446 tmp
= CONVERT
<string
>(L
[0]); tmp
+= ":";
447 tmp
+= CONVERT
<string
>(L
[2]); tmp
+= ":";
448 tmp
+= CONVERT
<string
>(L
[4]);
450 addString(result
, tmp
);
454 addString(result
, CONVERT
<string
>(snp
));
456 //Sysnonymous(Sd) Substitutions(Nd)
458 tmp
= CONVERT
<string
>(Sd
);
463 addString(result
, tmp
);
465 //Nonsysnonymous Substitutions(Nd)
467 tmp
= CONVERT
<string
>(Nd
);
472 addString(result
, tmp
);
474 //Si for Li's series' methods(LWL85, LPB93...)
475 if (Si
[0]!=NULL
|| Si
[2]!=NULL
|| Si
[4]!=NULL
) { //Si[0], Si[2], Si[4]
476 tmp
= CONVERT
<string
>(Si
[0]); tmp
+= ":";
477 tmp
+= CONVERT
<string
>(Si
[2]); tmp
+= ":";
478 tmp
+= CONVERT
<string
>(Si
[4]);
483 addString(result
, tmp
);
485 //Vi for Li's series' methods(LWL85, LPB93...)
486 if (Vi
[0]!=NULL
|| Vi
[2]!=NULL
|| Vi
[4]!=NULL
) { //Vi[0], Vi[2], Vi[4]
487 tmp
= CONVERT
<string
>(Vi
[0]); tmp
+= ":";
488 tmp
+= CONVERT
<string
>(Vi
[2]); tmp
+= ":";
489 tmp
+= CONVERT
<string
>(Vi
[4]);
494 addString(result
, tmp
);
497 //Divergence time or distance t = (S*Ks+N*Ka)/(S+N)
502 tmp
= CONVERT
<string
>(t
);
504 addString(result
, tmp
);
506 //Substitution-Rate-Ratio(rTC:rAG:rTA:rCG:rTG:rCA/rCA)
507 for(i
=0, tmp
=""; i
<NUMBER_OF_RATES
-1; i
++) {
508 tmp
+= CONVERT
<string
>(KAPPA
[i
]);
511 tmp
+= CONVERT
<string
>(KAPPA
[i
]);
512 addString(result
, tmp
);
515 tmp
= CONVERT
<string
>(GC
[0]); tmp
+= "(";
516 tmp
+= CONVERT
<string
>(GC
[1]); tmp
+= ":";
517 tmp
+= CONVERT
<string
>(GC
[2]); tmp
+= ":";
518 tmp
+= CONVERT
<string
>(GC
[3]); tmp
+= ")";
519 addString(result
, tmp
);
521 //Maximum Likelihood Value
522 if (lnL
==NA
) tmp
= "NA";
523 else tmp
= CONVERT
<string
>(lnL
);
524 addString(result
, tmp
);
527 if (AICc
==NA
) tmp
= "NA";
528 else tmp
= CONVERT
<string
>(AICc
);
529 addString(result
, tmp
);
531 //Akaike weight in model selection
532 if (AkaikeWeight
==NA
) tmp
= "NA";
533 else tmp
= CONVERT
<string
>(AkaikeWeight
);
534 addString(result
, tmp
);
536 //Selected Model according to AICc
537 if (model
==""||model
.length()==0) tmp
= "NA";
539 addString(result
, tmp
, "\n");
543 if (SEKa==NA) tmp = "NA";
544 else tmp = CONVERT<string>(SEKa);
545 addString(result, tmp, "\t");
547 if (SEKs==NA) tmp = "NA";
548 else tmp = CONVERT<string>(SEKs);
549 addString(result, tmp, "\n");
555 /**************************************************
557 * Input Parameter: double, double, double, double
558 * Output: Compute p-value by Fisher exact test
559 * Return Value: double
560 ***************************************************/
561 double Base::fisher(double sd
, double nd
, double s
, double n
) {
562 double denominator
, numerator
, prob_total
, prob_current
, sum
, fac_sum
;
563 double matrix
[4], R
[2], C
[2];
566 denominator
= numerator
= prob_total
= prob_current
= sum
= fac_sum
= 0.0;
568 matrix
[0]=sd
; matrix
[1]=s
; matrix
[2]=nd
; matrix
[3]=n
;
570 R
[0]=matrix
[0]+matrix
[2]; R
[1]=matrix
[1]+matrix
[3];
571 C
[0]=matrix
[0]+matrix
[1]; C
[1]=matrix
[2]+matrix
[3];
574 //Calculate the numberator that is a constant
575 numerator
+= factorial(R
[0]);
576 numerator
+= factorial(R
[1]);
577 numerator
+= factorial(C
[0]);
578 numerator
+= factorial(C
[1]);
580 //Log of Factorial of N
581 fac_sum
= factorial(sum
);
582 for(i
=0, denominator
=fac_sum
; i
<4; i
++) {
583 denominator
+= factorial(matrix
[i
]);
585 //Probability of current situtation
586 prob_current
= exp(numerator
-denominator
);
588 //Two-tail probabilities if less than prob_current
589 for(i
=0; i
<=R
[0]; i
++) {
593 matrix
[3] = R
[1]-C
[0]+i
;
594 if (matrix
[0]>=0 && matrix
[1]>=0 && matrix
[2]>=0 && matrix
[3]>=0) {
595 for(j
=0, denominator
=fac_sum
; j
<4; j
++) {
596 denominator
+= factorial(matrix
[j
]);
598 double temp
= numerator
-denominator
;
599 temp
= exp(numerator
-denominator
);
600 if (temp
<=prob_current
) {
609 /**************************************************
610 * Function: factorial
612 * Output: Compute the factorial of 'n', then return
614 * Return Value: double
615 ***************************************************/
616 double Base::factorial(double n
) {
621 x
+= 0.1659470187408462e-06/(n
+7);
622 x
+= 0.9934937113930748e-05/(n
+6);
623 x
-= 0.1385710331296526 /(n
+5);
624 x
+= 12.50734324009056 /(n
+4);
625 x
-= 176.6150291498386 /(n
+3);
626 x
+= 771.3234287757674 /(n
+2);
627 x
-= 1259.139216722289 /(n
+1);
628 x
+= 676.5203681218835 /(n
);
629 x
+= 0.9999999999995183;
630 temp
= log(x
) - 5.58106146679532777 - n
+(n
- 0.5)*log(n
+ 6.5);
639 int matby (double a[], double b[], double c[], int n,int m,int k)
640 // a[n*m], b[m*k], c[n*k] ...... c = a*b
646 for (i1=0,t=0; i1<m; i1++) t+=a[i*m+i1]*b[i1*k+j];
656 void PMatrixTaylor(double P[], double n, double t) {
658 // Get approximate PMat using polynomial approximation
659 // P(t) = I + Qt + (Qt)^2/2 + (Qt)^3/3!
661 int nterms=1000, i,j,k, c[2],ndiff,pos=0,from[3],to[3];
662 double *Q=space, *Q1=Q+n*n, *Q2=Q1+n*n, mr, div;
665 for (i=0; i<n; i++) FOR (j,i) {
666 from[0]=i/16; from[1]=(i/4)%4; from[2]=i%4;
667 to[0]=j/16; to[1]=(j/4)%4; to[2]=j%4;
668 c[0]=GenetCode[com.icode][i];
669 c[1]=GenetCode[com.icode][j];
670 if (c[0]==-1 || c[1]==-1) continue;
671 for (k=0,ndiff=0; k<3; k++) if (from[k]!=to[k]) { ndiff++; pos=k; }
672 if (ndiff!=1) continue;
674 if ((from[pos]+to[pos]-1)*(from[pos]+to[pos]-5)==0) Q[i*n+j]*=kappa;
675 if(c[0]!=c[1]) Q[i*n+j]*=omega;
678 FOR(i,n) FOR(j,n) Q[i*n+j]*=pi[j];
679 for (i=0,mr=0;i<n;i++) {
680 Q[i*n+i]=-sum(Q+i*n,n); mr-=pi[i]*Q[i*n+i];
682 FOR(i,n*n) Q[i]*=t/mr;
684 xtoy(Q,P,n*n); FOR(i,n) P[i*n+i]++; // I + Qt
686 for (i=2,div=2;i<nterms;i++,div*=i) { // k is divisor
687 matby(Q, Q1, Q2, n, n, n);
688 for(j=0,mr=0;j<n*n;j++) { P[j]+=Q2[j]/div; mr+=fabs(Q2[j]); }
690 // if(debug) printf("Pmat term %d: norm=%.6e\n", i, mr);
695 FOR(i,n*n) if(P[i]<0) P[i]=0;