modified: SpatialOmicsCoord.py
[GalaxyCodeBases.git] / c_cpp / etc / KaKs_Calculator / src / KaKs.cpp
blobdfd09d519aaf3bbcc48c3e81e4ba4b873734e76e
1 /************************************************************
2 * Copyright (C) 2005, BGI of Chinese Academy of Sciences
3 * All rights reserved.
5 * Filename: KaKs.cpp
6 * Abstract: Declaration of KAKS class including several methods.
8 * Version: 1.0
9 * Author: Zhang Zhang (zhanghzhang@genomics.org.cn)
10 * Date: Feb.2, 2005
12 *************************************************************/
14 #include "KaKs.h"
16 KAKS::KAKS() {
18 string items[] = {"Sequence", "Method", "Ka", "Ks", "Ka/Ks",
19 "P-Value(Fisher)", "Length", "S-Sites", "N-Sites", "Fold-Sites(0:2:4)",
20 "Substitutions", "S-Substitutions", "N-Substitutions", "Fold-S-Substitutions(0:2:4)", "Fold-N-Substitutions(0:2:4)",
21 "Divergence-Time", "Substitution-Rate-Ratio(rTC:rAG:rTA:rCG:rTG:rCA/rCA)", "GC(1:2:3)", "ML-Score", "AICc",
22 "Akaike-Weight", "Model"}; //21 outputing items
24 //Format of outputing parameters
25 int i;
26 for (i=0; i<sizeof(items)/sizeof(string); i++) titleInfo.push_back(items[i]);
28 //Load Methods' Names and References for -h in linux, also for windows' tool tip
29 method_name.push_back("NG"); method_ref.push_back("Nei, M. and Gojobori, T. (1986)");
30 method_name.push_back("LWL"); method_ref.push_back( "Li, W.H., Wu, C.I. and Luo, C.C. (1985)");
31 method_name.push_back("LPB"); method_ref.push_back("Li, W.H. (1993) and Pamilo, P. and Bianchi, N.O. (1993)");
32 method_name.push_back("MLWL"); method_ref.push_back("Tzeng, Y.H., Pan, R. and Li, W.H. (2004)");
33 method_name.push_back("MLPB"); method_ref.push_back("Tzeng, Y.H., Pan, R. and Li, W.H. (2004)");
34 method_name.push_back("GY"); method_ref.push_back("Goldman, N. and Yang, Z. (1994)");
35 method_name.push_back("YN"); method_ref.push_back("Yang, Z. and Nielsen, R. (2000)");
36 method_name.push_back("MYN"); method_ref.push_back("Zhang Z., Jun L. and Jun Y. (2006)");
37 method_name.push_back("MS"); method_ref.push_back("Model Selection according to the AICc");
38 method_name.push_back("MA"); method_ref.push_back("Model Averaging on a set of candidate models");
40 method_ref.clear();
42 method_ref.push_back("Nei, M. and Gojobori, T. (1986) Mol. Biol. Evol., 3, 418-426.");
43 method_ref.push_back("Li, W.H., Wu, C.I. and Luo, C.C. (1985) Mol. Biol. Evol., 2, 150-174.");
44 method_ref.push_back("Li, W.H. (1993) J. Mol. Evol., 36, 96-99. Pamilo, P. and Bianchi, N.O. (1993) Mol. Biol. Evol., 10, 271-281.");
45 method_ref.push_back("Tzeng, Y.H., Pan, R. and Li, W.H. (2004) Mol. Biol. Evol., 21, 2290-2298.");
46 method_ref.push_back("Tzeng, Y.H., Pan, R. and Li, W.H. (2004) Mol. Biol. Evol., 21, 2290-2298.");
47 method_ref.push_back("Goldman, N. and Yang, Z. (1994) Mol. Biol. Evol., 11, 725-736.");
48 method_ref.push_back("Yang, Z. and Nielsen, R. (2000) Mol. Biol. Evol., 17, 32-43.");
49 method_ref.push_back("Zhang, Z., Li, J. and Yu, J. (2006) BMC Evolutionary Biology, 6, 44.");
50 method_ref.push_back("Model Selection according to the AICc");
51 method_ref.push_back("Model Averaging on a set of candidate models");
53 Initialize();
58 KAKS::~KAKS() {
60 Uninitialize();
62 //Free memory
63 titleInfo.clear();
64 method_name.clear();
65 method_ref.clear();
68 int KAKS::Initialize() {
70 none = ng86 = lpb93 = lwl85 = mlwl85 = mlpb93 = yn00 = gy94 = myn = ms = ma = false;
71 result4Win = result = seq_name = seq1 = seq2 = "";
72 seq_filename = output_filename = detail_filename = "";
73 result = details = "";
74 genetic_code = 1;
75 number = 0;
77 return 1;
80 int KAKS::Uninitialize() {
82 if (os.is_open()) {
83 os.close();
86 return 1;
89 /* Get GCC of entire sequences GC[0] and of three codon positions GC[1,2,3] */
90 void KAKS::getGCContent(string str) {
91 int i, j;
93 initArray(GC, 4);
94 for(i=0; i<str.length(); i+=3) {
95 string codon = str.substr(i, 3);
96 for(j=0; j<3; j++) {
97 if(codon[j]=='G' || codon[j]=='C') GC[j+1]++;
101 GC[0] = sumArray(GC, 4, 1) / str.length();
102 for(i=1; i<4; i++) GC[i] /= (str.length()/3);
107 /****************************************************
108 * Function: ReadCalculateSeq
109 * Input Parameter: string
110 * Output: Read sequences, check sequences' validity
111 and calculate Ka and Ks.
112 * Return Value: True if succeed, otherwise false.
114 * Note: Using axt file for low memory
115 *****************************************************/
116 bool KAKS::ReadCalculateSeq(string filename) {
118 bool flag = true;
120 try {
122 ifstream is(filename.c_str());
123 if (!is) {
124 cout<<"Error in opening file..."<<endl;
125 throw 1;
128 showParaInfo(); //Show information on display
130 result = getTitleInfo();
132 //Output stream
133 if (output_filename!="" && output_filename.length()>0) {
134 os.open(output_filename.c_str());
137 string temp="", name="", str="";
139 while (getline(is, temp, '\n')) {
141 name = temp;
143 getline(is, temp, '\n');
144 while (temp!="") {
145 str += temp;
146 getline(is, temp, '\n');
149 //Get GCC at three codon positions
150 getGCContent(str);
151 //Check str's validility and calculate
152 if (checkValid(name, str.substr(0, str.length()/2), str.substr(str.length()/2, str.length()/2))) {
153 cout<<++number<<" "<<name<<"\t";
154 if(!calculateKaKs()) throw 1; //calculate Ka&Ks using selected method(s)
155 cout<<"[OK]"<<endl;
157 name = str = "";
159 is.close();
160 is.clear();
163 catch (...) {
164 flag = false;
167 return flag;
170 /**************************************************
171 * Function: checkValid
172 * Input Parameter: string, string, string
173 * Output: Check validity of pairwise sequences
174 * Return Value: True if succeed, otherwise false.
175 ***************************************************/
176 bool KAKS::checkValid(string name, string str1, string str2) {
177 bool flag = true;
178 long i;
180 try {
181 //Check whether (sequence length)/3==0
182 if (str1.length()!=str1.length() || str1.length()%3!=0 || str2.length()%3!=0) {
183 cout<<endl<<"Error. The size of two sequences in "<<"'"<<name<<"' is not equal."<<endl;
184 throw 1;
187 //Delete gap and stop codon
188 bool found;
189 int j;
190 for(i=0; i<str1.length(); i+=3) {
191 for(found=false, j=0; j<3 && !found; j++) {
192 if (str1[j+i]=='-' || str2[j+i]=='-') {
193 found = true;
195 str1[i+j] = toupper(str1[i+j]);
196 str2[i+j] = toupper(str2[i+j]);
197 if (convertChar(str1[i+j])==-1 || convertChar(str2[i+j])==-1) {
198 found = true;
202 if ((getAminoAcid(str1.substr(i,3))=='!') || (getAminoAcid(str2.substr(i,3))=='!')) {
203 found = true;
206 if (found) {
207 str1 = str1.replace(i, 3, "");
208 str2 = str2.replace(i, 3, "");
209 i -= 3;
213 //pass value into private variables
214 seq1 = str1;
215 seq2 = str2;
216 //pass value into extern variables
217 seq_name = name;
218 length = str1.length();
220 catch (...) {
221 flag = false;
224 return flag;
227 /**************************************************
228 * Function: Run
229 * Input Parameter: int, const char* []
230 * Output: Calculate Ka/Ks and output.
231 * Return Value: void
232 ***************************************************/
233 bool KAKS::Run(int argc, const char* argv[]) {
235 bool flag=true;
237 try {
238 //Judge whether input parameters are legal
239 if (!parseParameter(argc, argv)) {
240 throw 1;
243 //Record the start time
244 static time_t time_start = time(NULL);
246 //Read sequences and calculate Ka & Ks
247 if (!ReadCalculateSeq(seq_filename)) {
248 throw 1;
251 //Write details for model-selected method
252 if(!writeFile(detail_filename, (getTitleInfo()+details).c_str())) {
253 throw 1;
256 //Time used for running
257 time_t t = time(NULL)-time_start;
258 int h=t/3600, m=(t%3600)/60, s=t-(t/60)*60;
260 //Print on display
261 cout<<"Mission accomplished. (Time elapsed: ";
262 if(h) cout<<h<<":"<<m<<":"<<s<<")"<<endl;
263 else cout<<m<<":"<<s<<")"<<endl;
265 catch (...) {
266 flag = false;
269 return flag;
272 /**************************************************
273 * Function: parseParameter
274 * Input Parameter: int, const char* []
275 * Output: Parse the input parameters
276 * Return Value: bool
277 ***************************************************/
278 bool KAKS::parseParameter(int argc, const char* argv[]) {
280 bool flag = true;
281 int i;
282 string temp;
284 try {
286 //No parameter
287 if (argc==1) {
288 if (seq_filename=="") throw 1;
289 //else for only one parameter - seq_filename
291 else if (argc==2) {//help information
292 temp=argv[1];
293 if(temp=="-H"||temp=="-h") {
294 helpInfo();
295 flag = false;
297 else {
298 throw 1;
301 else {
302 //parse parameters
303 int inputflag=0, outputflag=0, codeflag=0;
304 for (i=1; i<argc; i++) {
306 temp = stringtoUpper(argv[i]);
307 //Input axt file
308 if (temp=="-I") {
309 if ((i+1)<argc && inputflag==0) {
310 seq_filename = argv[++i];
311 inputflag++;
313 else {
314 throw 1;
317 //Output file
318 else if (temp=="-O") {
319 if ((i+1)<argc && outputflag==0) {
320 output_filename = argv[++i];
321 outputflag++;
323 else {
324 throw 1;
327 //Genetic Code Table
328 else if (temp=="-C") {
329 if ((i+1)<argc && codeflag==0) {
330 genetic_code = CONVERT<int>(argv[++i]);
331 if (genetic_code<1 || genetic_code>NCODE || strlen(transl_table[2*(genetic_code-1)])<1)
332 throw 1;
333 codeflag++;
335 else {
336 throw 1;
339 //Details for Model Selection
340 else if (temp=="-D") {
341 if ((i+1)>argc) throw 1;
342 detail_filename = argv[++i];
345 //Algorithm(s) selected
346 else if (temp=="-M") {
347 if ((i+1)>argc) throw 1;
348 temp = stringtoUpper(argv[++i]);
349 if (temp=="NONE") none = true;
350 else if(temp=="NG") ng86 = true;
351 else if(temp=="LWL") lwl85 = true;
352 else if(temp=="LPB") lpb93 = true;
353 else if(temp=="MLPB") mlpb93 = true;
354 else if(temp=="MLWL") mlwl85 = true;
355 else if(temp=="GY") gy94 = true;
356 else if(temp=="YN") yn00 = true;
357 else if(temp=="MYN") myn = true;
358 else if(temp=="MS") ms = true;
359 else if(temp=="MA") ma = true;
360 else if(temp=="ALL") {
361 ng86 = lpb93 = lwl85 = mlwl85 = mlpb93 = gy94 = yn00 = myn = ms = ma = true;
363 else throw 1;
365 else throw 1;
368 //If no input or output file, report error
369 if(inputflag==0 || outputflag==0) throw 1;
371 //Default: use ma to to calculate Ka and Ks
372 if (!(none+ng86+lpb93+lwl85+mlwl85+mlpb93+gy94+yn00+myn+ms+ma)) {
373 ma = true;
377 catch (...) {
378 cout<<"Input parameter(s) error."<<endl;
379 cout<<"For help information: Kaks_Calculator -h"<<endl;
380 flag = false;
383 return flag;
387 /*******************************************************
388 * Function: Calculate
389 * Input Parameter: void
390 * Output: Calculate kaks and output results.
391 * Return Value: void
393 * Note:
394 ********************************************************/
395 bool KAKS::calculateKaKs() {
397 bool flag = true;
399 try {
401 //Estimate Ka and Ks
402 if (none) start_NONE();
403 if (ng86) start_NG86();
404 if (lwl85) start_LWL85();
405 if (mlwl85) start_MLWL85();
406 if (lpb93) start_LPB93();
407 if (mlpb93) start_MLPB93();
408 if (gy94) start_GY94();
409 if (yn00) start_YN00();
410 if (myn) start_MYN();
411 if (ms||ma) start_MSMA();
413 result4Win += result;
414 //Write into the file
415 if (output_filename.length()>0 && os.is_open()) {
416 os<<result;
417 os.flush();
419 result = "";
421 catch (...) {
422 flag = false;
425 return flag;
428 //NONE: NG without correction for multiple substitution
429 void KAKS::start_NONE() {
431 NONE zz;
432 result += zz.Run(seq1, seq2);
435 //NG
436 void KAKS::start_NG86() {
438 NG86 zz;
439 result += zz.Run(seq1, seq2);
442 //LWL
443 void KAKS::start_LWL85() {
445 LWL85 zz;
446 result += zz.Run(seq1, seq2);
449 //MLWL
450 void KAKS::start_MLWL85() {
452 MLWL85 zz;
453 result += zz.Run(seq1, seq2);
456 //LPB
457 void KAKS::start_LPB93() {
459 LPB93 zz;
460 result += zz.Run(seq1, seq2);
463 //MLPB
464 void KAKS::start_MLPB93() {
466 MLPB93 zz;
467 result += zz.Run(seq1, seq2);
470 //GY
471 void KAKS::start_GY94() {
473 GY94 zz("HKY");
474 result += zz.Run(seq1.c_str(), seq2.c_str());
477 //YN
478 void KAKS::start_YN00() {
480 YN00 zz;
481 result += zz.Run(seq1, seq2);
484 //MYN
485 void KAKS::start_MYN() {
487 MYN zz;
488 result += zz.Run(seq1, seq2);
491 /************************************************
492 * Function: start_MSMA
493 * Input Parameter: void
494 * Output: Calculate Ka and Ks using the method of
495 model selection or model averaging.
496 * Return Value: void
497 *************************************************/
498 void KAKS::start_MSMA() {
500 vector<MLResult> result4MA; //generated by MS and used by MA
502 //Model Selection
503 MS zz1;
504 string tmp = zz1.Run(seq1.c_str(), seq2.c_str(), result4MA, details);
505 if (ms) {
506 result += tmp;
509 //Model Averaging
510 if (ma) {
511 MA zz2;
512 result += zz2.Run(seq1.c_str(), seq2.c_str(), result4MA);
516 /************************************************
517 * Function: showParaInfo
518 * Input Parameter:
519 * Output: print on display
520 * Return Value:
521 *************************************************/
522 void KAKS::showParaInfo() {
524 cout<<"Method(s): ";
525 if (none) cout<<"NONE"<<" ";
526 if (ng86) cout<<"NG"<<" ";
527 if (lwl85) cout<<"LWL"<<" ";
528 if (mlwl85) cout<<"MLWL"<<" ";
529 if (lpb93) cout<<"LPB"<<" ";
530 if (mlpb93) cout<<"MLPB"<<" ";
531 if (gy94) cout<<"GY"<<" ";
532 if (yn00) cout<<"YN"<<" ";
533 if (myn) cout<<"MYN"<<" ";
534 if (ms) cout<<"MS"<<" ";
535 if (ma) cout<<"MA";
537 cout<<endl<<"Genetic code: "<<transl_table[2*(genetic_code-1)+1]<<endl;
538 cout<<"Please wait while reading sequences and calculating..."<<endl;
541 /************************************************
542 * Function: getTitleInfo
543 * Input Parameter:
544 * Output: get title information of outputing file
545 * Return Value: string
546 *************************************************/
547 string KAKS::getTitleInfo() {
549 string title = "";
550 int i=0;
552 if (titleInfo.size()>0) {
553 //Add "\t" to items except the last one
554 for (; i<titleInfo.size()-1; i++) addString(title, titleInfo[i]);
555 //The last item is added by "\n"
556 addString(title, titleInfo[i], "\n");
559 return title;
562 /***********************************
563 * Function: helpInfo
564 * Input Parameter:
565 * Output: Display help infomation.
566 * Return Value: void
567 ************************************/
568 void KAKS::helpInfo() {
569 int i,j;
571 cout<<endl;
572 cout<<"******************************************************************"<<endl;
573 cout<<" Program: KaKs_Calculator"<<endl;
574 cout<<" Version: "<<VERSION<<endl;
575 cout<<" Description: Calculate Ka and Ks through model selection and model averaging."<<endl;
576 cout<<"******************************************************************"<<endl;
577 cout<<endl;
579 cout<<"Usage: Kaks_Calculator <options>"<<endl;
580 cout<<"\t-i\tAxt file name for calculating Ka & Ks."<<endl;
581 cout<<"\t-o\tOutput file name for saving results."<<endl;
582 cout<<"\t-c\tGenetic code table (Default = 1-Standard Code)."<<endl;
583 for(i=0,j=0; i<NNCODE; i+=2) {
584 if(strlen(transl_table[i])>0) {
585 cout<<"\t\t "<<transl_table[i+1]<<"\t";
586 j++;
588 if (j==2) {
589 cout<<endl;
590 j = 0;
593 cout<<endl;
594 cout<<"\t\t (More information about the Genetic Codes: http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c)"<<endl;
596 cout<<"\t-m\tMethods for estimating Ka and Ks and theirs references(Default = MA)"<<endl;
597 for (i=0; i<method_name.size(); i++) {
598 cout<<"\t\t "<<method_name[i].c_str();
599 cout<<"\t\t"<<method_ref[i].c_str()<<endl;
601 cout<<"\t\t ALL(including all above methods)"<<endl;
603 cout<<"\t-d\tFile name for details about each candidate model when using the method of MS"<<endl;
604 cout<<endl;
606 cout<<"Example:"<<endl;
607 cout<<"\tKaKs_Calculator -i test.axt -o test.axt.kaks\t//use MA method based on a more suitable model and standard code"<<endl;
608 cout<<"\tKaKs_Calculator -i test.axt -o test.axt.kaks -c 2\t//use MA method and vertebrate mitochondrial code"<<endl;
609 cout<<"\tKaKs_Calculator -i test.axt -o test.axt.kaks -m LWL -m MYN\t//use LWL and MYN methods, and standard Code"<<endl;
611 cout<<endl;
612 cout<<"For help information: KaKs_Calculator -h"<<endl;
613 cout<<"Please send bugs or advice to Zhang Zhang at zhangzhang@genomics.org.cn."<<endl;
614 cout<<endl;