modified: SpatialOmicsCoord.py
[GalaxyCodeBases.git] / c_cpp / etc / KaKs_Calculator / src / MSMA.cpp
blob2dd732dfa3e8aee5453111df3b6c471306d438f6
1 /************************************************************
2 * Copyright (c) 2006, BGI of Chinese Academy of Sciences
3 * All rights reserved.
5 * Filename: MSMA.cpp
6 * Abstract: Definition of model-selected and model-averged methods' classes.
8 * Version: 1.0
9 * Author: Zhang Zhang (zhanghzhang@genomics.org.cn)
10 * Date: Apr. 2006
12 References:
13 Goldman N, Yang Z (1994) A codon-based model of nucleotide
14 substitution for protein-coding DNA sequences. Mol. Biol.
15 Evol. 11:725-736.
17 Posada, D. and Buckley, T.R. (2004) Model Selection and Model Averaging
18 in Phylogenetics: Advantages of Akaike Information Criterion and Bayesian
19 Approaches over Likelihood Ratio Tests, Syst. Biol., 53, 793-808.
21 Sullivan, J. and Joyce, P. (2005) Model Selection in Phylogenetics,
22 Annual Review of Ecology, Evolution, and Systematics, 36, 445-466.
23 *************************************************************/
24 #include "MSMA.h"
26 MS::MS() {
27 name = "MS";
30 /* Calculate Ka and Ks based on a given model, similar to the method of GY */
31 void MS::selectModel(const char *seq1, const char *seq2, string c_model, vector<MLResult>& result4MA) {
33 MLResult tmp;
34 GY94 zz(c_model);
36 tmp.result = zz.Run(seq1, seq2);
37 tmp.AICc = zz.AICc;
38 copyArray(zz.com.pi, tmp.freq, (int)CODON);
39 copyArray(zz.KAPPA, tmp.rate, (int)NUMBER_OF_RATES);
40 tmp.w = zz.com.omega;
41 tmp.t = 3.*zz.t;
43 result4MA.push_back(tmp);
46 /* Choose the estimates under a model with smallest AICc */
47 string MS::Run(const char *seq1, const char *seq2, vector<MLResult> &result4MA, string &details) {
49 int i, j, pos;
50 string candidate_models[] = {"JC", "F81", "K2P", "HKY", "TNEF", "TN", "K3P", "K3PUF", "TIMEF", "TIM", "TVMEF", "TVM", "SYM", "GTR"};
52 //Calculate Ka and Ks using 14 models
53 for (i=0; i<MODELCOUNT; i++) selectModel(seq1, seq2, candidate_models[i], result4MA);
55 //Choose the results under a model with smallest AICc
56 for (pos=i=0; i<result4MA.size(); i++) {
57 if (result4MA[i].AICc<result4MA[pos].AICc) pos = i;
60 //Calculate the AICc difference, substract the smallest AICc
61 double diff[MODELCOUNT];
62 for (i=0; i<MODELCOUNT; i++) diff[i] = result4MA[i].AICc - result4MA[pos].AICc;
64 //Compute Akaike weights
65 double w[MODELCOUNT], sum;
66 initArray(w, MODELCOUNT);
67 for (sum=i=0; i<MODELCOUNT; i++) {
68 //akaike wights of each model
69 for (j=0; j<MODELCOUNT; j++) {
70 double power = -0.5*diff[j] - (-0.5*diff[i]);
71 //Avoid overflow
72 if (power>709) power = 700;
73 else if (power<-709) power = -700;
74 //Normal
75 w[i] += exp(power);
77 w[i] = 1./w[i];
80 //Add Akaike weights to results
81 string tmp="";
82 for (i=0; i<MODELCOUNT; i++) {
84 //Add akaike weights
85 tmp = result4MA[i].result;
86 j = tmp.find_last_of('\t');
87 result4MA[i].result = tmp.substr(j, tmp.length()-j);
88 tmp = tmp.replace(j, tmp.length()-j, "");
89 j = tmp.find_last_of('\t');
90 tmp = tmp.replace(j+1, tmp.length()-j-1, "") + CONVERT<string>(w[i]);
91 result4MA[i].result = tmp + result4MA[i].result;
93 //Details on model selection
94 details += result4MA[i].result;
97 //Results at "pos" is more reliable, replace 'method name' by "MS".
98 tmp = result4MA[pos].result;
99 i = tmp.find('\t');
100 j = tmp.find('\t', i+1);
101 result4MA[pos].result = tmp.substr(0, i+1) + name;
102 result4MA[pos].result += tmp.substr(j, tmp.length()-j);
104 return result4MA[pos].result;
108 MA::MA() {
109 name = "MA";
110 Small_Diff=1e-6;
111 w_rndu=123456757;
112 com.ns = 2;
113 lnL = 0.0;
115 com.icode = genetic_code-1;
116 if (com.icode>11) com.icode = 0;
117 com.ncode = Nsensecodon = 64 - getNumNonsense(com.icode);
119 com.nkappa = 5;
120 com.np = 2 + com.nkappa;
124 string MA::Run(const char *seq1, const char *seq2, vector<MLResult> result4MA) {
126 int i, j, pos;
128 //Find the smallest AICc
129 for (pos=0, i=1; i<result4MA.size(); i++) {
130 if (result4MA[i].AICc<result4MA[pos].AICc) pos = i;
133 //Calculate the AICc difference, substract the smallest AICc
134 double diff[MODELCOUNT];
135 for (i=0; i<MODELCOUNT; i++) diff[i] = result4MA[i].AICc - result4MA[pos].AICc;
137 //Compute Akake weights
138 double w[MODELCOUNT];
139 initArray(w, MODELCOUNT);
140 for (i=0; i<MODELCOUNT; i++) {
141 //Avoid overflow
142 for (j=0; j<MODELCOUNT; j++) {
143 double power = -0.5*diff[j] - (-0.5*diff[i]);
144 if (power>709) power = 700;
145 else if (power<-709) power = -700;
146 w[i] += exp(power);
148 w[i] = 1./w[i];
151 int I[MODELCOUNT][NUMBER_OF_RATES];
152 for(i=0; i<MODELCOUNT; i++)
153 for (j=0; j<NUMBER_OF_RATES; j++)
154 I[i][j]=0;
156 //JC, F81: rTC==rAG =rTA==rCG==rTG==rCA
158 //K2P, HKY: rTC==rAG!=rTA==rCG==rTG==rCA
159 I[2][0]=1; I[2][1]=1;
160 I[3][0]=1; I[3][1]=1;
161 //TNEF,TN: rTC!=rAG!=rTA==rCG==rTG==rCA
162 I[4][0]=1; I[4][1]=1;
163 I[5][0]=1; I[5][1]=1;
164 //K3P, K3PUF: rTC==rAG!=rTA==rCG!=rTG==rCA
165 I[6][0]=1; I[6][1]=1; I[6][2]=1; I[6][3]=1;
166 I[7][0]=1; I[7][1]=1; I[7][2]=1; I[7][3]=1;
167 //TIMEF, TIM: rTC!=rAG!=rTA==rCG!=rTG==rCA
168 I[8][0]=1; I[8][1]=1; I[8][2]=1; I[8][3]=1;
169 I[9][0]=1; I[9][1]=1; I[9][2]=1; I[9][3]=1;
170 //TVMEF, TVM: rTC==rAG!=rTA!=rCG!=rTG!=rCA
171 I[10][0]=1; I[10][1]=1; I[10][2]=1; I[10][3]=1; I[10][4]=1;
172 I[11][0]=1; I[11][1]=1; I[11][2]=1; I[11][3]=1; I[11][4]=1;
173 //SYM, GTR: rTC!=rAG!=rTA!=rCG!=rTG!=rCA
174 I[12][0]=1; I[12][1]=1; I[12][2]=1; I[12][3]=1; I[12][4]=1;
175 I[13][0]=1; I[13][1]=1; I[13][2]=1; I[13][3]=1; I[13][4]=1;
178 //Average parameters
179 double para[8];
180 initArray(para, 8);
182 //Model-averaged time t
183 for(j=0; j<MODELCOUNT;j++) para[0] += (w[j]*result4MA[j].t);
184 //para[0] /= sum;
186 //Model-averaged Substitution Rates
187 double sum[NUMBER_OF_RATES];
188 initArray(sum, NUMBER_OF_RATES);
189 initArray(com.KAPPA, 8);
191 for (i=0; i<NUMBER_OF_RATES-1; i++) {
192 for(j=0; j<MODELCOUNT;j++) {
193 com.KAPPA[i] += (w[j]*I[j][i]*result4MA[j].rate[i]);
194 sum[i] += (w[j]*I[j][i]);
196 if(sum[i]<1e-50) com.KAPPA[i]=1;
197 else com.KAPPA[i] /= sum[i];
198 para[i+1] = com.KAPPA[i];
201 para[6] = 0;
202 //Model-averaged omega w
203 for(j=0; j<MODELCOUNT;j++) para[6] += (w[j]*result4MA[j].w);
204 //para[6] /= sum;
206 //Model-averaged Codon Frequencies
207 for (i=0; i<CODON; i++) {
208 com.pi[i] = 0.0;
209 for(j=0; j<MODELCOUNT;j++) com.pi[i] += (w[j]*result4MA[j].freq[i]);
210 //com.pi[i] /= sum;
213 /* Preprocess */
214 preProcess(seq1, seq2);
216 //Calculate maximum likelihood score
217 lnL = lfun2dSdN(para, com.np);
219 //Copy subsitution rates
220 copyArray(com.KAPPA, KAPPA, NUMBER_OF_RATES);
222 //Compute Ka and Ks
223 EigenQc(1, para[0], &S, &Ks, &Ka, NULL, NULL, NULL, KAPPA, com.omega, PMat);
225 N = com.ls*3-S;
226 Sd = Ks*S;
227 Nd = Ka*N;
228 double scale=(Sd+Nd)/snp;
229 Sd /= scale;
230 Nd /= scale;
232 lnL = -lnL;
234 t = para[0]/3;
236 //For the method of Model averaging, parameters' number is not specific.
237 AICc = NA;
239 return parseOutput();