1 /************************************************************
2 * Copyright (c) 2006, BGI of Chinese Academy of Sciences
6 * Abstract: Definition of model-selected and model-averged methods' classes.
9 * Author: Zhang Zhang (zhanghzhang@genomics.org.cn)
13 Goldman N, Yang Z (1994) A codon-based model of nucleotide
14 substitution for protein-coding DNA sequences. Mol. Biol.
17 Posada, D. and Buckley, T.R. (2004) Model Selection and Model Averaging
18 in Phylogenetics: Advantages of Akaike Information Criterion and Bayesian
19 Approaches over Likelihood Ratio Tests, Syst. Biol., 53, 793-808.
21 Sullivan, J. and Joyce, P. (2005) Model Selection in Phylogenetics,
22 Annual Review of Ecology, Evolution, and Systematics, 36, 445-466.
23 *************************************************************/
30 /* Calculate Ka and Ks based on a given model, similar to the method of GY */
31 void MS::selectModel(const char *seq1
, const char *seq2
, string c_model
, vector
<MLResult
>& result4MA
) {
36 tmp
.result
= zz
.Run(seq1
, seq2
);
38 copyArray(zz
.com
.pi
, tmp
.freq
, (int)CODON
);
39 copyArray(zz
.KAPPA
, tmp
.rate
, (int)NUMBER_OF_RATES
);
43 result4MA
.push_back(tmp
);
46 /* Choose the estimates under a model with smallest AICc */
47 string
MS::Run(const char *seq1
, const char *seq2
, vector
<MLResult
> &result4MA
, string
&details
) {
50 string candidate_models
[] = {"JC", "F81", "K2P", "HKY", "TNEF", "TN", "K3P", "K3PUF", "TIMEF", "TIM", "TVMEF", "TVM", "SYM", "GTR"};
52 //Calculate Ka and Ks using 14 models
53 for (i
=0; i
<MODELCOUNT
; i
++) selectModel(seq1
, seq2
, candidate_models
[i
], result4MA
);
55 //Choose the results under a model with smallest AICc
56 for (pos
=i
=0; i
<result4MA
.size(); i
++) {
57 if (result4MA
[i
].AICc
<result4MA
[pos
].AICc
) pos
= i
;
60 //Calculate the AICc difference, substract the smallest AICc
61 double diff
[MODELCOUNT
];
62 for (i
=0; i
<MODELCOUNT
; i
++) diff
[i
] = result4MA
[i
].AICc
- result4MA
[pos
].AICc
;
64 //Compute Akaike weights
65 double w
[MODELCOUNT
], sum
;
66 initArray(w
, MODELCOUNT
);
67 for (sum
=i
=0; i
<MODELCOUNT
; i
++) {
68 //akaike wights of each model
69 for (j
=0; j
<MODELCOUNT
; j
++) {
70 double power
= -0.5*diff
[j
] - (-0.5*diff
[i
]);
72 if (power
>709) power
= 700;
73 else if (power
<-709) power
= -700;
80 //Add Akaike weights to results
82 for (i
=0; i
<MODELCOUNT
; i
++) {
85 tmp
= result4MA
[i
].result
;
86 j
= tmp
.find_last_of('\t');
87 result4MA
[i
].result
= tmp
.substr(j
, tmp
.length()-j
);
88 tmp
= tmp
.replace(j
, tmp
.length()-j
, "");
89 j
= tmp
.find_last_of('\t');
90 tmp
= tmp
.replace(j
+1, tmp
.length()-j
-1, "") + CONVERT
<string
>(w
[i
]);
91 result4MA
[i
].result
= tmp
+ result4MA
[i
].result
;
93 //Details on model selection
94 details
+= result4MA
[i
].result
;
97 //Results at "pos" is more reliable, replace 'method name' by "MS".
98 tmp
= result4MA
[pos
].result
;
100 j
= tmp
.find('\t', i
+1);
101 result4MA
[pos
].result
= tmp
.substr(0, i
+1) + name
;
102 result4MA
[pos
].result
+= tmp
.substr(j
, tmp
.length()-j
);
104 return result4MA
[pos
].result
;
115 com
.icode
= genetic_code
-1;
116 if (com
.icode
>11) com
.icode
= 0;
117 com
.ncode
= Nsensecodon
= 64 - getNumNonsense(com
.icode
);
120 com
.np
= 2 + com
.nkappa
;
124 string
MA::Run(const char *seq1
, const char *seq2
, vector
<MLResult
> result4MA
) {
128 //Find the smallest AICc
129 for (pos
=0, i
=1; i
<result4MA
.size(); i
++) {
130 if (result4MA
[i
].AICc
<result4MA
[pos
].AICc
) pos
= i
;
133 //Calculate the AICc difference, substract the smallest AICc
134 double diff
[MODELCOUNT
];
135 for (i
=0; i
<MODELCOUNT
; i
++) diff
[i
] = result4MA
[i
].AICc
- result4MA
[pos
].AICc
;
137 //Compute Akake weights
138 double w
[MODELCOUNT
];
139 initArray(w
, MODELCOUNT
);
140 for (i
=0; i
<MODELCOUNT
; i
++) {
142 for (j
=0; j
<MODELCOUNT
; j
++) {
143 double power
= -0.5*diff
[j
] - (-0.5*diff
[i
]);
144 if (power
>709) power
= 700;
145 else if (power
<-709) power
= -700;
151 int I
[MODELCOUNT
][NUMBER_OF_RATES
];
152 for(i
=0; i
<MODELCOUNT
; i
++)
153 for (j
=0; j
<NUMBER_OF_RATES
; j
++)
156 //JC, F81: rTC==rAG =rTA==rCG==rTG==rCA
158 //K2P, HKY: rTC==rAG!=rTA==rCG==rTG==rCA
159 I
[2][0]=1; I
[2][1]=1;
160 I
[3][0]=1; I
[3][1]=1;
161 //TNEF,TN: rTC!=rAG!=rTA==rCG==rTG==rCA
162 I
[4][0]=1; I
[4][1]=1;
163 I
[5][0]=1; I
[5][1]=1;
164 //K3P, K3PUF: rTC==rAG!=rTA==rCG!=rTG==rCA
165 I
[6][0]=1; I
[6][1]=1; I
[6][2]=1; I
[6][3]=1;
166 I
[7][0]=1; I
[7][1]=1; I
[7][2]=1; I
[7][3]=1;
167 //TIMEF, TIM: rTC!=rAG!=rTA==rCG!=rTG==rCA
168 I
[8][0]=1; I
[8][1]=1; I
[8][2]=1; I
[8][3]=1;
169 I
[9][0]=1; I
[9][1]=1; I
[9][2]=1; I
[9][3]=1;
170 //TVMEF, TVM: rTC==rAG!=rTA!=rCG!=rTG!=rCA
171 I
[10][0]=1; I
[10][1]=1; I
[10][2]=1; I
[10][3]=1; I
[10][4]=1;
172 I
[11][0]=1; I
[11][1]=1; I
[11][2]=1; I
[11][3]=1; I
[11][4]=1;
173 //SYM, GTR: rTC!=rAG!=rTA!=rCG!=rTG!=rCA
174 I
[12][0]=1; I
[12][1]=1; I
[12][2]=1; I
[12][3]=1; I
[12][4]=1;
175 I
[13][0]=1; I
[13][1]=1; I
[13][2]=1; I
[13][3]=1; I
[13][4]=1;
182 //Model-averaged time t
183 for(j
=0; j
<MODELCOUNT
;j
++) para
[0] += (w
[j
]*result4MA
[j
].t
);
186 //Model-averaged Substitution Rates
187 double sum
[NUMBER_OF_RATES
];
188 initArray(sum
, NUMBER_OF_RATES
);
189 initArray(com
.KAPPA
, 8);
191 for (i
=0; i
<NUMBER_OF_RATES
-1; i
++) {
192 for(j
=0; j
<MODELCOUNT
;j
++) {
193 com
.KAPPA
[i
] += (w
[j
]*I
[j
][i
]*result4MA
[j
].rate
[i
]);
194 sum
[i
] += (w
[j
]*I
[j
][i
]);
196 if(sum
[i
]<1e-50) com
.KAPPA
[i
]=1;
197 else com
.KAPPA
[i
] /= sum
[i
];
198 para
[i
+1] = com
.KAPPA
[i
];
202 //Model-averaged omega w
203 for(j
=0; j
<MODELCOUNT
;j
++) para
[6] += (w
[j
]*result4MA
[j
].w
);
206 //Model-averaged Codon Frequencies
207 for (i
=0; i
<CODON
; i
++) {
209 for(j
=0; j
<MODELCOUNT
;j
++) com
.pi
[i
] += (w
[j
]*result4MA
[j
].freq
[i
]);
214 preProcess(seq1
, seq2
);
216 //Calculate maximum likelihood score
217 lnL
= lfun2dSdN(para
, com
.np
);
219 //Copy subsitution rates
220 copyArray(com
.KAPPA
, KAPPA
, NUMBER_OF_RATES
);
223 EigenQc(1, para
[0], &S
, &Ks
, &Ka
, NULL
, NULL
, NULL
, KAPPA
, com
.omega
, PMat
);
228 double scale
=(Sd
+Nd
)/snp
;
236 //For the method of Model averaging, parameters' number is not specific.
239 return parseOutput();