modified: diffout.py
[GalaxyCodeBases.git] / c_cpp / lib / htslib / probaln.c
blob5d617819b321b87806e2f953e0212215c6f20d0e
1 /* The MIT License
3 Copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3lh3@live.co.uk>
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
26 #include <config.h>
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <stdint.h>
32 #include <math.h>
33 #include "htslib/hts.h"
35 /*****************************************
36 * Probabilistic banded glocal alignment *
37 *****************************************/
39 #define EI .25
40 #define EM .33333333333
42 static float g_qual2prob[256];
44 #define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; }
47 The topology of the profile HMM:
49 /\ /\ /\ /\
50 I[1] I[k-1] I[k] I[L]
51 ^ \ \ ^ \ ^ \ \ ^
52 | \ \ | \ | \ \ |
53 M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1]
54 \ \/ \/ \/ /
55 \ /\ /\ /\ /
56 -> D[k-1] -> D[k] ->
58 M[0] points to every {M,I}[k] and every {M,I}[k] points to M[L+1].
60 On input, _ref is the reference sequence and _query is the query
61 sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an
62 ambiguous residue. iqual is the base quality. c sets the gap open
63 probability, gap extension probability and band width.
65 On output, state and q are arrays of length l_query. The higher 30
66 bits give the reference position the query base is matched to and the
67 lower two bits can be 0 (an alignment match) or 1 (an
68 insertion). q[i] gives the phred scaled posterior probability of
69 state[i] being wrong.
71 int probaln_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query,
72 const uint8_t *iqual, const probaln_par_t *c, int *state, uint8_t *q)
74 double *f, *b = 0, *s, m[9], sI, sM, bI, bM;
75 float *qual, *_qual;
76 const uint8_t *ref, *query;
77 int bw, bw2, i, k, is_backward = 1, Pr;
79 if ( l_ref<=0 || l_query<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents sefgault
81 /*** initialization ***/
82 is_backward = state && q? 1 : 0;
83 ref = _ref - 1; query = _query - 1; // change to 1-based coordinate
84 bw = l_ref > l_query? l_ref : l_query;
85 if (bw > c->bw) bw = c->bw;
86 if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query);
87 bw2 = bw * 2 + 1;
88 int i_dim = bw2*3+6;
89 // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
90 // Ideally these callocs would be mallocs + initialisation of the few bits needed.
91 f = calloc((l_query+1)*i_dim, sizeof(double));
92 if (is_backward) b = calloc((l_query+1)*i_dim, sizeof(double));
93 s = malloc((l_query+2) * sizeof(double)); // s[] is the scaling factor to avoid underflow
94 // initialize qual
95 _qual = malloc(l_query * sizeof(float));
96 if (g_qual2prob[0] == 0)
97 for (i = 0; i < 256; ++i)
98 g_qual2prob[i] = pow(10, -i/10.);
99 for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30];
100 qual = _qual - 1;
101 // initialize transition probability
102 sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof
103 m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM);
104 m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.;
105 m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e;
106 bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1
107 /*** forward ***/
108 // f[0]
109 set_u(k, bw, 0, 0);
110 f[0*i_dim+k] = s[0] = 1.;
111 { // f[1]
112 double *fi = &f[1*i_dim], sum;
113 int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
114 for (k = beg, sum = 0.; k <= end; ++k) {
115 int u;
116 double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
117 set_u(u, bw, 1, k);
118 fi[u+0] = e * bM; fi[u+1] = EI * bI;
119 sum += fi[u] + fi[u+1];
121 s[1] = sum;
123 // f[2..l_query]
124 for (i = 2; i <= l_query; ++i) {
125 double *fi = &f[i*i_dim], *fi1 = &f[(i-1)*i_dim], sum, qli = qual[i];
126 int beg = 1, end = l_ref, x;
127 uint8_t qyi = query[i];
128 x = i - bw; beg = beg > x? beg : x; // band start
129 x = i + bw; end = end < x? end : x; // band end
130 double E[] = {
131 qli * EM, // 00
132 1. - qli, // 01
133 1., // 10
134 1., // 11
136 double M = 1./s[i-1];
137 for (k = beg, sum = 0.; k <= end; ++k) {
138 int u, v11, v01, v10;
139 double e;
140 e = E[(ref[k] > 3 || qyi > 3)*2 + (ref[k] == qyi)];
141 set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1);
142 fi[u+0] = e * (m[0] * M*fi1[v11+0] + m[3] * M*fi1[v11+1] + m[6] * M*fi1[v11+2]);
143 fi[u+1] = EI * (m[1] * M*fi1[v10+0] + m[4] * M*fi1[v10+1]);
144 fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
145 sum += fi[u] + fi[u+1] + fi[u+2];
146 // fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
148 s[i] = sum;
150 { // f[l_query+1]
151 double sum;
152 double M = 1./s[l_query];
153 for (k = 1, sum = 0.; k <= l_ref; ++k) {
154 int u;
155 set_u(u, bw, l_query, k);
156 if (u < 3 || u >= bw2*3+3) continue;
157 sum += M*f[l_query*i_dim + u+0] * sM + M*f[l_query*i_dim + u+1] * sI;
159 s[l_query+1] = sum; // the last scaling factor
161 { // compute likelihood
162 double p = 1., Pr1 = 0.;
163 for (i = 0; i <= l_query + 1; ++i) {
164 p *= s[i];
165 if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.;
167 Pr1 += -4.343 * log(p * l_ref * l_query);
168 Pr = (int)(Pr1 + .499);
169 if (!is_backward) { // skip backward and MAP
170 free(f); free(s); free(_qual);
171 return Pr;
174 /*** backward ***/
175 // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
176 for (k = 1; k <= l_ref; ++k) {
177 int u;
178 double *bi = &b[l_query*i_dim];
179 set_u(u, bw, l_query, k);
180 if (u < 3 || u >= bw2*3+3) continue;
181 bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
183 // b[l_query-1..1]
184 for (i = l_query - 1; i >= 1; --i) {
185 int beg = 1, end = l_ref, x, _beg, _end;
186 double *bi = &b[i*i_dim], *bi1 = &b[(i+1)*i_dim], y = (i > 1), qli1 = qual[i+1];
187 uint8_t qyi1 = query[i+1];
188 x = i - bw; beg = beg > x? beg : x;
189 x = i + bw; end = end < x? end : x;
190 double E[] = {
191 qli1 * EM, //000
192 1. - qli1, //001
193 1., //010
194 1., //011
195 //0,0,0,0 //1xx
197 for (k = end; k >= beg; --k) {
198 int u, v11, v01, v10;
199 double e;
200 set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1);
201 e = (k>=l_ref)?0 :E[(ref[k+1] > 3 || qyi1 > 3)*2 + (ref[k+1] == qyi1)] * bi1[v11];
202 bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e.
203 bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1];
204 bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
205 // fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
207 // rescale
208 set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
209 for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
211 { // b[0]
212 int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
213 double sum = 0.;
214 for (k = end; k >= beg; --k) {
215 int u;
216 double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
217 set_u(u, bw, 1, k);
218 if (u < 3 || u >= bw2*3+3) continue;
219 sum += e * b[1*i_dim + u+0] * bM + EI * b[1*i_dim + u+1] * bI;
221 set_u(k, bw, 0, 0);
222 b[0*i_dim + k] = sum / s[0]; // if everything works as is expected, b[0][k] == 1.0
224 /*** MAP ***/
225 for (i = 1; i <= l_query; ++i) {
226 double sum = 0., *fi = &f[i*i_dim], *bi = &b[i*i_dim], max = 0.;
227 int beg = 1, end = l_ref, x, max_k = -1;
228 x = i - bw; beg = beg > x? beg : x;
229 x = i + bw; end = end < x? end : x;
230 double M = 1./s[i];
231 for (k = beg; k <= end; ++k) {
232 int u;
233 double z;
234 set_u(u, bw, i, k);
235 z = M*fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z;
236 z = M*fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z;
238 max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
239 if (state) state[i-1] = max_k;
240 if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k;
241 #ifdef _MAIN
242 k = 0;
243 set_u(k, bw, 0, 0);
244 fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", b[0][k], sum, i-1, max_k>>2,
245 "ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG
246 #endif
248 /*** free ***/
249 free(f); free(b); free(s); free(_qual);
250 return Pr;
253 #ifdef _MAIN
254 #include <unistd.h>
255 int main(int argc, char *argv[])
257 uint8_t conv[256], *iqual, *ref, *query;
258 probaln_par_t par = { 0.001, 0.1, 10 };
259 int c, l_ref, l_query, i, q = 30, b = 10, P;
260 while ((c = getopt(argc, argv, "b:q:")) >= 0) {
261 switch (c) {
262 case 'b': b = atoi(optarg); break;
263 case 'q': q = atoi(optarg); break;
266 if (optind + 2 > argc) {
267 fprintf(stderr, "Usage: %s [-q %d] [-b %d] <ref> <query>\n", argv[0], q, b); // example: acttc attc
268 return 1;
270 memset(conv, 4, 256);
271 conv['a'] = conv['A'] = 0; conv['c'] = conv['C'] = 1;
272 conv['g'] = conv['G'] = 2; conv['t'] = conv['T'] = 3;
273 ref = (uint8_t*)argv[optind]; query = (uint8_t*)argv[optind+1];
274 l_ref = strlen((char*)ref); l_query = strlen((char*)query);
275 for (i = 0; i < l_ref; ++i) ref[i] = conv[ref[i]];
276 for (i = 0; i < l_query; ++i) query[i] = conv[query[i]];
277 iqual = malloc(l_query);
278 memset(iqual, q, l_query);
279 par.bw = b;
280 P = probaln_glocal(ref, l_ref, query, l_query, iqual, &par, 0, 0);
281 fprintf(stderr, "%d\n", P);
282 free(iqual);
283 return 0;
285 #endif