3 Copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3lh3@live.co.uk>
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 #include "htslib/hts.h"
35 /*****************************************
36 * Probabilistic banded glocal alignment *
37 *****************************************/
40 #define EM .33333333333
42 static float g_qual2prob
[256];
44 #define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; }
47 The topology of the profile HMM:
53 M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1]
58 M[0] points to every {M,I}[k] and every {M,I}[k] points to M[L+1].
60 On input, _ref is the reference sequence and _query is the query
61 sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an
62 ambiguous residue. iqual is the base quality. c sets the gap open
63 probability, gap extension probability and band width.
65 On output, state and q are arrays of length l_query. The higher 30
66 bits give the reference position the query base is matched to and the
67 lower two bits can be 0 (an alignment match) or 1 (an
68 insertion). q[i] gives the phred scaled posterior probability of
71 int probaln_glocal(const uint8_t *_ref
, int l_ref
, const uint8_t *_query
, int l_query
,
72 const uint8_t *iqual
, const probaln_par_t
*c
, int *state
, uint8_t *q
)
74 double *f
, *b
= 0, *s
, m
[9], sI
, sM
, bI
, bM
;
76 const uint8_t *ref
, *query
;
77 int bw
, bw2
, i
, k
, is_backward
= 1, Pr
;
79 if ( l_ref
<=0 || l_query
<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents sefgault
81 /*** initialization ***/
82 is_backward
= state
&& q
? 1 : 0;
83 ref
= _ref
- 1; query
= _query
- 1; // change to 1-based coordinate
84 bw
= l_ref
> l_query
? l_ref
: l_query
;
85 if (bw
> c
->bw
) bw
= c
->bw
;
86 if (bw
< abs(l_ref
- l_query
)) bw
= abs(l_ref
- l_query
);
89 // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
90 // Ideally these callocs would be mallocs + initialisation of the few bits needed.
91 f
= calloc((l_query
+1)*i_dim
, sizeof(double));
92 if (is_backward
) b
= calloc((l_query
+1)*i_dim
, sizeof(double));
93 s
= malloc((l_query
+2) * sizeof(double)); // s[] is the scaling factor to avoid underflow
95 _qual
= malloc(l_query
* sizeof(float));
96 if (g_qual2prob
[0] == 0)
97 for (i
= 0; i
< 256; ++i
)
98 g_qual2prob
[i
] = pow(10, -i
/10.);
99 for (i
= 0; i
< l_query
; ++i
) _qual
[i
] = g_qual2prob
[iqual
? iqual
[i
] : 30];
101 // initialize transition probability
102 sM
= sI
= 1. / (2 * l_query
+ 2); // the value here seems not to affect results; FIXME: need proof
103 m
[0*3+0] = (1 - c
->d
- c
->d
) * (1 - sM
); m
[0*3+1] = m
[0*3+2] = c
->d
* (1 - sM
);
104 m
[1*3+0] = (1 - c
->e
) * (1 - sI
); m
[1*3+1] = c
->e
* (1 - sI
); m
[1*3+2] = 0.;
105 m
[2*3+0] = 1 - c
->e
; m
[2*3+1] = 0.; m
[2*3+2] = c
->e
;
106 bM
= (1 - c
->d
) / l_ref
; bI
= c
->d
/ l_ref
; // (bM+bI)*l_ref==1
110 f
[0*i_dim
+k
] = s
[0] = 1.;
112 double *fi
= &f
[1*i_dim
], sum
;
113 int beg
= 1, end
= l_ref
< bw
+ 1? l_ref
: bw
+ 1;
114 for (k
= beg
, sum
= 0.; k
<= end
; ++k
) {
116 double e
= (ref
[k
] > 3 || query
[1] > 3)? 1. : ref
[k
] == query
[1]? 1. - qual
[1] : qual
[1] * EM
;
118 fi
[u
+0] = e
* bM
; fi
[u
+1] = EI
* bI
;
119 sum
+= fi
[u
] + fi
[u
+1];
124 for (i
= 2; i
<= l_query
; ++i
) {
125 double *fi
= &f
[i
*i_dim
], *fi1
= &f
[(i
-1)*i_dim
], sum
, qli
= qual
[i
];
126 int beg
= 1, end
= l_ref
, x
;
127 uint8_t qyi
= query
[i
];
128 x
= i
- bw
; beg
= beg
> x
? beg
: x
; // band start
129 x
= i
+ bw
; end
= end
< x
? end
: x
; // band end
136 double M
= 1./s
[i
-1];
137 for (k
= beg
, sum
= 0.; k
<= end
; ++k
) {
138 int u
, v11
, v01
, v10
;
140 e
= E
[(ref
[k
] > 3 || qyi
> 3)*2 + (ref
[k
] == qyi
)];
141 set_u(u
, bw
, i
, k
); set_u(v11
, bw
, i
-1, k
-1); set_u(v10
, bw
, i
-1, k
); set_u(v01
, bw
, i
, k
-1);
142 fi
[u
+0] = e
* (m
[0] * M
*fi1
[v11
+0] + m
[3] * M
*fi1
[v11
+1] + m
[6] * M
*fi1
[v11
+2]);
143 fi
[u
+1] = EI
* (m
[1] * M
*fi1
[v10
+0] + m
[4] * M
*fi1
[v10
+1]);
144 fi
[u
+2] = m
[2] * fi
[v01
+0] + m
[8] * fi
[v01
+2];
145 sum
+= fi
[u
] + fi
[u
+1] + fi
[u
+2];
146 // fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
152 double M
= 1./s
[l_query
];
153 for (k
= 1, sum
= 0.; k
<= l_ref
; ++k
) {
155 set_u(u
, bw
, l_query
, k
);
156 if (u
< 3 || u
>= bw2
*3+3) continue;
157 sum
+= M
*f
[l_query
*i_dim
+ u
+0] * sM
+ M
*f
[l_query
*i_dim
+ u
+1] * sI
;
159 s
[l_query
+1] = sum
; // the last scaling factor
161 { // compute likelihood
162 double p
= 1., Pr1
= 0.;
163 for (i
= 0; i
<= l_query
+ 1; ++i
) {
165 if (p
< 1e-100) Pr1
+= -4.343 * log(p
), p
= 1.;
167 Pr1
+= -4.343 * log(p
* l_ref
* l_query
);
168 Pr
= (int)(Pr1
+ .499);
169 if (!is_backward
) { // skip backward and MAP
170 free(f
); free(s
); free(_qual
);
175 // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
176 for (k
= 1; k
<= l_ref
; ++k
) {
178 double *bi
= &b
[l_query
*i_dim
];
179 set_u(u
, bw
, l_query
, k
);
180 if (u
< 3 || u
>= bw2
*3+3) continue;
181 bi
[u
+0] = sM
/ s
[l_query
] / s
[l_query
+1]; bi
[u
+1] = sI
/ s
[l_query
] / s
[l_query
+1];
184 for (i
= l_query
- 1; i
>= 1; --i
) {
185 int beg
= 1, end
= l_ref
, x
, _beg
, _end
;
186 double *bi
= &b
[i
*i_dim
], *bi1
= &b
[(i
+1)*i_dim
], y
= (i
> 1), qli1
= qual
[i
+1];
187 uint8_t qyi1
= query
[i
+1];
188 x
= i
- bw
; beg
= beg
> x
? beg
: x
;
189 x
= i
+ bw
; end
= end
< x
? end
: x
;
197 for (k
= end
; k
>= beg
; --k
) {
198 int u
, v11
, v01
, v10
;
200 set_u(u
, bw
, i
, k
); set_u(v11
, bw
, i
+1, k
+1); set_u(v10
, bw
, i
+1, k
); set_u(v01
, bw
, i
, k
+1);
201 e
= (k
>=l_ref
)?0 :E
[(ref
[k
+1] > 3 || qyi1
> 3)*2 + (ref
[k
+1] == qyi1
)] * bi1
[v11
];
202 bi
[u
+0] = e
* m
[0] + EI
* m
[1] * bi1
[v10
+1] + m
[2] * bi
[v01
+2]; // bi1[v11] has been foled into e.
203 bi
[u
+1] = e
* m
[3] + EI
* m
[4] * bi1
[v10
+1];
204 bi
[u
+2] = (e
* m
[6] + m
[8] * bi
[v01
+2]) * y
;
205 // fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
208 set_u(_beg
, bw
, i
, beg
); set_u(_end
, bw
, i
, end
); _end
+= 2;
209 for (k
= _beg
, y
= 1./s
[i
]; k
<= _end
; ++k
) bi
[k
] *= y
;
212 int beg
= 1, end
= l_ref
< bw
+ 1? l_ref
: bw
+ 1;
214 for (k
= end
; k
>= beg
; --k
) {
216 double e
= (ref
[k
] > 3 || query
[1] > 3)? 1. : ref
[k
] == query
[1]? 1. - qual
[1] : qual
[1] * EM
;
218 if (u
< 3 || u
>= bw2
*3+3) continue;
219 sum
+= e
* b
[1*i_dim
+ u
+0] * bM
+ EI
* b
[1*i_dim
+ u
+1] * bI
;
222 b
[0*i_dim
+ k
] = sum
/ s
[0]; // if everything works as is expected, b[0][k] == 1.0
225 for (i
= 1; i
<= l_query
; ++i
) {
226 double sum
= 0., *fi
= &f
[i
*i_dim
], *bi
= &b
[i
*i_dim
], max
= 0.;
227 int beg
= 1, end
= l_ref
, x
, max_k
= -1;
228 x
= i
- bw
; beg
= beg
> x
? beg
: x
;
229 x
= i
+ bw
; end
= end
< x
? end
: x
;
231 for (k
= beg
; k
<= end
; ++k
) {
235 z
= M
*fi
[u
+0] * bi
[u
+0]; if (z
> max
) max
= z
, max_k
= (k
-1)<<2 | 0; sum
+= z
;
236 z
= M
*fi
[u
+1] * bi
[u
+1]; if (z
> max
) max
= z
, max_k
= (k
-1)<<2 | 1; sum
+= z
;
238 max
/= sum
; sum
*= s
[i
]; // if everything works as is expected, sum == 1.0
239 if (state
) state
[i
-1] = max_k
;
240 if (q
) k
= (int)(-4.343 * log(1. - max
) + .499), q
[i
-1] = k
> 100? 99 : k
;
244 fprintf(stderr
, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", b
[0][k
], sum
, i
-1, max_k
>>2,
245 "ACGT"[query
[i
]], "ACGT"[ref
[(max_k
>>2)+1]], max_k
&3, max
); // DEBUG
249 free(f
); free(b
); free(s
); free(_qual
);
255 int main(int argc
, char *argv
[])
257 uint8_t conv
[256], *iqual
, *ref
, *query
;
258 probaln_par_t par
= { 0.001, 0.1, 10 };
259 int c
, l_ref
, l_query
, i
, q
= 30, b
= 10, P
;
260 while ((c
= getopt(argc
, argv
, "b:q:")) >= 0) {
262 case 'b': b
= atoi(optarg
); break;
263 case 'q': q
= atoi(optarg
); break;
266 if (optind
+ 2 > argc
) {
267 fprintf(stderr
, "Usage: %s [-q %d] [-b %d] <ref> <query>\n", argv
[0], q
, b
); // example: acttc attc
270 memset(conv
, 4, 256);
271 conv
['a'] = conv
['A'] = 0; conv
['c'] = conv
['C'] = 1;
272 conv
['g'] = conv
['G'] = 2; conv
['t'] = conv
['T'] = 3;
273 ref
= (uint8_t*)argv
[optind
]; query
= (uint8_t*)argv
[optind
+1];
274 l_ref
= strlen((char*)ref
); l_query
= strlen((char*)query
);
275 for (i
= 0; i
< l_ref
; ++i
) ref
[i
] = conv
[ref
[i
]];
276 for (i
= 0; i
< l_query
; ++i
) query
[i
] = conv
[query
[i
]];
277 iqual
= malloc(l_query
);
278 memset(iqual
, q
, l_query
);
280 P
= probaln_glocal(ref
, l_ref
, query
, l_query
, iqual
, &par
, 0, 0);
281 fprintf(stderr
, "%d\n", P
);