# cython: language_level=3
# cython: profile=True
# Time-stamp: <2020-12-04 18:41:28 Tao Liu>
5 """Module for SAPPER BAMParser class
7 Copyright (c) 2017 Tao Liu <tliu4@buffalo.edu>
9 This code is free software; you can redistribute it and/or modify it
10 under the terms of the BSD License (see the file COPYING included
11 with the distribution).
13 @status: experimental
14 @version: $Revision$
15 @author: Tao Liu
16 @contact: tliu4@buffalo.edu
17 """

# ------------------------------------
# python modules
# ------------------------------------
from cpython cimport bool

cimport cython

import numpy as np
cimport numpy as np

ctypedef np.float32_t float32_t
ctypedef np.int32_t int32_t

#from libc.math cimport log10, log, exp, M_LN10 #,fabs,log1p
#from libc.math cimport M_LN10
from math import log1p, exp, log

LN10 = 2.3025850929940458
LN10_tenth = 0.23025850929940458
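
# LN10 is ln(10) and LN10_tenth is ln(10)/10, used to turn Phred-scaled base
# qualities into error rates without log10/pow calls.  A minimal worked
# example of the conversion (plain Python, illustrative values only):
#
#   from math import exp
#   phred = 30                                     # a "Q30" base call
#   error = exp( -phred * 0.23025850929940458 )    # = 10**(-30/10) = 0.001
#   p_correct = 1 - error                          # = 0.999
#
# i.e. a Q30 base is assumed to be read correctly with probability ~0.999.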

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
cpdef tuple CalModel_Homo( np.ndarray[int32_t, ndim=1] top1_bq_T, np.ndarray[int32_t, ndim=1] top1_bq_C, np.ndarray[int32_t, ndim=1] top2_bq_T, np.ndarray[int32_t, ndim=1] top2_bq_C):
    """Return (lnL, BIC) for the homozygous model.
    """
    cdef:
        int i
        double lnL, BIC

    lnL = 0
    # Phred score is Phred = -10*log_{10} E, where E is the error rate.
    # So E = exp( -Phred/10 * ln10 ) = exp( -Phred * LN10_tenth ),
    # and 1 - E = 1 - exp( -Phred * LN10_tenth ).
    for i in range( top1_bq_T.shape[0] ):
        lnL += log1p( -exp(-top1_bq_T[ i ]*LN10_tenth) )
    for i in range( top1_bq_C.shape[0] ):
        lnL += log1p( -exp(-top1_bq_C[ i ]*LN10_tenth) )

    for i in range( top2_bq_T.shape[0] ):
        lnL += log( exp(-top2_bq_T[ i ]*LN10_tenth) )
    for i in range( top2_bq_C.shape[0] ):
        lnL += log( exp(-top2_bq_C[ i ]*LN10_tenth) )

    BIC = -2*lnL                # no free variable, no penalty
    return (lnL, BIC)
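
# A hedged usage sketch for CalModel_Homo (array names below are illustrative,
# not part of this module).  The four arrays hold Phred base qualities of reads
# supporting the top1/top2 alleles in treatment (T) and control (C), as int32:
#
#   import numpy as np
#   top1_T = np.array([30]*8, dtype=np.int32)   # 8 treatment reads agree at Q30
#   top2_T = np.array([20],   dtype=np.int32)   # 1 treatment read disagrees at Q20
#   top1_C = np.array([],     dtype=np.int32)   # no control reads
#   top2_C = np.array([],     dtype=np.int32)
#   lnL_homo, BIC_homo = CalModel_Homo(top1_T, top1_C, top2_T, top2_C)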

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
cpdef tuple CalModel_Heter_noAS( np.ndarray[int32_t, ndim=1] top1_bq_T, np.ndarray[int32_t, ndim=1] top1_bq_C, np.ndarray[int32_t, ndim=1] top2_bq_T, np.ndarray[int32_t, ndim=1] top2_bq_C ):
    """Return (lnL, BIC) for the heterozygous model without allele-specific bias.

    k_T and k_C, the estimated numbers of reads carrying the top1 allele in
    treatment and control, are optimized internally and not returned.
    """
    cdef:
        int k_T, k_C
        double lnL, BIC
        int i
        int tn_T, tn_C, tn  # total observed NTs
        double lnL_T, lnL_C # log likelihood for treatment and control

    lnL = 0
    BIC = 0

    # for k_T
    # total observed treatment reads from top1 and top2 NTs
    tn_T = top1_bq_T.shape[0] + top2_bq_T.shape[0]

    if tn_T == 0:
        raise Exception("Total number of treatment reads is 0!")
    else:
        ( lnL_T, k_T ) = GreedyMaxFunctionNoAS( top1_bq_T.shape[0], top2_bq_T.shape[0], tn_T, top1_bq_T, top2_bq_T )
        lnL += lnL_T
        BIC += -2*lnL_T

    # for k_C
    tn_C = top1_bq_C.shape[0] + top2_bq_C.shape[0]

    if tn_C == 0:
        pass
    else:
        ( lnL_C, k_C ) = GreedyMaxFunctionNoAS( top1_bq_C.shape[0], top2_bq_C.shape[0], tn_C, top1_bq_C, top2_bq_C )
        lnL += lnL_C
        BIC += -2*lnL_C

    tn = tn_C + tn_T

    # we penalize the bigger model depending on the number of reads/samples
    if tn_T == 0:
        BIC += log( tn_C )
    elif tn_C == 0:
        BIC += log( tn_T )
    else:
        BIC += log( tn_T ) + log( tn_C )

    return ( lnL, BIC )
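
# A hedged sketch of comparing the homozygous and heterozygous (no
# allele-specific bias) models; variable names are illustrative.  The model
# with the smaller BIC is preferred, and the lnL values can be fed to
# calculate_GQ further below:
#
#   lnL_homo,  BIC_homo  = CalModel_Homo(      top1_T, top1_C, top2_T, top2_C)
#   lnL_heter, BIC_heter = CalModel_Heter_noAS(top1_T, top1_C, top2_T, top2_C)
#   if BIC_heter < BIC_homo:
#       pass   # the heterozygous model fits better despite its extra parameter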

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
cpdef tuple CalModel_Heter_AS( np.ndarray[int32_t, ndim=1] top1_bq_T, np.ndarray[int32_t, ndim=1] top1_bq_C, np.ndarray[int32_t, ndim=1] top2_bq_T, np.ndarray[int32_t, ndim=1] top2_bq_C, float max_allowed_ar = 0.99 ):
    """Return (lnL, BIC) for the heterozygous model allowing an allele-specific
    allele ratio in the treatment (capped by max_allowed_ar).

    AS_alleleratio, the estimated treatment allele ratio, is optimized
    internally and not returned.
    """
    cdef:
        int k_T, k_C
        double lnL, BIC
        int i
        int tn_T, tn_C, tn  # total observed NTs
        double lnL_T, lnL_C # log likelihood for treatment and control
        double AS_alleleratio # allele ratio

    lnL = 0
    BIC = 0

    #assert top2_bq_T.shape[0] + top2_bq_C.shape[0] > 0, "Total number of top2 nt should not be zero while using this function: CalModel_Heter_AS!"

    # treatment
    tn_T = top1_bq_T.shape[0] + top2_bq_T.shape[0]

    if tn_T == 0:
        raise Exception("Total number of treatment reads is 0!")
    else:
        ( lnL_T, k_T, AS_alleleratio ) = GreedyMaxFunctionAS( top1_bq_T.shape[0], top2_bq_T.shape[0], tn_T, top1_bq_T, top2_bq_T, max_allowed_ar )
        #print ">>>",lnL_T, k_T, AS_alleleratio
        lnL += lnL_T
        BIC += -2*lnL_T

    # control
    tn_C = top1_bq_C.shape[0] + top2_bq_C.shape[0]

    if tn_C == 0:
        pass
    else:
        # We assume control will not have allele preference
        ( lnL_C, k_C ) = GreedyMaxFunctionNoAS( top1_bq_C.shape[0], top2_bq_C.shape[0], tn_C, top1_bq_C, top2_bq_C )
        lnL += lnL_C
        BIC += -2*lnL_C

    tn = tn_C + tn_T

    # we penalize the bigger model depending on the number of reads/samples
    if tn_T == 0:
        BIC += log( tn_C )
    elif tn_C == 0:
        BIC += 2 * log( tn_T )
    else:
        BIC += 2 * log( tn_T ) + log( tn_C )

    return (lnL, BIC)
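
# A hedged usage sketch (illustrative wiring): the allele-specific model lets
# the treatment allele ratio float, capped at max_allowed_ar, while the control
# is still modeled without allele preference.  Comparing its lnL against the
# non-allele-specific heterozygous model can hint at an allele-specific event,
# e.g. via calculate_GQ_heterASsig at the end of this module:
#
#   lnL_as, BIC_as = CalModel_Heter_AS(top1_T, top1_C, top2_T, top2_C,
#                                      max_allowed_ar=0.95)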

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
cdef tuple GreedyMaxFunctionAS( int m, int n, int tn, np.ndarray[int32_t, ndim=1] me, np.ndarray[int32_t, ndim=1] ne, float max_allowed_ar = 0.99 ):
    """Return (lnL, k, alleleratio) as a tuple.

    Note: this is only a translation of Liqing's C++ code into pyx; it has not
    been reviewed.
    """
    cdef:
        double dnew, dold, rold, rnew
        int kold, knew
        bool btemp
        int k0
        double dl, dr, d0, d1l, d1r

    assert m + n == tn
    btemp = False
    if tn == 1:                 # only 1 read; I don't expect this to be run...
        dl = calculate_ln( m, n, tn, me, ne, 0, 0 )
        dr = calculate_ln( m, n, tn, me, ne, 1, 1 )

        if dl > dr:
            k = 0
            return ( dl, 0, 0 )
        else:
            k = 1
            return ( dr, 1, 1 )
    elif m == 0:                # no top1 nt
        return ( calculate_ln( m, n, tn, me, ne, 0, m, max_allowed_ar ), m, 1 - max_allowed_ar )
        #k0 = m + 1
    elif m == tn:               # all reads are top1
        return ( calculate_ln( m, n, tn, me, ne, 1, m, max_allowed_ar ), m, max_allowed_ar )
    else:
        k0 = m

    d0  = calculate_ln( m, n, tn, me, ne, float(k0)/tn,   k0,   max_allowed_ar )
    d1l = calculate_ln( m, n, tn, me, ne, float(k0-1)/tn, k0-1, max_allowed_ar )
    d1r = calculate_ln( m, n, tn, me, ne, float(k0+1)/tn, k0+1, max_allowed_ar )

    if d0 > d1l - 1e-8 and d0 > d1r - 1e-8:
        k = k0
        ar = float(k0)/tn
        return ( d0, k, ar )
    elif d1l > d0:
        dold = d1l
        kold = k0 - 1
        rold = float(k0 - 1)/tn
        while kold > 1:         # disable: when kold=1 still run, then knew=0 is the final run
            knew = kold - 1
            rnew = float(knew)/tn

            dnew = calculate_ln( m, n, tn, me, ne, rnew, knew, max_allowed_ar )

            if dnew - 1e-8 < dold:
                btemp = True
                break
            kold = knew
            dold = dnew
            rold = rnew

        if btemp:               # maximum L value is in [1, m-1]
            k = kold
            ar = rold
            return ( dold, k, ar )
        else:                   # L(k=0) is the max for [0, m-1]
            k = kold
            ar = rold
            return ( dold, k, ar )
    elif d1r > d0:
        dold = d1r
        kold = k0 + 1
        rold = float(k0 + 1)/tn
        while kold < tn - 1:    # disable: when kold=tn-1 still run, then knew=tn is the final run
            knew = kold + 1
            rnew = float(knew)/tn

            dnew = calculate_ln( m, n, tn, me, ne, rnew, knew, max_allowed_ar )

            if dnew - 1e-8 < dold:
                btemp = True
                break
            kold = knew
            dold = dnew
            rold = rnew

        if btemp:               # maximum L value is in [m+1, tn-1]
            k = kold
            ar = rold
            return ( dold, k, ar )
        else:                   # L(k=tn) is the max for [m+1, tn]
            k = kold
            ar = rold
            return ( dold, k, ar )
    else:
        raise Exception("error in GreedyMaxFunctionAS")
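
# GreedyMaxFunctionAS above (and GreedyMaxFunctionNoAS below) follow the same
# pattern: start at k0 = m, probe both neighbors, then walk in the improving
# direction until the likelihood stops increasing (within 1e-8).  A simplified
# pure-Python sketch of that search (boundary handling reduced; names are
# illustrative, not part of this module):
#
#   def greedy_max(f, k0, lo, hi, eps=1e-8):
#       d0, dl, dr = f(k0), f(k0 - 1), f(k0 + 1)
#       if d0 > dl - eps and d0 > dr - eps:
#           return d0, k0                      # k0 is already a local maximum
#       step = -1 if dl > d0 else 1
#       k, d = k0 + step, (dl if dl > d0 else dr)
#       while lo < k + step < hi:
#           if f(k + step) - eps < d:
#               break                          # next step does not improve
#           k, d = k + step, f(k + step)
#       return d, k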

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
cdef tuple GreedyMaxFunctionNoAS( int m, int n, int tn, np.ndarray[int32_t, ndim=1] me, np.ndarray[int32_t, ndim=1] ne ):
    """Return (lnL, k) as a tuple.

    Note: this is only a translation of Liqing's C++ code into pyx; it has not
    been reviewed.
    """
    cdef:
        double dnew, dold
        int kold, knew
        bool btemp
        int k0
        double bg_r, dl, dr, d0, d1l, d1r

    btemp = False
    bg_r = 0.5

    if tn == 1:
        dl = calculate_ln( m, n, tn, me, ne, bg_r, 0 )
        dr = calculate_ln( m, n, tn, me, ne, bg_r, 1 )
        if dl > dr:
            k = 0
            return ( dl, 0 )
        else:
            k = 1
            return ( dr, 1 )
    elif m == 0:                # no top1 nt
        return ( calculate_ln( m, n, tn, me, ne, bg_r, m ), m )
        #k0 = m + 1
    elif m == tn:               # all reads are top1
        return ( calculate_ln( m, n, tn, me, ne, bg_r, m ), m )
    #elif m == 0:
    #    k0 = m + 1
    #elif m == tn:
    #    k0 = m - 1
    else:
        k0 = m

    d0  = calculate_ln( m, n, tn, me, ne, bg_r, k0 )
    d1l = calculate_ln( m, n, tn, me, ne, bg_r, k0 - 1 )
    d1r = calculate_ln( m, n, tn, me, ne, bg_r, k0 + 1 )

    if d0 > d1l - 1e-8 and d0 > d1r - 1e-8:
        k = k0
        return ( d0, k )
    elif d1l > d0:
        dold = d1l
        kold = k0 - 1
        while kold >= 1:        # when kold=1 still run, then knew=0 is the final run
            knew = kold - 1
            dnew = calculate_ln( m, n, tn, me, ne, bg_r, knew )
            if dnew - 1e-8 < dold:
                btemp = True
                break
            kold = knew
            dold = dnew

        if btemp:               # maximum L value is in [1, m-1]
            k = kold
            return ( dold, k )
        else:                   # L(k=0) is the max for [0, m-1]
            k = kold
            return ( dold, k )
    elif d1r > d0:
        dold = d1r
        kold = k0 + 1
        while kold <= tn - 1:   # when kold=tn-1 still run, then knew=tn is the final run
            knew = kold + 1
            dnew = calculate_ln( m, n, tn, me, ne, bg_r, knew )
            if dnew - 1e-8 < dold:
                btemp = True
                break
            kold = knew
            dold = dnew

        if btemp:               # maximum L value is in [m+1, tn-1]
            k = kold
            return ( dold, k )
        else:                   # L(k=tn) is the max for [m+1, tn]
            k = kold
            return ( dold, k )
    else:
        raise Exception("error in GreedyMaxFunctionNoAS")

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
cdef calculate_ln( int m, int n, int tn, np.ndarray[int32_t, ndim=1] me, np.ndarray[int32_t, ndim=1] ne, double r, int k, float max_allowed_r = 0.99 ):
    """Calculate the log likelihood given the qualities of top1 and top2 reads,
    the ratio r and the observed k.
    """
    cdef:
        int i
        double lnL
        double e

    lnL = 0

    if r > max_allowed_r or r < 1 - max_allowed_r: # r is extremely high or low
        #print "here1"
        lnL += k*log( max_allowed_r ) + (tn-k)*log( 1 - max_allowed_r ) #-10000
    else:
        lnL += k*log( r ) + (tn-k)*log( 1 - r )

    if k == 0 or k == tn:       # it's entirely biased toward 1 allele
        #print "here2"
        pass
        #lnL += k*log( max_allowed_r ) #-10000
        #lnL += -10000
    elif k <= tn/2:
        for i in range( k ):
            lnL += log( float(tn-i)/(k-i) )
    else:
        for i in range( tn-k ):
            lnL += log( float(tn-i)/(tn-k-i) )

    for i in range( m ):
        e = exp( - me[ i ] * LN10_tenth )
        lnL += log( (1-e)*(float(k)/tn) + e*(1-float(k)/tn) )

    for i in range( n ):
        e = exp( - ne[ i ] * LN10_tenth )
        lnL += log( (1-e)*(1-float(k)/tn) + e*(float(k)/tn) )

    #print r,k,lnL
    return lnL
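
# What calculate_ln adds up, written out (reconstructed from the code above;
# the k*ln(r) term falls back to max_allowed_r when r lies outside
# [1-max_allowed_r, max_allowed_r]):
#
#   lnL = ln C(tn, k)                                  # the two short loops
#       + k*ln(r) + (tn-k)*ln(1-r)                     # observing k of tn reads
#       + sum_i ln( (1-e_i)*(k/tn) + e_i*(1-k/tn) )    # over top1 qualities me
#       + sum_j ln( (1-e_j)*(1-k/tn) + e_j*(k/tn) )    # over top2 qualities ne
#
# where e = exp(-Phred*LN10_tenth) is the per-base error rate; ln C(tn, k) is
# skipped (it equals 0) when k == 0 or k == tn.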

cpdef int calculate_GQ ( double lnL1, double lnL2, double lnL3 ):
    """GQ1 = -10*log_{10}((L2+L3)/(L1+L2+L3))
    """
    cdef:
        double L1, L2, L3, sum, tmp
        int GQ_score

    #L1 = exp(lnL1-lnL1)
    L1 = 1
    L2 = exp( lnL2 - lnL1 )
    L3 = exp( lnL3 - lnL1 )

    #if L1 > 1:
    #    L1 = 1
    if L2 > 1:
        L2 = 1
    if L3 > 1:
        L3 = 1
    #if(L1<1e-110) L1=1e-110;
    if L2 < 1e-110:
        L2 = 1e-110
    if L3 < 1e-110:
        L3 = 1e-110

    sum = L1 + L2 + L3
    tmp = ( L2 + L3 )/sum
    if tmp > 1e-110:
        GQ_score = int(-4.34294*log(tmp))
    else:
        GQ_score = 255

    return GQ_score
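
# Note on the constant: -4.34294 is approximately -10/ln(10), so the score is
# int(-10*log10(tmp)), capped at 255 when tmp underflows.  A small worked
# example with made-up log likelihoods (best model first, two alternatives):
#
#   gq = calculate_GQ(-40.0, -55.0, -60.0)
#   # L2+L3 = exp(-15)+exp(-20) ~= 3.1e-7, sum ~= 1.0000003
#   # tmp ~= 3.1e-7  ->  GQ_score = int(-10*log10(3.1e-7)) = 65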

cpdef int calculate_GQ_heterASsig( double lnL1, double lnL2 ):
    cdef:
        double L1, L2, sum, tmp
        int ASsig_score

    #L1=exp(2.7182818,lnL1-lnL1)
    L1 = 1
    L2 = exp( lnL2 - lnL1 )

    #if L1 > 1:
    #    L1 = 1
    if L2 > 1:
        L2 = 1
    #if L1 < 1e-110:
    #    L1 = 1e-110
    if L2 < 1e-110:
        L2 = 1e-110

    sum = L1 + L2
    tmp = L2/sum
    if tmp > 1e-110:
        ASsig_score = int(-4.34294*log(tmp))
    else:
        ASsig_score = 255

    return ASsig_score
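
# A hedged end-to-end sketch of how these pieces could be wired together (the
# actual SAPPER caller may differ; names other than the functions are
# illustrative):
#
#   lnL_homo,  _ = CalModel_Homo(      top1_T, top1_C, top2_T, top2_C)
#   lnL_heter, _ = CalModel_Heter_noAS(top1_T, top1_C, top2_T, top2_C)
#   lnL_as,    _ = CalModel_Heter_AS(  top1_T, top1_C, top2_T, top2_C)
#   gq    = calculate_GQ(lnL_homo, lnL_heter, lnL_as)    # genotype quality
#   assig = calculate_GQ_heterASsig(lnL_heter, lnL_as)   # allele-specific significance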