1 # cython: language_level=3
3 # Time-stamp: <2019-10-30 17:28:00 taoliu>
5 """Module Description: statistics functions to calculate p-values
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file LICENSE included with
12 # ------------------------------------
14 # ------------------------------------
15 from libc
.math cimport exp
,log
,log10
, M_LN10
#,fabs,log1p
17 from math
import log1p
#as py_log1p
22 from numpy cimport uint8_t
, uint16_t
, uint32_t
, uint64_t
, int8_t
, int16_t
, int32_t
, int64_t
, float32_t
, float64_t
24 #ctypedef np.float64_t float64_t
25 #ctypedef np.float32_t float32_t
26 #ctypedef np.int64_t int64_t
27 #ctypedef np.int32_t int32_t
28 #ctypedef np.uint64_t uint64_t
29 #ctypedef np.uint32_t uint32_t
31 from cpython cimport bool
32 # ------------------------------------
34 # ------------------------------------
35 cdef int32_t LSTEP
= 200
36 cdef float64_t EXPTHRES
= exp
(LSTEP
)
37 cdef float64_t EXPSTEP
= exp
(-1*LSTEP
)
38 cdef float64_t bigx
= 20
40 # ------------------------------------
41 # Normal distribution functions
42 # ------------------------------------
43 # x is the value, u is the mean, v is the variance
44 cpdef pnorm
(int32_t x
, int32_t u
, int32_t v
):
45 """The probability of X=x when X=Norm(u,v)
47 return 1.0/sqrt
(2.0 * 3.141592653589793 * <float32_t
>(v
)) * exp
(-<float32_t
>(x
-u
)**2 / (2.0 * <float32_t
>(v
)))
49 # ------------------------------------
51 # ------------------------------------
53 cpdef factorial
( uint32_t n
):
57 cdef float64_t fact
= 1
61 for i
in range( 2,n
+1 ):
65 cdef float64_t poz
(float64_t z
):
66 """ probability of normal z value
70 float64_t Z_MAX
= 6.0 # Maximum meaningful z value
75 if y
>= (Z_MAX
* 0.5):
79 x
= ((((((((0.000124818987 * w
80 - 0.001075204047) * w
+ 0.005198775019) * w
81 - 0.019198292004) * w
+ 0.059054035642) * w
82 - 0.151968751364) * w
+ 0.319152932694) * w
83 - 0.531923007300) * w
+ 0.797884560593) * y
* 2.0
86 x
= (((((((((((((-0.000045255659 * y
87 + 0.000152529290) * y
- 0.000019538132) * y
88 - 0.000676904986) * y
+ 0.001390604284) * y
89 - 0.000794620820) * y
- 0.002034254874) * y
90 + 0.006549791214) * y
- 0.010557625006) * y
91 + 0.011630447319) * y
- 0.009279453341) * y
92 + 0.005353579108) * y
- 0.002141268741) * y
93 + 0.000535310849) * y
+ 0.999936657524
95 return ((x
+ 1.0) * 0.5)
97 return ((1.0 - x
) * 0.5)
99 cdef float64_t ex20
( float64_t x
):
100 """Wrapper on exp function. It will return 0 if x is smaller than -20.
107 cpdef float64_t chisq_pvalue_e
( float64_t x
, uint32_t df
):
108 """Chisq CDF function for upper tail and even degree of freedom.
109 Good for p-value calculation and designed for combining pvalues.
111 Note: we assume df to be an even number larger than 1! Do not
112 violate this assumption and there is no checking.
114 df has to be even number! if df is odd, result will be wrong!
125 even
= ((2*(df
/2)) == df
)
131 if a
> bigx
: # approximation for big number
150 cpdef float64_t chisq_logp_e
( float64_t x
, uint32_t df
, bool log10
= False
):
151 """Chisq CDF function in log space for upper tail and even degree of freedom
152 Good for p-value calculation and designed for combining pvalues.
154 Note: we assume df to be an even number larger than 1! Do not
155 violate this assumption and there is no checking.
157 Return value is -logp. If log10 is set as True, return -log10p
163 float64_t s
# s is the return value
170 y
= exp
(-a
) # y is for small number calculation
174 x
= 0.5 * (df
- 1.0) # control number of iterations
175 z
= 1.0 # variable for iteration
176 if a
> bigx
: # approximation for big number
178 c
= log
(a
) # constant
179 while z
<= x
: # iterations
181 s
= logspace_add
(s
, c
*z
-a
-e
)
183 else: # for small number
184 e
= 1.0 # not a constant
185 c
= 0.0 # not a constant
190 s
= log
(y
+c
*y
) #logspace_add( s, log(c) ) - a
197 cpdef float64_t poisson_cdf
( uint32_t n
, float64_t lam
, bool lower
=False
, bool log10
=False
):
198 """Poisson CDF evaluater.
200 This is a more stable CDF function. It can tolerate large lambda
201 value. While the lambda is larger than 700, the function will be a
206 lam : lambda of poisson distribution
207 lower : if lower is False, calculate the upper tail CDF, otherwise, to calculate lower tail; Default is False.
208 log10 : if log10 is True, calculation will be in log space. Default is False.
210 assert lam
> 0.0, "Lambda must > 0, however we got %d" % lam
215 return log10_poisson_cdf_P_large_lambda
(n
, lam
)
218 return log10_poisson_cdf_Q_large_lambda
(n
, lam
)
221 if lam
> 700: # may be problematic
222 return __poisson_cdf_large_lambda
(n
, lam
)
224 return __poisson_cdf
(n
,lam
)
227 if lam
> 700: # may be problematic
228 return __poisson_cdf_Q_large_lambda
(n
, lam
)
230 return __poisson_cdf_Q
(n
,lam
)
232 cdef inline float64_t __poisson_cdf
( uint32_t k
, float64_t a
):
233 """Poisson CDF For small lambda. If a > 745, this will return
247 return 0.0 # special cases
249 nextcdf
= exp
( -1 * a
)
252 for i
in range( 1, k
+ 1 ):
254 nextcdf
= lastcdf
* a
/ i
261 cdef inline float64_t __poisson_cdf_large_lambda
( uint32_t k
, float64_t a
):
262 """Slower poisson cdf for large lambda ( > 700 )
278 return 0.0 # special cases
280 num_parts
= <int32_t
>( a
/ LSTEP
)
281 lastexp
= exp
( -1 * ( a
% LSTEP
) )
286 for i
in range( 1 , k
+ 1 ):
288 nextcdf
= lastcdf
* a
/ i
290 if nextcdf
> EXPTHRES
or cdf
> EXPTHRES
:
299 for i
in range( num_parts
):
304 cdef inline float64_t __poisson_cdf_Q
( uint32_t k
, float64_t a
):
305 """internal Poisson CDF evaluater for upper tail with small
315 return 1.0 # special cases
316 cdef float64_t nextcdf
317 nextcdf
= exp
( -1 * a
)
318 cdef float64_t lastcdf
320 for i
in range(1,k
+1):
322 nextcdf
= lastcdf
* a
/ i
324 cdef float64_t cdf
= 0.0
328 nextcdf
= lastcdf
* a
/ i
333 cdef inline float64_t __poisson_cdf_Q_large_lambda
( uint32_t k
, float64_t a
):
334 """Slower internal Poisson CDF evaluater for upper tail with large
342 if k
< 0: return 1.0 # special cases
343 cdef uint32_t num_parts
= <int32_t
>(a
/LSTEP
)
344 cdef float64_t lastexp
= exp
(-1 * (a
% LSTEP
) )
345 cdef float64_t nextcdf
= EXPSTEP
347 cdef float64_t lastcdf
351 for i
in range(1,k
+1):
353 nextcdf
= lastcdf
* a
/ i
354 if nextcdf
> EXPTHRES
:
359 # simply raise an error
360 raise Exception("Unexpected error")
363 cdef float64_t cdf
= 0.0
367 nextcdf
= lastcdf
* a
/ i
370 if nextcdf
> EXPTHRES
or cdf
> EXPTHRES
:
379 for i
in range(num_parts
):
384 cdef inline float64_t log10_poisson_cdf_P_large_lambda
( uint32_t k
, float64_t lbd
):
385 """Slower Poisson CDF evaluater for lower tail which allow
386 calculation in log space. Better for the pvalue < 10^-310.
392 ret = -lambda + \ln( \sum_{i=k+1}^{\inf} {lambda^i/i!} = -lambda + \ln( sum{ exp{ln(F)} } ), where F=lambda^m/m!
393 \ln{F(m)} = m*ln{lambda} - \sum_{x=1}^{m}\ln(x)
394 Calculate \ln( sum{exp{N} ) by logspace_add function
396 Return the log10(pvalue)
398 cdef float64_t residue
= 0
399 cdef float64_t logx
= 0
400 cdef float64_t ln_lbd
= log
(lbd
)
404 cdef float64_t sum_ln_m
= 0
406 for i
in range(1,m
+1):
408 logx
= m
*ln_lbd
- sum_ln_m
413 logy
= logx
-ln_lbd
+log
(m
)
414 pre_residue
= residue
415 residue
= logspace_add
(pre_residue
,logy
)
416 if fabs
(pre_residue
-residue
) < 1e-10:
420 return round((residue
-lbd
)/M_LN10
,5)
422 cdef inline float64_t log10_poisson_cdf_Q_large_lambda
( uint32_t k
, float64_t lbd
):
423 """Slower Poisson CDF evaluater for upper tail which allow
424 calculation in log space. Better for the pvalue < 10^-310.
430 ret = -lambda + \ln( \sum_{i=k+1}^{\inf} {lambda^i/i!} = -lambda + \ln( sum{ exp{ln(F)} } ), where F=lambda^m/m!
431 \ln{F(m)} = m*ln{lambda} - \sum_{x=1}^{m}\ln(x)
432 Calculate \ln( sum{exp{N} ) by logspace_add function
434 Return the log10(pvalue)
436 cdef float64_t residue
= 0
437 cdef float64_t logx
= 0
438 cdef float64_t ln_lbd
= log
(lbd
)
442 cdef float64_t sum_ln_m
= 0
444 for i
in range(1,m
+1):
446 logx
= m
*ln_lbd
- sum_ln_m
451 logy
= logx
+ln_lbd
-log
(m
)
452 pre_residue
= residue
453 residue
= logspace_add
(pre_residue
,logy
)
454 if fabs
(pre_residue
-residue
) < 1e-5:
458 return round((residue
-lbd
)/log
(10),5)
460 cdef inline float64_t logspace_add
( float64_t logx
, float64_t logy
):
461 """addition in log space.
463 Given two log values, such as logx and logy, return
464 log(exp(logx)+exp(logy)).
467 return max (logx
, logy
) + log1p
(exp
(-fabs
(logx
- logy
)))
469 cpdef poisson_cdf_inv
( float64_t cdf
, float64_t lam
, int32_t maximum
=1000 ):
470 """inverse poisson distribution.
473 lam : the lambda of poisson distribution
475 note: maxmimum return value is 1000
476 and lambda must be smaller than 740.
479 if cdf
< 0 or cdf
> 1:
480 raise Exception ("CDF must >= 0 and <= 1")
483 cdef float64_t sum2
= 0
484 cdef float64_t newval
= exp
( -1*lam
)
488 cdef float64_t sumold
489 cdef float64_t lastval
491 for i
in range(1,maximum
+1):
494 newval
= lastval
* lam
/ i
496 if sumold
<= cdf
and cdf
<= sum2
:
501 cpdef poisson_cdf_Q_inv
( float64_t cdf
, float64_t lam
, int32_t maximum
=1000 ):
502 """inverse poisson distribution.
505 lam : the lambda of poisson distribution
507 note: maxmimum return value is 1000
508 and lambda must be smaller than 740.
511 if cdf
< 0 or cdf
> 1:
512 raise Exception ("CDF must >= 0 and <= 1")
515 cdef float64_t sum2
= 0
516 cdef float64_t newval
= exp
( -1 * lam
)
520 cdef float64_t lastval
521 cdef float64_t sumold
523 for i
in range(1,maximum
+1):
526 newval
= lastval
* lam
/ i
528 if sumold
<= cdf
and cdf
<= sum2
:
533 cpdef poisson_pdf
( uint32_t k
, float64_t a
):
536 PDF(K,A) is the probability that the number of events observed in
537 a unit time period will be K, given the expected number of events
542 return exp
(-a
) * pow (a
, k
, None
) / factorial
(k
)
545 cdef binomial_coef
( int64_t n
, int64_t k
):
546 """BINOMIAL_COEF computes the Binomial coefficient C(N,K)
550 cdef int64_t mn
= min (k
, n
-k
)
560 cnk
= <float32_t
>(mx
+1)
561 for i
in range(2,mn
+1):
562 cnk
= cnk
* (mx
+i
) / i
565 cpdef binomial_cdf
( int64_t x
, int64_t a
, float64_t b
, bool lower
=True
):
566 """ BINOMIAL_CDF compute the binomial CDF.
568 CDF(x)(A,B) is the probability of at most X successes in A trials,
569 given that the probability of success on a single trial is B.
572 return _binomial_cdf_f
(x
,a
,b
)
574 return _binomial_cdf_r
(x
,a
,b
)
576 cpdef binomial_sf
( int64_t x
, int64_t a
, float64_t b
, bool lower
=True
):
577 """ BINOMIAL_SF compute the binomial survival function (1-CDF)
579 SF(x)(A,B) is the probability of more than X successes in A trials,
580 given that the probability of success on a single trial is B.
583 return 1.0 - _binomial_cdf_f
(x
,a
,b
)
585 return 1.0 - _binomial_cdf_r
(x
,a
,b
)
587 cpdef pduplication
(np
.ndarray
[np
.float64_t
] pmf
, int32_t N_obs
):
588 """return the probability of a duplicate fragment given a pmf
589 and a number of observed fragments N_obs
593 float32_t p
, sf
= 0.0
595 sf
+= binomial_sf
(2, N_obs
, p
)
596 return sf
/ <float32_t
>n
598 cdef _binomial_cdf_r
( int64_t x
, int64_t a
, float64_t b
):
599 """ Binomial CDF for upper tail.
602 cdef int64_t argmax
=<int32_t
>(a
*b
)
603 cdef float64_t seedpdf
618 seedpdf
=binomial_pdf
(argmax
,a
,b
)
621 for i
in range(argmax
-1,x
,-1):
622 pdf
/=(a
-i
)*b
/(1-b
)/(i
+1)
629 pdf
*=(a
-i
)*b
/(1-b
)/(i
+1)
634 #cdf = float("%.10e" %cdf)
637 pdf
=binomial_pdf
(x
+1,a
,b
)
641 pdf
*=(a
-i
)*b
/(1-b
)/(i
+1)
646 #cdf = float("%.10e" %cdf)
650 cdef _binomial_cdf_f
( int64_t x
, int64_t a
, float64_t b
):
651 """ Binomial CDF for lower tail.
654 cdef int64_t argmax
=<int32_t
>(a
*b
)
655 cdef float64_t seedpdf
670 seedpdf
=binomial_pdf
(argmax
,a
,b
)
673 for i
in range(argmax
-1,-1,-1):
674 pdf
/=(a
-i
)*b
/(1-b
)/(i
+1)
679 for i
in range(argmax
,x
):
680 pdf
*=(a
-i
)*b
/(1-b
)/(i
+1)
684 #cdf = float("%.10e" %cdf)
687 pdf
=binomial_pdf
(x
,a
,b
)
689 for i
in range(x
-1,-1,-1):
690 pdf
/=(a
-i
)*b
/(1-b
)/(i
+1)
694 #cdf = float("%.10e" %cdf)
697 cpdef binomial_cdf_inv
( float64_t cdf
, int64_t a
, float64_t b
):
698 """BINOMIAL_CDF_INV inverts the binomial CDF. For lower tail only!
701 if cdf
< 0 or cdf
>1:
702 raise Exception("CDF must >= 0 or <= 1")
704 cdef float64_t cdf2
= 0
707 for x
in range(0,a
+1):
708 pdf
= binomial_pdf
(x
,a
,b
)
714 cpdef binomial_pdf
( int64_t x
, int64_t a
, float64_t b
):
715 """binomial PDF by H. Gene Shin
751 for q
in range(1,mn
+1):
752 pdf
*=(a
-q
+1)*p
/(mn
-q
+1)
758 while pdf
> 1e+3 and t
<mx
:
762 for i
in range(mx
-t
):
765 #pdf=float("%.10e" % pdf)
768 # cdef normal_01_cdf ( float64_t x ):
769 # """NORMAL_01_CDF evaluates the Normal 01 CDF.
771 # cdef float64_t a1 = 0.398942280444
772 # cdef float64_t a2 = 0.399903438504
773 # cdef float64_t a3 = 5.75885480458
774 # cdef float64_t a4 = 29.8213557808
775 # cdef float64_t a5 = 2.62433121679
776 # cdef float64_t a6 = 48.6959930692
777 # cdef float64_t a7 = 5.92885724438
778 # cdef float64_t b0 = 0.398942280385
779 # cdef float64_t b1 = 3.8052E-08
780 # cdef float64_t b2 = 1.00000615302
781 # cdef float64_t b3 = 3.98064794E-04
782 # cdef float64_t b4 = 1.98615381364
783 # cdef float64_t b5 = 0.151679116635
784 # cdef float64_t b6 = 5.29330324926
785 # cdef float64_t b7 = 4.8385912808
786 # cdef float64_t b8 = 15.1508972451
787 # cdef float64_t b9 = 0.742380924027
788 # cdef float64_t b10 = 30.789933034
789 # cdef float64_t b11 = 3.99019417011
792 # if abs ( x ) <= 1.28:
794 # q = 0.5 - abs ( x ) * ( a1 - a2 * y / ( y + a3 - a4 / ( y + a5 + a6 / ( y + a7 ) ) ) )
795 # elif abs ( x ) <= 12.7:
798 # q = exp ( - y ) * b0 / ( abs ( x ) - b1 \
799 # + b2 / ( abs ( x ) + b3 \
800 # + b4 / ( abs ( x ) - b5 \
801 # + b6 / ( abs ( x ) + b7 \
802 # - b8 / ( abs ( x ) + b9 \
803 # + b10 / ( abs ( x ) + b11 ) ) ) ) ) )
807 # # Take account of negative X.
816 # def normal_cdf_inv(p, mu = None, sigma2 = None, lower=True):
820 # raise Exception("Illegal argument %f for qnorm(p)." % p)
824 # a1 = -18.61500062529
825 # a2 = 41.39119773534
826 # a3 = -25.44106049637
827 # b1 = -8.47351093090
828 # b2 = 23.08336743743
829 # b3 = -21.06224101826
831 # c0 = -2.78718931138
832 # c1 = -2.29796479134
842 # if abs(q) <= split:
844 # ppnd = q * (((a3 * r + a2) * r + a1) * r + a0) / ((((b4 * r + b3) * r + b2) * r + b1) * r + 1)
851 # r = math.sqrt(- math.log(r))
852 # ppnd = (((c3 * r + c2) * r + c1) * r + c0) / ((d2 * r + d1) * r + 1)
862 # if mu != None and sigma2 != None:
863 # return ppnd * math.sqrt(sigma2) + mu
867 # def normal_cdf (z, mu = 0.0, sigma2 = 1.0, lower=True):
871 # z = (z - mu) / math.sqrt(sigma2)
876 # a1 = 0.398942280444
877 # a2 = 0.399903438504
883 # b1 = 0.398942280385
887 # b5 = 1.986153813664
888 # b6 = 0.151679116635
892 # b10 = 0.742380924027
894 # b12 = 3.99019417011
903 # if z <= ltone or upper and z <= utzero:
906 # alnorm = b1 * math.exp(- y) / (z - b2 + b3 / (z + b4 + b5 / (z - b6 + b7 / (z + b8 - b9 / (z + b10 + b11 / (z + b12))))))
908 # alnorm = 0.5 - z * (a1 - a2 * y / (y + a3 - a4 / (y + a5 + a6 / (y + a7))))
912 # alnorm = 1.0 - alnorm