1 # cython: language_level=3
3 # Time-stamp: <2019-10-30 17:28:00 taoliu>
5 """Module Description: statistics functions to calculate p-values
7 This code is free software; you can redistribute it and/or modify it
8 under the terms of the BSD License (see the file LICENSE included with
12 # ------------------------------------
14 # ------------------------------------
15 from libc
.math cimport exp
,log
,log10
, M_LN10
#,fabs,log1p
17 from math
import log1p
#as py_log1p
23 from cpython cimport bool
24 # ------------------------------------
26 # ------------------------------------
28 cdef double EXPTHRES
= exp
(LSTEP
)
29 cdef double EXPSTEP
= exp
(-1*LSTEP
)
32 # ------------------------------------
33 # Normal distribution functions
34 # ------------------------------------
35 # x is the value, u is the mean, v is the variance
36 cpdef pnorm
(int x
, int u
, int v
):
37 """The probability of X=x when X=Norm(u,v)
39 return 1.0/sqrt
(2.0 * 3.141592653589793 * <float>v
) * exp
(-<float>(x
-u
)**2 / (2.0 * <float>v
))
41 # ------------------------------------
43 # ------------------------------------
45 cpdef factorial
( unsigned
int n
):
53 for i
in xrange( 2,n
+1 ):
57 cdef double poz
(double z
):
58 """ probability of normal z value
62 double Z_MAX
= 6.0 # Maximum meaningful z value
67 if y
>= (Z_MAX
* 0.5):
71 x
= ((((((((0.000124818987 * w
72 - 0.001075204047) * w
+ 0.005198775019) * w
73 - 0.019198292004) * w
+ 0.059054035642) * w
74 - 0.151968751364) * w
+ 0.319152932694) * w
75 - 0.531923007300) * w
+ 0.797884560593) * y
* 2.0
78 x
= (((((((((((((-0.000045255659 * y
79 + 0.000152529290) * y
- 0.000019538132) * y
80 - 0.000676904986) * y
+ 0.001390604284) * y
81 - 0.000794620820) * y
- 0.002034254874) * y
82 + 0.006549791214) * y
- 0.010557625006) * y
83 + 0.011630447319) * y
- 0.009279453341) * y
84 + 0.005353579108) * y
- 0.002141268741) * y
85 + 0.000535310849) * y
+ 0.999936657524
87 return ((x
+ 1.0) * 0.5)
89 return ((1.0 - x
) * 0.5)
91 cdef double ex20
( double x
):
92 """Wrapper on exp function. It will return 0 if x is smaller than -20.
99 cpdef double chisq_pvalue_e
( double x
, unsigned
int df
):
100 """Chisq CDF function for upper tail and even degree of freedom.
101 Good for p-value calculation and designed for combining pvalues.
103 Note: we assume df to be an even number larger than 1! Do not
104 violate this assumption and there is no checking.
106 df has to be even number! if df is odd, result will be wrong!
117 even
= ((2*(df
/2)) == df
)
123 if a
> bigx
: # approximation for big number
142 cpdef double chisq_logp_e
( double x
, unsigned
int df
, bool log10
= False
):
143 """Chisq CDF function in log space for upper tail and even degree of freedom
144 Good for p-value calculation and designed for combining pvalues.
146 Note: we assume df to be an even number larger than 1! Do not
147 violate this assumption and there is no checking.
149 Return value is -logp. If log10 is set as True, return -log10p
155 double s
# s is the return value
162 y
= exp
(-a
) # y is for small number calculation
166 x
= 0.5 * (df
- 1.0) # control number of iterations
167 z
= 1.0 # variable for iteration
168 if a
> bigx
: # approximation for big number
170 c
= log
(a
) # constant
171 while z
<= x
: # iterations
173 s
= logspace_add
(s
, c
*z
-a
-e
)
175 else: # for small number
176 e
= 1.0 # not a constant
177 c
= 0.0 # not a constant
182 s
= log
(y
+c
*y
) #logspace_add( s, log(c) ) - a
189 cpdef double poisson_cdf
( unsigned
int n
, double lam
, bool lower
=False
, bool log10
=False
):
190 """Poisson CDF evaluater.
192 This is a more stable CDF function. It can tolerate large lambda
193 value. While the lambda is larger than 700, the function will be a
198 lam : lambda of poisson distribution
199 lower : if lower is False, calculate the upper tail CDF, otherwise, to calculate lower tail; Default is False.
200 log10 : if log10 is True, calculation will be in log space. Default is False.
202 assert lam
> 0.0, "Lambda must > 0, however we got %d" % lam
207 return log10_poisson_cdf_P_large_lambda
(n
, lam
)
210 return log10_poisson_cdf_Q_large_lambda
(n
, lam
)
213 if lam
> 700: # may be problematic
214 return __poisson_cdf_large_lambda
(n
, lam
)
216 return __poisson_cdf
(n
,lam
)
219 if lam
> 700: # may be problematic
220 return __poisson_cdf_Q_large_lambda
(n
, lam
)
222 return __poisson_cdf_Q
(n
,lam
)
224 cdef inline double __poisson_cdf
( unsigned
int k
, double a
):
225 """Poisson CDF For small lambda. If a > 745, this will return
239 return 0.0 # special cases
241 nextcdf
= exp
( -1 * a
)
244 for i
in range( 1, k
+ 1 ):
246 nextcdf
= lastcdf
* a
/ i
253 cdef inline double __poisson_cdf_large_lambda
( unsigned
int k
, double a
):
254 """Slower poisson cdf for large lambda ( > 700 )
270 return 0.0 # special cases
272 num_parts
= int( a
/ LSTEP
)
273 lastexp
= exp
( -1 * ( a
% LSTEP
) )
278 for i
in range( 1 , k
+ 1 ):
280 nextcdf
= lastcdf
* a
/ i
282 if nextcdf
> EXPTHRES
or cdf
> EXPTHRES
:
291 for i
in range( num_parts
):
296 cdef inline double __poisson_cdf_Q
( unsigned
int k
, double a
):
297 """internal Poisson CDF evaluater for upper tail with small
307 return 1.0 # special cases
309 nextcdf
= exp
( -1 * a
)
312 for i
in xrange(1,k
+1):
314 nextcdf
= lastcdf
* a
/ i
316 cdef double cdf
= 0.0
320 nextcdf
= lastcdf
* a
/ i
325 cdef inline double __poisson_cdf_Q_large_lambda
( unsigned
int k
, double a
):
326 """Slower internal Poisson CDF evaluater for upper tail with large
335 return 1.0 # special cases
336 cdef unsigned
int num_parts
= int(a
/LSTEP
)
337 cdef double lastexp
= exp
(-1 * (a
% LSTEP
) )
338 cdef double nextcdf
= EXPSTEP
344 for i
in xrange(1,k
+1):
346 nextcdf
= lastcdf
* a
/ i
347 if nextcdf
> EXPTHRES
:
352 # simply raise an error
353 raise Exception("Unexpected error")
356 cdef double cdf
= 0.0
360 nextcdf
= lastcdf
* a
/ i
363 if nextcdf
> EXPTHRES
or cdf
> EXPTHRES
:
372 for i
in xrange(num_parts
):
377 cdef inline double log10_poisson_cdf_P_large_lambda
( unsigned
int k
, double lbd
):
378 """Slower Poisson CDF evaluater for lower tail which allow
379 calculation in log space. Better for the pvalue < 10^-310.
385 ret = -lambda + \ln( \sum_{i=k+1}^{\inf} {lambda^i/i!} = -lambda + \ln( sum{ exp{ln(F)} } ), where F=lambda^m/m!
386 \ln{F(m)} = m*ln{lambda} - \sum_{x=1}^{m}\ln(x)
387 Calculate \ln( sum{exp{N} ) by logspace_add function
389 Return the log10(pvalue)
391 cdef double residue
= 0
393 cdef double ln_lbd
= log
(lbd
)
397 cdef double sum_ln_m
= 0
399 for i
in range(1,m
+1):
401 logx
= m
*ln_lbd
- sum_ln_m
406 logy
= logx
-ln_lbd
+log
(m
)
407 pre_residue
= residue
408 residue
= logspace_add
(pre_residue
,logy
)
409 if fabs
(pre_residue
-residue
) < 1e-10:
413 return round((residue
-lbd
)/M_LN10
,5)
415 cdef inline double log10_poisson_cdf_Q_large_lambda
( unsigned
int k
, double lbd
):
416 """Slower Poisson CDF evaluater for upper tail which allow
417 calculation in log space. Better for the pvalue < 10^-310.
423 ret = -lambda + \ln( \sum_{i=k+1}^{\inf} {lambda^i/i!} = -lambda + \ln( sum{ exp{ln(F)} } ), where F=lambda^m/m!
424 \ln{F(m)} = m*ln{lambda} - \sum_{x=1}^{m}\ln(x)
425 Calculate \ln( sum{exp{N} ) by logspace_add function
427 Return the log10(pvalue)
429 cdef double residue
= 0
431 cdef double ln_lbd
= log
(lbd
)
435 cdef double sum_ln_m
= 0
437 for i
in range(1,m
+1):
439 logx
= m
*ln_lbd
- sum_ln_m
444 logy
= logx
+ln_lbd
-log
(m
)
445 pre_residue
= residue
446 residue
= logspace_add
(pre_residue
,logy
)
447 if fabs
(pre_residue
-residue
) < 1e-5:
451 return round((residue
-lbd
)/log
(10),5)
453 cdef inline double logspace_add
( double logx
, double logy
):
454 """addition in log space.
456 Given two log values, such as logx and logy, return
457 log(exp(logx)+exp(logy)).
460 return max (logx
, logy
) + log1p
(exp
(-fabs
(logx
- logy
)))
462 cpdef poisson_cdf_inv
( double cdf
, double lam
, int maximum
=1000 ):
463 """inverse poisson distribution.
466 lam : the lambda of poisson distribution
468 note: maxmimum return value is 1000
469 and lambda must be smaller than 740.
472 if cdf
< 0 or cdf
> 1:
473 raise Exception ("CDF must >= 0 and <= 1")
477 cdef double newval
= exp
( -1*lam
)
484 for i
in xrange(1,maximum
+1):
487 newval
= lastval
* lam
/ i
489 if sumold
<= cdf
and cdf
<= sum2
:
494 cpdef poisson_cdf_Q_inv
( double cdf
, double lam
, int maximum
=1000 ):
495 """inverse poisson distribution.
498 lam : the lambda of poisson distribution
500 note: maxmimum return value is 1000
501 and lambda must be smaller than 740.
504 if cdf
< 0 or cdf
> 1:
505 raise Exception ("CDF must >= 0 and <= 1")
509 cdef double newval
= exp
( -1 * lam
)
516 for i
in xrange(1,maximum
+1):
519 newval
= lastval
* lam
/ i
521 if sumold
<= cdf
and cdf
<= sum2
:
526 cpdef poisson_pdf
( unsigned
int k
, double a
):
529 PDF(K,A) is the probability that the number of events observed in
530 a unit time period will be K, given the expected number of events
535 return exp
(-a
) * pow (a
, k
, None
) / factorial
(k
)
538 cdef binomial_coef
( long n
, long k
):
539 """BINOMIAL_COEF computes the Binomial coefficient C(N,K)
543 cdef long mn
= min (k
, n
-k
)
554 for i
in xrange(2,mn
+1):
555 cnk
= cnk
* (mx
+i
) / i
558 cpdef binomial_cdf
( long x
, long a
, double b
, bool lower
=True
):
559 """ BINOMIAL_CDF compute the binomial CDF.
561 CDF(x)(A,B) is the probability of at most X successes in A trials,
562 given that the probability of success on a single trial is B.
565 return _binomial_cdf_f
(x
,a
,b
)
567 return _binomial_cdf_r
(x
,a
,b
)
569 cpdef binomial_sf
( long x
, long a
, double b
, bool lower
=True
):
570 """ BINOMIAL_SF compute the binomial survival function (1-CDF)
572 SF(x)(A,B) is the probability of more than X successes in A trials,
573 given that the probability of success on a single trial is B.
576 return 1.0 - _binomial_cdf_f
(x
,a
,b
)
578 return 1.0 - _binomial_cdf_r
(x
,a
,b
)
580 cpdef pduplication
(np
.ndarray
[np
.float64_t
] pmf
, int N_obs
):
581 """return the probability of a duplicate fragment given a pmf
582 and a number of observed fragments N_obs
588 sf
+= binomial_sf
(2, N_obs
, p
)
591 cdef _binomial_cdf_r
( long x
, long a
, double b
):
592 """ Binomial CDF for upper tail.
604 cdef long argmax
=int(a
*b
)
611 seedpdf
=binomial_pdf
(argmax
,a
,b
)
614 for i
in xrange(argmax
-1,x
,-1):
615 pdf
/=(a
-i
)*b
/(1-b
)/(i
+1)
622 pdf
*=(a
-i
)*b
/(1-b
)/(i
+1)
627 cdf
= float("%.10e" %cdf
)
630 pdf
=binomial_pdf
(x
+1,a
,b
)
634 pdf
*=(a
-i
)*b
/(1-b
)/(i
+1)
639 cdf
= float("%.10e" %cdf
)
643 cdef _binomial_cdf_f
( long x
, long a
, double b
):
644 """ Binomial CDF for lower tail.
656 cdef long argmax
=int(a
*b
)
663 seedpdf
=binomial_pdf
(argmax
,a
,b
)
666 for i
in xrange(argmax
-1,-1,-1):
667 pdf
/=(a
-i
)*b
/(1-b
)/(i
+1)
672 for i
in xrange(argmax
,x
):
673 pdf
*=(a
-i
)*b
/(1-b
)/(i
+1)
677 cdf
= float("%.10e" %cdf
)
680 pdf
=binomial_pdf
(x
,a
,b
)
682 for i
in xrange(x
-1,-1,-1):
683 pdf
/=(a
-i
)*b
/(1-b
)/(i
+1)
687 cdf
= float("%.10e" %cdf
)
690 cpdef binomial_cdf_inv
( double cdf
, long a
, double b
):
691 """BINOMIAL_CDF_INV inverts the binomial CDF. For lower tail only!
694 if cdf
< 0 or cdf
>1:
695 raise Exception("CDF must >= 0 or <= 1")
700 for x
in xrange(0,a
+1):
701 pdf
= binomial_pdf
(x
,a
,b
)
707 cpdef binomial_pdf
( long x
, long a
, double b
):
708 """binomial PDF by H. Gene Shin
744 for q
in xrange(1,mn
+1):
745 pdf
*=(a
-q
+1)*p
/(mn
-q
+1)
751 while pdf
> 1e+3 and t
<mx
:
755 for i
in xrange(mx
-t
):
758 pdf
=float("%.10e" % pdf
)
761 # cdef normal_01_cdf ( double x ):
762 # """NORMAL_01_CDF evaluates the Normal 01 CDF.
764 # cdef double a1 = 0.398942280444
765 # cdef double a2 = 0.399903438504
766 # cdef double a3 = 5.75885480458
767 # cdef double a4 = 29.8213557808
768 # cdef double a5 = 2.62433121679
769 # cdef double a6 = 48.6959930692
770 # cdef double a7 = 5.92885724438
771 # cdef double b0 = 0.398942280385
772 # cdef double b1 = 3.8052E-08
773 # cdef double b2 = 1.00000615302
774 # cdef double b3 = 3.98064794E-04
775 # cdef double b4 = 1.98615381364
776 # cdef double b5 = 0.151679116635
777 # cdef double b6 = 5.29330324926
778 # cdef double b7 = 4.8385912808
779 # cdef double b8 = 15.1508972451
780 # cdef double b9 = 0.742380924027
781 # cdef double b10 = 30.789933034
782 # cdef double b11 = 3.99019417011
785 # if abs ( x ) <= 1.28:
787 # q = 0.5 - abs ( x ) * ( a1 - a2 * y / ( y + a3 - a4 / ( y + a5 + a6 / ( y + a7 ) ) ) )
788 # elif abs ( x ) <= 12.7:
791 # q = exp ( - y ) * b0 / ( abs ( x ) - b1 \
792 # + b2 / ( abs ( x ) + b3 \
793 # + b4 / ( abs ( x ) - b5 \
794 # + b6 / ( abs ( x ) + b7 \
795 # - b8 / ( abs ( x ) + b9 \
796 # + b10 / ( abs ( x ) + b11 ) ) ) ) ) )
800 # # Take account of negative X.
809 # def normal_cdf_inv(p, mu = None, sigma2 = None, lower=True):
813 # raise Exception("Illegal argument %f for qnorm(p)." % p)
817 # a1 = -18.61500062529
818 # a2 = 41.39119773534
819 # a3 = -25.44106049637
820 # b1 = -8.47351093090
821 # b2 = 23.08336743743
822 # b3 = -21.06224101826
824 # c0 = -2.78718931138
825 # c1 = -2.29796479134
835 # if abs(q) <= split:
837 # ppnd = q * (((a3 * r + a2) * r + a1) * r + a0) / ((((b4 * r + b3) * r + b2) * r + b1) * r + 1)
844 # r = math.sqrt(- math.log(r))
845 # ppnd = (((c3 * r + c2) * r + c1) * r + c0) / ((d2 * r + d1) * r + 1)
855 # if mu != None and sigma2 != None:
856 # return ppnd * math.sqrt(sigma2) + mu
860 # def normal_cdf (z, mu = 0.0, sigma2 = 1.0, lower=True):
864 # z = (z - mu) / math.sqrt(sigma2)
869 # a1 = 0.398942280444
870 # a2 = 0.399903438504
876 # b1 = 0.398942280385
880 # b5 = 1.986153813664
881 # b6 = 0.151679116635
885 # b10 = 0.742380924027
887 # b12 = 3.99019417011
896 # if z <= ltone or upper and z <= utzero:
899 # alnorm = b1 * math.exp(- y) / (z - b2 + b3 / (z + b4 + b5 / (z - b6 + b7 / (z + b8 - b9 / (z + b10 + b11 / (z + b12))))))
901 # alnorm = 0.5 - z * (a1 - a2 * y / (y + a3 - a4 / (y + a5 + a6 / (y + a7))))
905 # alnorm = 1.0 - alnorm