c_cpp/lib/htslib/kfunc.c

   1 /* The MIT License
   2
   3    Copyright (C) 2010, 2013 Genome Research Ltd.
   4    Copyright (C) 2011 Attractive Chaos <attractor@live.co.uk>
   5
   6    Permission is hereby granted, free of charge, to any person obtaining
   7    a copy of this software and associated documentation files (the
   8    "Software"), to deal in the Software without restriction, including
   9    without limitation the rights to use, copy, modify, merge, publish,
  10    distribute, sublicense, and/or sell copies of the Software, and to
  11    permit persons to whom the Software is furnished to do so, subject to
  12    the following conditions:
  13
  14    The above copyright notice and this permission notice shall be
  15    included in all copies or substantial portions of the Software.
  16
  17    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  18    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  19    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  20    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  21    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  22    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  23    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  24    SOFTWARE.
  25 */
  26
  27 #include <config.h>
  28
  29 #include <math.h>
  30 #include <stdlib.h>
  31 #include "htslib/kfunc.h"
  32
  33 /* Log gamma function
  34  * \log{\Gamma(z)}
  35  * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
  36  */
  37 double kf_lgamma(double z)
  38 {
  39         double x = 0;
  40         x += 0.1659470187408462e-06 / (z+7);
  41         x += 0.9934937113930748e-05 / (z+6);
  42         x -= 0.1385710331296526     / (z+5);
  43         x += 12.50734324009056      / (z+4);
  44         x -= 176.6150291498386      / (z+3);
  45         x += 771.3234287757674      / (z+2);
  46         x -= 1259.139216722289      / (z+1);
  47         x += 676.5203681218835      / z;
  48         x += 0.9999999999995183;
  49         return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5);
  50 }
  51
  52 /* complementary error function
  53  * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt
  54  * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66
  55  */
  56 double kf_erfc(double x)
  57 {
  58         const double p0 = 220.2068679123761;
  59         const double p1 = 221.2135961699311;
  60         const double p2 = 112.0792914978709;
  61         const double p3 = 33.912866078383;
  62         const double p4 = 6.37396220353165;
  63         const double p5 = .7003830644436881;
  64         const double p6 = .03526249659989109;
  65         const double q0 = 440.4137358247522;
  66         const double q1 = 793.8265125199484;
  67         const double q2 = 637.3336333788311;
  68         const double q3 = 296.5642487796737;
  69         const double q4 = 86.78073220294608;
  70         const double q5 = 16.06417757920695;
  71         const double q6 = 1.755667163182642;
  72         const double q7 = .08838834764831844;
  73         double expntl, z, p;
  74         z = fabs(x) * M_SQRT2;
  75         if (z > 37.) return x > 0.? 0. : 2.;
  76         expntl = exp(z * z * - .5);
  77         if (z < 10. / M_SQRT2) // for small z
  78             p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0)
  79                         / (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0);
  80         else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65)))));
  81         return x > 0.? 2. * p : 2. * (1. - p);
  82 }
  83
  84 /* The following computes regularized incomplete gamma functions.
  85  * Formulas are taken from Wiki, with additional input from Numerical
  86  * Recipes in C (for modified Lentz's algorithm) and AS245
  87  * (http://lib.stat.cmu.edu/apstat/245).
  88  *
  89  * A good online calculator is available at:
  90  *
  91  *   http://www.danielsoper.com/statcalc/calc23.aspx
  92  *
  93  * It calculates upper incomplete gamma function, which equals
  94  * kf_gammaq(s,z)*tgamma(s).
  95  */
  96
  97 #define KF_GAMMA_EPS 1e-14
  98 #define KF_TINY 1e-290
  99
 100 // regularized lower incomplete gamma function, by series expansion
 101 static double _kf_gammap(double s, double z)
 102 {
 103         double sum, x;
 104         int k;
 105         for (k = 1, sum = x = 1.; k < 100; ++k) {
 106                 sum += (x *= z / (s + k));
 107                 if (x / sum < KF_GAMMA_EPS) break;
 108         }
 109         return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum));
 110 }
 111 // regularized upper incomplete gamma function, by continued fraction
 112 static double _kf_gammaq(double s, double z)
 113 {
 114         int j;
 115         double C, D, f;
 116         f = 1. + z - s; C = f; D = 0.;
 117         // Modified Lentz's algorithm for computing continued fraction
 118         // See Numerical Recipes in C, 2nd edition, section 5.2
 119         for (j = 1; j < 100; ++j) {
 120                 double a = j * (s - j), b = (j<<1) + 1 + z - s, d;
 121                 D = b + a * D;
 122                 if (D < KF_TINY) D = KF_TINY;
 123                 C = b + a / C;
 124                 if (C < KF_TINY) C = KF_TINY;
 125                 D = 1. / D;
 126                 d = C * D;
 127                 f *= d;
 128                 if (fabs(d - 1.) < KF_GAMMA_EPS) break;
 129         }
 130         return exp(s * log(z) - z - kf_lgamma(s) - log(f));
 131 }
 132
 133 double kf_gammap(double s, double z)
 134 {
 135         return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z);
 136 }
 137
 138 double kf_gammaq(double s, double z)
 139 {
 140         return z <= 1. || z < s? 1. - _kf_gammap(s, z) : _kf_gammaq(s, z);
 141 }
 142
 143 /* Regularized incomplete beta function. The method is taken from
 144  * Numerical Recipe in C, 2nd edition, section 6.4. The following web
 145  * page calculates the incomplete beta function, which equals
 146  * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b):
 147  *
 148  *   http://www.danielsoper.com/statcalc/calc36.aspx
 149  */
 150 static double kf_betai_aux(double a, double b, double x)
 151 {
 152         double C, D, f;
 153         int j;
 154         if (x == 0.) return 0.;
 155         if (x == 1.) return 1.;
 156         f = 1.; C = f; D = 0.;
 157         // Modified Lentz's algorithm for computing continued fraction
 158         for (j = 1; j < 200; ++j) {
 159                 double aa, d;
 160                 int m = j>>1;
 161                 aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1))
 162                         : m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m));
 163                 D = 1. + aa * D;
 164                 if (D < KF_TINY) D = KF_TINY;
 165                 C = 1. + aa / C;
 166                 if (C < KF_TINY) C = KF_TINY;
 167                 D = 1. / D;
 168                 d = C * D;
 169                 f *= d;
 170                 if (fabs(d - 1.) < KF_GAMMA_EPS) break;
 171         }
 172         return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f;
 173 }
 174 double kf_betai(double a, double b, double x)
 175 {
 176         return x < (a + 1.) / (a + b + 2.)? kf_betai_aux(a, b, x) : 1. - kf_betai_aux(b, a, 1. - x);
 177 }
 178
 179 #ifdef KF_MAIN
 180 #include <stdio.h>
 181 int main(int argc, char *argv[])
 182 {
 183         double x = 5.5, y = 3;
 184         double a, b;
 185         printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x));
 186         printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y));
 187         a = 2; b = 2; x = 0.5;
 188         printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b)));
 189         return 0;
 190 }
 191 #endif
 192
 193
 194 // log\binom{n}{k}
 195 static double lbinom(int n, int k)
 196 {
 197     if (k == 0 || n == k) return 0;
 198     return lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1);
 199 }
 200
 201 // n11  n12  | n1_
 202 // n21  n22  | n2_
 203 //-----------+----
 204 // n_1  n_2  | n
 205
 206 // hypergeometric distribution
 207 static double hypergeo(int n11, int n1_, int n_1, int n)
 208 {
 209     return exp(lbinom(n1_, n11) + lbinom(n-n1_, n_1-n11) - lbinom(n, n_1));
 210 }
 211
 212 typedef struct {
 213     int n11, n1_, n_1, n;
 214     double p;
 215 } hgacc_t;
 216
 217 // incremental version of hypergenometric distribution
 218 static double hypergeo_acc(int n11, int n1_, int n_1, int n, hgacc_t *aux)
 219 {
 220     if (n1_ || n_1 || n) {
 221         aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n;
 222     } else { // then only n11 changed; the rest fixed
 223         if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) {
 224             if (n11 == aux->n11 + 1) { // incremental
 225                 aux->p *= (double)(aux->n1_ - aux->n11) / n11
 226                     * (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1);
 227                 aux->n11 = n11;
 228                 return aux->p;
 229             }
 230             if (n11 == aux->n11 - 1) { // incremental
 231                 aux->p *= (double)aux->n11 / (aux->n1_ - n11)
 232                     * (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11);
 233                 aux->n11 = n11;
 234                 return aux->p;
 235             }
 236         }
 237         aux->n11 = n11;
 238     }
 239     aux->p = hypergeo(aux->n11, aux->n1_, aux->n_1, aux->n);
 240     return aux->p;
 241 }
 242
 243 double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two)
 244 {
 245     int i, j, max, min;
 246     double p, q, left, right;
 247     hgacc_t aux;
 248     int n1_, n_1, n;
 249
 250     n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n
 251     max = (n_1 < n1_) ? n_1 : n1_; // max n11, for right tail
 252     min = n1_ + n_1 - n;    // not sure why n11-n22 is used instead of min(n_1,n1_)
 253     if (min < 0) min = 0; // min n11, for left tail
 254     *two = *_left = *_right = 1.;
 255     if (min == max) return 1.; // no need to do test
 256     q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table
 257     // left tail
 258     p = hypergeo_acc(min, 0, 0, 0, &aux);
 259     for (left = 0., i = min + 1; p < 0.99999999 * q && i<=max; ++i) // loop until underflow
 260         left += p, p = hypergeo_acc(i, 0, 0, 0, &aux);
 261     --i;
 262     if (p < 1.00000001 * q) left += p;
 263     else --i;
 264     // right tail
 265     p = hypergeo_acc(max, 0, 0, 0, &aux);
 266     for (right = 0., j = max - 1; p < 0.99999999 * q && j>=0; --j) // loop until underflow
 267         right += p, p = hypergeo_acc(j, 0, 0, 0, &aux);
 268     ++j;
 269     if (p < 1.00000001 * q) right += p;
 270     else ++j;
 271     // two-tail
 272     *two = left + right;
 273     if (*two > 1.) *two = 1.;
 274     // adjust left and right
 275     if (abs(i - n11) < abs(j - n11)) right = 1. - left + q;
 276     else left = 1.0 - right + q;
 277     *_left = left; *_right = right;
 278     return q;
 279 }
 280
 281
 282