BGI/soap_src/soap_builder/karlin.c

   1 /*
   2    This file is extracted from the software package on FSA-BLAST.
   3    http://www.fsa-blast.org/
   4
   5    Wong Swee Seong
   6    20-4-2006
   7
   8 */
   9 // karlin.c
  10 // Copyright (c) 2005, Michael Cameron
  11 // Permission to use this code is freely granted under the BSD license agreement,
  12 // provided that this statement is retained.
  13 //
  14 // This code has been copied from blastkar.c and modified to make it work standalone
  15 // without using implementations of mathematical functions in ncbimath.c and other
  16 // definitions in the NCBI BLAST toolkit.
  17
  18 #include <stdio.h>
  19 #include <stdlib.h>
  20 #include <math.h>
  21 // wss: do not need to bring in blast.h
  22 //#include "blast.h"
  23 #include <string.h>
  24 #include "karlin.h"
  25
  26 /**************** Statistical Significance Parameter Subroutine ****************
  27
  28     Version 1.0     February 2, 1990
  29     Version 1.2     July 6,     1990
  30
  31     Program by:     Stephen Altschul
  32
  33     Address:        National Center for Biotechnology Information
  34                     National Library of Medicine
  35                     National Institutes of Health
  36                     Bethesda, MD  20894
  37
  38     Internet:       altschul@ncbi.nlm.nih.gov
  39
  40 See:  Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical
  41     Significance of Molecular Sequence Features by Using General Scoring
  42     Schemes,"  Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268.
  43
  44     Computes the parameters lambda and K for use in calculating the
  45     statistical significance of high-scoring segments or subalignments.
  46
  47     The scoring scheme must be integer valued.  A positive score must be
  48     possible, but the expected (mean) score must be negative.
  49
  50     A program that calls this routine must provide the value of the lowest
  51     possible score, the value of the greatest possible score, and a pointer
  52     to an array of probabilities for the occurence of all scores between
  53     these two extreme scores.  For example, if score -2 occurs with
  54     probability 0.7, score 0 occurs with probability 0.1, and score 3
  55     occurs with probability 0.2, then the subroutine must be called with
  56     low = -2, high = 3, and pr pointing to the array of values
  57     { 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }.  The calling program must also provide
  58     pointers to lambda and K; the subroutine will then calculate the values
  59     of these two parameters.  In this example, lambda=0.330 and K=0.154.
  60
  61     The parameters lambda and K can be used as follows.  Suppose we are
  62     given a length N random sequence of independent letters.  Associated
  63     with each letter is a score, and the probabilities of the letters
  64     determine the probability for each score.  Let S be the aggregate score
  65     of the highest scoring contiguous segment of this sequence.  Then if N
  66     is sufficiently large (greater than 100), the following bound on the
  67     probability that S is greater than or equal to x applies:
  68
  69             P( S >= x )   <=   1 - exp [ - KN exp ( - lambda * x ) ].
  70
  71     In other words, the p-value for this segment can be written as
  72     1-exp[-KN*exp(-lambda*S)].
  73
  74     This formula can be applied to pairwise sequence comparison by assigning
  75     scores to pairs of letters (e.g. amino acids), and by replacing N in the
  76     formula with N*M, where N and M are the lengths of the two sequences
  77     being compared.
  78
  79     In addition, letting y = KN*exp(-lambda*S), the p-value for finding m
  80     distinct segments all with score >= S is given by:
  81
  82                            2             m-1           -y
  83             1 - [ 1 + y + y /2! + ... + y   /(m-1)! ] e
  84
  85     Notice that for m=1 this formula reduces to 1-exp(-y), which is the same
  86     as the previous formula.
  87
  88 *******************************************************************************/
  89
  90 double BlastKarlinLHtoK(double* sfp, int4 min, int4 max, double lambda, double H);
  91 double BlastKarlinLambdaNR(double* sfp, int4 min, int4 max);
  92 double BlastKarlinLtoH(double* sfp, int4 min, int4 max, double lambda);
  93
  94 void BlastKarlinBlkCalc(double* scoreProbabilities, int4 min, int4 max)
  95 {
  96         /* Calculate the parameter Lambda */
  97         BlastKarlin_lambda = BlastKarlinLambdaNR(scoreProbabilities, min, max);
  98
  99         /* Calculate H */
 100         BlastKarlin_H = BlastKarlinLtoH(scoreProbabilities, min, max, BlastKarlin_lambda);
 101
 102         /* Calculate K */
 103         BlastKarlin_K = BlastKarlinLHtoK(scoreProbabilities, min, max, BlastKarlin_lambda,
 104                                          BlastKarlin_H);
 105 }
 106
 107 #define DIMOFP0 (iter*range + 1)
 108 #define DIMOFP0_MAX (BLAST_KARLIN_K_ITER_MAX*BLAST_SCORE_RANGE_MAX+1)
 109 /****************************************************************************
 110 For more accuracy in the calculation of K, set K_SUMLIMIT to 0.00001.
 111 For high speed in the calculation of K, use a K_SUMLIMIT of 0.001
 112 Note:  statistical significance is often not greatly affected by the value
 113 of K, so high accuracy is generally unwarranted.
 114 *****************************************************************************/
 115 /* K_SUMLIMIT_DEFAULT == sumlimit used in BlastKarlinLHtoK() */
 116 #define BLAST_KARLIN_K_SUMLIMIT_DEFAULT 0.01
 117 #define BLAST_KARLIN_K_ITER_MAX 100
 118 /*
 119 SCORE_MIN is (-2**31 + 1)/2 because it has been observed more than once that
 120 a compiler did not properly calculate the quantity (-2**31)/2.  The list
 121 of compilers which failed this operation have at least at some time included:
 122 NeXT and a version of AIX/370's MetaWare High C R2.1r.
 123 For this reason, SCORE_MIN is not simply defined to be LONG_MIN/2.
 124 */
 125 #define BLAST_SCORE_1MIN (-10000)
 126 #define BLAST_SCORE_1MAX ( 1000)
 127 #define BLAST_SCORE_RANGE_MAX   (BLAST_SCORE_1MAX - BLAST_SCORE_1MIN)
 128 /* Initial guess for the value of Lambda in BlastKarlinLambdaNR */
 129 #define BLAST_KARLIN_LAMBDA0_DEFAULT    0.5
 130 /* LAMBDA_ACCURACY_DEFAULT == accuracy to which Lambda should be calc'd */
 131 #define BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT    (1.e-5)
 132 /* LAMBDA_ITER_DEFAULT == no. of iterations in LambdaBis = ln(accuracy)/ln(2)*/
 133 #define BLAST_KARLIN_LAMBDA_ITER_DEFAULT        17
 134 #define ABS(a)  ((a)>=0?(a):-(a))
 135
 136 int gcd(int a, int b);
 137 int4 BlastScoreChk(int4 lo, int4 hi);
 138
 139 double BlastKarlinLHtoK(double* sfp, int4 min, int4 max, double lambda, double H)
 140 {
 141 //#ifndef BLAST_KARLIN_STACKP
 142         double *P0 = NULL;
 143 //#else
 144 //      double P0 [DIMOFP0_MAX];
 145 //#endif
 146         int4    low;    /* Lowest score (must be negative) */
 147         int4    high;   /* Highest score (must be positive) */
 148         double  K;                      /* local copy of K */
 149         double  ratio;
 150         int4            i, j;
 151         int4    range, lo, hi, first, last;
 152         double  sum;
 153         double  Sum, av, oldsum, oldsum2;
 154         int4            iter;
 155         double  sumlimit;
 156         double  *p, *ptrP, *ptr1, *ptr2, *ptr1e;
 157         double  etolami, etolam;
 158
 159         if (lambda <= 0. || H <= 0.) {
 160                 return -1.;
 161         }
 162
 163         low = min;
 164         high = max;
 165         range = high - low;
 166
 167         av = H/lambda;
 168         etolam = exp((double)lambda);
 169         if (low == -1 || high == 1) {
 170                 K = av;
 171                 return K * (1.0 - 1./etolam);
 172         }
 173
 174         sumlimit = BLAST_KARLIN_K_SUMLIMIT_DEFAULT;
 175
 176         iter = BLAST_KARLIN_K_ITER_MAX;
 177
 178         if (DIMOFP0 > DIMOFP0_MAX) {
 179                 return -1.;
 180         }
 181
 182 //#ifndef BLAST_KARLIN_STACKP
 183         //P0 = (double *)global_malloc(DIMOFP0 * sizeof(*P0));
 184         P0 = (double *)malloc(DIMOFP0 * sizeof(*P0));
 185         if (P0 == NULL)
 186                 return -1.;
 187 //#else
 188 //      memset((CharPtr)P0, 0, DIMOFP0*sizeof(P0[0]));
 189 //#endif
 190
 191         Sum = 0.;
 192         lo = hi = 0;
 193         p = &sfp[low];
 194         P0[0] = sum = oldsum = oldsum2 = 1.;
 195     for (j = 0; j < iter && sum > sumlimit; Sum += sum /= ++j) {
 196         first = last = range;
 197                 lo += low;
 198                 hi += high;
 199         for (ptrP = P0+(hi-lo); ptrP >= P0; *ptrP-- =sum) {
 200             ptr1 = ptrP - first;
 201             ptr1e = ptrP - last;
 202             ptr2 = p + first;
 203             for (sum = 0.; ptr1 >= ptr1e; )
 204                 sum += *ptr1--  *  *ptr2++;
 205             if (first)
 206                 --first;
 207             if (ptrP - P0 <= range)
 208                 --last;
 209         }
 210                 etolami = pow((double)etolam, lo - 1);
 211         for (sum = 0., i = lo; i != 0; ++i) {
 212                         etolami *= etolam;
 213             sum += *++ptrP * etolami;
 214                 }
 215         for (; i <= hi; ++i)
 216             sum += *++ptrP;
 217                 oldsum2 = oldsum;
 218                 oldsum = sum;
 219     }
 220
 221         /* Terms of geometric progression added for correction */
 222         ratio = oldsum / oldsum2;
 223         if (ratio >= (1.0 - sumlimit*0.001)) {
 224                 K = -1.;
 225                 goto CleanUp;
 226         }
 227         sumlimit *= 0.01;
 228         while (sum > sumlimit) {
 229                 oldsum *= ratio;
 230                 Sum += sum = oldsum / ++j;
 231         }
 232
 233 /* Look for the greatest common divisor ("delta" in Appendix of PNAS 87 of
 234 Karlin&Altschul (1990) */
 235         for (i = 1, j = -low; i <= range && j > 1; ++i)
 236                 if (p[i])
 237                         j = gcd(j, i);
 238
 239         if (j*etolam > 0.05)
 240         {
 241                 etolami = pow((double)etolam, -j);
 242                 K = j*exp((double)-2.0*Sum) / (av*(1.0 - etolami));
 243         }
 244         else
 245             K = -j*exp((double)-2.0*Sum) / (av*(exp(-j*(double)lambda) - 1));
 246
 247 CleanUp:
 248 //#ifndef BLAST_KARLIN_K_STACKP
 249         if (P0 != NULL)
 250                 free(P0);
 251 //#endif
 252         return K;
 253 }
 254
 255 /*
 256         BlastKarlinLambdaBis
 257
 258         Calculate Lambda using the bisection method (slow).
 259 */
 260 double BlastKarlinLambdaBis(double* sfp, int4 min, int4 max)
 261 {
 262         double* sprob;
 263         double lambda, up, newval;
 264         int4    i, low, high;
 265         int4            j;
 266         double sum, x0, x1;
 267
 268         low = min;
 269         high = max;
 270         if (BlastScoreChk(low, high) != 0)
 271                 return -1.;
 272
 273         sprob = sfp;
 274         up = BLAST_KARLIN_LAMBDA0_DEFAULT;
 275         for (lambda=0.; ; ) {
 276                 up *= 2;
 277                 x0 = exp((double)up);
 278                 x1 = pow((double)x0, low - 1);
 279                 if (x1 > 0.) {
 280                         for (sum=0., i=low; i<=high; ++i)
 281                                 sum += sprob[i] * (x1 *= x0);
 282                 }
 283                 else {
 284                         for (sum=0., i=low; i<=high; ++i)
 285                                 sum += sprob[i] * exp(up * i);
 286                 }
 287                 if (sum >= 1.0)
 288                         break;
 289                 lambda = up;
 290         }
 291
 292         for (j=0; j<BLAST_KARLIN_LAMBDA_ITER_DEFAULT; ++j) {
 293                 newval = (lambda + up) / 2.;
 294                 x0 = exp((double)newval);
 295                 x1 = pow((double)x0, low - 1);
 296                 if (x1 > 0.) {
 297                         for (sum=0., i=low; i<=high; ++i)
 298                                 sum += sprob[i] * (x1 *= x0);
 299                 }
 300                 else {
 301                         for (sum=0., i=low; i<=high; ++i)
 302                                 sum += sprob[i] * exp(newval * i);
 303                 }
 304                 if (sum > 1.0)
 305                         up = newval;
 306                 else
 307                         lambda = newval;
 308         }
 309         return (lambda + up) / 2.;
 310 }
 311
 312 /******************* Fast Lambda Calculation Subroutine ************************
 313         Version 1.0     May 16, 1991
 314         Program by:     Stephen Altschul
 315
 316         Uses Newton-Raphson method (fast) to solve for Lambda, given an initial
 317         guess (lambda0) obtained perhaps by the bisection method.
 318 *******************************************************************************/
 319 double BlastKarlinLambdaNR(double* sfp, int4 min, int4 max)
 320 {
 321         int4    low;                    /* Lowest score (must be negative)  */
 322         int4    high;                   /* Highest score (must be positive) */
 323         int4    j;
 324         int4    i;
 325         double *sprob;
 326         double lambda0, sum, slope, temp, x0, x1, amt;
 327
 328         low = min;
 329         high = max;
 330         if (BlastScoreChk(low, high) != 0)
 331                 return -1.;
 332
 333         lambda0 = BLAST_KARLIN_LAMBDA0_DEFAULT;
 334
 335         /* Calculate lambda */
 336         sprob = sfp;
 337         for (j=0; j<20; ++j) { /* limit of 20 should never be close-approached */
 338                 sum = -1.0;
 339                 slope = 0.0;
 340                 if (lambda0 < 0.01)
 341                         break;
 342                 x0 = exp((double)lambda0);
 343                 x1 = pow((double)x0, low - 1);
 344                 if (x1 == 0.)
 345                         break;
 346                 for (i=low; i<=high; ++i) {
 347                         sum += (temp = sprob[i] * (x1 *= x0));
 348                         slope += temp * i;
 349                 }
 350                 lambda0 -= (amt = sum/slope);
 351                 if (ABS(amt/lambda0) < BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT) {
 352                         /*
 353                         Does it appear that we may be on the verge of converging
 354                         to the ever-present, zero-valued solution?
 355                         */
 356                         if (lambda0 > BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT)
 357                                 return lambda0;
 358                         break;
 359                 }
 360         }
 361         return BlastKarlinLambdaBis(sfp, min, max);
 362 }
 363
 364
 365 /*
 366         BlastKarlinLtoH
 367
 368         Calculate H, the relative entropy of the p's and q's
 369 */
 370 double BlastKarlinLtoH(double* sfp, int4 min, int4 max, double lambda)
 371 {
 372         int4 score;
 373         double av, etolam, etolami;
 374
 375         if (lambda < 0.) {
 376                 return -1.;
 377         }
 378         if (BlastScoreChk(min, max) != 0)
 379                 return -1.;
 380
 381         etolam = exp((double)lambda);
 382         etolami = pow((double)etolam, min - 1);
 383         if (etolami > 0.)
 384         {
 385             av = 0.0;
 386             for (score=min; score<=max; score++)
 387                         av += sfp[score] * score * (etolami *= etolam);
 388         }
 389         else
 390         {
 391             av = 0.0;
 392             for (score=min; score<=max; score++)
 393                         av += sfp[score] * score * exp(lambda * score);
 394         }
 395
 396         return lambda * av;
 397 }
 398
 399 int gcd(int a, int b)
 400 {
 401         int c;
 402
 403         b = ABS(b);
 404         if (b > a)
 405                 c=a, a=b, b=c;
 406
 407         while (b != 0) {
 408                 c = a%b;
 409                 a = b;
 410                 b = c;
 411         }
 412         return a;
 413 }
 414
 415 int4 BlastScoreChk(int4 lo, int4 hi)
 416 {
 417         if (lo >= 0 || hi <= 0 ||
 418                         lo < BLAST_SCORE_1MIN || hi > BLAST_SCORE_1MAX)
 419                 return 1;
 420
 421         if (hi - lo > BLAST_SCORE_RANGE_MAX)
 422                 return 1;
 423
 424         return 0;
 425 }
 426
 427 /**
 428  * Computes the adjustment to the lengths of the query and database sequences
 429  * that is used to compensate for edge effects when computing evalues.
 430  *
 431  * The length adjustment is an integer-valued approximation to the fixed
 432  * point of the function
 433  *
 434  *    f(ell) = beta +
 435  *               (alpha/lambda) * (log K + log((m - ell)*(n - N ell)))
 436  *
 437  * where m is the query length n is the length of the database and N is the
 438  * number of sequences in the database. The values beta, alpha, lambda and
 439  * K are statistical, Karlin-Altschul parameters.
 440  *
 441  * The value of the length adjustment computed by this routine, A,
 442  * will always be an integer smaller than the fixed point of
 443  * f(ell). Usually, it will be the largest such integer.  However, the
 444  * computed length adjustment, A, will also be so small that
 445  *
 446  *    K * (m - A) * (n - N * A) > min(m,n).
 447  *
 448  * Moreover, an iterative method is used to compute A, and under
 449  * unusual circumstances the iterative method may not converge.
 450  *
 451  * @param K      the statistical parameter K
 452  * @param logK   the natural logarithm of K
 453  * @param alpha_d_lambda    the ratio of the statistical parameters
 454  *                          alpha and lambda (for ungapped alignments, the
 455  *                          value 1/H should be used)
 456  * @param beta              the statistical parameter beta (for ungapped
 457  *                          alignments, beta == 0)
 458  * @param query_length      the length of the query sequence
 459  * @param db_length         the length of the database
 460  * @param db_num_seq        the number of sequences in the database
 461  * @param length_adjustment the computed value of the length adjustment [out]
 462  *
 463  * @return   0 if length_adjustment is known to be the largest integer less
 464  *           than the fixed point of f(ell); 1 otherwise.
 465  */
 466
 467 #define maximum(a,b) ((a > b) ? a : b)
 468
 469 int4 BlastComputeLengthAdjustment(double K,
 470                              double logK,
 471                              double alpha_d_lambda,
 472                              double beta,
 473                              int4 query_length,
 474                              uint4 db_length,
 475                              int4 db_num_seqs,
 476                              int4 *length_adjustment)
 477 {
 478     int4 i;                     /* iteration index */
 479     const int4 maxits = 20;     /* maximum allowed iterations */
 480     double m = query_length, n = db_length, N = db_num_seqs;
 481
 482     double ell;            /* A float value of the length adjustment */
 483     double ss;             /* effective size of the search space */
 484     double ell_min = 0, ell_max;   /* At each iteration i,
 485                                          * ell_min <= ell <= ell_max. */
 486     char converged = 0;       /* True if the iteration converged */
 487     double ell_next = 0;   /* Value the variable ell takes at iteration
 488                                  * i + 1 */
 489     /* Choose ell_max to be the largest nonnegative value that satisfies
 490      *
 491      *    K * (m - ell) * (n - N * ell) > max(m,n)
 492      *
 493      * Use quadratic formula: 2 c /( - b + sqrt( b*b - 4 * a * c )) */
 494     { /* scope of a, mb, and c, the coefficients in the quadratic formula
 495        * (the variable mb is -b) */
 496         double a  = N;
 497         double mb = m * N + n;
 498         double c  = n * m - maximum(m, n) / K;
 499
 500         if(c < 0) {
 501             *length_adjustment = 0;
 502             return 1;
 503         } else {
 504             ell_max = 2 * c / (mb + sqrt(mb * mb - 4 * a * c));
 505         }
 506     } /* end scope of a, mb and c */
 507
 508     for(i = 1; i <= maxits; i++) {      /* for all iteration indices */
 509         double ell_bar;    /* proposed next value of ell */
 510         ell      = ell_next;
 511         ss       = (m - ell) * (n - N * ell);
 512         ell_bar  = alpha_d_lambda * (logK + log(ss)) + beta;
 513         if(ell_bar >= ell) { /* ell is no bigger than the true fixed point */
 514             ell_min = ell;
 515             if(ell_bar - ell_min <= 1.0) {
 516                 converged = 1;
 517                 break;
 518             }
 519             if(ell_min == ell_max) { /* There are no more points to check */
 520                 break;
 521             }
 522         } else { /* else ell is greater than the true fixed point */
 523             ell_max = ell;
 524         }
 525         if(ell_min <= ell_bar && ell_bar <= ell_max) {
 526           /* ell_bar is in range. Accept it */
 527             ell_next = ell_bar;
 528         } else { /* else ell_bar is not in range. Reject it */
 529             ell_next = (i == 1) ? ell_max : (ell_min + ell_max) / 2;
 530         }
 531     } /* end for all iteration indices */
 532     if(converged) { /* the iteration converged */
 533         /* If ell_fixed is the (unknown) true fixed point, then we
 534          * wish to set (*length_adjustment) to floor(ell_fixed).  We
 535          * assume that floor(ell_min) = floor(ell_fixed) */
 536         *length_adjustment = (int4) ell_min;
 537         /* But verify that ceil(ell_min) != floor(ell_fixed) */
 538         ell = ceil(ell_min);
 539         if( ell <= ell_max ) {
 540           ss = (m - ell) * (n - N * ell);
 541           if(alpha_d_lambda * (logK + log(ss)) + beta >= ell) {
 542             /* ceil(ell_min) == floor(ell_fixed) */
 543             *length_adjustment = (int4) ell;
 544           }
 545         }
 546     } else { /* else the iteration did not converge. */
 547         /* Use the best value seen so far */
 548         *length_adjustment = (int4) ell_min;
 549     }
 550
 551     return converged ? 0 : 1;
 552 }