modified: makefile
[GalaxyCodeBases.git] / BGI / soap_src / soap_builder / karlin.c
blob7ea3a217bbbd8cbeb0022684ebdfc7338f05da42
1 /*
2 This file is extracted from the software package on FSA-BLAST.
3 http://www.fsa-blast.org/
5 Wong Swee Seong
6 20-4-2006
8 */
9 // karlin.c
10 // Copyright (c) 2005, Michael Cameron
11 // Permission to use this code is freely granted under the BSD license agreement,
12 // provided that this statement is retained.
14 // This code has been copied from blastkar.c and modified to make it work standalone
15 // without using implementations of mathematical functions in ncbimath.c and other
16 // definitions in the NCBI BLAST toolkit.
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <math.h>
21 // wss: do not need to bring in blast.h
22 //#include "blast.h"
23 #include <string.h>
24 #include "karlin.h"
26 /**************** Statistical Significance Parameter Subroutine ****************
28 Version 1.0 February 2, 1990
29 Version 1.2 July 6, 1990
31 Program by: Stephen Altschul
33 Address: National Center for Biotechnology Information
34 National Library of Medicine
35 National Institutes of Health
36 Bethesda, MD 20894
38 Internet: altschul@ncbi.nlm.nih.gov
40 See: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical
41 Significance of Molecular Sequence Features by Using General Scoring
42 Schemes," Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268.
44 Computes the parameters lambda and K for use in calculating the
45 statistical significance of high-scoring segments or subalignments.
47 The scoring scheme must be integer valued. A positive score must be
48 possible, but the expected (mean) score must be negative.
50 A program that calls this routine must provide the value of the lowest
51 possible score, the value of the greatest possible score, and a pointer
52 to an array of probabilities for the occurence of all scores between
53 these two extreme scores. For example, if score -2 occurs with
54 probability 0.7, score 0 occurs with probability 0.1, and score 3
55 occurs with probability 0.2, then the subroutine must be called with
56 low = -2, high = 3, and pr pointing to the array of values
57 { 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide
58 pointers to lambda and K; the subroutine will then calculate the values
59 of these two parameters. In this example, lambda=0.330 and K=0.154.
61 The parameters lambda and K can be used as follows. Suppose we are
62 given a length N random sequence of independent letters. Associated
63 with each letter is a score, and the probabilities of the letters
64 determine the probability for each score. Let S be the aggregate score
65 of the highest scoring contiguous segment of this sequence. Then if N
66 is sufficiently large (greater than 100), the following bound on the
67 probability that S is greater than or equal to x applies:
69 P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ].
71 In other words, the p-value for this segment can be written as
72 1-exp[-KN*exp(-lambda*S)].
74 This formula can be applied to pairwise sequence comparison by assigning
75 scores to pairs of letters (e.g. amino acids), and by replacing N in the
76 formula with N*M, where N and M are the lengths of the two sequences
77 being compared.
79 In addition, letting y = KN*exp(-lambda*S), the p-value for finding m
80 distinct segments all with score >= S is given by:
82 2 m-1 -y
83 1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e
85 Notice that for m=1 this formula reduces to 1-exp(-y), which is the same
86 as the previous formula.
88 *******************************************************************************/
90 double BlastKarlinLHtoK(double* sfp, int4 min, int4 max, double lambda, double H);
91 double BlastKarlinLambdaNR(double* sfp, int4 min, int4 max);
92 double BlastKarlinLtoH(double* sfp, int4 min, int4 max, double lambda);
94 void BlastKarlinBlkCalc(double* scoreProbabilities, int4 min, int4 max)
96 /* Calculate the parameter Lambda */
97 BlastKarlin_lambda = BlastKarlinLambdaNR(scoreProbabilities, min, max);
99 /* Calculate H */
100 BlastKarlin_H = BlastKarlinLtoH(scoreProbabilities, min, max, BlastKarlin_lambda);
102 /* Calculate K */
103 BlastKarlin_K = BlastKarlinLHtoK(scoreProbabilities, min, max, BlastKarlin_lambda,
104 BlastKarlin_H);
107 #define DIMOFP0 (iter*range + 1)
108 #define DIMOFP0_MAX (BLAST_KARLIN_K_ITER_MAX*BLAST_SCORE_RANGE_MAX+1)
109 /****************************************************************************
110 For more accuracy in the calculation of K, set K_SUMLIMIT to 0.00001.
111 For high speed in the calculation of K, use a K_SUMLIMIT of 0.001
112 Note: statistical significance is often not greatly affected by the value
113 of K, so high accuracy is generally unwarranted.
114 *****************************************************************************/
115 /* K_SUMLIMIT_DEFAULT == sumlimit used in BlastKarlinLHtoK() */
116 #define BLAST_KARLIN_K_SUMLIMIT_DEFAULT 0.01
117 #define BLAST_KARLIN_K_ITER_MAX 100
119 SCORE_MIN is (-2**31 + 1)/2 because it has been observed more than once that
120 a compiler did not properly calculate the quantity (-2**31)/2. The list
121 of compilers which failed this operation have at least at some time included:
122 NeXT and a version of AIX/370's MetaWare High C R2.1r.
123 For this reason, SCORE_MIN is not simply defined to be LONG_MIN/2.
125 #define BLAST_SCORE_1MIN (-10000)
126 #define BLAST_SCORE_1MAX ( 1000)
127 #define BLAST_SCORE_RANGE_MAX (BLAST_SCORE_1MAX - BLAST_SCORE_1MIN)
128 /* Initial guess for the value of Lambda in BlastKarlinLambdaNR */
129 #define BLAST_KARLIN_LAMBDA0_DEFAULT 0.5
130 /* LAMBDA_ACCURACY_DEFAULT == accuracy to which Lambda should be calc'd */
131 #define BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT (1.e-5)
132 /* LAMBDA_ITER_DEFAULT == no. of iterations in LambdaBis = ln(accuracy)/ln(2)*/
133 #define BLAST_KARLIN_LAMBDA_ITER_DEFAULT 17
134 #define ABS(a) ((a)>=0?(a):-(a))
136 int gcd(int a, int b);
137 int4 BlastScoreChk(int4 lo, int4 hi);
139 double BlastKarlinLHtoK(double* sfp, int4 min, int4 max, double lambda, double H)
141 //#ifndef BLAST_KARLIN_STACKP
142 double *P0 = NULL;
143 //#else
144 // double P0 [DIMOFP0_MAX];
145 //#endif
146 int4 low; /* Lowest score (must be negative) */
147 int4 high; /* Highest score (must be positive) */
148 double K; /* local copy of K */
149 double ratio;
150 int4 i, j;
151 int4 range, lo, hi, first, last;
152 double sum;
153 double Sum, av, oldsum, oldsum2;
154 int4 iter;
155 double sumlimit;
156 double *p, *ptrP, *ptr1, *ptr2, *ptr1e;
157 double etolami, etolam;
159 if (lambda <= 0. || H <= 0.) {
160 return -1.;
163 low = min;
164 high = max;
165 range = high - low;
167 av = H/lambda;
168 etolam = exp((double)lambda);
169 if (low == -1 || high == 1) {
170 K = av;
171 return K * (1.0 - 1./etolam);
174 sumlimit = BLAST_KARLIN_K_SUMLIMIT_DEFAULT;
176 iter = BLAST_KARLIN_K_ITER_MAX;
178 if (DIMOFP0 > DIMOFP0_MAX) {
179 return -1.;
182 //#ifndef BLAST_KARLIN_STACKP
183 //P0 = (double *)global_malloc(DIMOFP0 * sizeof(*P0));
184 P0 = (double *)malloc(DIMOFP0 * sizeof(*P0));
185 if (P0 == NULL)
186 return -1.;
187 //#else
188 // memset((CharPtr)P0, 0, DIMOFP0*sizeof(P0[0]));
189 //#endif
191 Sum = 0.;
192 lo = hi = 0;
193 p = &sfp[low];
194 P0[0] = sum = oldsum = oldsum2 = 1.;
195 for (j = 0; j < iter && sum > sumlimit; Sum += sum /= ++j) {
196 first = last = range;
197 lo += low;
198 hi += high;
199 for (ptrP = P0+(hi-lo); ptrP >= P0; *ptrP-- =sum) {
200 ptr1 = ptrP - first;
201 ptr1e = ptrP - last;
202 ptr2 = p + first;
203 for (sum = 0.; ptr1 >= ptr1e; )
204 sum += *ptr1-- * *ptr2++;
205 if (first)
206 --first;
207 if (ptrP - P0 <= range)
208 --last;
210 etolami = pow((double)etolam, lo - 1);
211 for (sum = 0., i = lo; i != 0; ++i) {
212 etolami *= etolam;
213 sum += *++ptrP * etolami;
215 for (; i <= hi; ++i)
216 sum += *++ptrP;
217 oldsum2 = oldsum;
218 oldsum = sum;
221 /* Terms of geometric progression added for correction */
222 ratio = oldsum / oldsum2;
223 if (ratio >= (1.0 - sumlimit*0.001)) {
224 K = -1.;
225 goto CleanUp;
227 sumlimit *= 0.01;
228 while (sum > sumlimit) {
229 oldsum *= ratio;
230 Sum += sum = oldsum / ++j;
233 /* Look for the greatest common divisor ("delta" in Appendix of PNAS 87 of
234 Karlin&Altschul (1990) */
235 for (i = 1, j = -low; i <= range && j > 1; ++i)
236 if (p[i])
237 j = gcd(j, i);
239 if (j*etolam > 0.05)
241 etolami = pow((double)etolam, -j);
242 K = j*exp((double)-2.0*Sum) / (av*(1.0 - etolami));
244 else
245 K = -j*exp((double)-2.0*Sum) / (av*(exp(-j*(double)lambda) - 1));
247 CleanUp:
248 //#ifndef BLAST_KARLIN_K_STACKP
249 if (P0 != NULL)
250 free(P0);
251 //#endif
252 return K;
256 BlastKarlinLambdaBis
258 Calculate Lambda using the bisection method (slow).
260 double BlastKarlinLambdaBis(double* sfp, int4 min, int4 max)
262 double* sprob;
263 double lambda, up, newval;
264 int4 i, low, high;
265 int4 j;
266 double sum, x0, x1;
268 low = min;
269 high = max;
270 if (BlastScoreChk(low, high) != 0)
271 return -1.;
273 sprob = sfp;
274 up = BLAST_KARLIN_LAMBDA0_DEFAULT;
275 for (lambda=0.; ; ) {
276 up *= 2;
277 x0 = exp((double)up);
278 x1 = pow((double)x0, low - 1);
279 if (x1 > 0.) {
280 for (sum=0., i=low; i<=high; ++i)
281 sum += sprob[i] * (x1 *= x0);
283 else {
284 for (sum=0., i=low; i<=high; ++i)
285 sum += sprob[i] * exp(up * i);
287 if (sum >= 1.0)
288 break;
289 lambda = up;
292 for (j=0; j<BLAST_KARLIN_LAMBDA_ITER_DEFAULT; ++j) {
293 newval = (lambda + up) / 2.;
294 x0 = exp((double)newval);
295 x1 = pow((double)x0, low - 1);
296 if (x1 > 0.) {
297 for (sum=0., i=low; i<=high; ++i)
298 sum += sprob[i] * (x1 *= x0);
300 else {
301 for (sum=0., i=low; i<=high; ++i)
302 sum += sprob[i] * exp(newval * i);
304 if (sum > 1.0)
305 up = newval;
306 else
307 lambda = newval;
309 return (lambda + up) / 2.;
312 /******************* Fast Lambda Calculation Subroutine ************************
313 Version 1.0 May 16, 1991
314 Program by: Stephen Altschul
316 Uses Newton-Raphson method (fast) to solve for Lambda, given an initial
317 guess (lambda0) obtained perhaps by the bisection method.
318 *******************************************************************************/
319 double BlastKarlinLambdaNR(double* sfp, int4 min, int4 max)
321 int4 low; /* Lowest score (must be negative) */
322 int4 high; /* Highest score (must be positive) */
323 int4 j;
324 int4 i;
325 double *sprob;
326 double lambda0, sum, slope, temp, x0, x1, amt;
328 low = min;
329 high = max;
330 if (BlastScoreChk(low, high) != 0)
331 return -1.;
333 lambda0 = BLAST_KARLIN_LAMBDA0_DEFAULT;
335 /* Calculate lambda */
336 sprob = sfp;
337 for (j=0; j<20; ++j) { /* limit of 20 should never be close-approached */
338 sum = -1.0;
339 slope = 0.0;
340 if (lambda0 < 0.01)
341 break;
342 x0 = exp((double)lambda0);
343 x1 = pow((double)x0, low - 1);
344 if (x1 == 0.)
345 break;
346 for (i=low; i<=high; ++i) {
347 sum += (temp = sprob[i] * (x1 *= x0));
348 slope += temp * i;
350 lambda0 -= (amt = sum/slope);
351 if (ABS(amt/lambda0) < BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT) {
353 Does it appear that we may be on the verge of converging
354 to the ever-present, zero-valued solution?
356 if (lambda0 > BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT)
357 return lambda0;
358 break;
361 return BlastKarlinLambdaBis(sfp, min, max);
366 BlastKarlinLtoH
368 Calculate H, the relative entropy of the p's and q's
370 double BlastKarlinLtoH(double* sfp, int4 min, int4 max, double lambda)
372 int4 score;
373 double av, etolam, etolami;
375 if (lambda < 0.) {
376 return -1.;
378 if (BlastScoreChk(min, max) != 0)
379 return -1.;
381 etolam = exp((double)lambda);
382 etolami = pow((double)etolam, min - 1);
383 if (etolami > 0.)
385 av = 0.0;
386 for (score=min; score<=max; score++)
387 av += sfp[score] * score * (etolami *= etolam);
389 else
391 av = 0.0;
392 for (score=min; score<=max; score++)
393 av += sfp[score] * score * exp(lambda * score);
396 return lambda * av;
399 int gcd(int a, int b)
401 int c;
403 b = ABS(b);
404 if (b > a)
405 c=a, a=b, b=c;
407 while (b != 0) {
408 c = a%b;
409 a = b;
410 b = c;
412 return a;
415 int4 BlastScoreChk(int4 lo, int4 hi)
417 if (lo >= 0 || hi <= 0 ||
418 lo < BLAST_SCORE_1MIN || hi > BLAST_SCORE_1MAX)
419 return 1;
421 if (hi - lo > BLAST_SCORE_RANGE_MAX)
422 return 1;
424 return 0;
428 * Computes the adjustment to the lengths of the query and database sequences
429 * that is used to compensate for edge effects when computing evalues.
431 * The length adjustment is an integer-valued approximation to the fixed
432 * point of the function
434 * f(ell) = beta +
435 * (alpha/lambda) * (log K + log((m - ell)*(n - N ell)))
437 * where m is the query length n is the length of the database and N is the
438 * number of sequences in the database. The values beta, alpha, lambda and
439 * K are statistical, Karlin-Altschul parameters.
441 * The value of the length adjustment computed by this routine, A,
442 * will always be an integer smaller than the fixed point of
443 * f(ell). Usually, it will be the largest such integer. However, the
444 * computed length adjustment, A, will also be so small that
446 * K * (m - A) * (n - N * A) > min(m,n).
448 * Moreover, an iterative method is used to compute A, and under
449 * unusual circumstances the iterative method may not converge.
451 * @param K the statistical parameter K
452 * @param logK the natural logarithm of K
453 * @param alpha_d_lambda the ratio of the statistical parameters
454 * alpha and lambda (for ungapped alignments, the
455 * value 1/H should be used)
456 * @param beta the statistical parameter beta (for ungapped
457 * alignments, beta == 0)
458 * @param query_length the length of the query sequence
459 * @param db_length the length of the database
460 * @param db_num_seq the number of sequences in the database
461 * @param length_adjustment the computed value of the length adjustment [out]
463 * @return 0 if length_adjustment is known to be the largest integer less
464 * than the fixed point of f(ell); 1 otherwise.
467 #define maximum(a,b) ((a > b) ? a : b)
469 int4 BlastComputeLengthAdjustment(double K,
470 double logK,
471 double alpha_d_lambda,
472 double beta,
473 int4 query_length,
474 uint4 db_length,
475 int4 db_num_seqs,
476 int4 *length_adjustment)
478 int4 i; /* iteration index */
479 const int4 maxits = 20; /* maximum allowed iterations */
480 double m = query_length, n = db_length, N = db_num_seqs;
482 double ell; /* A float value of the length adjustment */
483 double ss; /* effective size of the search space */
484 double ell_min = 0, ell_max; /* At each iteration i,
485 * ell_min <= ell <= ell_max. */
486 char converged = 0; /* True if the iteration converged */
487 double ell_next = 0; /* Value the variable ell takes at iteration
488 * i + 1 */
489 /* Choose ell_max to be the largest nonnegative value that satisfies
491 * K * (m - ell) * (n - N * ell) > max(m,n)
493 * Use quadratic formula: 2 c /( - b + sqrt( b*b - 4 * a * c )) */
494 { /* scope of a, mb, and c, the coefficients in the quadratic formula
495 * (the variable mb is -b) */
496 double a = N;
497 double mb = m * N + n;
498 double c = n * m - maximum(m, n) / K;
500 if(c < 0) {
501 *length_adjustment = 0;
502 return 1;
503 } else {
504 ell_max = 2 * c / (mb + sqrt(mb * mb - 4 * a * c));
506 } /* end scope of a, mb and c */
508 for(i = 1; i <= maxits; i++) { /* for all iteration indices */
509 double ell_bar; /* proposed next value of ell */
510 ell = ell_next;
511 ss = (m - ell) * (n - N * ell);
512 ell_bar = alpha_d_lambda * (logK + log(ss)) + beta;
513 if(ell_bar >= ell) { /* ell is no bigger than the true fixed point */
514 ell_min = ell;
515 if(ell_bar - ell_min <= 1.0) {
516 converged = 1;
517 break;
519 if(ell_min == ell_max) { /* There are no more points to check */
520 break;
522 } else { /* else ell is greater than the true fixed point */
523 ell_max = ell;
525 if(ell_min <= ell_bar && ell_bar <= ell_max) {
526 /* ell_bar is in range. Accept it */
527 ell_next = ell_bar;
528 } else { /* else ell_bar is not in range. Reject it */
529 ell_next = (i == 1) ? ell_max : (ell_min + ell_max) / 2;
531 } /* end for all iteration indices */
532 if(converged) { /* the iteration converged */
533 /* If ell_fixed is the (unknown) true fixed point, then we
534 * wish to set (*length_adjustment) to floor(ell_fixed). We
535 * assume that floor(ell_min) = floor(ell_fixed) */
536 *length_adjustment = (int4) ell_min;
537 /* But verify that ceil(ell_min) != floor(ell_fixed) */
538 ell = ceil(ell_min);
539 if( ell <= ell_max ) {
540 ss = (m - ell) * (n - N * ell);
541 if(alpha_d_lambda * (logK + log(ss)) + beta >= ell) {
542 /* ceil(ell_min) == floor(ell_fixed) */
543 *length_adjustment = (int4) ell;
546 } else { /* else the iteration did not converge. */
547 /* Use the best value seen so far */
548 *length_adjustment = (int4) ell_min;
551 return converged ? 0 : 1;