2 This file is extracted from the software package on FSA-BLAST.
3 http://www.fsa-blast.org/
10 // Copyright (c) 2005, Michael Cameron
11 // Permission to use this code is freely granted under the BSD license agreement,
12 // provided that this statement is retained.
14 // This code has been copied from blastkar.c and modified to make it work standalone
15 // without using implementations of mathematical functions in ncbimath.c and other
16 // definitions in the NCBI BLAST toolkit.
21 // wss: do not need to bring in blast.h
26 /**************** Statistical Significance Parameter Subroutine ****************
28 Version 1.0 February 2, 1990
29 Version 1.2 July 6, 1990
31 Program by: Stephen Altschul
33 Address: National Center for Biotechnology Information
34 National Library of Medicine
35 National Institutes of Health
38 Internet: altschul@ncbi.nlm.nih.gov
40 See: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical
41 Significance of Molecular Sequence Features by Using General Scoring
42 Schemes," Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268.
44 Computes the parameters lambda and K for use in calculating the
45 statistical significance of high-scoring segments or subalignments.
47 The scoring scheme must be integer valued. A positive score must be
48 possible, but the expected (mean) score must be negative.
50 A program that calls this routine must provide the value of the lowest
51 possible score, the value of the greatest possible score, and a pointer
52 to an array of probabilities for the occurence of all scores between
53 these two extreme scores. For example, if score -2 occurs with
54 probability 0.7, score 0 occurs with probability 0.1, and score 3
55 occurs with probability 0.2, then the subroutine must be called with
56 low = -2, high = 3, and pr pointing to the array of values
57 { 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide
58 pointers to lambda and K; the subroutine will then calculate the values
59 of these two parameters. In this example, lambda=0.330 and K=0.154.
61 The parameters lambda and K can be used as follows. Suppose we are
62 given a length N random sequence of independent letters. Associated
63 with each letter is a score, and the probabilities of the letters
64 determine the probability for each score. Let S be the aggregate score
65 of the highest scoring contiguous segment of this sequence. Then if N
66 is sufficiently large (greater than 100), the following bound on the
67 probability that S is greater than or equal to x applies:
69 P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ].
71 In other words, the p-value for this segment can be written as
72 1-exp[-KN*exp(-lambda*S)].
74 This formula can be applied to pairwise sequence comparison by assigning
75 scores to pairs of letters (e.g. amino acids), and by replacing N in the
76 formula with N*M, where N and M are the lengths of the two sequences
79 In addition, letting y = KN*exp(-lambda*S), the p-value for finding m
80 distinct segments all with score >= S is given by:
83 1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e
85 Notice that for m=1 this formula reduces to 1-exp(-y), which is the same
86 as the previous formula.
88 *******************************************************************************/
90 double BlastKarlinLHtoK(double* sfp
, int4 min
, int4 max
, double lambda
, double H
);
91 double BlastKarlinLambdaNR(double* sfp
, int4 min
, int4 max
);
92 double BlastKarlinLtoH(double* sfp
, int4 min
, int4 max
, double lambda
);
94 void BlastKarlinBlkCalc(double* scoreProbabilities
, int4 min
, int4 max
)
96 /* Calculate the parameter Lambda */
97 BlastKarlin_lambda
= BlastKarlinLambdaNR(scoreProbabilities
, min
, max
);
100 BlastKarlin_H
= BlastKarlinLtoH(scoreProbabilities
, min
, max
, BlastKarlin_lambda
);
103 BlastKarlin_K
= BlastKarlinLHtoK(scoreProbabilities
, min
, max
, BlastKarlin_lambda
,
107 #define DIMOFP0 (iter*range + 1)
108 #define DIMOFP0_MAX (BLAST_KARLIN_K_ITER_MAX*BLAST_SCORE_RANGE_MAX+1)
109 /****************************************************************************
110 For more accuracy in the calculation of K, set K_SUMLIMIT to 0.00001.
111 For high speed in the calculation of K, use a K_SUMLIMIT of 0.001
112 Note: statistical significance is often not greatly affected by the value
113 of K, so high accuracy is generally unwarranted.
114 *****************************************************************************/
115 /* K_SUMLIMIT_DEFAULT == sumlimit used in BlastKarlinLHtoK() */
116 #define BLAST_KARLIN_K_SUMLIMIT_DEFAULT 0.01
117 #define BLAST_KARLIN_K_ITER_MAX 100
119 SCORE_MIN is (-2**31 + 1)/2 because it has been observed more than once that
120 a compiler did not properly calculate the quantity (-2**31)/2. The list
121 of compilers which failed this operation have at least at some time included:
122 NeXT and a version of AIX/370's MetaWare High C R2.1r.
123 For this reason, SCORE_MIN is not simply defined to be LONG_MIN/2.
125 #define BLAST_SCORE_1MIN (-10000)
126 #define BLAST_SCORE_1MAX ( 1000)
127 #define BLAST_SCORE_RANGE_MAX (BLAST_SCORE_1MAX - BLAST_SCORE_1MIN)
128 /* Initial guess for the value of Lambda in BlastKarlinLambdaNR */
129 #define BLAST_KARLIN_LAMBDA0_DEFAULT 0.5
130 /* LAMBDA_ACCURACY_DEFAULT == accuracy to which Lambda should be calc'd */
131 #define BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT (1.e-5)
132 /* LAMBDA_ITER_DEFAULT == no. of iterations in LambdaBis = ln(accuracy)/ln(2)*/
133 #define BLAST_KARLIN_LAMBDA_ITER_DEFAULT 17
134 #define ABS(a) ((a)>=0?(a):-(a))
136 int gcd(int a
, int b
);
137 int4
BlastScoreChk(int4 lo
, int4 hi
);
139 double BlastKarlinLHtoK(double* sfp
, int4 min
, int4 max
, double lambda
, double H
)
141 //#ifndef BLAST_KARLIN_STACKP
144 // double P0 [DIMOFP0_MAX];
146 int4 low
; /* Lowest score (must be negative) */
147 int4 high
; /* Highest score (must be positive) */
148 double K
; /* local copy of K */
151 int4 range
, lo
, hi
, first
, last
;
153 double Sum
, av
, oldsum
, oldsum2
;
156 double *p
, *ptrP
, *ptr1
, *ptr2
, *ptr1e
;
157 double etolami
, etolam
;
159 if (lambda
<= 0. || H
<= 0.) {
168 etolam
= exp((double)lambda
);
169 if (low
== -1 || high
== 1) {
171 return K
* (1.0 - 1./etolam
);
174 sumlimit
= BLAST_KARLIN_K_SUMLIMIT_DEFAULT
;
176 iter
= BLAST_KARLIN_K_ITER_MAX
;
178 if (DIMOFP0
> DIMOFP0_MAX
) {
182 //#ifndef BLAST_KARLIN_STACKP
183 //P0 = (double *)global_malloc(DIMOFP0 * sizeof(*P0));
184 P0
= (double *)malloc(DIMOFP0
* sizeof(*P0
));
188 // memset((CharPtr)P0, 0, DIMOFP0*sizeof(P0[0]));
194 P0
[0] = sum
= oldsum
= oldsum2
= 1.;
195 for (j
= 0; j
< iter
&& sum
> sumlimit
; Sum
+= sum
/= ++j
) {
196 first
= last
= range
;
199 for (ptrP
= P0
+(hi
-lo
); ptrP
>= P0
; *ptrP
-- =sum
) {
203 for (sum
= 0.; ptr1
>= ptr1e
; )
204 sum
+= *ptr1
-- * *ptr2
++;
207 if (ptrP
- P0
<= range
)
210 etolami
= pow((double)etolam
, lo
- 1);
211 for (sum
= 0., i
= lo
; i
!= 0; ++i
) {
213 sum
+= *++ptrP
* etolami
;
221 /* Terms of geometric progression added for correction */
222 ratio
= oldsum
/ oldsum2
;
223 if (ratio
>= (1.0 - sumlimit
*0.001)) {
228 while (sum
> sumlimit
) {
230 Sum
+= sum
= oldsum
/ ++j
;
233 /* Look for the greatest common divisor ("delta" in Appendix of PNAS 87 of
234 Karlin&Altschul (1990) */
235 for (i
= 1, j
= -low
; i
<= range
&& j
> 1; ++i
)
241 etolami
= pow((double)etolam
, -j
);
242 K
= j
*exp((double)-2.0*Sum
) / (av
*(1.0 - etolami
));
245 K
= -j
*exp((double)-2.0*Sum
) / (av
*(exp(-j
*(double)lambda
) - 1));
248 //#ifndef BLAST_KARLIN_K_STACKP
258 Calculate Lambda using the bisection method (slow).
260 double BlastKarlinLambdaBis(double* sfp
, int4 min
, int4 max
)
263 double lambda
, up
, newval
;
270 if (BlastScoreChk(low
, high
) != 0)
274 up
= BLAST_KARLIN_LAMBDA0_DEFAULT
;
275 for (lambda
=0.; ; ) {
277 x0
= exp((double)up
);
278 x1
= pow((double)x0
, low
- 1);
280 for (sum
=0., i
=low
; i
<=high
; ++i
)
281 sum
+= sprob
[i
] * (x1
*= x0
);
284 for (sum
=0., i
=low
; i
<=high
; ++i
)
285 sum
+= sprob
[i
] * exp(up
* i
);
292 for (j
=0; j
<BLAST_KARLIN_LAMBDA_ITER_DEFAULT
; ++j
) {
293 newval
= (lambda
+ up
) / 2.;
294 x0
= exp((double)newval
);
295 x1
= pow((double)x0
, low
- 1);
297 for (sum
=0., i
=low
; i
<=high
; ++i
)
298 sum
+= sprob
[i
] * (x1
*= x0
);
301 for (sum
=0., i
=low
; i
<=high
; ++i
)
302 sum
+= sprob
[i
] * exp(newval
* i
);
309 return (lambda
+ up
) / 2.;
312 /******************* Fast Lambda Calculation Subroutine ************************
313 Version 1.0 May 16, 1991
314 Program by: Stephen Altschul
316 Uses Newton-Raphson method (fast) to solve for Lambda, given an initial
317 guess (lambda0) obtained perhaps by the bisection method.
318 *******************************************************************************/
319 double BlastKarlinLambdaNR(double* sfp
, int4 min
, int4 max
)
321 int4 low
; /* Lowest score (must be negative) */
322 int4 high
; /* Highest score (must be positive) */
326 double lambda0
, sum
, slope
, temp
, x0
, x1
, amt
;
330 if (BlastScoreChk(low
, high
) != 0)
333 lambda0
= BLAST_KARLIN_LAMBDA0_DEFAULT
;
335 /* Calculate lambda */
337 for (j
=0; j
<20; ++j
) { /* limit of 20 should never be close-approached */
342 x0
= exp((double)lambda0
);
343 x1
= pow((double)x0
, low
- 1);
346 for (i
=low
; i
<=high
; ++i
) {
347 sum
+= (temp
= sprob
[i
] * (x1
*= x0
));
350 lambda0
-= (amt
= sum
/slope
);
351 if (ABS(amt
/lambda0
) < BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT
) {
353 Does it appear that we may be on the verge of converging
354 to the ever-present, zero-valued solution?
356 if (lambda0
> BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT
)
361 return BlastKarlinLambdaBis(sfp
, min
, max
);
368 Calculate H, the relative entropy of the p's and q's
370 double BlastKarlinLtoH(double* sfp
, int4 min
, int4 max
, double lambda
)
373 double av
, etolam
, etolami
;
378 if (BlastScoreChk(min
, max
) != 0)
381 etolam
= exp((double)lambda
);
382 etolami
= pow((double)etolam
, min
- 1);
386 for (score
=min
; score
<=max
; score
++)
387 av
+= sfp
[score
] * score
* (etolami
*= etolam
);
392 for (score
=min
; score
<=max
; score
++)
393 av
+= sfp
[score
] * score
* exp(lambda
* score
);
399 int gcd(int a
, int b
)
415 int4
BlastScoreChk(int4 lo
, int4 hi
)
417 if (lo
>= 0 || hi
<= 0 ||
418 lo
< BLAST_SCORE_1MIN
|| hi
> BLAST_SCORE_1MAX
)
421 if (hi
- lo
> BLAST_SCORE_RANGE_MAX
)
428 * Computes the adjustment to the lengths of the query and database sequences
429 * that is used to compensate for edge effects when computing evalues.
431 * The length adjustment is an integer-valued approximation to the fixed
432 * point of the function
435 * (alpha/lambda) * (log K + log((m - ell)*(n - N ell)))
437 * where m is the query length n is the length of the database and N is the
438 * number of sequences in the database. The values beta, alpha, lambda and
439 * K are statistical, Karlin-Altschul parameters.
441 * The value of the length adjustment computed by this routine, A,
442 * will always be an integer smaller than the fixed point of
443 * f(ell). Usually, it will be the largest such integer. However, the
444 * computed length adjustment, A, will also be so small that
446 * K * (m - A) * (n - N * A) > min(m,n).
448 * Moreover, an iterative method is used to compute A, and under
449 * unusual circumstances the iterative method may not converge.
451 * @param K the statistical parameter K
452 * @param logK the natural logarithm of K
453 * @param alpha_d_lambda the ratio of the statistical parameters
454 * alpha and lambda (for ungapped alignments, the
455 * value 1/H should be used)
456 * @param beta the statistical parameter beta (for ungapped
457 * alignments, beta == 0)
458 * @param query_length the length of the query sequence
459 * @param db_length the length of the database
460 * @param db_num_seq the number of sequences in the database
461 * @param length_adjustment the computed value of the length adjustment [out]
463 * @return 0 if length_adjustment is known to be the largest integer less
464 * than the fixed point of f(ell); 1 otherwise.
467 #define maximum(a,b) ((a > b) ? a : b)
469 int4
BlastComputeLengthAdjustment(double K
,
471 double alpha_d_lambda
,
476 int4
*length_adjustment
)
478 int4 i
; /* iteration index */
479 const int4 maxits
= 20; /* maximum allowed iterations */
480 double m
= query_length
, n
= db_length
, N
= db_num_seqs
;
482 double ell
; /* A float value of the length adjustment */
483 double ss
; /* effective size of the search space */
484 double ell_min
= 0, ell_max
; /* At each iteration i,
485 * ell_min <= ell <= ell_max. */
486 char converged
= 0; /* True if the iteration converged */
487 double ell_next
= 0; /* Value the variable ell takes at iteration
489 /* Choose ell_max to be the largest nonnegative value that satisfies
491 * K * (m - ell) * (n - N * ell) > max(m,n)
493 * Use quadratic formula: 2 c /( - b + sqrt( b*b - 4 * a * c )) */
494 { /* scope of a, mb, and c, the coefficients in the quadratic formula
495 * (the variable mb is -b) */
497 double mb
= m
* N
+ n
;
498 double c
= n
* m
- maximum(m
, n
) / K
;
501 *length_adjustment
= 0;
504 ell_max
= 2 * c
/ (mb
+ sqrt(mb
* mb
- 4 * a
* c
));
506 } /* end scope of a, mb and c */
508 for(i
= 1; i
<= maxits
; i
++) { /* for all iteration indices */
509 double ell_bar
; /* proposed next value of ell */
511 ss
= (m
- ell
) * (n
- N
* ell
);
512 ell_bar
= alpha_d_lambda
* (logK
+ log(ss
)) + beta
;
513 if(ell_bar
>= ell
) { /* ell is no bigger than the true fixed point */
515 if(ell_bar
- ell_min
<= 1.0) {
519 if(ell_min
== ell_max
) { /* There are no more points to check */
522 } else { /* else ell is greater than the true fixed point */
525 if(ell_min
<= ell_bar
&& ell_bar
<= ell_max
) {
526 /* ell_bar is in range. Accept it */
528 } else { /* else ell_bar is not in range. Reject it */
529 ell_next
= (i
== 1) ? ell_max
: (ell_min
+ ell_max
) / 2;
531 } /* end for all iteration indices */
532 if(converged
) { /* the iteration converged */
533 /* If ell_fixed is the (unknown) true fixed point, then we
534 * wish to set (*length_adjustment) to floor(ell_fixed). We
535 * assume that floor(ell_min) = floor(ell_fixed) */
536 *length_adjustment
= (int4
) ell_min
;
537 /* But verify that ceil(ell_min) != floor(ell_fixed) */
539 if( ell
<= ell_max
) {
540 ss
= (m
- ell
) * (n
- N
* ell
);
541 if(alpha_d_lambda
* (logK
+ log(ss
)) + beta
>= ell
) {
542 /* ceil(ell_min) == floor(ell_fixed) */
543 *length_adjustment
= (int4
) ell
;
546 } else { /* else the iteration did not converge. */
547 /* Use the best value seen so far */
548 *length_adjustment
= (int4
) ell_min
;
551 return converged
? 0 : 1;