modified: makefile
[GalaxyCodeBases.git] / BGI / soap_src / soap_builder / HSPstatistic.c
blobfefcac82cd7a0bfccb59641f61830662ce5afb66
1 /* ==========================================================================
3 * HSPstatistic.c
5 * author: Wong Swee Seong
6 * date: 30-03-2006
8 * gapped and ungapped extensions for string matching
10 * There are many references to BLAST statistic, a good development reading
11 * would from the NCBI BLAST pc version, file doc/scoring.pdf
13 * This files contains fragmented codes extracted and modified from FSA-BLAST
14 * by MIcheal Cameron, and as such the following statement is provided.
16 Copyright (c) 2005, Michael Cameron
17 Permission to use this code is freely granted under the BSD license agreement,
18 provided that this statement is retained.
20 Statistical functions related to BLAST (ie. converting nominal scores
21 to normalized scores and to e-values)
22 * ==========================================================================*/
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <math.h>
29 #include <assert.h>
30 #include "karlin.h"
31 #include "HSPstatistic.h"
33 // default cutoff evalue for blastn, used in ungapped extension
34 // as define in NCBI BLAST, file: include/algo/blast/core/blast_parameters.h
35 #define CUTOFF_E_BLASTN 0.05
37 double stat_log2;
38 int4 stat_ungapXdropoff;
39 int4 stat_gapXdropoff;
40 int4 stat_gapXdropoffFinal;
41 double stat_expectationValue = 10;
42 STAT_scoreBlock stat_scoreBlock = {1,-3,5,2};
43 STAT_Xdropoffs stat_XdropoffsInBits = {20,30,50};
45 /* ungap parameters */
46 double stat_ungapK;
47 double stat_ungapLogK;
48 double stat_ungapH;
49 double stat_ungapLambda;
51 /* gap parameters */
52 double stat_gapK;
53 double stat_gapLogK;
54 double stat_gapH;
55 double stat_gapLambda;
57 /* search space parameters */
58 uint8 stat_databaseSize;
59 uint8 stat_numOfSequences;
60 uint8 stat_averageSubjectSize; // average sequence size in database
61 uint8 stat_minimumSubjectSize; // minimum sequence size in database
62 int8 stat_adjustmentLength;
63 uint8 stat_querySize;
64 uint8 stat_numOfContext;
65 uint8 stat_effectiveQuerySize;
66 uint8 stat_effectiveDatabaseSize;
67 uint8 stat_effectiveSearchSpaceSize;
70 //enum karlinParam_ENUM {LAMBDA,parmK,parmH};
71 //int HSPscoreBlock[4] = {1, -3, 5, 2};
72 /* HSPstatBlk -
73 * 1. ungapXdropOff = 10, default 20 in bits
74 * 2. gapXdropOff = 15, default 30 in bits
75 * 3. gapXdropOffFinal = 25, default 50 in bits
76 * 4. ungapCutoffScore = 15, to be calculated
77 * 5. expectationValue = 10
78 * */
79 //float HSPXdropoffInBits[3] = {10, 15, 25};
81 /* Karlin Parameters extracted from NCBI tool-kit,
82 * file: blast_stat.c
83 * Will add more in future
84 * Only storing (lambda,k) pair for each pre-defined score value set
85 * (match,miss), we will the need other parameter if
86 * we want to calculate score for gapped alignment
89 int KarlinParamSelect = 0;
90 const int KarlinParamSet = 3;
91 const int scoreList[][2] = {{1,-3},
92 {1,-4},
93 {1,-2}};
94 const double KarlinParam[][2] = {{1.374, 0.711},
95 {1.383, 0.738},
96 {1.28, 0.46}};
99 /* forward declaration */
100 void stat_calcUngapKarlinParameters(double *dbCharProb, double *queryCharProb, int scoringMatrix[16][16]);
101 double stat_calcLengthAdjust();
104 Initialize the statistical paramters for extension
105 Need only to call once, with the database size and number of sequences in
106 the database.
108 //void initializeHSPstatistic(uint8 databaseSize, uint8 numOfSequences, uint8 minSeqLength, double *dbCharProb,
109 // uint8 querySize, uint8 numOfContext, double *queryCharProb, int scoringMatrix[16][16])
112 // stat_databaseSize = databaseSize;
113 // stat_numOfSequences = numOfSequences;
114 // stat_minimumSubjectSize = minSeqLength;
115 // stat_averageSubjectSize = (uint4)(databaseSize /numOfSequences);
117 // stat_log2 = log(2.0);
119 // // Compute for the Karlin-Atschul parameters: ungap K, H and Lambda
120 // // For DNA, assuming that each character A,T,G,C are equally probable at 0.25
121 // stat_calcUngapKarlinParameters(dbCharProb, queryCharProb, scoringMatrix);
123 // // for blastn, they are the same
124 // stat_gapK = stat_ungapK;
125 // stat_gapH = stat_ungapH;
126 // stat_gapLambda = stat_ungapLambda;
127 // stat_ungapLogK = log(stat_ungapK);
128 // stat_gapLogK = stat_ungapLogK;
130 // stat_ungapXdropoff = (int4)ceil(stat_XdropoffsInBits.ungapXdropoff * stat_log2 /
131 // stat_ungapLambda);
132 // stat_gapXdropoff = (int4)floor(stat_XdropoffsInBits.gapXdropoff * stat_log2 /
133 // stat_gapLambda);
134 // stat_gapXdropoffFinal = (int4)floor(stat_XdropoffsInBits.gapXdropoffFinal *
135 // stat_log2 / stat_gapLambda);
137 // stat_querySize = querySize;
138 // stat_numOfContext = numOfContext;
139 // // compute the adjustment length
140 // // stat_calcLengthAdjust is a down-sized calc for nucleotide only
141 // stat_adjustmentLength = (int4)floor(stat_calcLengthAdjust());
143 // stat_effectiveQuerySize = stat_querySize - stat_adjustmentLength;
144 // stat_effectiveDatabaseSize = stat_databaseSize -
145 // stat_numOfSequences * stat_adjustmentLength;
146 // stat_effectiveSearchSpaceSize = stat_effectiveDatabaseSize * stat_effectiveQuerySize;
149 //void stat_calcUngapKarlinParameters(double *dbCharProb, double *queryCharProb, int scoringMatrix[16][16])
151 // int4 scoreProbSize, highest, lowest;
152 // double * scoreProb;
153 // int i, j;
155 // lowest = stat_scoreBlock.mismatch;
156 // highest = stat_scoreBlock.match;
157 // scoreProbSize = highest-lowest+1;
158 // scoreProb = (double *) calloc(scoreProbSize, sizeof(double));
160 // for (i=0; i<16; i++) {
161 // for (j=0; j<16; j++) {
162 // scoreProb[scoringMatrix[i][j] - lowest] += dbCharProb[i] * queryCharProb[j];
163 // }
164 // }
166 // // calculate the Karlin parameters, refer to karlin.c
167 // BlastKarlinBlkCalc(scoreProb-lowest, lowest, highest);
168 // stat_ungapH = BlastKarlin_H;
169 // stat_ungapK = BlastKarlin_K;
170 // stat_ungapLambda = BlastKarlin_lambda;
172 // if (stat_ungapK <= 0.001)
173 // {
174 // fprintf(stderr,"ERROR: unable to calculate the KARLIN statistic for the given scoring scheme.\n");
175 // exit(-1);
176 // }
178 // free(scoreProb);
183 Codes extracted and modified from statistic.c in FSA BLAST by Michael Cameron
185 // Calculate length adjustment (aka. "effective HSP length") by iteratively
186 // applying equation from "Local Alignment Statistics" Altschul,Gish
187 // using ungapped K & H values
188 double stat_calcLengthAdjust()
190 int4 count;
191 double lengthAdjust;
192 double minimumQueryLength;
194 // Query can not be shorter than 1/K
195 minimumQueryLength = (double)1 / stat_ungapK;
197 count = 0;
198 lengthAdjust = 0;
199 // Calculate length adjustment value
200 while (count < 5)
202 lengthAdjust = (stat_ungapLogK +
203 log(((double)stat_querySize - lengthAdjust) *
204 ((double)stat_databaseSize - (double)stat_numOfSequences *
205 lengthAdjust))) / stat_ungapH;
207 // Length adjust can not shorten query to less than minimum query length
208 if (lengthAdjust > stat_querySize - minimumQueryLength)
210 lengthAdjust = stat_querySize - minimumQueryLength;
213 count++;
216 return lengthAdjust;
219 int4 calcUngapCutoffScore()
221 uint8 minSize = stat_minimumSubjectSize;
222 int4 gappedScore, ungappedScore;
224 if (minSize > stat_querySize)
225 minSize = stat_querySize;
227 ungappedScore = (int4)ceil(
228 (log((double)minSize * (double)stat_numOfContext * ((double)stat_averageSubjectSize / (double)CUTOFF_E_BLASTN))
229 + stat_ungapLogK) / stat_ungapLambda );
230 gappedScore = stat_gapEvalue2nominal(stat_expectationValue);
232 if (gappedScore > ungappedScore)
233 return ungappedScore;
234 else
235 return gappedScore;
238 int4 calcGapCutoffScore()
240 return stat_gapEvalue2nominal(stat_expectationValue);
243 int4 getUngapXdropoff()
245 return stat_ungapXdropoff;
248 int4 getGapXdropoff()
250 return stat_gapXdropoff;
253 int4 getGapXdropoffFinal()
255 return stat_gapXdropoffFinal;
258 void printHSPstatistic(FILE * outFile)
260 int vtmp;
261 fprintf(outFile,"Lambda K H\n");
262 fprintf(outFile,"%10.2f %10.2f %10.2f\n", stat_ungapLambda, stat_ungapK, stat_ungapH);
263 fprintf(outFile,"Gapped\n");
264 fprintf(outFile,"Lambda K H\n");
265 fprintf(outFile,"%10.2f %10.2f %10.2f\n\n", stat_gapLambda, stat_gapK, stat_gapH);
266 fprintf(outFile,"Matrix: blastn matrix: %d %d\n", stat_scoreBlock.match, stat_scoreBlock.mismatch );
267 fprintf(outFile,"Gap Penalties: Existence: %d, Extension: %d\n", stat_scoreBlock.gapOpen, stat_scoreBlock.gapExt);
268 fprintf(outFile,"Length of query: %llu\n", stat_querySize);
269 fprintf(outFile,"Length of database: %llu\n", stat_databaseSize);
270 fprintf(outFile,"Effective HSP length: %llu\n", stat_adjustmentLength);
271 fprintf(outFile,"Effective length of query: %llu\n", stat_effectiveQuerySize);
272 fprintf(outFile,"Effective length of database: %llu\n", stat_effectiveDatabaseSize);
273 fprintf(outFile,"Effective search space: %llu\n",stat_effectiveSearchSpaceSize);
274 fprintf(outFile,"X1: %d (%2.1lf bits)\n",stat_ungapXdropoff,
275 stat_ungapXdropoff * stat_ungapLambda / stat_log2);
276 fprintf(outFile,"X2: %d (%2.1lf bits)\n",stat_gapXdropoff,
277 stat_gapXdropoff * stat_gapLambda / stat_log2);
278 vtmp = calcUngapCutoffScore();
279 fprintf(outFile,"S1: %d (%2.1f bits)\n",vtmp,
280 stat_ungapNominal2normalized(vtmp));
281 vtmp = calcGapCutoffScore();
282 fprintf(outFile,"S2: %d (%2.1f bits)\n",vtmp,
283 stat_gapNominal2normalized(vtmp));
286 /*--------------------------------------------------------------------------
287 * Routines for conversion between evalue and score, as well as from
288 normalise (in bits) to nominal.
291 // Convert a normalized score to a nominal score
292 int4 stat_ungapNormalized2nominal(double normalizedScore)
294 double nominalScore;
296 nominalScore = (normalizedScore * stat_log2 + stat_ungapLogK) / stat_ungapLambda;
298 return (int4)floor(nominalScore);
301 // Convert an ungapped nominal score to a normalized score
302 double stat_ungapNominal2normalized(int4 nominalScore)
304 double normalizedScore;
306 normalizedScore = (((double)nominalScore * stat_ungapLambda) - stat_ungapLogK)
307 / stat_log2;
309 return normalizedScore;
312 // Calculate the evalue for a given ungapped normalizedScore
313 double stat_ungapCalcEvalue(double normalizedScore)
315 return stat_effectiveSearchSpaceSize / pow(2, normalizedScore);
318 // Given an evalue (such as a cutoff) calculate the minimum ungapped nominal score needed to attain it
319 int4 stat_ungapEvalue2nominal(double evalue)
321 double normalizedScore;
323 normalizedScore = log(stat_effectiveSearchSpaceSize / evalue) / (double)stat_log2;
324 return (int4)ceil((stat_log2 * normalizedScore + stat_ungapLogK)
325 / stat_ungapLambda);
328 // Convert a gapped nominal score to a normalized score
329 double stat_gapNominal2normalized(int4 nominalScore)
331 double normalizedScore;
333 normalizedScore = (((double)nominalScore * stat_gapLambda)
334 - stat_gapLogK) / stat_log2;
336 return normalizedScore;
339 // Given a normalized score calculate the lowest gapped nominal score needed to achieve it
340 int4 stat_gapNormalized2nominal(double normalizedScore)
342 return (int4)ceil(((stat_log2 * normalizedScore + stat_gapLogK)
343 / stat_gapLambda) - 0.5);
345 // Calculate the evalue for a given gapped normalizedScore
346 double stat_gapCalcEvalue(double normalizedScore)
348 return stat_effectiveSearchSpaceSize / pow(2, normalizedScore);
351 // Given an evalue (such as a cutoff) calculate the minimum gapped nominal score needed to attain it
352 int4 stat_gapEvalue2nominal(double evalue)
354 return (int4)ceil(log(stat_gapK * stat_effectiveSearchSpaceSize / evalue) / stat_gapLambda);
358 //---------------------------------------------------------------------------