modified: nfig1.py
[GalaxyCodeBases.git] / BGI / BASE / src / 2bwt / HSP.c
blob9c379e6bc1d4ebb64ff86a20b6fbfd087e307092
1 /*
3 HSP.c BWTBlastn functions
5 This module contains miscellaneous BWTBlastn functions.
7 Copyright (C) 2004, Wong Chi Kwong.
9 This program is free software; you can redistribute it and/or
10 modify it under the terms of the GNU General Public License
11 as published by the Free Software Foundation; either version 2
12 of the License, or (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <math.h>
29 #include <stdint.h>
30 #include <emmintrin.h>
31 #include <mmintrin.h>
32 #include "TextConverter.h"
33 #include "MiscUtilities.h"
34 #include "Socket.h"
35 #include "r250.h"
36 #include "HSP.h"
37 #include "HSPstatistic.h"
40 extern double stat_expectationValue;
42 //unsigned int UChromosomeStartPos(HSP * hsp, unsigned chrNo) {
43 // unsigned int correction = hsp->ambiguity[hsp->seqOffset[chrNo].firstAmbiguityIndex-1].rightOfEndPos -
44 // hsp->ambiguity[hsp->seqOffset[chrNo].firstAmbiguityIndex-1].startPos;
45 // return hsp->seqOffset[chrNo].startPos+correction;
46 //}
47 //unsigned int UChromosomeEndPos(HSP * hsp, unsigned chrNo) {
48 // unsigned int correction = hsp->ambiguity[hsp->seqOffset[chrNo].lastAmbiguityIndex].rightOfEndPos -
49 // hsp->ambiguity[hsp->seqOffset[chrNo].lastAmbiguityIndex].startPos;
50 // return hsp->seqOffset[chrNo].endPos+correction;
51 //}
53 //void ComputeChromosomeGrid2(HSP * hsp,unsigned int gridEntries,unsigned short * ambiguityMap, Translate * translate) {
54 // //unsigned int gridEntries = (hsp->dnaLength/GRID_SAMPLING_FACTOR)+1;
55 // //hsp->ambiguityMap = malloc(sizeof(unsigned short)*gridEntries);
56 // //hsp->translate = malloc(sizeof(Translate)*(hsp->numOfAmbiguity+hsp->numOfSeq));
57 //
58 // unsigned int i=0;
59 // unsigned int acc = 0;
60 // for (i=0;i<gridEntries;i++) {
61 // ambiguityMap[i]=0;
62 // }
63 // unsigned int j=0,k=0;
64 // i=0;
65 // unsigned int lastChrAmbAccumulate=0;
66 // while(j<hsp->numOfAmbiguity && k<hsp->numOfSeq) {
67 // if (hsp->ambiguity[j].startPos<hsp->seqOffset[k].startPos) {
68 // //Write Ambiguous Segment Flag
69 // unsigned int index = hsp->ambiguity[j].startPos;
70 // ambiguityMap[index/GRID_SAMPLING_FACTOR]+=1;
71 // translate[i].startPos=hsp->ambiguity[j].startPos;
72 // translate[i].chrID=k;
73 // //chrAmbAccumulate+=(hsp->ambiguity[j].rightOfEndPos - hsp->ambiguity[j].startPos);
74 // translate[i].correction=UChromosomeStartPos(hsp,k-1)-(hsp->ambiguity[j].rightOfEndPos - hsp->ambiguity[j].startPos)-1;
75 // //printf("%u:AMB%u\ti=%u\tSAMPLE_i=%u\tCORRECT=%u\tCHR=%u\n",i,j,index,index/GRID_SAMPLING_FACTOR,duplicateAmb[i].correction,k-1);
76 // j++;
77 // i++;
78 // } else if (hsp->ambiguity[j].startPos>hsp->seqOffset[k].startPos) {
79 // //Write Artificial Chromosome Start Flag
80 // unsigned int index = hsp->seqOffset[k].startPos;
81 // ambiguityMap[index/GRID_SAMPLING_FACTOR]+=1;
82 // translate[i].startPos=hsp->seqOffset[k].startPos;
83 // translate[i].chrID=k+1;
84 // translate[i].correction=hsp->seqOffset[k].startPos-1;
85 // //printf("%u:CHR%u\ti=%u\tSAMPLE_i=%u\tAccCORRECT=-%u\tCHR=%u\n",i,k,index,index/GRID_SAMPLING_FACTOR,hsp->seqOffset[k].startPos,k);
86 // //chrAmbAccumulate=0;
87 // k++;
88 // i++;
89 // } else {
90 // //chrAmbAccumulate=0;
91 // k++;
92 // }
93 // }
94 // while (j<hsp->numOfAmbiguity) {
95 // unsigned int index = hsp->ambiguity[j].startPos;
96 // ambiguityMap[index/GRID_SAMPLING_FACTOR]+=1;
97 // translate[i].startPos=hsp->ambiguity[j].startPos;
98 // translate[i].chrID=k;
99 // translate[i].correction=UChromosomeStartPos(hsp,k-1)-(hsp->ambiguity[j].rightOfEndPos - hsp->ambiguity[j].startPos)-1;
100 // //chrAmbAccumulate+=hsp->ambiguity[j].rightOfEndPos - hsp->ambiguity[j].startPos;
101 // //duplicateAmb[i].correction=hsp->seqOffset[k-1].startPos-chrAmbAccumulate;
102 // //printf("%u:AMB%u\ti=%u\tSAMPLE_i=%u\tCORRECT=%u\tCHR=%u\n",i,j,index,index/GRID_SAMPLING_FACTOR,duplicateAmb[i].correction,k-1);
103 // j++;i++;
104 // }
105 // while (k<hsp->numOfSeq) {
106 // unsigned int index = hsp->seqOffset[k].startPos;
107 // ambiguityMap[index/GRID_SAMPLING_FACTOR]+=1;
108 // translate[i].startPos=hsp->seqOffset[k].startPos;
109 // translate[i].chrID=k+1;
110 // translate[i].correction=hsp->seqOffset[k].startPos-1;
111 // //printf("%u:CHR%u\ti=%u\tSAMPLE_i=%u\tAccCORRECT=-%u\tCHR=%u\n",i,k,index,index/GRID_SAMPLING_FACTOR,hsp->seqOffset[k].startPos,k);
112 // //mapGridToChromID[i]=chrNo;
113 // k++;i++;
114 // }
116 // for (i=1;i<gridEntries;i++) {
117 // ambiguityMap[i]+=ambiguityMap[i-1];
118 // }
119 // for (i=0;i<gridEntries;i++) {
120 // ambiguityMap[i]--;
121 // }
124 /*unsigned int DetermineChromosome(HSP * hsp, unsigned int tp) {
125 unsigned int numOfSeq = hsp->numOfSeq;
126 int l=0;
127 int r=hsp->numOfSeq;
128 int m=0;
129 int answer = 0;
130 while (l+1<r) {
131 m = (l+r)/2;
132 unsigned int startPos = hsp->seqOffset[m].startPos;
133 if (tp<startPos) {
134 r=m-1;
135 } else if (tp>startPos) {
136 l=m;
137 } else {
138 answer=m;
139 break;
142 if (answer==0) {answer=l;}
143 return answer;
145 /*unsigned int ComputeRelativeChromosomePosition(HSP * hsp, unsigned int utp,unsigned chrNo) {
146 //unsigned int prevAmb = hsp->seqOffset[chrNo].firstAmbiguityIndex-1;
147 //unsigned int chrUnambiguousStartPos = hsp->seqOffset[chrNo].startPos + (hsp->ambiguity[prevAmb].rightOfEndPos-hsp->ambiguity[prevAmb].startPos);
148 //printf("Chromosome unambiguous start position = %u\n",chrUnambiguousStartPos);
149 return utp - UChromosomeStartPos(hsp,chrNo);
151 /*unsigned int DetermineChromosomeRank(HSP * hsp, unsigned short *chromosomeSampleTable, unsigned int chromosomeSampleFactor, unsigned int tp) {
152 unsigned int approxIndex = (int)(tp/chromosomeSampleFactor);
153 unsigned int approxValue = chromosomeSampleTable[approxIndex];
154 if (hsp->seqOffset[approxValue].startPos>tp) return approxValue-1;
155 return approxValue;
157 unsigned int UDetermineChromosomeRank(HSP * hsp,unsigned short *uChromosomeSampleTable,unsigned int chromosomeSampleFactor, unsigned int utp) {
158 unsigned int approxIndex = (int)(utp/chromosomeSampleFactor);
159 approxIndex=uChromosomeSampleTable[approxIndex];
160 if (UChromosomeStartPos(hsp,approxIndex)>utp) return approxIndex-1;
161 return approxIndex;
163 /*unsigned int TranslateUnambiguousTextPosition(HSP * hsp, unsigned int tp,unsigned int chrNo) {
164 int l=hsp->seqOffset[chrNo].firstAmbiguityIndex-1;
165 int r=hsp->seqOffset[chrNo].lastAmbiguityIndex+1;
166 //printf("Within [%u,x]..[%u,x]\n",hsp->ambiguity[l].startPos,hsp->ambiguity[r].startPos);
167 int m=0;
168 int answer = 0;
169 while (l+1<r) {
170 m = (l+r)/2;
171 unsigned int startPos = hsp->ambiguity[m].startPos;
172 if (tp<startPos) {
173 r=m;
174 } else if (tp>startPos) {
175 l=m;
176 } else {
177 answer=m;
178 break;
181 if (answer==0) {answer=l;}
182 //printf("Ambigunity record %u is found = [%u,%u]\n",answer,hsp->ambiguity[answer].startPos,hsp->ambiguity[answer].rightOfEndPos);
184 //printf("text position = %u\n",tp);
185 //printf("text position translated = %u\n",tp-(hsp->ambiguity[answer].startPos)+(hsp->ambiguity[answer].rightOfEndPos));
186 return tp-(hsp->ambiguity[answer].startPos)+(hsp->ambiguity[answer].rightOfEndPos);
190 void HSPFillCharMap(unsigned char charMap[255]) {
192 int i;
194 for (i=0; i<255; i++) {
195 charMap[i] = nonMatchDnaCharIndex;
197 for (i=0; i<DNA_CHAR_SIZE; i++) {
198 charMap[dnaChar[i]] = (unsigned char)i;
199 charMap[dnaChar[i] - 'A' + 'a'] = (unsigned char)i;
204 void HSPFillComplementMap(unsigned char complementMap[255]) {
206 int i;
208 for (i=0; i<255; i++) {
209 complementMap[i] = dnaChar[nonMatchDnaCharIndex];
211 for (i=0; i<DNA_CHAR_SIZE; i++) {
212 complementMap[dnaComplement[i]] = dnaChar[i];
213 complementMap[dnaComplement[i] - 'A' + 'a'] = dnaChar[i] - 'A' + 'a';
218 void HSPFillScoringMatrix(int scoringMatrix[DNA_CHAR_SIZE][DNA_CHAR_SIZE], const int matchScore, const int mismatchScore, const int leftShift) {
220 int i, j, k, l;
222 for (i=0; i<DNA_CHAR_SIZE; i++) {
223 for (j=0; j<DNA_CHAR_SIZE; j++) {
224 if (ambiguityCount[i] > 0 && ambiguityCount[j] > 0) {
225 scoringMatrix[i][j] = 0;
226 for (k=0; k<ambiguityCount[i]; k++) {
227 for (l=0; l<ambiguityCount[j]; l++) {
228 if (ambiguityMatch[i][k] == ambiguityMatch[j][l]) {
229 // match
230 scoringMatrix[i][j] += matchScore;
231 } else {
232 // mismatch
233 scoringMatrix[i][j] += mismatchScore;
237 if (scoringMatrix[i][j] > 0) {
238 scoringMatrix[i][j] = (scoringMatrix[i][j] + ambiguityCount[i] * ambiguityCount[j] - 1) / (ambiguityCount[i] * ambiguityCount[j]);
239 } else if (scoringMatrix[i][j] < 0) {
240 scoringMatrix[i][j] = -((-scoringMatrix[i][j] + ambiguityCount[i] * ambiguityCount[j] - 1) / (ambiguityCount[i] * ambiguityCount[j]));
241 } else {
242 scoringMatrix[i][j] = 0;
244 } else {
245 // Character meant to match nothing
246 scoringMatrix[i][j] = mismatchScore;
248 scoringMatrix[i][j] = scoringMatrix[i][j] * (1 << leftShift);
255 HSP *HSPLoad(MMPool *mmPool, const char *PackedDNAFileName, const char *AnnotationFileName, const char *AmbiguityFileName, const char * TranslateFileName, const unsigned int trailerBufferInWord) {
257 HSP *hsp;
258 unsigned long long dnaLength;
259 unsigned int randomSeed;
260 int i;
261 char c;
262 unsigned char charMap[255];
264 FILE *annotationFile = NULL, *ambiguityFile = NULL, *translateFile = NULL;
266 hsp = (HSP*) MMPoolDispatch(mmPool, sizeof(HSP));
268 // Load packed DNA
269 if (PackedDNAFileName != NULL && PackedDNAFileName[0] != '\0' && PackedDNAFileName[0] != '-') {
270 hsp->packedDNA = (unsigned int *) DNALoadPacked(PackedDNAFileName, &hsp->dnaLength, TRUE, trailerBufferInWord);
271 } else {
272 hsp->packedDNA = NULL;
273 hsp->dnaLength = 0;
276 // Load annotation
277 if (AnnotationFileName != NULL && AnnotationFileName[0] != '\0' && AnnotationFileName[0] != '-') {
279 annotationFile = (FILE*)fopen64(AnnotationFileName, "r");
280 if (annotationFile == NULL) {
281 fprintf(stderr, "Cannot open annotation file!\n");
282 exit(1);
285 fscanf(annotationFile, "%llu %d %u\n", &dnaLength, &hsp->numOfSeq, &randomSeed);
286 if (hsp->dnaLength != 0 && dnaLength != hsp->dnaLength) {
287 fprintf(stderr, "Annotation database length not match!\n");
288 exit(1);
290 hsp->dnaLength = dnaLength;
291 if (hsp->numOfSeq == 0) {
292 fprintf(stderr, "Annotation number of sequence = 0!\n");
293 exit(1);
295 hsp->annotation = (Annotation*) MMUnitAllocate((hsp->numOfSeq+1) * sizeof(Annotation));
296 hsp->seqOffset = (SeqOffset*) MMUnitAllocate((hsp->numOfSeq+1) * sizeof(SeqOffset));
298 i = 0;
299 hsp->minSeqLength = UINT32_MAX;
300 while (!feof(annotationFile) && i < hsp->numOfSeq) {
301 fscanf(annotationFile, "%u ", &hsp->annotation[i].gi);
302 fgets(hsp->annotation[i].text, MAX_SEQ_NAME_LENGTH, annotationFile);
303 fscanf(annotationFile, "%llu %llu %d\n", &hsp->seqOffset[i].startPos, &hsp->seqOffset[i].endPos, &hsp->seqOffset[i+1].firstAmbiguityIndex);
304 hsp->seqOffset[i].lastAmbiguityIndex = hsp->seqOffset[i+1].firstAmbiguityIndex;
305 if (hsp->seqOffset[i].endPos < hsp->minSeqLength) {
306 hsp->minSeqLength = hsp->seqOffset[i].endPos;
308 hsp->seqOffset[i].endPos = hsp->seqOffset[i].startPos + hsp->seqOffset[i].endPos - 1; // length of sequence is stored
309 i++;
311 if (i < hsp->numOfSeq) {
312 fprintf(stderr, "Annotation missing entries!\n");
313 exit(1);
315 fclose(annotationFile);
317 hsp->annotation[i].gi = 0;
318 hsp->annotation[i].text[0] = '\0';
320 hsp->seqOffset[i].startPos = UINT32_MAX;
321 hsp->seqOffset[i].endPos = UINT32_MAX;
322 hsp->seqOffset[0].firstAmbiguityIndex = 1; // ambiguity[0] and ambiguity[numOfAmbiguity+1] are dummy
323 for (i=1; i<=hsp->numOfSeq; i++) {
324 hsp->seqOffset[i].firstAmbiguityIndex += hsp->seqOffset[i-1].firstAmbiguityIndex; // number of ambiguity is stored
326 // hsp->seqOffset[hsp->numOfSeq].firstAmbiguityIndex = total number of ambiguity + 1 now
327 hsp->seqOffset[hsp->numOfSeq].lastAmbiguityIndex = hsp->seqOffset[hsp->numOfSeq].firstAmbiguityIndex - 1;
328 // hsp->seqOffset[hsp->numOfSeq].lastAmbiguityIndex = total number of ambiguity now
329 for (i=hsp->numOfSeq; i>1; i--) {
330 hsp->seqOffset[i-1].lastAmbiguityIndex = hsp->seqOffset[i].lastAmbiguityIndex - hsp->seqOffset[i-1].lastAmbiguityIndex; // number of ambiguity is stored
332 for (i=0; i<hsp->numOfSeq; i++) {
333 hsp->seqOffset[i].lastAmbiguityIndex = hsp->seqOffset[i+1].lastAmbiguityIndex;
336 } else {
338 // Make up a dummy annotation and a dummy sequence offset
339 hsp->annotation = (Annotation*) MMUnitAllocate((1+1) * sizeof(Annotation));
340 hsp->seqOffset = (SeqOffset*) MMUnitAllocate((1+1) * sizeof(SeqOffset));
342 hsp->numOfSeq = 1;
343 hsp->numOfAmbiguity = 0;
345 hsp->annotation[0].gi = 0;
346 hsp->annotation[0].text[0] = '\0';
347 hsp->annotation[1].gi = 0;
348 hsp->annotation[1].text[0] = '\0';
349 hsp->seqOffset[0].startPos = 0;
350 hsp->seqOffset[0].endPos = hsp->dnaLength - 1;
351 hsp->seqOffset[0].firstAmbiguityIndex = 1;
352 hsp->seqOffset[0].lastAmbiguityIndex = 0;
353 hsp->seqOffset[1].startPos = UINT32_MAX;
354 hsp->seqOffset[1].endPos = UINT32_MAX;
355 hsp->seqOffset[1].firstAmbiguityIndex = UINT32_MAX; // should not be referred
356 hsp->seqOffset[1].lastAmbiguityIndex = UINT32_MAX; // should not be referred
360 // Load ambigity
361 if (AmbiguityFileName != NULL && AmbiguityFileName[0] != '\0' && AmbiguityFileName[0] != '-') {
363 // Load ambigity
364 ambiguityFile = (FILE*)fopen64(AmbiguityFileName, "r");
365 if (ambiguityFile == NULL) {
366 fprintf(stderr, "Cannot open ambiguity file!\n");
367 exit(1);
370 fscanf(ambiguityFile, "%llu %d %d\n", &dnaLength, &i, &hsp->numOfAmbiguity);
371 if (hsp->dnaLength != 0 && dnaLength != hsp->dnaLength) {
372 fprintf(stderr, "Ambiguity database length not match!\n");
373 exit(1);
375 hsp->dnaLength = dnaLength;
376 if (i != hsp->numOfSeq) {
377 fprintf(stderr, "Ambiguity database number of sequence not match!\n");
378 exit(1);
380 if (hsp->numOfAmbiguity != hsp->seqOffset[hsp->numOfSeq].firstAmbiguityIndex - 1) {
381 fprintf(stderr, "Ambiguity number of ambiguity not match!\n");
382 exit(1);
385 HSPFillCharMap(charMap);
387 hsp->ambiguity = (Ambiguity*) MMUnitAllocate((hsp->numOfAmbiguity + 2) * sizeof(Ambiguity));
388 hsp->ambiguity[0].startPos = 0;
389 hsp->ambiguity[0].rightOfEndPos = 0;
390 hsp->ambiguity[0].symbol = 0;
391 i = 1;
392 while (!feof(ambiguityFile) && i-1 < hsp->numOfAmbiguity) {
393 fscanf(ambiguityFile, "%llu %llu %c\n", &hsp->ambiguity[i].startPos, &hsp->ambiguity[i].rightOfEndPos, &c);
394 hsp->ambiguity[i].rightOfEndPos = hsp->ambiguity[i].startPos + hsp->ambiguity[i].rightOfEndPos; // number of character is stored
395 hsp->ambiguity[i].symbol = charMap[c];
396 i++;
398 hsp->ambiguity[i].startPos = UINT32_MAX;
399 hsp->ambiguity[i].rightOfEndPos = UINT32_MAX;
400 hsp->ambiguity[i].symbol = 0;
402 if (i-1 < hsp->numOfAmbiguity) {
403 fprintf(stderr, "Ambiguity missing entries!\n");
404 exit(1);
406 fclose(ambiguityFile);
408 } else {
410 if (hsp->numOfAmbiguity > 0) {
411 fprintf(stderr, "Ambiguity file missing!\n");
412 exit(1);
415 hsp->ambiguity = (Ambiguity*) MMUnitAllocate((0 + 2) * sizeof(Ambiguity));
416 hsp->ambiguity[0].startPos = 0;
417 hsp->ambiguity[0].rightOfEndPos = 0;
418 hsp->ambiguity[0].symbol = 0;
419 hsp->ambiguity[1].startPos = UINT32_MAX;
420 hsp->ambiguity[1].rightOfEndPos = UINT32_MAX;
421 hsp->ambiguity[1].symbol = 0;
425 // Load translate
426 if (TranslateFileName != NULL && TranslateFileName[0] != '\0' && TranslateFileName[0] != '-') {
428 // Load translate, ambiguity map
429 translateFile = (FILE*)fopen64(TranslateFileName, "r");
430 if (translateFile == NULL) {
431 fprintf(stderr, "Cannot open translate file!\n");
432 exit(1);
434 unsigned int gridEntries;
435 unsigned int removedSegmentCount;
436 fscanf(translateFile, "%llu %d %u %u\n", &dnaLength, &i, &removedSegmentCount, &gridEntries);
437 if (hsp->dnaLength != 0 && dnaLength != hsp->dnaLength) {
438 fprintf(stderr, "Translate database length not match!\n");
439 exit(1);
441 hsp->dnaLength = dnaLength;
442 if (i != hsp->numOfSeq) {
443 fprintf(stderr, "Translate database number of sequence not match!\n");
444 exit(1);
446 /*if (hsp->numOfAmbiguity != hsp->seqOffset[hsp->numOfSeq].firstAmbiguityIndex - 1) {
447 fprintf(stderr, "Translate number of ambiguity not match!\n");
448 exit(1);
450 hsp->numOfRemovedSegment = removedSegmentCount;
451 hsp->numOfGridEntry = gridEntries;
452 hsp->seqActualOffset = (SeqActualOffset*) MMUnitAllocate((hsp->numOfSeq+1) * sizeof(SeqActualOffset));
453 hsp->ambiguityMap = (unsigned short*) MMUnitAllocate((gridEntries) * sizeof(unsigned short));//MMUnitAllocate((gridEntries) * sizeof(unsigned short));
454 hsp->translate = (Translate*) MMUnitAllocate((hsp->numOfSeq+removedSegmentCount) * sizeof(Translate));//MMUnitAllocate((hsp->numOfSeq+hsp->numOfAmbiguity) * sizeof(Translate));
456 unsigned long long j=0;
457 while (!feof(translateFile) && j<gridEntries) {
458 unsigned short tmp;
459 fscanf(translateFile, "%hu\n", &tmp);
460 hsp->ambiguityMap[j]=(unsigned short)tmp;j++;
462 if (j < gridEntries) {
463 fprintf(stderr, "Translate missing entries!\n");
464 exit(1);
467 j=0;
468 while (!feof(translateFile) && j<hsp->numOfSeq+removedSegmentCount) {
469 fscanf(translateFile, "%llu %d %llu\n", &hsp->translate[j].startPos, &hsp->translate[j].chrID, &hsp->translate[j].correction);
470 j++;
472 if (j < hsp->numOfSeq+removedSegmentCount) {
473 fprintf(stderr, "Translate missing entries!\n");
474 exit(1);
477 j=0;
478 while (!feof(translateFile) && j<hsp->numOfSeq) {
479 fscanf(translateFile, "%llu %llu\n", &hsp->seqActualOffset[j].startPos, &hsp->seqActualOffset[j].endPos);
480 hsp->seqActualOffset[j].endPos = hsp->seqActualOffset[j].startPos + hsp->seqActualOffset[j].endPos;
481 j++;
483 // Backward compatibility
484 if (j < hsp->numOfSeq) {
485 fprintf(stdout, "[Translation] Missing actual chromosome lengths.\n");
486 fprintf(stdout, "Re-build Packed indexes for more accurate chromosome length in output.\n");
488 //Translation file is missing, initialise the value with inaccurate chromosome length
489 j=0;
490 while (j<hsp->numOfSeq) {
491 hsp->seqActualOffset[j].startPos = hsp->seqOffset[j].startPos;
492 hsp->seqActualOffset[j].endPos = hsp->seqOffset[j].endPos;
493 j++;
496 fclose(translateFile);
498 } else {
500 if (hsp->numOfAmbiguity > 0 || hsp->numOfSeq > 0) {
501 fprintf(stderr, "Translate file missing!\n");
502 exit(1);
505 hsp->numOfRemovedSegment = 0;
506 unsigned int gridEntries = (hsp->dnaLength/GRID_SAMPLING_FACTOR)+1;
507 hsp->numOfGridEntry = gridEntries;
508 hsp->seqActualOffset = (SeqActualOffset*) MMUnitAllocate((hsp->numOfSeq+1) * sizeof(SeqActualOffset));
509 hsp->ambiguityMap = (unsigned short*) MMUnitAllocate((gridEntries) * sizeof(unsigned short));
510 hsp->translate = (Translate*) MMUnitAllocate((hsp->numOfSeq) * sizeof(Translate));//MMUnitAllocate((hsp->numOfSeq+hsp->numOfAmbiguity) * sizeof(Translate));
511 unsigned int j=0;
512 while (!feof(translateFile) && j<gridEntries) {
513 hsp->ambiguityMap[j]=0;
515 hsp->translate[0].startPos=0;
516 hsp->translate[0].chrID=0;
517 hsp->translate[0].correction=0;
519 j=0;
520 while (j<hsp->numOfSeq) {
521 hsp->seqActualOffset[j].startPos = hsp->seqOffset[j].startPos;
522 hsp->seqActualOffset[j].endPos = hsp->seqOffset[j].endPos;
523 j++;
526 return hsp;
530 HSP *HSPConvertFromText(MMPool *mmPool, const unsigned char *text, const unsigned long long textLength,
531 const unsigned int FASTARandomSeed, const int maskLowerCase,
532 const int gi, const char *seqName) {
534 HSP *hsp;
535 Ambiguity *tempAmbiguity;
536 int ambiguityAllocated = 256;
538 char c;
539 unsigned long long i;
540 int numAmbiguity;
541 unsigned long long lastAmbiguityPos;
542 unsigned long long numCharInBuffer, numOfConversion;
543 unsigned int sequenceRandomSeed;
544 unsigned char charMap[255];
546 unsigned char buffer[PACKED_BUFFER_SIZE];
548 HSPFillCharMap(charMap);
550 hsp = (HSP*) MMPoolDispatch(mmPool, sizeof(HSP));
552 hsp->dnaLength = textLength;
553 hsp->minSeqLength = textLength;
554 hsp->numOfSeq = 1;
556 hsp->annotation = (Annotation*) MMPoolDispatch(mmPool, (1+1) * sizeof(Annotation));
557 hsp->annotation[0].gi = gi;
558 strncpy(hsp->annotation[0].text, seqName, MAX_SEQ_NAME_LENGTH);
559 hsp->annotation[1].gi = 0;
560 hsp->annotation[1].text[0] = '\0';
562 hsp->seqOffset = (SeqOffset*) MMPoolDispatch(mmPool, (1+1) * sizeof(SeqOffset));
563 hsp->seqOffset[0].startPos = 0;
564 hsp->seqOffset[0].endPos = textLength - 1;
565 hsp->seqOffset[0].firstAmbiguityIndex = 1;
566 hsp->seqOffset[0].lastAmbiguityIndex = 0;
567 hsp->seqOffset[1].startPos = UINT32_MAX;
568 hsp->seqOffset[1].endPos = UINT32_MAX;
569 hsp->seqOffset[1].firstAmbiguityIndex = UINT32_MAX; // should not be referred
570 hsp->seqOffset[1].lastAmbiguityIndex = UINT32_MAX; // should not be referred
572 hsp->packedDNA = (unsigned int*) MMPoolDispatch(mmPool, (textLength / CHAR_PER_WORD + 1) * sizeof(int));
574 tempAmbiguity = (Ambiguity*) MMUnitAllocate(ambiguityAllocated * sizeof(Ambiguity));
576 HSPFillCharMap(charMap);
578 // Set random seed for the sequence
579 sequenceRandomSeed = FASTARandomSeed;
580 if (gi > 0) {
581 sequenceRandomSeed += gi;
582 } else {
583 for (i=0; i<(int)strlen(seqName); i++) {
584 c = seqName[i];
585 sequenceRandomSeed += c;
588 r250_init(sequenceRandomSeed);
590 numAmbiguity = -1;
591 numCharInBuffer = 0;
592 numOfConversion = 0;
593 lastAmbiguityPos = (unsigned int)-2;
595 for (i=0; i<textLength; i++) {
597 c = text[i];
599 if (maskLowerCase && c >= 'a' && c <= 'z') {
600 c = dnaChar[lowercaseDnaCharIndex];
602 if (ambiguityCount[charMap[c]] == 1) {
603 buffer[numCharInBuffer] = c;
604 } else {
605 c = charMap[c];
606 if (ambiguityCount[c] > 0) {
607 buffer[numCharInBuffer] = dnaChar[ambiguityMatch[c][r250() % ambiguityCount[c]]];
608 } else {
609 buffer[numCharInBuffer] = dnaChar[ambiguityMatch[c][r250() % ALPHABET_SIZE]];
611 if (i == lastAmbiguityPos + 1 && c == (char)tempAmbiguity[numAmbiguity].symbol) {
612 tempAmbiguity[numAmbiguity].rightOfEndPos++;
613 } else {
614 numAmbiguity++;
615 if (numAmbiguity >= ambiguityAllocated) {
616 tempAmbiguity = (Ambiguity*) MMUnitReallocate(tempAmbiguity, sizeof(Ambiguity) * ambiguityAllocated * 2, sizeof(Ambiguity) * ambiguityAllocated);
617 ambiguityAllocated *= 2;
619 tempAmbiguity[numAmbiguity].startPos = i;
620 tempAmbiguity[numAmbiguity].rightOfEndPos = i+1;
621 tempAmbiguity[numAmbiguity].symbol = c;
623 lastAmbiguityPos = i;
625 numCharInBuffer++;
626 if (numCharInBuffer >= PACKED_BUFFER_SIZE) {
627 ConvertTextToWordPacked(buffer, hsp->packedDNA + numOfConversion * PACKED_BUFFER_SIZE / CHAR_PER_WORD, charMap, 4, PACKED_BUFFER_SIZE);
628 numCharInBuffer = 0;
629 numOfConversion++;
632 if (numCharInBuffer > 0) {
633 ConvertTextToWordPacked(buffer, hsp->packedDNA + numOfConversion * PACKED_BUFFER_SIZE / CHAR_PER_WORD, charMap, 4, numCharInBuffer);
636 numAmbiguity++;
637 hsp->numOfAmbiguity = numAmbiguity;
638 hsp->ambiguity = (Ambiguity*) MMPoolDispatch(mmPool, (hsp->numOfAmbiguity + 2) * sizeof(Ambiguity));
639 if (hsp->numOfAmbiguity > 0) {
640 memcpy(hsp->ambiguity + 1, tempAmbiguity, hsp->numOfAmbiguity * sizeof(Ambiguity));
642 hsp->ambiguity[0].startPos = 0;
643 hsp->ambiguity[0].rightOfEndPos = 0;
644 hsp->ambiguity[0].symbol = 0;
645 hsp->ambiguity[hsp->numOfAmbiguity+1].startPos = UINT32_MAX;
646 hsp->ambiguity[hsp->numOfAmbiguity+1].rightOfEndPos = UINT32_MAX;
647 hsp->ambiguity[hsp->numOfAmbiguity+1].symbol = 0;
649 hsp->seqOffset[0].firstAmbiguityIndex = 1;
650 hsp->seqOffset[0].lastAmbiguityIndex = numAmbiguity;
652 MMUnitFree(tempAmbiguity, sizeof(Ambiguity) * ambiguityAllocated);
654 return hsp;
658 void HSPFree(MMPool *mmPool, HSP *hsp, const unsigned int trailerBufferInWord) {
660 if (hsp->packedDNA != NULL) {
661 DNAFreePacked(hsp->packedDNA, hsp->dnaLength, trailerBufferInWord);
663 MMUnitFree(hsp->seqOffset, (hsp->numOfSeq+1) * sizeof(SeqOffset));
664 MMUnitFree(hsp->annotation, (hsp->numOfSeq+1) * sizeof(Annotation));
665 MMUnitFree(hsp->ambiguity, (hsp->numOfAmbiguity+2) * sizeof(Ambiguity));
666 MMUnitFree(hsp->seqActualOffset, (hsp->numOfSeq+1) * sizeof(SeqActualOffset));
667 MMUnitFree(hsp->ambiguityMap, (hsp->numOfGridEntry) * sizeof(unsigned short));
668 MMUnitFree(hsp->translate, (hsp->numOfSeq+hsp->numOfRemovedSegment) * sizeof(Translate));
670 MMPoolReturn(mmPool, hsp, sizeof(hsp));
674 unsigned int HSPParseFASTAToPacked(const char* FASTAFileName, const char* annotationFileName, const char* packedDNAFileName, const char* ambiguityFileName, const char * translateFileName,
675 const unsigned int FASTARandomSeed, const int maskLowerCase) {
677 FILE *FASTAFile, *annotationFile, *packedDNAFile, *ambiguityFile, * translateFile;
678 Annotation *annotation;
679 SeqOffset *seqOffset;
680 SeqActualOffset *seqActualOffset;
681 Ambiguity *ambiguity;
682 int annotationAllocated = 256;
683 int ambiguityAllocated = 256;
684 unsigned long long totalDiscardedChar = 0;
686 char c;
687 int i;
688 int numAmbiguity;
689 unsigned long long lastAmbiguityPos;
690 unsigned long long numChar, numCharInBuffer, totalNumChar, totalActualNumChar;
691 unsigned int sequenceRandomSeed;
692 int numSeq;
693 unsigned char charMap[255];
695 unsigned char buffer[PACKED_BUFFER_SIZE];
696 unsigned char packedBuffer[PACKED_BUFFER_SIZE / CHAR_PER_BYTE];
698 FASTAFile = (FILE*)fopen64(FASTAFileName, "r");
699 if (FASTAFile == NULL) {
700 fprintf(stderr, "ParseFASTToPacked() : Cannot open FASTAFileName!\n");
701 exit(1);
704 annotationFile = (FILE*)fopen64(annotationFileName, "w");
705 if (annotationFile == NULL) {
706 fprintf(stderr, "ParseFASTToPacked() : Cannot open annotationFileName!\n");
707 exit(1);
710 packedDNAFile = (FILE*)fopen64(packedDNAFileName, "wb");
711 if (packedDNAFile == NULL) {
712 fprintf(stderr, "ParseFASTToPacked() : Cannot open packedDNAFileName!\n");
713 exit(1);
716 ambiguityFile = (FILE*)fopen64(ambiguityFileName, "w");
717 if (ambiguityFile == NULL) {
718 fprintf(stderr, "ParseFASTToPacked() : Cannot open ambiguityFileName!\n");
719 exit(1);
722 translateFile = (FILE*)fopen64(translateFileName, "w");
723 if (translateFile == NULL) {
724 fprintf(stderr, "ParseFASTToPacked() : Cannot open translateFileName!\n");
725 exit(1);
728 HSPFillCharMap(charMap);
730 c = (char)getc(FASTAFile);
731 if (c != '>') {
732 fprintf(stderr, "ParseFASTToPacked() : FASTA file does not begin with '>'!\n");
733 exit(1);
736 totalNumChar = 0;
737 totalActualNumChar = 0;
738 numSeq = 0;
739 numAmbiguity = -1;
740 numCharInBuffer = 0;
742 annotation = (Annotation*) MMUnitAllocate(sizeof(Annotation) * annotationAllocated);
743 seqOffset = (SeqOffset*) MMUnitAllocate(sizeof(SeqOffset) * annotationAllocated);
744 ambiguity = (Ambiguity*) MMUnitAllocate(sizeof(Ambiguity) * ambiguityAllocated);
745 seqActualOffset = (SeqActualOffset*) MMUnitAllocate(sizeof(SeqActualOffset) * annotationAllocated);
747 while (!feof(FASTAFile)) {
749 numChar = 0;
750 if (numSeq >= annotationAllocated) {
751 annotation = (Annotation*) MMUnitReallocate(annotation, sizeof(Annotation) * annotationAllocated * 2, sizeof(Annotation) * annotationAllocated);
752 seqOffset = (SeqOffset*) MMUnitReallocate(seqOffset, sizeof(SeqOffset) * annotationAllocated * 2, sizeof(SeqOffset) * annotationAllocated);
753 seqActualOffset = (SeqActualOffset*) MMUnitReallocate(seqActualOffset, sizeof(SeqActualOffset) * annotationAllocated * 2, sizeof(SeqActualOffset) * annotationAllocated);
754 annotationAllocated *= 2;
757 annotation[numSeq].gi = 0;
759 c = (char)getc(FASTAFile);
760 while (!feof(FASTAFile) && c != '\n') {
761 if (numChar < MAX_SEQ_NAME_LENGTH) {
762 annotation[numSeq].text[numChar] = c;
763 numChar++;
765 if (numChar == 3 && annotation[numSeq].text[0] == 'g'
766 && annotation[numSeq].text[1] == 'i'
767 && annotation[numSeq].text[2] == '|') {
768 fscanf(FASTAFile, "%u|", &(annotation[numSeq].gi));
769 numChar = 0;
771 c = (char)getc(FASTAFile);
773 annotation[numSeq].text[numChar] = '\0';
775 // Set random seed for the sequence
776 sequenceRandomSeed = FASTARandomSeed;
777 if (annotation[numSeq].gi > 0) {
778 sequenceRandomSeed += annotation[numSeq].gi;
779 } else {
780 for (i=0; i<(int)numChar; i++) {
781 c = annotation[numSeq].text[i];
782 sequenceRandomSeed += c;
785 r250_init(sequenceRandomSeed);
787 seqOffset[numSeq].startPos = totalNumChar;
788 seqActualOffset[numSeq].startPos = totalActualNumChar;
789 numChar = 0;
790 lastAmbiguityPos = (unsigned int)-2;
792 c = (char)getc(FASTAFile);
793 unsigned long long nCount=0;
794 while (!feof(FASTAFile) && c != '>') {
795 // Get sequence
796 if (c != '\n' && c != '\t') {
797 if (maskLowerCase && c >= 'a' && c <= 'z') {
798 c = dnaChar[lowercaseDnaCharIndex];
801 if (charMap[c] != nonMatchDnaCharIndex) {
802 if (ambiguityCount[charMap[c]] == 1) {
803 buffer[numCharInBuffer] = c;
804 } else {
805 //Char != A C G T...can be anything else on Earth!!
806 c = (char)getc(FASTAFile);
807 unsigned long long nCount=1;
808 while (!feof(FASTAFile) && c != '>') {
809 if (charMap[c] != nonMatchDnaCharIndex) {
810 if (ambiguityCount[charMap[c]] != 1) {
811 nCount++;
812 } else {
813 break;
816 c = (char)getc(FASTAFile);
818 if (nCount<10) {
819 int k;
820 for (k=0;k<nCount;k++) {
821 buffer[numCharInBuffer] = 'G'; //Substitute rest of the 'N' with 'G'
822 numCharInBuffer++;
823 if (numCharInBuffer >= PACKED_BUFFER_SIZE) {
824 ConvertTextToBytePacked(buffer, packedBuffer, charMap, ALPHABET_SIZE, PACKED_BUFFER_SIZE);
825 fwrite(packedBuffer, 1, PACKED_BUFFER_SIZE / CHAR_PER_BYTE, packedDNAFile);
826 numCharInBuffer = 0;
828 numChar++;
830 } else {
831 totalDiscardedChar+=nCount;
833 numAmbiguity++;
834 if (numAmbiguity >= ambiguityAllocated) {
835 ambiguity = (Ambiguity*) MMUnitReallocate(ambiguity, sizeof(Ambiguity) * ambiguityAllocated * 2, sizeof(Ambiguity) * ambiguityAllocated);
836 ambiguityAllocated *= 2;
838 ambiguity[numAmbiguity].startPos = totalNumChar + numChar;
839 ambiguity[numAmbiguity].rightOfEndPos = totalNumChar + numChar + totalDiscardedChar - 1;
840 ambiguity[numAmbiguity].symbol = 0;
842 continue;
844 numCharInBuffer++;
845 if (numCharInBuffer >= PACKED_BUFFER_SIZE) {
846 ConvertTextToBytePacked(buffer, packedBuffer, charMap, ALPHABET_SIZE, PACKED_BUFFER_SIZE);
847 fwrite(packedBuffer, 1, PACKED_BUFFER_SIZE / CHAR_PER_BYTE, packedDNAFile);
848 numCharInBuffer = 0;
850 numChar++;
853 c = (char)getc(FASTAFile);
855 seqOffset[numSeq].endPos = totalNumChar + numChar - 1;
856 seqOffset[numSeq].firstAmbiguityIndex = numAmbiguity + 1;
857 seqActualOffset[numSeq].endPos = totalNumChar + totalDiscardedChar + numChar - 1;
858 totalNumChar += numChar;
859 totalActualNumChar = totalNumChar + totalDiscardedChar;
860 numSeq++;
863 fclose(FASTAFile);
866 // Finalize packed DNA file
868 numAmbiguity++;
870 if (numCharInBuffer > 0) {
871 ConvertTextToBytePacked(buffer, packedBuffer, charMap, ALPHABET_SIZE, numCharInBuffer);
872 fwrite(packedBuffer, 1, (numCharInBuffer + CHAR_PER_BYTE-1) / CHAR_PER_BYTE, packedDNAFile);
873 numCharInBuffer = 0;
875 if (totalNumChar % CHAR_PER_BYTE == 0) {
876 c = 0;
877 fwrite(&c, 1, 1, packedDNAFile);
879 c = (char)(totalNumChar % CHAR_PER_BYTE);
880 fwrite(&c, 1, 1, packedDNAFile);
882 fclose(packedDNAFile);
884 //Compute Translation Map
885 unsigned int gridEntries = (totalNumChar/GRID_SAMPLING_FACTOR)+1;
886 unsigned short *ambiguityMap = (unsigned short*)malloc(sizeof(unsigned short)*gridEntries);
887 Translate * translate = (Translate*) malloc(sizeof(Translate)*(numAmbiguity+numSeq));
888 unsigned int j=0,k=0;
889 for (j=0;j<gridEntries;j++) {
890 ambiguityMap[j]=0;
892 j=0;i=0;
893 while(j<numAmbiguity && k<numSeq) {
894 if (ambiguity[j].startPos<seqOffset[k].startPos) {
895 //Write Ambiguous Segment Flag
896 unsigned long long index = ambiguity[j].startPos;
897 ambiguityMap[index/GRID_SAMPLING_FACTOR]+=1;
898 translate[i].startPos=ambiguity[j].startPos;
899 translate[i].chrID=k;
901 if (k-1>0) {
902 unsigned long long correction = ambiguity[seqOffset[k-2].firstAmbiguityIndex-1].rightOfEndPos -
903 ambiguity[seqOffset[k-2].firstAmbiguityIndex-1].startPos + 1;
904 translate[i].correction=(seqOffset[k-1].startPos)+
905 correction-
906 (ambiguity[j].rightOfEndPos - ambiguity[j].startPos +1) - 1;
907 } else {
908 translate[i].correction=-ambiguity[j].rightOfEndPos + ambiguity[j].startPos-2;
910 j++;
911 i++;
912 } else if (ambiguity[j].startPos>seqOffset[k].startPos) {
913 //Write Artificial Chromosome Start Flag
914 unsigned long long index = seqOffset[k].startPos;
915 ambiguityMap[index/GRID_SAMPLING_FACTOR]+=1;
916 translate[i].startPos=seqOffset[k].startPos;
917 translate[i].chrID=k+1;
918 translate[i].correction=seqOffset[k].startPos-1;
919 k++;
920 i++;
921 } else {
922 k++;
925 while (j<numAmbiguity) {
926 unsigned long long index = ambiguity[j].startPos;
927 ambiguityMap[index/GRID_SAMPLING_FACTOR]+=1;
928 translate[i].startPos=ambiguity[j].startPos;
929 translate[i].chrID=k;
931 if (k-1>0) {
932 unsigned long long correction = ambiguity[seqOffset[k-2].firstAmbiguityIndex-1].rightOfEndPos -
933 ambiguity[seqOffset[k-2].firstAmbiguityIndex-1].startPos + 1;
934 translate[i].correction=(seqOffset[k-1].startPos)+
935 correction-
936 (ambiguity[j].rightOfEndPos - ambiguity[j].startPos +1) - 1;
937 } else {
938 translate[i].correction=-ambiguity[j].rightOfEndPos + ambiguity[j].startPos-2;
940 //printf("%u\tAMB%u\tSP:%u\tCHR:%u\tCORRECT:%u\n",i,j,translate[i].startPos,translate[i].chrID,translate[i].correction);
941 j++;i++;
943 while (k<numSeq) {
944 unsigned long long index = seqOffset[k].startPos;
945 ambiguityMap[index/GRID_SAMPLING_FACTOR]+=1;
946 translate[i].startPos=seqOffset[k].startPos;
947 translate[i].chrID=k+1;
948 translate[i].correction=seqOffset[k].startPos-1;
949 //printf("%u\tCHR%u\tSP:%u\tCHR:%u\tCORRECT:%u\n",i,k,translate[i].startPos,translate[i].chrID,translate[i].correction);
950 k++;i++;
952 while (i<numAmbiguity+numSeq) {
953 if (i>0) {
954 translate[i].startPos=translate[i-1].startPos;
955 translate[i].chrID=translate[i-1].chrID;
956 translate[i].correction=translate[i-1].correction;
957 } i++;
960 for (j=1;j<gridEntries;j++) {
961 ambiguityMap[j]+=ambiguityMap[j-1];
963 for (j=0;j<gridEntries;j++) {
964 ambiguityMap[j]--;
967 // Output annotation file
968 fprintf(annotationFile, "%llu %u %u\n", totalNumChar, numSeq, FASTARandomSeed);
969 for (i=0; i<numSeq; i++) {
970 fprintf(annotationFile, "%u %s\n", annotation[i].gi, annotation[i].text);
971 fprintf(annotationFile, "%llu %llu 0\n", seqOffset[i].startPos, seqOffset[i].endPos - seqOffset[i].startPos + 1);
972 // output number of ambiguity
973 /*if (i > 0) {
974 fprintf(annotationFile, "%u\n", seqOffset[i].firstAmbiguityIndex - seqOffset[i-1].firstAmbiguityIndex);
975 } else {
976 fprintf(annotationFile, "%u\n", seqOffset[i].firstAmbiguityIndex);
979 fclose(annotationFile);
981 MMUnitFree(annotation, sizeof(Annotation) * annotationAllocated);
982 MMUnitFree(seqOffset, sizeof(SeqOffset) * annotationAllocated);
984 // Output ambiguity file
985 fprintf(ambiguityFile, "%llu %u %u\n", totalNumChar, numSeq, 0);//numAmbiguity);
986 //for (i=0; i<numAmbiguity; i++) {
987 // The ambiguity for the dummy length is visible
988 //fprintf(ambiguityFile, "%u %u %c\n", ambiguity[i].startPos, ambiguity[i].rightOfEndPos - ambiguity[i].startPos + 1, dnaChar[ambiguity[i].symbol]);
990 fclose(ambiguityFile);
992 MMUnitFree(ambiguity, ambiguityAllocated * sizeof(Ambiguity));
994 fprintf(translateFile, "%llu %u %u %u\n", totalNumChar, numSeq, numAmbiguity, gridEntries);
995 for (j=0; j<gridEntries; j++) {
996 // The ambiguity for the dummy length is visible
997 fprintf(translateFile, "%u\n", ambiguityMap[j]);
999 for (j=0; j<numAmbiguity+numSeq; j++) {
1000 // The ambiguity for the dummy length is visible
1001 fprintf(translateFile, "%llu %u %llu\n", translate[j].startPos,translate[j].chrID, translate[j].correction );
1003 for (j=0; j<numSeq; j++) {
1004 fprintf(translateFile, "%llu %llu\n", seqActualOffset[j].startPos, seqActualOffset[j].endPos - seqActualOffset[j].startPos + 1);
1006 fclose(translateFile);
1008 free(translate);
1009 free(ambiguityMap);
1010 MMUnitFree(seqActualOffset, sizeof(SeqActualOffset) * annotationAllocated);
1012 return numSeq;
1016 unsigned int HSPPackedToFASTA(const char* FASTAFileName, const char* annotationFileName, const char* packedDNAFileName, const char* ambiguityFileName, const char* translateFileName) {
1018 HSP *hsp;
1019 FILE *FASTAFile;
1021 hsp = HSPLoad(NULL, packedDNAFileName, annotationFileName, ambiguityFileName,translateFileName, 1);
1023 // Generate FASTA from packed
1024 FASTAFile = (FILE*)fopen64(FASTAFileName, "w");
1025 if (FASTAFile == NULL) {
1026 fprintf(stderr, "Cannot open FASTA file!\n");
1027 exit(1);
1030 // to be done...
1031 fprintf(stderr, "HSPPackedToFASTA(): Function not complete!\n");
1033 fclose(FASTAFile);
1035 HSPFree(NULL, hsp, 1);
1037 return 0;
1041 // The hit must be an mismatch hit anchored on the left most of the pattern
1043 unsigned int HSPShortPattern1GapRightExt(const unsigned LONG *packedDNA, const unsigned char *pattern,
1044 const unsigned int patternLength, const unsigned int textLength,
1045 HitList* __restrict hitList, const unsigned int numHit,
1046 const unsigned int maxSpaceInGap, const unsigned int *maxMismatch) {
1048 unsigned int anchorLength;
1049 unsigned int maxSpaceInGap;
1050 unsigned int maxMismatch[MAX_SP_SPACE_IN_GAP];
1052 unsigned LONG anchorText, anchorPattern;
1053 unsigned LONG nonAnchorText, nonAnchorPattern;
1054 unsigned LONG nonAnchorTextReversed, nonAnchorPatternReversed;
1055 unsigned int pos, shift, index;
1056 unsigned maxRightShift;
1057 unsigned int anchorNumError;
1058 unsigned int nonAnchorError[MAX_SP_MISMATCH];
1059 unsigned int leftShiftError[MAX_SP_MISMATCH];
1060 unsigned int rightShiftError[MAX_SP_MISMATCH];
1062 __m128i t;
1063 __m128i m0, m1, m3, m0f, m00ff, m0000ffff;
1064 __m128i ts, tn, p, mb, mb1, mp, mneg, mb31, md;
1066 unsigned LONG ALIGN_16 temp[2];
1067 unsigned int numExtHit;
1068 const unsigned char *thisPattern;
1071 const static int bitPos[32] = { 0, 16, 14, 1, 30, 7, 12, 17, 15, 11, 10, 23, 28, 24, 2, 4,
1072 31, 29, 22, 27, 26, 25, 8, 19, 13, 6, 9, 3, 21, 18, 5, 20 };
1074 unsigned int i, h;
1076 if (patternLength >= MAX_SP_NON_ANCHOR_LENGTH) {
1077 anchorLength = patternLength - MAX_SP_NON_ANCHOR_LENGTH;
1078 } else {
1079 anchorLength = 0;
1082 // Determine maximum no. of spaces and maximum mismatch corresponding to the no. of spaces
1083 if (maxPenalty >= gapOpenScore) {
1084 maxSpaceInGap = (maxPenalty - gapOpenScore) / gapExtendScore;
1085 } else {
1086 maxSpaceInGap = 0;
1089 for (i=0; i<MAX_SP_SPACE_IN_GAP; i++) {
1090 maxMismatch[i] = 0;
1092 for (i=0; i<maxSpaceInGap; i++) {
1093 maxMismatch[i] = (maxPenalty - gapOpenScore - gapOpenScore * i) / (matchScore - mismatchScore);
1096 numExtHit = 0;
1098 for (h=0; h<numHit; h++) {
1100 thisPattern = pattern + patternLength * hitList[h].info;
1101 if (hitList[h].posText + patternLength <= textLength) {
1102 maxRightShift = textLength - hitList[h].posText - patternLength;
1103 if (maxRightShift > MAX_SP_SPACE_IN_GAP) {
1104 maxRightShift = MAX_SP_SPACE_IN_GAP;
1106 } else {
1107 continue;
1110 anchorNumError = 0;
1112 anchorPattern = 0;
1113 for (i=0; i<anchorLength; i++) {
1114 anchorPattern <<= 2;
1115 anchorPattern |= thisPattern[i];
1118 nonAnchorPattern = 0;
1119 for (; i<patternLength; i++) {
1120 nonAnchorPattern <<= 2;
1121 nonAnchorPattern |= thisPattern[i];
1124 nonAnchorPatternReversed = 0; // reverse pattern
1125 for (i=patternLength; --i;) { // from patternLength - 1 to 0
1126 nonAnchorPatternReversed <<= 2;
1127 nonAnchorPatternReversed |= thisPattern[i];
1130 shift = hitList[h].posText % CHAR_PER_64;
1131 pos = hitList[h].posText / CHAR_PER_64;
1132 if (shift) {
1133 anchorText = (packedDNA[pos] << shift) + (packedDNA[pos + 1] >> (64 - shift));
1134 } else {
1135 anchorText = packedDNA[pos / CHAR_PER_64];
1138 shift = (hitList[h].posText + anchorLength - MAX_SP_SPACE_IN_GAP) % CHAR_PER_64;
1139 pos = (hitList[h].posText + anchorLength - MAX_SP_SPACE_IN_GAP) / CHAR_PER_64;
1140 if (shift) {
1141 nonAnchorText = (packedDNA[pos] << shift) + (packedDNA[pos + 1] >> (64 - shift));
1142 } else {
1143 nonAnchorText = packedDNA[pos / CHAR_PER_64];
1146 // Reverse nonAnchorText and nonAnchorPattern
1147 temp[0] = nonAnchorText;
1148 temp[1] = nonAnchorPattern;
1149 t = _mm_load_si128((__m128i*)temp);
1150 m3 = _mm_set1_epi32(0x33333333);
1151 m0f = _mm_set1_epi32(0x0f0f0f0f);
1152 m00ff = _mm_set1_epi32(0x00ff00ff);
1153 m0000ffff = _mm_set1_epi32(0x0000ffff);
1155 ts = _mm_srli_epi64(t, 2);
1156 ts = _mm_and_si128(ts, m3);
1157 tn = _mm_and_si128(t, m3);
1158 tn = _mm_srli_epi64(tn, 2);
1159 t = _mm_or_si128(ts, tn);
1161 ts = _mm_srli_epi64(t, 4);
1162 ts = _mm_and_si128(ts, m0f);
1163 tn = _mm_and_si128(t, m0f);
1164 tn = _mm_srli_epi64(tn, 4);
1165 t = _mm_or_si128(ts, tn);
1167 ts = _mm_srli_epi64(t, 8);
1168 ts = _mm_and_si128(ts, m00ff);
1169 tn = _mm_and_si128(t, m00ff);
1170 tn = _mm_srli_epi64(tn, 8);
1171 t = _mm_or_si128(ts, tn);
1173 ts = _mm_srli_epi64(t, 16);
1174 ts = _mm_and_si128(ts, m0000ffff);
1175 tn = _mm_and_si128(t, m0000ffff);
1176 tn = _mm_srli_epi64(tn, 16);
1177 t = _mm_or_si128(ts, tn);
1179 ts = _mm_srli_epi64(t, 32);
1180 tn = _mm_srli_epi64(tn, 32);
1181 t = _mm_or_si128(ts, tn);
1183 _mm_store_si128((__m128i*)temp, t);
1184 nonAnchorTextReversed = temp[0];
1185 nonAnchorPatternReversed = temp[1];
1187 m1 = _mm_set1_epi32(0xFFFFFFFF);
1188 m0 = _mm_setzero_si128();
1190 // Find the mismatches for anchor and non-anchor no-shift
1191 temp[0] = anchorText;
1192 temp[1] = nonAnchorTextReversed;
1193 t = _mm_load_si128((__m128i*)temp);
1194 temp[0] = anchorPattern;
1195 temp[1] = nonAnchorPatternReversed;
1196 p = _mm_load_si128((__m128i*)temp);
1197 mb = _mm_and_si128(t, p);
1198 mb1 = _mm_srli_epi64(mb, 1);
1199 mb = _mm_and_si128(mb, mb1);
1200 mb = _mm_andnot_si128(mb, m1);
1202 mneg = _mm_sub_epi64(m0, mb);
1203 mb = _mm_and_si128(mb, mneg);
1205 mb31 = _mm_srli_epi64(mb, 31);
1206 mb = _mm_and_si128(mb, mb31);
1208 md = _mm_set1_epi32(0x077CB531);
1209 mp = _mm_mul_epu32(mb, md);
1210 mp = _mm_srli_epi32(mp, 27);
1212 index = _mm_extract_epi16(mp, 0);
1213 anchorNumError += (index > 0);
1215 index = _mm_extract_epi16(mp, 4);
1216 nonAnchorError[0] = bitPos[index];
1218 // Continue with error 2 and 3
1221 // Find the mismatches for non-anchor 1-shift
1223 // Find the mismatches for non-anchor 2-shift
1225 // Find the mismatches for non-anchor 3-shift
1230 // Find mismatches in non-anchor non-shift comparison
1232 // Find mismatches in non-anchor left-shift comparison
1234 // Find mismatches in non-anchor right-shift comparison
1236 // Check spaces and mismatches
1240 // reverse the text and the pattern
1242 // v = (v64 >> 31) | v64
1245 // bitPos[((v & -v) * 0x077CB531UL) >> 27];
1252 unsigned int HSPUngappedExtension(const unsigned int *packedDNA, const unsigned int *packedKey, const unsigned int *packedMask, const unsigned int queryPatternLength,
1253 const unsigned int subPattenLength, const unsigned long long textLength,
1254 HitListWithPosQuery* __restrict hitList, const unsigned long long numberOfHit,
1255 const HSPUngappedExtLookupTable *ungappedLookupTable,
1256 const int cutoffScore, const int dropoffScore) {
1258 static const unsigned int oddBitMask = 0x55555555;
1259 static const unsigned int evenBitMask = 0xAAAAAAAA;
1261 unsigned long long hitProcessed, numberOfUngappedHit;
1262 unsigned long long textShift, queryShift;
1263 unsigned long long posText, posQuery;
1264 unsigned long long queryWordPos, textWordPos;
1265 unsigned int matchVector32, matchVector16;
1266 int numberOfValidChar;
1267 unsigned long long currentPos;
1268 int currentScore, forwardMaxScore, backwardMaxScore;
1269 unsigned long long forwardMaxScorePos, backwardMaxScorePos;
1271 hitProcessed = 0;
1272 numberOfUngappedHit = 0;
1274 while (hitProcessed < numberOfHit) {
1276 // Move position to mid-point of hit
1277 posText = hitList[hitProcessed].posText + subPattenLength / 2;
1278 posQuery = hitList[hitProcessed].posQuery + subPattenLength / 2;
1280 textShift = (posText % CHAR_PER_WORD) * BIT_PER_CHAR;
1281 queryShift = (posQuery % CHAR_PER_WORD) * BIT_PER_CHAR;
1283 // Forward extend
1285 textWordPos = posText / CHAR_PER_WORD;
1286 queryWordPos = posQuery / CHAR_PER_WORD;
1287 numberOfValidChar = min(queryPatternLength - posQuery, textLength - posText);
1289 forwardMaxScore = 0;
1290 forwardMaxScorePos = posQuery;
1291 currentPos = posQuery;
1292 currentScore = 0;
1294 while (forwardMaxScore <= currentScore + dropoffScore && numberOfValidChar > 0) {
1296 matchVector32 = ((packedKey[queryWordPos] << queryShift) | ((packedKey[queryWordPos+1] >> (BITS_IN_WORD - queryShift)) * (queryShift > 0)))
1297 ^ ((packedDNA[textWordPos] << textShift) | ((packedDNA[textWordPos+1] >> (BITS_IN_WORD - textShift)) * (textShift > 0)));
1299 matchVector32 |= (packedMask[queryWordPos] << queryShift) | ((packedMask[queryWordPos+1] >> (BITS_IN_WORD - queryShift)) * (queryShift > 0));
1301 if (numberOfValidChar < CHAR_PER_WORD) {
1302 // mask the invalid character as mismatch
1303 matchVector32 |= ALL_ONE_MASK >> (numberOfValidChar * BIT_PER_CHAR);
1306 // Process vector to even bits for forward extend
1307 matchVector32 = (matchVector32 | (matchVector32 << 1)) & evenBitMask;
1308 matchVector16 = (ungappedLookupTable[matchVector32 >> 16].matchMismatchBitVector << 8) |
1309 ungappedLookupTable[matchVector32 & 0xFFFF].matchMismatchBitVector;
1311 if (currentScore + ungappedLookupTable[matchVector16].maxScore > forwardMaxScore) {
1312 forwardMaxScore = currentScore + ungappedLookupTable[matchVector16].maxScore;
1313 forwardMaxScorePos = currentPos + ungappedLookupTable[matchVector16].maxScorePos;
1315 currentScore += ungappedLookupTable[matchVector16].finalScore;
1316 currentPos += CHAR_PER_WORD;
1318 textWordPos++;
1319 queryWordPos++;
1320 numberOfValidChar -= CHAR_PER_WORD;
1325 // Backward extend by word
1326 textWordPos = posText / CHAR_PER_WORD;
1327 queryWordPos = posQuery / CHAR_PER_WORD;
1328 numberOfValidChar = min(posQuery, posText);
1330 backwardMaxScore = 0;
1331 backwardMaxScorePos = posQuery;
1332 currentPos = posQuery;
1333 currentScore = 0;
1335 while (backwardMaxScore <= currentScore + dropoffScore && numberOfValidChar > 0) {
1337 if (numberOfValidChar >= CHAR_PER_WORD) {
1338 matchVector32 = ((packedKey[queryWordPos-1] << queryShift) | ((packedKey[queryWordPos] >> (BITS_IN_WORD - queryShift)) * (queryShift > 0)))
1339 ^ ((packedDNA[textWordPos-1] << textShift) | ((packedDNA[textWordPos] >> (BITS_IN_WORD - textShift)) * (textShift > 0)));
1340 matchVector32 |= (packedMask[queryWordPos-1] << queryShift) | ((packedMask[queryWordPos] >> (BITS_IN_WORD - queryShift)) * (queryShift > 0));
1341 } else {
1342 if (textWordPos > 0) {
1343 matchVector32 = ((packedKey[queryWordPos] >> (BITS_IN_WORD - queryShift)) * (queryShift > 0))
1344 ^ ((packedDNA[textWordPos-1] << textShift) | ((packedDNA[textWordPos] >> (BITS_IN_WORD - textShift)) * (textShift > 0)));
1345 matchVector32 |= (packedMask[queryWordPos] >> (BITS_IN_WORD - queryShift)) * (queryShift > 0);
1346 } else if (queryWordPos > 0) {
1347 matchVector32 = ((packedKey[queryWordPos-1] << queryShift) | ((packedKey[queryWordPos] >> (BITS_IN_WORD - queryShift)) * (queryShift > 0)))
1348 ^ ((packedDNA[textWordPos] >> (BITS_IN_WORD - textShift)) * (textShift > 0));
1349 matchVector32 |= (packedMask[queryWordPos-1] << queryShift) | ((packedMask[queryWordPos] >> (BITS_IN_WORD - queryShift)) * (queryShift > 0));
1350 } else {
1351 matchVector32 = ((packedKey[queryWordPos] >> (BITS_IN_WORD - queryShift)) * (queryShift > 0))
1352 ^ ((packedDNA[textWordPos] >> (BITS_IN_WORD - textShift)) * (textShift > 0));
1353 matchVector32 |= (packedMask[queryWordPos] >> (BITS_IN_WORD - queryShift)) * (queryShift > 0);
1355 matchVector32 |= ALL_ONE_MASK << (numberOfValidChar * BIT_PER_CHAR);
1358 // Process vector to odd bits for backward extend
1359 matchVector32 = (matchVector32 | (matchVector32 >> 1)) & oddBitMask;
1360 matchVector16 = ungappedLookupTable[matchVector32 >> 16].matchMismatchBitVector |
1361 (ungappedLookupTable[matchVector32 & 0xFFFF].matchMismatchBitVector << 8); // matchVecter16 is reverse of packed odd bits
1363 if (currentScore + ungappedLookupTable[matchVector16].maxScore > backwardMaxScore) {
1364 backwardMaxScore = currentScore + ungappedLookupTable[matchVector16].maxScore;
1365 backwardMaxScorePos = currentPos - ungappedLookupTable[matchVector16].maxScorePos;
1367 currentScore += ungappedLookupTable[matchVector16].finalScore;
1368 currentPos -= CHAR_PER_WORD;
1370 textWordPos--;
1371 queryWordPos--;
1372 numberOfValidChar -= CHAR_PER_WORD;
1376 // Check cut off score
1377 if (backwardMaxScore + forwardMaxScore >= cutoffScore) {
1378 hitList[numberOfUngappedHit].posQuery = (forwardMaxScorePos + backwardMaxScorePos) / 2;
1379 hitList[numberOfUngappedHit].posText = posText - posQuery + hitList[numberOfUngappedHit].posQuery;
1380 numberOfUngappedHit++;
1383 hitProcessed++;
1387 return numberOfUngappedHit;
1391 int HSPGappedExtension(const unsigned int *packedDNA, const unsigned long long textLength, const unsigned char *convertedKey, const int queryPatternLength,
1392 const HitListWithPosQuery* hitList, const int numberOfHit,
1393 GappedHitList* __restrict gappedHitList,
1394 MMPool *mmPool,
1395 const int matchScore, const int mismatchScore,
1396 const int gapOpenScore, const int gapExtendScore,
1397 const int cutoffScore, const int dropoffScore) {
1399 int* __restrict B;
1400 int M, BLast;
1401 int* __restrict I; // insert and delete wrt query
1402 int D; // insert and delete wrt query
1403 int scoringMatrix[DNA_CHAR_SIZE][DNA_CHAR_SIZE];
1405 int hitProcessed, numberOfGappedHit;
1407 int i;
1408 unsigned int c = 0;
1409 char textChar;
1410 int queryOffset;
1411 unsigned long long textOffset;
1412 unsigned long long textDecodePos;
1413 int charToProcess;
1415 int lastStartPos, startPos, nextStartPos;
1416 int endPos, nextEndPos;
1417 unsigned long long textPos;
1419 int maxLengthOfGap;
1420 int currentPos;
1421 int forwardMaxScore, backwardMaxScore;
1422 unsigned long long forwardMaxScorePosQuery, forwardMaxScorePosText;
1423 unsigned long long backwardMaxScorePosQuery, backwardMaxScorePosText;
1425 int dpCellAllocated;
1427 dpCellAllocated = min(queryPatternLength + 1, MAX_ALIGNMENT_LENGTH);
1429 // allocate working memory
1430 B = (int*) MMPoolDispatch(mmPool, dpCellAllocated * sizeof(int));
1431 I = (int*) MMPoolDispatch(mmPool, dpCellAllocated * sizeof(int));
1433 HSPFillScoringMatrix(scoringMatrix, matchScore, mismatchScore, 0);
1435 hitProcessed = 0;
1436 numberOfGappedHit = 0;
1438 maxLengthOfGap = (dropoffScore + gapOpenScore) / (-gapExtendScore);
1440 while (hitProcessed < numberOfHit) {
1442 queryOffset = hitList[hitProcessed].posQuery;
1443 textOffset = hitList[hitProcessed].posText;
1445 // Forward extend
1447 textDecodePos = textOffset;
1448 if (textDecodePos % CHAR_PER_WORD > 0) {
1449 // decode text to the next word boundary
1450 charToProcess = CHAR_PER_WORD - (textDecodePos % CHAR_PER_WORD);
1451 c = packedDNA[textDecodePos / CHAR_PER_WORD] << (BITS_IN_WORD - charToProcess * BIT_PER_CHAR);
1452 textDecodePos += charToProcess;
1455 // Fill initial scores
1456 B[0] = 0;
1457 I[0] = gapExtendScore + gapOpenScore;
1458 for (i=1; i<=maxLengthOfGap; i++) {
1459 B[i] = i * gapExtendScore + gapOpenScore;
1460 I[i] = B[i] + gapExtendScore + gapOpenScore;
1462 I[i] = DP_NEG_INFINITY;
1464 lastStartPos = startPos = 0;
1465 endPos = min(maxLengthOfGap + 1, queryPatternLength - queryOffset);
1466 textPos = textOffset;
1468 forwardMaxScore = 0;
1469 forwardMaxScorePosQuery = queryOffset;
1470 forwardMaxScorePosText = textOffset;
1472 while (startPos <= endPos && textPos < textLength) {
1474 if (endPos >= dpCellAllocated) {
1475 fprintf(stderr, "HSPGappedExtension(): Not enough DP cells allocated!\n");
1476 exit(1);
1479 if (textPos >= textDecodePos) {
1480 c = packedDNA[textDecodePos / CHAR_PER_WORD];
1481 textDecodePos += CHAR_PER_WORD;
1483 textChar = (char)(c >> (BITS_IN_WORD - BIT_PER_CHAR));
1484 c <<= BIT_PER_CHAR;
1486 nextEndPos = 0;
1487 nextStartPos = INT_MAX;
1489 if (startPos > lastStartPos) {
1490 BLast = B[startPos - 1];
1491 D = DP_NEG_INFINITY;
1492 currentPos = startPos;
1493 } else {
1494 // all scores in currentPos - 1 is DP_NEG_INFINITY
1495 BLast = B[startPos];
1496 B[startPos] = I[startPos];
1497 I[startPos] = I[startPos] + gapExtendScore;
1498 D = B[startPos] + gapExtendScore + gapOpenScore;
1499 if (B[startPos] + dropoffScore >= forwardMaxScore) {
1500 nextStartPos = startPos;
1501 nextEndPos = startPos;
1503 currentPos = startPos + 1;
1506 while (currentPos <= endPos) {
1508 M = BLast + scoringMatrix[textChar][convertedKey[currentPos + queryOffset - 1]];
1509 BLast = B[currentPos];
1511 if (M >= I[currentPos] && M >= D) {
1512 // matchScore is maximum
1513 B[currentPos] = M;
1514 I[currentPos] = max(M + gapOpenScore, I[currentPos]) + gapExtendScore;
1515 D = max(M + gapOpenScore, D) + gapExtendScore;
1516 } else {
1517 B[currentPos] = max(I[currentPos], D);
1518 // insert cannot follow delete and delete cannot follow insert
1519 I[currentPos] = I[currentPos] + gapExtendScore;
1520 D = D + gapExtendScore;
1522 if (B[currentPos] + dropoffScore >= forwardMaxScore) {
1523 if (nextStartPos > currentPos) {
1524 nextStartPos = currentPos;
1526 nextEndPos = currentPos;
1528 if (B[currentPos] > forwardMaxScore) {
1529 forwardMaxScore = B[currentPos];
1530 forwardMaxScorePosQuery = currentPos - 1 + queryOffset;
1531 forwardMaxScorePosText = textPos;
1534 currentPos++;
1538 nextEndPos++;
1539 if (nextEndPos == currentPos) {
1540 while (nextEndPos <= queryPatternLength - queryOffset && D + dropoffScore >= forwardMaxScore) {
1541 if (nextEndPos >= dpCellAllocated) {
1542 fprintf(stderr, "HSPGappedExtension(): Not enough DP cells allocated!\n");
1543 exit(1);
1545 B[nextEndPos] = D;
1546 D = D + gapExtendScore;
1547 I[nextEndPos] = DP_NEG_INFINITY;
1548 nextEndPos++;
1550 I[nextEndPos] = DP_NEG_INFINITY;
1553 if (nextEndPos > queryPatternLength - queryOffset) {
1554 nextEndPos = queryPatternLength - queryOffset;
1557 lastStartPos = startPos;
1558 startPos = nextStartPos;
1559 endPos = nextEndPos;
1561 textPos++;
1565 // Backward extend
1567 textDecodePos = textOffset;
1568 if (textDecodePos % CHAR_PER_WORD > 0) {
1569 charToProcess = textDecodePos % CHAR_PER_WORD;
1570 c = packedDNA[textDecodePos / CHAR_PER_WORD] >> (BITS_IN_WORD - charToProcess * BIT_PER_CHAR);
1571 textDecodePos -= charToProcess;
1574 // Fill initial scores
1575 B[0] = 0;
1576 I[0] = gapExtendScore + gapOpenScore;
1577 for (i=1; i<=maxLengthOfGap; i++) {
1578 B[i] = i * gapExtendScore + gapOpenScore;
1579 I[i] = B[i] + gapExtendScore + gapOpenScore;
1581 I[i] = DP_NEG_INFINITY;
1583 lastStartPos = startPos = 0;
1584 endPos = min(maxLengthOfGap + 1, queryOffset);
1585 textPos = textOffset;
1587 backwardMaxScore = 0;
1588 backwardMaxScorePosQuery = queryOffset;
1589 backwardMaxScorePosText = textOffset;
1591 while (startPos <= endPos && textPos > 0) {
1593 if (endPos >= dpCellAllocated) {
1594 fprintf(stderr, "HSPGappedExtension(): Not enough DP cells allocated!\n");
1595 exit(1);
1598 if (textPos <= textDecodePos) {
1599 c = packedDNA[textDecodePos / CHAR_PER_WORD - 1];
1600 textDecodePos -= CHAR_PER_WORD;
1602 textChar = (char)(c & 0x3);
1603 c >>= BIT_PER_CHAR;
1605 nextEndPos = 0;
1606 nextStartPos = INT_MAX;
1608 if (startPos > lastStartPos) {
1609 BLast = B[startPos - 1];
1610 D = DP_NEG_INFINITY;
1611 currentPos = startPos;
1612 } else {
1613 // all scores in currentPos - 1 is DP_NEG_INFINITY
1614 BLast = B[startPos];
1615 B[startPos] = I[startPos];
1616 I[startPos] = I[startPos] + gapExtendScore;
1617 D = B[startPos] + gapExtendScore + gapOpenScore;
1618 if (B[startPos] + dropoffScore >= backwardMaxScore) {
1619 nextStartPos = startPos;
1620 nextEndPos = startPos;
1622 currentPos = startPos + 1;
1625 while (currentPos <= endPos) {
1627 M = BLast + scoringMatrix[textChar][convertedKey[queryOffset - currentPos]];
1628 BLast = B[currentPos];
1630 if (M >= I[currentPos] && M >= D) {
1631 // matchScore is maximum
1632 B[currentPos] = M;
1633 I[currentPos] = max(M + gapOpenScore, I[currentPos]) + gapExtendScore;
1634 D = max(M + gapOpenScore, D) + gapExtendScore;
1635 } else {
1636 B[currentPos] = max(I[currentPos], D);
1637 // insert cannot follow delete and delete cannot follow insert
1638 I[currentPos] = I[currentPos] + gapExtendScore;
1639 D = D + gapExtendScore;
1641 if (B[currentPos] + dropoffScore >= backwardMaxScore) {
1642 if (nextStartPos > currentPos) {
1643 nextStartPos = currentPos;
1645 nextEndPos = currentPos;
1647 if (B[currentPos] > backwardMaxScore) {
1648 backwardMaxScore = B[currentPos];
1649 backwardMaxScorePosQuery = queryOffset - currentPos;
1650 backwardMaxScorePosText = textPos - 1;
1653 currentPos++;
1657 nextEndPos++;
1658 if (nextEndPos == currentPos) {
1659 while (nextEndPos <= queryOffset && D + dropoffScore >= backwardMaxScore) {
1660 if (nextEndPos >= dpCellAllocated) {
1661 fprintf(stderr, "HSPGappedExtension(): Not enough DP cells allocated!\n");
1662 exit(1);
1664 B[nextEndPos] = D;
1665 D = D + gapExtendScore;
1666 I[nextEndPos] = DP_NEG_INFINITY;
1667 nextEndPos++;
1669 I[nextEndPos] = DP_NEG_INFINITY;
1672 if (nextEndPos > queryOffset) {
1673 nextEndPos = queryOffset;
1676 lastStartPos = startPos;
1677 startPos = nextStartPos;
1678 endPos = nextEndPos;
1680 textPos--;
1684 // check cutoff score
1685 if (forwardMaxScore + backwardMaxScore >= cutoffScore) {
1686 gappedHitList[numberOfGappedHit].posText = backwardMaxScorePosText;
1687 gappedHitList[numberOfGappedHit].posQuery = backwardMaxScorePosQuery;
1688 gappedHitList[numberOfGappedHit].lengthQuery = forwardMaxScorePosQuery + 1 - backwardMaxScorePosQuery;
1689 gappedHitList[numberOfGappedHit].score = forwardMaxScore + backwardMaxScore;
1690 gappedHitList[numberOfGappedHit].lengthText = forwardMaxScorePosText + 1 - backwardMaxScorePosText;
1691 gappedHitList[numberOfGappedHit].ungappedPosText = textOffset;
1692 gappedHitList[numberOfGappedHit].ungappedPosQuery = queryOffset;
1693 numberOfGappedHit++;
1696 hitProcessed++;
1700 // free working memory
1701 MMPoolReturn(mmPool, B, dpCellAllocated * sizeof(int));
1702 MMPoolReturn(mmPool, I, dpCellAllocated * sizeof(int));
1705 return numberOfGappedHit;
1709 // mmPool can be NULL but alignmentPool must be allocated for storing alignments as they are not freed explicitly and must be freed through MMPool
1710 // gappedHitList should be sorted in increasin posText order
1711 int HSPGappedExtensionWithTraceback(const unsigned int *packedDNA, const unsigned char *convertedKey, const int queryPatternLength,
1712 GappedHitList* __restrict gappedHitList, const int numberOfHit,
1713 MMPool *mmPool, MMPool *alignmentPool,
1714 const SeqOffset *seqOffset, const Ambiguity *ambiguity,
1715 const int matchScore, const int mismatchScore,
1716 const int gapOpenScore, const int gapExtendScore,
1717 const double maxEvalue, const int dropoffScore) {
1719 char* __restrict textBuffer;
1720 int M, BLast;
1721 int* __restrict B;
1722 int* __restrict I; // insert and delete wrt query
1723 int D; // insert and delete wrt query
1724 char* __restrict IType; // Gapped opening or gap extension
1725 char DType; // Gapped opening or gap extension
1726 int scoringMatrix[DNA_CHAR_SIZE][DNA_CHAR_SIZE];
1728 char* __restrict traceback;
1729 int* __restrict tracebackIndex;
1730 char* __restrict tempForwardAlignment;
1731 char* __restrict tempBackwardAlignment;
1732 char* __restrict tempForwardAuxiliaryText;
1733 char* __restrict tempBackwardAuxiliaryText;
1734 unsigned int* __restrict alignment;
1735 unsigned int* __restrict auxiliaryText;
1737 int hitProcessed, numberOfGappedHit;
1739 int i, j;
1740 unsigned int c;
1741 int queryOffset;
1742 unsigned long long textOffset;
1743 unsigned long long sequenceStart, sequenceEnd;
1744 int charToProcess;
1745 unsigned long long textDecodePos;
1746 double evalue;
1748 int lastStartPos, startPos, nextStartPos;
1749 int endPos, nextEndPos;
1750 unsigned long long textPos;
1751 int ambiguityIndex;
1753 int maxLengthOfGap;
1754 int currentPos;
1755 int currentTracebackIndex;
1756 int forwardMaxScore, backwardMaxScore;
1757 unsigned long long forwardMaxScorePosQuery, forwardMaxScorePosText;
1758 unsigned long long backwardMaxScorePosQuery, backwardMaxScorePosText;
1759 int forwardNumOfAlignment, forwardNumOfAuxiliaryText;
1760 int backwardNumOfAlignment, backwardNumOfAuxiliaryText;
1761 int numOfAlignmentWord, numOfAuxiliaryTextWord;
1762 int tracebackAllocationUnit, tracebackAllocated;
1763 char dpType;
1764 int dpScore;
1766 int dpCellAllocated;
1768 dpCellAllocated = min(queryPatternLength + 1, MAX_ALIGNMENT_LENGTH);
1770 if (alignmentPool == NULL) {
1771 fprintf(stderr, "HSPGappedExtensionWithTraceback(): alignmentPool is not allocated!\n");
1772 exit(1);
1775 maxLengthOfGap = (dropoffScore + gapOpenScore) / (-gapExtendScore);
1777 // allocate working memory
1778 textBuffer = (char*) MMPoolDispatch(mmPool, dpCellAllocated * 2);
1779 B = (int*) MMPoolDispatch(mmPool, dpCellAllocated * sizeof(int));
1780 IType = (char*) MMPoolDispatch(mmPool, dpCellAllocated);
1781 I = (int*) MMPoolDispatch(mmPool, dpCellAllocated * sizeof(int));
1783 tracebackIndex = (int*) MMPoolDispatch(mmPool, dpCellAllocated * 2 * 2 * sizeof(int));
1785 tracebackAllocated = tracebackAllocationUnit = dpCellAllocated * maxLengthOfGap * maxLengthOfGap;
1786 traceback = (char*) MMUnitAllocate(tracebackAllocated);
1788 tempForwardAlignment = (char*) MMPoolDispatch(mmPool, dpCellAllocated * 2);
1789 tempBackwardAlignment = (char*) MMPoolDispatch(mmPool, dpCellAllocated * 2);
1790 tempForwardAuxiliaryText = (char*) MMPoolDispatch(mmPool, dpCellAllocated);
1791 tempBackwardAuxiliaryText = (char*) MMPoolDispatch(mmPool, dpCellAllocated);
1793 HSPFillScoringMatrix(scoringMatrix, matchScore, mismatchScore, 0);
1795 hitProcessed = 0;
1796 numberOfGappedHit = 0;
1797 ambiguityIndex = 0;
1799 while (hitProcessed < numberOfHit) {
1801 sequenceStart = seqOffset[gappedHitList[hitProcessed].dbSeqIndex].startPos;
1802 sequenceEnd = seqOffset[gappedHitList[hitProcessed].dbSeqIndex].endPos;
1803 if (seqOffset[gappedHitList[hitProcessed].dbSeqIndex].firstAmbiguityIndex > ambiguityIndex) {
1804 ambiguityIndex = seqOffset[gappedHitList[hitProcessed].dbSeqIndex].firstAmbiguityIndex;
1807 textOffset = gappedHitList[hitProcessed].ungappedPosText;
1808 queryOffset = gappedHitList[hitProcessed].ungappedPosQuery;
1811 // Forward extend
1813 textDecodePos = textOffset;
1814 if (textDecodePos % CHAR_PER_WORD > 0) {
1815 // decode text to the next word boundary
1816 charToProcess = CHAR_PER_WORD - (textDecodePos % CHAR_PER_WORD);
1817 c = packedDNA[textDecodePos / CHAR_PER_WORD] << (BITS_IN_WORD - charToProcess * BIT_PER_CHAR);
1818 for (i=0; i<charToProcess; i++) {
1819 textBuffer[textDecodePos + i - textOffset + 1] = (unsigned char)(c >> (BITS_IN_WORD - BIT_PER_CHAR));
1820 c <<= BIT_PER_CHAR;
1823 // Apply ambiguity
1824 while (ambiguity[ambiguityIndex].rightOfEndPos <= textOffset) {
1825 ambiguityIndex++;
1827 if (ambiguity[ambiguityIndex].startPos < textOffset + charToProcess) {
1828 for (i=0; i<charToProcess; i++) {
1829 while (ambiguity[ambiguityIndex].rightOfEndPos <= textOffset + i) {
1830 ambiguity++;
1832 if (ambiguity[ambiguityIndex].startPos <= textOffset + i) {
1833 textBuffer[textDecodePos + i - textOffset + 1] = (char)ambiguity[ambiguityIndex].symbol;
1838 textDecodePos += charToProcess;
1841 // Fill initial scores
1842 B[0] = 0;
1843 traceback[0] = DP_MATCH_MISMATCH;
1844 I[0] = gapExtendScore + gapOpenScore;
1845 IType[0] = DP_INSERT_OPEN;
1847 B[1] = gapExtendScore + gapOpenScore;
1848 traceback[1] = DP_DELETE | DP_DELETE_OPEN;
1849 I[1] = B[1] + gapExtendScore + gapOpenScore;
1850 IType[1] = DP_INSERT_OPEN;
1852 for (i=2; i<=maxLengthOfGap; i++) {
1853 B[i] = B[i-1] + gapExtendScore;
1854 traceback[i] = DP_DELETE;
1855 I[i] = B[i] + gapExtendScore + gapOpenScore;
1856 IType[i] = DP_INSERT_OPEN;
1858 I[i] = DP_NEG_INFINITY;
1860 currentTracebackIndex = maxLengthOfGap + 1;
1862 lastStartPos = startPos = 0;
1863 endPos = min((maxLengthOfGap + 1), queryPatternLength - queryOffset);
1864 textPos = textOffset;
1866 forwardMaxScore = 0;
1867 forwardMaxScorePosQuery = queryOffset;
1868 forwardMaxScorePosText = textOffset;
1870 tracebackIndex[0] = 0; // The first trackback of the row
1871 tracebackIndex[1] = 0; // The first query position of the row
1873 while (startPos <= endPos && textPos <= sequenceEnd) {
1875 if (endPos >= dpCellAllocated) {
1876 fprintf(stderr, "HSPGappedExtensionWithTraceback(): Not enough DP cells allocated!\n");
1877 exit(1);
1880 if (textPos < textDecodePos) {
1881 } else {
1882 // decode a word of text
1883 c = packedDNA[textDecodePos / CHAR_PER_WORD];
1884 for (j=0; j<CHAR_PER_WORD; j++) {
1885 textBuffer[textDecodePos + j - textOffset + 1] = (unsigned char)(c >> (BITS_IN_WORD - BIT_PER_CHAR));
1886 c <<= BIT_PER_CHAR;
1889 // Apply ambiguity
1890 while (ambiguity[ambiguityIndex].rightOfEndPos <= textPos) {
1891 ambiguityIndex++;
1893 if (ambiguity[ambiguityIndex].startPos < textPos + CHAR_PER_WORD) {
1894 for (j=0; j<CHAR_PER_WORD; j++) {
1895 while (ambiguity[ambiguityIndex].rightOfEndPos <= textPos + j) {
1896 ambiguity++;
1898 if (ambiguity[ambiguityIndex].startPos <= textPos + j) {
1899 textBuffer[textDecodePos + j - textOffset + 1] = (char)ambiguity[ambiguityIndex].symbol;
1904 textDecodePos += CHAR_PER_WORD;
1907 // traceback
1908 tracebackIndex[(textPos - textOffset + 1) * 2] = currentTracebackIndex; // The first trackback of the row
1909 tracebackIndex[(textPos - textOffset + 1) * 2 + 1] = startPos; // The first query position of the row
1911 nextEndPos = 0;
1912 nextStartPos = INT_MAX;
1914 if (currentTracebackIndex + queryPatternLength >= tracebackAllocated) {
1915 traceback = (char*) MMUnitReallocate(traceback, tracebackAllocated + tracebackAllocationUnit, tracebackAllocated);
1916 tracebackAllocated += tracebackAllocationUnit;
1919 if (startPos > lastStartPos) {
1920 BLast = B[startPos - 1];
1921 D = DP_NEG_INFINITY;
1922 DType = 0;
1923 currentPos = startPos;
1924 } else {
1925 // all scores in currentPos - 1 is DP_NEG_INFINITY
1926 BLast = B[startPos];
1927 B[startPos] = I[startPos];
1928 traceback[currentTracebackIndex] = DP_INSERT | IType[startPos];
1929 I[startPos] = I[startPos] + gapExtendScore;
1930 IType[startPos] = DP_INSERT_EXTEND;
1931 D = B[startPos] + gapExtendScore + gapOpenScore;
1932 DType = DP_DELETE_OPEN;
1933 if (B[startPos] + dropoffScore >= forwardMaxScore) {
1934 nextStartPos = startPos;
1935 nextEndPos = startPos;
1937 currentPos = startPos + 1;
1938 currentTracebackIndex++;
1941 while (currentPos <= endPos) {
1943 M = BLast + scoringMatrix[textBuffer[textPos - textOffset + 1]][convertedKey[currentPos + queryOffset - 1]];
1944 BLast = B[currentPos];
1946 if (M >= I[currentPos] && M >= D) {
1947 // matchScore is maximum
1948 B[currentPos] = M;
1949 traceback[currentTracebackIndex] = DP_MATCH_MISMATCH | IType[currentPos] | DType;
1950 if (M + gapOpenScore >= I[currentPos]) {
1951 I[currentPos] = M + gapOpenScore + gapExtendScore;
1952 IType[currentPos] = DP_INSERT_OPEN;
1953 } else {
1954 I[currentPos] = I[currentPos] + gapExtendScore;
1955 IType[currentPos] = DP_INSERT_EXTEND;
1957 if (M + gapOpenScore >= D) {
1958 D = M + gapOpenScore + gapExtendScore;
1959 DType = DP_DELETE_OPEN;
1960 } else {
1961 D = D + gapExtendScore;
1962 DType = DP_DELETE_EXTEND;
1964 if (B[currentPos] > forwardMaxScore) {
1965 forwardMaxScore = B[currentPos];
1966 forwardMaxScorePosQuery = currentPos + queryOffset;
1967 forwardMaxScorePosText = textPos + 1;
1969 } else {
1970 if (I[currentPos] >= D) {
1971 B[currentPos] = I[currentPos];
1972 traceback[currentTracebackIndex] = DP_INSERT | IType[currentPos] | DType;
1973 } else {
1974 B[currentPos] = D;
1975 traceback[currentTracebackIndex] = DP_DELETE | IType[currentPos] | DType;
1977 // insert cannot follow delete and delete cannot follow insert
1978 I[currentPos] = I[currentPos] + gapExtendScore;
1979 IType[currentPos] = DP_INSERT_EXTEND;
1980 D = D + gapExtendScore;
1981 DType = DP_DELETE_EXTEND;
1983 if (B[currentPos] + dropoffScore >= forwardMaxScore) {
1984 if (nextStartPos > currentPos) {
1985 nextStartPos = currentPos;
1987 nextEndPos = currentPos;
1990 currentPos++;
1991 currentTracebackIndex++;
1995 nextEndPos++;
1996 if (nextEndPos == currentPos) {
1997 while (nextEndPos <= queryPatternLength - queryOffset && D + dropoffScore >= forwardMaxScore) {
1998 if (nextEndPos >= dpCellAllocated) {
1999 fprintf(stderr, "HSPGappedExtensionWithTraceback(): Not enough DP cells allocated!\n");
2000 exit(1);
2002 B[nextEndPos] = D;
2003 traceback[currentTracebackIndex] = DP_DELETE | DType;
2004 D = D + gapExtendScore;
2005 DType = DP_DELETE_EXTEND;
2006 I[nextEndPos] = DP_NEG_INFINITY;
2007 IType[nextEndPos] = 0;
2008 nextEndPos++;
2009 currentTracebackIndex++;
2011 I[nextEndPos] = DP_NEG_INFINITY;
2012 IType[nextEndPos] = 0;
2015 if (nextEndPos > queryPatternLength - queryOffset) {
2016 nextEndPos = queryPatternLength - queryOffset;
2019 lastStartPos = startPos;
2020 startPos = nextStartPos;
2021 endPos = nextEndPos;
2023 textPos++;
2027 // traceback
2028 forwardNumOfAlignment = 0;
2029 forwardNumOfAuxiliaryText = 0;
2030 textPos = forwardMaxScorePosText - 1;
2031 currentPos = forwardMaxScorePosQuery - queryOffset;
2032 dpScore = 0; // for verifying traceback
2034 currentTracebackIndex = tracebackIndex[(textPos - textOffset + 1) * 2]
2035 + currentPos - tracebackIndex[(textPos - textOffset + 1) * 2 + 1];
2037 while (currentTracebackIndex > 0) {
2039 dpType = traceback[currentTracebackIndex] & DP_MASK;
2041 if (dpType == DP_MATCH_MISMATCH) {
2042 dpScore += scoringMatrix[textBuffer[textPos - textOffset + 1]][convertedKey[currentPos + queryOffset - 1]];
2043 if (textBuffer[textPos - textOffset + 1] == convertedKey[currentPos + queryOffset - 1] &&
2044 textBuffer[textPos - textOffset + 1] < 4) { // match and not ambiguity
2045 tempForwardAlignment[forwardNumOfAlignment] = ALIGN_MATCH;
2046 } else {
2047 tempForwardAlignment[forwardNumOfAlignment] = ALIGN_MISMATCH_AMBIGUITY;
2048 tempForwardAuxiliaryText[forwardNumOfAuxiliaryText] = textBuffer[textPos - textOffset + 1];
2049 forwardNumOfAuxiliaryText++;
2051 textPos--;
2052 currentPos--;
2053 currentTracebackIndex = tracebackIndex[(textPos - textOffset + 1) * 2]
2054 + currentPos - tracebackIndex[(textPos - textOffset + 1) * 2 + 1];
2055 } else if (dpType == DP_INSERT) {
2056 while (!(traceback[currentTracebackIndex] & DP_INSERT_OPEN)) {
2057 dpScore += gapExtendScore;
2058 tempForwardAlignment[forwardNumOfAlignment] = ALIGN_INSERT;
2059 forwardNumOfAlignment++;
2060 tempForwardAuxiliaryText[forwardNumOfAuxiliaryText] = textBuffer[textPos - textOffset + 1];
2061 forwardNumOfAuxiliaryText++;
2062 textPos--;
2063 currentTracebackIndex = tracebackIndex[(textPos - textOffset + 1) * 2]
2064 + currentPos - tracebackIndex[(textPos - textOffset + 1) * 2 + 1];
2066 dpScore += gapOpenScore + gapExtendScore;
2067 tempForwardAlignment[forwardNumOfAlignment] = ALIGN_INSERT;
2068 tempForwardAuxiliaryText[forwardNumOfAuxiliaryText] = textBuffer[textPos - textOffset + 1];
2069 forwardNumOfAuxiliaryText++;
2070 textPos--;
2071 currentTracebackIndex = tracebackIndex[(textPos - textOffset + 1) * 2]
2072 + currentPos - tracebackIndex[(textPos - textOffset + 1) * 2 + 1];
2073 } else {
2074 while (!(traceback[currentTracebackIndex] & DP_DELETE_OPEN)) {
2075 dpScore += gapExtendScore;
2076 tempForwardAlignment[forwardNumOfAlignment] = ALIGN_DELETE;
2077 forwardNumOfAlignment++;
2078 currentPos--;
2079 currentTracebackIndex--;
2081 dpScore += gapOpenScore + gapExtendScore;
2082 tempForwardAlignment[forwardNumOfAlignment] = ALIGN_DELETE;
2083 currentPos--;
2084 currentTracebackIndex--;
2087 forwardNumOfAlignment++;
2091 if (dpScore != forwardMaxScore) {
2092 fprintf(stderr, "Forward gapped extension traceback error!\n");
2093 exit(1);
2096 // Backward extend
2098 textDecodePos = textOffset;
2099 if (textDecodePos % CHAR_PER_WORD > 0) {
2100 // decode text to the next word boundary
2101 charToProcess = textDecodePos % CHAR_PER_WORD;
2102 c = packedDNA[textDecodePos / CHAR_PER_WORD] >> (BITS_IN_WORD - charToProcess * BIT_PER_CHAR);
2103 for (i=0; i<charToProcess; i++) {
2104 textBuffer[textOffset - textDecodePos + i + 1] = (unsigned char)(c & 0x3);
2105 c >>= BIT_PER_CHAR;
2108 // Apply ambiguity
2109 while (ambiguity[ambiguityIndex].startPos >= textOffset) {
2110 ambiguityIndex--;
2112 if (ambiguity[ambiguityIndex].rightOfEndPos > (textOffset - charToProcess)) {
2113 for (i=0; i<charToProcess; i++) {
2114 while (ambiguity[ambiguityIndex].startPos >= (textOffset - i)) {
2115 ambiguityIndex--;
2117 if (ambiguity[ambiguityIndex].rightOfEndPos > (textOffset - i - 1)) {
2118 textBuffer[textOffset - textDecodePos + i + 1] = (char)ambiguity[ambiguityIndex].symbol;
2123 textDecodePos -= charToProcess;
2126 // Fill initial scores
2127 B[0] = 0;
2128 traceback[0] = DP_MATCH_MISMATCH;
2129 I[0] = gapExtendScore + gapOpenScore;
2130 IType[0] = DP_INSERT_OPEN;
2132 B[1] = gapExtendScore + gapOpenScore;
2133 traceback[1] = DP_DELETE | DP_DELETE_OPEN;
2134 I[1] = B[1] + gapExtendScore + gapOpenScore;
2135 IType[1] = DP_INSERT_OPEN;
2137 for (i=2; i<=maxLengthOfGap; i++) {
2138 B[i] = B[i-1] + gapExtendScore;
2139 traceback[i] = DP_DELETE;
2140 I[i] = B[i] + gapExtendScore + gapOpenScore;
2141 IType[i] = DP_INSERT_OPEN;
2143 I[i] = DP_NEG_INFINITY;
2145 currentTracebackIndex = maxLengthOfGap + 1;
2147 lastStartPos = startPos = 0;
2148 endPos = min((maxLengthOfGap + 1), queryOffset);
2149 textPos = textOffset;
2151 backwardMaxScore = 0;
2152 backwardMaxScorePosQuery = queryOffset;
2153 backwardMaxScorePosText = textOffset;
2155 tracebackIndex[0] = 0;
2156 tracebackIndex[1] = 0;
2158 while (startPos <= endPos && textPos + 1 > sequenceStart) {
2160 if (endPos >= dpCellAllocated) {
2161 fprintf(stderr, "HSPGappedExtensionWithTraceback(): Not enough DP cells allocated!\n");
2162 exit(1);
2165 if (textPos > textDecodePos) {
2166 } else {
2167 // decode a word of text
2168 c = packedDNA[textDecodePos / CHAR_PER_WORD - 1];
2169 for (j=0; j<CHAR_PER_WORD; j++) {
2170 textBuffer[textOffset - textDecodePos + j + 1] = (unsigned char)(c & 0x3);
2171 c >>= BIT_PER_CHAR;
2174 // Apply ambiguity
2175 while (ambiguity[ambiguityIndex].startPos >= textPos) {
2176 ambiguityIndex--;
2178 if (ambiguity[ambiguityIndex].rightOfEndPos > (textPos - CHAR_PER_WORD)) {
2179 for (j=0; j<CHAR_PER_WORD; j++) {
2180 while (ambiguity[ambiguityIndex].startPos >= (textPos - j)) {
2181 ambiguityIndex--;
2183 if (ambiguity[ambiguityIndex].rightOfEndPos > (textPos - j - 1)) {
2184 textBuffer[textOffset - textDecodePos + j + 1] = (char)ambiguity[ambiguityIndex].symbol;
2189 textDecodePos -= CHAR_PER_WORD;
2192 // traceback
2193 tracebackIndex[(textOffset - textPos + 1) * 2] = currentTracebackIndex;
2194 tracebackIndex[(textOffset - textPos + 1) * 2 + 1] = startPos;
2196 nextEndPos = 0;
2197 nextStartPos = INT_MAX;
2199 if (currentTracebackIndex + queryPatternLength >= tracebackAllocated) {
2200 traceback = (char*) MMUnitReallocate(traceback, tracebackAllocated + tracebackAllocationUnit, tracebackAllocated);
2201 tracebackAllocated += tracebackAllocationUnit;
2204 if (startPos > lastStartPos) {
2205 BLast = B[startPos - 1];
2206 D = DP_NEG_INFINITY;
2207 DType = 0;
2208 currentPos = startPos;
2209 } else {
2210 // all scores in currentPos - 1 is DP_NEG_INFINITY
2211 BLast = B[startPos];
2212 B[startPos] = I[startPos];
2213 traceback[currentTracebackIndex] = DP_INSERT | IType[startPos];
2214 I[startPos] = I[startPos] + gapExtendScore;
2215 IType[startPos] = DP_INSERT_EXTEND;
2216 D = B[startPos] + gapExtendScore + gapOpenScore;
2217 DType = DP_DELETE_OPEN;
2218 if (B[startPos] + dropoffScore >= forwardMaxScore) {
2219 nextStartPos = startPos;
2220 nextEndPos = startPos;
2222 currentPos = startPos + 1;
2223 currentTracebackIndex++;
2227 while (currentPos <= endPos) {
2229 M = BLast + scoringMatrix[textBuffer[textOffset - textPos + 1]][convertedKey[queryOffset - currentPos]];
2230 BLast = B[currentPos];
2232 if (M >= I[currentPos] && M >= D) {
2233 // matchScore is maximum
2234 B[currentPos] = M;
2235 traceback[currentTracebackIndex] = DP_MATCH_MISMATCH | IType[currentPos] | DType;
2236 if (M + gapOpenScore >= I[currentPos]) {
2237 I[currentPos] = M + gapOpenScore + gapExtendScore;
2238 IType[currentPos] = DP_INSERT_OPEN;
2239 } else {
2240 I[currentPos] = I[currentPos] + gapExtendScore;
2241 IType[currentPos] = DP_INSERT_EXTEND;
2243 if (M + gapOpenScore >= D) {
2244 D = M + gapOpenScore + gapExtendScore;
2245 DType = DP_DELETE_OPEN;
2246 } else {
2247 D = D + gapExtendScore;
2248 DType = DP_DELETE_EXTEND;
2250 if (B[currentPos] > backwardMaxScore) {
2251 backwardMaxScore = B[currentPos];
2252 backwardMaxScorePosQuery = queryOffset - currentPos;
2253 backwardMaxScorePosText = textPos - 1;
2255 } else {
2256 if (I[currentPos] >= D) {
2257 B[currentPos] = I[currentPos];
2258 traceback[currentTracebackIndex] = DP_INSERT | IType[currentPos] | DType;
2259 } else {
2260 B[currentPos] = D;
2261 traceback[currentTracebackIndex] = DP_DELETE | IType[currentPos] | DType;
2263 // insert cannot follow delete and delete cannot follow insert
2264 I[currentPos] = I[currentPos] + gapExtendScore;
2265 IType[currentPos] = DP_INSERT_EXTEND;
2266 D = D + gapExtendScore;
2267 DType = DP_DELETE_EXTEND;
2270 if (B[currentPos] + dropoffScore >= backwardMaxScore) {
2271 if (nextStartPos > currentPos) {
2272 nextStartPos = currentPos;
2274 nextEndPos = currentPos;
2277 currentPos++;
2278 currentTracebackIndex++;
2282 nextEndPos++;
2283 if (nextEndPos == currentPos) {
2284 while (nextEndPos <= queryOffset && D + dropoffScore >= backwardMaxScore) {
2285 if (nextEndPos >= dpCellAllocated) {
2286 fprintf(stderr, "HSPGappedExtensionWithTraceback(): Not enough DP cells allocated!\n");
2287 exit(1);
2289 B[nextEndPos] = D;
2290 traceback[currentTracebackIndex] = DP_DELETE | DType;
2291 D = D + gapExtendScore;
2292 DType = DP_DELETE_EXTEND;
2293 I[nextEndPos] = DP_NEG_INFINITY;
2294 IType[nextEndPos] = 0;
2295 nextEndPos++;
2296 currentTracebackIndex++;
2298 I[nextEndPos] = DP_NEG_INFINITY;
2299 IType[nextEndPos] = 0;
2302 if (nextEndPos > queryOffset) {
2303 nextEndPos = queryOffset;
2306 lastStartPos = startPos;
2307 startPos = nextStartPos;
2308 endPos = nextEndPos;
2310 textPos--;
2314 // traceback
2315 backwardNumOfAlignment = 0;
2316 backwardNumOfAuxiliaryText = 0;
2317 textPos = backwardMaxScorePosText + 1;
2318 currentPos = queryOffset - backwardMaxScorePosQuery;
2319 dpScore = 0; // for verifying traceback
2321 currentTracebackIndex = tracebackIndex[(textOffset - textPos + 1) * 2]
2322 + currentPos - tracebackIndex[(textOffset - textPos + 1) * 2 + 1];
2324 while (currentTracebackIndex > 0) {
2326 dpType = traceback[currentTracebackIndex] & DP_MASK;
2328 if (dpType == DP_MATCH_MISMATCH) {
2329 dpScore += scoringMatrix[textBuffer[textOffset - textPos + 1]][convertedKey[queryOffset - currentPos]];
2330 if (textBuffer[textOffset - textPos + 1] == convertedKey[queryOffset - currentPos] &&
2331 textBuffer[textOffset - textPos + 1] < 4) { // match and not ambiguity
2332 tempBackwardAlignment[backwardNumOfAlignment] = ALIGN_MATCH;
2333 } else {
2334 tempBackwardAlignment[backwardNumOfAlignment] = ALIGN_MISMATCH_AMBIGUITY;
2335 tempBackwardAuxiliaryText[backwardNumOfAuxiliaryText] = textBuffer[textOffset - textPos + 1];
2336 backwardNumOfAuxiliaryText++;
2338 textPos++;
2339 currentPos--;
2340 currentTracebackIndex = tracebackIndex[(textOffset - textPos + 1) * 2]
2341 + currentPos - tracebackIndex[(textOffset - textPos + 1) * 2 + 1];
2342 } else if (dpType == DP_INSERT) {
2343 while (!(traceback[currentTracebackIndex] & DP_INSERT_OPEN)) {
2344 dpScore += gapExtendScore;
2345 tempBackwardAlignment[backwardNumOfAlignment] = ALIGN_INSERT;
2346 backwardNumOfAlignment++;
2347 tempBackwardAuxiliaryText[backwardNumOfAuxiliaryText] = textBuffer[textOffset - textPos + 1];
2348 backwardNumOfAuxiliaryText++;
2349 textPos++;
2350 currentTracebackIndex = tracebackIndex[(textOffset - textPos + 1) * 2]
2351 + currentPos - tracebackIndex[(textOffset - textPos + 1) * 2 + 1];
2353 dpScore += gapOpenScore + gapExtendScore;
2354 tempBackwardAlignment[backwardNumOfAlignment] = ALIGN_INSERT;
2355 tempBackwardAuxiliaryText[backwardNumOfAuxiliaryText] = textBuffer[textOffset - textPos + 1];
2356 backwardNumOfAuxiliaryText++;
2357 textPos++;
2358 currentTracebackIndex = tracebackIndex[(textOffset - textPos + 1) * 2]
2359 + currentPos - tracebackIndex[(textOffset - textPos + 1) * 2 + 1];
2360 } else {
2361 while (!(traceback[currentTracebackIndex] & DP_DELETE_OPEN)) {
2362 dpScore += gapExtendScore;
2363 tempBackwardAlignment[backwardNumOfAlignment] = ALIGN_DELETE;
2364 backwardNumOfAlignment++;
2365 currentPos--;
2366 currentTracebackIndex--;
2368 dpScore += gapOpenScore + gapExtendScore;
2369 tempBackwardAlignment[backwardNumOfAlignment] = ALIGN_DELETE;
2370 currentPos--;
2371 currentTracebackIndex--;
2374 backwardNumOfAlignment++;
2378 if (dpScore != backwardMaxScore) {
2379 fprintf(stderr, "Backward gapped extension traceback error!\n");
2380 exit(1);
2383 // check cutoff score
2384 evalue = stat_gapCalcEvalue(stat_gapNominal2normalized(forwardMaxScore + backwardMaxScore));
2385 if (evalue < maxEvalue) {
2386 gappedHitList[numberOfGappedHit].posText = backwardMaxScorePosText;
2387 gappedHitList[numberOfGappedHit].posQuery = backwardMaxScorePosQuery;
2388 gappedHitList[numberOfGappedHit].lengthQuery = forwardMaxScorePosQuery - backwardMaxScorePosQuery;
2389 gappedHitList[numberOfGappedHit].score = forwardMaxScore + backwardMaxScore;
2390 gappedHitList[numberOfGappedHit].lengthText = forwardMaxScorePosText - backwardMaxScorePosText;
2391 gappedHitList[numberOfGappedHit].dbSeqIndex = gappedHitList[hitProcessed].dbSeqIndex;
2393 // Store alignment and auxiliary text
2394 numOfAlignmentWord = (forwardNumOfAlignment + backwardNumOfAlignment + ALIGN_PER_WORD - 1) / ALIGN_PER_WORD;
2395 ((GappedHitListWithAlignment*)(gappedHitList + numberOfGappedHit))->alignmentOffset = MMPoolDispatchOffset(alignmentPool, numOfAlignmentWord * sizeof(unsigned int));
2396 alignment = (unsigned int*)((char*)alignmentPool + ((GappedHitListWithAlignment*)(gappedHitList + numberOfGappedHit))->alignmentOffset);
2397 for (i=0; i<numOfAlignmentWord; i++) {
2398 alignment[i] = 0;
2400 if (forwardNumOfAuxiliaryText + backwardNumOfAuxiliaryText > 0) {
2401 numOfAuxiliaryTextWord = (forwardNumOfAuxiliaryText + backwardNumOfAuxiliaryText + AUX_TEXT_PER_WORD - 1) / AUX_TEXT_PER_WORD;
2402 ((GappedHitListWithAlignment*)(gappedHitList + numberOfGappedHit))->auxiliaryTextOffset = MMPoolDispatchOffset(alignmentPool, numOfAuxiliaryTextWord * sizeof(unsigned int));
2403 auxiliaryText = (unsigned int*)((char*)alignmentPool + ((GappedHitListWithAlignment*)(gappedHitList + numberOfGappedHit))->auxiliaryTextOffset);
2404 for (i=0; i<numOfAuxiliaryTextWord; i++) {
2405 auxiliaryText[i] = 0;
2407 } else {
2408 ((GappedHitListWithAlignment*)(gappedHitList + numberOfGappedHit))->auxiliaryTextOffset = 0;
2411 // backward alignment
2412 for (i=0; i<backwardNumOfAlignment; i++) {
2413 alignment[i/ALIGN_PER_WORD] |= (tempBackwardAlignment[i] << (BITS_IN_WORD - (i % ALIGN_PER_WORD + 1) * ALIGN_BIT));
2415 for (i=0; i<backwardNumOfAuxiliaryText; i++) {
2416 auxiliaryText[i/AUX_TEXT_PER_WORD] |= (tempBackwardAuxiliaryText[i] << (BITS_IN_WORD - (i % AUX_TEXT_PER_WORD + 1) * AUX_TEXT_BIT));
2419 // forward alignment
2420 for (i=0; i<forwardNumOfAlignment; i++) {
2421 alignment[(i+backwardNumOfAlignment)/ALIGN_PER_WORD] |= (tempForwardAlignment[forwardNumOfAlignment - i - 1]
2422 << (BITS_IN_WORD - ((i+backwardNumOfAlignment) % ALIGN_PER_WORD + 1) * ALIGN_BIT));
2424 for (i=0; i<forwardNumOfAuxiliaryText; i++) {
2425 auxiliaryText[(i+backwardNumOfAuxiliaryText)/AUX_TEXT_PER_WORD] |= (tempForwardAuxiliaryText[forwardNumOfAuxiliaryText - i - 1]
2426 << (BITS_IN_WORD - ((i+backwardNumOfAuxiliaryText) % AUX_TEXT_PER_WORD + 1) * AUX_TEXT_BIT));
2429 numberOfGappedHit++;
2433 hitProcessed++;
2437 // free working memory
2438 MMPoolReturn(mmPool, textBuffer, dpCellAllocated * 2);
2439 MMPoolReturn(mmPool, B, dpCellAllocated * sizeof(int));
2440 MMPoolReturn(mmPool, IType, dpCellAllocated);
2441 MMPoolReturn(mmPool, I, dpCellAllocated * sizeof(int));
2443 MMPoolReturn(mmPool, tracebackIndex, dpCellAllocated * 2 * 2 * sizeof(int));
2445 MMUnitFree(traceback, tracebackAllocated);
2447 MMPoolReturn(mmPool, tempForwardAlignment, dpCellAllocated * 2);
2448 MMPoolReturn(mmPool, tempBackwardAlignment, dpCellAllocated * 2);
2449 MMPoolReturn(mmPool, tempForwardAuxiliaryText, dpCellAllocated);
2450 MMPoolReturn(mmPool, tempBackwardAuxiliaryText, dpCellAllocated);
2452 return numberOfGappedHit;
2456 // textList must be sorted in descending posText order;
2457 // queryPosListIndex contains the starting positions of queryPos in queryPosList within a queryPosGroup
2458 // a dummy entry must be present at the end of queryPosListIndex
2460 int HSPDPDBvsQuery(const unsigned int *packedText, const HitList *textList, const int numOfTextHit,
2461 const SeqOffset *textSeqOffset, const Ambiguity *textAmbiguity, const int numOfTextSeq,
2462 const unsigned char *convertedKey, const int queryPatternLength,
2463 const int *queryPosListIndex, const unsigned int *queryPosList,
2464 GappedHitListWithAlignment* __restrict gappedHitList, const int maxNumOfGappedHit,
2465 MMPool *mmPool, MMPool *alignmentPool,
2466 const int matchScore, const int mismatchScore,
2467 const int gapOpenScore, const int gapExtendScore,
2468 const int cutoffScore, const double maxEvalue) {
2470 #define NUM_SOURCE_BIT 12
2471 #define SOURCE_BIT_MASK 0xFFF
2473 #define DROPOFF_MAX_ENTRY 256
2475 #define EARLY_DROPOFF_MIN 11
2476 #define EARLY_DROPOFF_FACTOR 4 // These two are heuristic values to determine the dropoff to trigger addition traceback
2477 // When this heuistic is not applied, about 0.01% alignments reported by Blast are
2478 // filtered out during traceback on experiments with query size at chromosome lengths
2479 // The filtered alignments are of low evalues and are close to some strong alignments
2481 #define TEXT_BUFFER_SIZE 16
2483 // DP table
2484 DPCell** __restrict dpCell;
2485 int M;
2486 int D; // insert and delete wrt query
2488 // The following variables are all indexed by source bit
2490 // Max score
2491 DPMaxScore* __restrict maxScore;
2492 DPMaxScore* __restrict dropoffMaxScore;
2494 // Keeping track whether dropoff should be valid alignments
2495 int* __restrict currentMaxScore;
2496 int* __restrict lastDropoffMaxScoreIndex;
2498 // End variables indexed by source bit
2500 int textBuffer[TEXT_BUFFER_SIZE];
2502 char* __restrict tempAlignment;
2503 char* __restrict tempAuxiliaryText;
2505 int scoringMatrix[DNA_CHAR_SIZE][DNA_CHAR_SIZE];
2506 int gapOpenScoreShifted, gapExtendScoreShifted;
2507 int cutoffScoreShifted;
2508 int dropoffScoreShifted;
2509 int minPositiveScore;
2511 int sourceBit;
2513 double evalue;
2515 int firstTextHitBeingProcessed, numOfTextHitBeingProcessed;
2517 unsigned long long textOffset;
2518 int textChar;
2519 int charToProcess;
2521 int textDbSeqIndex, textAmbiguityIndex;
2522 unsigned long long textSeqStart, textSeqEnd;
2524 int i, j;
2525 unsigned int c;
2527 int numOfGappedHit;
2529 int sourceBitMinScore, maxSourceBit, nextSourceBit, usedSourceBit;
2530 int numOfDropoffMaxScore;
2532 int tempAlignmentIndex;
2533 int tempAuxiliaryTextIndex;
2534 int numOfAlignmentWord, numOfAuxiliaryTextWord;
2536 Traceback* __restrict traceback;
2537 int tracebackAllocated;
2538 int tracebackIndex;
2539 int tracebackIndexAdjustment;
2541 unsigned int queryListInfo;
2543 int lastDpCellUsed, lastDpCellIndex;
2544 int dpCellUsed, dpCellIndex;
2545 int dpCellUsedAdjustedForEdge;
2547 int q;
2548 unsigned long long qPos, lastQPos, maxQPos;
2549 int qChar;
2551 int bestScore, insertScore, deleteScore;
2553 int minAlignmentLength, minNegAlignmentLength, maxDPGroupingDist;
2555 unsigned int* __restrict alignment;
2556 unsigned int* __restrict auxiliaryText;
2558 // allocate working memory
2559 dpCell = (DPCell**) MMPoolDispatch(mmPool, 2 * sizeof(DPCell*));
2560 dpCell[0] = (DPCell*) MMPoolDispatch(mmPool, MAX_ALIGNMENT_LENGTH * sizeof(DPCell));
2561 dpCell[1] = (DPCell*) MMPoolDispatch(mmPool, MAX_ALIGNMENT_LENGTH * sizeof(DPCell));
2563 maxScore = (DPMaxScore*) MMPoolDispatch(mmPool, (1 << NUM_SOURCE_BIT) * sizeof(DPMaxScore));
2564 dropoffMaxScore = (DPMaxScore*) MMPoolDispatch(mmPool, DROPOFF_MAX_ENTRY * sizeof(DPMaxScore));
2566 currentMaxScore = (int*) MMPoolDispatch(mmPool, (1 << NUM_SOURCE_BIT) * sizeof(int));
2567 lastDropoffMaxScoreIndex = (int*) MMPoolDispatch(mmPool, (1 << NUM_SOURCE_BIT) * sizeof(int));
2569 tempAlignment = (char*) MMPoolDispatch(mmPool, (MAX_ALIGNMENT_LENGTH * 2) * sizeof(char));
2570 tempAuxiliaryText = (char*) MMPoolDispatch(mmPool, (MAX_ALIGNMENT_LENGTH * 2) * sizeof(char));
2572 tracebackAllocated = MMPoolByteAvailable(mmPool) / sizeof(Traceback);
2573 if (tracebackAllocated < 0) {
2574 fprintf(stderr, "HSPDPDBvsDB(): Not enough memory allocated!\n");
2575 exit(1);
2577 traceback = (Traceback*) MMPoolDispatch(mmPool, tracebackAllocated * sizeof(Traceback));
2579 HSPFillScoringMatrix(scoringMatrix, matchScore, mismatchScore, NUM_SOURCE_BIT);
2581 gapOpenScoreShifted = gapOpenScore * (1 << NUM_SOURCE_BIT);
2582 gapExtendScoreShifted = gapExtendScore * (1 << NUM_SOURCE_BIT);
2583 cutoffScoreShifted = cutoffScore * (1 << NUM_SOURCE_BIT);
2584 minPositiveScore = 1 << NUM_SOURCE_BIT;
2585 if (cutoffScore <= EARLY_DROPOFF_MIN) {
2586 dropoffScoreShifted = cutoffScoreShifted;
2587 } else {
2588 dropoffScoreShifted = (cutoffScore - (cutoffScore - EARLY_DROPOFF_MIN + EARLY_DROPOFF_FACTOR - 1) / EARLY_DROPOFF_FACTOR) * (1 << NUM_SOURCE_BIT) ;
2591 minAlignmentLength = (cutoffScore + matchScore - 1) / matchScore;
2592 minNegAlignmentLength = ((gapOpenScore + gapExtendScore + matchScore + 1 - minAlignmentLength * matchScore + 1) / (gapOpenScore + gapExtendScore + matchScore)) * 2 - 1;
2593 maxDPGroupingDist = minAlignmentLength + minNegAlignmentLength; // minimum length for an alignment to reached cutoff and then drop back to zero
2595 // maxSourceBit are initially assigned to cells
2596 maxSourceBit = (1 << NUM_SOURCE_BIT) - 1;
2597 // source bits are assigned when score >= sourceBitMinScore
2598 sourceBitMinScore = (1 - gapOpenScore - gapExtendScore) * (1 << NUM_SOURCE_BIT) + maxSourceBit;
2600 numOfGappedHit = 0; // Number of alignments
2602 firstTextHitBeingProcessed = 0;
2604 // Clear last dropoff
2605 for (j=0; j<maxSourceBit; j++) {
2606 lastDropoffMaxScoreIndex[j] = 0;
2608 numOfDropoffMaxScore = 0;
2609 usedSourceBit = 0;
2611 textOffset = 0;
2613 while (firstTextHitBeingProcessed < numOfTextHit) {
2615 if (textList[firstTextHitBeingProcessed].posText > textOffset) {
2616 // textOffset will be larger than those processed
2617 textDbSeqIndex = numOfTextSeq - 1;
2618 textSeqStart = textSeqOffset[textDbSeqIndex].startPos;
2619 textSeqEnd = textSeqOffset[textDbSeqIndex].endPos;
2620 textAmbiguityIndex = textSeqOffset[textDbSeqIndex].lastAmbiguityIndex;
2623 textOffset = textList[firstTextHitBeingProcessed].posText; // posText points to the character right after the hit
2624 queryListInfo = textList[firstTextHitBeingProcessed].info;
2626 // Find corresponding DB sequence for the text
2627 while (textSeqOffset[textDbSeqIndex].startPos >= textOffset) {
2628 textDbSeqIndex--;
2629 textSeqStart = textSeqOffset[textDbSeqIndex].startPos;
2630 textSeqEnd = textSeqOffset[textDbSeqIndex].endPos;
2631 textAmbiguityIndex = textSeqOffset[textDbSeqIndex].lastAmbiguityIndex;
2634 if (textOffset % CHAR_PER_WORD > 0) {
2636 // Decode text to the next word boundary
2637 charToProcess = textOffset % CHAR_PER_WORD;
2638 c = packedText[textOffset / CHAR_PER_WORD] >> (BITS_IN_WORD - charToProcess * BIT_PER_CHAR);
2639 for (i=charToProcess; i--;) { // from charToProcess - 1 to 0
2640 textBuffer[i] = (c & 0x3);
2641 c >>= BIT_PER_CHAR;
2644 // Apply ambiguity
2645 while (textAmbiguity[textAmbiguityIndex].startPos >= textOffset) {
2646 textAmbiguityIndex--;
2648 if (textAmbiguity[textAmbiguityIndex].rightOfEndPos > (textOffset - charToProcess)) {
2649 for (i=0; i<charToProcess; i++) {
2650 while (textAmbiguity[textAmbiguityIndex].startPos >= (textOffset - i)) {
2651 textAmbiguityIndex--;
2653 if (textAmbiguity[textAmbiguityIndex].rightOfEndPos > (textOffset - i - 1)) {
2654 textBuffer[(textOffset - i - 1) % CHAR_PER_WORD] = textAmbiguity[textAmbiguityIndex].symbol;
2661 lastDpCellUsed = 0;
2662 lastDpCellIndex = 0;
2663 dpCellIndex = 1;
2664 maxQPos = 0;
2665 tracebackIndex = 0;
2666 tracebackIndexAdjustment = 0;
2668 // Clear last dropoff
2669 for (j=0; j<usedSourceBit; j++) {
2670 lastDropoffMaxScoreIndex[j] = 0;
2672 numOfDropoffMaxScore = 0;
2674 numOfTextHitBeingProcessed = 0;
2675 nextSourceBit = 0;
2677 while (textOffset > textSeqOffset[textDbSeqIndex].startPos &&
2678 (lastDpCellUsed > 0 || textList[firstTextHitBeingProcessed + numOfTextHitBeingProcessed].posText == textOffset)) {
2680 if (textOffset % CHAR_PER_WORD == 0) {
2682 // Decode text to the next word boundary
2683 c = packedText[textOffset / CHAR_PER_WORD - 1];
2684 for (i=CHAR_PER_WORD; i--;) { // from CHAR_PER_WORD - 1 to 0
2685 textBuffer[i] = (c & 0x3);
2686 c >>= BIT_PER_CHAR;
2689 // Apply ambiguity
2690 while (textAmbiguity[textAmbiguityIndex].startPos >= textOffset) {
2691 textAmbiguityIndex--;
2693 if (textAmbiguity[textAmbiguityIndex].rightOfEndPos > (textOffset - CHAR_PER_WORD)) {
2694 for (i=0; i<CHAR_PER_WORD; i++) {
2695 while (textAmbiguity[textAmbiguityIndex].startPos >= (textOffset - i)) {
2696 textAmbiguityIndex--;
2698 if (textAmbiguity[textAmbiguityIndex].rightOfEndPos > (textOffset - i - 1)) {
2699 textBuffer[(textOffset - i - 1) % CHAR_PER_WORD] = textAmbiguity[textAmbiguityIndex].symbol;
2706 // Set text char
2707 textChar = textBuffer[(textOffset - 1) % CHAR_PER_WORD];
2709 // Check if it is a valid starting position of alignment
2710 if (firstTextHitBeingProcessed + numOfTextHitBeingProcessed > numOfTextHit ||
2711 textList[firstTextHitBeingProcessed + numOfTextHitBeingProcessed].posText != textOffset ||
2712 textOffset + maxDPGroupingDist < textList[firstTextHitBeingProcessed].posText) {
2713 // (numOfTextHitBeingProcessed > 0 && textList[firstTextHitBeingProcessed + numOfTextHitBeingProcessed].posText + maxDPSkip <
2714 // textList[firstTextHitBeingProcessed + numOfTextHitBeingProcessed - 1].posText)) {
2715 } else {
2716 // Merge starting positions in query into the dpCell
2717 queryListInfo = textList[firstTextHitBeingProcessed + numOfTextHitBeingProcessed].info; // the group of query hits corresponding to the text hit
2718 q = queryPosListIndex[queryListInfo]; // the startingindex of the group of query hits
2720 dpCellUsed = 0;
2721 i = 0;
2722 while ((q < queryPosListIndex[queryListInfo + 1] ||
2723 i < lastDpCellUsed) && dpCellUsed < MAX_ALIGNMENT_LENGTH) {
2724 while (i < lastDpCellUsed && (q >= queryPosListIndex[queryListInfo + 1] || dpCell[lastDpCellIndex][i].P >= queryPosList[q]
2725 && dpCellUsed < MAX_ALIGNMENT_LENGTH)) {
2726 dpCell[dpCellIndex][dpCellUsed] = dpCell[lastDpCellIndex][i];
2727 dpCellUsed++;
2728 i++;
2730 if (q < queryPosListIndex[queryListInfo + 1] && i > 0 && dpCell[lastDpCellIndex][i-1].P == queryPosList[q]) {
2731 // The cell is positive; ignore starting position in query
2732 q++;
2734 while (q < queryPosListIndex[queryListInfo + 1] && dpCellUsed < MAX_ALIGNMENT_LENGTH &&
2735 (i >= lastDpCellUsed || dpCell[lastDpCellIndex][i].P < queryPosList[q])) {
2736 // insert starting position in query
2737 if (queryPosList[q] > (unsigned int)queryPatternLength) {
2738 fprintf(stderr, "HSPDPDBvsQuery(): Query position is larger than query length!\n");
2739 exit(1);
2741 dpCell[dpCellIndex][dpCellUsed].P = queryPosList[q]; // posText points to the character right after the hit
2742 dpCell[dpCellIndex][dpCellUsed].B = maxSourceBit; // initial score for a blank cell
2743 dpCell[dpCellIndex][dpCellUsed].I = DP_NEG_INFINITY; // insertion not allowed here
2744 dpCellUsed++;
2745 tracebackIndexAdjustment--;
2746 q++;
2749 if (q < queryPosListIndex[queryListInfo + 1] || i < lastDpCellUsed) {
2750 // Not enough memory
2751 } else {
2752 numOfTextHitBeingProcessed++;
2753 dpCellIndex ^= 1; // Swap dpCells
2754 lastDpCellIndex ^= 1; // Swap dpCells
2755 lastDpCellUsed = dpCellUsed;
2759 D = DP_NEG_INFINITY;
2761 // Clear current max score
2762 usedSourceBit = nextSourceBit;
2763 for (j=0; j<usedSourceBit; j++) {
2764 currentMaxScore[j] = 0;
2767 i = 0;
2768 dpCellUsed = 0;
2769 dpCellUsedAdjustedForEdge = FALSE;
2770 lastQPos = ALL_ONE_MASK;
2772 // check traceback buffer
2773 if (tracebackIndex + MAX_ALIGNMENT_LENGTH >= tracebackAllocated) {
2774 fprintf(stderr, "HSPDPDBvsQuery(): Not enough traceback buffer!\n");
2775 exit(1);
2778 while (i < lastDpCellUsed) {
2780 // DP
2782 // check buffer
2783 if (dpCellUsed + 3 >= MAX_ALIGNMENT_LENGTH) {
2784 // 1 cell at most give 3 positive cells
2785 fprintf(stderr, "HSPDPDBvsQuery(): Not enough query buffer(1)!\n");
2786 exit(1);
2789 if (dpCell[lastDpCellIndex][i].B == maxSourceBit) {
2790 // the cell is inserted through input query positions
2791 tracebackIndexAdjustment++;
2794 qPos = dpCell[lastDpCellIndex][i].P - 1;
2796 if (qPos != lastQPos - 1) {
2797 // The cell on the left is non-positive; handle the insertion + deletion only
2798 if (dpCell[lastDpCellIndex][i].I >= minPositiveScore || D >= minPositiveScore) {
2800 traceback[tracebackIndex].indexDiff = dpCellUsed - i + lastDpCellUsed + tracebackIndexAdjustment; // indexDiff always point to the cell directly above logically
2801 traceback[tracebackIndex].textChar = textChar;
2803 if (dpCell[lastDpCellIndex][i].I >= minPositiveScore && dpCell[lastDpCellIndex][i].I == dpCell[lastDpCellIndex][i].B + gapOpenScoreShifted + gapExtendScoreShifted) {
2804 traceback[tracebackIndex].IOpen = 1; // Gap opening
2805 } else {
2806 traceback[tracebackIndex].IOpen = 0; // Not gap opening
2808 if (D >= minPositiveScore && D == dpCell[dpCellIndex][dpCellUsed-1].B + gapOpenScoreShifted + gapExtendScoreShifted) {
2809 traceback[tracebackIndex].DOpen = 1; // Gap opening
2810 } else {
2811 traceback[tracebackIndex].DOpen = 0; // Not gap opening
2814 if (dpCell[lastDpCellIndex][i].I >= D) {
2815 dpCell[dpCellIndex][dpCellUsed].B = dpCell[lastDpCellIndex][i].I;
2816 traceback[tracebackIndex].alignment = DP_INSERT;
2817 } else {
2818 dpCell[dpCellIndex][dpCellUsed].B = D;
2819 traceback[tracebackIndex].alignment = DP_DELETE;
2821 tracebackIndex++;
2823 dpCell[dpCellIndex][dpCellUsed].I = dpCell[lastDpCellIndex][i].I + gapExtendScoreShifted; // insert cannot follow delete
2824 D = D + gapExtendScoreShifted; // delete cannot follow insert
2825 dpCell[dpCellIndex][dpCellUsed].P = qPos + 1;
2826 dpCellUsed++;
2830 qChar = convertedKey[qPos];
2832 // DP - Match/mismatch
2833 M = dpCell[lastDpCellIndex][i].B + scoringMatrix[textChar][qChar];
2835 if (M >= sourceBitMinScore &&
2836 (M & SOURCE_BIT_MASK) == maxSourceBit) {
2837 // Assign source bits
2838 if (nextSourceBit < maxSourceBit) {
2839 M += nextSourceBit - maxSourceBit;
2840 maxScore[nextSourceBit].score = 0;
2841 nextSourceBit++;
2842 } else {
2843 fprintf(stderr, "HSPDPDBvsQuery(): Not enough source bit space!\n");
2847 // Store the insert and delete score before updating them
2848 if (i+1 < lastDpCellUsed && dpCell[lastDpCellIndex][i+1].P == qPos) {
2849 // insert score available
2850 insertScore = dpCell[lastDpCellIndex][i+1].I;
2851 } else {
2852 insertScore = DP_NEG_INFINITY;
2854 deleteScore = D;
2856 // Determine the best score
2857 if (M >= insertScore && M >= D) {
2858 // match score is maximum
2859 dpCell[dpCellIndex][dpCellUsed].B = M;
2860 if (M + gapOpenScoreShifted >= insertScore) {
2861 dpCell[dpCellIndex][dpCellUsed].I = M + gapOpenScoreShifted + gapExtendScoreShifted;
2862 } else {
2863 dpCell[dpCellIndex][dpCellUsed].I = insertScore + gapExtendScoreShifted;
2865 if (M + gapOpenScoreShifted >= D) {
2866 D = M + gapOpenScoreShifted + gapExtendScoreShifted;
2867 } else {
2868 D = D + gapExtendScoreShifted;
2870 traceback[tracebackIndex].alignment = DP_MATCH_MISMATCH;
2871 } else {
2872 if (insertScore >= D) {
2873 dpCell[dpCellIndex][dpCellUsed].B = insertScore;
2874 traceback[tracebackIndex].alignment = DP_INSERT;
2875 } else {
2876 dpCell[dpCellIndex][dpCellUsed].B = D;
2877 traceback[tracebackIndex].alignment = DP_DELETE;
2879 dpCell[dpCellIndex][dpCellUsed].I = insertScore + gapExtendScoreShifted;
2880 D = D + gapExtendScoreShifted;
2883 // check if the best score is positive
2884 if (dpCell[dpCellIndex][dpCellUsed].B >= minPositiveScore) {
2886 traceback[tracebackIndex].indexDiff = dpCellUsed - i -1 + lastDpCellUsed + tracebackIndexAdjustment; // indexDiff always point to the cell directly above logically
2887 traceback[tracebackIndex].textChar = textChar;
2888 traceback[tracebackIndex].queryChar = qChar;
2889 if (insertScore >= minPositiveScore && insertScore == dpCell[lastDpCellIndex][i+1].B + gapOpenScoreShifted + gapExtendScoreShifted) {
2890 traceback[tracebackIndex].IOpen = 1; // Gap opening
2891 } else {
2892 traceback[tracebackIndex].IOpen = 0; // Not gap opening
2894 if (deleteScore >= minPositiveScore && deleteScore == dpCell[dpCellIndex][dpCellUsed-1].B + gapOpenScoreShifted + gapExtendScoreShifted) {
2895 traceback[tracebackIndex].DOpen = 1; // Gap opening
2896 } else {
2897 traceback[tracebackIndex].DOpen = 0; // Not gap opening
2900 dpCell[dpCellIndex][dpCellUsed].P = qPos;
2902 sourceBit = dpCell[dpCellIndex][dpCellUsed].B & SOURCE_BIT_MASK;
2903 if (dpCell[dpCellIndex][dpCellUsed].B > maxScore[sourceBit].score) {
2904 maxScore[sourceBit].score = dpCell[dpCellIndex][dpCellUsed].B;
2905 maxScore[sourceBit].tracebackIndex = tracebackIndex;
2906 maxScore[sourceBit].posText = textOffset - 1;
2907 maxScore[sourceBit].posQuery = qPos;
2909 if (sourceBit < usedSourceBit && dpCell[dpCellIndex][dpCellUsed].B > currentMaxScore[sourceBit]) {
2910 currentMaxScore[sourceBit] = dpCell[dpCellIndex][dpCellUsed].B;
2913 dpCellUsed++;
2914 tracebackIndex++;
2918 lastQPos = qPos;
2920 if ((i+1 >= lastDpCellUsed || dpCell[lastDpCellIndex][i+1].P < (qPos-1)) && qPos > 0 && D >= minPositiveScore) {
2922 // delete score is still positive;
2923 qPos--;
2924 dpCell[dpCellIndex][dpCellUsed].B = D;
2925 dpCell[dpCellIndex][dpCellUsed].I = DP_NEG_INFINITY;
2926 dpCell[dpCellIndex][dpCellUsed].P = qPos;
2927 D = D + gapExtendScoreShifted;
2929 traceback[tracebackIndex].indexDiff = traceback[tracebackIndex-1].indexDiff;
2930 traceback[tracebackIndex].textChar = textChar;
2931 traceback[tracebackIndex].IOpen = 0; // Not gap opening
2932 if (dpCell[dpCellIndex][dpCellUsed].B == dpCell[dpCellIndex][dpCellUsed-1].B + gapOpenScoreShifted + gapExtendScoreShifted) {
2933 traceback[tracebackIndex].DOpen = 1; // Gap opening
2934 } else {
2935 traceback[tracebackIndex].DOpen = 0; // Not gap opening
2937 traceback[tracebackIndex].alignment = DP_DELETE;
2939 dpCellUsed++;
2940 tracebackIndex++;
2942 if (qPos == 0) {
2943 // Reached the start of query
2944 D = DP_NEG_INFINITY;
2945 if (dpCellUsed > 0 && dpCell[dpCellIndex][dpCellUsed - 1].P == 0) {
2946 // No need to process the cell on the start of query
2947 dpCellUsed--;
2948 dpCellUsedAdjustedForEdge = TRUE;
2950 break;
2953 i++;
2957 if (dpCellUsedAdjustedForEdge) {
2958 tracebackIndexAdjustment = 1;
2959 } else {
2960 tracebackIndexAdjustment = 0;
2963 // Add to dropoff list if maxScore > currentMaxScore + cutoff
2964 for (j=0; j<usedSourceBit; j++) {
2965 if (maxScore[j].score >= currentMaxScore[j] + dropoffScoreShifted && lastDropoffMaxScoreIndex[j] != maxScore[j].tracebackIndex) {
2966 if (numOfDropoffMaxScore < DROPOFF_MAX_ENTRY) {
2967 dropoffMaxScore[numOfDropoffMaxScore] = maxScore[j];
2968 lastDropoffMaxScoreIndex[j] = maxScore[j].tracebackIndex;
2969 numOfDropoffMaxScore++;
2970 } else {
2971 fprintf(stderr, "HSPDPDBvsQuery(): Not enough dropoff space!\n");
2976 dpCellIndex ^= 1; // Swap dpCells
2977 lastDpCellIndex ^= 1; // Swap dpCells
2978 lastDpCellUsed = dpCellUsed;
2980 textOffset--;
2984 // Move scores reached cutoff to front
2985 j = 0;
2986 for (i=0; i<nextSourceBit; i++) {
2987 if (maxScore[i].score >= cutoffScoreShifted) {
2988 maxScore[j] = maxScore[i];
2989 j++;
2992 nextSourceBit = j;
2994 // Append dropoff max score
2995 for (i=0; i<numOfDropoffMaxScore; i++) {
2996 if (nextSourceBit < (1<<NUM_SOURCE_BIT)) {
2997 for (j=0; j<nextSourceBit; j++) {
2998 if (maxScore[j].tracebackIndex == dropoffMaxScore[i].tracebackIndex) {
2999 break;
3002 if (j >= nextSourceBit) {
3003 maxScore[nextSourceBit] = dropoffMaxScore[i];
3004 nextSourceBit++;
3006 } else {
3007 fprintf(stderr, "HSPDPDBvsQuery(): Not enough source bit for dropoff!\n");
3011 // Traceback alignments
3012 for (sourceBit=0; sourceBit<nextSourceBit; sourceBit++) {
3014 bestScore = maxScore[sourceBit].score;
3015 evalue = stat_gapCalcEvalue(stat_gapNominal2normalized(bestScore >> NUM_SOURCE_BIT));
3016 if (evalue <= maxEvalue) {
3018 // Traceback
3019 tempAlignmentIndex = 0;
3020 tempAuxiliaryTextIndex = 0;
3022 textOffset = maxScore[sourceBit].posText;
3023 qPos = maxScore[sourceBit].posQuery;
3025 tracebackIndex = maxScore[sourceBit].tracebackIndex;
3027 while (bestScore >= minPositiveScore) {
3029 if (traceback[tracebackIndex].alignment == DP_MATCH_MISMATCH) {
3030 if (tempAlignmentIndex >= MAX_ALIGNMENT_LENGTH) {
3031 fprintf(stderr, "HSPDPDBvsQuery(): Not enough temp alignment buffer!\n");
3032 exit(1);
3034 textChar = traceback[tracebackIndex].textChar;
3035 qChar = traceback[tracebackIndex].queryChar;
3036 bestScore -= scoringMatrix[textChar][qChar];
3037 if (textChar == qChar && textChar < 4) { // match and not ambiguity
3038 tempAlignment[tempAlignmentIndex] = ALIGN_MATCH;
3039 } else {
3040 tempAlignment[tempAlignmentIndex] = ALIGN_MISMATCH_AMBIGUITY;
3041 tempAuxiliaryText[tempAuxiliaryTextIndex] = (char)traceback[tracebackIndex].textChar; // store text character if mismatch or ambiguity
3042 tempAuxiliaryTextIndex++;
3044 textOffset++;
3045 qPos++;
3046 tracebackIndex -= traceback[tracebackIndex].indexDiff + 1; // indexDiff always refer to the cell directly above logically
3047 } else if (traceback[tracebackIndex].alignment == DP_INSERT) {
3048 while (!traceback[tracebackIndex].IOpen) {
3049 if (tempAlignmentIndex >= MAX_ALIGNMENT_LENGTH) {
3050 fprintf(stderr, "HSPDPDBvsQuery(): Not enough temp alignment buffer!\n");
3051 exit(1);
3053 bestScore -= gapExtendScoreShifted;
3054 tempAlignment[tempAlignmentIndex] = ALIGN_INSERT;
3055 tempAlignmentIndex++;
3056 tempAuxiliaryText[tempAuxiliaryTextIndex] = (char)traceback[tracebackIndex].textChar;
3057 tempAuxiliaryTextIndex++;
3058 tracebackIndex -= traceback[tracebackIndex].indexDiff; // indexDiff always refer to the cell directly above logically
3059 textOffset++;
3061 if (tempAlignmentIndex >= MAX_ALIGNMENT_LENGTH) {
3062 fprintf(stderr, "HSPDPDBvsQuery(): Not enough temp alignment buffer!\n");
3063 exit(1);
3065 bestScore -= gapOpenScoreShifted + gapExtendScoreShifted;
3066 tempAlignment[tempAlignmentIndex] = ALIGN_INSERT;
3067 tempAuxiliaryText[tempAuxiliaryTextIndex] = (char)traceback[tracebackIndex].textChar;
3068 tempAuxiliaryTextIndex++;
3069 tracebackIndex -= traceback[tracebackIndex].indexDiff; // indexDiff always refer to the cell directly above logically
3070 textOffset++;
3071 } else {
3072 while (!traceback[tracebackIndex].DOpen) {
3073 if (tempAlignmentIndex >= MAX_ALIGNMENT_LENGTH) {
3074 fprintf(stderr, "HSPDPDBvsQuery(): Not enough temp alignment buffer!\n");
3075 exit(1);
3077 bestScore -= gapExtendScoreShifted;
3078 tempAlignment[tempAlignmentIndex] = ALIGN_DELETE;
3079 tempAlignmentIndex++;
3080 tracebackIndex--;
3081 qPos++;
3083 if (tempAlignmentIndex >= MAX_ALIGNMENT_LENGTH) {
3084 fprintf(stderr, "HSPDPDBvsQuery(): Not enough temp alignment buffer!\n");
3085 exit(1);
3087 bestScore -= gapOpenScoreShifted + gapExtendScoreShifted;
3088 tempAlignment[tempAlignmentIndex] = ALIGN_DELETE;
3089 tracebackIndex--;
3090 qPos++;
3093 tempAlignmentIndex++;
3097 if (numOfGappedHit >= maxNumOfGappedHit) {
3098 fprintf(stderr, "HSPDPDBvsQuery(): Not enough gapped hit buffer!\n");
3099 exit(1);
3101 gappedHitList[numOfGappedHit].posText = maxScore[sourceBit].posText;
3102 gappedHitList[numOfGappedHit].posQuery = maxScore[sourceBit].posQuery;
3103 gappedHitList[numOfGappedHit].lengthText = textOffset - maxScore[sourceBit].posText;
3104 gappedHitList[numOfGappedHit].lengthQuery = qPos - maxScore[sourceBit].posQuery;
3105 gappedHitList[numOfGappedHit].score = maxScore[sourceBit].score >> NUM_SOURCE_BIT;
3106 gappedHitList[numOfGappedHit].dbSeqIndex = textDbSeqIndex;
3108 numOfAlignmentWord = (tempAlignmentIndex + ALIGN_PER_WORD - 1) / ALIGN_PER_WORD;
3109 gappedHitList[numOfGappedHit].alignmentOffset = MMPoolDispatchOffset(alignmentPool, numOfAlignmentWord * sizeof(unsigned int));
3110 alignment = (unsigned int*)((char*)alignmentPool + gappedHitList[numOfGappedHit].alignmentOffset);
3111 for (i=0; i<numOfAlignmentWord; i++) {
3112 alignment[i] = 0;
3115 for (i=0; i<tempAlignmentIndex; i++) {
3116 alignment[i/ALIGN_PER_WORD] |= (tempAlignment[i] << (BITS_IN_WORD - (i % ALIGN_PER_WORD + 1) * ALIGN_BIT));
3119 if (tempAuxiliaryTextIndex > 0) {
3120 numOfAuxiliaryTextWord = (tempAuxiliaryTextIndex + AUX_TEXT_PER_WORD - 1) / AUX_TEXT_PER_WORD;
3121 gappedHitList[numOfGappedHit].auxiliaryTextOffset = MMPoolDispatchOffset(alignmentPool, numOfAuxiliaryTextWord * sizeof(unsigned int));
3122 auxiliaryText = (unsigned int*)((char*)alignmentPool + gappedHitList[numOfGappedHit].auxiliaryTextOffset);
3123 for (i=0; i<numOfAuxiliaryTextWord; i++) {
3124 auxiliaryText[i] = 0;
3126 for (i=0; i<tempAuxiliaryTextIndex; i++) {
3127 auxiliaryText[i/AUX_TEXT_PER_WORD] |= (tempAuxiliaryText[i] << (BITS_IN_WORD - (i % AUX_TEXT_PER_WORD + 1) * AUX_TEXT_BIT));
3129 } else {
3130 gappedHitList[numOfGappedHit].auxiliaryTextOffset = 0;
3133 numOfGappedHit++;
3138 firstTextHitBeingProcessed += numOfTextHitBeingProcessed;
3142 // free working memory
3143 MMPoolReturn(mmPool, traceback, tracebackAllocated * sizeof(int));
3145 MMPoolReturn(mmPool, tempAuxiliaryText, (MAX_ALIGNMENT_LENGTH * 2) * sizeof(char));
3146 MMPoolReturn(mmPool, tempAlignment, (MAX_ALIGNMENT_LENGTH * 2) * sizeof(char));
3148 MMPoolReturn(mmPool, lastDropoffMaxScoreIndex, (1 << NUM_SOURCE_BIT) * sizeof(int));
3149 MMPoolReturn(mmPool, currentMaxScore, (1 << NUM_SOURCE_BIT) * sizeof(int));
3151 MMPoolReturn(mmPool, dropoffMaxScore, DROPOFF_MAX_ENTRY * sizeof(DPMaxScore));
3152 MMPoolReturn(mmPool, maxScore, (1 << NUM_SOURCE_BIT) * sizeof(DPMaxScore));
3154 MMPoolReturn(mmPool, dpCell[0], MAX_ALIGNMENT_LENGTH * sizeof(DPCell));
3155 MMPoolReturn(mmPool, dpCell[1], MAX_ALIGNMENT_LENGTH * sizeof(DPCell));
3156 MMPoolReturn(mmPool, dpCell, 2 * sizeof(DPCell*));
3158 return numOfGappedHit;
3164 unsigned int HSPRemoveDuplicateUngappedHit(HitListWithPosQuery* __restrict ungappedHitList, const unsigned long long numberOfHit) {
3166 unsigned long long i, j;
3168 QSort(ungappedHitList, numberOfHit, sizeof(HitListWithPosQuery), HitListPosTextQueryOrder);
3170 i = j = 0;
3171 while (i<numberOfHit) {
3172 ungappedHitList[j].posText = ungappedHitList[i].posText;
3173 ungappedHitList[j].posQuery = ungappedHitList[i].posQuery;
3174 i++;
3175 while (i<numberOfHit && ungappedHitList[i].posText == ungappedHitList[j].posText
3176 && ungappedHitList[i].posQuery == ungappedHitList[j].posQuery) {
3177 i++;
3179 j++;
3182 return j;
3186 unsigned int HSPSplitCrossBoundaryGappedHit(GappedHitList* __restrict gappedHitList, const unsigned long long numberOfHit, const unsigned int queryPatternLength,
3187 const SeqOffset *seqOffset,
3188 const int matchScore, const int cutoffScore) {
3190 unsigned long long i;
3191 unsigned int numberOfSplitHit;
3192 unsigned long long dbSeqIndex;
3193 unsigned long long splitDbSeqIndex;
3194 unsigned long long minHitLength;
3195 unsigned long long oldHitLength;
3196 unsigned long long newHitLength;
3197 unsigned long long hitEnd;
3198 LONG diagonalPos;
3201 minHitLength = (cutoffScore + matchScore - 1) / matchScore;
3203 QSort(gappedHitList, numberOfHit, sizeof(GappedHitList), GappedHitListPosTextQueryLengthScoreOrder);
3205 i = 0;
3206 numberOfSplitHit = 0;
3207 dbSeqIndex = 0;
3209 while (i<numberOfHit) {
3211 // Find corresponding DB sequence
3212 while (seqOffset[dbSeqIndex].endPos < gappedHitList[i].posText) {
3213 dbSeqIndex++;
3215 gappedHitList[i].dbSeqIndex = dbSeqIndex;
3216 diagonalPos = (LONG)gappedHitList[i].ungappedPosText - (LONG)gappedHitList[i].ungappedPosQuery;
3217 oldHitLength = gappedHitList[i].lengthText;
3219 splitDbSeqIndex = dbSeqIndex + 1;
3220 while (seqOffset[splitDbSeqIndex].startPos <= gappedHitList[i].posText + gappedHitList[i].lengthText - 1 - minHitLength) {
3222 // the hit span across 2 db sequences; and the portion on the right > minimum hit length
3224 gappedHitList[numberOfHit + numberOfSplitHit].posText = seqOffset[splitDbSeqIndex].startPos;
3225 if (gappedHitList[numberOfHit + numberOfSplitHit].posText - diagonalPos < queryPatternLength) {
3226 gappedHitList[numberOfHit + numberOfSplitHit].posQuery = (unsigned int)((LONG)gappedHitList[numberOfHit + numberOfSplitHit].posText - diagonalPos);
3227 } else {
3228 gappedHitList[numberOfHit + numberOfSplitHit].posQuery = queryPatternLength - 1;
3230 hitEnd = gappedHitList[i].posText + gappedHitList[i].lengthText - 1;
3231 if (hitEnd > seqOffset[splitDbSeqIndex].endPos) {
3232 hitEnd = seqOffset[splitDbSeqIndex].endPos;
3234 newHitLength = hitEnd - gappedHitList[numberOfHit + numberOfSplitHit].posText + 1;
3235 gappedHitList[numberOfHit + numberOfSplitHit].lengthText = newHitLength;
3236 if (gappedHitList[numberOfHit + numberOfSplitHit].lengthQuery >= oldHitLength - newHitLength) {
3237 gappedHitList[numberOfHit + numberOfSplitHit].lengthQuery += newHitLength - oldHitLength;
3238 } else {
3239 gappedHitList[numberOfHit + numberOfSplitHit].lengthQuery = 0;
3242 if (gappedHitList[i].ungappedPosText < seqOffset[splitDbSeqIndex].startPos) {
3243 gappedHitList[numberOfHit + numberOfSplitHit].ungappedPosText = seqOffset[splitDbSeqIndex].startPos;
3244 } else if (gappedHitList[i].ungappedPosText > hitEnd) {
3245 gappedHitList[numberOfHit + numberOfSplitHit].ungappedPosText = hitEnd;
3246 } else {
3247 gappedHitList[numberOfHit + numberOfSplitHit].ungappedPosText = gappedHitList[i].ungappedPosText;
3249 if (gappedHitList[numberOfHit + numberOfSplitHit].ungappedPosText < diagonalPos) {
3250 gappedHitList[numberOfHit + numberOfSplitHit].ungappedPosQuery = 0;
3251 } else if (gappedHitList[numberOfHit + numberOfSplitHit].ungappedPosText - diagonalPos >= queryPatternLength) {
3252 gappedHitList[numberOfHit + numberOfSplitHit].ungappedPosQuery = queryPatternLength - 1;
3253 } else {
3254 gappedHitList[numberOfHit + numberOfSplitHit].ungappedPosQuery = (unsigned int)((LONG)gappedHitList[numberOfHit + numberOfSplitHit].ungappedPosText - diagonalPos);
3257 if (gappedHitList[i].ungappedPosText >= seqOffset[splitDbSeqIndex].startPos &&
3258 gappedHitList[i].ungappedPosText <= hitEnd) {
3259 gappedHitList[numberOfHit + numberOfSplitHit].score = gappedHitList[i].score;
3260 } else {
3261 gappedHitList[numberOfHit + numberOfSplitHit].score = 0;
3263 gappedHitList[numberOfHit + numberOfSplitHit].dbSeqIndex = splitDbSeqIndex;
3265 numberOfSplitHit++;
3266 splitDbSeqIndex++;
3270 hitEnd = gappedHitList[i].posText + gappedHitList[i].lengthText - 1;
3271 if (hitEnd > seqOffset[splitDbSeqIndex].endPos) {
3272 hitEnd = seqOffset[splitDbSeqIndex].endPos;
3274 if (gappedHitList[i].posText < seqOffset[dbSeqIndex].startPos) {
3275 gappedHitList[i].posText = seqOffset[dbSeqIndex].startPos;
3276 if (gappedHitList[i].posText - diagonalPos < queryPatternLength) {
3277 gappedHitList[i].posQuery = (int)((LONG)gappedHitList[i].posText - diagonalPos);
3278 } else {
3279 gappedHitList[i].posQuery = queryPatternLength - 1;
3282 newHitLength = hitEnd - gappedHitList[i].posText + 1;
3283 gappedHitList[i].lengthText = newHitLength;
3284 if (gappedHitList[i].lengthQuery >= oldHitLength - newHitLength) {
3285 gappedHitList[i].lengthQuery += newHitLength - oldHitLength;
3286 } else {
3287 gappedHitList[i].lengthQuery = 0;
3290 if (gappedHitList[i].ungappedPosText < seqOffset[dbSeqIndex].startPos) {
3291 gappedHitList[i].ungappedPosText = seqOffset[dbSeqIndex].startPos;
3292 } else if (gappedHitList[i].ungappedPosText > hitEnd) {
3293 gappedHitList[i].ungappedPosText = hitEnd;
3294 } else {
3295 gappedHitList[i].ungappedPosText = gappedHitList[i].ungappedPosText;
3297 if (gappedHitList[i].ungappedPosText < diagonalPos) {
3298 gappedHitList[i].ungappedPosQuery = 0;
3299 } else if (gappedHitList[i].ungappedPosText - diagonalPos >= queryPatternLength) {
3300 gappedHitList[i].ungappedPosQuery = queryPatternLength - 1;
3301 } else {
3302 gappedHitList[i].ungappedPosQuery = (int)((LONG)gappedHitList[i].ungappedPosText - diagonalPos);
3305 if (gappedHitList[i].ungappedPosText >= seqOffset[dbSeqIndex].startPos &&
3306 gappedHitList[i].ungappedPosText <= hitEnd) {
3307 gappedHitList[i].score = gappedHitList[i].score;
3308 } else {
3309 gappedHitList[i].score = 0;
3311 gappedHitList[i].dbSeqIndex = dbSeqIndex;
3313 if (newHitLength >= minHitLength) {
3314 i++;
3319 return numberOfHit + numberOfSplitHit;
3323 unsigned int HSPRemoveDuplicateGappedHit(GappedHitList* __restrict gappedHitList, const unsigned long long numberOfHit) {
3325 unsigned long long i, j;
3327 QSort(gappedHitList, numberOfHit, sizeof(GappedHitList), GappedHitListPosTextQueryLengthScoreOrder);
3329 i = j = 0;
3330 while (i<numberOfHit) {
3331 gappedHitList[j].posText = gappedHitList[i].posText;
3332 gappedHitList[j].posQuery = gappedHitList[i].posQuery;
3333 gappedHitList[j].lengthText = gappedHitList[i].lengthText;
3334 gappedHitList[j].lengthQuery = gappedHitList[i].lengthQuery;
3335 gappedHitList[j].score = gappedHitList[i].score;
3336 gappedHitList[j].dbSeqIndex = gappedHitList[i].dbSeqIndex;
3337 gappedHitList[j].ungappedPosText = gappedHitList[i].ungappedPosText;
3338 gappedHitList[j].ungappedPosQuery = gappedHitList[i].ungappedPosQuery;
3339 i++;
3340 while (i<numberOfHit && gappedHitList[i].posText == gappedHitList[j].posText
3341 && gappedHitList[i].posQuery == gappedHitList[j].posQuery
3342 && gappedHitList[i].lengthText == gappedHitList[j].lengthText
3343 && gappedHitList[i].lengthQuery == gappedHitList[j].lengthQuery) {
3344 i++;
3346 j++;
3349 return j;
3353 // alignment and auxiliary text must be allocated through MMPool
3354 // otherwise the pointer will be lost without freeing the memory
3355 unsigned int HSPFinalFilter(GappedHitListWithAlignment* __restrict gappedHitList, const unsigned int numberOfHit) {
3357 unsigned long long i, j;
3358 unsigned int hitRemaining;
3360 hitRemaining = numberOfHit;
3362 // First filter among hits with common starting points the longer and lower scoring hits
3364 QSort(gappedHitList, hitRemaining, sizeof(GappedHitList), GappedHitListPosTextQueryScoreLengthOrder);
3366 j = 0;
3367 while (j<hitRemaining) {
3368 if (gappedHitList[j].alignmentOffset != 0) { // not marked as discarded
3369 i = j + 1;
3370 while (i<hitRemaining && gappedHitList[i].posText == gappedHitList[j].posText
3371 && gappedHitList[i].posQuery == gappedHitList[j].posQuery) {
3372 if (gappedHitList[i].alignmentOffset != 0) {
3373 if (gappedHitList[i].lengthText >= gappedHitList[j].lengthText &&
3374 gappedHitList[i].lengthQuery >= gappedHitList[j].lengthQuery &&
3375 gappedHitList[i].score <= gappedHitList[j].score) {
3376 // discard hit
3377 gappedHitList[i].alignmentOffset = 0;
3380 i++;
3383 j++;
3386 // Pack retained hit to the beginning
3387 i = j = 0;
3388 while (i<hitRemaining) {
3389 if (gappedHitList[i].alignmentOffset != 0) { // not marked as discarded
3390 gappedHitList[j].posText = gappedHitList[i].posText;
3391 gappedHitList[j].posQuery = gappedHitList[i].posQuery;
3392 gappedHitList[j].lengthText = gappedHitList[i].lengthText;
3393 gappedHitList[j].lengthQuery = gappedHitList[i].lengthQuery;
3394 gappedHitList[j].score = gappedHitList[i].score;
3395 gappedHitList[j].dbSeqIndex = gappedHitList[i].dbSeqIndex;
3396 gappedHitList[j].alignmentOffset = gappedHitList[i].alignmentOffset;
3397 gappedHitList[j].auxiliaryTextOffset = gappedHitList[i].auxiliaryTextOffset;
3398 j++;
3400 i++;
3403 hitRemaining = j;
3405 // Second filter among hits with common ending points the longer and lower scoring hits
3407 QSort(gappedHitList, hitRemaining, sizeof(GappedHitList), GappedHitListEndTextQueryScoreLengthOrder);
3409 j = 0;
3410 while (j<hitRemaining) {
3411 if (gappedHitList[j].alignmentOffset != 0) { // not marked as discarded
3412 i = j + 1;
3413 while (i<hitRemaining && gappedHitList[i].posText + gappedHitList[i].lengthText == gappedHitList[j].posText + gappedHitList[j].lengthText
3414 && gappedHitList[i].posQuery + gappedHitList[i].lengthQuery == gappedHitList[j].posQuery + gappedHitList[j].lengthQuery) {
3415 if (gappedHitList[i].alignmentOffset != 0) {
3416 if (gappedHitList[i].lengthText >= gappedHitList[j].lengthText &&
3417 gappedHitList[i].lengthQuery >= gappedHitList[j].lengthQuery &&
3418 gappedHitList[i].score <= gappedHitList[j].score) {
3419 // discard hit
3420 gappedHitList[i].alignmentOffset = 0;
3423 i++;
3426 j++;
3429 // Pack retained hit to the beginning
3430 i = j = 0;
3431 while (i<hitRemaining) {
3432 if (gappedHitList[i].alignmentOffset != 0) { // not marked as discarded
3433 gappedHitList[j].posText = gappedHitList[i].posText;
3434 gappedHitList[j].posQuery = gappedHitList[i].posQuery;
3435 gappedHitList[j].lengthText = gappedHitList[i].lengthText;
3436 gappedHitList[j].lengthQuery = gappedHitList[i].lengthQuery;
3437 gappedHitList[j].score = gappedHitList[i].score;
3438 gappedHitList[j].dbSeqIndex = gappedHitList[i].dbSeqIndex;
3439 gappedHitList[j].alignmentOffset = gappedHitList[i].alignmentOffset;
3440 gappedHitList[j].auxiliaryTextOffset = gappedHitList[i].auxiliaryTextOffset;
3441 j++;
3443 i++;
3446 hitRemaining = j;
3448 // Third filter hits with common start points
3450 QSort(gappedHitList, hitRemaining, sizeof(GappedHitList), GappedHitListPosTextQueryScoreLengthOrder);
3452 i = j = 0;
3453 while (i<hitRemaining) {
3454 gappedHitList[j].posText = gappedHitList[i].posText;
3455 gappedHitList[j].posQuery = gappedHitList[i].posQuery;
3456 gappedHitList[j].lengthText = gappedHitList[i].lengthText;
3457 gappedHitList[j].lengthQuery = gappedHitList[i].lengthQuery;
3458 gappedHitList[j].score = gappedHitList[i].score;
3459 gappedHitList[j].dbSeqIndex = gappedHitList[i].dbSeqIndex;
3460 gappedHitList[j].alignmentOffset = gappedHitList[i].alignmentOffset;
3461 gappedHitList[j].auxiliaryTextOffset = gappedHitList[i].auxiliaryTextOffset;
3462 i++;
3463 while (i<hitRemaining && gappedHitList[i].posText == gappedHitList[j].posText
3464 && gappedHitList[i].posQuery == gappedHitList[j].posQuery) {
3465 // discard hit
3466 gappedHitList[i].alignmentOffset = 0;
3467 i++;
3469 j++;
3472 hitRemaining = j;
3474 // Fourth filter hits with common end points
3476 QSort(gappedHitList, hitRemaining, sizeof(GappedHitListWithAlignment), GappedHitListEndTextQueryScoreLengthOrder);
3478 i = j = 0;
3479 while (i<hitRemaining) {
3480 gappedHitList[j].posText = gappedHitList[i].posText;
3481 gappedHitList[j].posQuery = gappedHitList[i].posQuery;
3482 gappedHitList[j].lengthText = gappedHitList[i].lengthText;
3483 gappedHitList[j].lengthQuery = gappedHitList[i].lengthQuery;
3484 gappedHitList[j].score = gappedHitList[i].score;
3485 gappedHitList[j].dbSeqIndex = gappedHitList[i].dbSeqIndex;
3486 gappedHitList[j].alignmentOffset = gappedHitList[i].alignmentOffset;
3487 gappedHitList[j].auxiliaryTextOffset = gappedHitList[i].auxiliaryTextOffset;
3488 i++;
3489 while (i<hitRemaining && gappedHitList[i].posText + gappedHitList[i].lengthText == gappedHitList[j].posText + gappedHitList[j].lengthText
3490 && gappedHitList[i].posQuery + gappedHitList[i].lengthQuery == gappedHitList[j].posQuery + gappedHitList[j].lengthQuery) {
3491 // discard hit
3492 gappedHitList[i].alignmentOffset = 0;
3493 i++;
3495 j++;
3498 hitRemaining = j;
3500 // Fifth filter enclosed hits with lower scores
3502 QSort(gappedHitList, hitRemaining, sizeof(GappedHitList), GappedHitListEnclosedScoreOrder);
3504 j = 0;
3505 while (j<hitRemaining) {
3506 if (gappedHitList[j].alignmentOffset != 0) { // not marked as discarded
3507 i = j + 1;
3508 while (i<hitRemaining && gappedHitList[i].posText < gappedHitList[j].posText + gappedHitList[j].lengthText) {
3509 if (gappedHitList[i].alignmentOffset != 0) {
3510 if (gappedHitList[i].posText + gappedHitList[i].lengthText <= gappedHitList[j].posText + gappedHitList[j].lengthText &&
3511 gappedHitList[i].posQuery >= gappedHitList[j].posQuery &&
3512 gappedHitList[i].posQuery + gappedHitList[i].lengthQuery <= gappedHitList[j].posQuery + gappedHitList[j].lengthQuery &&
3513 gappedHitList[i].score <= gappedHitList[j].score) {
3514 // discard hit
3515 gappedHitList[i].alignmentOffset = 0;
3518 i++;
3521 j++;
3524 // Pack retained hit to the beginning
3525 i = j = 0;
3526 while (i<hitRemaining) {
3527 if (gappedHitList[i].alignmentOffset != 0) { // not marked as discarded
3528 gappedHitList[j].posText = gappedHitList[i].posText;
3529 gappedHitList[j].posQuery = gappedHitList[i].posQuery;
3530 gappedHitList[j].lengthText = gappedHitList[i].lengthText;
3531 gappedHitList[j].lengthQuery = gappedHitList[i].lengthQuery;
3532 gappedHitList[j].score = gappedHitList[i].score;
3533 gappedHitList[j].dbSeqIndex = gappedHitList[i].dbSeqIndex;
3534 gappedHitList[j].alignmentOffset = gappedHitList[i].alignmentOffset;
3535 gappedHitList[j].auxiliaryTextOffset = gappedHitList[i].auxiliaryTextOffset;
3536 j++;
3538 i++;
3541 hitRemaining = j;
3543 return hitRemaining;
3547 HSPUngappedExtLookupTable *HSPGenUngappedExtLookupTable(MMPool *mmPool, const int matchScore, const int mismatchScore) {
3549 HSPUngappedExtLookupTable *hspUngappedExtLookupTable;
3551 hspUngappedExtLookupTable = (HSPUngappedExtLookupTable*) MMPoolDispatch(mmPool, (1 << 16) * sizeof(HSPUngappedExtLookupTable));
3553 HSPCalUngappedExtLookupTable(hspUngappedExtLookupTable, matchScore, mismatchScore);
3555 return hspUngappedExtLookupTable;
3558 void HSPCalUngappedExtLookupTable(HSPUngappedExtLookupTable *hspUngappedExtLookupTable, const int matchScore, const int mismatchScore) {
3560 static const unsigned int oddBitMask = 0x55555555;
3561 static const unsigned int evenBitMask = 0xAAAAAAAA;
3563 unsigned int i, j;
3564 char maxScore, maxScorePos, finalScore;
3566 // In entries where even bit = match/mismatch, odd bit = 0 -> matchMismatchBitVector is even bit compacted (used for forward extension)
3567 // In entries where odd bit = match/mismatch, even bit = 0 -> matchMismatchBitVector is odd bit compacted and then reversed (used for backward extension)
3568 // When all bits are 0, both above case can be handled without problem
3569 hspUngappedExtLookupTable[0].matchMismatchBitVector = 0;
3570 for (i=1; i<(1<<16); i++) {
3571 hspUngappedExtLookupTable[i].matchMismatchBitVector = 0;
3572 if ((i & oddBitMask) == 0) {
3573 for (j=0; j<8; j++) {
3574 hspUngappedExtLookupTable[i].matchMismatchBitVector |= ((i >> (15-j*2)) & 1) << (7-j);
3577 if ((i & evenBitMask) == 0) {
3578 for (j=0; j<8; j++) {
3579 hspUngappedExtLookupTable[i].matchMismatchBitVector |= ((i >> (14-j*2)) & 1) << j;
3582 // entries where both odd and even bits contain 1 are not used
3585 // For each 16 bit match/mismatch vector, calculate max score, position giving max score, and final score
3586 // The first match/mismatch is in the leftmost bit of the 16 bit vector
3587 // 0 is a match and 1 is a mismatch (from result of xor)
3588 for (i=0; i<(1<<16); i++) {
3590 maxScore = 0;
3591 maxScorePos = 0; // 0 -> before the 1st character; 16 -> after the 16th character
3592 finalScore = 0;
3594 for (j=0; j<16; j++) {
3595 if (((i >> (15-j)) & 1) == 0) {
3596 // match
3597 finalScore = finalScore + (char)matchScore;
3598 // favor longer hit if scores are the same
3599 if (finalScore >= maxScore) {
3600 maxScore = finalScore;
3601 maxScorePos = (char)(j+1);
3603 } else {
3604 // mismatch
3605 finalScore = finalScore + (char)mismatchScore;
3609 hspUngappedExtLookupTable[i].maxScore = maxScore;
3610 hspUngappedExtLookupTable[i].maxScorePos = maxScorePos;
3611 hspUngappedExtLookupTable[i].finalScore = finalScore;
3616 void HSPFreeUngappedExtLookupTable(MMPool *mmPool, HSPUngappedExtLookupTable* hspUngappedExtLookupTable) {
3618 MMPoolReturn(mmPool, hspUngappedExtLookupTable, (1 << 16) * sizeof(HSPUngappedExtLookupTable));
3622 Histogram* HSPAllocateHistogram(double minEvalue, double maxEvalue) {
3624 Histogram *histogram;
3626 histogram = (Histogram*) MMUnitAllocate(sizeof(Histogram));
3627 histogram->minEvalue = minEvalue;
3628 histogram->maxEvalue = maxEvalue;
3629 histogram->histogramSize = (int)(ceil(log10(maxEvalue)) - floor(log10(minEvalue))) + 1;
3630 histogram->count = (int*) MMUnitAllocate(histogram->histogramSize * sizeof(int));
3632 return histogram;
3636 void HSPInitializeHistogram(Histogram* histogram) {
3638 int i;
3640 for (i=0; i<histogram->histogramSize; i++) {
3641 histogram->count[i] = 0;
3646 void HSPFreeHistogram(Histogram* histogram) {
3648 MMUnitFree(histogram->count, histogram->histogramSize * sizeof(int));
3649 MMUnitFree(histogram, histogram->histogramSize * sizeof(int));
3653 void HSPCountEvalueToHistogram(Histogram* histogram, const GappedHitListWithAlignment* gappedHitList, const int numberOfHit, const int roundEvalue) {
3655 int i;
3656 int evalueIndex;
3657 double tempEvalue;
3659 for (i=0; i<numberOfHit; i++) {
3661 if (roundEvalue == TRUE) {
3662 tempEvalue = HSPCalculateAndPrintEValue(NULL, gappedHitList[i].score);
3663 } else {
3664 tempEvalue = stat_gapCalcEvalue(stat_gapNominal2normalized(gappedHitList[i].score));
3666 if (tempEvalue == 0.0) {
3667 tempEvalue = histogram->minEvalue;
3669 evalueIndex = (int)ceil(log10(tempEvalue) - floor(log10(histogram->minEvalue)));
3670 if (evalueIndex < 0) {
3671 evalueIndex = 0;
3673 if (evalueIndex < histogram->histogramSize) {
3674 histogram->count[evalueIndex]++;
3681 void HSPPrintHistogram(FILE * outputFile, Histogram* histogram) {
3683 int i;
3684 double tempEvalue;
3686 Socketfprintf(outputFile,"\nCount histogram of gapped extension by expectation value:\n");
3688 tempEvalue = pow(10, (int)floor(log10(histogram->minEvalue)));
3689 Socketfprintf(outputFile,"( 0,%6.2g] : %d\n", tempEvalue, histogram->count[0]);
3691 for (i=1; i<histogram->histogramSize; i++) {
3692 tempEvalue = pow(10, i - 1 + (int)log10(histogram->minEvalue));
3693 Socketfprintf(outputFile,"(%6.2g,%6.2g] : %d\n", tempEvalue, tempEvalue*10, histogram->count[i]);
3698 void HSPPrintHeader(FILE *outputFile, const int outputFormat,
3699 const char* databaseName, const int numOfSeq, const unsigned long long databaseLength) {
3701 switch (outputFormat) {
3703 case OUTPUT_PAIRWISE :
3705 fprintf(outputFile, "Database : %s\n", databaseName);
3706 fprintf(outputFile, "No. of sequences : %d\n", numOfSeq);
3707 fprintf(outputFile, "No. of letters : %llu\n", databaseLength);
3708 fprintf(outputFile, "\n");
3710 break;
3712 case OUTPUT_TABULAR :
3714 break;
3716 case OUTPUT_TABULAR_COMMENT :
3718 fprintf(outputFile, "# Database : %s\n", databaseName);
3719 fprintf(outputFile, "# No. of sequences : %d\n", numOfSeq);
3720 fprintf(outputFile, "# No. of letters : %llu\n", databaseLength);
3721 fprintf(outputFile, "#\n");
3723 break;
3729 void HSPPrintInvalidReadTrailer(FILE *outputFile, const int outputFormat,
3730 const char* databaseName, const int numOfSeq, const unsigned long long databaseLength) {
3732 switch (outputFormat) {
3734 case OUTPUT_PAIRWISE :
3736 fprintf(outputFile, "Database : %s\n", databaseName);
3737 fprintf(outputFile, "No. of sequences : %d\n", numOfSeq);
3738 fprintf(outputFile, "No. of letters : %llu\n", databaseLength);
3739 fprintf(outputFile, "\n\n");
3740 fprintf(outputFile,"Lambda K H\n");
3741 fprintf(outputFile,"%10.2f %10.2f %10.2f\n", (double)-1, (double)-1, (double)-1);
3742 fprintf(outputFile,"Gapped\n");
3743 fprintf(outputFile,"Lambda K H\n");
3744 fprintf(outputFile,"%10.2f %10.2f %10.2f\n\n", (double)-1, (double)-1, (double)-1);
3746 break;
3748 case OUTPUT_TABULAR :
3750 break;
3752 case OUTPUT_TABULAR_COMMENT :
3754 break;
3759 void HSPPrintTrailer(FILE *outputFile, const int outputFormat,
3760 const char* databaseName, const int numOfSeq, const unsigned long long databaseLength) {
3762 switch (outputFormat) {
3764 case OUTPUT_PAIRWISE :
3766 fprintf(outputFile, "Database : %s\n", databaseName);
3767 fprintf(outputFile, "No. of sequences : %d\n", numOfSeq);
3768 fprintf(outputFile, "No. of letters : %llu\n", databaseLength);
3769 fprintf(outputFile, "\n\n");
3771 printHSPstatistic(outputFile);
3772 fprintf(outputFile, "\n");
3774 break;
3776 case OUTPUT_TABULAR :
3778 break;
3780 case OUTPUT_TABULAR_COMMENT :
3782 break;
3788 void HSPPrintQueryHeader(FILE *outputFile, const int outputFormat,
3789 const char* queryPatternName, const int queryPatternLength,
3790 const int *dbOrder, const int *dbScore, const Annotation *annotation, const int numOfSeq) {
3792 int i;
3793 int charPrinted;
3794 char stringFormat[8] = "%00.00s";
3795 int anything=0;
3797 switch (outputFormat) {
3799 case OUTPUT_PAIRWISE :
3801 fprintf(outputFile, "Query : %s\n", queryPatternName);
3802 fprintf(outputFile, "No. of letters : %u\n", queryPatternLength);
3804 fprintf(outputFile, "\n\n");
3805 fprintf(outputFile, " Score E\n");
3806 fprintf(outputFile, "Sequences producing significant alignments: (bits) Value\n");
3807 fprintf(outputFile, "\n");
3809 anything=0;
3810 for (i=0; i<numOfSeq && dbScore[dbOrder[i]] > 0; i++) {
3811 charPrinted = 0; anything = 1;
3812 if (annotation[dbOrder[i]].gi > 0) {
3813 charPrinted += fprintf(outputFile, "gi|%d|", annotation[dbOrder[i]].gi);
3815 stringFormat[1] = stringFormat[4] = (char)('0' + (64 - charPrinted) / 10);
3816 stringFormat[2] = stringFormat[5] = (char)('0' + (64 - charPrinted) % 10);
3817 fprintf(outputFile, stringFormat, annotation[dbOrder[i]].text);
3818 fprintf(outputFile, " ");
3819 HSPCalculateAndPrintBitScore(outputFile, dbScore[dbOrder[i]]);
3820 fprintf(outputFile, " ");
3821 HSPCalculateAndPrintEValue(outputFile, dbScore[dbOrder[i]]);
3822 fprintf(outputFile, "\n");
3824 if (anything) fprintf(outputFile, "\n\n");
3826 break;
3828 case OUTPUT_TABULAR :
3830 break;
3832 case OUTPUT_TABULAR_COMMENT :
3834 fprintf(outputFile, "# Query : %s\n", queryPatternName);
3835 fprintf(outputFile, "# Query id, Subject id, %% identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score\n");
3841 void HSPPrintAlignment(MMPool *mmPool, FILE *outputFile, MMPool *alignmentPool, const GappedHitListWithAlignment* gappedHitList, const int numOfHit,
3842 int outputFormat, const ContextInfo* contextInfo,
3843 const unsigned char *charMap, const unsigned char *complementMap,
3844 const char* queryPatternName, const unsigned char* convertedQueryPattern, const int queryPatternLength,
3845 const int * dbOrder, const HSP *hsp) {
3847 int i, j;
3848 int lastDbSeqIndex, dbSeqIndex, contextNum;
3849 char* __restrict tempDbChar;
3850 char* __restrict tempQueryChar;
3852 char queryName[MAX_SEQ_NAME_LENGTH+1];
3853 char seqName[MAX_SEQ_NAME_LENGTH+1];
3855 int matchMatrix[DNA_CHAR_SIZE][DNA_CHAR_SIZE];
3857 unsigned long long queryIndex, dbIndex;
3858 int totalLength, auxiliaryTextIndex;
3859 unsigned int c;
3860 int a, lastA;
3862 int alignmentCharPrinted;
3863 int textPosNumOfChar;
3864 char positionFormat[6] = "%-00u";
3866 char lowercase[256];
3868 int numOfMatch, numOfMismatch, numOfSpace, numOfGap;
3870 unsigned int* __restrict alignment;
3871 unsigned int* __restrict auxiliaryText;
3873 // Allocate memory
3874 tempDbChar = (char*) MMPoolDispatch(mmPool, MAX_ALIGNMENT_LENGTH + 1);
3875 tempQueryChar = (char*) MMPoolDispatch(mmPool, MAX_ALIGNMENT_LENGTH + 1);
3877 // duplicate query name
3878 strncpy(queryName, queryPatternName, MAX_SEQ_NAME_LENGTH+1);
3880 // take the first white space as end of pattern name
3881 for (i=0; i<MAX_SEQ_NAME_LENGTH; i++) {
3882 if (queryName[i] == ' ' || queryName[i] == '\0' || queryName[i] == '\t' || queryName[i] == '\n') {
3883 queryName[i] = '\0';
3884 break;
3888 HSPFillScoringMatrix(matchMatrix, 1, -1, 0);
3890 // initialize lowercase
3891 for (i=0; i<256; i++) {
3892 lowercase[i] = (char)i;
3894 for (i=0; i<26; i++) {
3895 lowercase['A' + i] = 'a' + (char)i;
3898 // Process all hits
3899 lastDbSeqIndex = -1;
3900 for (i=0; i<numOfHit; i++) {
3902 if (gappedHitList[i].lengthQuery > MAX_ALIGNMENT_LENGTH || gappedHitList[i].lengthText > MAX_ALIGNMENT_LENGTH) {
3903 fprintf(stderr, "HSPPrintAlignment(): Alignment length > maximum!\n");
3904 exit(1);
3907 dbSeqIndex = dbOrder[gappedHitList[i].dbSeqIndex & CONTEXT_MASK];
3908 contextNum = gappedHitList[i].dbSeqIndex >> (BITS_IN_WORD - CONTEXT_BIT);
3910 if (dbSeqIndex != lastDbSeqIndex) {
3912 switch(outputFormat) {
3913 case OUTPUT_PAIRWISE :
3915 // Print header for sequence
3916 if (hsp->annotation[dbSeqIndex].gi > 0) {
3917 fprintf(outputFile, ">gi|%d|%s", hsp->annotation[dbSeqIndex].gi, hsp->annotation[dbSeqIndex].text);
3918 } else {
3919 fprintf(outputFile, ">%s", hsp->annotation[dbSeqIndex].text);
3921 fprintf(outputFile, "No. of letters : %llu\n", hsp->seqOffset[dbSeqIndex].endPos - hsp->seqOffset[dbSeqIndex].startPos + 1);
3922 fprintf(outputFile, "\n");
3924 break;
3926 case OUTPUT_TABULAR :
3927 case OUTPUT_TABULAR_COMMENT :
3929 // duplicate sequence name
3930 strncpy(seqName, hsp->annotation[dbSeqIndex].text, MAX_SEQ_NAME_LENGTH+1);
3931 // take the first white space as end of pattern name
3932 for (j=0; j<MAX_SEQ_NAME_LENGTH; j++) {
3933 if (seqName[j] == ' ' || seqName[j] == '\0' || seqName[j] == '\t' || seqName[j] == '\n') {
3934 seqName[j] = '\0';
3935 break;
3939 break;
3944 lastDbSeqIndex = dbSeqIndex;
3946 // Decode alignment and DB characters
3948 queryIndex = 0;
3949 totalLength = 0;
3950 auxiliaryTextIndex = 0;
3951 numOfMatch = 0;
3952 numOfMismatch = 0;
3953 numOfSpace = 0;
3954 numOfGap = 0;
3955 lastA = -1;
3956 c = 0;
3958 alignment = (unsigned int*)((char*)alignmentPool + gappedHitList[i].alignmentOffset);
3959 auxiliaryText = (unsigned int*)((char*)alignmentPool + gappedHitList[i].auxiliaryTextOffset);
3961 while (queryIndex < gappedHitList[i].lengthQuery) {
3962 if (totalLength % ALIGN_PER_WORD == 0) {
3963 c = alignment[totalLength / ALIGN_PER_WORD];
3965 a = (char)(c >> (BITS_IN_WORD - ALIGN_BIT));
3966 c <<= ALIGN_BIT;
3967 switch(a) {
3968 case ALIGN_MATCH :
3969 if (contextInfo[contextNum].reversed) {
3970 tempQueryChar[totalLength] = dnaChar[convertedQueryPattern[queryPatternLength - 1 - queryIndex - gappedHitList[i].posQuery]];
3971 } else {
3972 tempQueryChar[totalLength] = dnaChar[convertedQueryPattern[queryIndex + gappedHitList[i].posQuery]];
3974 tempDbChar[totalLength] = tempQueryChar[totalLength];
3975 numOfMatch++;
3976 queryIndex++;
3977 break;
3978 case ALIGN_MISMATCH_AMBIGUITY :
3979 tempDbChar[totalLength] = dnaChar[(auxiliaryText[auxiliaryTextIndex / AUX_TEXT_PER_WORD] << ((auxiliaryTextIndex % AUX_TEXT_PER_WORD) * AUX_TEXT_BIT) >> (BITS_IN_WORD - AUX_TEXT_BIT))];
3980 if (contextInfo[contextNum].reversed) {
3981 tempDbChar[totalLength] = complementMap[tempDbChar[totalLength]];
3982 tempQueryChar[totalLength] = dnaChar[convertedQueryPattern[queryPatternLength - 1 - queryIndex - gappedHitList[i].posQuery]];
3983 } else {
3984 tempQueryChar[totalLength] = dnaChar[convertedQueryPattern[queryIndex + gappedHitList[i].posQuery]];
3986 if (matchMatrix[charMap[tempDbChar[totalLength]]][charMap[tempQueryChar[totalLength]]] <= 0) {
3987 // Mismatch
3988 numOfMismatch++;
3989 } else {
3990 // Match with ambiguity
3991 if (tempDbChar[totalLength] == tempQueryChar[totalLength] && charMap[tempDbChar[totalLength]] < ALPHABET_SIZE) {
3992 fprintf(stderr, "HSPPrintAlignment(): Alignment error!\n");
3993 exit(1);
3995 numOfMatch++;
3997 queryIndex++;
3998 auxiliaryTextIndex++;
3999 break;
4000 case ALIGN_INSERT :
4001 tempDbChar[totalLength] = dnaChar[(auxiliaryText[auxiliaryTextIndex / AUX_TEXT_PER_WORD] << ((auxiliaryTextIndex % AUX_TEXT_PER_WORD) * AUX_TEXT_BIT) >> (BITS_IN_WORD - AUX_TEXT_BIT))];
4002 if (contextInfo[contextNum].reversed) {
4003 tempDbChar[totalLength] = complementMap[tempDbChar[totalLength]];
4005 tempQueryChar[totalLength] = '-';
4006 numOfSpace++;
4007 if (lastA != ALIGN_INSERT) {
4008 numOfGap++;
4010 auxiliaryTextIndex++;
4011 break;
4012 case ALIGN_DELETE :
4013 tempDbChar[totalLength] = '-';
4014 if (contextInfo[contextNum].reversed) {
4015 tempQueryChar[totalLength] = dnaChar[convertedQueryPattern[queryPatternLength - 1 - queryIndex - gappedHitList[i].posQuery]];
4016 } else {
4017 tempQueryChar[totalLength] = dnaChar[convertedQueryPattern[queryIndex + gappedHitList[i].posQuery]];
4019 numOfSpace++;
4020 if (lastA != ALIGN_DELETE) {
4021 numOfGap++;
4023 queryIndex++;
4024 break;
4026 lastA = a;
4027 totalLength++;
4030 // Print alignment
4032 switch (outputFormat) {
4034 case OUTPUT_PAIRWISE :
4036 fprintf(outputFile, "Score = ");
4037 HSPCalculateAndPrintBitScore(outputFile, gappedHitList[i].score);
4038 fprintf(outputFile, " bits (%d), Expect = ", gappedHitList[i].score);
4039 HSPCalculateAndPrintEValue(outputFile, gappedHitList[i].score);
4040 fprintf(outputFile, "\n");
4041 fprintf(outputFile, "Identities = %d/%d (%.2f%%), Gaps = %d/%d (%.2f%%)\n", numOfMatch, totalLength, (float)numOfMatch / totalLength * 100, numOfSpace, totalLength, (float)numOfSpace / totalLength * 100);
4042 if (contextInfo[contextNum].reversed) {
4043 fprintf(outputFile, "Strand = Plus / Minus\n");
4044 } else {
4045 fprintf(outputFile, "Strand = Plus / Plus\n");
4047 fprintf(outputFile, "\n\n");
4049 // Determine position length and format
4050 c = gappedHitList[i].posText + gappedHitList[i].lengthText - hsp->seqOffset[dbSeqIndex].startPos;
4051 textPosNumOfChar = 0;
4052 while (c > 0) {
4053 c /= 10;
4054 textPosNumOfChar++;
4056 if (textPosNumOfChar >= 10) {
4057 positionFormat[2] = '0' + (char)(textPosNumOfChar / 10);
4058 positionFormat[3] = '0' + (char)(textPosNumOfChar % 10);
4059 positionFormat[4] = 'u';
4060 positionFormat[5] = '\0';
4061 } else {
4062 positionFormat[2] = '0' + (char)textPosNumOfChar;
4063 positionFormat[3] = 'u';
4064 positionFormat[4] = '\0';
4067 alignmentCharPrinted = 0;
4068 /*if (contextInfo[contextNum].reversed) {
4069 queryIndex = queryPatternLength + 1 - gappedHitList[i].posQuery - gappedHitList[i].lengthQuery;
4070 dbIndex = gappedHitList[i].posText + gappedHitList[i].lengthText - hsp->seqOffset[dbSeqIndex].startPos;
4071 } else {
4072 queryIndex = gappedHitList[i].posQuery + 1;
4073 dbIndex = gappedHitList[i].posText - hsp->seqOffset[dbSeqIndex].startPos + 1;
4076 //Handle the removal of >10-consecutive N in reference sequence.
4077 if (contextInfo[contextNum].reversed) {
4078 queryIndex = queryPatternLength + 1 - gappedHitList[i].posQuery - gappedHitList[i].lengthQuery;
4079 unsigned long long tp = gappedHitList[i].posText + gappedHitList[i].lengthText - 1;
4081 unsigned long long approxIndex = tp>>GRID_SAMPLING_FACTOR_2_POWER;
4082 unsigned long long approxValue = hsp->ambiguityMap[approxIndex];
4083 while (hsp->translate[approxValue].startPos>tp) {
4084 approxValue--;
4086 tp-=hsp->translate[approxValue].correction;
4087 //unsigned int utp = TranslateUnambiguousTextPosition(hsp,gappedHitList[i].posText + gappedHitList[i].lengthText - 1,dbSeqIndex);
4088 //unsigned int rtp = ComputeRelativeChromosomePosition(hsp,utp,dbSeqIndex);
4089 dbIndex=tp;
4090 } else {
4091 queryIndex = gappedHitList[i].posQuery + 1;
4092 unsigned long long tp = gappedHitList[i].posText;
4094 unsigned long long approxIndex = tp>>GRID_SAMPLING_FACTOR_2_POWER;
4095 unsigned long long approxValue = hsp->ambiguityMap[approxIndex];
4096 while (hsp->translate[approxValue].startPos>tp) {
4097 approxValue--;
4099 tp-=hsp->translate[approxValue].correction;
4100 //unsigned int utp = TranslateUnambiguousTextPosition(hsp,gappedHitList[i].posText,dbSeqIndex);
4101 //unsigned int rtp = ComputeRelativeChromosomePosition(hsp,utp,dbSeqIndex);
4102 dbIndex=tp;
4105 while (alignmentCharPrinted < totalLength) {
4107 if (contextInfo[contextNum].reversed) {
4109 // Print query line
4110 fprintf(outputFile, "Query: ");
4111 fprintf(outputFile, positionFormat, queryIndex);
4112 fprintf(outputFile, " ");
4114 j = 0;
4115 while (j < 60 && alignmentCharPrinted + j < totalLength) {
4116 fprintf(outputFile, "%c", lowercase[tempQueryChar[totalLength - 1 - alignmentCharPrinted - j]]);
4117 if (tempQueryChar[totalLength - 1 - alignmentCharPrinted - j] != '-') {
4118 queryIndex++;
4120 j++;
4123 fprintf(outputFile, " ");
4124 fprintf(outputFile, "%llu", queryIndex - 1);
4125 fprintf(outputFile, "\n");
4127 // Print alignment line
4128 for (j=0; j<textPosNumOfChar + 8; j++) {
4129 fprintf(outputFile, " ");
4132 j = 0;
4133 while (j < 60 && alignmentCharPrinted + j < totalLength) {
4134 if (tempQueryChar[totalLength - 1 - alignmentCharPrinted - j] == tempDbChar[totalLength - 1 - alignmentCharPrinted - j]) {
4135 fprintf(outputFile, "|");
4136 } else {
4137 fprintf(outputFile, " ");
4139 j++;
4141 fprintf(outputFile, "\n");
4143 // Print database line
4144 fprintf(outputFile, "Sbjct: ");
4145 fprintf(outputFile, positionFormat, dbIndex);
4146 fprintf(outputFile, " ");
4148 j = 0;
4149 while (j < 60 && alignmentCharPrinted + j < totalLength) {
4150 fprintf(outputFile, "%c", lowercase[tempDbChar[totalLength - 1 - alignmentCharPrinted - j]]);
4151 if (tempDbChar[totalLength - 1 - alignmentCharPrinted - j] != '-') {
4152 dbIndex--;
4154 j++;
4157 fprintf(outputFile, " ");
4158 fprintf(outputFile, positionFormat, dbIndex + 1);
4159 fprintf(outputFile, "\n");
4161 } else {
4163 // Print query line
4164 fprintf(outputFile, "Query: ");
4165 fprintf(outputFile, positionFormat, queryIndex);
4166 fprintf(outputFile, " ");
4168 j = 0;
4169 while (j < 60 && alignmentCharPrinted + j < totalLength) {
4170 fprintf(outputFile, "%c", lowercase[tempQueryChar[alignmentCharPrinted + j]]);
4171 if (tempQueryChar[alignmentCharPrinted + j] != '-') {
4172 queryIndex++;
4174 j++;
4177 fprintf(outputFile, " ");
4178 fprintf(outputFile, "%llu", queryIndex - 1);
4179 fprintf(outputFile, "\n");
4181 // Print alignment line
4182 for (j=0; j<textPosNumOfChar + 8; j++) {
4183 fprintf(outputFile, " ");
4186 j = 0;
4187 while (j < 60 && alignmentCharPrinted + j < totalLength) {
4188 if (matchMatrix[charMap[tempQueryChar[alignmentCharPrinted + j]]][charMap[tempDbChar[alignmentCharPrinted + j]]] > 0) {
4189 fprintf(outputFile, "|");
4190 } else {
4191 fprintf(outputFile, " ");
4193 j++;
4195 fprintf(outputFile, "\n");
4197 // Print database line
4198 fprintf(outputFile, "Sbjct: ");
4199 fprintf(outputFile, positionFormat, dbIndex);
4200 fprintf(outputFile, " ");
4202 j = 0;
4203 while (j < 60 && alignmentCharPrinted + j < totalLength) {
4204 fprintf(outputFile, "%c", lowercase[tempDbChar[alignmentCharPrinted + j]]);
4205 if (tempDbChar[alignmentCharPrinted + j] != '-') {
4206 dbIndex++;
4208 j++;
4211 fprintf(outputFile, " ");
4212 fprintf(outputFile, positionFormat, dbIndex - 1);
4213 fprintf(outputFile, "\n");
4217 fprintf(outputFile, "\n\n");
4218 alignmentCharPrinted += j;
4222 break;
4224 case OUTPUT_TABULAR :
4225 case OUTPUT_TABULAR_COMMENT :
4226 // print query name
4227 fprintf(outputFile, "%s\t", queryName);
4229 // print gi
4230 if (hsp->annotation[dbSeqIndex].gi > 0) {
4231 fprintf(outputFile, "gi|%d|", hsp->annotation[dbSeqIndex].gi);
4234 // print db sequence name
4235 fprintf(outputFile, "%s\t", seqName);
4237 // print percentage identity
4238 fprintf(outputFile, "%.2f\t", (float)(totalLength - numOfMismatch - numOfSpace) / (float)(totalLength) * 100);
4240 // print align length
4241 fprintf(outputFile, "%d\t", totalLength);
4243 // print number of mismatch
4244 fprintf(outputFile, "%d\t", numOfMismatch);
4246 // print number of gap opening
4247 fprintf(outputFile, "%d\t", numOfGap);
4249 // print query start and end
4250 if (contextInfo[contextNum].reversed) {
4251 fprintf(outputFile, "%u\t%u\t", queryPatternLength - gappedHitList[i].posQuery - gappedHitList[i].lengthQuery + 1,
4252 queryPatternLength - gappedHitList[i].posQuery);
4253 } else {
4254 fprintf(outputFile, "%u\t%u\t", gappedHitList[i].posQuery + 1,
4255 gappedHitList[i].posQuery + gappedHitList[i].lengthQuery);
4258 // print text start and end
4259 if (contextInfo[contextNum].reversed) {
4260 unsigned long long tp = gappedHitList[i].posText + gappedHitList[i].lengthText - 1;
4261 unsigned long long approxIndex = tp>>GRID_SAMPLING_FACTOR_2_POWER;
4262 unsigned long long approxValue = hsp->ambiguityMap[approxIndex];
4263 while (hsp->translate[approxValue].startPos>tp) {
4264 approxValue--;
4266 tp-=hsp->translate[approxValue].correction;
4267 fprintf(outputFile, "%llu\t%llu\t", tp,
4268 tp - gappedHitList[i].lengthText + 1);
4269 //fprintf(outputFile, "%u\t%u\t", gappedHitList[i].posText + gappedHitList[i].lengthText - hsp->seqOffset[dbSeqIndex].startPos,
4270 // gappedHitList[i].posText + 1 - hsp->seqOffset[dbSeqIndex].startPos);
4271 } else {
4272 unsigned long long tp = gappedHitList[i].posText;
4274 unsigned long long approxIndex = tp>>GRID_SAMPLING_FACTOR_2_POWER;
4275 unsigned long long approxValue = hsp->ambiguityMap[approxIndex];
4276 while (hsp->translate[approxValue].startPos>tp) {
4277 approxValue--;
4279 tp-=hsp->translate[approxValue].correction;
4280 fprintf(outputFile, "%llu\t%llu\t", tp,
4281 tp + gappedHitList[i].lengthText-1);
4282 //fprintf(outputFile, "%u\t%u\t", gappedHitList[i].posText + 1 - hsp->seqOffset[dbSeqIndex].startPos,
4283 // gappedHitList[i].posText + gappedHitList[i].lengthText - hsp->seqOffset[dbSeqIndex].startPos);
4287 // print evalue
4288 HSPCalculateAndPrintEValue(outputFile, gappedHitList[i].score);
4290 // print bit score
4291 HSPCalculateAndPrintBitScore(outputFile, gappedHitList[i].score);
4293 fprintf(outputFile, "\n");
4295 break;
4297 default :
4299 fprintf(stderr, "HSPPrintAlignment() : Output format is invalid!\n");
4300 exit(1);
4306 // Free memory
4307 MMPoolReturn(mmPool, tempDbChar, MAX_ALIGNMENT_LENGTH + 1);
4308 MMPoolReturn(mmPool, tempQueryChar, MAX_ALIGNMENT_LENGTH + 1);
4312 void HSPPrintInvalidHit(FILE *outputFile, int outputFormat) {
4313 switch (outputFormat) {
4315 case OUTPUT_PAIRWISE :
4317 fprintf(outputFile, "#Effective query length reaches zero. \n#Unable to compute Karlin-Altschul parameters.\n\n");
4318 break;
4320 case OUTPUT_TABULAR :
4321 case OUTPUT_TABULAR_COMMENT :
4323 fprintf(outputFile, "\n");
4328 void HSPPrintNoAlignment(FILE *outputFile, int outputFormat) {
4329 switch (outputFormat) {
4331 case OUTPUT_PAIRWISE :
4333 fprintf(outputFile, "#No hit\n\n");
4334 break;
4336 case OUTPUT_TABULAR :
4337 case OUTPUT_TABULAR_COMMENT :
4339 fprintf(outputFile, "\n");
4345 double HSPCalculateAndPrintBitScore(FILE *outputFile, const int score) {
4347 double bitScore;
4349 // print bit score
4350 bitScore = stat_gapNominal2normalized(score);
4352 if (outputFile != NULL) {
4353 if (bitScore > 9999) {
4354 fprintf(outputFile, "%4.3le", bitScore);
4355 } else if (bitScore > 99.9) {
4356 fprintf(outputFile, "%4.0d", (int)bitScore);
4357 } else {
4358 fprintf(outputFile, "%4.1lf", bitScore);
4362 return bitScore;
4366 double HSPCalculateAndPrintEValue(FILE *outputFile, const int score) {
4368 double evalue;
4369 char evalueString[10];
4371 evalue = stat_gapCalcEvalue(stat_gapNominal2normalized(score));
4373 if (evalue < 1.0e-180) {
4374 sprintf(evalueString, "0.0\t");
4375 } else if (evalue < 1.0e-99) {
4376 sprintf(evalueString, "%2.0le\t", evalue);
4377 } else if (evalue < 0.0009) {
4378 sprintf(evalueString, "%3.0le\t", evalue);
4379 } else if (evalue < 0.1) {
4380 sprintf(evalueString, "%4.3lf\t", evalue);
4381 } else if (evalue < 1.0) {
4382 sprintf(evalueString, "%3.2lf\t", evalue);
4383 } else if (evalue < 10.0) {
4384 sprintf(evalueString, "%2.1lf\t", evalue);
4385 } else {
4386 sprintf(evalueString, "%5.0lf\t", evalue);
4389 if (outputFile != NULL) {
4390 fprintf(outputFile, "%s\t", evalueString);
4393 sscanf(evalueString, "%lf", &evalue);
4394 return evalue;
4398 int HitListPosTextQueryOrder(const void *hitList, const long long index1, const long long index2) {
4400 if (((HitListWithPosQuery*)hitList + index1)->posText != ((HitListWithPosQuery*)hitList + index2)->posText) {
4401 if (((HitListWithPosQuery*)hitList + index1)->posText > ((HitListWithPosQuery*)hitList + index2)->posText) {
4402 return 1;
4403 } else {
4404 return -1;
4406 } else {
4407 return ((HitListWithPosQuery*)hitList + index1)->posQuery - ((HitListWithPosQuery*)hitList + index2)->posQuery;
4412 int GappedHitListPosTextQueryLengthScoreOrder(const void *gappedHitList, const long long index1, const long long index2) {
4414 if (((GappedHitList*)gappedHitList + index1)->posText != ((GappedHitList*)gappedHitList + index2)->posText) {
4415 if (((GappedHitList*)gappedHitList + index1)->posText > ((GappedHitList*)gappedHitList + index2)->posText) {
4416 return 1;
4417 } else {
4418 return -1;
4420 } else {
4421 if (((GappedHitList*)gappedHitList + index1)->posQuery != ((GappedHitList*)gappedHitList + index2)->posQuery) {
4422 if (((GappedHitList*)gappedHitList + index1)->posQuery > ((GappedHitList*)gappedHitList + index2)->posQuery) {
4423 return 1;
4424 } else {
4425 return -1;
4427 } else {
4428 if (((GappedHitList*)gappedHitList + index1)->lengthText != ((GappedHitList*)gappedHitList + index2)->lengthText) {
4429 return ((GappedHitList*)gappedHitList + index1)->lengthText - ((GappedHitList*)gappedHitList + index2)->lengthText;
4430 } else {
4431 if (((GappedHitList*)gappedHitList + index1)->lengthQuery != ((GappedHitList*)gappedHitList + index2)->lengthQuery) {
4432 return ((GappedHitList*)gappedHitList + index1)->lengthQuery - ((GappedHitList*)gappedHitList + index2)->lengthQuery;
4433 } else {
4434 // higher score first
4435 return ((GappedHitList*)gappedHitList + index2)->score - ((GappedHitList*)gappedHitList + index1)->score;
4443 int GappedHitListPosTextQueryScoreLengthOrder(const void *gappedHitList, const long long index1, const long long index2) {
4445 if (((GappedHitList*)gappedHitList + index1)->posText != ((GappedHitList*)gappedHitList + index2)->posText) {
4446 if (((GappedHitList*)gappedHitList + index1)->posText > ((GappedHitList*)gappedHitList + index2)->posText) {
4447 return 1;
4448 } else {
4449 return -1;
4451 } else {
4452 if (((GappedHitList*)gappedHitList + index1)->posQuery != ((GappedHitList*)gappedHitList + index2)->posQuery) {
4453 if (((GappedHitList*)gappedHitList + index1)->posQuery > ((GappedHitList*)gappedHitList + index2)->posQuery) {
4454 return 1;
4455 } else {
4456 return -1;
4458 } else {
4459 if (((GappedHitList*)gappedHitList + index1)->score != ((GappedHitList*)gappedHitList + index2)->score) {
4460 // higher score first
4461 return ((GappedHitList*)gappedHitList + index2)->score - ((GappedHitList*)gappedHitList + index1)->score;
4462 } else {
4463 // short query length first
4464 if (((GappedHitList*)gappedHitList + index1)->lengthQuery != ((GappedHitList*)gappedHitList + index2)->lengthQuery) {
4465 if (((GappedHitList*)gappedHitList + index1)->lengthQuery > ((GappedHitListWithAlignment*)gappedHitList + index2)->lengthQuery) {
4466 return 1;
4467 } else {
4468 return -1;
4470 } else {
4471 // short text length first
4472 if (((GappedHitList*)gappedHitList + index1)->lengthText != ((GappedHitListWithAlignment*)gappedHitList + index2)->lengthText) {
4473 if (((GappedHitList*)gappedHitList + index1)->lengthText > ((GappedHitListWithAlignment*)gappedHitList + index2)->lengthText) {
4474 return 1;
4475 } else {
4476 return -1;
4478 } else {
4479 return 0;
4488 int GappedHitListEndTextQueryScoreLengthOrder(const void *gappedHitList, const long long index1, const long long index2) {
4490 if (((GappedHitList*)gappedHitList + index1)->posText + ((GappedHitList*)gappedHitList + index1)->lengthText !=
4491 ((GappedHitList*)gappedHitList + index2)->posText + ((GappedHitList*)gappedHitList + index2)->lengthText) {
4492 if (((GappedHitList*)gappedHitList + index1)->posText + ((GappedHitList*)gappedHitList + index1)->lengthText >
4493 ((GappedHitList*)gappedHitList + index2)->posText + ((GappedHitList*)gappedHitList + index2)->lengthText) {
4494 return 1;
4495 } else {
4496 return -1;
4498 } else {
4499 if (((GappedHitList*)gappedHitList + index1)->posQuery + ((GappedHitList*)gappedHitList + index1)->lengthQuery !=
4500 ((GappedHitList*)gappedHitList + index2)->posQuery + ((GappedHitList*)gappedHitList + index2)->lengthQuery) {
4501 if (((GappedHitList*)gappedHitList + index1)->posQuery + ((GappedHitList*)gappedHitList + index1)->lengthQuery >
4502 ((GappedHitList*)gappedHitList + index2)->posQuery + ((GappedHitList*)gappedHitList + index2)->lengthQuery) {
4503 return 1;
4504 } else {
4505 return -1;
4507 } else {
4508 if (((GappedHitList*)gappedHitList + index1)->score != ((GappedHitList*)gappedHitList + index2)->score) {
4509 // higher score first
4510 return ((GappedHitList*)gappedHitList + index2)->score - ((GappedHitList*)gappedHitList + index1)->score;
4511 } else {
4512 // short query length first
4513 if (((GappedHitList*)gappedHitList + index1)->lengthQuery != ((GappedHitListWithAlignment*)gappedHitList + index2)->lengthQuery) {
4514 if (((GappedHitList*)gappedHitList + index1)->lengthQuery > ((GappedHitListWithAlignment*)gappedHitList + index2)->lengthQuery) {
4515 return 1;
4516 } else {
4517 return -1;
4519 } else {
4520 // short text length first
4521 if (((GappedHitList*)gappedHitList + index1)->lengthText != ((GappedHitListWithAlignment*)gappedHitList + index2)->lengthText) {
4522 if (((GappedHitList*)gappedHitList + index1)->lengthText > ((GappedHitListWithAlignment*)gappedHitList + index2)->lengthText) {
4523 return 1;
4524 } else {
4525 return -1;
4527 } else {
4528 return 0;
4537 int GappedHitListEnclosedScoreOrder(const void *gappedHitList, const long long index1, const long long index2) {
4539 if (((GappedHitList*)gappedHitList + index1)->posText != ((GappedHitList*)gappedHitList + index2)->posText) {
4540 if (((GappedHitList*)gappedHitList + index1)->posText > ((GappedHitList*)gappedHitList + index2)->posText) {
4541 return 1;
4542 } else {
4543 return -1;
4545 } else {
4546 if (((GappedHitList*)gappedHitList + index1)->lengthText != ((GappedHitList*)gappedHitList + index2)->lengthText) {
4547 if (((GappedHitList*)gappedHitList + index1)->lengthText < ((GappedHitList*)gappedHitList + index2)->lengthText) {
4548 return 1;
4549 } else {
4550 return -1;
4552 } else {
4553 if (((GappedHitList*)gappedHitList + index1)->posQuery != ((GappedHitList*)gappedHitList + index2)->posQuery) {
4554 if (((GappedHitList*)gappedHitList + index1)->posQuery > ((GappedHitList*)gappedHitList + index2)->posQuery) {
4555 return 1;
4556 } else {
4557 return -1;
4559 } else {
4560 if (((GappedHitList*)gappedHitList + index1)->lengthQuery != ((GappedHitList*)gappedHitList + index2)->lengthQuery) {
4561 if (((GappedHitList*)gappedHitList + index1)->lengthQuery < ((GappedHitList*)gappedHitList + index2)->lengthQuery) {
4562 return 1;
4563 } else {
4564 return -1;
4566 } else {
4567 // higher score first
4568 return ((GappedHitList*)gappedHitList + index2)->score - ((GappedHitList*)gappedHitList + index1)->score;