3 HSP.c BWTBlastn functions
5 This module contains miscellaneous BWTBlastn functions.
7 Copyright (C) 2004, Wong Chi Kwong.
9 This program is free software; you can redistribute it and/or
10 modify it under the terms of the GNU General Public License
11 as published by the Free Software Foundation; either version 2
12 of the License, or (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
30 #include <emmintrin.h>
32 #include "TextConverter.h"
33 #include "MiscUtilities.h"
37 #include "HSPstatistic.h"
40 extern double stat_expectationValue
;
42 //unsigned int UChromosomeStartPos(HSP * hsp, unsigned chrNo) {
43 // unsigned int correction = hsp->ambiguity[hsp->seqOffset[chrNo].firstAmbiguityIndex-1].rightOfEndPos -
44 // hsp->ambiguity[hsp->seqOffset[chrNo].firstAmbiguityIndex-1].startPos;
45 // return hsp->seqOffset[chrNo].startPos+correction;
47 //unsigned int UChromosomeEndPos(HSP * hsp, unsigned chrNo) {
48 // unsigned int correction = hsp->ambiguity[hsp->seqOffset[chrNo].lastAmbiguityIndex].rightOfEndPos -
49 // hsp->ambiguity[hsp->seqOffset[chrNo].lastAmbiguityIndex].startPos;
50 // return hsp->seqOffset[chrNo].endPos+correction;
53 //void ComputeChromosomeGrid2(HSP * hsp,unsigned int gridEntries,unsigned short * ambiguityMap, Translate * translate) {
54 // //unsigned int gridEntries = (hsp->dnaLength/GRID_SAMPLING_FACTOR)+1;
55 // //hsp->ambiguityMap = malloc(sizeof(unsigned short)*gridEntries);
56 // //hsp->translate = malloc(sizeof(Translate)*(hsp->numOfAmbiguity+hsp->numOfSeq));
59 // unsigned int acc = 0;
60 // for (i=0;i<gridEntries;i++) {
63 // unsigned int j=0,k=0;
65 // unsigned int lastChrAmbAccumulate=0;
66 // while(j<hsp->numOfAmbiguity && k<hsp->numOfSeq) {
67 // if (hsp->ambiguity[j].startPos<hsp->seqOffset[k].startPos) {
68 // //Write Ambiguous Segment Flag
69 // unsigned int index = hsp->ambiguity[j].startPos;
70 // ambiguityMap[index/GRID_SAMPLING_FACTOR]+=1;
71 // translate[i].startPos=hsp->ambiguity[j].startPos;
72 // translate[i].chrID=k;
73 // //chrAmbAccumulate+=(hsp->ambiguity[j].rightOfEndPos - hsp->ambiguity[j].startPos);
74 // translate[i].correction=UChromosomeStartPos(hsp,k-1)-(hsp->ambiguity[j].rightOfEndPos - hsp->ambiguity[j].startPos)-1;
75 // //printf("%u:AMB%u\ti=%u\tSAMPLE_i=%u\tCORRECT=%u\tCHR=%u\n",i,j,index,index/GRID_SAMPLING_FACTOR,duplicateAmb[i].correction,k-1);
78 // } else if (hsp->ambiguity[j].startPos>hsp->seqOffset[k].startPos) {
79 // //Write Artificial Chromosome Start Flag
80 // unsigned int index = hsp->seqOffset[k].startPos;
81 // ambiguityMap[index/GRID_SAMPLING_FACTOR]+=1;
82 // translate[i].startPos=hsp->seqOffset[k].startPos;
83 // translate[i].chrID=k+1;
84 // translate[i].correction=hsp->seqOffset[k].startPos-1;
85 // //printf("%u:CHR%u\ti=%u\tSAMPLE_i=%u\tAccCORRECT=-%u\tCHR=%u\n",i,k,index,index/GRID_SAMPLING_FACTOR,hsp->seqOffset[k].startPos,k);
86 // //chrAmbAccumulate=0;
90 // //chrAmbAccumulate=0;
94 // while (j<hsp->numOfAmbiguity) {
95 // unsigned int index = hsp->ambiguity[j].startPos;
96 // ambiguityMap[index/GRID_SAMPLING_FACTOR]+=1;
97 // translate[i].startPos=hsp->ambiguity[j].startPos;
98 // translate[i].chrID=k;
99 // translate[i].correction=UChromosomeStartPos(hsp,k-1)-(hsp->ambiguity[j].rightOfEndPos - hsp->ambiguity[j].startPos)-1;
100 // //chrAmbAccumulate+=hsp->ambiguity[j].rightOfEndPos - hsp->ambiguity[j].startPos;
101 // //duplicateAmb[i].correction=hsp->seqOffset[k-1].startPos-chrAmbAccumulate;
102 // //printf("%u:AMB%u\ti=%u\tSAMPLE_i=%u\tCORRECT=%u\tCHR=%u\n",i,j,index,index/GRID_SAMPLING_FACTOR,duplicateAmb[i].correction,k-1);
105 // while (k<hsp->numOfSeq) {
106 // unsigned int index = hsp->seqOffset[k].startPos;
107 // ambiguityMap[index/GRID_SAMPLING_FACTOR]+=1;
108 // translate[i].startPos=hsp->seqOffset[k].startPos;
109 // translate[i].chrID=k+1;
110 // translate[i].correction=hsp->seqOffset[k].startPos-1;
111 // //printf("%u:CHR%u\ti=%u\tSAMPLE_i=%u\tAccCORRECT=-%u\tCHR=%u\n",i,k,index,index/GRID_SAMPLING_FACTOR,hsp->seqOffset[k].startPos,k);
112 // //mapGridToChromID[i]=chrNo;
116 // for (i=1;i<gridEntries;i++) {
117 // ambiguityMap[i]+=ambiguityMap[i-1];
119 // for (i=0;i<gridEntries;i++) {
120 // ambiguityMap[i]--;
124 /*unsigned int DetermineChromosome(HSP * hsp, unsigned int tp) {
125 unsigned int numOfSeq = hsp->numOfSeq;
132 unsigned int startPos = hsp->seqOffset[m].startPos;
135 } else if (tp>startPos) {
142 if (answer==0) {answer=l;}
145 /*unsigned int ComputeRelativeChromosomePosition(HSP * hsp, unsigned int utp,unsigned chrNo) {
146 //unsigned int prevAmb = hsp->seqOffset[chrNo].firstAmbiguityIndex-1;
147 //unsigned int chrUnambiguousStartPos = hsp->seqOffset[chrNo].startPos + (hsp->ambiguity[prevAmb].rightOfEndPos-hsp->ambiguity[prevAmb].startPos);
148 //printf("Chromosome unambiguous start position = %u\n",chrUnambiguousStartPos);
149 return utp - UChromosomeStartPos(hsp,chrNo);
151 /*unsigned int DetermineChromosomeRank(HSP * hsp, unsigned short *chromosomeSampleTable, unsigned int chromosomeSampleFactor, unsigned int tp) {
152 unsigned int approxIndex = (int)(tp/chromosomeSampleFactor);
153 unsigned int approxValue = chromosomeSampleTable[approxIndex];
154 if (hsp->seqOffset[approxValue].startPos>tp) return approxValue-1;
157 unsigned int UDetermineChromosomeRank(HSP * hsp,unsigned short *uChromosomeSampleTable,unsigned int chromosomeSampleFactor, unsigned int utp) {
158 unsigned int approxIndex = (int)(utp/chromosomeSampleFactor);
159 approxIndex=uChromosomeSampleTable[approxIndex];
160 if (UChromosomeStartPos(hsp,approxIndex)>utp) return approxIndex-1;
163 /*unsigned int TranslateUnambiguousTextPosition(HSP * hsp, unsigned int tp,unsigned int chrNo) {
164 int l=hsp->seqOffset[chrNo].firstAmbiguityIndex-1;
165 int r=hsp->seqOffset[chrNo].lastAmbiguityIndex+1;
166 //printf("Within [%u,x]..[%u,x]\n",hsp->ambiguity[l].startPos,hsp->ambiguity[r].startPos);
171 unsigned int startPos = hsp->ambiguity[m].startPos;
174 } else if (tp>startPos) {
181 if (answer==0) {answer=l;}
182 //printf("Ambigunity record %u is found = [%u,%u]\n",answer,hsp->ambiguity[answer].startPos,hsp->ambiguity[answer].rightOfEndPos);
184 //printf("text position = %u\n",tp);
185 //printf("text position translated = %u\n",tp-(hsp->ambiguity[answer].startPos)+(hsp->ambiguity[answer].rightOfEndPos));
186 return tp-(hsp->ambiguity[answer].startPos)+(hsp->ambiguity[answer].rightOfEndPos);
190 void HSPFillCharMap(unsigned char charMap
[255]) {
194 for (i
=0; i
<255; i
++) {
195 charMap
[i
] = nonMatchDnaCharIndex
;
197 for (i
=0; i
<DNA_CHAR_SIZE
; i
++) {
198 charMap
[dnaChar
[i
]] = (unsigned char)i
;
199 charMap
[dnaChar
[i
] - 'A' + 'a'] = (unsigned char)i
;
204 void HSPFillComplementMap(unsigned char complementMap
[255]) {
208 for (i
=0; i
<255; i
++) {
209 complementMap
[i
] = dnaChar
[nonMatchDnaCharIndex
];
211 for (i
=0; i
<DNA_CHAR_SIZE
; i
++) {
212 complementMap
[dnaComplement
[i
]] = dnaChar
[i
];
213 complementMap
[dnaComplement
[i
] - 'A' + 'a'] = dnaChar
[i
] - 'A' + 'a';
218 void HSPFillScoringMatrix(int scoringMatrix
[DNA_CHAR_SIZE
][DNA_CHAR_SIZE
], const int matchScore
, const int mismatchScore
, const int leftShift
) {
222 for (i
=0; i
<DNA_CHAR_SIZE
; i
++) {
223 for (j
=0; j
<DNA_CHAR_SIZE
; j
++) {
224 if (ambiguityCount
[i
] > 0 && ambiguityCount
[j
] > 0) {
225 scoringMatrix
[i
][j
] = 0;
226 for (k
=0; k
<ambiguityCount
[i
]; k
++) {
227 for (l
=0; l
<ambiguityCount
[j
]; l
++) {
228 if (ambiguityMatch
[i
][k
] == ambiguityMatch
[j
][l
]) {
230 scoringMatrix
[i
][j
] += matchScore
;
233 scoringMatrix
[i
][j
] += mismatchScore
;
237 if (scoringMatrix
[i
][j
] > 0) {
238 scoringMatrix
[i
][j
] = (scoringMatrix
[i
][j
] + ambiguityCount
[i
] * ambiguityCount
[j
] - 1) / (ambiguityCount
[i
] * ambiguityCount
[j
]);
239 } else if (scoringMatrix
[i
][j
] < 0) {
240 scoringMatrix
[i
][j
] = -((-scoringMatrix
[i
][j
] + ambiguityCount
[i
] * ambiguityCount
[j
] - 1) / (ambiguityCount
[i
] * ambiguityCount
[j
]));
242 scoringMatrix
[i
][j
] = 0;
245 // Character meant to match nothing
246 scoringMatrix
[i
][j
] = mismatchScore
;
248 scoringMatrix
[i
][j
] = scoringMatrix
[i
][j
] * (1 << leftShift
);
255 HSP
*HSPLoad(MMPool
*mmPool
, const char *PackedDNAFileName
, const char *AnnotationFileName
, const char *AmbiguityFileName
, const char * TranslateFileName
, const unsigned int trailerBufferInWord
) {
258 unsigned long long dnaLength
;
259 unsigned int randomSeed
;
262 unsigned char charMap
[255];
264 FILE *annotationFile
= NULL
, *ambiguityFile
= NULL
, *translateFile
= NULL
;
266 hsp
= (HSP
*) MMPoolDispatch(mmPool
, sizeof(HSP
));
269 if (PackedDNAFileName
!= NULL
&& PackedDNAFileName
[0] != '\0' && PackedDNAFileName
[0] != '-') {
270 hsp
->packedDNA
= (unsigned int *) DNALoadPacked(PackedDNAFileName
, &hsp
->dnaLength
, TRUE
, trailerBufferInWord
);
272 hsp
->packedDNA
= NULL
;
277 if (AnnotationFileName
!= NULL
&& AnnotationFileName
[0] != '\0' && AnnotationFileName
[0] != '-') {
279 annotationFile
= (FILE*)fopen64(AnnotationFileName
, "r");
280 if (annotationFile
== NULL
) {
281 fprintf(stderr
, "Cannot open annotation file!\n");
285 fscanf(annotationFile
, "%llu %d %u\n", &dnaLength
, &hsp
->numOfSeq
, &randomSeed
);
286 if (hsp
->dnaLength
!= 0 && dnaLength
!= hsp
->dnaLength
) {
287 fprintf(stderr
, "Annotation database length not match!\n");
290 hsp
->dnaLength
= dnaLength
;
291 if (hsp
->numOfSeq
== 0) {
292 fprintf(stderr
, "Annotation number of sequence = 0!\n");
295 hsp
->annotation
= (Annotation
*) MMUnitAllocate((hsp
->numOfSeq
+1) * sizeof(Annotation
));
296 hsp
->seqOffset
= (SeqOffset
*) MMUnitAllocate((hsp
->numOfSeq
+1) * sizeof(SeqOffset
));
299 hsp
->minSeqLength
= UINT32_MAX
;
300 while (!feof(annotationFile
) && i
< hsp
->numOfSeq
) {
301 fscanf(annotationFile
, "%u ", &hsp
->annotation
[i
].gi
);
302 fgets(hsp
->annotation
[i
].text
, MAX_SEQ_NAME_LENGTH
, annotationFile
);
303 fscanf(annotationFile
, "%llu %llu %d\n", &hsp
->seqOffset
[i
].startPos
, &hsp
->seqOffset
[i
].endPos
, &hsp
->seqOffset
[i
+1].firstAmbiguityIndex
);
304 hsp
->seqOffset
[i
].lastAmbiguityIndex
= hsp
->seqOffset
[i
+1].firstAmbiguityIndex
;
305 if (hsp
->seqOffset
[i
].endPos
< hsp
->minSeqLength
) {
306 hsp
->minSeqLength
= hsp
->seqOffset
[i
].endPos
;
308 hsp
->seqOffset
[i
].endPos
= hsp
->seqOffset
[i
].startPos
+ hsp
->seqOffset
[i
].endPos
- 1; // length of sequence is stored
311 if (i
< hsp
->numOfSeq
) {
312 fprintf(stderr
, "Annotation missing entries!\n");
315 fclose(annotationFile
);
317 hsp
->annotation
[i
].gi
= 0;
318 hsp
->annotation
[i
].text
[0] = '\0';
320 hsp
->seqOffset
[i
].startPos
= UINT32_MAX
;
321 hsp
->seqOffset
[i
].endPos
= UINT32_MAX
;
322 hsp
->seqOffset
[0].firstAmbiguityIndex
= 1; // ambiguity[0] and ambiguity[numOfAmbiguity+1] are dummy
323 for (i
=1; i
<=hsp
->numOfSeq
; i
++) {
324 hsp
->seqOffset
[i
].firstAmbiguityIndex
+= hsp
->seqOffset
[i
-1].firstAmbiguityIndex
; // number of ambiguity is stored
326 // hsp->seqOffset[hsp->numOfSeq].firstAmbiguityIndex = total number of ambiguity + 1 now
327 hsp
->seqOffset
[hsp
->numOfSeq
].lastAmbiguityIndex
= hsp
->seqOffset
[hsp
->numOfSeq
].firstAmbiguityIndex
- 1;
328 // hsp->seqOffset[hsp->numOfSeq].lastAmbiguityIndex = total number of ambiguity now
329 for (i
=hsp
->numOfSeq
; i
>1; i
--) {
330 hsp
->seqOffset
[i
-1].lastAmbiguityIndex
= hsp
->seqOffset
[i
].lastAmbiguityIndex
- hsp
->seqOffset
[i
-1].lastAmbiguityIndex
; // number of ambiguity is stored
332 for (i
=0; i
<hsp
->numOfSeq
; i
++) {
333 hsp
->seqOffset
[i
].lastAmbiguityIndex
= hsp
->seqOffset
[i
+1].lastAmbiguityIndex
;
338 // Make up a dummy annotation and a dummy sequence offset
339 hsp
->annotation
= (Annotation
*) MMUnitAllocate((1+1) * sizeof(Annotation
));
340 hsp
->seqOffset
= (SeqOffset
*) MMUnitAllocate((1+1) * sizeof(SeqOffset
));
343 hsp
->numOfAmbiguity
= 0;
345 hsp
->annotation
[0].gi
= 0;
346 hsp
->annotation
[0].text
[0] = '\0';
347 hsp
->annotation
[1].gi
= 0;
348 hsp
->annotation
[1].text
[0] = '\0';
349 hsp
->seqOffset
[0].startPos
= 0;
350 hsp
->seqOffset
[0].endPos
= hsp
->dnaLength
- 1;
351 hsp
->seqOffset
[0].firstAmbiguityIndex
= 1;
352 hsp
->seqOffset
[0].lastAmbiguityIndex
= 0;
353 hsp
->seqOffset
[1].startPos
= UINT32_MAX
;
354 hsp
->seqOffset
[1].endPos
= UINT32_MAX
;
355 hsp
->seqOffset
[1].firstAmbiguityIndex
= UINT32_MAX
; // should not be referred
356 hsp
->seqOffset
[1].lastAmbiguityIndex
= UINT32_MAX
; // should not be referred
361 if (AmbiguityFileName
!= NULL
&& AmbiguityFileName
[0] != '\0' && AmbiguityFileName
[0] != '-') {
364 ambiguityFile
= (FILE*)fopen64(AmbiguityFileName
, "r");
365 if (ambiguityFile
== NULL
) {
366 fprintf(stderr
, "Cannot open ambiguity file!\n");
370 fscanf(ambiguityFile
, "%llu %d %d\n", &dnaLength
, &i
, &hsp
->numOfAmbiguity
);
371 if (hsp
->dnaLength
!= 0 && dnaLength
!= hsp
->dnaLength
) {
372 fprintf(stderr
, "Ambiguity database length not match!\n");
375 hsp
->dnaLength
= dnaLength
;
376 if (i
!= hsp
->numOfSeq
) {
377 fprintf(stderr
, "Ambiguity database number of sequence not match!\n");
380 if (hsp
->numOfAmbiguity
!= hsp
->seqOffset
[hsp
->numOfSeq
].firstAmbiguityIndex
- 1) {
381 fprintf(stderr
, "Ambiguity number of ambiguity not match!\n");
385 HSPFillCharMap(charMap
);
387 hsp
->ambiguity
= (Ambiguity
*) MMUnitAllocate((hsp
->numOfAmbiguity
+ 2) * sizeof(Ambiguity
));
388 hsp
->ambiguity
[0].startPos
= 0;
389 hsp
->ambiguity
[0].rightOfEndPos
= 0;
390 hsp
->ambiguity
[0].symbol
= 0;
392 while (!feof(ambiguityFile
) && i
-1 < hsp
->numOfAmbiguity
) {
393 fscanf(ambiguityFile
, "%llu %llu %c\n", &hsp
->ambiguity
[i
].startPos
, &hsp
->ambiguity
[i
].rightOfEndPos
, &c
);
394 hsp
->ambiguity
[i
].rightOfEndPos
= hsp
->ambiguity
[i
].startPos
+ hsp
->ambiguity
[i
].rightOfEndPos
; // number of character is stored
395 hsp
->ambiguity
[i
].symbol
= charMap
[c
];
398 hsp
->ambiguity
[i
].startPos
= UINT32_MAX
;
399 hsp
->ambiguity
[i
].rightOfEndPos
= UINT32_MAX
;
400 hsp
->ambiguity
[i
].symbol
= 0;
402 if (i
-1 < hsp
->numOfAmbiguity
) {
403 fprintf(stderr
, "Ambiguity missing entries!\n");
406 fclose(ambiguityFile
);
410 if (hsp
->numOfAmbiguity
> 0) {
411 fprintf(stderr
, "Ambiguity file missing!\n");
415 hsp
->ambiguity
= (Ambiguity
*) MMUnitAllocate((0 + 2) * sizeof(Ambiguity
));
416 hsp
->ambiguity
[0].startPos
= 0;
417 hsp
->ambiguity
[0].rightOfEndPos
= 0;
418 hsp
->ambiguity
[0].symbol
= 0;
419 hsp
->ambiguity
[1].startPos
= UINT32_MAX
;
420 hsp
->ambiguity
[1].rightOfEndPos
= UINT32_MAX
;
421 hsp
->ambiguity
[1].symbol
= 0;
426 if (TranslateFileName
!= NULL
&& TranslateFileName
[0] != '\0' && TranslateFileName
[0] != '-') {
428 // Load translate, ambiguity map
429 translateFile
= (FILE*)fopen64(TranslateFileName
, "r");
430 if (translateFile
== NULL
) {
431 fprintf(stderr
, "Cannot open translate file!\n");
434 unsigned int gridEntries
;
435 unsigned int removedSegmentCount
;
436 fscanf(translateFile
, "%llu %d %u %u\n", &dnaLength
, &i
, &removedSegmentCount
, &gridEntries
);
437 if (hsp
->dnaLength
!= 0 && dnaLength
!= hsp
->dnaLength
) {
438 fprintf(stderr
, "Translate database length not match!\n");
441 hsp
->dnaLength
= dnaLength
;
442 if (i
!= hsp
->numOfSeq
) {
443 fprintf(stderr
, "Translate database number of sequence not match!\n");
446 /*if (hsp->numOfAmbiguity != hsp->seqOffset[hsp->numOfSeq].firstAmbiguityIndex - 1) {
447 fprintf(stderr, "Translate number of ambiguity not match!\n");
450 hsp
->numOfRemovedSegment
= removedSegmentCount
;
451 hsp
->numOfGridEntry
= gridEntries
;
452 hsp
->seqActualOffset
= (SeqActualOffset
*) MMUnitAllocate((hsp
->numOfSeq
+1) * sizeof(SeqActualOffset
));
453 hsp
->ambiguityMap
= (unsigned short*) MMUnitAllocate((gridEntries
) * sizeof(unsigned short));//MMUnitAllocate((gridEntries) * sizeof(unsigned short));
454 hsp
->translate
= (Translate
*) MMUnitAllocate((hsp
->numOfSeq
+removedSegmentCount
) * sizeof(Translate
));//MMUnitAllocate((hsp->numOfSeq+hsp->numOfAmbiguity) * sizeof(Translate));
456 unsigned long long j
=0;
457 while (!feof(translateFile
) && j
<gridEntries
) {
459 fscanf(translateFile
, "%hu\n", &tmp
);
460 hsp
->ambiguityMap
[j
]=(unsigned short)tmp
;j
++;
462 if (j
< gridEntries
) {
463 fprintf(stderr
, "Translate missing entries!\n");
468 while (!feof(translateFile
) && j
<hsp
->numOfSeq
+removedSegmentCount
) {
469 fscanf(translateFile
, "%llu %d %llu\n", &hsp
->translate
[j
].startPos
, &hsp
->translate
[j
].chrID
, &hsp
->translate
[j
].correction
);
472 if (j
< hsp
->numOfSeq
+removedSegmentCount
) {
473 fprintf(stderr
, "Translate missing entries!\n");
478 while (!feof(translateFile
) && j
<hsp
->numOfSeq
) {
479 fscanf(translateFile
, "%llu %llu\n", &hsp
->seqActualOffset
[j
].startPos
, &hsp
->seqActualOffset
[j
].endPos
);
480 hsp
->seqActualOffset
[j
].endPos
= hsp
->seqActualOffset
[j
].startPos
+ hsp
->seqActualOffset
[j
].endPos
;
483 // Backward compatibility
484 if (j
< hsp
->numOfSeq
) {
485 fprintf(stdout
, "[Translation] Missing actual chromosome lengths.\n");
486 fprintf(stdout
, "Re-build Packed indexes for more accurate chromosome length in output.\n");
488 //Translation file is missing, initialise the value with inaccurate chromosome length
490 while (j
<hsp
->numOfSeq
) {
491 hsp
->seqActualOffset
[j
].startPos
= hsp
->seqOffset
[j
].startPos
;
492 hsp
->seqActualOffset
[j
].endPos
= hsp
->seqOffset
[j
].endPos
;
496 fclose(translateFile
);
500 if (hsp
->numOfAmbiguity
> 0 || hsp
->numOfSeq
> 0) {
501 fprintf(stderr
, "Translate file missing!\n");
505 hsp
->numOfRemovedSegment
= 0;
506 unsigned int gridEntries
= (hsp
->dnaLength
/GRID_SAMPLING_FACTOR
)+1;
507 hsp
->numOfGridEntry
= gridEntries
;
508 hsp
->seqActualOffset
= (SeqActualOffset
*) MMUnitAllocate((hsp
->numOfSeq
+1) * sizeof(SeqActualOffset
));
509 hsp
->ambiguityMap
= (unsigned short*) MMUnitAllocate((gridEntries
) * sizeof(unsigned short));
510 hsp
->translate
= (Translate
*) MMUnitAllocate((hsp
->numOfSeq
) * sizeof(Translate
));//MMUnitAllocate((hsp->numOfSeq+hsp->numOfAmbiguity) * sizeof(Translate));
512 while (!feof(translateFile
) && j
<gridEntries
) {
513 hsp
->ambiguityMap
[j
]=0;
515 hsp
->translate
[0].startPos
=0;
516 hsp
->translate
[0].chrID
=0;
517 hsp
->translate
[0].correction
=0;
520 while (j
<hsp
->numOfSeq
) {
521 hsp
->seqActualOffset
[j
].startPos
= hsp
->seqOffset
[j
].startPos
;
522 hsp
->seqActualOffset
[j
].endPos
= hsp
->seqOffset
[j
].endPos
;
530 HSP
*HSPConvertFromText(MMPool
*mmPool
, const unsigned char *text
, const unsigned long long textLength
,
531 const unsigned int FASTARandomSeed
, const int maskLowerCase
,
532 const int gi
, const char *seqName
) {
535 Ambiguity
*tempAmbiguity
;
536 int ambiguityAllocated
= 256;
539 unsigned long long i
;
541 unsigned long long lastAmbiguityPos
;
542 unsigned long long numCharInBuffer
, numOfConversion
;
543 unsigned int sequenceRandomSeed
;
544 unsigned char charMap
[255];
546 unsigned char buffer
[PACKED_BUFFER_SIZE
];
548 HSPFillCharMap(charMap
);
550 hsp
= (HSP
*) MMPoolDispatch(mmPool
, sizeof(HSP
));
552 hsp
->dnaLength
= textLength
;
553 hsp
->minSeqLength
= textLength
;
556 hsp
->annotation
= (Annotation
*) MMPoolDispatch(mmPool
, (1+1) * sizeof(Annotation
));
557 hsp
->annotation
[0].gi
= gi
;
558 strncpy(hsp
->annotation
[0].text
, seqName
, MAX_SEQ_NAME_LENGTH
);
559 hsp
->annotation
[1].gi
= 0;
560 hsp
->annotation
[1].text
[0] = '\0';
562 hsp
->seqOffset
= (SeqOffset
*) MMPoolDispatch(mmPool
, (1+1) * sizeof(SeqOffset
));
563 hsp
->seqOffset
[0].startPos
= 0;
564 hsp
->seqOffset
[0].endPos
= textLength
- 1;
565 hsp
->seqOffset
[0].firstAmbiguityIndex
= 1;
566 hsp
->seqOffset
[0].lastAmbiguityIndex
= 0;
567 hsp
->seqOffset
[1].startPos
= UINT32_MAX
;
568 hsp
->seqOffset
[1].endPos
= UINT32_MAX
;
569 hsp
->seqOffset
[1].firstAmbiguityIndex
= UINT32_MAX
; // should not be referred
570 hsp
->seqOffset
[1].lastAmbiguityIndex
= UINT32_MAX
; // should not be referred
572 hsp
->packedDNA
= (unsigned int*) MMPoolDispatch(mmPool
, (textLength
/ CHAR_PER_WORD
+ 1) * sizeof(int));
574 tempAmbiguity
= (Ambiguity
*) MMUnitAllocate(ambiguityAllocated
* sizeof(Ambiguity
));
576 HSPFillCharMap(charMap
);
578 // Set random seed for the sequence
579 sequenceRandomSeed
= FASTARandomSeed
;
581 sequenceRandomSeed
+= gi
;
583 for (i
=0; i
<(int)strlen(seqName
); i
++) {
585 sequenceRandomSeed
+= c
;
588 r250_init(sequenceRandomSeed
);
593 lastAmbiguityPos
= (unsigned int)-2;
595 for (i
=0; i
<textLength
; i
++) {
599 if (maskLowerCase
&& c
>= 'a' && c
<= 'z') {
600 c
= dnaChar
[lowercaseDnaCharIndex
];
602 if (ambiguityCount
[charMap
[c
]] == 1) {
603 buffer
[numCharInBuffer
] = c
;
606 if (ambiguityCount
[c
] > 0) {
607 buffer
[numCharInBuffer
] = dnaChar
[ambiguityMatch
[c
][r250() % ambiguityCount
[c
]]];
609 buffer
[numCharInBuffer
] = dnaChar
[ambiguityMatch
[c
][r250() % ALPHABET_SIZE
]];
611 if (i
== lastAmbiguityPos
+ 1 && c
== (char)tempAmbiguity
[numAmbiguity
].symbol
) {
612 tempAmbiguity
[numAmbiguity
].rightOfEndPos
++;
615 if (numAmbiguity
>= ambiguityAllocated
) {
616 tempAmbiguity
= (Ambiguity
*) MMUnitReallocate(tempAmbiguity
, sizeof(Ambiguity
) * ambiguityAllocated
* 2, sizeof(Ambiguity
) * ambiguityAllocated
);
617 ambiguityAllocated
*= 2;
619 tempAmbiguity
[numAmbiguity
].startPos
= i
;
620 tempAmbiguity
[numAmbiguity
].rightOfEndPos
= i
+1;
621 tempAmbiguity
[numAmbiguity
].symbol
= c
;
623 lastAmbiguityPos
= i
;
626 if (numCharInBuffer
>= PACKED_BUFFER_SIZE
) {
627 ConvertTextToWordPacked(buffer
, hsp
->packedDNA
+ numOfConversion
* PACKED_BUFFER_SIZE
/ CHAR_PER_WORD
, charMap
, 4, PACKED_BUFFER_SIZE
);
632 if (numCharInBuffer
> 0) {
633 ConvertTextToWordPacked(buffer
, hsp
->packedDNA
+ numOfConversion
* PACKED_BUFFER_SIZE
/ CHAR_PER_WORD
, charMap
, 4, numCharInBuffer
);
637 hsp
->numOfAmbiguity
= numAmbiguity
;
638 hsp
->ambiguity
= (Ambiguity
*) MMPoolDispatch(mmPool
, (hsp
->numOfAmbiguity
+ 2) * sizeof(Ambiguity
));
639 if (hsp
->numOfAmbiguity
> 0) {
640 memcpy(hsp
->ambiguity
+ 1, tempAmbiguity
, hsp
->numOfAmbiguity
* sizeof(Ambiguity
));
642 hsp
->ambiguity
[0].startPos
= 0;
643 hsp
->ambiguity
[0].rightOfEndPos
= 0;
644 hsp
->ambiguity
[0].symbol
= 0;
645 hsp
->ambiguity
[hsp
->numOfAmbiguity
+1].startPos
= UINT32_MAX
;
646 hsp
->ambiguity
[hsp
->numOfAmbiguity
+1].rightOfEndPos
= UINT32_MAX
;
647 hsp
->ambiguity
[hsp
->numOfAmbiguity
+1].symbol
= 0;
649 hsp
->seqOffset
[0].firstAmbiguityIndex
= 1;
650 hsp
->seqOffset
[0].lastAmbiguityIndex
= numAmbiguity
;
652 MMUnitFree(tempAmbiguity
, sizeof(Ambiguity
) * ambiguityAllocated
);
658 void HSPFree(MMPool
*mmPool
, HSP
*hsp
, const unsigned int trailerBufferInWord
) {
660 if (hsp
->packedDNA
!= NULL
) {
661 DNAFreePacked(hsp
->packedDNA
, hsp
->dnaLength
, trailerBufferInWord
);
663 MMUnitFree(hsp
->seqOffset
, (hsp
->numOfSeq
+1) * sizeof(SeqOffset
));
664 MMUnitFree(hsp
->annotation
, (hsp
->numOfSeq
+1) * sizeof(Annotation
));
665 MMUnitFree(hsp
->ambiguity
, (hsp
->numOfAmbiguity
+2) * sizeof(Ambiguity
));
666 MMUnitFree(hsp
->seqActualOffset
, (hsp
->numOfSeq
+1) * sizeof(SeqActualOffset
));
667 MMUnitFree(hsp
->ambiguityMap
, (hsp
->numOfGridEntry
) * sizeof(unsigned short));
668 MMUnitFree(hsp
->translate
, (hsp
->numOfSeq
+hsp
->numOfRemovedSegment
) * sizeof(Translate
));
670 MMPoolReturn(mmPool
, hsp
, sizeof(hsp
));
674 unsigned int HSPParseFASTAToPacked(const char* FASTAFileName
, const char* annotationFileName
, const char* packedDNAFileName
, const char* ambiguityFileName
, const char * translateFileName
,
675 const unsigned int FASTARandomSeed
, const int maskLowerCase
) {
677 FILE *FASTAFile
, *annotationFile
, *packedDNAFile
, *ambiguityFile
, * translateFile
;
678 Annotation
*annotation
;
679 SeqOffset
*seqOffset
;
680 SeqActualOffset
*seqActualOffset
;
681 Ambiguity
*ambiguity
;
682 int annotationAllocated
= 256;
683 int ambiguityAllocated
= 256;
684 unsigned long long totalDiscardedChar
= 0;
689 unsigned long long lastAmbiguityPos
;
690 unsigned long long numChar
, numCharInBuffer
, totalNumChar
, totalActualNumChar
;
691 unsigned int sequenceRandomSeed
;
693 unsigned char charMap
[255];
695 unsigned char buffer
[PACKED_BUFFER_SIZE
];
696 unsigned char packedBuffer
[PACKED_BUFFER_SIZE
/ CHAR_PER_BYTE
];
698 FASTAFile
= (FILE*)fopen64(FASTAFileName
, "r");
699 if (FASTAFile
== NULL
) {
700 fprintf(stderr
, "ParseFASTToPacked() : Cannot open FASTAFileName!\n");
704 annotationFile
= (FILE*)fopen64(annotationFileName
, "w");
705 if (annotationFile
== NULL
) {
706 fprintf(stderr
, "ParseFASTToPacked() : Cannot open annotationFileName!\n");
710 packedDNAFile
= (FILE*)fopen64(packedDNAFileName
, "wb");
711 if (packedDNAFile
== NULL
) {
712 fprintf(stderr
, "ParseFASTToPacked() : Cannot open packedDNAFileName!\n");
716 ambiguityFile
= (FILE*)fopen64(ambiguityFileName
, "w");
717 if (ambiguityFile
== NULL
) {
718 fprintf(stderr
, "ParseFASTToPacked() : Cannot open ambiguityFileName!\n");
722 translateFile
= (FILE*)fopen64(translateFileName
, "w");
723 if (translateFile
== NULL
) {
724 fprintf(stderr
, "ParseFASTToPacked() : Cannot open translateFileName!\n");
728 HSPFillCharMap(charMap
);
730 c
= (char)getc(FASTAFile
);
732 fprintf(stderr
, "ParseFASTToPacked() : FASTA file does not begin with '>'!\n");
737 totalActualNumChar
= 0;
742 annotation
= (Annotation
*) MMUnitAllocate(sizeof(Annotation
) * annotationAllocated
);
743 seqOffset
= (SeqOffset
*) MMUnitAllocate(sizeof(SeqOffset
) * annotationAllocated
);
744 ambiguity
= (Ambiguity
*) MMUnitAllocate(sizeof(Ambiguity
) * ambiguityAllocated
);
745 seqActualOffset
= (SeqActualOffset
*) MMUnitAllocate(sizeof(SeqActualOffset
) * annotationAllocated
);
747 while (!feof(FASTAFile
)) {
750 if (numSeq
>= annotationAllocated
) {
751 annotation
= (Annotation
*) MMUnitReallocate(annotation
, sizeof(Annotation
) * annotationAllocated
* 2, sizeof(Annotation
) * annotationAllocated
);
752 seqOffset
= (SeqOffset
*) MMUnitReallocate(seqOffset
, sizeof(SeqOffset
) * annotationAllocated
* 2, sizeof(SeqOffset
) * annotationAllocated
);
753 seqActualOffset
= (SeqActualOffset
*) MMUnitReallocate(seqActualOffset
, sizeof(SeqActualOffset
) * annotationAllocated
* 2, sizeof(SeqActualOffset
) * annotationAllocated
);
754 annotationAllocated
*= 2;
757 annotation
[numSeq
].gi
= 0;
759 c
= (char)getc(FASTAFile
);
760 while (!feof(FASTAFile
) && c
!= '\n') {
761 if (numChar
< MAX_SEQ_NAME_LENGTH
) {
762 annotation
[numSeq
].text
[numChar
] = c
;
765 if (numChar
== 3 && annotation
[numSeq
].text
[0] == 'g'
766 && annotation
[numSeq
].text
[1] == 'i'
767 && annotation
[numSeq
].text
[2] == '|') {
768 fscanf(FASTAFile
, "%u|", &(annotation
[numSeq
].gi
));
771 c
= (char)getc(FASTAFile
);
773 annotation
[numSeq
].text
[numChar
] = '\0';
775 // Set random seed for the sequence
776 sequenceRandomSeed
= FASTARandomSeed
;
777 if (annotation
[numSeq
].gi
> 0) {
778 sequenceRandomSeed
+= annotation
[numSeq
].gi
;
780 for (i
=0; i
<(int)numChar
; i
++) {
781 c
= annotation
[numSeq
].text
[i
];
782 sequenceRandomSeed
+= c
;
785 r250_init(sequenceRandomSeed
);
787 seqOffset
[numSeq
].startPos
= totalNumChar
;
788 seqActualOffset
[numSeq
].startPos
= totalActualNumChar
;
790 lastAmbiguityPos
= (unsigned int)-2;
792 c
= (char)getc(FASTAFile
);
793 unsigned long long nCount
=0;
794 while (!feof(FASTAFile
) && c
!= '>') {
796 if (c
!= '\n' && c
!= '\t') {
797 if (maskLowerCase
&& c
>= 'a' && c
<= 'z') {
798 c
= dnaChar
[lowercaseDnaCharIndex
];
801 if (charMap
[c
] != nonMatchDnaCharIndex
) {
802 if (ambiguityCount
[charMap
[c
]] == 1) {
803 buffer
[numCharInBuffer
] = c
;
805 //Char != A C G T...can be anything else on Earth!!
806 c
= (char)getc(FASTAFile
);
807 unsigned long long nCount
=1;
808 while (!feof(FASTAFile
) && c
!= '>') {
809 if (charMap
[c
] != nonMatchDnaCharIndex
) {
810 if (ambiguityCount
[charMap
[c
]] != 1) {
816 c
= (char)getc(FASTAFile
);
820 for (k
=0;k
<nCount
;k
++) {
821 buffer
[numCharInBuffer
] = 'G'; //Substitute rest of the 'N' with 'G'
823 if (numCharInBuffer
>= PACKED_BUFFER_SIZE
) {
824 ConvertTextToBytePacked(buffer
, packedBuffer
, charMap
, ALPHABET_SIZE
, PACKED_BUFFER_SIZE
);
825 fwrite(packedBuffer
, 1, PACKED_BUFFER_SIZE
/ CHAR_PER_BYTE
, packedDNAFile
);
831 totalDiscardedChar
+=nCount
;
834 if (numAmbiguity
>= ambiguityAllocated
) {
835 ambiguity
= (Ambiguity
*) MMUnitReallocate(ambiguity
, sizeof(Ambiguity
) * ambiguityAllocated
* 2, sizeof(Ambiguity
) * ambiguityAllocated
);
836 ambiguityAllocated
*= 2;
838 ambiguity
[numAmbiguity
].startPos
= totalNumChar
+ numChar
;
839 ambiguity
[numAmbiguity
].rightOfEndPos
= totalNumChar
+ numChar
+ totalDiscardedChar
- 1;
840 ambiguity
[numAmbiguity
].symbol
= 0;
845 if (numCharInBuffer
>= PACKED_BUFFER_SIZE
) {
846 ConvertTextToBytePacked(buffer
, packedBuffer
, charMap
, ALPHABET_SIZE
, PACKED_BUFFER_SIZE
);
847 fwrite(packedBuffer
, 1, PACKED_BUFFER_SIZE
/ CHAR_PER_BYTE
, packedDNAFile
);
853 c
= (char)getc(FASTAFile
);
855 seqOffset
[numSeq
].endPos
= totalNumChar
+ numChar
- 1;
856 seqOffset
[numSeq
].firstAmbiguityIndex
= numAmbiguity
+ 1;
857 seqActualOffset
[numSeq
].endPos
= totalNumChar
+ totalDiscardedChar
+ numChar
- 1;
858 totalNumChar
+= numChar
;
859 totalActualNumChar
= totalNumChar
+ totalDiscardedChar
;
866 // Finalize packed DNA file
870 if (numCharInBuffer
> 0) {
871 ConvertTextToBytePacked(buffer
, packedBuffer
, charMap
, ALPHABET_SIZE
, numCharInBuffer
);
872 fwrite(packedBuffer
, 1, (numCharInBuffer
+ CHAR_PER_BYTE
-1) / CHAR_PER_BYTE
, packedDNAFile
);
875 if (totalNumChar
% CHAR_PER_BYTE
== 0) {
877 fwrite(&c
, 1, 1, packedDNAFile
);
879 c
= (char)(totalNumChar
% CHAR_PER_BYTE
);
880 fwrite(&c
, 1, 1, packedDNAFile
);
882 fclose(packedDNAFile
);
884 //Compute Translation Map
885 unsigned int gridEntries
= (totalNumChar
/GRID_SAMPLING_FACTOR
)+1;
886 unsigned short *ambiguityMap
= (unsigned short*)malloc(sizeof(unsigned short)*gridEntries
);
887 Translate
* translate
= (Translate
*) malloc(sizeof(Translate
)*(numAmbiguity
+numSeq
));
888 unsigned int j
=0,k
=0;
889 for (j
=0;j
<gridEntries
;j
++) {
893 while(j
<numAmbiguity
&& k
<numSeq
) {
894 if (ambiguity
[j
].startPos
<seqOffset
[k
].startPos
) {
895 //Write Ambiguous Segment Flag
896 unsigned long long index
= ambiguity
[j
].startPos
;
897 ambiguityMap
[index
/GRID_SAMPLING_FACTOR
]+=1;
898 translate
[i
].startPos
=ambiguity
[j
].startPos
;
899 translate
[i
].chrID
=k
;
902 unsigned long long correction
= ambiguity
[seqOffset
[k
-2].firstAmbiguityIndex
-1].rightOfEndPos
-
903 ambiguity
[seqOffset
[k
-2].firstAmbiguityIndex
-1].startPos
+ 1;
904 translate
[i
].correction
=(seqOffset
[k
-1].startPos
)+
906 (ambiguity
[j
].rightOfEndPos
- ambiguity
[j
].startPos
+1) - 1;
908 translate
[i
].correction
=-ambiguity
[j
].rightOfEndPos
+ ambiguity
[j
].startPos
-2;
912 } else if (ambiguity
[j
].startPos
>seqOffset
[k
].startPos
) {
913 //Write Artificial Chromosome Start Flag
914 unsigned long long index
= seqOffset
[k
].startPos
;
915 ambiguityMap
[index
/GRID_SAMPLING_FACTOR
]+=1;
916 translate
[i
].startPos
=seqOffset
[k
].startPos
;
917 translate
[i
].chrID
=k
+1;
918 translate
[i
].correction
=seqOffset
[k
].startPos
-1;
925 while (j
<numAmbiguity
) {
926 unsigned long long index
= ambiguity
[j
].startPos
;
927 ambiguityMap
[index
/GRID_SAMPLING_FACTOR
]+=1;
928 translate
[i
].startPos
=ambiguity
[j
].startPos
;
929 translate
[i
].chrID
=k
;
932 unsigned long long correction
= ambiguity
[seqOffset
[k
-2].firstAmbiguityIndex
-1].rightOfEndPos
-
933 ambiguity
[seqOffset
[k
-2].firstAmbiguityIndex
-1].startPos
+ 1;
934 translate
[i
].correction
=(seqOffset
[k
-1].startPos
)+
936 (ambiguity
[j
].rightOfEndPos
- ambiguity
[j
].startPos
+1) - 1;
938 translate
[i
].correction
=-ambiguity
[j
].rightOfEndPos
+ ambiguity
[j
].startPos
-2;
940 //printf("%u\tAMB%u\tSP:%u\tCHR:%u\tCORRECT:%u\n",i,j,translate[i].startPos,translate[i].chrID,translate[i].correction);
944 unsigned long long index
= seqOffset
[k
].startPos
;
945 ambiguityMap
[index
/GRID_SAMPLING_FACTOR
]+=1;
946 translate
[i
].startPos
=seqOffset
[k
].startPos
;
947 translate
[i
].chrID
=k
+1;
948 translate
[i
].correction
=seqOffset
[k
].startPos
-1;
949 //printf("%u\tCHR%u\tSP:%u\tCHR:%u\tCORRECT:%u\n",i,k,translate[i].startPos,translate[i].chrID,translate[i].correction);
952 while (i
<numAmbiguity
+numSeq
) {
954 translate
[i
].startPos
=translate
[i
-1].startPos
;
955 translate
[i
].chrID
=translate
[i
-1].chrID
;
956 translate
[i
].correction
=translate
[i
-1].correction
;
960 for (j
=1;j
<gridEntries
;j
++) {
961 ambiguityMap
[j
]+=ambiguityMap
[j
-1];
963 for (j
=0;j
<gridEntries
;j
++) {
967 // Output annotation file
968 fprintf(annotationFile
, "%llu %u %u\n", totalNumChar
, numSeq
, FASTARandomSeed
);
969 for (i
=0; i
<numSeq
; i
++) {
970 fprintf(annotationFile
, "%u %s\n", annotation
[i
].gi
, annotation
[i
].text
);
971 fprintf(annotationFile
, "%llu %llu 0\n", seqOffset
[i
].startPos
, seqOffset
[i
].endPos
- seqOffset
[i
].startPos
+ 1);
972 // output number of ambiguity
974 fprintf(annotationFile, "%u\n", seqOffset[i].firstAmbiguityIndex - seqOffset[i-1].firstAmbiguityIndex);
976 fprintf(annotationFile, "%u\n", seqOffset[i].firstAmbiguityIndex);
979 fclose(annotationFile
);
981 MMUnitFree(annotation
, sizeof(Annotation
) * annotationAllocated
);
982 MMUnitFree(seqOffset
, sizeof(SeqOffset
) * annotationAllocated
);
984 // Output ambiguity file
985 fprintf(ambiguityFile
, "%llu %u %u\n", totalNumChar
, numSeq
, 0);//numAmbiguity);
986 //for (i=0; i<numAmbiguity; i++) {
987 // The ambiguity for the dummy length is visible
988 //fprintf(ambiguityFile, "%u %u %c\n", ambiguity[i].startPos, ambiguity[i].rightOfEndPos - ambiguity[i].startPos + 1, dnaChar[ambiguity[i].symbol]);
990 fclose(ambiguityFile
);
992 MMUnitFree(ambiguity
, ambiguityAllocated
* sizeof(Ambiguity
));
994 fprintf(translateFile
, "%llu %u %u %u\n", totalNumChar
, numSeq
, numAmbiguity
, gridEntries
);
995 for (j
=0; j
<gridEntries
; j
++) {
996 // The ambiguity for the dummy length is visible
997 fprintf(translateFile
, "%u\n", ambiguityMap
[j
]);
999 for (j
=0; j
<numAmbiguity
+numSeq
; j
++) {
1000 // The ambiguity for the dummy length is visible
1001 fprintf(translateFile
, "%llu %u %llu\n", translate
[j
].startPos
,translate
[j
].chrID
, translate
[j
].correction
);
1003 for (j
=0; j
<numSeq
; j
++) {
1004 fprintf(translateFile
, "%llu %llu\n", seqActualOffset
[j
].startPos
, seqActualOffset
[j
].endPos
- seqActualOffset
[j
].startPos
+ 1);
1006 fclose(translateFile
);
1010 MMUnitFree(seqActualOffset
, sizeof(SeqActualOffset
) * annotationAllocated
);
1016 unsigned int HSPPackedToFASTA(const char* FASTAFileName
, const char* annotationFileName
, const char* packedDNAFileName
, const char* ambiguityFileName
, const char* translateFileName
) {
1021 hsp
= HSPLoad(NULL
, packedDNAFileName
, annotationFileName
, ambiguityFileName
,translateFileName
, 1);
1023 // Generate FASTA from packed
1024 FASTAFile
= (FILE*)fopen64(FASTAFileName
, "w");
1025 if (FASTAFile
== NULL
) {
1026 fprintf(stderr
, "Cannot open FASTA file!\n");
1031 fprintf(stderr
, "HSPPackedToFASTA(): Function not complete!\n");
1035 HSPFree(NULL
, hsp
, 1);
1041 // The hit must be an mismatch hit anchored on the left most of the pattern
1043 unsigned int HSPShortPattern1GapRightExt(const unsigned LONG *packedDNA, const unsigned char *pattern,
1044 const unsigned int patternLength, const unsigned int textLength,
1045 HitList* __restrict hitList, const unsigned int numHit,
1046 const unsigned int maxSpaceInGap, const unsigned int *maxMismatch) {
1048 unsigned int anchorLength;
1049 unsigned int maxSpaceInGap;
1050 unsigned int maxMismatch[MAX_SP_SPACE_IN_GAP];
1052 unsigned LONG anchorText, anchorPattern;
1053 unsigned LONG nonAnchorText, nonAnchorPattern;
1054 unsigned LONG nonAnchorTextReversed, nonAnchorPatternReversed;
1055 unsigned int pos, shift, index;
1056 unsigned maxRightShift;
1057 unsigned int anchorNumError;
1058 unsigned int nonAnchorError[MAX_SP_MISMATCH];
1059 unsigned int leftShiftError[MAX_SP_MISMATCH];
1060 unsigned int rightShiftError[MAX_SP_MISMATCH];
1063 __m128i m0, m1, m3, m0f, m00ff, m0000ffff;
1064 __m128i ts, tn, p, mb, mb1, mp, mneg, mb31, md;
1066 unsigned LONG ALIGN_16 temp[2];
1067 unsigned int numExtHit;
1068 const unsigned char *thisPattern;
1071 const static int bitPos[32] = { 0, 16, 14, 1, 30, 7, 12, 17, 15, 11, 10, 23, 28, 24, 2, 4,
1072 31, 29, 22, 27, 26, 25, 8, 19, 13, 6, 9, 3, 21, 18, 5, 20 };
1076 if (patternLength >= MAX_SP_NON_ANCHOR_LENGTH) {
1077 anchorLength = patternLength - MAX_SP_NON_ANCHOR_LENGTH;
1082 // Determine maximum no. of spaces and maximum mismatch corresponding to the no. of spaces
1083 if (maxPenalty >= gapOpenScore) {
1084 maxSpaceInGap = (maxPenalty - gapOpenScore) / gapExtendScore;
1089 for (i=0; i<MAX_SP_SPACE_IN_GAP; i++) {
1092 for (i=0; i<maxSpaceInGap; i++) {
1093 maxMismatch[i] = (maxPenalty - gapOpenScore - gapOpenScore * i) / (matchScore - mismatchScore);
1098 for (h=0; h<numHit; h++) {
1100 thisPattern = pattern + patternLength * hitList[h].info;
1101 if (hitList[h].posText + patternLength <= textLength) {
1102 maxRightShift = textLength - hitList[h].posText - patternLength;
1103 if (maxRightShift > MAX_SP_SPACE_IN_GAP) {
1104 maxRightShift = MAX_SP_SPACE_IN_GAP;
1113 for (i=0; i<anchorLength; i++) {
1114 anchorPattern <<= 2;
1115 anchorPattern |= thisPattern[i];
1118 nonAnchorPattern = 0;
1119 for (; i<patternLength; i++) {
1120 nonAnchorPattern <<= 2;
1121 nonAnchorPattern |= thisPattern[i];
1124 nonAnchorPatternReversed = 0; // reverse pattern
1125 for (i=patternLength; --i;) { // from patternLength - 1 to 0
1126 nonAnchorPatternReversed <<= 2;
1127 nonAnchorPatternReversed |= thisPattern[i];
1130 shift = hitList[h].posText % CHAR_PER_64;
1131 pos = hitList[h].posText / CHAR_PER_64;
1133 anchorText = (packedDNA[pos] << shift) + (packedDNA[pos + 1] >> (64 - shift));
1135 anchorText = packedDNA[pos / CHAR_PER_64];
1138 shift = (hitList[h].posText + anchorLength - MAX_SP_SPACE_IN_GAP) % CHAR_PER_64;
1139 pos = (hitList[h].posText + anchorLength - MAX_SP_SPACE_IN_GAP) / CHAR_PER_64;
1141 nonAnchorText = (packedDNA[pos] << shift) + (packedDNA[pos + 1] >> (64 - shift));
1143 nonAnchorText = packedDNA[pos / CHAR_PER_64];
1146 // Reverse nonAnchorText and nonAnchorPattern
1147 temp[0] = nonAnchorText;
1148 temp[1] = nonAnchorPattern;
1149 t = _mm_load_si128((__m128i*)temp);
1150 m3 = _mm_set1_epi32(0x33333333);
1151 m0f = _mm_set1_epi32(0x0f0f0f0f);
1152 m00ff = _mm_set1_epi32(0x00ff00ff);
1153 m0000ffff = _mm_set1_epi32(0x0000ffff);
1155 ts = _mm_srli_epi64(t, 2);
1156 ts = _mm_and_si128(ts, m3);
1157 tn = _mm_and_si128(t, m3);
1158 tn = _mm_srli_epi64(tn, 2);
1159 t = _mm_or_si128(ts, tn);
1161 ts = _mm_srli_epi64(t, 4);
1162 ts = _mm_and_si128(ts, m0f);
1163 tn = _mm_and_si128(t, m0f);
1164 tn = _mm_srli_epi64(tn, 4);
1165 t = _mm_or_si128(ts, tn);
1167 ts = _mm_srli_epi64(t, 8);
1168 ts = _mm_and_si128(ts, m00ff);
1169 tn = _mm_and_si128(t, m00ff);
1170 tn = _mm_srli_epi64(tn, 8);
1171 t = _mm_or_si128(ts, tn);
1173 ts = _mm_srli_epi64(t, 16);
1174 ts = _mm_and_si128(ts, m0000ffff);
1175 tn = _mm_and_si128(t, m0000ffff);
1176 tn = _mm_srli_epi64(tn, 16);
1177 t = _mm_or_si128(ts, tn);
1179 ts = _mm_srli_epi64(t, 32);
1180 tn = _mm_srli_epi64(tn, 32);
1181 t = _mm_or_si128(ts, tn);
1183 _mm_store_si128((__m128i*)temp, t);
1184 nonAnchorTextReversed = temp[0];
1185 nonAnchorPatternReversed = temp[1];
1187 m1 = _mm_set1_epi32(0xFFFFFFFF);
1188 m0 = _mm_setzero_si128();
1190 // Find the mismatches for anchor and non-anchor no-shift
1191 temp[0] = anchorText;
1192 temp[1] = nonAnchorTextReversed;
1193 t = _mm_load_si128((__m128i*)temp);
1194 temp[0] = anchorPattern;
1195 temp[1] = nonAnchorPatternReversed;
1196 p = _mm_load_si128((__m128i*)temp);
1197 mb = _mm_and_si128(t, p);
1198 mb1 = _mm_srli_epi64(mb, 1);
1199 mb = _mm_and_si128(mb, mb1);
1200 mb = _mm_andnot_si128(mb, m1);
1202 mneg = _mm_sub_epi64(m0, mb);
1203 mb = _mm_and_si128(mb, mneg);
1205 mb31 = _mm_srli_epi64(mb, 31);
1206 mb = _mm_and_si128(mb, mb31);
1208 md = _mm_set1_epi32(0x077CB531);
1209 mp = _mm_mul_epu32(mb, md);
1210 mp = _mm_srli_epi32(mp, 27);
1212 index = _mm_extract_epi16(mp, 0);
1213 anchorNumError += (index > 0);
1215 index = _mm_extract_epi16(mp, 4);
1216 nonAnchorError[0] = bitPos[index];
1218 // Continue with error 2 and 3
1221 // Find the mismatches for non-anchor 1-shift
1223 // Find the mismatches for non-anchor 2-shift
1225 // Find the mismatches for non-anchor 3-shift
1230 // Find mismatches in non-anchor non-shift comparison
1232 // Find mismatches in non-anchor left-shift comparison
1234 // Find mismatches in non-anchor right-shift comparison
1236 // Check spaces and mismatches
1240 // reverse the text and the pattern
1242 // v = (v64 >> 31) | v64
1245 // bitPos[((v & -v) * 0x077CB531UL) >> 27];
1252 unsigned int HSPUngappedExtension(const unsigned int *packedDNA
, const unsigned int *packedKey
, const unsigned int *packedMask
, const unsigned int queryPatternLength
,
1253 const unsigned int subPattenLength
, const unsigned long long textLength
,
1254 HitListWithPosQuery
* __restrict hitList
, const unsigned long long numberOfHit
,
1255 const HSPUngappedExtLookupTable
*ungappedLookupTable
,
1256 const int cutoffScore
, const int dropoffScore
) {
1258 static const unsigned int oddBitMask
= 0x55555555;
1259 static const unsigned int evenBitMask
= 0xAAAAAAAA;
1261 unsigned long long hitProcessed
, numberOfUngappedHit
;
1262 unsigned long long textShift
, queryShift
;
1263 unsigned long long posText
, posQuery
;
1264 unsigned long long queryWordPos
, textWordPos
;
1265 unsigned int matchVector32
, matchVector16
;
1266 int numberOfValidChar
;
1267 unsigned long long currentPos
;
1268 int currentScore
, forwardMaxScore
, backwardMaxScore
;
1269 unsigned long long forwardMaxScorePos
, backwardMaxScorePos
;
1272 numberOfUngappedHit
= 0;
1274 while (hitProcessed
< numberOfHit
) {
1276 // Move position to mid-point of hit
1277 posText
= hitList
[hitProcessed
].posText
+ subPattenLength
/ 2;
1278 posQuery
= hitList
[hitProcessed
].posQuery
+ subPattenLength
/ 2;
1280 textShift
= (posText
% CHAR_PER_WORD
) * BIT_PER_CHAR
;
1281 queryShift
= (posQuery
% CHAR_PER_WORD
) * BIT_PER_CHAR
;
1285 textWordPos
= posText
/ CHAR_PER_WORD
;
1286 queryWordPos
= posQuery
/ CHAR_PER_WORD
;
1287 numberOfValidChar
= min(queryPatternLength
- posQuery
, textLength
- posText
);
1289 forwardMaxScore
= 0;
1290 forwardMaxScorePos
= posQuery
;
1291 currentPos
= posQuery
;
1294 while (forwardMaxScore
<= currentScore
+ dropoffScore
&& numberOfValidChar
> 0) {
1296 matchVector32
= ((packedKey
[queryWordPos
] << queryShift
) | ((packedKey
[queryWordPos
+1] >> (BITS_IN_WORD
- queryShift
)) * (queryShift
> 0)))
1297 ^ ((packedDNA
[textWordPos
] << textShift
) | ((packedDNA
[textWordPos
+1] >> (BITS_IN_WORD
- textShift
)) * (textShift
> 0)));
1299 matchVector32
|= (packedMask
[queryWordPos
] << queryShift
) | ((packedMask
[queryWordPos
+1] >> (BITS_IN_WORD
- queryShift
)) * (queryShift
> 0));
1301 if (numberOfValidChar
< CHAR_PER_WORD
) {
1302 // mask the invalid character as mismatch
1303 matchVector32
|= ALL_ONE_MASK
>> (numberOfValidChar
* BIT_PER_CHAR
);
1306 // Process vector to even bits for forward extend
1307 matchVector32
= (matchVector32
| (matchVector32
<< 1)) & evenBitMask
;
1308 matchVector16
= (ungappedLookupTable
[matchVector32
>> 16].matchMismatchBitVector
<< 8) |
1309 ungappedLookupTable
[matchVector32
& 0xFFFF].matchMismatchBitVector
;
1311 if (currentScore
+ ungappedLookupTable
[matchVector16
].maxScore
> forwardMaxScore
) {
1312 forwardMaxScore
= currentScore
+ ungappedLookupTable
[matchVector16
].maxScore
;
1313 forwardMaxScorePos
= currentPos
+ ungappedLookupTable
[matchVector16
].maxScorePos
;
1315 currentScore
+= ungappedLookupTable
[matchVector16
].finalScore
;
1316 currentPos
+= CHAR_PER_WORD
;
1320 numberOfValidChar
-= CHAR_PER_WORD
;
1325 // Backward extend by word
1326 textWordPos
= posText
/ CHAR_PER_WORD
;
1327 queryWordPos
= posQuery
/ CHAR_PER_WORD
;
1328 numberOfValidChar
= min(posQuery
, posText
);
1330 backwardMaxScore
= 0;
1331 backwardMaxScorePos
= posQuery
;
1332 currentPos
= posQuery
;
1335 while (backwardMaxScore
<= currentScore
+ dropoffScore
&& numberOfValidChar
> 0) {
1337 if (numberOfValidChar
>= CHAR_PER_WORD
) {
1338 matchVector32
= ((packedKey
[queryWordPos
-1] << queryShift
) | ((packedKey
[queryWordPos
] >> (BITS_IN_WORD
- queryShift
)) * (queryShift
> 0)))
1339 ^ ((packedDNA
[textWordPos
-1] << textShift
) | ((packedDNA
[textWordPos
] >> (BITS_IN_WORD
- textShift
)) * (textShift
> 0)));
1340 matchVector32
|= (packedMask
[queryWordPos
-1] << queryShift
) | ((packedMask
[queryWordPos
] >> (BITS_IN_WORD
- queryShift
)) * (queryShift
> 0));
1342 if (textWordPos
> 0) {
1343 matchVector32
= ((packedKey
[queryWordPos
] >> (BITS_IN_WORD
- queryShift
)) * (queryShift
> 0))
1344 ^ ((packedDNA
[textWordPos
-1] << textShift
) | ((packedDNA
[textWordPos
] >> (BITS_IN_WORD
- textShift
)) * (textShift
> 0)));
1345 matchVector32
|= (packedMask
[queryWordPos
] >> (BITS_IN_WORD
- queryShift
)) * (queryShift
> 0);
1346 } else if (queryWordPos
> 0) {
1347 matchVector32
= ((packedKey
[queryWordPos
-1] << queryShift
) | ((packedKey
[queryWordPos
] >> (BITS_IN_WORD
- queryShift
)) * (queryShift
> 0)))
1348 ^ ((packedDNA
[textWordPos
] >> (BITS_IN_WORD
- textShift
)) * (textShift
> 0));
1349 matchVector32
|= (packedMask
[queryWordPos
-1] << queryShift
) | ((packedMask
[queryWordPos
] >> (BITS_IN_WORD
- queryShift
)) * (queryShift
> 0));
1351 matchVector32
= ((packedKey
[queryWordPos
] >> (BITS_IN_WORD
- queryShift
)) * (queryShift
> 0))
1352 ^ ((packedDNA
[textWordPos
] >> (BITS_IN_WORD
- textShift
)) * (textShift
> 0));
1353 matchVector32
|= (packedMask
[queryWordPos
] >> (BITS_IN_WORD
- queryShift
)) * (queryShift
> 0);
1355 matchVector32
|= ALL_ONE_MASK
<< (numberOfValidChar
* BIT_PER_CHAR
);
1358 // Process vector to odd bits for backward extend
1359 matchVector32
= (matchVector32
| (matchVector32
>> 1)) & oddBitMask
;
1360 matchVector16
= ungappedLookupTable
[matchVector32
>> 16].matchMismatchBitVector
|
1361 (ungappedLookupTable
[matchVector32
& 0xFFFF].matchMismatchBitVector
<< 8); // matchVecter16 is reverse of packed odd bits
1363 if (currentScore
+ ungappedLookupTable
[matchVector16
].maxScore
> backwardMaxScore
) {
1364 backwardMaxScore
= currentScore
+ ungappedLookupTable
[matchVector16
].maxScore
;
1365 backwardMaxScorePos
= currentPos
- ungappedLookupTable
[matchVector16
].maxScorePos
;
1367 currentScore
+= ungappedLookupTable
[matchVector16
].finalScore
;
1368 currentPos
-= CHAR_PER_WORD
;
1372 numberOfValidChar
-= CHAR_PER_WORD
;
1376 // Check cut off score
1377 if (backwardMaxScore
+ forwardMaxScore
>= cutoffScore
) {
1378 hitList
[numberOfUngappedHit
].posQuery
= (forwardMaxScorePos
+ backwardMaxScorePos
) / 2;
1379 hitList
[numberOfUngappedHit
].posText
= posText
- posQuery
+ hitList
[numberOfUngappedHit
].posQuery
;
1380 numberOfUngappedHit
++;
1387 return numberOfUngappedHit
;
1391 int HSPGappedExtension(const unsigned int *packedDNA
, const unsigned long long textLength
, const unsigned char *convertedKey
, const int queryPatternLength
,
1392 const HitListWithPosQuery
* hitList
, const int numberOfHit
,
1393 GappedHitList
* __restrict gappedHitList
,
1395 const int matchScore
, const int mismatchScore
,
1396 const int gapOpenScore
, const int gapExtendScore
,
1397 const int cutoffScore
, const int dropoffScore
) {
1401 int* __restrict I
; // insert and delete wrt query
1402 int D
; // insert and delete wrt query
1403 int scoringMatrix
[DNA_CHAR_SIZE
][DNA_CHAR_SIZE
];
1405 int hitProcessed
, numberOfGappedHit
;
1411 unsigned long long textOffset
;
1412 unsigned long long textDecodePos
;
1415 int lastStartPos
, startPos
, nextStartPos
;
1416 int endPos
, nextEndPos
;
1417 unsigned long long textPos
;
1421 int forwardMaxScore
, backwardMaxScore
;
1422 unsigned long long forwardMaxScorePosQuery
, forwardMaxScorePosText
;
1423 unsigned long long backwardMaxScorePosQuery
, backwardMaxScorePosText
;
1425 int dpCellAllocated
;
1427 dpCellAllocated
= min(queryPatternLength
+ 1, MAX_ALIGNMENT_LENGTH
);
1429 // allocate working memory
1430 B
= (int*) MMPoolDispatch(mmPool
, dpCellAllocated
* sizeof(int));
1431 I
= (int*) MMPoolDispatch(mmPool
, dpCellAllocated
* sizeof(int));
1433 HSPFillScoringMatrix(scoringMatrix
, matchScore
, mismatchScore
, 0);
1436 numberOfGappedHit
= 0;
1438 maxLengthOfGap
= (dropoffScore
+ gapOpenScore
) / (-gapExtendScore
);
1440 while (hitProcessed
< numberOfHit
) {
1442 queryOffset
= hitList
[hitProcessed
].posQuery
;
1443 textOffset
= hitList
[hitProcessed
].posText
;
1447 textDecodePos
= textOffset
;
1448 if (textDecodePos
% CHAR_PER_WORD
> 0) {
1449 // decode text to the next word boundary
1450 charToProcess
= CHAR_PER_WORD
- (textDecodePos
% CHAR_PER_WORD
);
1451 c
= packedDNA
[textDecodePos
/ CHAR_PER_WORD
] << (BITS_IN_WORD
- charToProcess
* BIT_PER_CHAR
);
1452 textDecodePos
+= charToProcess
;
1455 // Fill initial scores
1457 I
[0] = gapExtendScore
+ gapOpenScore
;
1458 for (i
=1; i
<=maxLengthOfGap
; i
++) {
1459 B
[i
] = i
* gapExtendScore
+ gapOpenScore
;
1460 I
[i
] = B
[i
] + gapExtendScore
+ gapOpenScore
;
1462 I
[i
] = DP_NEG_INFINITY
;
1464 lastStartPos
= startPos
= 0;
1465 endPos
= min(maxLengthOfGap
+ 1, queryPatternLength
- queryOffset
);
1466 textPos
= textOffset
;
1468 forwardMaxScore
= 0;
1469 forwardMaxScorePosQuery
= queryOffset
;
1470 forwardMaxScorePosText
= textOffset
;
1472 while (startPos
<= endPos
&& textPos
< textLength
) {
1474 if (endPos
>= dpCellAllocated
) {
1475 fprintf(stderr
, "HSPGappedExtension(): Not enough DP cells allocated!\n");
1479 if (textPos
>= textDecodePos
) {
1480 c
= packedDNA
[textDecodePos
/ CHAR_PER_WORD
];
1481 textDecodePos
+= CHAR_PER_WORD
;
1483 textChar
= (char)(c
>> (BITS_IN_WORD
- BIT_PER_CHAR
));
1487 nextStartPos
= INT_MAX
;
1489 if (startPos
> lastStartPos
) {
1490 BLast
= B
[startPos
- 1];
1491 D
= DP_NEG_INFINITY
;
1492 currentPos
= startPos
;
1494 // all scores in currentPos - 1 is DP_NEG_INFINITY
1495 BLast
= B
[startPos
];
1496 B
[startPos
] = I
[startPos
];
1497 I
[startPos
] = I
[startPos
] + gapExtendScore
;
1498 D
= B
[startPos
] + gapExtendScore
+ gapOpenScore
;
1499 if (B
[startPos
] + dropoffScore
>= forwardMaxScore
) {
1500 nextStartPos
= startPos
;
1501 nextEndPos
= startPos
;
1503 currentPos
= startPos
+ 1;
1506 while (currentPos
<= endPos
) {
1508 M
= BLast
+ scoringMatrix
[textChar
][convertedKey
[currentPos
+ queryOffset
- 1]];
1509 BLast
= B
[currentPos
];
1511 if (M
>= I
[currentPos
] && M
>= D
) {
1512 // matchScore is maximum
1514 I
[currentPos
] = max(M
+ gapOpenScore
, I
[currentPos
]) + gapExtendScore
;
1515 D
= max(M
+ gapOpenScore
, D
) + gapExtendScore
;
1517 B
[currentPos
] = max(I
[currentPos
], D
);
1518 // insert cannot follow delete and delete cannot follow insert
1519 I
[currentPos
] = I
[currentPos
] + gapExtendScore
;
1520 D
= D
+ gapExtendScore
;
1522 if (B
[currentPos
] + dropoffScore
>= forwardMaxScore
) {
1523 if (nextStartPos
> currentPos
) {
1524 nextStartPos
= currentPos
;
1526 nextEndPos
= currentPos
;
1528 if (B
[currentPos
] > forwardMaxScore
) {
1529 forwardMaxScore
= B
[currentPos
];
1530 forwardMaxScorePosQuery
= currentPos
- 1 + queryOffset
;
1531 forwardMaxScorePosText
= textPos
;
1539 if (nextEndPos
== currentPos
) {
1540 while (nextEndPos
<= queryPatternLength
- queryOffset
&& D
+ dropoffScore
>= forwardMaxScore
) {
1541 if (nextEndPos
>= dpCellAllocated
) {
1542 fprintf(stderr
, "HSPGappedExtension(): Not enough DP cells allocated!\n");
1546 D
= D
+ gapExtendScore
;
1547 I
[nextEndPos
] = DP_NEG_INFINITY
;
1550 I
[nextEndPos
] = DP_NEG_INFINITY
;
1553 if (nextEndPos
> queryPatternLength
- queryOffset
) {
1554 nextEndPos
= queryPatternLength
- queryOffset
;
1557 lastStartPos
= startPos
;
1558 startPos
= nextStartPos
;
1559 endPos
= nextEndPos
;
1567 textDecodePos
= textOffset
;
1568 if (textDecodePos
% CHAR_PER_WORD
> 0) {
1569 charToProcess
= textDecodePos
% CHAR_PER_WORD
;
1570 c
= packedDNA
[textDecodePos
/ CHAR_PER_WORD
] >> (BITS_IN_WORD
- charToProcess
* BIT_PER_CHAR
);
1571 textDecodePos
-= charToProcess
;
1574 // Fill initial scores
1576 I
[0] = gapExtendScore
+ gapOpenScore
;
1577 for (i
=1; i
<=maxLengthOfGap
; i
++) {
1578 B
[i
] = i
* gapExtendScore
+ gapOpenScore
;
1579 I
[i
] = B
[i
] + gapExtendScore
+ gapOpenScore
;
1581 I
[i
] = DP_NEG_INFINITY
;
1583 lastStartPos
= startPos
= 0;
1584 endPos
= min(maxLengthOfGap
+ 1, queryOffset
);
1585 textPos
= textOffset
;
1587 backwardMaxScore
= 0;
1588 backwardMaxScorePosQuery
= queryOffset
;
1589 backwardMaxScorePosText
= textOffset
;
1591 while (startPos
<= endPos
&& textPos
> 0) {
1593 if (endPos
>= dpCellAllocated
) {
1594 fprintf(stderr
, "HSPGappedExtension(): Not enough DP cells allocated!\n");
1598 if (textPos
<= textDecodePos
) {
1599 c
= packedDNA
[textDecodePos
/ CHAR_PER_WORD
- 1];
1600 textDecodePos
-= CHAR_PER_WORD
;
1602 textChar
= (char)(c
& 0x3);
1606 nextStartPos
= INT_MAX
;
1608 if (startPos
> lastStartPos
) {
1609 BLast
= B
[startPos
- 1];
1610 D
= DP_NEG_INFINITY
;
1611 currentPos
= startPos
;
1613 // all scores in currentPos - 1 is DP_NEG_INFINITY
1614 BLast
= B
[startPos
];
1615 B
[startPos
] = I
[startPos
];
1616 I
[startPos
] = I
[startPos
] + gapExtendScore
;
1617 D
= B
[startPos
] + gapExtendScore
+ gapOpenScore
;
1618 if (B
[startPos
] + dropoffScore
>= backwardMaxScore
) {
1619 nextStartPos
= startPos
;
1620 nextEndPos
= startPos
;
1622 currentPos
= startPos
+ 1;
1625 while (currentPos
<= endPos
) {
1627 M
= BLast
+ scoringMatrix
[textChar
][convertedKey
[queryOffset
- currentPos
]];
1628 BLast
= B
[currentPos
];
1630 if (M
>= I
[currentPos
] && M
>= D
) {
1631 // matchScore is maximum
1633 I
[currentPos
] = max(M
+ gapOpenScore
, I
[currentPos
]) + gapExtendScore
;
1634 D
= max(M
+ gapOpenScore
, D
) + gapExtendScore
;
1636 B
[currentPos
] = max(I
[currentPos
], D
);
1637 // insert cannot follow delete and delete cannot follow insert
1638 I
[currentPos
] = I
[currentPos
] + gapExtendScore
;
1639 D
= D
+ gapExtendScore
;
1641 if (B
[currentPos
] + dropoffScore
>= backwardMaxScore
) {
1642 if (nextStartPos
> currentPos
) {
1643 nextStartPos
= currentPos
;
1645 nextEndPos
= currentPos
;
1647 if (B
[currentPos
] > backwardMaxScore
) {
1648 backwardMaxScore
= B
[currentPos
];
1649 backwardMaxScorePosQuery
= queryOffset
- currentPos
;
1650 backwardMaxScorePosText
= textPos
- 1;
1658 if (nextEndPos
== currentPos
) {
1659 while (nextEndPos
<= queryOffset
&& D
+ dropoffScore
>= backwardMaxScore
) {
1660 if (nextEndPos
>= dpCellAllocated
) {
1661 fprintf(stderr
, "HSPGappedExtension(): Not enough DP cells allocated!\n");
1665 D
= D
+ gapExtendScore
;
1666 I
[nextEndPos
] = DP_NEG_INFINITY
;
1669 I
[nextEndPos
] = DP_NEG_INFINITY
;
1672 if (nextEndPos
> queryOffset
) {
1673 nextEndPos
= queryOffset
;
1676 lastStartPos
= startPos
;
1677 startPos
= nextStartPos
;
1678 endPos
= nextEndPos
;
1684 // check cutoff score
1685 if (forwardMaxScore
+ backwardMaxScore
>= cutoffScore
) {
1686 gappedHitList
[numberOfGappedHit
].posText
= backwardMaxScorePosText
;
1687 gappedHitList
[numberOfGappedHit
].posQuery
= backwardMaxScorePosQuery
;
1688 gappedHitList
[numberOfGappedHit
].lengthQuery
= forwardMaxScorePosQuery
+ 1 - backwardMaxScorePosQuery
;
1689 gappedHitList
[numberOfGappedHit
].score
= forwardMaxScore
+ backwardMaxScore
;
1690 gappedHitList
[numberOfGappedHit
].lengthText
= forwardMaxScorePosText
+ 1 - backwardMaxScorePosText
;
1691 gappedHitList
[numberOfGappedHit
].ungappedPosText
= textOffset
;
1692 gappedHitList
[numberOfGappedHit
].ungappedPosQuery
= queryOffset
;
1693 numberOfGappedHit
++;
1700 // free working memory
1701 MMPoolReturn(mmPool
, B
, dpCellAllocated
* sizeof(int));
1702 MMPoolReturn(mmPool
, I
, dpCellAllocated
* sizeof(int));
1705 return numberOfGappedHit
;
1709 // mmPool can be NULL but alignmentPool must be allocated for storing alignments as they are not freed explicitly and must be freed through MMPool
1710 // gappedHitList should be sorted in increasin posText order
1711 int HSPGappedExtensionWithTraceback(const unsigned int *packedDNA
, const unsigned char *convertedKey
, const int queryPatternLength
,
1712 GappedHitList
* __restrict gappedHitList
, const int numberOfHit
,
1713 MMPool
*mmPool
, MMPool
*alignmentPool
,
1714 const SeqOffset
*seqOffset
, const Ambiguity
*ambiguity
,
1715 const int matchScore
, const int mismatchScore
,
1716 const int gapOpenScore
, const int gapExtendScore
,
1717 const double maxEvalue
, const int dropoffScore
) {
1719 char* __restrict textBuffer
;
1722 int* __restrict I
; // insert and delete wrt query
1723 int D
; // insert and delete wrt query
1724 char* __restrict IType
; // Gapped opening or gap extension
1725 char DType
; // Gapped opening or gap extension
1726 int scoringMatrix
[DNA_CHAR_SIZE
][DNA_CHAR_SIZE
];
1728 char* __restrict traceback
;
1729 int* __restrict tracebackIndex
;
1730 char* __restrict tempForwardAlignment
;
1731 char* __restrict tempBackwardAlignment
;
1732 char* __restrict tempForwardAuxiliaryText
;
1733 char* __restrict tempBackwardAuxiliaryText
;
1734 unsigned int* __restrict alignment
;
1735 unsigned int* __restrict auxiliaryText
;
1737 int hitProcessed
, numberOfGappedHit
;
1742 unsigned long long textOffset
;
1743 unsigned long long sequenceStart
, sequenceEnd
;
1745 unsigned long long textDecodePos
;
1748 int lastStartPos
, startPos
, nextStartPos
;
1749 int endPos
, nextEndPos
;
1750 unsigned long long textPos
;
1755 int currentTracebackIndex
;
1756 int forwardMaxScore
, backwardMaxScore
;
1757 unsigned long long forwardMaxScorePosQuery
, forwardMaxScorePosText
;
1758 unsigned long long backwardMaxScorePosQuery
, backwardMaxScorePosText
;
1759 int forwardNumOfAlignment
, forwardNumOfAuxiliaryText
;
1760 int backwardNumOfAlignment
, backwardNumOfAuxiliaryText
;
1761 int numOfAlignmentWord
, numOfAuxiliaryTextWord
;
1762 int tracebackAllocationUnit
, tracebackAllocated
;
1766 int dpCellAllocated
;
1768 dpCellAllocated
= min(queryPatternLength
+ 1, MAX_ALIGNMENT_LENGTH
);
1770 if (alignmentPool
== NULL
) {
1771 fprintf(stderr
, "HSPGappedExtensionWithTraceback(): alignmentPool is not allocated!\n");
1775 maxLengthOfGap
= (dropoffScore
+ gapOpenScore
) / (-gapExtendScore
);
1777 // allocate working memory
1778 textBuffer
= (char*) MMPoolDispatch(mmPool
, dpCellAllocated
* 2);
1779 B
= (int*) MMPoolDispatch(mmPool
, dpCellAllocated
* sizeof(int));
1780 IType
= (char*) MMPoolDispatch(mmPool
, dpCellAllocated
);
1781 I
= (int*) MMPoolDispatch(mmPool
, dpCellAllocated
* sizeof(int));
1783 tracebackIndex
= (int*) MMPoolDispatch(mmPool
, dpCellAllocated
* 2 * 2 * sizeof(int));
1785 tracebackAllocated
= tracebackAllocationUnit
= dpCellAllocated
* maxLengthOfGap
* maxLengthOfGap
;
1786 traceback
= (char*) MMUnitAllocate(tracebackAllocated
);
1788 tempForwardAlignment
= (char*) MMPoolDispatch(mmPool
, dpCellAllocated
* 2);
1789 tempBackwardAlignment
= (char*) MMPoolDispatch(mmPool
, dpCellAllocated
* 2);
1790 tempForwardAuxiliaryText
= (char*) MMPoolDispatch(mmPool
, dpCellAllocated
);
1791 tempBackwardAuxiliaryText
= (char*) MMPoolDispatch(mmPool
, dpCellAllocated
);
1793 HSPFillScoringMatrix(scoringMatrix
, matchScore
, mismatchScore
, 0);
1796 numberOfGappedHit
= 0;
1799 while (hitProcessed
< numberOfHit
) {
1801 sequenceStart
= seqOffset
[gappedHitList
[hitProcessed
].dbSeqIndex
].startPos
;
1802 sequenceEnd
= seqOffset
[gappedHitList
[hitProcessed
].dbSeqIndex
].endPos
;
1803 if (seqOffset
[gappedHitList
[hitProcessed
].dbSeqIndex
].firstAmbiguityIndex
> ambiguityIndex
) {
1804 ambiguityIndex
= seqOffset
[gappedHitList
[hitProcessed
].dbSeqIndex
].firstAmbiguityIndex
;
1807 textOffset
= gappedHitList
[hitProcessed
].ungappedPosText
;
1808 queryOffset
= gappedHitList
[hitProcessed
].ungappedPosQuery
;
1813 textDecodePos
= textOffset
;
1814 if (textDecodePos
% CHAR_PER_WORD
> 0) {
1815 // decode text to the next word boundary
1816 charToProcess
= CHAR_PER_WORD
- (textDecodePos
% CHAR_PER_WORD
);
1817 c
= packedDNA
[textDecodePos
/ CHAR_PER_WORD
] << (BITS_IN_WORD
- charToProcess
* BIT_PER_CHAR
);
1818 for (i
=0; i
<charToProcess
; i
++) {
1819 textBuffer
[textDecodePos
+ i
- textOffset
+ 1] = (unsigned char)(c
>> (BITS_IN_WORD
- BIT_PER_CHAR
));
1824 while (ambiguity
[ambiguityIndex
].rightOfEndPos
<= textOffset
) {
1827 if (ambiguity
[ambiguityIndex
].startPos
< textOffset
+ charToProcess
) {
1828 for (i
=0; i
<charToProcess
; i
++) {
1829 while (ambiguity
[ambiguityIndex
].rightOfEndPos
<= textOffset
+ i
) {
1832 if (ambiguity
[ambiguityIndex
].startPos
<= textOffset
+ i
) {
1833 textBuffer
[textDecodePos
+ i
- textOffset
+ 1] = (char)ambiguity
[ambiguityIndex
].symbol
;
1838 textDecodePos
+= charToProcess
;
1841 // Fill initial scores
1843 traceback
[0] = DP_MATCH_MISMATCH
;
1844 I
[0] = gapExtendScore
+ gapOpenScore
;
1845 IType
[0] = DP_INSERT_OPEN
;
1847 B
[1] = gapExtendScore
+ gapOpenScore
;
1848 traceback
[1] = DP_DELETE
| DP_DELETE_OPEN
;
1849 I
[1] = B
[1] + gapExtendScore
+ gapOpenScore
;
1850 IType
[1] = DP_INSERT_OPEN
;
1852 for (i
=2; i
<=maxLengthOfGap
; i
++) {
1853 B
[i
] = B
[i
-1] + gapExtendScore
;
1854 traceback
[i
] = DP_DELETE
;
1855 I
[i
] = B
[i
] + gapExtendScore
+ gapOpenScore
;
1856 IType
[i
] = DP_INSERT_OPEN
;
1858 I
[i
] = DP_NEG_INFINITY
;
1860 currentTracebackIndex
= maxLengthOfGap
+ 1;
1862 lastStartPos
= startPos
= 0;
1863 endPos
= min((maxLengthOfGap
+ 1), queryPatternLength
- queryOffset
);
1864 textPos
= textOffset
;
1866 forwardMaxScore
= 0;
1867 forwardMaxScorePosQuery
= queryOffset
;
1868 forwardMaxScorePosText
= textOffset
;
1870 tracebackIndex
[0] = 0; // The first trackback of the row
1871 tracebackIndex
[1] = 0; // The first query position of the row
1873 while (startPos
<= endPos
&& textPos
<= sequenceEnd
) {
1875 if (endPos
>= dpCellAllocated
) {
1876 fprintf(stderr
, "HSPGappedExtensionWithTraceback(): Not enough DP cells allocated!\n");
1880 if (textPos
< textDecodePos
) {
1882 // decode a word of text
1883 c
= packedDNA
[textDecodePos
/ CHAR_PER_WORD
];
1884 for (j
=0; j
<CHAR_PER_WORD
; j
++) {
1885 textBuffer
[textDecodePos
+ j
- textOffset
+ 1] = (unsigned char)(c
>> (BITS_IN_WORD
- BIT_PER_CHAR
));
1890 while (ambiguity
[ambiguityIndex
].rightOfEndPos
<= textPos
) {
1893 if (ambiguity
[ambiguityIndex
].startPos
< textPos
+ CHAR_PER_WORD
) {
1894 for (j
=0; j
<CHAR_PER_WORD
; j
++) {
1895 while (ambiguity
[ambiguityIndex
].rightOfEndPos
<= textPos
+ j
) {
1898 if (ambiguity
[ambiguityIndex
].startPos
<= textPos
+ j
) {
1899 textBuffer
[textDecodePos
+ j
- textOffset
+ 1] = (char)ambiguity
[ambiguityIndex
].symbol
;
1904 textDecodePos
+= CHAR_PER_WORD
;
1908 tracebackIndex
[(textPos
- textOffset
+ 1) * 2] = currentTracebackIndex
; // The first trackback of the row
1909 tracebackIndex
[(textPos
- textOffset
+ 1) * 2 + 1] = startPos
; // The first query position of the row
1912 nextStartPos
= INT_MAX
;
1914 if (currentTracebackIndex
+ queryPatternLength
>= tracebackAllocated
) {
1915 traceback
= (char*) MMUnitReallocate(traceback
, tracebackAllocated
+ tracebackAllocationUnit
, tracebackAllocated
);
1916 tracebackAllocated
+= tracebackAllocationUnit
;
1919 if (startPos
> lastStartPos
) {
1920 BLast
= B
[startPos
- 1];
1921 D
= DP_NEG_INFINITY
;
1923 currentPos
= startPos
;
1925 // all scores in currentPos - 1 is DP_NEG_INFINITY
1926 BLast
= B
[startPos
];
1927 B
[startPos
] = I
[startPos
];
1928 traceback
[currentTracebackIndex
] = DP_INSERT
| IType
[startPos
];
1929 I
[startPos
] = I
[startPos
] + gapExtendScore
;
1930 IType
[startPos
] = DP_INSERT_EXTEND
;
1931 D
= B
[startPos
] + gapExtendScore
+ gapOpenScore
;
1932 DType
= DP_DELETE_OPEN
;
1933 if (B
[startPos
] + dropoffScore
>= forwardMaxScore
) {
1934 nextStartPos
= startPos
;
1935 nextEndPos
= startPos
;
1937 currentPos
= startPos
+ 1;
1938 currentTracebackIndex
++;
1941 while (currentPos
<= endPos
) {
1943 M
= BLast
+ scoringMatrix
[textBuffer
[textPos
- textOffset
+ 1]][convertedKey
[currentPos
+ queryOffset
- 1]];
1944 BLast
= B
[currentPos
];
1946 if (M
>= I
[currentPos
] && M
>= D
) {
1947 // matchScore is maximum
1949 traceback
[currentTracebackIndex
] = DP_MATCH_MISMATCH
| IType
[currentPos
] | DType
;
1950 if (M
+ gapOpenScore
>= I
[currentPos
]) {
1951 I
[currentPos
] = M
+ gapOpenScore
+ gapExtendScore
;
1952 IType
[currentPos
] = DP_INSERT_OPEN
;
1954 I
[currentPos
] = I
[currentPos
] + gapExtendScore
;
1955 IType
[currentPos
] = DP_INSERT_EXTEND
;
1957 if (M
+ gapOpenScore
>= D
) {
1958 D
= M
+ gapOpenScore
+ gapExtendScore
;
1959 DType
= DP_DELETE_OPEN
;
1961 D
= D
+ gapExtendScore
;
1962 DType
= DP_DELETE_EXTEND
;
1964 if (B
[currentPos
] > forwardMaxScore
) {
1965 forwardMaxScore
= B
[currentPos
];
1966 forwardMaxScorePosQuery
= currentPos
+ queryOffset
;
1967 forwardMaxScorePosText
= textPos
+ 1;
1970 if (I
[currentPos
] >= D
) {
1971 B
[currentPos
] = I
[currentPos
];
1972 traceback
[currentTracebackIndex
] = DP_INSERT
| IType
[currentPos
] | DType
;
1975 traceback
[currentTracebackIndex
] = DP_DELETE
| IType
[currentPos
] | DType
;
1977 // insert cannot follow delete and delete cannot follow insert
1978 I
[currentPos
] = I
[currentPos
] + gapExtendScore
;
1979 IType
[currentPos
] = DP_INSERT_EXTEND
;
1980 D
= D
+ gapExtendScore
;
1981 DType
= DP_DELETE_EXTEND
;
1983 if (B
[currentPos
] + dropoffScore
>= forwardMaxScore
) {
1984 if (nextStartPos
> currentPos
) {
1985 nextStartPos
= currentPos
;
1987 nextEndPos
= currentPos
;
1991 currentTracebackIndex
++;
1996 if (nextEndPos
== currentPos
) {
1997 while (nextEndPos
<= queryPatternLength
- queryOffset
&& D
+ dropoffScore
>= forwardMaxScore
) {
1998 if (nextEndPos
>= dpCellAllocated
) {
1999 fprintf(stderr
, "HSPGappedExtensionWithTraceback(): Not enough DP cells allocated!\n");
2003 traceback
[currentTracebackIndex
] = DP_DELETE
| DType
;
2004 D
= D
+ gapExtendScore
;
2005 DType
= DP_DELETE_EXTEND
;
2006 I
[nextEndPos
] = DP_NEG_INFINITY
;
2007 IType
[nextEndPos
] = 0;
2009 currentTracebackIndex
++;
2011 I
[nextEndPos
] = DP_NEG_INFINITY
;
2012 IType
[nextEndPos
] = 0;
2015 if (nextEndPos
> queryPatternLength
- queryOffset
) {
2016 nextEndPos
= queryPatternLength
- queryOffset
;
2019 lastStartPos
= startPos
;
2020 startPos
= nextStartPos
;
2021 endPos
= nextEndPos
;
2028 forwardNumOfAlignment
= 0;
2029 forwardNumOfAuxiliaryText
= 0;
2030 textPos
= forwardMaxScorePosText
- 1;
2031 currentPos
= forwardMaxScorePosQuery
- queryOffset
;
2032 dpScore
= 0; // for verifying traceback
2034 currentTracebackIndex
= tracebackIndex
[(textPos
- textOffset
+ 1) * 2]
2035 + currentPos
- tracebackIndex
[(textPos
- textOffset
+ 1) * 2 + 1];
2037 while (currentTracebackIndex
> 0) {
2039 dpType
= traceback
[currentTracebackIndex
] & DP_MASK
;
2041 if (dpType
== DP_MATCH_MISMATCH
) {
2042 dpScore
+= scoringMatrix
[textBuffer
[textPos
- textOffset
+ 1]][convertedKey
[currentPos
+ queryOffset
- 1]];
2043 if (textBuffer
[textPos
- textOffset
+ 1] == convertedKey
[currentPos
+ queryOffset
- 1] &&
2044 textBuffer
[textPos
- textOffset
+ 1] < 4) { // match and not ambiguity
2045 tempForwardAlignment
[forwardNumOfAlignment
] = ALIGN_MATCH
;
2047 tempForwardAlignment
[forwardNumOfAlignment
] = ALIGN_MISMATCH_AMBIGUITY
;
2048 tempForwardAuxiliaryText
[forwardNumOfAuxiliaryText
] = textBuffer
[textPos
- textOffset
+ 1];
2049 forwardNumOfAuxiliaryText
++;
2053 currentTracebackIndex
= tracebackIndex
[(textPos
- textOffset
+ 1) * 2]
2054 + currentPos
- tracebackIndex
[(textPos
- textOffset
+ 1) * 2 + 1];
2055 } else if (dpType
== DP_INSERT
) {
2056 while (!(traceback
[currentTracebackIndex
] & DP_INSERT_OPEN
)) {
2057 dpScore
+= gapExtendScore
;
2058 tempForwardAlignment
[forwardNumOfAlignment
] = ALIGN_INSERT
;
2059 forwardNumOfAlignment
++;
2060 tempForwardAuxiliaryText
[forwardNumOfAuxiliaryText
] = textBuffer
[textPos
- textOffset
+ 1];
2061 forwardNumOfAuxiliaryText
++;
2063 currentTracebackIndex
= tracebackIndex
[(textPos
- textOffset
+ 1) * 2]
2064 + currentPos
- tracebackIndex
[(textPos
- textOffset
+ 1) * 2 + 1];
2066 dpScore
+= gapOpenScore
+ gapExtendScore
;
2067 tempForwardAlignment
[forwardNumOfAlignment
] = ALIGN_INSERT
;
2068 tempForwardAuxiliaryText
[forwardNumOfAuxiliaryText
] = textBuffer
[textPos
- textOffset
+ 1];
2069 forwardNumOfAuxiliaryText
++;
2071 currentTracebackIndex
= tracebackIndex
[(textPos
- textOffset
+ 1) * 2]
2072 + currentPos
- tracebackIndex
[(textPos
- textOffset
+ 1) * 2 + 1];
2074 while (!(traceback
[currentTracebackIndex
] & DP_DELETE_OPEN
)) {
2075 dpScore
+= gapExtendScore
;
2076 tempForwardAlignment
[forwardNumOfAlignment
] = ALIGN_DELETE
;
2077 forwardNumOfAlignment
++;
2079 currentTracebackIndex
--;
2081 dpScore
+= gapOpenScore
+ gapExtendScore
;
2082 tempForwardAlignment
[forwardNumOfAlignment
] = ALIGN_DELETE
;
2084 currentTracebackIndex
--;
2087 forwardNumOfAlignment
++;
2091 if (dpScore
!= forwardMaxScore
) {
2092 fprintf(stderr
, "Forward gapped extension traceback error!\n");
2098 textDecodePos
= textOffset
;
2099 if (textDecodePos
% CHAR_PER_WORD
> 0) {
2100 // decode text to the next word boundary
2101 charToProcess
= textDecodePos
% CHAR_PER_WORD
;
2102 c
= packedDNA
[textDecodePos
/ CHAR_PER_WORD
] >> (BITS_IN_WORD
- charToProcess
* BIT_PER_CHAR
);
2103 for (i
=0; i
<charToProcess
; i
++) {
2104 textBuffer
[textOffset
- textDecodePos
+ i
+ 1] = (unsigned char)(c
& 0x3);
2109 while (ambiguity
[ambiguityIndex
].startPos
>= textOffset
) {
2112 if (ambiguity
[ambiguityIndex
].rightOfEndPos
> (textOffset
- charToProcess
)) {
2113 for (i
=0; i
<charToProcess
; i
++) {
2114 while (ambiguity
[ambiguityIndex
].startPos
>= (textOffset
- i
)) {
2117 if (ambiguity
[ambiguityIndex
].rightOfEndPos
> (textOffset
- i
- 1)) {
2118 textBuffer
[textOffset
- textDecodePos
+ i
+ 1] = (char)ambiguity
[ambiguityIndex
].symbol
;
2123 textDecodePos
-= charToProcess
;
2126 // Fill initial scores
2128 traceback
[0] = DP_MATCH_MISMATCH
;
2129 I
[0] = gapExtendScore
+ gapOpenScore
;
2130 IType
[0] = DP_INSERT_OPEN
;
2132 B
[1] = gapExtendScore
+ gapOpenScore
;
2133 traceback
[1] = DP_DELETE
| DP_DELETE_OPEN
;
2134 I
[1] = B
[1] + gapExtendScore
+ gapOpenScore
;
2135 IType
[1] = DP_INSERT_OPEN
;
2137 for (i
=2; i
<=maxLengthOfGap
; i
++) {
2138 B
[i
] = B
[i
-1] + gapExtendScore
;
2139 traceback
[i
] = DP_DELETE
;
2140 I
[i
] = B
[i
] + gapExtendScore
+ gapOpenScore
;
2141 IType
[i
] = DP_INSERT_OPEN
;
2143 I
[i
] = DP_NEG_INFINITY
;
2145 currentTracebackIndex
= maxLengthOfGap
+ 1;
2147 lastStartPos
= startPos
= 0;
2148 endPos
= min((maxLengthOfGap
+ 1), queryOffset
);
2149 textPos
= textOffset
;
2151 backwardMaxScore
= 0;
2152 backwardMaxScorePosQuery
= queryOffset
;
2153 backwardMaxScorePosText
= textOffset
;
2155 tracebackIndex
[0] = 0;
2156 tracebackIndex
[1] = 0;
2158 while (startPos
<= endPos
&& textPos
+ 1 > sequenceStart
) {
2160 if (endPos
>= dpCellAllocated
) {
2161 fprintf(stderr
, "HSPGappedExtensionWithTraceback(): Not enough DP cells allocated!\n");
2165 if (textPos
> textDecodePos
) {
2167 // decode a word of text
2168 c
= packedDNA
[textDecodePos
/ CHAR_PER_WORD
- 1];
2169 for (j
=0; j
<CHAR_PER_WORD
; j
++) {
2170 textBuffer
[textOffset
- textDecodePos
+ j
+ 1] = (unsigned char)(c
& 0x3);
2175 while (ambiguity
[ambiguityIndex
].startPos
>= textPos
) {
2178 if (ambiguity
[ambiguityIndex
].rightOfEndPos
> (textPos
- CHAR_PER_WORD
)) {
2179 for (j
=0; j
<CHAR_PER_WORD
; j
++) {
2180 while (ambiguity
[ambiguityIndex
].startPos
>= (textPos
- j
)) {
2183 if (ambiguity
[ambiguityIndex
].rightOfEndPos
> (textPos
- j
- 1)) {
2184 textBuffer
[textOffset
- textDecodePos
+ j
+ 1] = (char)ambiguity
[ambiguityIndex
].symbol
;
2189 textDecodePos
-= CHAR_PER_WORD
;
2193 tracebackIndex
[(textOffset
- textPos
+ 1) * 2] = currentTracebackIndex
;
2194 tracebackIndex
[(textOffset
- textPos
+ 1) * 2 + 1] = startPos
;
2197 nextStartPos
= INT_MAX
;
2199 if (currentTracebackIndex
+ queryPatternLength
>= tracebackAllocated
) {
2200 traceback
= (char*) MMUnitReallocate(traceback
, tracebackAllocated
+ tracebackAllocationUnit
, tracebackAllocated
);
2201 tracebackAllocated
+= tracebackAllocationUnit
;
2204 if (startPos
> lastStartPos
) {
2205 BLast
= B
[startPos
- 1];
2206 D
= DP_NEG_INFINITY
;
2208 currentPos
= startPos
;
2210 // all scores in currentPos - 1 is DP_NEG_INFINITY
2211 BLast
= B
[startPos
];
2212 B
[startPos
] = I
[startPos
];
2213 traceback
[currentTracebackIndex
] = DP_INSERT
| IType
[startPos
];
2214 I
[startPos
] = I
[startPos
] + gapExtendScore
;
2215 IType
[startPos
] = DP_INSERT_EXTEND
;
2216 D
= B
[startPos
] + gapExtendScore
+ gapOpenScore
;
2217 DType
= DP_DELETE_OPEN
;
2218 if (B
[startPos
] + dropoffScore
>= forwardMaxScore
) {
2219 nextStartPos
= startPos
;
2220 nextEndPos
= startPos
;
2222 currentPos
= startPos
+ 1;
2223 currentTracebackIndex
++;
2227 while (currentPos
<= endPos
) {
2229 M
= BLast
+ scoringMatrix
[textBuffer
[textOffset
- textPos
+ 1]][convertedKey
[queryOffset
- currentPos
]];
2230 BLast
= B
[currentPos
];
2232 if (M
>= I
[currentPos
] && M
>= D
) {
2233 // matchScore is maximum
2235 traceback
[currentTracebackIndex
] = DP_MATCH_MISMATCH
| IType
[currentPos
] | DType
;
2236 if (M
+ gapOpenScore
>= I
[currentPos
]) {
2237 I
[currentPos
] = M
+ gapOpenScore
+ gapExtendScore
;
2238 IType
[currentPos
] = DP_INSERT_OPEN
;
2240 I
[currentPos
] = I
[currentPos
] + gapExtendScore
;
2241 IType
[currentPos
] = DP_INSERT_EXTEND
;
2243 if (M
+ gapOpenScore
>= D
) {
2244 D
= M
+ gapOpenScore
+ gapExtendScore
;
2245 DType
= DP_DELETE_OPEN
;
2247 D
= D
+ gapExtendScore
;
2248 DType
= DP_DELETE_EXTEND
;
2250 if (B
[currentPos
] > backwardMaxScore
) {
2251 backwardMaxScore
= B
[currentPos
];
2252 backwardMaxScorePosQuery
= queryOffset
- currentPos
;
2253 backwardMaxScorePosText
= textPos
- 1;
2256 if (I
[currentPos
] >= D
) {
2257 B
[currentPos
] = I
[currentPos
];
2258 traceback
[currentTracebackIndex
] = DP_INSERT
| IType
[currentPos
] | DType
;
2261 traceback
[currentTracebackIndex
] = DP_DELETE
| IType
[currentPos
] | DType
;
2263 // insert cannot follow delete and delete cannot follow insert
2264 I
[currentPos
] = I
[currentPos
] + gapExtendScore
;
2265 IType
[currentPos
] = DP_INSERT_EXTEND
;
2266 D
= D
+ gapExtendScore
;
2267 DType
= DP_DELETE_EXTEND
;
2270 if (B
[currentPos
] + dropoffScore
>= backwardMaxScore
) {
2271 if (nextStartPos
> currentPos
) {
2272 nextStartPos
= currentPos
;
2274 nextEndPos
= currentPos
;
2278 currentTracebackIndex
++;
2283 if (nextEndPos
== currentPos
) {
2284 while (nextEndPos
<= queryOffset
&& D
+ dropoffScore
>= backwardMaxScore
) {
2285 if (nextEndPos
>= dpCellAllocated
) {
2286 fprintf(stderr
, "HSPGappedExtensionWithTraceback(): Not enough DP cells allocated!\n");
2290 traceback
[currentTracebackIndex
] = DP_DELETE
| DType
;
2291 D
= D
+ gapExtendScore
;
2292 DType
= DP_DELETE_EXTEND
;
2293 I
[nextEndPos
] = DP_NEG_INFINITY
;
2294 IType
[nextEndPos
] = 0;
2296 currentTracebackIndex
++;
2298 I
[nextEndPos
] = DP_NEG_INFINITY
;
2299 IType
[nextEndPos
] = 0;
2302 if (nextEndPos
> queryOffset
) {
2303 nextEndPos
= queryOffset
;
2306 lastStartPos
= startPos
;
2307 startPos
= nextStartPos
;
2308 endPos
= nextEndPos
;
2315 backwardNumOfAlignment
= 0;
2316 backwardNumOfAuxiliaryText
= 0;
2317 textPos
= backwardMaxScorePosText
+ 1;
2318 currentPos
= queryOffset
- backwardMaxScorePosQuery
;
2319 dpScore
= 0; // for verifying traceback
2321 currentTracebackIndex
= tracebackIndex
[(textOffset
- textPos
+ 1) * 2]
2322 + currentPos
- tracebackIndex
[(textOffset
- textPos
+ 1) * 2 + 1];
2324 while (currentTracebackIndex
> 0) {
2326 dpType
= traceback
[currentTracebackIndex
] & DP_MASK
;
2328 if (dpType
== DP_MATCH_MISMATCH
) {
2329 dpScore
+= scoringMatrix
[textBuffer
[textOffset
- textPos
+ 1]][convertedKey
[queryOffset
- currentPos
]];
2330 if (textBuffer
[textOffset
- textPos
+ 1] == convertedKey
[queryOffset
- currentPos
] &&
2331 textBuffer
[textOffset
- textPos
+ 1] < 4) { // match and not ambiguity
2332 tempBackwardAlignment
[backwardNumOfAlignment
] = ALIGN_MATCH
;
2334 tempBackwardAlignment
[backwardNumOfAlignment
] = ALIGN_MISMATCH_AMBIGUITY
;
2335 tempBackwardAuxiliaryText
[backwardNumOfAuxiliaryText
] = textBuffer
[textOffset
- textPos
+ 1];
2336 backwardNumOfAuxiliaryText
++;
2340 currentTracebackIndex
= tracebackIndex
[(textOffset
- textPos
+ 1) * 2]
2341 + currentPos
- tracebackIndex
[(textOffset
- textPos
+ 1) * 2 + 1];
2342 } else if (dpType
== DP_INSERT
) {
2343 while (!(traceback
[currentTracebackIndex
] & DP_INSERT_OPEN
)) {
2344 dpScore
+= gapExtendScore
;
2345 tempBackwardAlignment
[backwardNumOfAlignment
] = ALIGN_INSERT
;
2346 backwardNumOfAlignment
++;
2347 tempBackwardAuxiliaryText
[backwardNumOfAuxiliaryText
] = textBuffer
[textOffset
- textPos
+ 1];
2348 backwardNumOfAuxiliaryText
++;
2350 currentTracebackIndex
= tracebackIndex
[(textOffset
- textPos
+ 1) * 2]
2351 + currentPos
- tracebackIndex
[(textOffset
- textPos
+ 1) * 2 + 1];
2353 dpScore
+= gapOpenScore
+ gapExtendScore
;
2354 tempBackwardAlignment
[backwardNumOfAlignment
] = ALIGN_INSERT
;
2355 tempBackwardAuxiliaryText
[backwardNumOfAuxiliaryText
] = textBuffer
[textOffset
- textPos
+ 1];
2356 backwardNumOfAuxiliaryText
++;
2358 currentTracebackIndex
= tracebackIndex
[(textOffset
- textPos
+ 1) * 2]
2359 + currentPos
- tracebackIndex
[(textOffset
- textPos
+ 1) * 2 + 1];
2361 while (!(traceback
[currentTracebackIndex
] & DP_DELETE_OPEN
)) {
2362 dpScore
+= gapExtendScore
;
2363 tempBackwardAlignment
[backwardNumOfAlignment
] = ALIGN_DELETE
;
2364 backwardNumOfAlignment
++;
2366 currentTracebackIndex
--;
2368 dpScore
+= gapOpenScore
+ gapExtendScore
;
2369 tempBackwardAlignment
[backwardNumOfAlignment
] = ALIGN_DELETE
;
2371 currentTracebackIndex
--;
2374 backwardNumOfAlignment
++;
2378 if (dpScore
!= backwardMaxScore
) {
2379 fprintf(stderr
, "Backward gapped extension traceback error!\n");
2383 // check cutoff score
2384 evalue
= stat_gapCalcEvalue(stat_gapNominal2normalized(forwardMaxScore
+ backwardMaxScore
));
2385 if (evalue
< maxEvalue
) {
2386 gappedHitList
[numberOfGappedHit
].posText
= backwardMaxScorePosText
;
2387 gappedHitList
[numberOfGappedHit
].posQuery
= backwardMaxScorePosQuery
;
2388 gappedHitList
[numberOfGappedHit
].lengthQuery
= forwardMaxScorePosQuery
- backwardMaxScorePosQuery
;
2389 gappedHitList
[numberOfGappedHit
].score
= forwardMaxScore
+ backwardMaxScore
;
2390 gappedHitList
[numberOfGappedHit
].lengthText
= forwardMaxScorePosText
- backwardMaxScorePosText
;
2391 gappedHitList
[numberOfGappedHit
].dbSeqIndex
= gappedHitList
[hitProcessed
].dbSeqIndex
;
2393 // Store alignment and auxiliary text
2394 numOfAlignmentWord
= (forwardNumOfAlignment
+ backwardNumOfAlignment
+ ALIGN_PER_WORD
- 1) / ALIGN_PER_WORD
;
2395 ((GappedHitListWithAlignment
*)(gappedHitList
+ numberOfGappedHit
))->alignmentOffset
= MMPoolDispatchOffset(alignmentPool
, numOfAlignmentWord
* sizeof(unsigned int));
2396 alignment
= (unsigned int*)((char*)alignmentPool
+ ((GappedHitListWithAlignment
*)(gappedHitList
+ numberOfGappedHit
))->alignmentOffset
);
2397 for (i
=0; i
<numOfAlignmentWord
; i
++) {
2400 if (forwardNumOfAuxiliaryText
+ backwardNumOfAuxiliaryText
> 0) {
2401 numOfAuxiliaryTextWord
= (forwardNumOfAuxiliaryText
+ backwardNumOfAuxiliaryText
+ AUX_TEXT_PER_WORD
- 1) / AUX_TEXT_PER_WORD
;
2402 ((GappedHitListWithAlignment
*)(gappedHitList
+ numberOfGappedHit
))->auxiliaryTextOffset
= MMPoolDispatchOffset(alignmentPool
, numOfAuxiliaryTextWord
* sizeof(unsigned int));
2403 auxiliaryText
= (unsigned int*)((char*)alignmentPool
+ ((GappedHitListWithAlignment
*)(gappedHitList
+ numberOfGappedHit
))->auxiliaryTextOffset
);
2404 for (i
=0; i
<numOfAuxiliaryTextWord
; i
++) {
2405 auxiliaryText
[i
] = 0;
2408 ((GappedHitListWithAlignment
*)(gappedHitList
+ numberOfGappedHit
))->auxiliaryTextOffset
= 0;
2411 // backward alignment
2412 for (i
=0; i
<backwardNumOfAlignment
; i
++) {
2413 alignment
[i
/ALIGN_PER_WORD
] |= (tempBackwardAlignment
[i
] << (BITS_IN_WORD
- (i
% ALIGN_PER_WORD
+ 1) * ALIGN_BIT
));
2415 for (i
=0; i
<backwardNumOfAuxiliaryText
; i
++) {
2416 auxiliaryText
[i
/AUX_TEXT_PER_WORD
] |= (tempBackwardAuxiliaryText
[i
] << (BITS_IN_WORD
- (i
% AUX_TEXT_PER_WORD
+ 1) * AUX_TEXT_BIT
));
2419 // forward alignment
2420 for (i
=0; i
<forwardNumOfAlignment
; i
++) {
2421 alignment
[(i
+backwardNumOfAlignment
)/ALIGN_PER_WORD
] |= (tempForwardAlignment
[forwardNumOfAlignment
- i
- 1]
2422 << (BITS_IN_WORD
- ((i
+backwardNumOfAlignment
) % ALIGN_PER_WORD
+ 1) * ALIGN_BIT
));
2424 for (i
=0; i
<forwardNumOfAuxiliaryText
; i
++) {
2425 auxiliaryText
[(i
+backwardNumOfAuxiliaryText
)/AUX_TEXT_PER_WORD
] |= (tempForwardAuxiliaryText
[forwardNumOfAuxiliaryText
- i
- 1]
2426 << (BITS_IN_WORD
- ((i
+backwardNumOfAuxiliaryText
) % AUX_TEXT_PER_WORD
+ 1) * AUX_TEXT_BIT
));
2429 numberOfGappedHit
++;
2437 // free working memory
2438 MMPoolReturn(mmPool
, textBuffer
, dpCellAllocated
* 2);
2439 MMPoolReturn(mmPool
, B
, dpCellAllocated
* sizeof(int));
2440 MMPoolReturn(mmPool
, IType
, dpCellAllocated
);
2441 MMPoolReturn(mmPool
, I
, dpCellAllocated
* sizeof(int));
2443 MMPoolReturn(mmPool
, tracebackIndex
, dpCellAllocated
* 2 * 2 * sizeof(int));
2445 MMUnitFree(traceback
, tracebackAllocated
);
2447 MMPoolReturn(mmPool
, tempForwardAlignment
, dpCellAllocated
* 2);
2448 MMPoolReturn(mmPool
, tempBackwardAlignment
, dpCellAllocated
* 2);
2449 MMPoolReturn(mmPool
, tempForwardAuxiliaryText
, dpCellAllocated
);
2450 MMPoolReturn(mmPool
, tempBackwardAuxiliaryText
, dpCellAllocated
);
2452 return numberOfGappedHit
;
2456 // textList must be sorted in descending posText order;
2457 // queryPosListIndex contains the starting positions of queryPos in queryPosList within a queryPosGroup
2458 // a dummy entry must be present at the end of queryPosListIndex
2460 int HSPDPDBvsQuery(const unsigned int *packedText
, const HitList
*textList
, const int numOfTextHit
,
2461 const SeqOffset
*textSeqOffset
, const Ambiguity
*textAmbiguity
, const int numOfTextSeq
,
2462 const unsigned char *convertedKey
, const int queryPatternLength
,
2463 const int *queryPosListIndex
, const unsigned int *queryPosList
,
2464 GappedHitListWithAlignment
* __restrict gappedHitList
, const int maxNumOfGappedHit
,
2465 MMPool
*mmPool
, MMPool
*alignmentPool
,
2466 const int matchScore
, const int mismatchScore
,
2467 const int gapOpenScore
, const int gapExtendScore
,
2468 const int cutoffScore
, const double maxEvalue
) {
2470 #define NUM_SOURCE_BIT 12
2471 #define SOURCE_BIT_MASK 0xFFF
2473 #define DROPOFF_MAX_ENTRY 256
2475 #define EARLY_DROPOFF_MIN 11
2476 #define EARLY_DROPOFF_FACTOR 4 // These two are heuristic values to determine the dropoff to trigger addition traceback
2477 // When this heuistic is not applied, about 0.01% alignments reported by Blast are
2478 // filtered out during traceback on experiments with query size at chromosome lengths
2479 // The filtered alignments are of low evalues and are close to some strong alignments
2481 #define TEXT_BUFFER_SIZE 16
2484 DPCell
** __restrict dpCell
;
2486 int D
; // insert and delete wrt query
2488 // The following variables are all indexed by source bit
2491 DPMaxScore
* __restrict maxScore
;
2492 DPMaxScore
* __restrict dropoffMaxScore
;
2494 // Keeping track whether dropoff should be valid alignments
2495 int* __restrict currentMaxScore
;
2496 int* __restrict lastDropoffMaxScoreIndex
;
2498 // End variables indexed by source bit
2500 int textBuffer
[TEXT_BUFFER_SIZE
];
2502 char* __restrict tempAlignment
;
2503 char* __restrict tempAuxiliaryText
;
2505 int scoringMatrix
[DNA_CHAR_SIZE
][DNA_CHAR_SIZE
];
2506 int gapOpenScoreShifted
, gapExtendScoreShifted
;
2507 int cutoffScoreShifted
;
2508 int dropoffScoreShifted
;
2509 int minPositiveScore
;
2515 int firstTextHitBeingProcessed
, numOfTextHitBeingProcessed
;
2517 unsigned long long textOffset
;
2521 int textDbSeqIndex
, textAmbiguityIndex
;
2522 unsigned long long textSeqStart
, textSeqEnd
;
2529 int sourceBitMinScore
, maxSourceBit
, nextSourceBit
, usedSourceBit
;
2530 int numOfDropoffMaxScore
;
2532 int tempAlignmentIndex
;
2533 int tempAuxiliaryTextIndex
;
2534 int numOfAlignmentWord
, numOfAuxiliaryTextWord
;
2536 Traceback
* __restrict traceback
;
2537 int tracebackAllocated
;
2539 int tracebackIndexAdjustment
;
2541 unsigned int queryListInfo
;
2543 int lastDpCellUsed
, lastDpCellIndex
;
2544 int dpCellUsed
, dpCellIndex
;
2545 int dpCellUsedAdjustedForEdge
;
2548 unsigned long long qPos
, lastQPos
, maxQPos
;
2551 int bestScore
, insertScore
, deleteScore
;
2553 int minAlignmentLength
, minNegAlignmentLength
, maxDPGroupingDist
;
2555 unsigned int* __restrict alignment
;
2556 unsigned int* __restrict auxiliaryText
;
2558 // allocate working memory
2559 dpCell
= (DPCell
**) MMPoolDispatch(mmPool
, 2 * sizeof(DPCell
*));
2560 dpCell
[0] = (DPCell
*) MMPoolDispatch(mmPool
, MAX_ALIGNMENT_LENGTH
* sizeof(DPCell
));
2561 dpCell
[1] = (DPCell
*) MMPoolDispatch(mmPool
, MAX_ALIGNMENT_LENGTH
* sizeof(DPCell
));
2563 maxScore
= (DPMaxScore
*) MMPoolDispatch(mmPool
, (1 << NUM_SOURCE_BIT
) * sizeof(DPMaxScore
));
2564 dropoffMaxScore
= (DPMaxScore
*) MMPoolDispatch(mmPool
, DROPOFF_MAX_ENTRY
* sizeof(DPMaxScore
));
2566 currentMaxScore
= (int*) MMPoolDispatch(mmPool
, (1 << NUM_SOURCE_BIT
) * sizeof(int));
2567 lastDropoffMaxScoreIndex
= (int*) MMPoolDispatch(mmPool
, (1 << NUM_SOURCE_BIT
) * sizeof(int));
2569 tempAlignment
= (char*) MMPoolDispatch(mmPool
, (MAX_ALIGNMENT_LENGTH
* 2) * sizeof(char));
2570 tempAuxiliaryText
= (char*) MMPoolDispatch(mmPool
, (MAX_ALIGNMENT_LENGTH
* 2) * sizeof(char));
2572 tracebackAllocated
= MMPoolByteAvailable(mmPool
) / sizeof(Traceback
);
2573 if (tracebackAllocated
< 0) {
2574 fprintf(stderr
, "HSPDPDBvsDB(): Not enough memory allocated!\n");
2577 traceback
= (Traceback
*) MMPoolDispatch(mmPool
, tracebackAllocated
* sizeof(Traceback
));
2579 HSPFillScoringMatrix(scoringMatrix
, matchScore
, mismatchScore
, NUM_SOURCE_BIT
);
2581 gapOpenScoreShifted
= gapOpenScore
* (1 << NUM_SOURCE_BIT
);
2582 gapExtendScoreShifted
= gapExtendScore
* (1 << NUM_SOURCE_BIT
);
2583 cutoffScoreShifted
= cutoffScore
* (1 << NUM_SOURCE_BIT
);
2584 minPositiveScore
= 1 << NUM_SOURCE_BIT
;
2585 if (cutoffScore
<= EARLY_DROPOFF_MIN
) {
2586 dropoffScoreShifted
= cutoffScoreShifted
;
2588 dropoffScoreShifted
= (cutoffScore
- (cutoffScore
- EARLY_DROPOFF_MIN
+ EARLY_DROPOFF_FACTOR
- 1) / EARLY_DROPOFF_FACTOR
) * (1 << NUM_SOURCE_BIT
) ;
2591 minAlignmentLength
= (cutoffScore
+ matchScore
- 1) / matchScore
;
2592 minNegAlignmentLength
= ((gapOpenScore
+ gapExtendScore
+ matchScore
+ 1 - minAlignmentLength
* matchScore
+ 1) / (gapOpenScore
+ gapExtendScore
+ matchScore
)) * 2 - 1;
2593 maxDPGroupingDist
= minAlignmentLength
+ minNegAlignmentLength
; // minimum length for an alignment to reached cutoff and then drop back to zero
2595 // maxSourceBit are initially assigned to cells
2596 maxSourceBit
= (1 << NUM_SOURCE_BIT
) - 1;
2597 // source bits are assigned when score >= sourceBitMinScore
2598 sourceBitMinScore
= (1 - gapOpenScore
- gapExtendScore
) * (1 << NUM_SOURCE_BIT
) + maxSourceBit
;
2600 numOfGappedHit
= 0; // Number of alignments
2602 firstTextHitBeingProcessed
= 0;
2604 // Clear last dropoff
2605 for (j
=0; j
<maxSourceBit
; j
++) {
2606 lastDropoffMaxScoreIndex
[j
] = 0;
2608 numOfDropoffMaxScore
= 0;
2613 while (firstTextHitBeingProcessed
< numOfTextHit
) {
2615 if (textList
[firstTextHitBeingProcessed
].posText
> textOffset
) {
2616 // textOffset will be larger than those processed
2617 textDbSeqIndex
= numOfTextSeq
- 1;
2618 textSeqStart
= textSeqOffset
[textDbSeqIndex
].startPos
;
2619 textSeqEnd
= textSeqOffset
[textDbSeqIndex
].endPos
;
2620 textAmbiguityIndex
= textSeqOffset
[textDbSeqIndex
].lastAmbiguityIndex
;
2623 textOffset
= textList
[firstTextHitBeingProcessed
].posText
; // posText points to the character right after the hit
2624 queryListInfo
= textList
[firstTextHitBeingProcessed
].info
;
2626 // Find corresponding DB sequence for the text
2627 while (textSeqOffset
[textDbSeqIndex
].startPos
>= textOffset
) {
2629 textSeqStart
= textSeqOffset
[textDbSeqIndex
].startPos
;
2630 textSeqEnd
= textSeqOffset
[textDbSeqIndex
].endPos
;
2631 textAmbiguityIndex
= textSeqOffset
[textDbSeqIndex
].lastAmbiguityIndex
;
2634 if (textOffset
% CHAR_PER_WORD
> 0) {
2636 // Decode text to the next word boundary
2637 charToProcess
= textOffset
% CHAR_PER_WORD
;
2638 c
= packedText
[textOffset
/ CHAR_PER_WORD
] >> (BITS_IN_WORD
- charToProcess
* BIT_PER_CHAR
);
2639 for (i
=charToProcess
; i
--;) { // from charToProcess - 1 to 0
2640 textBuffer
[i
] = (c
& 0x3);
2645 while (textAmbiguity
[textAmbiguityIndex
].startPos
>= textOffset
) {
2646 textAmbiguityIndex
--;
2648 if (textAmbiguity
[textAmbiguityIndex
].rightOfEndPos
> (textOffset
- charToProcess
)) {
2649 for (i
=0; i
<charToProcess
; i
++) {
2650 while (textAmbiguity
[textAmbiguityIndex
].startPos
>= (textOffset
- i
)) {
2651 textAmbiguityIndex
--;
2653 if (textAmbiguity
[textAmbiguityIndex
].rightOfEndPos
> (textOffset
- i
- 1)) {
2654 textBuffer
[(textOffset
- i
- 1) % CHAR_PER_WORD
] = textAmbiguity
[textAmbiguityIndex
].symbol
;
2662 lastDpCellIndex
= 0;
2666 tracebackIndexAdjustment
= 0;
2668 // Clear last dropoff
2669 for (j
=0; j
<usedSourceBit
; j
++) {
2670 lastDropoffMaxScoreIndex
[j
] = 0;
2672 numOfDropoffMaxScore
= 0;
2674 numOfTextHitBeingProcessed
= 0;
2677 while (textOffset
> textSeqOffset
[textDbSeqIndex
].startPos
&&
2678 (lastDpCellUsed
> 0 || textList
[firstTextHitBeingProcessed
+ numOfTextHitBeingProcessed
].posText
== textOffset
)) {
2680 if (textOffset
% CHAR_PER_WORD
== 0) {
2682 // Decode text to the next word boundary
2683 c
= packedText
[textOffset
/ CHAR_PER_WORD
- 1];
2684 for (i
=CHAR_PER_WORD
; i
--;) { // from CHAR_PER_WORD - 1 to 0
2685 textBuffer
[i
] = (c
& 0x3);
2690 while (textAmbiguity
[textAmbiguityIndex
].startPos
>= textOffset
) {
2691 textAmbiguityIndex
--;
2693 if (textAmbiguity
[textAmbiguityIndex
].rightOfEndPos
> (textOffset
- CHAR_PER_WORD
)) {
2694 for (i
=0; i
<CHAR_PER_WORD
; i
++) {
2695 while (textAmbiguity
[textAmbiguityIndex
].startPos
>= (textOffset
- i
)) {
2696 textAmbiguityIndex
--;
2698 if (textAmbiguity
[textAmbiguityIndex
].rightOfEndPos
> (textOffset
- i
- 1)) {
2699 textBuffer
[(textOffset
- i
- 1) % CHAR_PER_WORD
] = textAmbiguity
[textAmbiguityIndex
].symbol
;
2707 textChar
= textBuffer
[(textOffset
- 1) % CHAR_PER_WORD
];
2709 // Check if it is a valid starting position of alignment
2710 if (firstTextHitBeingProcessed
+ numOfTextHitBeingProcessed
> numOfTextHit
||
2711 textList
[firstTextHitBeingProcessed
+ numOfTextHitBeingProcessed
].posText
!= textOffset
||
2712 textOffset
+ maxDPGroupingDist
< textList
[firstTextHitBeingProcessed
].posText
) {
2713 // (numOfTextHitBeingProcessed > 0 && textList[firstTextHitBeingProcessed + numOfTextHitBeingProcessed].posText + maxDPSkip <
2714 // textList[firstTextHitBeingProcessed + numOfTextHitBeingProcessed - 1].posText)) {
2716 // Merge starting positions in query into the dpCell
2717 queryListInfo
= textList
[firstTextHitBeingProcessed
+ numOfTextHitBeingProcessed
].info
; // the group of query hits corresponding to the text hit
2718 q
= queryPosListIndex
[queryListInfo
]; // the startingindex of the group of query hits
2722 while ((q
< queryPosListIndex
[queryListInfo
+ 1] ||
2723 i
< lastDpCellUsed
) && dpCellUsed
< MAX_ALIGNMENT_LENGTH
) {
2724 while (i
< lastDpCellUsed
&& (q
>= queryPosListIndex
[queryListInfo
+ 1] || dpCell
[lastDpCellIndex
][i
].P
>= queryPosList
[q
]
2725 && dpCellUsed
< MAX_ALIGNMENT_LENGTH
)) {
2726 dpCell
[dpCellIndex
][dpCellUsed
] = dpCell
[lastDpCellIndex
][i
];
2730 if (q
< queryPosListIndex
[queryListInfo
+ 1] && i
> 0 && dpCell
[lastDpCellIndex
][i
-1].P
== queryPosList
[q
]) {
2731 // The cell is positive; ignore starting position in query
2734 while (q
< queryPosListIndex
[queryListInfo
+ 1] && dpCellUsed
< MAX_ALIGNMENT_LENGTH
&&
2735 (i
>= lastDpCellUsed
|| dpCell
[lastDpCellIndex
][i
].P
< queryPosList
[q
])) {
2736 // insert starting position in query
2737 if (queryPosList
[q
] > (unsigned int)queryPatternLength
) {
2738 fprintf(stderr
, "HSPDPDBvsQuery(): Query position is larger than query length!\n");
2741 dpCell
[dpCellIndex
][dpCellUsed
].P
= queryPosList
[q
]; // posText points to the character right after the hit
2742 dpCell
[dpCellIndex
][dpCellUsed
].B
= maxSourceBit
; // initial score for a blank cell
2743 dpCell
[dpCellIndex
][dpCellUsed
].I
= DP_NEG_INFINITY
; // insertion not allowed here
2745 tracebackIndexAdjustment
--;
2749 if (q
< queryPosListIndex
[queryListInfo
+ 1] || i
< lastDpCellUsed
) {
2750 // Not enough memory
2752 numOfTextHitBeingProcessed
++;
2753 dpCellIndex
^= 1; // Swap dpCells
2754 lastDpCellIndex
^= 1; // Swap dpCells
2755 lastDpCellUsed
= dpCellUsed
;
2759 D
= DP_NEG_INFINITY
;
2761 // Clear current max score
2762 usedSourceBit
= nextSourceBit
;
2763 for (j
=0; j
<usedSourceBit
; j
++) {
2764 currentMaxScore
[j
] = 0;
2769 dpCellUsedAdjustedForEdge
= FALSE
;
2770 lastQPos
= ALL_ONE_MASK
;
2772 // check traceback buffer
2773 if (tracebackIndex
+ MAX_ALIGNMENT_LENGTH
>= tracebackAllocated
) {
2774 fprintf(stderr
, "HSPDPDBvsQuery(): Not enough traceback buffer!\n");
2778 while (i
< lastDpCellUsed
) {
2783 if (dpCellUsed
+ 3 >= MAX_ALIGNMENT_LENGTH
) {
2784 // 1 cell at most give 3 positive cells
2785 fprintf(stderr
, "HSPDPDBvsQuery(): Not enough query buffer(1)!\n");
2789 if (dpCell
[lastDpCellIndex
][i
].B
== maxSourceBit
) {
2790 // the cell is inserted through input query positions
2791 tracebackIndexAdjustment
++;
2794 qPos
= dpCell
[lastDpCellIndex
][i
].P
- 1;
2796 if (qPos
!= lastQPos
- 1) {
2797 // The cell on the left is non-positive; handle the insertion + deletion only
2798 if (dpCell
[lastDpCellIndex
][i
].I
>= minPositiveScore
|| D
>= minPositiveScore
) {
2800 traceback
[tracebackIndex
].indexDiff
= dpCellUsed
- i
+ lastDpCellUsed
+ tracebackIndexAdjustment
; // indexDiff always point to the cell directly above logically
2801 traceback
[tracebackIndex
].textChar
= textChar
;
2803 if (dpCell
[lastDpCellIndex
][i
].I
>= minPositiveScore
&& dpCell
[lastDpCellIndex
][i
].I
== dpCell
[lastDpCellIndex
][i
].B
+ gapOpenScoreShifted
+ gapExtendScoreShifted
) {
2804 traceback
[tracebackIndex
].IOpen
= 1; // Gap opening
2806 traceback
[tracebackIndex
].IOpen
= 0; // Not gap opening
2808 if (D
>= minPositiveScore
&& D
== dpCell
[dpCellIndex
][dpCellUsed
-1].B
+ gapOpenScoreShifted
+ gapExtendScoreShifted
) {
2809 traceback
[tracebackIndex
].DOpen
= 1; // Gap opening
2811 traceback
[tracebackIndex
].DOpen
= 0; // Not gap opening
2814 if (dpCell
[lastDpCellIndex
][i
].I
>= D
) {
2815 dpCell
[dpCellIndex
][dpCellUsed
].B
= dpCell
[lastDpCellIndex
][i
].I
;
2816 traceback
[tracebackIndex
].alignment
= DP_INSERT
;
2818 dpCell
[dpCellIndex
][dpCellUsed
].B
= D
;
2819 traceback
[tracebackIndex
].alignment
= DP_DELETE
;
2823 dpCell
[dpCellIndex
][dpCellUsed
].I
= dpCell
[lastDpCellIndex
][i
].I
+ gapExtendScoreShifted
; // insert cannot follow delete
2824 D
= D
+ gapExtendScoreShifted
; // delete cannot follow insert
2825 dpCell
[dpCellIndex
][dpCellUsed
].P
= qPos
+ 1;
2830 qChar
= convertedKey
[qPos
];
2832 // DP - Match/mismatch
2833 M
= dpCell
[lastDpCellIndex
][i
].B
+ scoringMatrix
[textChar
][qChar
];
2835 if (M
>= sourceBitMinScore
&&
2836 (M
& SOURCE_BIT_MASK
) == maxSourceBit
) {
2837 // Assign source bits
2838 if (nextSourceBit
< maxSourceBit
) {
2839 M
+= nextSourceBit
- maxSourceBit
;
2840 maxScore
[nextSourceBit
].score
= 0;
2843 fprintf(stderr
, "HSPDPDBvsQuery(): Not enough source bit space!\n");
2847 // Store the insert and delete score before updating them
2848 if (i
+1 < lastDpCellUsed
&& dpCell
[lastDpCellIndex
][i
+1].P
== qPos
) {
2849 // insert score available
2850 insertScore
= dpCell
[lastDpCellIndex
][i
+1].I
;
2852 insertScore
= DP_NEG_INFINITY
;
2856 // Determine the best score
2857 if (M
>= insertScore
&& M
>= D
) {
2858 // match score is maximum
2859 dpCell
[dpCellIndex
][dpCellUsed
].B
= M
;
2860 if (M
+ gapOpenScoreShifted
>= insertScore
) {
2861 dpCell
[dpCellIndex
][dpCellUsed
].I
= M
+ gapOpenScoreShifted
+ gapExtendScoreShifted
;
2863 dpCell
[dpCellIndex
][dpCellUsed
].I
= insertScore
+ gapExtendScoreShifted
;
2865 if (M
+ gapOpenScoreShifted
>= D
) {
2866 D
= M
+ gapOpenScoreShifted
+ gapExtendScoreShifted
;
2868 D
= D
+ gapExtendScoreShifted
;
2870 traceback
[tracebackIndex
].alignment
= DP_MATCH_MISMATCH
;
2872 if (insertScore
>= D
) {
2873 dpCell
[dpCellIndex
][dpCellUsed
].B
= insertScore
;
2874 traceback
[tracebackIndex
].alignment
= DP_INSERT
;
2876 dpCell
[dpCellIndex
][dpCellUsed
].B
= D
;
2877 traceback
[tracebackIndex
].alignment
= DP_DELETE
;
2879 dpCell
[dpCellIndex
][dpCellUsed
].I
= insertScore
+ gapExtendScoreShifted
;
2880 D
= D
+ gapExtendScoreShifted
;
2883 // check if the best score is positive
2884 if (dpCell
[dpCellIndex
][dpCellUsed
].B
>= minPositiveScore
) {
2886 traceback
[tracebackIndex
].indexDiff
= dpCellUsed
- i
-1 + lastDpCellUsed
+ tracebackIndexAdjustment
; // indexDiff always point to the cell directly above logically
2887 traceback
[tracebackIndex
].textChar
= textChar
;
2888 traceback
[tracebackIndex
].queryChar
= qChar
;
2889 if (insertScore
>= minPositiveScore
&& insertScore
== dpCell
[lastDpCellIndex
][i
+1].B
+ gapOpenScoreShifted
+ gapExtendScoreShifted
) {
2890 traceback
[tracebackIndex
].IOpen
= 1; // Gap opening
2892 traceback
[tracebackIndex
].IOpen
= 0; // Not gap opening
2894 if (deleteScore
>= minPositiveScore
&& deleteScore
== dpCell
[dpCellIndex
][dpCellUsed
-1].B
+ gapOpenScoreShifted
+ gapExtendScoreShifted
) {
2895 traceback
[tracebackIndex
].DOpen
= 1; // Gap opening
2897 traceback
[tracebackIndex
].DOpen
= 0; // Not gap opening
2900 dpCell
[dpCellIndex
][dpCellUsed
].P
= qPos
;
2902 sourceBit
= dpCell
[dpCellIndex
][dpCellUsed
].B
& SOURCE_BIT_MASK
;
2903 if (dpCell
[dpCellIndex
][dpCellUsed
].B
> maxScore
[sourceBit
].score
) {
2904 maxScore
[sourceBit
].score
= dpCell
[dpCellIndex
][dpCellUsed
].B
;
2905 maxScore
[sourceBit
].tracebackIndex
= tracebackIndex
;
2906 maxScore
[sourceBit
].posText
= textOffset
- 1;
2907 maxScore
[sourceBit
].posQuery
= qPos
;
2909 if (sourceBit
< usedSourceBit
&& dpCell
[dpCellIndex
][dpCellUsed
].B
> currentMaxScore
[sourceBit
]) {
2910 currentMaxScore
[sourceBit
] = dpCell
[dpCellIndex
][dpCellUsed
].B
;
2920 if ((i
+1 >= lastDpCellUsed
|| dpCell
[lastDpCellIndex
][i
+1].P
< (qPos
-1)) && qPos
> 0 && D
>= minPositiveScore
) {
2922 // delete score is still positive;
2924 dpCell
[dpCellIndex
][dpCellUsed
].B
= D
;
2925 dpCell
[dpCellIndex
][dpCellUsed
].I
= DP_NEG_INFINITY
;
2926 dpCell
[dpCellIndex
][dpCellUsed
].P
= qPos
;
2927 D
= D
+ gapExtendScoreShifted
;
2929 traceback
[tracebackIndex
].indexDiff
= traceback
[tracebackIndex
-1].indexDiff
;
2930 traceback
[tracebackIndex
].textChar
= textChar
;
2931 traceback
[tracebackIndex
].IOpen
= 0; // Not gap opening
2932 if (dpCell
[dpCellIndex
][dpCellUsed
].B
== dpCell
[dpCellIndex
][dpCellUsed
-1].B
+ gapOpenScoreShifted
+ gapExtendScoreShifted
) {
2933 traceback
[tracebackIndex
].DOpen
= 1; // Gap opening
2935 traceback
[tracebackIndex
].DOpen
= 0; // Not gap opening
2937 traceback
[tracebackIndex
].alignment
= DP_DELETE
;
2943 // Reached the start of query
2944 D
= DP_NEG_INFINITY
;
2945 if (dpCellUsed
> 0 && dpCell
[dpCellIndex
][dpCellUsed
- 1].P
== 0) {
2946 // No need to process the cell on the start of query
2948 dpCellUsedAdjustedForEdge
= TRUE
;
2957 if (dpCellUsedAdjustedForEdge
) {
2958 tracebackIndexAdjustment
= 1;
2960 tracebackIndexAdjustment
= 0;
2963 // Add to dropoff list if maxScore > currentMaxScore + cutoff
2964 for (j
=0; j
<usedSourceBit
; j
++) {
2965 if (maxScore
[j
].score
>= currentMaxScore
[j
] + dropoffScoreShifted
&& lastDropoffMaxScoreIndex
[j
] != maxScore
[j
].tracebackIndex
) {
2966 if (numOfDropoffMaxScore
< DROPOFF_MAX_ENTRY
) {
2967 dropoffMaxScore
[numOfDropoffMaxScore
] = maxScore
[j
];
2968 lastDropoffMaxScoreIndex
[j
] = maxScore
[j
].tracebackIndex
;
2969 numOfDropoffMaxScore
++;
2971 fprintf(stderr
, "HSPDPDBvsQuery(): Not enough dropoff space!\n");
2976 dpCellIndex
^= 1; // Swap dpCells
2977 lastDpCellIndex
^= 1; // Swap dpCells
2978 lastDpCellUsed
= dpCellUsed
;
2984 // Move scores reached cutoff to front
2986 for (i
=0; i
<nextSourceBit
; i
++) {
2987 if (maxScore
[i
].score
>= cutoffScoreShifted
) {
2988 maxScore
[j
] = maxScore
[i
];
2994 // Append dropoff max score
2995 for (i
=0; i
<numOfDropoffMaxScore
; i
++) {
2996 if (nextSourceBit
< (1<<NUM_SOURCE_BIT
)) {
2997 for (j
=0; j
<nextSourceBit
; j
++) {
2998 if (maxScore
[j
].tracebackIndex
== dropoffMaxScore
[i
].tracebackIndex
) {
3002 if (j
>= nextSourceBit
) {
3003 maxScore
[nextSourceBit
] = dropoffMaxScore
[i
];
3007 fprintf(stderr
, "HSPDPDBvsQuery(): Not enough source bit for dropoff!\n");
3011 // Traceback alignments
3012 for (sourceBit
=0; sourceBit
<nextSourceBit
; sourceBit
++) {
3014 bestScore
= maxScore
[sourceBit
].score
;
3015 evalue
= stat_gapCalcEvalue(stat_gapNominal2normalized(bestScore
>> NUM_SOURCE_BIT
));
3016 if (evalue
<= maxEvalue
) {
3019 tempAlignmentIndex
= 0;
3020 tempAuxiliaryTextIndex
= 0;
3022 textOffset
= maxScore
[sourceBit
].posText
;
3023 qPos
= maxScore
[sourceBit
].posQuery
;
3025 tracebackIndex
= maxScore
[sourceBit
].tracebackIndex
;
3027 while (bestScore
>= minPositiveScore
) {
3029 if (traceback
[tracebackIndex
].alignment
== DP_MATCH_MISMATCH
) {
3030 if (tempAlignmentIndex
>= MAX_ALIGNMENT_LENGTH
) {
3031 fprintf(stderr
, "HSPDPDBvsQuery(): Not enough temp alignment buffer!\n");
3034 textChar
= traceback
[tracebackIndex
].textChar
;
3035 qChar
= traceback
[tracebackIndex
].queryChar
;
3036 bestScore
-= scoringMatrix
[textChar
][qChar
];
3037 if (textChar
== qChar
&& textChar
< 4) { // match and not ambiguity
3038 tempAlignment
[tempAlignmentIndex
] = ALIGN_MATCH
;
3040 tempAlignment
[tempAlignmentIndex
] = ALIGN_MISMATCH_AMBIGUITY
;
3041 tempAuxiliaryText
[tempAuxiliaryTextIndex
] = (char)traceback
[tracebackIndex
].textChar
; // store text character if mismatch or ambiguity
3042 tempAuxiliaryTextIndex
++;
3046 tracebackIndex
-= traceback
[tracebackIndex
].indexDiff
+ 1; // indexDiff always refer to the cell directly above logically
3047 } else if (traceback
[tracebackIndex
].alignment
== DP_INSERT
) {
3048 while (!traceback
[tracebackIndex
].IOpen
) {
3049 if (tempAlignmentIndex
>= MAX_ALIGNMENT_LENGTH
) {
3050 fprintf(stderr
, "HSPDPDBvsQuery(): Not enough temp alignment buffer!\n");
3053 bestScore
-= gapExtendScoreShifted
;
3054 tempAlignment
[tempAlignmentIndex
] = ALIGN_INSERT
;
3055 tempAlignmentIndex
++;
3056 tempAuxiliaryText
[tempAuxiliaryTextIndex
] = (char)traceback
[tracebackIndex
].textChar
;
3057 tempAuxiliaryTextIndex
++;
3058 tracebackIndex
-= traceback
[tracebackIndex
].indexDiff
; // indexDiff always refer to the cell directly above logically
3061 if (tempAlignmentIndex
>= MAX_ALIGNMENT_LENGTH
) {
3062 fprintf(stderr
, "HSPDPDBvsQuery(): Not enough temp alignment buffer!\n");
3065 bestScore
-= gapOpenScoreShifted
+ gapExtendScoreShifted
;
3066 tempAlignment
[tempAlignmentIndex
] = ALIGN_INSERT
;
3067 tempAuxiliaryText
[tempAuxiliaryTextIndex
] = (char)traceback
[tracebackIndex
].textChar
;
3068 tempAuxiliaryTextIndex
++;
3069 tracebackIndex
-= traceback
[tracebackIndex
].indexDiff
; // indexDiff always refer to the cell directly above logically
3072 while (!traceback
[tracebackIndex
].DOpen
) {
3073 if (tempAlignmentIndex
>= MAX_ALIGNMENT_LENGTH
) {
3074 fprintf(stderr
, "HSPDPDBvsQuery(): Not enough temp alignment buffer!\n");
3077 bestScore
-= gapExtendScoreShifted
;
3078 tempAlignment
[tempAlignmentIndex
] = ALIGN_DELETE
;
3079 tempAlignmentIndex
++;
3083 if (tempAlignmentIndex
>= MAX_ALIGNMENT_LENGTH
) {
3084 fprintf(stderr
, "HSPDPDBvsQuery(): Not enough temp alignment buffer!\n");
3087 bestScore
-= gapOpenScoreShifted
+ gapExtendScoreShifted
;
3088 tempAlignment
[tempAlignmentIndex
] = ALIGN_DELETE
;
3093 tempAlignmentIndex
++;
3097 if (numOfGappedHit
>= maxNumOfGappedHit
) {
3098 fprintf(stderr
, "HSPDPDBvsQuery(): Not enough gapped hit buffer!\n");
3101 gappedHitList
[numOfGappedHit
].posText
= maxScore
[sourceBit
].posText
;
3102 gappedHitList
[numOfGappedHit
].posQuery
= maxScore
[sourceBit
].posQuery
;
3103 gappedHitList
[numOfGappedHit
].lengthText
= textOffset
- maxScore
[sourceBit
].posText
;
3104 gappedHitList
[numOfGappedHit
].lengthQuery
= qPos
- maxScore
[sourceBit
].posQuery
;
3105 gappedHitList
[numOfGappedHit
].score
= maxScore
[sourceBit
].score
>> NUM_SOURCE_BIT
;
3106 gappedHitList
[numOfGappedHit
].dbSeqIndex
= textDbSeqIndex
;
3108 numOfAlignmentWord
= (tempAlignmentIndex
+ ALIGN_PER_WORD
- 1) / ALIGN_PER_WORD
;
3109 gappedHitList
[numOfGappedHit
].alignmentOffset
= MMPoolDispatchOffset(alignmentPool
, numOfAlignmentWord
* sizeof(unsigned int));
3110 alignment
= (unsigned int*)((char*)alignmentPool
+ gappedHitList
[numOfGappedHit
].alignmentOffset
);
3111 for (i
=0; i
<numOfAlignmentWord
; i
++) {
3115 for (i
=0; i
<tempAlignmentIndex
; i
++) {
3116 alignment
[i
/ALIGN_PER_WORD
] |= (tempAlignment
[i
] << (BITS_IN_WORD
- (i
% ALIGN_PER_WORD
+ 1) * ALIGN_BIT
));
3119 if (tempAuxiliaryTextIndex
> 0) {
3120 numOfAuxiliaryTextWord
= (tempAuxiliaryTextIndex
+ AUX_TEXT_PER_WORD
- 1) / AUX_TEXT_PER_WORD
;
3121 gappedHitList
[numOfGappedHit
].auxiliaryTextOffset
= MMPoolDispatchOffset(alignmentPool
, numOfAuxiliaryTextWord
* sizeof(unsigned int));
3122 auxiliaryText
= (unsigned int*)((char*)alignmentPool
+ gappedHitList
[numOfGappedHit
].auxiliaryTextOffset
);
3123 for (i
=0; i
<numOfAuxiliaryTextWord
; i
++) {
3124 auxiliaryText
[i
] = 0;
3126 for (i
=0; i
<tempAuxiliaryTextIndex
; i
++) {
3127 auxiliaryText
[i
/AUX_TEXT_PER_WORD
] |= (tempAuxiliaryText
[i
] << (BITS_IN_WORD
- (i
% AUX_TEXT_PER_WORD
+ 1) * AUX_TEXT_BIT
));
3130 gappedHitList
[numOfGappedHit
].auxiliaryTextOffset
= 0;
3138 firstTextHitBeingProcessed
+= numOfTextHitBeingProcessed
;
3142 // free working memory
3143 MMPoolReturn(mmPool
, traceback
, tracebackAllocated
* sizeof(int));
3145 MMPoolReturn(mmPool
, tempAuxiliaryText
, (MAX_ALIGNMENT_LENGTH
* 2) * sizeof(char));
3146 MMPoolReturn(mmPool
, tempAlignment
, (MAX_ALIGNMENT_LENGTH
* 2) * sizeof(char));
3148 MMPoolReturn(mmPool
, lastDropoffMaxScoreIndex
, (1 << NUM_SOURCE_BIT
) * sizeof(int));
3149 MMPoolReturn(mmPool
, currentMaxScore
, (1 << NUM_SOURCE_BIT
) * sizeof(int));
3151 MMPoolReturn(mmPool
, dropoffMaxScore
, DROPOFF_MAX_ENTRY
* sizeof(DPMaxScore
));
3152 MMPoolReturn(mmPool
, maxScore
, (1 << NUM_SOURCE_BIT
) * sizeof(DPMaxScore
));
3154 MMPoolReturn(mmPool
, dpCell
[0], MAX_ALIGNMENT_LENGTH
* sizeof(DPCell
));
3155 MMPoolReturn(mmPool
, dpCell
[1], MAX_ALIGNMENT_LENGTH
* sizeof(DPCell
));
3156 MMPoolReturn(mmPool
, dpCell
, 2 * sizeof(DPCell
*));
3158 return numOfGappedHit
;
3164 unsigned int HSPRemoveDuplicateUngappedHit(HitListWithPosQuery
* __restrict ungappedHitList
, const unsigned long long numberOfHit
) {
3166 unsigned long long i
, j
;
3168 QSort(ungappedHitList
, numberOfHit
, sizeof(HitListWithPosQuery
), HitListPosTextQueryOrder
);
3171 while (i
<numberOfHit
) {
3172 ungappedHitList
[j
].posText
= ungappedHitList
[i
].posText
;
3173 ungappedHitList
[j
].posQuery
= ungappedHitList
[i
].posQuery
;
3175 while (i
<numberOfHit
&& ungappedHitList
[i
].posText
== ungappedHitList
[j
].posText
3176 && ungappedHitList
[i
].posQuery
== ungappedHitList
[j
].posQuery
) {
3186 unsigned int HSPSplitCrossBoundaryGappedHit(GappedHitList
* __restrict gappedHitList
, const unsigned long long numberOfHit
, const unsigned int queryPatternLength
,
3187 const SeqOffset
*seqOffset
,
3188 const int matchScore
, const int cutoffScore
) {
3190 unsigned long long i
;
3191 unsigned int numberOfSplitHit
;
3192 unsigned long long dbSeqIndex
;
3193 unsigned long long splitDbSeqIndex
;
3194 unsigned long long minHitLength
;
3195 unsigned long long oldHitLength
;
3196 unsigned long long newHitLength
;
3197 unsigned long long hitEnd
;
3201 minHitLength
= (cutoffScore
+ matchScore
- 1) / matchScore
;
3203 QSort(gappedHitList
, numberOfHit
, sizeof(GappedHitList
), GappedHitListPosTextQueryLengthScoreOrder
);
3206 numberOfSplitHit
= 0;
3209 while (i
<numberOfHit
) {
3211 // Find corresponding DB sequence
3212 while (seqOffset
[dbSeqIndex
].endPos
< gappedHitList
[i
].posText
) {
3215 gappedHitList
[i
].dbSeqIndex
= dbSeqIndex
;
3216 diagonalPos
= (LONG
)gappedHitList
[i
].ungappedPosText
- (LONG
)gappedHitList
[i
].ungappedPosQuery
;
3217 oldHitLength
= gappedHitList
[i
].lengthText
;
3219 splitDbSeqIndex
= dbSeqIndex
+ 1;
3220 while (seqOffset
[splitDbSeqIndex
].startPos
<= gappedHitList
[i
].posText
+ gappedHitList
[i
].lengthText
- 1 - minHitLength
) {
3222 // the hit span across 2 db sequences; and the portion on the right > minimum hit length
3224 gappedHitList
[numberOfHit
+ numberOfSplitHit
].posText
= seqOffset
[splitDbSeqIndex
].startPos
;
3225 if (gappedHitList
[numberOfHit
+ numberOfSplitHit
].posText
- diagonalPos
< queryPatternLength
) {
3226 gappedHitList
[numberOfHit
+ numberOfSplitHit
].posQuery
= (unsigned int)((LONG
)gappedHitList
[numberOfHit
+ numberOfSplitHit
].posText
- diagonalPos
);
3228 gappedHitList
[numberOfHit
+ numberOfSplitHit
].posQuery
= queryPatternLength
- 1;
3230 hitEnd
= gappedHitList
[i
].posText
+ gappedHitList
[i
].lengthText
- 1;
3231 if (hitEnd
> seqOffset
[splitDbSeqIndex
].endPos
) {
3232 hitEnd
= seqOffset
[splitDbSeqIndex
].endPos
;
3234 newHitLength
= hitEnd
- gappedHitList
[numberOfHit
+ numberOfSplitHit
].posText
+ 1;
3235 gappedHitList
[numberOfHit
+ numberOfSplitHit
].lengthText
= newHitLength
;
3236 if (gappedHitList
[numberOfHit
+ numberOfSplitHit
].lengthQuery
>= oldHitLength
- newHitLength
) {
3237 gappedHitList
[numberOfHit
+ numberOfSplitHit
].lengthQuery
+= newHitLength
- oldHitLength
;
3239 gappedHitList
[numberOfHit
+ numberOfSplitHit
].lengthQuery
= 0;
3242 if (gappedHitList
[i
].ungappedPosText
< seqOffset
[splitDbSeqIndex
].startPos
) {
3243 gappedHitList
[numberOfHit
+ numberOfSplitHit
].ungappedPosText
= seqOffset
[splitDbSeqIndex
].startPos
;
3244 } else if (gappedHitList
[i
].ungappedPosText
> hitEnd
) {
3245 gappedHitList
[numberOfHit
+ numberOfSplitHit
].ungappedPosText
= hitEnd
;
3247 gappedHitList
[numberOfHit
+ numberOfSplitHit
].ungappedPosText
= gappedHitList
[i
].ungappedPosText
;
3249 if (gappedHitList
[numberOfHit
+ numberOfSplitHit
].ungappedPosText
< diagonalPos
) {
3250 gappedHitList
[numberOfHit
+ numberOfSplitHit
].ungappedPosQuery
= 0;
3251 } else if (gappedHitList
[numberOfHit
+ numberOfSplitHit
].ungappedPosText
- diagonalPos
>= queryPatternLength
) {
3252 gappedHitList
[numberOfHit
+ numberOfSplitHit
].ungappedPosQuery
= queryPatternLength
- 1;
3254 gappedHitList
[numberOfHit
+ numberOfSplitHit
].ungappedPosQuery
= (unsigned int)((LONG
)gappedHitList
[numberOfHit
+ numberOfSplitHit
].ungappedPosText
- diagonalPos
);
3257 if (gappedHitList
[i
].ungappedPosText
>= seqOffset
[splitDbSeqIndex
].startPos
&&
3258 gappedHitList
[i
].ungappedPosText
<= hitEnd
) {
3259 gappedHitList
[numberOfHit
+ numberOfSplitHit
].score
= gappedHitList
[i
].score
;
3261 gappedHitList
[numberOfHit
+ numberOfSplitHit
].score
= 0;
3263 gappedHitList
[numberOfHit
+ numberOfSplitHit
].dbSeqIndex
= splitDbSeqIndex
;
3270 hitEnd
= gappedHitList
[i
].posText
+ gappedHitList
[i
].lengthText
- 1;
3271 if (hitEnd
> seqOffset
[splitDbSeqIndex
].endPos
) {
3272 hitEnd
= seqOffset
[splitDbSeqIndex
].endPos
;
3274 if (gappedHitList
[i
].posText
< seqOffset
[dbSeqIndex
].startPos
) {
3275 gappedHitList
[i
].posText
= seqOffset
[dbSeqIndex
].startPos
;
3276 if (gappedHitList
[i
].posText
- diagonalPos
< queryPatternLength
) {
3277 gappedHitList
[i
].posQuery
= (int)((LONG
)gappedHitList
[i
].posText
- diagonalPos
);
3279 gappedHitList
[i
].posQuery
= queryPatternLength
- 1;
3282 newHitLength
= hitEnd
- gappedHitList
[i
].posText
+ 1;
3283 gappedHitList
[i
].lengthText
= newHitLength
;
3284 if (gappedHitList
[i
].lengthQuery
>= oldHitLength
- newHitLength
) {
3285 gappedHitList
[i
].lengthQuery
+= newHitLength
- oldHitLength
;
3287 gappedHitList
[i
].lengthQuery
= 0;
3290 if (gappedHitList
[i
].ungappedPosText
< seqOffset
[dbSeqIndex
].startPos
) {
3291 gappedHitList
[i
].ungappedPosText
= seqOffset
[dbSeqIndex
].startPos
;
3292 } else if (gappedHitList
[i
].ungappedPosText
> hitEnd
) {
3293 gappedHitList
[i
].ungappedPosText
= hitEnd
;
3295 gappedHitList
[i
].ungappedPosText
= gappedHitList
[i
].ungappedPosText
;
3297 if (gappedHitList
[i
].ungappedPosText
< diagonalPos
) {
3298 gappedHitList
[i
].ungappedPosQuery
= 0;
3299 } else if (gappedHitList
[i
].ungappedPosText
- diagonalPos
>= queryPatternLength
) {
3300 gappedHitList
[i
].ungappedPosQuery
= queryPatternLength
- 1;
3302 gappedHitList
[i
].ungappedPosQuery
= (int)((LONG
)gappedHitList
[i
].ungappedPosText
- diagonalPos
);
3305 if (gappedHitList
[i
].ungappedPosText
>= seqOffset
[dbSeqIndex
].startPos
&&
3306 gappedHitList
[i
].ungappedPosText
<= hitEnd
) {
3307 gappedHitList
[i
].score
= gappedHitList
[i
].score
;
3309 gappedHitList
[i
].score
= 0;
3311 gappedHitList
[i
].dbSeqIndex
= dbSeqIndex
;
3313 if (newHitLength
>= minHitLength
) {
3319 return numberOfHit
+ numberOfSplitHit
;
3323 unsigned int HSPRemoveDuplicateGappedHit(GappedHitList
* __restrict gappedHitList
, const unsigned long long numberOfHit
) {
3325 unsigned long long i
, j
;
3327 QSort(gappedHitList
, numberOfHit
, sizeof(GappedHitList
), GappedHitListPosTextQueryLengthScoreOrder
);
3330 while (i
<numberOfHit
) {
3331 gappedHitList
[j
].posText
= gappedHitList
[i
].posText
;
3332 gappedHitList
[j
].posQuery
= gappedHitList
[i
].posQuery
;
3333 gappedHitList
[j
].lengthText
= gappedHitList
[i
].lengthText
;
3334 gappedHitList
[j
].lengthQuery
= gappedHitList
[i
].lengthQuery
;
3335 gappedHitList
[j
].score
= gappedHitList
[i
].score
;
3336 gappedHitList
[j
].dbSeqIndex
= gappedHitList
[i
].dbSeqIndex
;
3337 gappedHitList
[j
].ungappedPosText
= gappedHitList
[i
].ungappedPosText
;
3338 gappedHitList
[j
].ungappedPosQuery
= gappedHitList
[i
].ungappedPosQuery
;
3340 while (i
<numberOfHit
&& gappedHitList
[i
].posText
== gappedHitList
[j
].posText
3341 && gappedHitList
[i
].posQuery
== gappedHitList
[j
].posQuery
3342 && gappedHitList
[i
].lengthText
== gappedHitList
[j
].lengthText
3343 && gappedHitList
[i
].lengthQuery
== gappedHitList
[j
].lengthQuery
) {
3353 // alignment and auxiliary text must be allocated through MMPool
3354 // otherwise the pointer will be lost without freeing the memory
3355 unsigned int HSPFinalFilter(GappedHitListWithAlignment
* __restrict gappedHitList
, const unsigned int numberOfHit
) {
3357 unsigned long long i
, j
;
3358 unsigned int hitRemaining
;
3360 hitRemaining
= numberOfHit
;
3362 // First filter among hits with common starting points the longer and lower scoring hits
3364 QSort(gappedHitList
, hitRemaining
, sizeof(GappedHitList
), GappedHitListPosTextQueryScoreLengthOrder
);
3367 while (j
<hitRemaining
) {
3368 if (gappedHitList
[j
].alignmentOffset
!= 0) { // not marked as discarded
3370 while (i
<hitRemaining
&& gappedHitList
[i
].posText
== gappedHitList
[j
].posText
3371 && gappedHitList
[i
].posQuery
== gappedHitList
[j
].posQuery
) {
3372 if (gappedHitList
[i
].alignmentOffset
!= 0) {
3373 if (gappedHitList
[i
].lengthText
>= gappedHitList
[j
].lengthText
&&
3374 gappedHitList
[i
].lengthQuery
>= gappedHitList
[j
].lengthQuery
&&
3375 gappedHitList
[i
].score
<= gappedHitList
[j
].score
) {
3377 gappedHitList
[i
].alignmentOffset
= 0;
3386 // Pack retained hit to the beginning
3388 while (i
<hitRemaining
) {
3389 if (gappedHitList
[i
].alignmentOffset
!= 0) { // not marked as discarded
3390 gappedHitList
[j
].posText
= gappedHitList
[i
].posText
;
3391 gappedHitList
[j
].posQuery
= gappedHitList
[i
].posQuery
;
3392 gappedHitList
[j
].lengthText
= gappedHitList
[i
].lengthText
;
3393 gappedHitList
[j
].lengthQuery
= gappedHitList
[i
].lengthQuery
;
3394 gappedHitList
[j
].score
= gappedHitList
[i
].score
;
3395 gappedHitList
[j
].dbSeqIndex
= gappedHitList
[i
].dbSeqIndex
;
3396 gappedHitList
[j
].alignmentOffset
= gappedHitList
[i
].alignmentOffset
;
3397 gappedHitList
[j
].auxiliaryTextOffset
= gappedHitList
[i
].auxiliaryTextOffset
;
3405 // Second filter among hits with common ending points the longer and lower scoring hits
3407 QSort(gappedHitList
, hitRemaining
, sizeof(GappedHitList
), GappedHitListEndTextQueryScoreLengthOrder
);
3410 while (j
<hitRemaining
) {
3411 if (gappedHitList
[j
].alignmentOffset
!= 0) { // not marked as discarded
3413 while (i
<hitRemaining
&& gappedHitList
[i
].posText
+ gappedHitList
[i
].lengthText
== gappedHitList
[j
].posText
+ gappedHitList
[j
].lengthText
3414 && gappedHitList
[i
].posQuery
+ gappedHitList
[i
].lengthQuery
== gappedHitList
[j
].posQuery
+ gappedHitList
[j
].lengthQuery
) {
3415 if (gappedHitList
[i
].alignmentOffset
!= 0) {
3416 if (gappedHitList
[i
].lengthText
>= gappedHitList
[j
].lengthText
&&
3417 gappedHitList
[i
].lengthQuery
>= gappedHitList
[j
].lengthQuery
&&
3418 gappedHitList
[i
].score
<= gappedHitList
[j
].score
) {
3420 gappedHitList
[i
].alignmentOffset
= 0;
3429 // Pack retained hit to the beginning
3431 while (i
<hitRemaining
) {
3432 if (gappedHitList
[i
].alignmentOffset
!= 0) { // not marked as discarded
3433 gappedHitList
[j
].posText
= gappedHitList
[i
].posText
;
3434 gappedHitList
[j
].posQuery
= gappedHitList
[i
].posQuery
;
3435 gappedHitList
[j
].lengthText
= gappedHitList
[i
].lengthText
;
3436 gappedHitList
[j
].lengthQuery
= gappedHitList
[i
].lengthQuery
;
3437 gappedHitList
[j
].score
= gappedHitList
[i
].score
;
3438 gappedHitList
[j
].dbSeqIndex
= gappedHitList
[i
].dbSeqIndex
;
3439 gappedHitList
[j
].alignmentOffset
= gappedHitList
[i
].alignmentOffset
;
3440 gappedHitList
[j
].auxiliaryTextOffset
= gappedHitList
[i
].auxiliaryTextOffset
;
3448 // Third filter hits with common start points
3450 QSort(gappedHitList
, hitRemaining
, sizeof(GappedHitList
), GappedHitListPosTextQueryScoreLengthOrder
);
3453 while (i
<hitRemaining
) {
3454 gappedHitList
[j
].posText
= gappedHitList
[i
].posText
;
3455 gappedHitList
[j
].posQuery
= gappedHitList
[i
].posQuery
;
3456 gappedHitList
[j
].lengthText
= gappedHitList
[i
].lengthText
;
3457 gappedHitList
[j
].lengthQuery
= gappedHitList
[i
].lengthQuery
;
3458 gappedHitList
[j
].score
= gappedHitList
[i
].score
;
3459 gappedHitList
[j
].dbSeqIndex
= gappedHitList
[i
].dbSeqIndex
;
3460 gappedHitList
[j
].alignmentOffset
= gappedHitList
[i
].alignmentOffset
;
3461 gappedHitList
[j
].auxiliaryTextOffset
= gappedHitList
[i
].auxiliaryTextOffset
;
3463 while (i
<hitRemaining
&& gappedHitList
[i
].posText
== gappedHitList
[j
].posText
3464 && gappedHitList
[i
].posQuery
== gappedHitList
[j
].posQuery
) {
3466 gappedHitList
[i
].alignmentOffset
= 0;
3474 // Fourth filter hits with common end points
3476 QSort(gappedHitList
, hitRemaining
, sizeof(GappedHitListWithAlignment
), GappedHitListEndTextQueryScoreLengthOrder
);
3479 while (i
<hitRemaining
) {
3480 gappedHitList
[j
].posText
= gappedHitList
[i
].posText
;
3481 gappedHitList
[j
].posQuery
= gappedHitList
[i
].posQuery
;
3482 gappedHitList
[j
].lengthText
= gappedHitList
[i
].lengthText
;
3483 gappedHitList
[j
].lengthQuery
= gappedHitList
[i
].lengthQuery
;
3484 gappedHitList
[j
].score
= gappedHitList
[i
].score
;
3485 gappedHitList
[j
].dbSeqIndex
= gappedHitList
[i
].dbSeqIndex
;
3486 gappedHitList
[j
].alignmentOffset
= gappedHitList
[i
].alignmentOffset
;
3487 gappedHitList
[j
].auxiliaryTextOffset
= gappedHitList
[i
].auxiliaryTextOffset
;
3489 while (i
<hitRemaining
&& gappedHitList
[i
].posText
+ gappedHitList
[i
].lengthText
== gappedHitList
[j
].posText
+ gappedHitList
[j
].lengthText
3490 && gappedHitList
[i
].posQuery
+ gappedHitList
[i
].lengthQuery
== gappedHitList
[j
].posQuery
+ gappedHitList
[j
].lengthQuery
) {
3492 gappedHitList
[i
].alignmentOffset
= 0;
3500 // Fifth filter enclosed hits with lower scores
3502 QSort(gappedHitList
, hitRemaining
, sizeof(GappedHitList
), GappedHitListEnclosedScoreOrder
);
3505 while (j
<hitRemaining
) {
3506 if (gappedHitList
[j
].alignmentOffset
!= 0) { // not marked as discarded
3508 while (i
<hitRemaining
&& gappedHitList
[i
].posText
< gappedHitList
[j
].posText
+ gappedHitList
[j
].lengthText
) {
3509 if (gappedHitList
[i
].alignmentOffset
!= 0) {
3510 if (gappedHitList
[i
].posText
+ gappedHitList
[i
].lengthText
<= gappedHitList
[j
].posText
+ gappedHitList
[j
].lengthText
&&
3511 gappedHitList
[i
].posQuery
>= gappedHitList
[j
].posQuery
&&
3512 gappedHitList
[i
].posQuery
+ gappedHitList
[i
].lengthQuery
<= gappedHitList
[j
].posQuery
+ gappedHitList
[j
].lengthQuery
&&
3513 gappedHitList
[i
].score
<= gappedHitList
[j
].score
) {
3515 gappedHitList
[i
].alignmentOffset
= 0;
3524 // Pack retained hit to the beginning
3526 while (i
<hitRemaining
) {
3527 if (gappedHitList
[i
].alignmentOffset
!= 0) { // not marked as discarded
3528 gappedHitList
[j
].posText
= gappedHitList
[i
].posText
;
3529 gappedHitList
[j
].posQuery
= gappedHitList
[i
].posQuery
;
3530 gappedHitList
[j
].lengthText
= gappedHitList
[i
].lengthText
;
3531 gappedHitList
[j
].lengthQuery
= gappedHitList
[i
].lengthQuery
;
3532 gappedHitList
[j
].score
= gappedHitList
[i
].score
;
3533 gappedHitList
[j
].dbSeqIndex
= gappedHitList
[i
].dbSeqIndex
;
3534 gappedHitList
[j
].alignmentOffset
= gappedHitList
[i
].alignmentOffset
;
3535 gappedHitList
[j
].auxiliaryTextOffset
= gappedHitList
[i
].auxiliaryTextOffset
;
3543 return hitRemaining
;
3547 HSPUngappedExtLookupTable
*HSPGenUngappedExtLookupTable(MMPool
*mmPool
, const int matchScore
, const int mismatchScore
) {
3549 HSPUngappedExtLookupTable
*hspUngappedExtLookupTable
;
3551 hspUngappedExtLookupTable
= (HSPUngappedExtLookupTable
*) MMPoolDispatch(mmPool
, (1 << 16) * sizeof(HSPUngappedExtLookupTable
));
3553 HSPCalUngappedExtLookupTable(hspUngappedExtLookupTable
, matchScore
, mismatchScore
);
3555 return hspUngappedExtLookupTable
;
3558 void HSPCalUngappedExtLookupTable(HSPUngappedExtLookupTable
*hspUngappedExtLookupTable
, const int matchScore
, const int mismatchScore
) {
3560 static const unsigned int oddBitMask
= 0x55555555;
3561 static const unsigned int evenBitMask
= 0xAAAAAAAA;
3564 char maxScore
, maxScorePos
, finalScore
;
3566 // In entries where even bit = match/mismatch, odd bit = 0 -> matchMismatchBitVector is even bit compacted (used for forward extension)
3567 // In entries where odd bit = match/mismatch, even bit = 0 -> matchMismatchBitVector is odd bit compacted and then reversed (used for backward extension)
3568 // When all bits are 0, both above case can be handled without problem
3569 hspUngappedExtLookupTable
[0].matchMismatchBitVector
= 0;
3570 for (i
=1; i
<(1<<16); i
++) {
3571 hspUngappedExtLookupTable
[i
].matchMismatchBitVector
= 0;
3572 if ((i
& oddBitMask
) == 0) {
3573 for (j
=0; j
<8; j
++) {
3574 hspUngappedExtLookupTable
[i
].matchMismatchBitVector
|= ((i
>> (15-j
*2)) & 1) << (7-j
);
3577 if ((i
& evenBitMask
) == 0) {
3578 for (j
=0; j
<8; j
++) {
3579 hspUngappedExtLookupTable
[i
].matchMismatchBitVector
|= ((i
>> (14-j
*2)) & 1) << j
;
3582 // entries where both odd and even bits contain 1 are not used
3585 // For each 16 bit match/mismatch vector, calculate max score, position giving max score, and final score
3586 // The first match/mismatch is in the leftmost bit of the 16 bit vector
3587 // 0 is a match and 1 is a mismatch (from result of xor)
3588 for (i
=0; i
<(1<<16); i
++) {
3591 maxScorePos
= 0; // 0 -> before the 1st character; 16 -> after the 16th character
3594 for (j
=0; j
<16; j
++) {
3595 if (((i
>> (15-j
)) & 1) == 0) {
3597 finalScore
= finalScore
+ (char)matchScore
;
3598 // favor longer hit if scores are the same
3599 if (finalScore
>= maxScore
) {
3600 maxScore
= finalScore
;
3601 maxScorePos
= (char)(j
+1);
3605 finalScore
= finalScore
+ (char)mismatchScore
;
3609 hspUngappedExtLookupTable
[i
].maxScore
= maxScore
;
3610 hspUngappedExtLookupTable
[i
].maxScorePos
= maxScorePos
;
3611 hspUngappedExtLookupTable
[i
].finalScore
= finalScore
;
3616 void HSPFreeUngappedExtLookupTable(MMPool
*mmPool
, HSPUngappedExtLookupTable
* hspUngappedExtLookupTable
) {
3618 MMPoolReturn(mmPool
, hspUngappedExtLookupTable
, (1 << 16) * sizeof(HSPUngappedExtLookupTable
));
3622 Histogram
* HSPAllocateHistogram(double minEvalue
, double maxEvalue
) {
3624 Histogram
*histogram
;
3626 histogram
= (Histogram
*) MMUnitAllocate(sizeof(Histogram
));
3627 histogram
->minEvalue
= minEvalue
;
3628 histogram
->maxEvalue
= maxEvalue
;
3629 histogram
->histogramSize
= (int)(ceil(log10(maxEvalue
)) - floor(log10(minEvalue
))) + 1;
3630 histogram
->count
= (int*) MMUnitAllocate(histogram
->histogramSize
* sizeof(int));
3636 void HSPInitializeHistogram(Histogram
* histogram
) {
3640 for (i
=0; i
<histogram
->histogramSize
; i
++) {
3641 histogram
->count
[i
] = 0;
3646 void HSPFreeHistogram(Histogram
* histogram
) {
3648 MMUnitFree(histogram
->count
, histogram
->histogramSize
* sizeof(int));
3649 MMUnitFree(histogram
, histogram
->histogramSize
* sizeof(int));
3653 void HSPCountEvalueToHistogram(Histogram
* histogram
, const GappedHitListWithAlignment
* gappedHitList
, const int numberOfHit
, const int roundEvalue
) {
3659 for (i
=0; i
<numberOfHit
; i
++) {
3661 if (roundEvalue
== TRUE
) {
3662 tempEvalue
= HSPCalculateAndPrintEValue(NULL
, gappedHitList
[i
].score
);
3664 tempEvalue
= stat_gapCalcEvalue(stat_gapNominal2normalized(gappedHitList
[i
].score
));
3666 if (tempEvalue
== 0.0) {
3667 tempEvalue
= histogram
->minEvalue
;
3669 evalueIndex
= (int)ceil(log10(tempEvalue
) - floor(log10(histogram
->minEvalue
)));
3670 if (evalueIndex
< 0) {
3673 if (evalueIndex
< histogram
->histogramSize
) {
3674 histogram
->count
[evalueIndex
]++;
3681 void HSPPrintHistogram(FILE * outputFile
, Histogram
* histogram
) {
3686 Socketfprintf(outputFile
,"\nCount histogram of gapped extension by expectation value:\n");
3688 tempEvalue
= pow(10, (int)floor(log10(histogram
->minEvalue
)));
3689 Socketfprintf(outputFile
,"( 0,%6.2g] : %d\n", tempEvalue
, histogram
->count
[0]);
3691 for (i
=1; i
<histogram
->histogramSize
; i
++) {
3692 tempEvalue
= pow(10, i
- 1 + (int)log10(histogram
->minEvalue
));
3693 Socketfprintf(outputFile
,"(%6.2g,%6.2g] : %d\n", tempEvalue
, tempEvalue
*10, histogram
->count
[i
]);
3698 void HSPPrintHeader(FILE *outputFile
, const int outputFormat
,
3699 const char* databaseName
, const int numOfSeq
, const unsigned long long databaseLength
) {
3701 switch (outputFormat
) {
3703 case OUTPUT_PAIRWISE
:
3705 fprintf(outputFile
, "Database : %s\n", databaseName
);
3706 fprintf(outputFile
, "No. of sequences : %d\n", numOfSeq
);
3707 fprintf(outputFile
, "No. of letters : %llu\n", databaseLength
);
3708 fprintf(outputFile
, "\n");
3712 case OUTPUT_TABULAR
:
3716 case OUTPUT_TABULAR_COMMENT
:
3718 fprintf(outputFile
, "# Database : %s\n", databaseName
);
3719 fprintf(outputFile
, "# No. of sequences : %d\n", numOfSeq
);
3720 fprintf(outputFile
, "# No. of letters : %llu\n", databaseLength
);
3721 fprintf(outputFile
, "#\n");
3729 void HSPPrintInvalidReadTrailer(FILE *outputFile
, const int outputFormat
,
3730 const char* databaseName
, const int numOfSeq
, const unsigned long long databaseLength
) {
3732 switch (outputFormat
) {
3734 case OUTPUT_PAIRWISE
:
3736 fprintf(outputFile
, "Database : %s\n", databaseName
);
3737 fprintf(outputFile
, "No. of sequences : %d\n", numOfSeq
);
3738 fprintf(outputFile
, "No. of letters : %llu\n", databaseLength
);
3739 fprintf(outputFile
, "\n\n");
3740 fprintf(outputFile
,"Lambda K H\n");
3741 fprintf(outputFile
,"%10.2f %10.2f %10.2f\n", (double)-1, (double)-1, (double)-1);
3742 fprintf(outputFile
,"Gapped\n");
3743 fprintf(outputFile
,"Lambda K H\n");
3744 fprintf(outputFile
,"%10.2f %10.2f %10.2f\n\n", (double)-1, (double)-1, (double)-1);
3748 case OUTPUT_TABULAR
:
3752 case OUTPUT_TABULAR_COMMENT
:
3759 void HSPPrintTrailer(FILE *outputFile
, const int outputFormat
,
3760 const char* databaseName
, const int numOfSeq
, const unsigned long long databaseLength
) {
3762 switch (outputFormat
) {
3764 case OUTPUT_PAIRWISE
:
3766 fprintf(outputFile
, "Database : %s\n", databaseName
);
3767 fprintf(outputFile
, "No. of sequences : %d\n", numOfSeq
);
3768 fprintf(outputFile
, "No. of letters : %llu\n", databaseLength
);
3769 fprintf(outputFile
, "\n\n");
3771 printHSPstatistic(outputFile
);
3772 fprintf(outputFile
, "\n");
3776 case OUTPUT_TABULAR
:
3780 case OUTPUT_TABULAR_COMMENT
:
3788 void HSPPrintQueryHeader(FILE *outputFile
, const int outputFormat
,
3789 const char* queryPatternName
, const int queryPatternLength
,
3790 const int *dbOrder
, const int *dbScore
, const Annotation
*annotation
, const int numOfSeq
) {
3794 char stringFormat
[8] = "%00.00s";
3797 switch (outputFormat
) {
3799 case OUTPUT_PAIRWISE
:
3801 fprintf(outputFile
, "Query : %s\n", queryPatternName
);
3802 fprintf(outputFile
, "No. of letters : %u\n", queryPatternLength
);
3804 fprintf(outputFile
, "\n\n");
3805 fprintf(outputFile
, " Score E\n");
3806 fprintf(outputFile
, "Sequences producing significant alignments: (bits) Value\n");
3807 fprintf(outputFile
, "\n");
3810 for (i
=0; i
<numOfSeq
&& dbScore
[dbOrder
[i
]] > 0; i
++) {
3811 charPrinted
= 0; anything
= 1;
3812 if (annotation
[dbOrder
[i
]].gi
> 0) {
3813 charPrinted
+= fprintf(outputFile
, "gi|%d|", annotation
[dbOrder
[i
]].gi
);
3815 stringFormat
[1] = stringFormat
[4] = (char)('0' + (64 - charPrinted
) / 10);
3816 stringFormat
[2] = stringFormat
[5] = (char)('0' + (64 - charPrinted
) % 10);
3817 fprintf(outputFile
, stringFormat
, annotation
[dbOrder
[i
]].text
);
3818 fprintf(outputFile
, " ");
3819 HSPCalculateAndPrintBitScore(outputFile
, dbScore
[dbOrder
[i
]]);
3820 fprintf(outputFile
, " ");
3821 HSPCalculateAndPrintEValue(outputFile
, dbScore
[dbOrder
[i
]]);
3822 fprintf(outputFile
, "\n");
3824 if (anything
) fprintf(outputFile
, "\n\n");
3828 case OUTPUT_TABULAR
:
3832 case OUTPUT_TABULAR_COMMENT
:
3834 fprintf(outputFile
, "# Query : %s\n", queryPatternName
);
3835 fprintf(outputFile
, "# Query id, Subject id, %% identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score\n");
3841 void HSPPrintAlignment(MMPool
*mmPool
, FILE *outputFile
, MMPool
*alignmentPool
, const GappedHitListWithAlignment
* gappedHitList
, const int numOfHit
,
3842 int outputFormat
, const ContextInfo
* contextInfo
,
3843 const unsigned char *charMap
, const unsigned char *complementMap
,
3844 const char* queryPatternName
, const unsigned char* convertedQueryPattern
, const int queryPatternLength
,
3845 const int * dbOrder
, const HSP
*hsp
) {
3848 int lastDbSeqIndex
, dbSeqIndex
, contextNum
;
3849 char* __restrict tempDbChar
;
3850 char* __restrict tempQueryChar
;
3852 char queryName
[MAX_SEQ_NAME_LENGTH
+1];
3853 char seqName
[MAX_SEQ_NAME_LENGTH
+1];
3855 int matchMatrix
[DNA_CHAR_SIZE
][DNA_CHAR_SIZE
];
3857 unsigned long long queryIndex
, dbIndex
;
3858 int totalLength
, auxiliaryTextIndex
;
3862 int alignmentCharPrinted
;
3863 int textPosNumOfChar
;
3864 char positionFormat
[6] = "%-00u";
3866 char lowercase
[256];
3868 int numOfMatch
, numOfMismatch
, numOfSpace
, numOfGap
;
3870 unsigned int* __restrict alignment
;
3871 unsigned int* __restrict auxiliaryText
;
3874 tempDbChar
= (char*) MMPoolDispatch(mmPool
, MAX_ALIGNMENT_LENGTH
+ 1);
3875 tempQueryChar
= (char*) MMPoolDispatch(mmPool
, MAX_ALIGNMENT_LENGTH
+ 1);
3877 // duplicate query name
3878 strncpy(queryName
, queryPatternName
, MAX_SEQ_NAME_LENGTH
+1);
3880 // take the first white space as end of pattern name
3881 for (i
=0; i
<MAX_SEQ_NAME_LENGTH
; i
++) {
3882 if (queryName
[i
] == ' ' || queryName
[i
] == '\0' || queryName
[i
] == '\t' || queryName
[i
] == '\n') {
3883 queryName
[i
] = '\0';
3888 HSPFillScoringMatrix(matchMatrix
, 1, -1, 0);
3890 // initialize lowercase
3891 for (i
=0; i
<256; i
++) {
3892 lowercase
[i
] = (char)i
;
3894 for (i
=0; i
<26; i
++) {
3895 lowercase
['A' + i
] = 'a' + (char)i
;
3899 lastDbSeqIndex
= -1;
3900 for (i
=0; i
<numOfHit
; i
++) {
3902 if (gappedHitList
[i
].lengthQuery
> MAX_ALIGNMENT_LENGTH
|| gappedHitList
[i
].lengthText
> MAX_ALIGNMENT_LENGTH
) {
3903 fprintf(stderr
, "HSPPrintAlignment(): Alignment length > maximum!\n");
3907 dbSeqIndex
= dbOrder
[gappedHitList
[i
].dbSeqIndex
& CONTEXT_MASK
];
3908 contextNum
= gappedHitList
[i
].dbSeqIndex
>> (BITS_IN_WORD
- CONTEXT_BIT
);
3910 if (dbSeqIndex
!= lastDbSeqIndex
) {
3912 switch(outputFormat
) {
3913 case OUTPUT_PAIRWISE
:
3915 // Print header for sequence
3916 if (hsp
->annotation
[dbSeqIndex
].gi
> 0) {
3917 fprintf(outputFile
, ">gi|%d|%s", hsp
->annotation
[dbSeqIndex
].gi
, hsp
->annotation
[dbSeqIndex
].text
);
3919 fprintf(outputFile
, ">%s", hsp
->annotation
[dbSeqIndex
].text
);
3921 fprintf(outputFile
, "No. of letters : %llu\n", hsp
->seqOffset
[dbSeqIndex
].endPos
- hsp
->seqOffset
[dbSeqIndex
].startPos
+ 1);
3922 fprintf(outputFile
, "\n");
3926 case OUTPUT_TABULAR
:
3927 case OUTPUT_TABULAR_COMMENT
:
3929 // duplicate sequence name
3930 strncpy(seqName
, hsp
->annotation
[dbSeqIndex
].text
, MAX_SEQ_NAME_LENGTH
+1);
3931 // take the first white space as end of pattern name
3932 for (j
=0; j
<MAX_SEQ_NAME_LENGTH
; j
++) {
3933 if (seqName
[j
] == ' ' || seqName
[j
] == '\0' || seqName
[j
] == '\t' || seqName
[j
] == '\n') {
3944 lastDbSeqIndex
= dbSeqIndex
;
3946 // Decode alignment and DB characters
3950 auxiliaryTextIndex
= 0;
3958 alignment
= (unsigned int*)((char*)alignmentPool
+ gappedHitList
[i
].alignmentOffset
);
3959 auxiliaryText
= (unsigned int*)((char*)alignmentPool
+ gappedHitList
[i
].auxiliaryTextOffset
);
3961 while (queryIndex
< gappedHitList
[i
].lengthQuery
) {
3962 if (totalLength
% ALIGN_PER_WORD
== 0) {
3963 c
= alignment
[totalLength
/ ALIGN_PER_WORD
];
3965 a
= (char)(c
>> (BITS_IN_WORD
- ALIGN_BIT
));
3969 if (contextInfo
[contextNum
].reversed
) {
3970 tempQueryChar
[totalLength
] = dnaChar
[convertedQueryPattern
[queryPatternLength
- 1 - queryIndex
- gappedHitList
[i
].posQuery
]];
3972 tempQueryChar
[totalLength
] = dnaChar
[convertedQueryPattern
[queryIndex
+ gappedHitList
[i
].posQuery
]];
3974 tempDbChar
[totalLength
] = tempQueryChar
[totalLength
];
3978 case ALIGN_MISMATCH_AMBIGUITY
:
3979 tempDbChar
[totalLength
] = dnaChar
[(auxiliaryText
[auxiliaryTextIndex
/ AUX_TEXT_PER_WORD
] << ((auxiliaryTextIndex
% AUX_TEXT_PER_WORD
) * AUX_TEXT_BIT
) >> (BITS_IN_WORD
- AUX_TEXT_BIT
))];
3980 if (contextInfo
[contextNum
].reversed
) {
3981 tempDbChar
[totalLength
] = complementMap
[tempDbChar
[totalLength
]];
3982 tempQueryChar
[totalLength
] = dnaChar
[convertedQueryPattern
[queryPatternLength
- 1 - queryIndex
- gappedHitList
[i
].posQuery
]];
3984 tempQueryChar
[totalLength
] = dnaChar
[convertedQueryPattern
[queryIndex
+ gappedHitList
[i
].posQuery
]];
3986 if (matchMatrix
[charMap
[tempDbChar
[totalLength
]]][charMap
[tempQueryChar
[totalLength
]]] <= 0) {
3990 // Match with ambiguity
3991 if (tempDbChar
[totalLength
] == tempQueryChar
[totalLength
] && charMap
[tempDbChar
[totalLength
]] < ALPHABET_SIZE
) {
3992 fprintf(stderr
, "HSPPrintAlignment(): Alignment error!\n");
3998 auxiliaryTextIndex
++;
4001 tempDbChar
[totalLength
] = dnaChar
[(auxiliaryText
[auxiliaryTextIndex
/ AUX_TEXT_PER_WORD
] << ((auxiliaryTextIndex
% AUX_TEXT_PER_WORD
) * AUX_TEXT_BIT
) >> (BITS_IN_WORD
- AUX_TEXT_BIT
))];
4002 if (contextInfo
[contextNum
].reversed
) {
4003 tempDbChar
[totalLength
] = complementMap
[tempDbChar
[totalLength
]];
4005 tempQueryChar
[totalLength
] = '-';
4007 if (lastA
!= ALIGN_INSERT
) {
4010 auxiliaryTextIndex
++;
4013 tempDbChar
[totalLength
] = '-';
4014 if (contextInfo
[contextNum
].reversed
) {
4015 tempQueryChar
[totalLength
] = dnaChar
[convertedQueryPattern
[queryPatternLength
- 1 - queryIndex
- gappedHitList
[i
].posQuery
]];
4017 tempQueryChar
[totalLength
] = dnaChar
[convertedQueryPattern
[queryIndex
+ gappedHitList
[i
].posQuery
]];
4020 if (lastA
!= ALIGN_DELETE
) {
4032 switch (outputFormat
) {
4034 case OUTPUT_PAIRWISE
:
4036 fprintf(outputFile
, "Score = ");
4037 HSPCalculateAndPrintBitScore(outputFile
, gappedHitList
[i
].score
);
4038 fprintf(outputFile
, " bits (%d), Expect = ", gappedHitList
[i
].score
);
4039 HSPCalculateAndPrintEValue(outputFile
, gappedHitList
[i
].score
);
4040 fprintf(outputFile
, "\n");
4041 fprintf(outputFile
, "Identities = %d/%d (%.2f%%), Gaps = %d/%d (%.2f%%)\n", numOfMatch
, totalLength
, (float)numOfMatch
/ totalLength
* 100, numOfSpace
, totalLength
, (float)numOfSpace
/ totalLength
* 100);
4042 if (contextInfo
[contextNum
].reversed
) {
4043 fprintf(outputFile
, "Strand = Plus / Minus\n");
4045 fprintf(outputFile
, "Strand = Plus / Plus\n");
4047 fprintf(outputFile
, "\n\n");
4049 // Determine position length and format
4050 c
= gappedHitList
[i
].posText
+ gappedHitList
[i
].lengthText
- hsp
->seqOffset
[dbSeqIndex
].startPos
;
4051 textPosNumOfChar
= 0;
4056 if (textPosNumOfChar
>= 10) {
4057 positionFormat
[2] = '0' + (char)(textPosNumOfChar
/ 10);
4058 positionFormat
[3] = '0' + (char)(textPosNumOfChar
% 10);
4059 positionFormat
[4] = 'u';
4060 positionFormat
[5] = '\0';
4062 positionFormat
[2] = '0' + (char)textPosNumOfChar
;
4063 positionFormat
[3] = 'u';
4064 positionFormat
[4] = '\0';
4067 alignmentCharPrinted
= 0;
4068 /*if (contextInfo[contextNum].reversed) {
4069 queryIndex = queryPatternLength + 1 - gappedHitList[i].posQuery - gappedHitList[i].lengthQuery;
4070 dbIndex = gappedHitList[i].posText + gappedHitList[i].lengthText - hsp->seqOffset[dbSeqIndex].startPos;
4072 queryIndex = gappedHitList[i].posQuery + 1;
4073 dbIndex = gappedHitList[i].posText - hsp->seqOffset[dbSeqIndex].startPos + 1;
4076 //Handle the removal of >10-consecutive N in reference sequence.
4077 if (contextInfo
[contextNum
].reversed
) {
4078 queryIndex
= queryPatternLength
+ 1 - gappedHitList
[i
].posQuery
- gappedHitList
[i
].lengthQuery
;
4079 unsigned long long tp
= gappedHitList
[i
].posText
+ gappedHitList
[i
].lengthText
- 1;
4081 unsigned long long approxIndex
= tp
>>GRID_SAMPLING_FACTOR_2_POWER
;
4082 unsigned long long approxValue
= hsp
->ambiguityMap
[approxIndex
];
4083 while (hsp
->translate
[approxValue
].startPos
>tp
) {
4086 tp
-=hsp
->translate
[approxValue
].correction
;
4087 //unsigned int utp = TranslateUnambiguousTextPosition(hsp,gappedHitList[i].posText + gappedHitList[i].lengthText - 1,dbSeqIndex);
4088 //unsigned int rtp = ComputeRelativeChromosomePosition(hsp,utp,dbSeqIndex);
4091 queryIndex
= gappedHitList
[i
].posQuery
+ 1;
4092 unsigned long long tp
= gappedHitList
[i
].posText
;
4094 unsigned long long approxIndex
= tp
>>GRID_SAMPLING_FACTOR_2_POWER
;
4095 unsigned long long approxValue
= hsp
->ambiguityMap
[approxIndex
];
4096 while (hsp
->translate
[approxValue
].startPos
>tp
) {
4099 tp
-=hsp
->translate
[approxValue
].correction
;
4100 //unsigned int utp = TranslateUnambiguousTextPosition(hsp,gappedHitList[i].posText,dbSeqIndex);
4101 //unsigned int rtp = ComputeRelativeChromosomePosition(hsp,utp,dbSeqIndex);
4105 while (alignmentCharPrinted
< totalLength
) {
4107 if (contextInfo
[contextNum
].reversed
) {
4110 fprintf(outputFile
, "Query: ");
4111 fprintf(outputFile
, positionFormat
, queryIndex
);
4112 fprintf(outputFile
, " ");
4115 while (j
< 60 && alignmentCharPrinted
+ j
< totalLength
) {
4116 fprintf(outputFile
, "%c", lowercase
[tempQueryChar
[totalLength
- 1 - alignmentCharPrinted
- j
]]);
4117 if (tempQueryChar
[totalLength
- 1 - alignmentCharPrinted
- j
] != '-') {
4123 fprintf(outputFile
, " ");
4124 fprintf(outputFile
, "%llu", queryIndex
- 1);
4125 fprintf(outputFile
, "\n");
4127 // Print alignment line
4128 for (j
=0; j
<textPosNumOfChar
+ 8; j
++) {
4129 fprintf(outputFile
, " ");
4133 while (j
< 60 && alignmentCharPrinted
+ j
< totalLength
) {
4134 if (tempQueryChar
[totalLength
- 1 - alignmentCharPrinted
- j
] == tempDbChar
[totalLength
- 1 - alignmentCharPrinted
- j
]) {
4135 fprintf(outputFile
, "|");
4137 fprintf(outputFile
, " ");
4141 fprintf(outputFile
, "\n");
4143 // Print database line
4144 fprintf(outputFile
, "Sbjct: ");
4145 fprintf(outputFile
, positionFormat
, dbIndex
);
4146 fprintf(outputFile
, " ");
4149 while (j
< 60 && alignmentCharPrinted
+ j
< totalLength
) {
4150 fprintf(outputFile
, "%c", lowercase
[tempDbChar
[totalLength
- 1 - alignmentCharPrinted
- j
]]);
4151 if (tempDbChar
[totalLength
- 1 - alignmentCharPrinted
- j
] != '-') {
4157 fprintf(outputFile
, " ");
4158 fprintf(outputFile
, positionFormat
, dbIndex
+ 1);
4159 fprintf(outputFile
, "\n");
4164 fprintf(outputFile
, "Query: ");
4165 fprintf(outputFile
, positionFormat
, queryIndex
);
4166 fprintf(outputFile
, " ");
4169 while (j
< 60 && alignmentCharPrinted
+ j
< totalLength
) {
4170 fprintf(outputFile
, "%c", lowercase
[tempQueryChar
[alignmentCharPrinted
+ j
]]);
4171 if (tempQueryChar
[alignmentCharPrinted
+ j
] != '-') {
4177 fprintf(outputFile
, " ");
4178 fprintf(outputFile
, "%llu", queryIndex
- 1);
4179 fprintf(outputFile
, "\n");
4181 // Print alignment line
4182 for (j
=0; j
<textPosNumOfChar
+ 8; j
++) {
4183 fprintf(outputFile
, " ");
4187 while (j
< 60 && alignmentCharPrinted
+ j
< totalLength
) {
4188 if (matchMatrix
[charMap
[tempQueryChar
[alignmentCharPrinted
+ j
]]][charMap
[tempDbChar
[alignmentCharPrinted
+ j
]]] > 0) {
4189 fprintf(outputFile
, "|");
4191 fprintf(outputFile
, " ");
4195 fprintf(outputFile
, "\n");
4197 // Print database line
4198 fprintf(outputFile
, "Sbjct: ");
4199 fprintf(outputFile
, positionFormat
, dbIndex
);
4200 fprintf(outputFile
, " ");
4203 while (j
< 60 && alignmentCharPrinted
+ j
< totalLength
) {
4204 fprintf(outputFile
, "%c", lowercase
[tempDbChar
[alignmentCharPrinted
+ j
]]);
4205 if (tempDbChar
[alignmentCharPrinted
+ j
] != '-') {
4211 fprintf(outputFile
, " ");
4212 fprintf(outputFile
, positionFormat
, dbIndex
- 1);
4213 fprintf(outputFile
, "\n");
4217 fprintf(outputFile
, "\n\n");
4218 alignmentCharPrinted
+= j
;
4224 case OUTPUT_TABULAR
:
4225 case OUTPUT_TABULAR_COMMENT
:
4227 fprintf(outputFile
, "%s\t", queryName
);
4230 if (hsp
->annotation
[dbSeqIndex
].gi
> 0) {
4231 fprintf(outputFile
, "gi|%d|", hsp
->annotation
[dbSeqIndex
].gi
);
4234 // print db sequence name
4235 fprintf(outputFile
, "%s\t", seqName
);
4237 // print percentage identity
4238 fprintf(outputFile
, "%.2f\t", (float)(totalLength
- numOfMismatch
- numOfSpace
) / (float)(totalLength
) * 100);
4240 // print align length
4241 fprintf(outputFile
, "%d\t", totalLength
);
4243 // print number of mismatch
4244 fprintf(outputFile
, "%d\t", numOfMismatch
);
4246 // print number of gap opening
4247 fprintf(outputFile
, "%d\t", numOfGap
);
4249 // print query start and end
4250 if (contextInfo
[contextNum
].reversed
) {
4251 fprintf(outputFile
, "%u\t%u\t", queryPatternLength
- gappedHitList
[i
].posQuery
- gappedHitList
[i
].lengthQuery
+ 1,
4252 queryPatternLength
- gappedHitList
[i
].posQuery
);
4254 fprintf(outputFile
, "%u\t%u\t", gappedHitList
[i
].posQuery
+ 1,
4255 gappedHitList
[i
].posQuery
+ gappedHitList
[i
].lengthQuery
);
4258 // print text start and end
4259 if (contextInfo
[contextNum
].reversed
) {
4260 unsigned long long tp
= gappedHitList
[i
].posText
+ gappedHitList
[i
].lengthText
- 1;
4261 unsigned long long approxIndex
= tp
>>GRID_SAMPLING_FACTOR_2_POWER
;
4262 unsigned long long approxValue
= hsp
->ambiguityMap
[approxIndex
];
4263 while (hsp
->translate
[approxValue
].startPos
>tp
) {
4266 tp
-=hsp
->translate
[approxValue
].correction
;
4267 fprintf(outputFile
, "%llu\t%llu\t", tp
,
4268 tp
- gappedHitList
[i
].lengthText
+ 1);
4269 //fprintf(outputFile, "%u\t%u\t", gappedHitList[i].posText + gappedHitList[i].lengthText - hsp->seqOffset[dbSeqIndex].startPos,
4270 // gappedHitList[i].posText + 1 - hsp->seqOffset[dbSeqIndex].startPos);
4272 unsigned long long tp
= gappedHitList
[i
].posText
;
4274 unsigned long long approxIndex
= tp
>>GRID_SAMPLING_FACTOR_2_POWER
;
4275 unsigned long long approxValue
= hsp
->ambiguityMap
[approxIndex
];
4276 while (hsp
->translate
[approxValue
].startPos
>tp
) {
4279 tp
-=hsp
->translate
[approxValue
].correction
;
4280 fprintf(outputFile
, "%llu\t%llu\t", tp
,
4281 tp
+ gappedHitList
[i
].lengthText
-1);
4282 //fprintf(outputFile, "%u\t%u\t", gappedHitList[i].posText + 1 - hsp->seqOffset[dbSeqIndex].startPos,
4283 // gappedHitList[i].posText + gappedHitList[i].lengthText - hsp->seqOffset[dbSeqIndex].startPos);
4288 HSPCalculateAndPrintEValue(outputFile
, gappedHitList
[i
].score
);
4291 HSPCalculateAndPrintBitScore(outputFile
, gappedHitList
[i
].score
);
4293 fprintf(outputFile
, "\n");
4299 fprintf(stderr
, "HSPPrintAlignment() : Output format is invalid!\n");
4307 MMPoolReturn(mmPool
, tempDbChar
, MAX_ALIGNMENT_LENGTH
+ 1);
4308 MMPoolReturn(mmPool
, tempQueryChar
, MAX_ALIGNMENT_LENGTH
+ 1);
4312 void HSPPrintInvalidHit(FILE *outputFile
, int outputFormat
) {
4313 switch (outputFormat
) {
4315 case OUTPUT_PAIRWISE
:
4317 fprintf(outputFile
, "#Effective query length reaches zero. \n#Unable to compute Karlin-Altschul parameters.\n\n");
4320 case OUTPUT_TABULAR
:
4321 case OUTPUT_TABULAR_COMMENT
:
4323 fprintf(outputFile
, "\n");
4328 void HSPPrintNoAlignment(FILE *outputFile
, int outputFormat
) {
4329 switch (outputFormat
) {
4331 case OUTPUT_PAIRWISE
:
4333 fprintf(outputFile
, "#No hit\n\n");
4336 case OUTPUT_TABULAR
:
4337 case OUTPUT_TABULAR_COMMENT
:
4339 fprintf(outputFile
, "\n");
4345 double HSPCalculateAndPrintBitScore(FILE *outputFile
, const int score
) {
4350 bitScore
= stat_gapNominal2normalized(score
);
4352 if (outputFile
!= NULL
) {
4353 if (bitScore
> 9999) {
4354 fprintf(outputFile
, "%4.3le", bitScore
);
4355 } else if (bitScore
> 99.9) {
4356 fprintf(outputFile
, "%4.0d", (int)bitScore
);
4358 fprintf(outputFile
, "%4.1lf", bitScore
);
4366 double HSPCalculateAndPrintEValue(FILE *outputFile
, const int score
) {
4369 char evalueString
[10];
4371 evalue
= stat_gapCalcEvalue(stat_gapNominal2normalized(score
));
4373 if (evalue
< 1.0e-180) {
4374 sprintf(evalueString
, "0.0\t");
4375 } else if (evalue
< 1.0e-99) {
4376 sprintf(evalueString
, "%2.0le\t", evalue
);
4377 } else if (evalue
< 0.0009) {
4378 sprintf(evalueString
, "%3.0le\t", evalue
);
4379 } else if (evalue
< 0.1) {
4380 sprintf(evalueString
, "%4.3lf\t", evalue
);
4381 } else if (evalue
< 1.0) {
4382 sprintf(evalueString
, "%3.2lf\t", evalue
);
4383 } else if (evalue
< 10.0) {
4384 sprintf(evalueString
, "%2.1lf\t", evalue
);
4386 sprintf(evalueString
, "%5.0lf\t", evalue
);
4389 if (outputFile
!= NULL
) {
4390 fprintf(outputFile
, "%s\t", evalueString
);
4393 sscanf(evalueString
, "%lf", &evalue
);
4398 int HitListPosTextQueryOrder(const void *hitList
, const long long index1
, const long long index2
) {
4400 if (((HitListWithPosQuery
*)hitList
+ index1
)->posText
!= ((HitListWithPosQuery
*)hitList
+ index2
)->posText
) {
4401 if (((HitListWithPosQuery
*)hitList
+ index1
)->posText
> ((HitListWithPosQuery
*)hitList
+ index2
)->posText
) {
4407 return ((HitListWithPosQuery
*)hitList
+ index1
)->posQuery
- ((HitListWithPosQuery
*)hitList
+ index2
)->posQuery
;
4412 int GappedHitListPosTextQueryLengthScoreOrder(const void *gappedHitList
, const long long index1
, const long long index2
) {
4414 if (((GappedHitList
*)gappedHitList
+ index1
)->posText
!= ((GappedHitList
*)gappedHitList
+ index2
)->posText
) {
4415 if (((GappedHitList
*)gappedHitList
+ index1
)->posText
> ((GappedHitList
*)gappedHitList
+ index2
)->posText
) {
4421 if (((GappedHitList
*)gappedHitList
+ index1
)->posQuery
!= ((GappedHitList
*)gappedHitList
+ index2
)->posQuery
) {
4422 if (((GappedHitList
*)gappedHitList
+ index1
)->posQuery
> ((GappedHitList
*)gappedHitList
+ index2
)->posQuery
) {
4428 if (((GappedHitList
*)gappedHitList
+ index1
)->lengthText
!= ((GappedHitList
*)gappedHitList
+ index2
)->lengthText
) {
4429 return ((GappedHitList
*)gappedHitList
+ index1
)->lengthText
- ((GappedHitList
*)gappedHitList
+ index2
)->lengthText
;
4431 if (((GappedHitList
*)gappedHitList
+ index1
)->lengthQuery
!= ((GappedHitList
*)gappedHitList
+ index2
)->lengthQuery
) {
4432 return ((GappedHitList
*)gappedHitList
+ index1
)->lengthQuery
- ((GappedHitList
*)gappedHitList
+ index2
)->lengthQuery
;
4434 // higher score first
4435 return ((GappedHitList
*)gappedHitList
+ index2
)->score
- ((GappedHitList
*)gappedHitList
+ index1
)->score
;
4443 int GappedHitListPosTextQueryScoreLengthOrder(const void *gappedHitList
, const long long index1
, const long long index2
) {
4445 if (((GappedHitList
*)gappedHitList
+ index1
)->posText
!= ((GappedHitList
*)gappedHitList
+ index2
)->posText
) {
4446 if (((GappedHitList
*)gappedHitList
+ index1
)->posText
> ((GappedHitList
*)gappedHitList
+ index2
)->posText
) {
4452 if (((GappedHitList
*)gappedHitList
+ index1
)->posQuery
!= ((GappedHitList
*)gappedHitList
+ index2
)->posQuery
) {
4453 if (((GappedHitList
*)gappedHitList
+ index1
)->posQuery
> ((GappedHitList
*)gappedHitList
+ index2
)->posQuery
) {
4459 if (((GappedHitList
*)gappedHitList
+ index1
)->score
!= ((GappedHitList
*)gappedHitList
+ index2
)->score
) {
4460 // higher score first
4461 return ((GappedHitList
*)gappedHitList
+ index2
)->score
- ((GappedHitList
*)gappedHitList
+ index1
)->score
;
4463 // short query length first
4464 if (((GappedHitList
*)gappedHitList
+ index1
)->lengthQuery
!= ((GappedHitList
*)gappedHitList
+ index2
)->lengthQuery
) {
4465 if (((GappedHitList
*)gappedHitList
+ index1
)->lengthQuery
> ((GappedHitListWithAlignment
*)gappedHitList
+ index2
)->lengthQuery
) {
4471 // short text length first
4472 if (((GappedHitList
*)gappedHitList
+ index1
)->lengthText
!= ((GappedHitListWithAlignment
*)gappedHitList
+ index2
)->lengthText
) {
4473 if (((GappedHitList
*)gappedHitList
+ index1
)->lengthText
> ((GappedHitListWithAlignment
*)gappedHitList
+ index2
)->lengthText
) {
4488 int GappedHitListEndTextQueryScoreLengthOrder(const void *gappedHitList
, const long long index1
, const long long index2
) {
4490 if (((GappedHitList
*)gappedHitList
+ index1
)->posText
+ ((GappedHitList
*)gappedHitList
+ index1
)->lengthText
!=
4491 ((GappedHitList
*)gappedHitList
+ index2
)->posText
+ ((GappedHitList
*)gappedHitList
+ index2
)->lengthText
) {
4492 if (((GappedHitList
*)gappedHitList
+ index1
)->posText
+ ((GappedHitList
*)gappedHitList
+ index1
)->lengthText
>
4493 ((GappedHitList
*)gappedHitList
+ index2
)->posText
+ ((GappedHitList
*)gappedHitList
+ index2
)->lengthText
) {
4499 if (((GappedHitList
*)gappedHitList
+ index1
)->posQuery
+ ((GappedHitList
*)gappedHitList
+ index1
)->lengthQuery
!=
4500 ((GappedHitList
*)gappedHitList
+ index2
)->posQuery
+ ((GappedHitList
*)gappedHitList
+ index2
)->lengthQuery
) {
4501 if (((GappedHitList
*)gappedHitList
+ index1
)->posQuery
+ ((GappedHitList
*)gappedHitList
+ index1
)->lengthQuery
>
4502 ((GappedHitList
*)gappedHitList
+ index2
)->posQuery
+ ((GappedHitList
*)gappedHitList
+ index2
)->lengthQuery
) {
4508 if (((GappedHitList
*)gappedHitList
+ index1
)->score
!= ((GappedHitList
*)gappedHitList
+ index2
)->score
) {
4509 // higher score first
4510 return ((GappedHitList
*)gappedHitList
+ index2
)->score
- ((GappedHitList
*)gappedHitList
+ index1
)->score
;
4512 // short query length first
4513 if (((GappedHitList
*)gappedHitList
+ index1
)->lengthQuery
!= ((GappedHitListWithAlignment
*)gappedHitList
+ index2
)->lengthQuery
) {
4514 if (((GappedHitList
*)gappedHitList
+ index1
)->lengthQuery
> ((GappedHitListWithAlignment
*)gappedHitList
+ index2
)->lengthQuery
) {
4520 // short text length first
4521 if (((GappedHitList
*)gappedHitList
+ index1
)->lengthText
!= ((GappedHitListWithAlignment
*)gappedHitList
+ index2
)->lengthText
) {
4522 if (((GappedHitList
*)gappedHitList
+ index1
)->lengthText
> ((GappedHitListWithAlignment
*)gappedHitList
+ index2
)->lengthText
) {
4537 int GappedHitListEnclosedScoreOrder(const void *gappedHitList
, const long long index1
, const long long index2
) {
4539 if (((GappedHitList
*)gappedHitList
+ index1
)->posText
!= ((GappedHitList
*)gappedHitList
+ index2
)->posText
) {
4540 if (((GappedHitList
*)gappedHitList
+ index1
)->posText
> ((GappedHitList
*)gappedHitList
+ index2
)->posText
) {
4546 if (((GappedHitList
*)gappedHitList
+ index1
)->lengthText
!= ((GappedHitList
*)gappedHitList
+ index2
)->lengthText
) {
4547 if (((GappedHitList
*)gappedHitList
+ index1
)->lengthText
< ((GappedHitList
*)gappedHitList
+ index2
)->lengthText
) {
4553 if (((GappedHitList
*)gappedHitList
+ index1
)->posQuery
!= ((GappedHitList
*)gappedHitList
+ index2
)->posQuery
) {
4554 if (((GappedHitList
*)gappedHitList
+ index1
)->posQuery
> ((GappedHitList
*)gappedHitList
+ index2
)->posQuery
) {
4560 if (((GappedHitList
*)gappedHitList
+ index1
)->lengthQuery
!= ((GappedHitList
*)gappedHitList
+ index2
)->lengthQuery
) {
4561 if (((GappedHitList
*)gappedHitList
+ index1
)->lengthQuery
< ((GappedHitList
*)gappedHitList
+ index2
)->lengthQuery
) {
4567 // higher score first
4568 return ((GappedHitList
*)gappedHitList
+ index2
)->score
- ((GappedHitList
*)gappedHitList
+ index1
)->score
;