3 HSP.h BWTBlastn functions
5 This module contains miscellaneous BWTBlastn functions.
7 Copyright (C) 2004, Wong Chi Kwong.
9 This program is free software; you can redistribute it and/or
10 modify it under the terms of the GNU General Public License
11 as published by the Free Software Foundation; either version 2
12 of the License, or (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
28 #include "TypeNLimit.h"
29 #include "MemManager.h"
30 #include "TextConverter.h"
32 #define ALPHABET_SIZE 4
33 #define BIT_PER_CHAR 2
34 #define CHAR_PER_128 64
35 #define CHAR_PER_WORD 16
36 #define CHAR_PER_BYTE 4
38 #define OUTPUT_PAIRWISE 0
39 #define OUTPUT_WITH_IDENTITIES 1
40 #define OUTPUT_NO_IDENTITIES 2
41 #define OUTPUT_FLAT_WITH_IDENTITIES 3
42 #define OUTPUT_FLAT_NO_IDENTITIES 4
43 #define OUTPUT_BLUNT_ENDS 5
44 #define OUTPUT_FLAT_BLUNT_ENDS 6
46 #define OUTPUT_TABULAR 8
47 #define OUTPUT_TABULAR_COMMENT 9
48 #define OUTPUT_ASN_TEXT 10
49 #define OUTPUT_ASN_BINARY 11
51 #define QUERY_POS_STRAND 1
52 #define QUERY_NEG_STRAND 2
53 #define QUERY_BOTH_STRAND 3
56 #define CONTEXT_MASK 0x0FFFFFFF
58 #define MAX_ALIGNMENT_LENGTH 131072
59 #define SHORTEST_CHR 70
61 typedef struct _ChrBlock
{
63 unsigned int blockStart
;
64 unsigned int blockEnd
;
68 typedef struct _NewAnnotation
{
69 char chrName
[MAX_SEQ_NAME_LENGTH
];
71 unsigned int chrStart
;
77 typedef struct __ref_anno_t
{
88 typedef struct HSPUngappedExtLookupTable
{
90 unsigned char maxScorePos
;
92 unsigned char matchMismatchBitVector
; // In entries where even bit = match/mismatch, odd bit = 0 -> matchMismatchBitVector is even bit compacted
93 // In entries where odd bit = match/mismatch, even bit = 0 -> matchMismatchBitVector is odd bit compacted and then reversed
94 // When all bits are 0, both above case can be handled without problem
95 } HSPUngappedExtLookupTable
;
97 typedef struct HitList
{ // Information of hits generated
98 unsigned int posText
; // position in text
99 unsigned int info
; // extra hit information
102 typedef struct HitListWithPosQuery
{ // Information of hits generated
103 unsigned int posText
; // position in text
104 unsigned int posQuery
; // position in query
105 } HitListWithPosQuery
;
107 typedef struct HitListWithPosQueryLengthError
{// Information of hits generated
108 unsigned int posText
; // position in text
109 unsigned posQuery
: 16; // position in query
110 unsigned length
: 8; // length of hit
111 unsigned error
: 8; // error in hit
112 } HitListWithPosQueryLengthError
;
114 typedef struct GappedHitList
{
115 unsigned int posText
; // position in text
116 unsigned int posQuery
; // position in query
117 unsigned int lengthText
; // length of aligned text
118 unsigned int lengthQuery
; // length of aligned query
119 int score
; // score of hit
120 unsigned int dbSeqIndex
; // DB sequence
121 unsigned int ungappedPosText
; // position in text before gapped extension
122 unsigned int ungappedPosQuery
; // position in query before gapped extension
125 typedef struct GappedHitListWithAlignment
{
126 unsigned int posText
; // position in text
127 unsigned int posQuery
; // position in query
128 unsigned int lengthText
; // length of aligned text
129 unsigned int lengthQuery
; // length of aligned query
130 int score
; // score of alignment
131 unsigned int dbSeqIndex
; // DB sequence
132 unsigned int alignmentOffset
; // alignment; 2 bit per alignment for match, mismatch or ambigurous, insert, delete; offset to a memory pool
133 unsigned int auxiliaryTextOffset
; // text characters for mismatch, insert and ambigurous characters; offset to a memory pool
134 } GappedHitListWithAlignment
;
136 typedef struct Context
{
141 GappedHitListWithAlignment
*gappedHitList
;
144 typedef struct ContextInfo
{
150 typedef struct Annotation
{
152 char text
[MAX_SEQ_NAME_LENGTH
+1];
155 typedef struct SeqOffset
{
156 unsigned int startPos
;
158 int firstAmbiguityIndex
; // The index for the first ambiguity that starts on or after the sequence
159 int lastAmbiguityIndex
; // The index for the last ambiguity that ends on or before the sequence
162 typedef struct Ambiguity
{
163 unsigned int startPos
;
164 unsigned int rightOfEndPos
;
168 typedef struct Traceback
{
169 unsigned indexDiff
: 20; // the difference in traceback index for the logical cell directly above
170 unsigned textChar
: 4;
171 unsigned queryChar
: 4; // only filled when match/mismatch
174 unsigned alignment
: 2;
178 unsigned int* packedDNA
;
179 unsigned long long *ubit64_packedDNA
;
181 Annotation
* annotation
;
182 SeqOffset
* seqOffset
;
184 Ambiguity
* ambiguity
;
185 unsigned int dnaLength
;
186 unsigned int minSeqLength
;
189 typedef struct DPCell
{
191 int I
; // Score with insertion
192 unsigned int P
; // Logical index of the cell
195 typedef struct DPMaxScore
{
198 unsigned int posText
;
199 unsigned int posQuery
;
202 typedef struct Histogram
{
210 #define MAX_SEQ_NAME_LENGTH 256
212 #define MAX_HISTO_SIZE 256
214 #define INVALID_CHAR_INDEX 15
216 #define DP_MATCH_MISMATCH 0 // 0000
217 #define DP_INSERT 2 // 0010
218 #define DP_DELETE 3 // 0011
219 #define DP_INSERT_OPEN 4 // 0100
220 #define DP_DELETE_OPEN 8 // 1000
222 #define DP_MASK 3 // 0011
224 #define DP_INSERT_EXTEND 0 // 0000
225 #define DP_DELETE_EXTEND 0 // 0000
227 #define DP_NEG_INFINITY -1073741824 // use -(2^30) to leave room for decreasing the value without overflow
229 #define ALIGN_MATCH 0
230 #define ALIGN_MISMATCH_AMBIGUITY 1
231 #define ALIGN_INSERT 2
232 #define ALIGN_DELETE 3
234 #define ALIGN_PER_WORD 16
237 #define AUX_TEXT_PER_WORD 8
238 #define AUX_TEXT_BIT 4
240 static const char lowercaseDnaCharIndex
= 14; // Seems that BLAST treat masked characters as 'N' (still have 1/4 chance of matching)
241 static const char nonMatchDnaCharIndex
= 15;
242 static const char dnaChar
[16] = {'A', 'C', 'G', 'T', 'M', 'R', 'S', 'V', 'W', 'Y', 'H', 'K', 'D', 'B', 'N', 'L'};
243 static const char dnaComplement
[16] = {'T', 'G', 'C', 'A', 'K', 'Y', 'S', 'B', 'W', 'R', 'D', 'M', 'H', 'V', 'N', 'L'};
244 static const char ambiguityCount
[16] = { 1 , 1 , 1 , 1 , 2 , 2 , 2 , 3 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 0 };
245 static const char ambiguityMatch
[16][4] = {0, 0, 0, 0,
262 // Map must be allocated with char[256]
263 void HSPFillCharMap(unsigned char charMap
[255]);
264 void HSPFillComplementMap(unsigned char complementMap
[255]);
266 // scoringMatrix must be allocated with scoringMatrix[16][16]
267 void HSPFillScoringMatrix(int scoringMatrix
[16][16], const int matchScore
, const int mismatchScore
, const int leftShift
);
269 HSP
*HSPLoad(MMPool
*mmPool
, const char *PackedDNAFileName
, const char *AnnotationFileName
, const char *AmbiguityFileName
);
270 HSP
*HSPConvertFromText(MMPool
*mmPool
, const unsigned char *text
, const unsigned int textLength
,
271 const unsigned int FASTARandomSeed
, const int maskLowerCase
,
272 const int gi
, const char *seqName
);
273 void HSPFree(MMPool
*mmPool
, HSP
*hsp
);
275 unsigned int HSPParseFASTAToPacked(const char *, const char *, const char *, const char *, const char *, const unsigned int , const int);
276 unsigned int HSPPackedToFASTA(const char* FASTAFileName
, const char* annotationFileName
, const char* packedDNAFileName
, const char* ambiguityFileName
);