modified: makefile
[GalaxyCodeBases.git] / BGI / soap_src / soap_builder / HSP.h
blob163bbd53391ec1a50604d5a33bc5c1c55ecb022d
1 /*
3 HSP.h BWTBlastn functions
5 This module contains miscellaneous BWTBlastn functions.
7 Copyright (C) 2004, Wong Chi Kwong.
9 This program is free software; you can redistribute it and/or
10 modify it under the terms of the GNU General Public License
11 as published by the Free Software Foundation; either version 2
12 of the License, or (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 #ifndef __HSP_H__
26 #define __HSP_H__
28 #include "TypeNLimit.h"
29 #include "MemManager.h"
30 #include "TextConverter.h"
32 #define ALPHABET_SIZE 4
33 #define BIT_PER_CHAR 2
34 #define CHAR_PER_128 64
35 #define CHAR_PER_WORD 16
36 #define CHAR_PER_BYTE 4
38 #define OUTPUT_PAIRWISE 0
39 #define OUTPUT_WITH_IDENTITIES 1
40 #define OUTPUT_NO_IDENTITIES 2
41 #define OUTPUT_FLAT_WITH_IDENTITIES 3
42 #define OUTPUT_FLAT_NO_IDENTITIES 4
43 #define OUTPUT_BLUNT_ENDS 5
44 #define OUTPUT_FLAT_BLUNT_ENDS 6
45 #define OUTPUT_XML 7
46 #define OUTPUT_TABULAR 8
47 #define OUTPUT_TABULAR_COMMENT 9
48 #define OUTPUT_ASN_TEXT 10
49 #define OUTPUT_ASN_BINARY 11
51 #define QUERY_POS_STRAND 1
52 #define QUERY_NEG_STRAND 2
53 #define QUERY_BOTH_STRAND 3
55 #define CONTEXT_BIT 4
56 #define CONTEXT_MASK 0x0FFFFFFF
58 #define MAX_ALIGNMENT_LENGTH 131072
59 #define SHORTEST_CHR 70
61 typedef struct _ChrBlock{
62 int ref_id;
63 unsigned int blockStart;
64 unsigned int blockEnd;
65 unsigned int ori;
66 }ChrBlock;
68 typedef struct _NewAnnotation{
69 char chrName[MAX_SEQ_NAME_LENGTH];
70 int nameLen;
71 unsigned int chrStart;
72 unsigned int chrEnd;
73 int blockNum;
74 ChrBlock *blockInChr;
75 }NewAnnotation;
77 typedef struct __ref_anno_t{
78 unsigned total_char;
79 int chr_num;
80 char **chr_name;
81 int total_block;
82 ChrBlock *block_list;
83 }ref_anno_t;
86 //-end
88 typedef struct HSPUngappedExtLookupTable {
89 char maxScore;
90 unsigned char maxScorePos;
91 char finalScore;
92 unsigned char matchMismatchBitVector; // In entries where even bit = match/mismatch, odd bit = 0 -> matchMismatchBitVector is even bit compacted
93 // In entries where odd bit = match/mismatch, even bit = 0 -> matchMismatchBitVector is odd bit compacted and then reversed
94 // When all bits are 0, both above case can be handled without problem
95 } HSPUngappedExtLookupTable;
97 typedef struct HitList { // Information of hits generated
98 unsigned int posText; // position in text
99 unsigned int info; // extra hit information
100 } HitList;
102 typedef struct HitListWithPosQuery { // Information of hits generated
103 unsigned int posText; // position in text
104 unsigned int posQuery; // position in query
105 } HitListWithPosQuery;
107 typedef struct HitListWithPosQueryLengthError {// Information of hits generated
108 unsigned int posText; // position in text
109 unsigned posQuery : 16; // position in query
110 unsigned length : 8; // length of hit
111 unsigned error : 8; // error in hit
112 } HitListWithPosQueryLengthError;
114 typedef struct GappedHitList {
115 unsigned int posText; // position in text
116 unsigned int posQuery; // position in query
117 unsigned int lengthText; // length of aligned text
118 unsigned int lengthQuery; // length of aligned query
119 int score; // score of hit
120 unsigned int dbSeqIndex; // DB sequence
121 unsigned int ungappedPosText; // position in text before gapped extension
122 unsigned int ungappedPosQuery; // position in query before gapped extension
123 } GappedHitList;
125 typedef struct GappedHitListWithAlignment {
126 unsigned int posText; // position in text
127 unsigned int posQuery; // position in query
128 unsigned int lengthText; // length of aligned text
129 unsigned int lengthQuery; // length of aligned query
130 int score; // score of alignment
131 unsigned int dbSeqIndex; // DB sequence
132 unsigned int alignmentOffset; // alignment; 2 bit per alignment for match, mismatch or ambigurous, insert, delete; offset to a memory pool
133 unsigned int auxiliaryTextOffset; // text characters for mismatch, insert and ambigurous characters; offset to a memory pool
134 } GappedHitListWithAlignment;
136 typedef struct Context {
137 int contextNum;
138 int numOfHit;
139 int numOfDPPoint;
140 int numOfFinalHit;
141 GappedHitListWithAlignment *gappedHitList;
142 } Context;
144 typedef struct ContextInfo {
145 int reversed;
146 int complemented;
147 char name[16];
148 } ContextInfo;
150 typedef struct Annotation {
151 int gi;
152 char text[MAX_SEQ_NAME_LENGTH+1];
153 } Annotation;
155 typedef struct SeqOffset {
156 unsigned int startPos;
157 unsigned int endPos;
158 int firstAmbiguityIndex; // The index for the first ambiguity that starts on or after the sequence
159 int lastAmbiguityIndex; // The index for the last ambiguity that ends on or before the sequence
160 } SeqOffset;
162 typedef struct Ambiguity {
163 unsigned int startPos;
164 unsigned int rightOfEndPos;
165 int symbol;
166 } Ambiguity;
168 typedef struct Traceback {
169 unsigned indexDiff : 20; // the difference in traceback index for the logical cell directly above
170 unsigned textChar : 4;
171 unsigned queryChar : 4; // only filled when match/mismatch
172 unsigned DOpen : 1;
173 unsigned IOpen : 1;
174 unsigned alignment : 2;
175 } Traceback;
177 typedef struct HSP {
178 unsigned int* packedDNA;
179 unsigned long long *ubit64_packedDNA;
180 int numOfSeq;
181 Annotation* annotation;
182 SeqOffset* seqOffset;
183 int numOfAmbiguity;
184 Ambiguity* ambiguity;
185 unsigned int dnaLength;
186 unsigned int minSeqLength;
187 } HSP;
189 typedef struct DPCell {
190 int B; // Best score
191 int I; // Score with insertion
192 unsigned int P; // Logical index of the cell
193 } DPCell;
195 typedef struct DPMaxScore {
196 int score;
197 int tracebackIndex;
198 unsigned int posText;
199 unsigned int posQuery;
200 } DPMaxScore;
202 typedef struct Histogram {
203 double minEvalue;
204 double maxEvalue;
205 int histogramSize;
206 int *count;
207 } Histogram;
210 #define MAX_SEQ_NAME_LENGTH 256
212 #define MAX_HISTO_SIZE 256
214 #define INVALID_CHAR_INDEX 15
216 #define DP_MATCH_MISMATCH 0 // 0000
217 #define DP_INSERT 2 // 0010
218 #define DP_DELETE 3 // 0011
219 #define DP_INSERT_OPEN 4 // 0100
220 #define DP_DELETE_OPEN 8 // 1000
222 #define DP_MASK 3 // 0011
224 #define DP_INSERT_EXTEND 0 // 0000
225 #define DP_DELETE_EXTEND 0 // 0000
227 #define DP_NEG_INFINITY -1073741824 // use -(2^30) to leave room for decreasing the value without overflow
229 #define ALIGN_MATCH 0
230 #define ALIGN_MISMATCH_AMBIGUITY 1
231 #define ALIGN_INSERT 2
232 #define ALIGN_DELETE 3
234 #define ALIGN_PER_WORD 16
235 #define ALIGN_BIT 2
237 #define AUX_TEXT_PER_WORD 8
238 #define AUX_TEXT_BIT 4
240 static const char lowercaseDnaCharIndex = 14; // Seems that BLAST treat masked characters as 'N' (still have 1/4 chance of matching)
241 static const char nonMatchDnaCharIndex = 15;
242 static const char dnaChar[16] = {'A', 'C', 'G', 'T', 'M', 'R', 'S', 'V', 'W', 'Y', 'H', 'K', 'D', 'B', 'N', 'L'};
243 static const char dnaComplement[16] = {'T', 'G', 'C', 'A', 'K', 'Y', 'S', 'B', 'W', 'R', 'D', 'M', 'H', 'V', 'N', 'L'};
244 static const char ambiguityCount[16] = { 1 , 1 , 1 , 1 , 2 , 2 , 2 , 3 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 0 };
245 static const char ambiguityMatch[16][4] = {0, 0, 0, 0,
246 1, 0, 0, 0,
247 2, 0, 0, 0,
248 3, 0, 0, 0,
249 0, 1, 0, 0,
250 0, 2, 0, 0,
251 1, 2, 0, 0,
252 0, 1, 2, 0,
253 0, 3, 0, 0,
254 1, 3, 0, 0,
255 0, 1, 3, 0,
256 2, 3, 0, 0,
257 0, 2, 3, 0,
258 1, 2, 3, 0,
259 0, 1, 2, 3,
260 0, 0, 0, 0};
262 // Map must be allocated with char[256]
263 void HSPFillCharMap(unsigned char charMap[255]);
264 void HSPFillComplementMap(unsigned char complementMap[255]);
266 // scoringMatrix must be allocated with scoringMatrix[16][16]
267 void HSPFillScoringMatrix(int scoringMatrix[16][16], const int matchScore, const int mismatchScore, const int leftShift);
269 HSP *HSPLoad(MMPool *mmPool, const char *PackedDNAFileName, const char *AnnotationFileName, const char *AmbiguityFileName);
270 HSP *HSPConvertFromText(MMPool *mmPool, const unsigned char *text, const unsigned int textLength,
271 const unsigned int FASTARandomSeed, const int maskLowerCase,
272 const int gi, const char *seqName);
273 void HSPFree(MMPool *mmPool, HSP *hsp);
275 unsigned int HSPParseFASTAToPacked(const char *, const char *, const char *, const char *, const char *, const unsigned int , const int);
276 unsigned int HSPPackedToFASTA(const char* FASTAFileName, const char* annotationFileName, const char* packedDNAFileName, const char* ambiguityFileName);
279 #endif