modified: src1/input.c
[GalaxyCodeBases.git] / BGI / BASE / src / 2bwt / BWT.h
blob2a38a0484023f9b6aee60c3dade6160d34e12767
1 /*
3 BWT.h BWT-Index
5 This module contains an implementation of BWT-index for alphabet size = 4.
6 The functions provided include:
7 Load functions for loading BWT to memory;
8 Core functions for accessing core Inverse Psi values;
9 Search functions for searching patterns from text;
10 Text retrieval functions for retrieving text from BWT.
12 Copyright (C) 2004, Wong Chi Kwong.
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License
16 as published by the Free Software Foundation; either version 2
17 of the License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, write to the Free Software
26 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
30 #ifndef __BWT_H__
31 #define __BWT_H__
33 #include "TypeNLimit.h"
34 #include "MemManager.h"
35 #include "TextConverter.h"
36 #include "HSP.h"
38 #define BITS_PER_OCC_VALUE 16
39 #define OCC_VALUE_PER_WORD 2
40 #define OCC_VALUE_PER_LONG 4
41 #define OCC_INTERVAL 256
42 #define WORD_BETWEEN_OCC 16
43 #define OCC_INTERVAL_MAJOR 65536
45 #define SORT_ALL 0
46 #define SORT_16_BIT 1
47 #define SORT_NONE 2
49 #define BUCKET_BIT 16
50 #define NUM_BUCKET 65536
52 #define MAX_APPROX_MATCH_ERROR 7
53 #define MAX_ARPROX_MATCH_LENGTH 64
55 #define BWTDP_MAX_SUBSTRING_LENGTH 512
57 #define ESTIMATED_OCC_DIFF 32 // 128 / 4
58 #define MAX_OCC_DIFF 128
62 typedef struct BWT {
63 unsigned long long textLength; // length of the text
64 unsigned long long saInterval; // interval between two SA values stored explicitly
65 unsigned long long inverseSaInterval; // interval between two inverse SA stored explicitly
66 unsigned long long inverseSa0; // SA-1[0]
67 unsigned long long *cumulativeFreq; // cumulative frequency
68 unsigned int *bwtCode; // BWT code
69 unsigned int *occValue; // Occurrence values stored explicitly
70 unsigned long long *occValueMajor; // Occurrence values stored explicitly
71 unsigned long long *saValue; // SA values stored explicitly
72 unsigned long long *inverseSa; // Inverse SA stored explicitly
73 unsigned long long *cachedSaIndex; // Cached SA index
74 unsigned long long cachedSaIndexNumOfChar; // Number of characters indexed in SA index range
75 unsigned long long *saValueOnBoundary; // Pre-calculated frequently referred data
76 unsigned long long *decodeTable; // For decoding BWT by table lookup
77 unsigned int decodeTableGenerated; // == TRUE if decode table is generated on load and will be freed
78 unsigned long long bwtSizeInWord; // Temporary variable to hold the memory allocated
79 unsigned long long occSizeInWord; // Temporary variable to hold the memory allocated
80 unsigned long long occMajorSizeInWord; // Temporary variable to hold the memory allocated
81 unsigned long long saValueSizeInWord; // Temporary variable to hold the memory allocated
82 unsigned long long inverseSaSizeInWord; // Temporary variable to hold the memory allocated
83 unsigned long long cachedSaIndexSizeInWord; // Temporary variable to hold the memory allocated
84 } BWT;
86 #define MAX_DIAGONAL_LEVEL 4 // Number of sub-pattern to keep for detecting diagonal hit
88 // Error information is stored as:
89 // 1. bitVector
90 // After hamming distance match
91 // 2. count
92 // After edit distance match
93 // 3. score
94 // After the hits are processed with scoring functions
96 typedef struct SaIndexGroupNew { // SA index range and information of a particular error arrangement of a matched sub-pattern
97 unsigned long long startSaIndex; // starting SA index
98 unsigned int numOfMatch; // number of match
99 unsigned int posQuery; // position in query; used for detecting diagonal hits
100 unsigned int info; // extra hit information; to be copied to hitList.info
101 } SaIndexGroupNew;
103 typedef struct SaIndexGroupTemp { // SA index range and information of a particular error arrangement of a matched sub-pattern
104 unsigned long long startSaIndex1; // starting SA index
105 unsigned int numOfMatch1; // number of match
106 unsigned long long startSaIndex2; // position in query; used for detecting diagonal hits
107 unsigned int numOfMatch2; // extra hit information; to be copied to hitList.info
108 } SaIndexGroupTemp;
110 typedef struct SaIndexGroupOld { // SA index range and information of a particular error arrangement of a matched sub-pattern
111 unsigned long long startSaIndex; // starting SA index
112 unsigned int numOfMatch; // number of match
113 unsigned int info; // extra hit information; to be copied to hitList.info
114 } SaIndexGroupOld;
116 typedef struct SaIndexGroup { // SA index range and information of a particular error arrangement of a matched sub-pattern
117 unsigned long long startSaIndex; // starting SA index
118 unsigned int numOfMatch; // number of match
119 unsigned int info; // extra hit information
120 } SaIndexGroup;
122 typedef struct SaIndexGroupWithErrorBitVector { // SA index range and information of a particular error arrangement of a matched sub-pattern
123 unsigned long long startSaIndex; // starting SA index
124 unsigned int numOfMatch; // number of match
125 unsigned int errorBitVector; // error bit vector
126 } SaIndexGroupWithErrorBitVector;
128 typedef struct SaIndexGroupWithLengthError { // SA index range and information of a particular error arrangement of a matched sub-pattern
129 unsigned long long startSaIndex; // starting SA index
130 unsigned int numOfMatch; // number of match
131 unsigned posQuery : 16; // position in query
132 unsigned length : 8; // length of hit
133 unsigned error : 8; // error in hit
134 } SaIndexGroupWithLengthError;
136 typedef struct SaIndexGroupProcessed { // Alternative usage of SaIndexGroup - once processed, error bit vector is replaced by index to text position
137 unsigned long long startSaIndex; // starting SA index
138 unsigned int numOfMatch; // number of match
139 unsigned long long textPositionIndex; // storing the pointer to text position
140 } SaIndexGroupProcessed;
142 typedef struct DupSaIndexGroup { // Alternative usage of SaIndexGroup - the group duplicates another group
143 unsigned long long lastDupSaIndexGroupIndex; // index to last duplicated group
144 unsigned long long saIndexGroupIndex; // index to the first SA into group among the duplicates
145 unsigned long long textPositionIndex; // storing the pointer to text position
146 } DupSaIndexGroup;
148 typedef struct SaIndexGroupHash { // Hash table for checking duplicate SA index group
149 unsigned long long startSaIndex;
150 unsigned long long saIndexGroupIndex;
151 } SaIndexGroupHash;
153 typedef struct BWTSaRetrievalStatistics {
154 unsigned long long bwtSaRetrieved;
155 unsigned long long saDiagonalLinked;
156 unsigned long long saDuplicated;
157 unsigned long long cachedSaRetrieved;
158 } BWTSaRetrievalStatistics;
160 typedef struct BWTDPStatistics {
161 int maxDepth;
162 int maxDPCell;
163 int maxDPMemoryInWord;
164 int totalMaxDepth;
165 int totalMaxDPCell;
166 int totalMaxDPMemoryInWord;
167 LONG acceptedPathDepth;
168 LONG acceptedPath;
169 LONG rejectedPathDepth;
170 LONG rejectedPath;
171 LONG* __restrict totalNode;
172 LONG* __restrict rejectedNode;
173 LONG* __restrict totalDPCell;
174 } BWTDPStatistics;
176 typedef struct SaIndexList {
177 unsigned long long saIndex;
178 unsigned long long textPositionIndex;
179 } SaIndexList;
181 typedef struct HitCombination {
182 int numOfCombination;
183 int maxError;
184 int keyLength;
185 int skipTableWidth;
186 int *errorPos;
187 int *skip;
188 int *skipErrorIndex;
189 } HitCombination;
191 typedef struct DPText {
192 int charBeingProcessed;
193 int dpCellIndex;
194 int numOfDpCellSegment;
195 unsigned long long dummy1; // Must not be removed; so that saIndexLeft and saIndexRight are aligned to 16 byte boundary
196 unsigned long long saIndexLeft[ALPHABET_SIZE];
197 unsigned long long saIndexRight[ALPHABET_SIZE];
198 } DPText;
200 typedef struct DPScanDepth {
201 unsigned P : 31;
202 unsigned withAmbiguity : 1;
203 } DPScanDepth;
206 // Load / unload functions
207 BWT *BWTCreate(MMPool *mmPool, const unsigned long long textLength, unsigned long long *decodeTable);
208 BWT *BWTLoad(MMPool *mmPool, const char *bwtCodeFileName, const char *occValueFileName,
209 const char *saValueFileName, const char *inverseSaFileName, const char *saIndexRangeFileName,
210 unsigned long long *decodeTable);
211 void BWTFree(MMPool *mmPool, BWT *bwt);
212 void BWTPrintMemoryUsage(const BWT *bwt, FILE *output, const unsigned long long packedDNASize);
214 // Precalculate frequenctly accessed data
215 void BWTGenerateSaValueOnBoundary(MMPool *mmPool, BWT *bwt);
217 // Core functions
218 // The following must be customized for differenet compression schemes ***
219 unsigned long long BWTDecode(const BWT *bwt, const unsigned long long index1, const unsigned long long index2, const unsigned int character);
220 void BWTDecodeAll(const BWT *bwt, const unsigned long long index1, const unsigned long long index2, unsigned long long* __restrict occValue);
221 unsigned long long BWTOccValue(const BWT *bwt, unsigned long long index, const unsigned int character);
222 void BWTOccValueTwoIndex(const BWT *bwt, unsigned long long index1, unsigned long long index2, const unsigned int character, unsigned long long* __restrict occValue);
223 void BWTAllOccValue(const BWT *bwt, unsigned long long index, unsigned long long* __restrict occValue);
224 void BWTAllOccValueTwoIndex(const BWT *bwt, unsigned long long index1, unsigned long long index2, unsigned long long* __restrict occValue1, unsigned long long* __restrict occValue2);
225 unsigned long long BWTOccValueOnSpot(const BWT *bwt, unsigned long long index, unsigned long long* __restrict character);
226 unsigned long long BWTSearchOccValue(const BWT *bwt, const unsigned int character, const unsigned long long searchOccValue);
229 // Utility functions for no compression only
230 unsigned long long BWTResidentSizeInWord(const unsigned long long numChar);
231 unsigned long long BWTFileSizeInWord(const unsigned long long numChar);
232 void BWTClearTrailingBwtCode(BWT *bwt);
234 // These are generic to different compression schemes (and generic to no compression as well)
235 unsigned long long BWTPsiMinusValue(const BWT *bwt, const unsigned long long index);
236 unsigned long long BWTPsiPlusValue(const BWT *bwt, const unsigned long long index);
237 unsigned long long BWTSaValue(const BWT *bwt, unsigned long long index);
238 unsigned long long BWTInverseSa(const BWT *bwt, unsigned long long saValue);
239 unsigned long long BWTOccIntervalMajor(const unsigned long long occInterval);
240 unsigned long long BWTOccValueMinorSizeInWord(const unsigned long long numChar);
241 unsigned long long BWTOccValueMajorSizeInWord(const unsigned long long numChar);
243 // Search functions
244 // packedText should be allocated with at least 1 Word buffer initialized to zero
245 int BWTForwardSearch(const unsigned int *packedKey, const unsigned int keyLength, const BWT *bwt, const unsigned int *packedText);
246 int BWTForwardSearchSaIndex(const unsigned int *packedKey, const unsigned int keyLength, const BWT *bwt, const unsigned int *packedText,
247 unsigned long long *resultSaIndexLeft, unsigned long long *resultSaIndexRight);
248 int BWTSaBinarySearch(const unsigned char *convertedKey, const unsigned int keyLength, const BWT *bwt, const unsigned int *packedText,
249 unsigned long long *resultSaIndexLeft, unsigned long long *resultSaIndexRight, unsigned int *tempKey); // tempKey = buffer large enough to hold packed key
250 int BWTBackwardSearch(const unsigned char *convertedKey, const unsigned int keyLength, const BWT *bwt,
251 unsigned long long *resultSaIndexLeft, unsigned long long *resultSaIndexRight);
252 int BWTBackwardSearchCheckWithText(const unsigned char *convertedKey, const unsigned int *packedKey, const unsigned int keyLength,
253 const BWT *bwt, const unsigned int *packedText, const unsigned int textCheckingCostFactor,
254 const unsigned int maxnumOfTextPosition, HitList* __restrict hitList,
255 unsigned long long *resultSaIndexLeft, unsigned long long *resultSaIndexRight);
257 // Approximate match functions - brute force deep first search by backward search is used
258 unsigned long long BWTHammingDistMaxSaIndexGroup(const unsigned int keyLength, const unsigned int maxError);
259 unsigned long long BWTHammingDistCountOcc(const unsigned char *convertedKey, const unsigned int keyLength, const BWT *bwt, const unsigned int maxError);
260 unsigned long long BWTHammingDistMatch(const BWT *bwt, const unsigned char *convertedKey, const HitCombination *hitCombination,
261 const unsigned long long *cachedSaIndex, const unsigned long long cachedSaIndexNumOfChar,
262 SaIndexGroupNew* __restrict saIndexGroup, const unsigned int maxSaIndexGroup);
263 int BWTHammingDistMatchOld(const unsigned char *convertedKey, const unsigned int keyLength, const BWT *bwt, const unsigned int maxError,
264 SaIndexGroupNew* __restrict saIndexGroup, const unsigned long long maxSaIndexGroup,
265 const unsigned long long posQuery, const unsigned long long info);
266 unsigned long long BWTEditDistMaxSaIndexGroup(const unsigned int keyLength, const unsigned int maxError);
267 // Does not insert characters on pattern boundary
268 unsigned long long BWTEditDistMatch(const unsigned char *convertedKey, const unsigned int keyLength, const BWT *bwt, const unsigned int maxError,
269 SaIndexGroupWithLengthError* __restrict saIndexGroup, const unsigned int maxSaIndexGroup);
270 unsigned long long BWTEditDistMatchOld(const unsigned char *convertedKey, const unsigned int keyLength, const BWT *bwt, const unsigned int maxError,
271 SaIndexGroupWithLengthError* __restrict saIndexGroup, const unsigned int maxSaIndexGroup);
274 unsigned long long BWTEliminateDupSaIndexGroup(SaIndexGroupWithLengthError* __restrict saIndexGroup, const unsigned long long numOfSaGroup);
276 unsigned long long BWTSubPatternHammingDistCountOcc(const unsigned char *convertedKey, const unsigned int keyLength, const unsigned int subPatternLength, const BWT *bwt,
277 const unsigned int maxError, const unsigned int skip);
278 int BWTSubPatternHammingDistSaIndex(const BWT *bwt, const unsigned char *convertedKey, const int keyLength, const int skip,
279 const HitCombination *hitCombination,
280 const unsigned long long *cachedSaIndex, const unsigned long long cachedSaIndexNumOfChar,
281 SaIndexGroupNew* __restrict saIndexGroup, const int maxnumOfSaIndexGroup,
282 int* __restrict firstSaIndexGroupForSubPattern);
283 int BWTSubPatternHammingDistSaIndexOld(const unsigned char *convertedKey, const int keyLength, const int subPatternLength, const BWT *bwt,
284 const int maxError, const int skip, const int lengthProcessed, int* __restrict lengthInCurrentRound,
285 SaIndexGroupNew* __restrict saIndexGroup, const int maxnumOfSaIndexGroup);
286 int BWTDPHit(const BWT *bwt, SaIndexGroupNew* __restrict saIndexGroup, const long long numOfSaIndexGroup,
287 const int firstSaIndexGrouptoProcess, int* __restrict saIndexGroupProcessed,
288 const int discardDiagonalHit,
289 char* workingMemory, const int workingMemorySize,
290 BWTSaRetrievalStatistics* __restrict bwtSaRetrievalStatistics);
291 //unsigned int BWTSubPatternHammingDistMatch(const unsigned char *convertedKey, const unsigned int keyLength, const unsigned int subPatternLength, const BWT *bwt,
292 // const unsigned int maxError, const unsigned int skip, const unsigned int matchBitVector,
293 // const SaIndexRange *saIndexRange, const unsigned int saIndexRangeNumOfChar,
294 // char *workingMemory, const unsigned int workingMemorySize, const unsigned int sortOption,
295 // BWTSaRetrievalStatistics* __restrict bwtSaRetrievalStatistics);
296 unsigned long long BWTSubPatternEditDistMatch(const unsigned char *convertedKey, const unsigned int keyLength, const unsigned int subPatternLength, const BWT *bwt,
297 const unsigned int maxError, const unsigned int skip, const unsigned int maxnumOfHit,
298 HitListWithPosQueryLengthError* __restrict hitList, BWTSaRetrievalStatistics* __restrict bwtSaRetrievalStatistics,
299 const unsigned int eliminateDuplicateStartingPos);
300 int BWTGappedDPDBvsQuery(BWT *bwt, const unsigned char *convertedKey, const int queryPatternLength,
301 char* __restrict workingMemory, const int workingMemorySize, int* __restrict totalNumOfQueryPos,
302 const int matchScore, const int mismatchScore,
303 const int gapOpenScore, const int gapExtendScore,
304 const int cutoffScore,
305 BWTDPStatistics* __restrict bwtDPStatistics,
306 const int printProgressDepth);
308 // Text retrieval functions
309 // Position in text will be placed at the first word of hitListSizeInWord
311 // startSaIndex + resultInfo must be sorted in increasing order; there must be no overlapping groups except that one group can completely enclose another
312 unsigned long long BWTTextPosition(const BWT *bwt, const SaIndexGroupNew *saIndexGroup, const unsigned long long numOfSaIndexGroups,
313 HitList* __restrict hitList,
314 SaIndexList* __restrict tempSaIndexList1, SaIndexList* __restrict tempSaIndexList2,
315 BWTSaRetrievalStatistics* __restrict bwtSaRetrievalStatistics, const unsigned long long eliminateDuplicateStartingPos);
317 unsigned long long BWTDecodeTextPosition(const BWT *bwt, SaIndexList* __restrict evenIterationSaIndexList, SaIndexList* __restrict oddIterationSaIndexList,
318 const unsigned long long numOfSaIndex, const unsigned long long numOfIterationProcessed, const unsigned long long maxnumOfIteration,
319 HitList* __restrict hitList, BWTSaRetrievalStatistics* __restrict bwtSaRetrievalStatistics);
322 unsigned long long BWTDecompressText(const BWT *bwt, const unsigned long long endSaIndex, const unsigned long long length, const unsigned char *reverseCharMap,
323 unsigned char *decompressedText);
324 unsigned long long BWTDecompressTextAsWordPacked(const BWT *bwt, const unsigned long long endSaIndex, const unsigned long long length, unsigned long long *decompressedText);
326 void BWTPrintDPStatistics(FILE * outFile, const BWTDPStatistics* bwtDPStatistics);
327 void BWTInitializeSaRetrievalStatistics(BWTSaRetrievalStatistics *bwtSaRetrievalStatistics);
328 void BWTAllocateDPStatistics(BWTDPStatistics *bwtDPStatistics);
329 void BWTInitializeDPStatistics(BWTDPStatistics *bwtDPStatistics);
330 void BWTFreeDPStatistics(BWTDPStatistics *bwtDPStatistics);
332 // QSort comparison functions
333 int SaIndexGroupStartSaIndexOrder(const void *saIndexGroup, const long long index1, const long long index2);
334 int SaIndexGroupStartSaIndexLengthErrorOrder(const void *saIndexGroup, const long long index1, const long long index2);
335 int HitListPosTextErrorLengthOrder(const void *hitList, const long long index1, const long long index2);
336 int HitListPosText16BitOrder(const void *hitList, const long long index1, const long long index2);
337 int HitListPosTextOrder(const void *hitList, const long long index1, const long long index2);
338 int GappedHitListScorePosTextOrder(const void *gappedHitList, const long long index1, const long long index2);
339 int GappedHitListDbSeqIndexScorePosTextOrder(const void *gappedHitList, const long long index1, const long long index2);
342 #endif