5 This module contains an implementation of BWT-index for alphabet size = 4.
6 The functions provided include:
7 Load functions for loading BWT to memory;
8 Core functions for accessing core Inverse Psi values;
9 Search functions for searching patterns from text;
10 Text retrieval functions for retrieving text from BWT.
12 Copyright (C) 2004, Wong Chi Kwong.
14 This program is free software; you can redistribute it and/or
15 modify it under the terms of the GNU General Public License
16 as published by the Free Software Foundation; either version 2
17 of the License, or (at your option) any later version.
19 This program is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 GNU General Public License for more details.
24 You should have received a copy of the GNU General Public License
25 along with this program; if not, write to the Free Software
26 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
33 #include "TypeNLimit.h"
34 #include "MemManager.h"
35 #include "TextConverter.h"
38 #define BITS_PER_OCC_VALUE 16
39 #define OCC_VALUE_PER_WORD 2
40 #define OCC_VALUE_PER_LONG 4
41 #define OCC_INTERVAL 256
42 #define WORD_BETWEEN_OCC 16
43 #define OCC_INTERVAL_MAJOR 65536
50 #define NUM_BUCKET 65536
52 #define MAX_APPROX_MATCH_ERROR 7
53 #define MAX_ARPROX_MATCH_LENGTH 64
55 #define BWTDP_MAX_SUBSTRING_LENGTH 512
57 #define ESTIMATED_OCC_DIFF 32 // 128 / 4
58 #define MAX_OCC_DIFF 128
63 unsigned long long textLength
; // length of the text
64 unsigned long long saInterval
; // interval between two SA values stored explicitly
65 unsigned long long inverseSaInterval
; // interval between two inverse SA stored explicitly
66 unsigned long long inverseSa0
; // SA-1[0]
67 unsigned long long *cumulativeFreq
; // cumulative frequency
68 unsigned int *bwtCode
; // BWT code
69 unsigned int *occValue
; // Occurrence values stored explicitly
70 unsigned long long *occValueMajor
; // Occurrence values stored explicitly
71 unsigned long long *saValue
; // SA values stored explicitly
72 unsigned long long *inverseSa
; // Inverse SA stored explicitly
73 unsigned long long *cachedSaIndex
; // Cached SA index
74 unsigned long long cachedSaIndexNumOfChar
; // Number of characters indexed in SA index range
75 unsigned long long *saValueOnBoundary
; // Pre-calculated frequently referred data
76 unsigned long long *decodeTable
; // For decoding BWT by table lookup
77 unsigned int decodeTableGenerated
; // == TRUE if decode table is generated on load and will be freed
78 unsigned long long bwtSizeInWord
; // Temporary variable to hold the memory allocated
79 unsigned long long occSizeInWord
; // Temporary variable to hold the memory allocated
80 unsigned long long occMajorSizeInWord
; // Temporary variable to hold the memory allocated
81 unsigned long long saValueSizeInWord
; // Temporary variable to hold the memory allocated
82 unsigned long long inverseSaSizeInWord
; // Temporary variable to hold the memory allocated
83 unsigned long long cachedSaIndexSizeInWord
; // Temporary variable to hold the memory allocated
86 #define MAX_DIAGONAL_LEVEL 4 // Number of sub-pattern to keep for detecting diagonal hit
88 // Error information is stored as:
90 // After hamming distance match
92 // After edit distance match
94 // After the hits are processed with scoring functions
96 typedef struct SaIndexGroupNew
{ // SA index range and information of a particular error arrangement of a matched sub-pattern
97 unsigned long long startSaIndex
; // starting SA index
98 unsigned int numOfMatch
; // number of match
99 unsigned int posQuery
; // position in query; used for detecting diagonal hits
100 unsigned int info
; // extra hit information; to be copied to hitList.info
103 typedef struct SaIndexGroupTemp
{ // SA index range and information of a particular error arrangement of a matched sub-pattern
104 unsigned long long startSaIndex1
; // starting SA index
105 unsigned int numOfMatch1
; // number of match
106 unsigned long long startSaIndex2
; // position in query; used for detecting diagonal hits
107 unsigned int numOfMatch2
; // extra hit information; to be copied to hitList.info
110 typedef struct SaIndexGroupOld
{ // SA index range and information of a particular error arrangement of a matched sub-pattern
111 unsigned long long startSaIndex
; // starting SA index
112 unsigned int numOfMatch
; // number of match
113 unsigned int info
; // extra hit information; to be copied to hitList.info
116 typedef struct SaIndexGroup
{ // SA index range and information of a particular error arrangement of a matched sub-pattern
117 unsigned long long startSaIndex
; // starting SA index
118 unsigned int numOfMatch
; // number of match
119 unsigned int info
; // extra hit information
122 typedef struct SaIndexGroupWithErrorBitVector
{ // SA index range and information of a particular error arrangement of a matched sub-pattern
123 unsigned long long startSaIndex
; // starting SA index
124 unsigned int numOfMatch
; // number of match
125 unsigned int errorBitVector
; // error bit vector
126 } SaIndexGroupWithErrorBitVector
;
128 typedef struct SaIndexGroupWithLengthError
{ // SA index range and information of a particular error arrangement of a matched sub-pattern
129 unsigned long long startSaIndex
; // starting SA index
130 unsigned int numOfMatch
; // number of match
131 unsigned posQuery
: 16; // position in query
132 unsigned length
: 8; // length of hit
133 unsigned error
: 8; // error in hit
134 } SaIndexGroupWithLengthError
;
136 typedef struct SaIndexGroupProcessed
{ // Alternative usage of SaIndexGroup - once processed, error bit vector is replaced by index to text position
137 unsigned long long startSaIndex
; // starting SA index
138 unsigned int numOfMatch
; // number of match
139 unsigned long long textPositionIndex
; // storing the pointer to text position
140 } SaIndexGroupProcessed
;
142 typedef struct DupSaIndexGroup
{ // Alternative usage of SaIndexGroup - the group duplicates another group
143 unsigned long long lastDupSaIndexGroupIndex
; // index to last duplicated group
144 unsigned long long saIndexGroupIndex
; // index to the first SA into group among the duplicates
145 unsigned long long textPositionIndex
; // storing the pointer to text position
148 typedef struct SaIndexGroupHash
{ // Hash table for checking duplicate SA index group
149 unsigned long long startSaIndex
;
150 unsigned long long saIndexGroupIndex
;
153 typedef struct BWTSaRetrievalStatistics
{
154 unsigned long long bwtSaRetrieved
;
155 unsigned long long saDiagonalLinked
;
156 unsigned long long saDuplicated
;
157 unsigned long long cachedSaRetrieved
;
158 } BWTSaRetrievalStatistics
;
160 typedef struct BWTDPStatistics
{
163 int maxDPMemoryInWord
;
166 int totalMaxDPMemoryInWord
;
167 LONG acceptedPathDepth
;
169 LONG rejectedPathDepth
;
171 LONG
* __restrict totalNode
;
172 LONG
* __restrict rejectedNode
;
173 LONG
* __restrict totalDPCell
;
176 typedef struct SaIndexList
{
177 unsigned long long saIndex
;
178 unsigned long long textPositionIndex
;
181 typedef struct HitCombination
{
182 int numOfCombination
;
191 typedef struct DPText
{
192 int charBeingProcessed
;
194 int numOfDpCellSegment
;
195 unsigned long long dummy1
; // Must not be removed; so that saIndexLeft and saIndexRight are aligned to 16 byte boundary
196 unsigned long long saIndexLeft
[ALPHABET_SIZE
];
197 unsigned long long saIndexRight
[ALPHABET_SIZE
];
200 typedef struct DPScanDepth
{
202 unsigned withAmbiguity
: 1;
206 // Load / unload functions
207 BWT
*BWTCreate(MMPool
*mmPool
, const unsigned long long textLength
, unsigned long long *decodeTable
);
208 BWT
*BWTLoad(MMPool
*mmPool
, const char *bwtCodeFileName
, const char *occValueFileName
,
209 const char *saValueFileName
, const char *inverseSaFileName
, const char *saIndexRangeFileName
,
210 unsigned long long *decodeTable
);
211 void BWTFree(MMPool
*mmPool
, BWT
*bwt
);
212 void BWTPrintMemoryUsage(const BWT
*bwt
, FILE *output
, const unsigned long long packedDNASize
);
214 // Precalculate frequenctly accessed data
215 void BWTGenerateSaValueOnBoundary(MMPool
*mmPool
, BWT
*bwt
);
218 // The following must be customized for differenet compression schemes ***
219 unsigned long long BWTDecode(const BWT
*bwt
, const unsigned long long index1
, const unsigned long long index2
, const unsigned int character
);
220 void BWTDecodeAll(const BWT
*bwt
, const unsigned long long index1
, const unsigned long long index2
, unsigned long long* __restrict occValue
);
221 unsigned long long BWTOccValue(const BWT
*bwt
, unsigned long long index
, const unsigned int character
);
222 void BWTOccValueTwoIndex(const BWT
*bwt
, unsigned long long index1
, unsigned long long index2
, const unsigned int character
, unsigned long long* __restrict occValue
);
223 void BWTAllOccValue(const BWT
*bwt
, unsigned long long index
, unsigned long long* __restrict occValue
);
224 void BWTAllOccValueTwoIndex(const BWT
*bwt
, unsigned long long index1
, unsigned long long index2
, unsigned long long* __restrict occValue1
, unsigned long long* __restrict occValue2
);
225 unsigned long long BWTOccValueOnSpot(const BWT
*bwt
, unsigned long long index
, unsigned long long* __restrict character
);
226 unsigned long long BWTSearchOccValue(const BWT
*bwt
, const unsigned int character
, const unsigned long long searchOccValue
);
229 // Utility functions for no compression only
230 unsigned long long BWTResidentSizeInWord(const unsigned long long numChar
);
231 unsigned long long BWTFileSizeInWord(const unsigned long long numChar
);
232 void BWTClearTrailingBwtCode(BWT
*bwt
);
234 // These are generic to different compression schemes (and generic to no compression as well)
235 unsigned long long BWTPsiMinusValue(const BWT
*bwt
, const unsigned long long index
);
236 unsigned long long BWTPsiPlusValue(const BWT
*bwt
, const unsigned long long index
);
237 unsigned long long BWTSaValue(const BWT
*bwt
, unsigned long long index
);
238 unsigned long long BWTInverseSa(const BWT
*bwt
, unsigned long long saValue
);
239 unsigned long long BWTOccIntervalMajor(const unsigned long long occInterval
);
240 unsigned long long BWTOccValueMinorSizeInWord(const unsigned long long numChar
);
241 unsigned long long BWTOccValueMajorSizeInWord(const unsigned long long numChar
);
244 // packedText should be allocated with at least 1 Word buffer initialized to zero
245 int BWTForwardSearch(const unsigned int *packedKey
, const unsigned int keyLength
, const BWT
*bwt
, const unsigned int *packedText
);
246 int BWTForwardSearchSaIndex(const unsigned int *packedKey
, const unsigned int keyLength
, const BWT
*bwt
, const unsigned int *packedText
,
247 unsigned long long *resultSaIndexLeft
, unsigned long long *resultSaIndexRight
);
248 int BWTSaBinarySearch(const unsigned char *convertedKey
, const unsigned int keyLength
, const BWT
*bwt
, const unsigned int *packedText
,
249 unsigned long long *resultSaIndexLeft
, unsigned long long *resultSaIndexRight
, unsigned int *tempKey
); // tempKey = buffer large enough to hold packed key
250 int BWTBackwardSearch(const unsigned char *convertedKey
, const unsigned int keyLength
, const BWT
*bwt
,
251 unsigned long long *resultSaIndexLeft
, unsigned long long *resultSaIndexRight
);
252 int BWTBackwardSearchCheckWithText(const unsigned char *convertedKey
, const unsigned int *packedKey
, const unsigned int keyLength
,
253 const BWT
*bwt
, const unsigned int *packedText
, const unsigned int textCheckingCostFactor
,
254 const unsigned int maxnumOfTextPosition
, HitList
* __restrict hitList
,
255 unsigned long long *resultSaIndexLeft
, unsigned long long *resultSaIndexRight
);
257 // Approximate match functions - brute force deep first search by backward search is used
258 unsigned long long BWTHammingDistMaxSaIndexGroup(const unsigned int keyLength
, const unsigned int maxError
);
259 unsigned long long BWTHammingDistCountOcc(const unsigned char *convertedKey
, const unsigned int keyLength
, const BWT
*bwt
, const unsigned int maxError
);
260 unsigned long long BWTHammingDistMatch(const BWT
*bwt
, const unsigned char *convertedKey
, const HitCombination
*hitCombination
,
261 const unsigned long long *cachedSaIndex
, const unsigned long long cachedSaIndexNumOfChar
,
262 SaIndexGroupNew
* __restrict saIndexGroup
, const unsigned int maxSaIndexGroup
);
263 int BWTHammingDistMatchOld(const unsigned char *convertedKey
, const unsigned int keyLength
, const BWT
*bwt
, const unsigned int maxError
,
264 SaIndexGroupNew
* __restrict saIndexGroup
, const unsigned long long maxSaIndexGroup
,
265 const unsigned long long posQuery
, const unsigned long long info
);
266 unsigned long long BWTEditDistMaxSaIndexGroup(const unsigned int keyLength
, const unsigned int maxError
);
267 // Does not insert characters on pattern boundary
268 unsigned long long BWTEditDistMatch(const unsigned char *convertedKey
, const unsigned int keyLength
, const BWT
*bwt
, const unsigned int maxError
,
269 SaIndexGroupWithLengthError
* __restrict saIndexGroup
, const unsigned int maxSaIndexGroup
);
270 unsigned long long BWTEditDistMatchOld(const unsigned char *convertedKey
, const unsigned int keyLength
, const BWT
*bwt
, const unsigned int maxError
,
271 SaIndexGroupWithLengthError
* __restrict saIndexGroup
, const unsigned int maxSaIndexGroup
);
274 unsigned long long BWTEliminateDupSaIndexGroup(SaIndexGroupWithLengthError
* __restrict saIndexGroup
, const unsigned long long numOfSaGroup
);
276 unsigned long long BWTSubPatternHammingDistCountOcc(const unsigned char *convertedKey
, const unsigned int keyLength
, const unsigned int subPatternLength
, const BWT
*bwt
,
277 const unsigned int maxError
, const unsigned int skip
);
278 int BWTSubPatternHammingDistSaIndex(const BWT
*bwt
, const unsigned char *convertedKey
, const int keyLength
, const int skip
,
279 const HitCombination
*hitCombination
,
280 const unsigned long long *cachedSaIndex
, const unsigned long long cachedSaIndexNumOfChar
,
281 SaIndexGroupNew
* __restrict saIndexGroup
, const int maxnumOfSaIndexGroup
,
282 int* __restrict firstSaIndexGroupForSubPattern
);
283 int BWTSubPatternHammingDistSaIndexOld(const unsigned char *convertedKey
, const int keyLength
, const int subPatternLength
, const BWT
*bwt
,
284 const int maxError
, const int skip
, const int lengthProcessed
, int* __restrict lengthInCurrentRound
,
285 SaIndexGroupNew
* __restrict saIndexGroup
, const int maxnumOfSaIndexGroup
);
286 int BWTDPHit(const BWT
*bwt
, SaIndexGroupNew
* __restrict saIndexGroup
, const long long numOfSaIndexGroup
,
287 const int firstSaIndexGrouptoProcess
, int* __restrict saIndexGroupProcessed
,
288 const int discardDiagonalHit
,
289 char* workingMemory
, const int workingMemorySize
,
290 BWTSaRetrievalStatistics
* __restrict bwtSaRetrievalStatistics
);
291 //unsigned int BWTSubPatternHammingDistMatch(const unsigned char *convertedKey, const unsigned int keyLength, const unsigned int subPatternLength, const BWT *bwt,
292 // const unsigned int maxError, const unsigned int skip, const unsigned int matchBitVector,
293 // const SaIndexRange *saIndexRange, const unsigned int saIndexRangeNumOfChar,
294 // char *workingMemory, const unsigned int workingMemorySize, const unsigned int sortOption,
295 // BWTSaRetrievalStatistics* __restrict bwtSaRetrievalStatistics);
296 unsigned long long BWTSubPatternEditDistMatch(const unsigned char *convertedKey
, const unsigned int keyLength
, const unsigned int subPatternLength
, const BWT
*bwt
,
297 const unsigned int maxError
, const unsigned int skip
, const unsigned int maxnumOfHit
,
298 HitListWithPosQueryLengthError
* __restrict hitList
, BWTSaRetrievalStatistics
* __restrict bwtSaRetrievalStatistics
,
299 const unsigned int eliminateDuplicateStartingPos
);
300 int BWTGappedDPDBvsQuery(BWT
*bwt
, const unsigned char *convertedKey
, const int queryPatternLength
,
301 char* __restrict workingMemory
, const int workingMemorySize
, int* __restrict totalNumOfQueryPos
,
302 const int matchScore
, const int mismatchScore
,
303 const int gapOpenScore
, const int gapExtendScore
,
304 const int cutoffScore
,
305 BWTDPStatistics
* __restrict bwtDPStatistics
,
306 const int printProgressDepth
);
308 // Text retrieval functions
309 // Position in text will be placed at the first word of hitListSizeInWord
311 // startSaIndex + resultInfo must be sorted in increasing order; there must be no overlapping groups except that one group can completely enclose another
312 unsigned long long BWTTextPosition(const BWT
*bwt
, const SaIndexGroupNew
*saIndexGroup
, const unsigned long long numOfSaIndexGroups
,
313 HitList
* __restrict hitList
,
314 SaIndexList
* __restrict tempSaIndexList1
, SaIndexList
* __restrict tempSaIndexList2
,
315 BWTSaRetrievalStatistics
* __restrict bwtSaRetrievalStatistics
, const unsigned long long eliminateDuplicateStartingPos
);
317 unsigned long long BWTDecodeTextPosition(const BWT
*bwt
, SaIndexList
* __restrict evenIterationSaIndexList
, SaIndexList
* __restrict oddIterationSaIndexList
,
318 const unsigned long long numOfSaIndex
, const unsigned long long numOfIterationProcessed
, const unsigned long long maxnumOfIteration
,
319 HitList
* __restrict hitList
, BWTSaRetrievalStatistics
* __restrict bwtSaRetrievalStatistics
);
322 unsigned long long BWTDecompressText(const BWT
*bwt
, const unsigned long long endSaIndex
, const unsigned long long length
, const unsigned char *reverseCharMap
,
323 unsigned char *decompressedText
);
324 unsigned long long BWTDecompressTextAsWordPacked(const BWT
*bwt
, const unsigned long long endSaIndex
, const unsigned long long length
, unsigned long long *decompressedText
);
326 void BWTPrintDPStatistics(FILE * outFile
, const BWTDPStatistics
* bwtDPStatistics
);
327 void BWTInitializeSaRetrievalStatistics(BWTSaRetrievalStatistics
*bwtSaRetrievalStatistics
);
328 void BWTAllocateDPStatistics(BWTDPStatistics
*bwtDPStatistics
);
329 void BWTInitializeDPStatistics(BWTDPStatistics
*bwtDPStatistics
);
330 void BWTFreeDPStatistics(BWTDPStatistics
*bwtDPStatistics
);
332 // QSort comparison functions
333 int SaIndexGroupStartSaIndexOrder(const void *saIndexGroup
, const long long index1
, const long long index2
);
334 int SaIndexGroupStartSaIndexLengthErrorOrder(const void *saIndexGroup
, const long long index1
, const long long index2
);
335 int HitListPosTextErrorLengthOrder(const void *hitList
, const long long index1
, const long long index2
);
336 int HitListPosText16BitOrder(const void *hitList
, const long long index1
, const long long index2
);
337 int HitListPosTextOrder(const void *hitList
, const long long index1
, const long long index2
);
338 int GappedHitListScorePosTextOrder(const void *gappedHitList
, const long long index1
, const long long index2
);
339 int GappedHitListDbSeqIndexScorePosTextOrder(const void *gappedHitList
, const long long index1
, const long long index2
);