3 # Copyright (C) 2015, The University of Hong Kong.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 3
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 03110-1301, USA.
21 Change : Packaging 2BWT library as a separate product.
22 Enhancing the 2BWT library with interface functions
27 #include "2BWT-Interface.h"
29 Idx2BWT
* BWTLoad2BWT(const char * indexFilePrefix
, const char * saFileNameExtension
) {
31 Idx2BWT
* idx2BWT
= (Idx2BWT
*) malloc(sizeof(Idx2BWT
));
36 char bwtFilename
[MAX_INDEX_FILENAME_LENGTH
];
37 char bwtOccFilename
[MAX_INDEX_FILENAME_LENGTH
];
38 char saFilename
[MAX_INDEX_FILENAME_LENGTH
];
39 char rev_bwtFilename
[MAX_INDEX_FILENAME_LENGTH
];
40 char rev_bwtOccFilename
[MAX_INDEX_FILENAME_LENGTH
];
41 char packedDnaFilename
[MAX_INDEX_FILENAME_LENGTH
];
42 char annotationFilename
[MAX_INDEX_FILENAME_LENGTH
];
43 char ambiguityFilename
[MAX_INDEX_FILENAME_LENGTH
];
44 char translateFilename
[MAX_INDEX_FILENAME_LENGTH
];
46 strcpy(bwtFilename
,indexFilePrefix
);
47 strcpy(bwtOccFilename
,indexFilePrefix
);
48 strcpy(saFilename
,indexFilePrefix
);
49 strcpy(rev_bwtFilename
,indexFilePrefix
);
50 strcpy(rev_bwtOccFilename
,indexFilePrefix
);
51 strcpy(packedDnaFilename
,indexFilePrefix
);
52 strcpy(annotationFilename
,indexFilePrefix
);
53 strcpy(ambiguityFilename
,indexFilePrefix
);
54 strcpy(translateFilename
,indexFilePrefix
);
56 strcat(bwtFilename
,".bwt");
57 strcat(bwtOccFilename
,".fmv");
58 strcat(saFilename
,saFileNameExtension
);
59 strcat(rev_bwtFilename
,".rev.bwt");
60 strcat(rev_bwtOccFilename
,".rev.fmv");
61 strcat(packedDnaFilename
,".pac");
62 strcat(annotationFilename
,".ann");
63 strcat(ambiguityFilename
,".amb");
64 strcat(translateFilename
,".tra");
66 MMMasterInitialize(3, 0, FALSE
, NULL
);
67 MMPool
* mmPool
= MMPoolCreate(2097152);
69 bwt
= BWTLoad(mmPool
, bwtFilename
, bwtOccFilename
, saFilename
, NULL
, NULL
, NULL
);
70 rev_bwt
= BWTLoad(mmPool
, rev_bwtFilename
, rev_bwtOccFilename
, NULL
, NULL
, NULL
, NULL
);
71 hsp
= HSPLoad(mmPool
, packedDnaFilename
, annotationFilename
, ambiguityFilename
,translateFilename
, 1);
73 HSPFillCharMap(idx2BWT
->charMap
);
74 HSPFillComplementMap(idx2BWT
->complementMap
);
77 idx2BWT
->rev_bwt
= rev_bwt
;
79 idx2BWT
->mmPool
= mmPool
;
81 idx2BWT
->numReads
= 0;
82 idx2BWT
->readIDtable
= NULL
;
87 Idx2BWT
*BWTLoad2BWTLite(const char * indexFilePrefix
) {
88 Idx2BWT
* idx2BWT
= (Idx2BWT
*) malloc(sizeof(Idx2BWT
));
92 char bwtFilename
[MAX_INDEX_FILENAME_LENGTH
];
93 char bwtOccFilename
[MAX_INDEX_FILENAME_LENGTH
];
94 char rev_bwtFilename
[MAX_INDEX_FILENAME_LENGTH
];
95 char rev_bwtOccFilename
[MAX_INDEX_FILENAME_LENGTH
];
96 char bwtReadIDtableFilename
[MAX_INDEX_FILENAME_LENGTH
];
98 strcpy(bwtFilename
,indexFilePrefix
);
99 strcpy(bwtOccFilename
,indexFilePrefix
);
100 strcpy(rev_bwtFilename
,indexFilePrefix
);
101 strcpy(rev_bwtOccFilename
,indexFilePrefix
);
102 strcpy(bwtReadIDtableFilename
, indexFilePrefix
);
104 strcat(bwtFilename
,".bwt");
105 strcat(bwtOccFilename
,".fmv");
106 strcat(rev_bwtFilename
,".rev.bwt");
107 strcat(rev_bwtOccFilename
,".rev.fmv");
108 strcat(bwtReadIDtableFilename
, ".ridt");
110 static int is_init
= 0;
112 MMMasterInitialize(6, 0, FALSE
, NULL
);
114 MMPool
* mmPool
= MMPoolCreate(2097152);
116 bwt
= BWTLoad(mmPool
, bwtFilename
, bwtOccFilename
, NULL
, NULL
, NULL
, NULL
);
117 rev_bwt
= BWTLoad(mmPool
, rev_bwtFilename
, rev_bwtOccFilename
, NULL
, NULL
, NULL
, NULL
);
119 HSPFillCharMap(idx2BWT
->charMap
);
120 HSPFillComplementMap(idx2BWT
->complementMap
);
123 idx2BWT
->rev_bwt
= rev_bwt
;
125 idx2BWT
->mmPool
= mmPool
;
127 // read read id table
128 FILE *readIDtableFile
= fopen64(bwtReadIDtableFilename
, "rb");
129 if (readIDtableFile
== NULL
) {
130 fprintf(stderr
, "%s: cannot open read_id_table!\n", __func__
);
133 fread(&(idx2BWT
->numReads
), sizeof(idx2BWT
->numReads
), 1, readIDtableFile
);
134 if (bwt
->cumulativeFreq
[4] % idx2BWT
->numReads
!= 0) {
135 fprintf(stderr
, "Number of reads and read length do not match!\n");
138 idx2BWT
->readLength
= bwt
->cumulativeFreq
[4] / idx2BWT
->numReads
;
139 fprintf(stderr
, "Read length %d\n", idx2BWT
->readLength
);
140 idx2BWT
->readIDtable
= (unsigned int*) malloc(sizeof(unsigned int) * idx2BWT
->numReads
);
141 if (idx2BWT
->readIDtable
== NULL
) {
142 fprintf(stderr
, "%s:%s:%d: memory alloc failed!\n", __FILE__
, __func__
, __LINE__
);
145 fread(idx2BWT
->readIDtable
, sizeof(unsigned int), idx2BWT
->numReads
, readIDtableFile
);
147 fclose(readIDtableFile
);
152 void BWTFree2BWT(Idx2BWT
* idx2BWT
) {
154 BWT
* bwt
= idx2BWT
->bwt
;
155 BWT
* rev_bwt
= idx2BWT
->rev_bwt
;
156 HSP
* hsp
= idx2BWT
->hsp
;
157 MMPool
* mmPool
= idx2BWT
->mmPool
;
160 HSPFree(mmPool
, hsp
, 1);
162 BWTFree(mmPool
, bwt
);
163 BWTFree(mmPool
, rev_bwt
);
166 if (idx2BWT
->readIDtable
!= NULL
) {
167 free(idx2BWT
->readIDtable
);
173 int BWTGetQuality(Idx2BWT
* idx2BWT
, unsigned long long readID
, int offset
) {
175 BWT
* bwt
= idx2BWT
->bwt
;
176 unsigned long long absOffset
= readID
* idx2BWT
->readLength
+ offset
;
177 return (bwt
->bwtCode
[absOffset
/ CHAR_PER_WORD
] >> (absOffset
% CHAR_PER_WORD
* BIT_PER_CHAR
+ (BIT_PER_CHAR
- 1))) & 1;
180 int BWTCheckUsed(Idx2BWT
* idx2BWT
, unsigned long long readID
) {
182 BWT
* bwt
= idx2BWT
->bwt
;
183 unsigned long long absOffset
= bwt
->cumulativeFreq
[4] + readID
;
184 return (bwt
->bwtCode
[absOffset
/ CHAR_PER_WORD
] >> (absOffset
% CHAR_PER_WORD
* BIT_PER_CHAR
+ (BIT_PER_CHAR
- 1))) & 1;
187 void BWTSetUsed(Idx2BWT
* idx2BWT
, unsigned long long readID
, int used
) {
188 BWT
* bwt
= idx2BWT
->bwt
;
189 unsigned long long absOffset
= bwt
->cumulativeFreq
[4] + readID
;
190 unsigned int *wordToUpdate
= bwt
->bwtCode
+ absOffset
/ CHAR_PER_WORD
;
191 int offsetInWord
= absOffset
% CHAR_PER_WORD
* BIT_PER_CHAR
+ (BIT_PER_CHAR
- 1);
194 *wordToUpdate
|= 1 << offsetInWord
;
196 *wordToUpdate
&= ~(1 << offsetInWord
);
200 void BWTCheckFlag(Idx2BWT
* idx2BWT
, unsigned long long readID
, int *used
, int *cons
) {
202 BWT
* bwt
= idx2BWT
->bwt
;
203 unsigned long long absOffset
= bwt
->cumulativeFreq
[4] + readID
;
204 *used
= (bwt
->bwtCode
[absOffset
/ CHAR_PER_WORD
] >> (absOffset
% CHAR_PER_WORD
* BIT_PER_CHAR
+ (BIT_PER_CHAR
- 1))) & 1;
206 bwt
= idx2BWT
->rev_bwt
;
207 absOffset
= bwt
->cumulativeFreq
[4] + readID
;
208 *cons
= (bwt
->bwtCode
[absOffset
/ CHAR_PER_WORD
] >> (absOffset
% CHAR_PER_WORD
* BIT_PER_CHAR
+ (BIT_PER_CHAR
- 1))) & 1;
211 void BWTSetFlag(Idx2BWT
* idx2BWT
, unsigned long long readID
, int used
, int cons
) {
212 BWT
* bwt
= idx2BWT
->bwt
;
213 unsigned long long absOffset
= bwt
->cumulativeFreq
[4] + readID
;
214 unsigned int *wordToUpdate
= bwt
->bwtCode
+ absOffset
/ CHAR_PER_WORD
;
215 int offsetInWord
= absOffset
% CHAR_PER_WORD
* BIT_PER_CHAR
+ (BIT_PER_CHAR
- 1);
218 *wordToUpdate
|= 1 << offsetInWord
;
220 *wordToUpdate
&= ~(1 << offsetInWord
);
223 bwt
= idx2BWT
->rev_bwt
;
224 absOffset
= bwt
->cumulativeFreq
[4] + readID
;
225 wordToUpdate
= bwt
->bwtCode
+ absOffset
/ CHAR_PER_WORD
;
226 offsetInWord
= absOffset
% CHAR_PER_WORD
* BIT_PER_CHAR
+ (BIT_PER_CHAR
- 1);
229 *wordToUpdate
|= 1 << offsetInWord
;
231 *wordToUpdate
&= ~(1 << offsetInWord
);
234 void BWTConvertPattern(Idx2BWT
* idx2BWT
, const char * patternSource
, int patternLength
, unsigned char * patternDestination
) {
237 for (i
=0;i
<patternLength
;i
++) {
238 patternDestination
[i
] = idx2BWT
->charMap
[patternSource
[i
]];
240 patternDestination
[i
]='\0';
244 void BWTSARangeInitial(Idx2BWT
* idx2BWT
, const unsigned char c
,
245 unsigned long long *saIndexLeft
, unsigned long long *saIndexRight
) {
247 BWT
* bwt
= idx2BWT
->bwt
;
248 BWT
* rev_bwt
= idx2BWT
->rev_bwt
;
250 (*saIndexLeft
) = bwt
->cumulativeFreq
[c
]; //+1;
251 (*saIndexRight
) = bwt
->cumulativeFreq
[c
+1]-1;
256 void BWTSARangeBackward(Idx2BWT
* idx2BWT
, const unsigned char c
,
257 unsigned long long *saIndexLeft
, unsigned long long *saIndexRight
) {
259 BWT
* bwt
= idx2BWT
->bwt
;
260 BWT
* rev_bwt
= idx2BWT
->rev_bwt
;
263 unsigned long long l
= (*saIndexLeft
);
264 unsigned long long r
= (*saIndexRight
);
265 (*saIndexLeft
) = bwt
->cumulativeFreq
[c
] + BWTOccValue(bwt
, l
, c
);// + 1;
266 (*saIndexRight
) = bwt
->cumulativeFreq
[c
] + BWTOccValue(bwt
, r
+ 1, c
) - 1;
269 void BWTSARangeBackward_Bidirection(Idx2BWT
* idx2BWT
, const unsigned char c
,
270 unsigned long long *saIndexLeft
, unsigned long long *saIndexRight
,
271 unsigned long long *rev_saIndexLeft
, unsigned long long *rev_saIndexRight
) {
273 BWT
* bwt
= idx2BWT
->bwt
;
274 BWT
* rev_bwt
= idx2BWT
->rev_bwt
;
276 unsigned long long oL
[ALPHABET_SIZE
];
277 unsigned long long oR
[ALPHABET_SIZE
];
278 unsigned long long oCount
[ALPHABET_SIZE
];
279 unsigned long long l
= (*saIndexLeft
);
280 unsigned long long r
= (*saIndexRight
);
281 unsigned long long rev_l
= (*rev_saIndexLeft
);
282 unsigned long long rev_r
= (*rev_saIndexRight
);
285 BWTAllOccValue(bwt
,l
,oL
);
286 BWTAllOccValue(bwt
,r
+ 1,oR
);
287 oCount
[ALPHABET_SIZE
-1]=0;
288 for (k
=ALPHABET_SIZE
-2;k
>=0;k
--) {
289 oCount
[k
]=oCount
[k
+1]+oR
[k
+1]-oL
[k
+1];
292 l
= bwt
->cumulativeFreq
[c
] + oL
[c
] + 1;
293 r
= bwt
->cumulativeFreq
[c
] + oR
[c
];
294 rev_r
= rev_r
- oCount
[c
];
295 rev_l
= rev_r
- (r
-l
);
302 (*rev_saIndexLeft
) = rev_l
;
303 (*rev_saIndexRight
) = rev_r
;
306 void BWTSARangeForward_Bidirection(Idx2BWT
* idx2BWT
, const unsigned char c
,
307 unsigned long long *saIndexLeft
, unsigned long long *saIndexRight
,
308 unsigned long long *rev_saIndexLeft
, unsigned long long *rev_saIndexRight
) {
310 BWT
* bwt
= idx2BWT
->bwt
;
311 BWT
* rev_bwt
= idx2BWT
->rev_bwt
;
313 unsigned long long oL
[ALPHABET_SIZE
];
314 unsigned long long oR
[ALPHABET_SIZE
];
315 unsigned long long oCount
[ALPHABET_SIZE
];
316 unsigned long long l
= (*saIndexLeft
);
317 unsigned long long r
= (*saIndexRight
);
318 unsigned long long rev_l
= (*rev_saIndexLeft
);
319 unsigned long long rev_r
= (*rev_saIndexRight
);
322 BWTAllOccValue(rev_bwt
,rev_l
,oL
);
323 BWTAllOccValue(rev_bwt
,rev_r
+ 1,oR
);
324 oCount
[ALPHABET_SIZE
-1]=0;
325 for (k
=ALPHABET_SIZE
-2;k
>=0;k
--) {
326 oCount
[k
]=oCount
[k
+1]+oR
[k
+1]-oL
[k
+1];
329 rev_l
= bwt
->cumulativeFreq
[c
] + oL
[c
] + 1;
330 rev_r
= bwt
->cumulativeFreq
[c
] + oR
[c
];
332 l
= r
- (rev_r
-rev_l
);
339 (*rev_saIndexLeft
) = rev_l
;
340 (*rev_saIndexRight
) = rev_r
;
343 void BWTAllSARangesBackward(Idx2BWT
* idx2BWT
,
344 const unsigned long long saIndexLeft
, const unsigned long long saIndexRight
,
345 unsigned long long *resultSaIndexesLeft
, unsigned long long *resultSaIndexesRight
) {
347 BWT
* bwt
= idx2BWT
->bwt
;
348 BWT
* rev_bwt
= idx2BWT
->rev_bwt
;
350 unsigned long long oL
[ALPHABET_SIZE
];
351 unsigned long long oR
[ALPHABET_SIZE
];
352 unsigned long long oCount
[ALPHABET_SIZE
];
353 unsigned long long l
= saIndexLeft
;
354 unsigned long long r
= saIndexRight
;
357 BWTAllOccValue(bwt
,l
,oL
);
358 BWTAllOccValue(bwt
,r
+ 1,oR
);
360 for (k
=0;k
<ALPHABET_SIZE
;k
++) {
361 resultSaIndexesLeft
[k
] = bwt
->cumulativeFreq
[k
] + oL
[k
] + 1 - 1;
362 resultSaIndexesRight
[k
] = bwt
->cumulativeFreq
[k
] + oR
[k
] - 1;
368 void BWTAllSARangesBackward_Bidirection(Idx2BWT
* idx2BWT
,
369 const unsigned long long saIndexLeft
, const unsigned long long saIndexRight
,
370 const unsigned long long rev_saIndexLeft
, const unsigned long long rev_saIndexRight
,
371 unsigned long long *resultSaIndexLeft
, unsigned long long *resultSaIndexRight
,
372 unsigned long long *rev_resultSaIndexLeft
, unsigned long long *rev_resultSaIndexRight
) {
374 BWT
* bwt
= idx2BWT
->bwt
;
375 BWT
* rev_bwt
= idx2BWT
->rev_bwt
;
377 unsigned long long oL
[ALPHABET_SIZE
];
378 unsigned long long oR
[ALPHABET_SIZE
];
379 unsigned long long oCount
[ALPHABET_SIZE
];
380 unsigned long long l
= saIndexLeft
;
381 unsigned long long r
= saIndexRight
;
382 unsigned long long rev_l
= rev_saIndexLeft
;
383 unsigned long long rev_r
= rev_saIndexRight
;
386 BWTAllOccValue(bwt
,l
,oL
);
387 BWTAllOccValue(bwt
,r
+ 1,oR
);
388 oCount
[ALPHABET_SIZE
-1]=0;
389 for (k
=ALPHABET_SIZE
-2;k
>=0;k
--) {
390 oCount
[k
]=oCount
[k
+1]+oR
[k
+1]-oL
[k
+1];
393 for (k
=0;k
<ALPHABET_SIZE
;k
++) {
395 resultSaIndexLeft
[k
] = bwt
->cumulativeFreq
[k
] + oL
[k
] + 1 - 1;
396 resultSaIndexRight
[k
] = bwt
->cumulativeFreq
[k
] + oR
[k
] - 1;
397 rev_resultSaIndexRight
[k
] = rev_r
- oCount
[k
] - 1;
398 rev_resultSaIndexLeft
[k
] = rev_resultSaIndexRight
[k
] -
399 (resultSaIndexRight
[k
]-resultSaIndexLeft
[k
]) - 1;
405 void BWTAllSARangesForward_Bidirection(Idx2BWT
* idx2BWT
,
406 const unsigned long long saIndexLeft
, const unsigned long long saIndexRight
,
407 const unsigned long long rev_saIndexLeft
, const unsigned long long rev_saIndexRight
,
408 unsigned long long *resultSaIndexLeft
, unsigned long long *resultSaIndexRight
,
409 unsigned long long *rev_resultSaIndexLeft
, unsigned long long *rev_resultSaIndexRight
) {
411 BWT
* bwt
= idx2BWT
->bwt
;
412 BWT
* rev_bwt
= idx2BWT
->rev_bwt
;
414 unsigned long long oL
[ALPHABET_SIZE
];
415 unsigned long long oR
[ALPHABET_SIZE
];
416 unsigned long long oCount
[ALPHABET_SIZE
];
417 unsigned long long l
= saIndexLeft
;
418 unsigned long long r
= saIndexRight
;
419 unsigned long long rev_l
= rev_saIndexLeft
;
420 unsigned long long rev_r
= rev_saIndexRight
;
423 BWTAllOccValue(rev_bwt
,rev_l
,oL
);
424 BWTAllOccValue(rev_bwt
,rev_r
+ 1,oR
);
425 oCount
[ALPHABET_SIZE
-1]=0;
426 for (k
=ALPHABET_SIZE
-2;k
>=0;k
--) {
427 oCount
[k
]=oCount
[k
+1]+oR
[k
+1]-oL
[k
+1];
430 for (k
=0;k
<ALPHABET_SIZE
;k
++) {
432 rev_resultSaIndexLeft
[k
] = bwt
->cumulativeFreq
[k
] + oL
[k
] + 1 - 1;
433 rev_resultSaIndexRight
[k
] = bwt
->cumulativeFreq
[k
] + oR
[k
] - 1;
434 resultSaIndexRight
[k
] = r
- oCount
[k
] - 1;
435 resultSaIndexLeft
[k
] = resultSaIndexRight
[k
] -
436 (rev_resultSaIndexRight
[k
]-rev_resultSaIndexLeft
[k
]) - 1;
441 void BWTRetrievePositionFromSAIndex(Idx2BWT
* idx2BWT
,
442 unsigned long long saIndex
,
443 unsigned int * sequenceId
, unsigned long long * offset
) {
444 BWT
* bwt
= idx2BWT
->bwt
;
445 BWT
* rev_bwt
= idx2BWT
->rev_bwt
;
446 HSP
* hsp
= idx2BWT
->hsp
;
447 unsigned short * ambiguityMap
= hsp
->ambiguityMap
;
448 Translate
* translate
= hsp
->translate
;
450 unsigned long long ambPosition
= BWTSaValue(bwt
,saIndex
);
451 unsigned long long approxIndex
= ambPosition
>>GRID_SAMPLING_FACTOR_2_POWER
;
452 unsigned long long approxValue
= ambiguityMap
[approxIndex
];
453 while (translate
[approxValue
].startPos
>ambPosition
) {
456 ambPosition
-=translate
[approxValue
].correction
;
458 (*sequenceId
) = translate
[approxValue
].chrID
;
459 (*offset
) = ambPosition
;