modified: nfig1.py
[GalaxyCodeBases.git] / BGI / BASE / src / 2bwt / 2BWT-Interface.c
blob737dcda569100359a11232a30aa74b3897dcea7a
1 /*
3 # Copyright (C) 2015, The University of Hong Kong.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 3
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 03110-1301, USA.
19 Date : 19th June 2011
20 Author : Edward MK Wu
21 Change : Packaging 2BWT library as a separate product.
22 Enhancing the 2BWT library with interface functions
23 for basic BWT search.
27 #include "2BWT-Interface.h"
29 Idx2BWT * BWTLoad2BWT(const char * indexFilePrefix, const char * saFileNameExtension) {
31 Idx2BWT * idx2BWT = (Idx2BWT*) malloc(sizeof(Idx2BWT));
32 BWT * bwt;
33 BWT * rev_bwt;
34 HSP * hsp;
36 char bwtFilename[MAX_INDEX_FILENAME_LENGTH];
37 char bwtOccFilename[MAX_INDEX_FILENAME_LENGTH];
38 char saFilename[MAX_INDEX_FILENAME_LENGTH];
39 char rev_bwtFilename[MAX_INDEX_FILENAME_LENGTH];
40 char rev_bwtOccFilename[MAX_INDEX_FILENAME_LENGTH];
41 char packedDnaFilename[MAX_INDEX_FILENAME_LENGTH];
42 char annotationFilename[MAX_INDEX_FILENAME_LENGTH];
43 char ambiguityFilename[MAX_INDEX_FILENAME_LENGTH];
44 char translateFilename[MAX_INDEX_FILENAME_LENGTH];
46 strcpy(bwtFilename,indexFilePrefix);
47 strcpy(bwtOccFilename,indexFilePrefix);
48 strcpy(saFilename,indexFilePrefix);
49 strcpy(rev_bwtFilename,indexFilePrefix);
50 strcpy(rev_bwtOccFilename,indexFilePrefix);
51 strcpy(packedDnaFilename,indexFilePrefix);
52 strcpy(annotationFilename,indexFilePrefix);
53 strcpy(ambiguityFilename,indexFilePrefix);
54 strcpy(translateFilename,indexFilePrefix);
56 strcat(bwtFilename,".bwt");
57 strcat(bwtOccFilename,".fmv");
58 strcat(saFilename,saFileNameExtension);
59 strcat(rev_bwtFilename,".rev.bwt");
60 strcat(rev_bwtOccFilename,".rev.fmv");
61 strcat(packedDnaFilename,".pac");
62 strcat(annotationFilename,".ann");
63 strcat(ambiguityFilename,".amb");
64 strcat(translateFilename,".tra");
66 MMMasterInitialize(3, 0, FALSE, NULL);
67 MMPool * mmPool = MMPoolCreate(2097152);
69 bwt = BWTLoad(mmPool, bwtFilename, bwtOccFilename, saFilename, NULL, NULL, NULL);
70 rev_bwt = BWTLoad(mmPool, rev_bwtFilename, rev_bwtOccFilename, NULL, NULL, NULL, NULL);
71 hsp = HSPLoad(mmPool, packedDnaFilename, annotationFilename, ambiguityFilename,translateFilename, 1);
73 HSPFillCharMap(idx2BWT->charMap);
74 HSPFillComplementMap(idx2BWT->complementMap);
76 idx2BWT->bwt = bwt;
77 idx2BWT->rev_bwt = rev_bwt;
78 idx2BWT->hsp = hsp;
79 idx2BWT->mmPool = mmPool;
81 idx2BWT->numReads = 0;
82 idx2BWT->readIDtable = NULL;
84 return idx2BWT;
87 Idx2BWT *BWTLoad2BWTLite(const char * indexFilePrefix) {
88 Idx2BWT * idx2BWT = (Idx2BWT*) malloc(sizeof(Idx2BWT));
89 BWT * bwt;
90 BWT * rev_bwt;
92 char bwtFilename[MAX_INDEX_FILENAME_LENGTH];
93 char bwtOccFilename[MAX_INDEX_FILENAME_LENGTH];
94 char rev_bwtFilename[MAX_INDEX_FILENAME_LENGTH];
95 char rev_bwtOccFilename[MAX_INDEX_FILENAME_LENGTH];
96 char bwtReadIDtableFilename[MAX_INDEX_FILENAME_LENGTH];
98 strcpy(bwtFilename,indexFilePrefix);
99 strcpy(bwtOccFilename,indexFilePrefix);
100 strcpy(rev_bwtFilename,indexFilePrefix);
101 strcpy(rev_bwtOccFilename,indexFilePrefix);
102 strcpy(bwtReadIDtableFilename, indexFilePrefix);
104 strcat(bwtFilename,".bwt");
105 strcat(bwtOccFilename,".fmv");
106 strcat(rev_bwtFilename,".rev.bwt");
107 strcat(rev_bwtOccFilename,".rev.fmv");
108 strcat(bwtReadIDtableFilename, ".ridt");
110 static int is_init = 0;
111 if (!is_init)
112 MMMasterInitialize(6, 0, FALSE, NULL);
113 is_init = 1;
114 MMPool * mmPool = MMPoolCreate(2097152);
116 bwt = BWTLoad(mmPool, bwtFilename, bwtOccFilename, NULL, NULL, NULL, NULL);
117 rev_bwt = BWTLoad(mmPool, rev_bwtFilename, rev_bwtOccFilename, NULL, NULL, NULL, NULL);
119 HSPFillCharMap(idx2BWT->charMap);
120 HSPFillComplementMap(idx2BWT->complementMap);
122 idx2BWT->bwt = bwt;
123 idx2BWT->rev_bwt = rev_bwt;
124 idx2BWT->hsp = NULL;
125 idx2BWT->mmPool = mmPool;
127 // read read id table
128 FILE *readIDtableFile = fopen64(bwtReadIDtableFilename, "rb");
129 if (readIDtableFile == NULL) {
130 fprintf(stderr, "%s: cannot open read_id_table!\n", __func__);
131 exit(1);
133 fread(&(idx2BWT->numReads), sizeof(idx2BWT->numReads), 1, readIDtableFile);
134 if (bwt->cumulativeFreq[4] % idx2BWT->numReads != 0) {
135 fprintf(stderr, "Number of reads and read length do not match!\n");
136 exit(1);
138 idx2BWT->readLength = bwt->cumulativeFreq[4] / idx2BWT->numReads;
139 fprintf(stderr, "Read length %d\n", idx2BWT->readLength);
140 idx2BWT->readIDtable = (unsigned int*) malloc(sizeof(unsigned int) * idx2BWT->numReads);
141 if (idx2BWT->readIDtable == NULL) {
142 fprintf(stderr, "%s:%s:%d: memory alloc failed!\n", __FILE__, __func__, __LINE__);
143 exit(1);
145 fread(idx2BWT->readIDtable, sizeof(unsigned int), idx2BWT->numReads, readIDtableFile);
147 fclose(readIDtableFile);
149 return idx2BWT;
152 void BWTFree2BWT(Idx2BWT * idx2BWT) {
154 BWT * bwt = idx2BWT->bwt;
155 BWT * rev_bwt = idx2BWT->rev_bwt;
156 HSP * hsp = idx2BWT->hsp;
157 MMPool * mmPool = idx2BWT->mmPool;
159 if (hsp != NULL) {
160 HSPFree(mmPool, hsp, 1);
162 BWTFree(mmPool, bwt);
163 BWTFree(mmPool, rev_bwt);
164 MMPoolFree(mmPool);
166 if (idx2BWT->readIDtable != NULL) {
167 free(idx2BWT->readIDtable);
170 free(idx2BWT);
173 int BWTGetQuality(Idx2BWT * idx2BWT, unsigned long long readID, int offset) {
175 BWT * bwt = idx2BWT->bwt;
176 unsigned long long absOffset = readID * idx2BWT->readLength + offset;
177 return (bwt->bwtCode[absOffset / CHAR_PER_WORD] >> (absOffset % CHAR_PER_WORD * BIT_PER_CHAR + (BIT_PER_CHAR - 1))) & 1;
180 int BWTCheckUsed(Idx2BWT * idx2BWT, unsigned long long readID) {
182 BWT * bwt = idx2BWT->bwt;
183 unsigned long long absOffset = bwt->cumulativeFreq[4] + readID;
184 return (bwt->bwtCode[absOffset / CHAR_PER_WORD] >> (absOffset % CHAR_PER_WORD * BIT_PER_CHAR + (BIT_PER_CHAR - 1))) & 1;
187 void BWTSetUsed(Idx2BWT * idx2BWT, unsigned long long readID, int used) {
188 BWT * bwt = idx2BWT->bwt;
189 unsigned long long absOffset = bwt->cumulativeFreq[4] + readID;
190 unsigned int *wordToUpdate = bwt->bwtCode + absOffset / CHAR_PER_WORD;
191 int offsetInWord = absOffset % CHAR_PER_WORD * BIT_PER_CHAR + (BIT_PER_CHAR - 1);
193 if (used) {
194 *wordToUpdate |= 1 << offsetInWord;
195 } else {
196 *wordToUpdate &= ~(1 << offsetInWord);
200 void BWTCheckFlag(Idx2BWT * idx2BWT, unsigned long long readID, int *used, int *cons) {
202 BWT * bwt = idx2BWT->bwt;
203 unsigned long long absOffset = bwt->cumulativeFreq[4] + readID;
204 *used = (bwt->bwtCode[absOffset / CHAR_PER_WORD] >> (absOffset % CHAR_PER_WORD * BIT_PER_CHAR + (BIT_PER_CHAR - 1))) & 1;
206 bwt = idx2BWT->rev_bwt;
207 absOffset = bwt->cumulativeFreq[4] + readID;
208 *cons = (bwt->bwtCode[absOffset / CHAR_PER_WORD] >> (absOffset % CHAR_PER_WORD * BIT_PER_CHAR + (BIT_PER_CHAR - 1))) & 1;
211 void BWTSetFlag(Idx2BWT * idx2BWT, unsigned long long readID, int used, int cons) {
212 BWT * bwt = idx2BWT->bwt;
213 unsigned long long absOffset = bwt->cumulativeFreq[4] + readID;
214 unsigned int *wordToUpdate = bwt->bwtCode + absOffset / CHAR_PER_WORD;
215 int offsetInWord = absOffset % CHAR_PER_WORD * BIT_PER_CHAR + (BIT_PER_CHAR - 1);
217 if (used) {
218 *wordToUpdate |= 1 << offsetInWord;
219 } else {
220 *wordToUpdate &= ~(1 << offsetInWord);
223 bwt = idx2BWT->rev_bwt;
224 absOffset = bwt->cumulativeFreq[4] + readID;
225 wordToUpdate = bwt->bwtCode + absOffset / CHAR_PER_WORD;
226 offsetInWord = absOffset % CHAR_PER_WORD * BIT_PER_CHAR + (BIT_PER_CHAR - 1);
228 if(cons)
229 *wordToUpdate |= 1 << offsetInWord;
230 else
231 *wordToUpdate &= ~(1 << offsetInWord);
234 void BWTConvertPattern(Idx2BWT * idx2BWT, const char * patternSource, int patternLength, unsigned char * patternDestination) {
236 int i;
237 for (i=0;i<patternLength;i++) {
238 patternDestination[i] = idx2BWT->charMap[patternSource[i]];
240 patternDestination[i]='\0';
244 void BWTSARangeInitial(Idx2BWT * idx2BWT, const unsigned char c,
245 unsigned long long *saIndexLeft, unsigned long long *saIndexRight) {
247 BWT * bwt = idx2BWT->bwt;
248 BWT * rev_bwt = idx2BWT->rev_bwt;
250 (*saIndexLeft) = bwt->cumulativeFreq[c]; //+1;
251 (*saIndexRight) = bwt->cumulativeFreq[c+1]-1;
256 void BWTSARangeBackward(Idx2BWT * idx2BWT, const unsigned char c,
257 unsigned long long *saIndexLeft, unsigned long long *saIndexRight) {
259 BWT * bwt = idx2BWT->bwt;
260 BWT * rev_bwt = idx2BWT->rev_bwt;
263 unsigned long long l = (*saIndexLeft);
264 unsigned long long r = (*saIndexRight);
265 (*saIndexLeft) = bwt->cumulativeFreq[c] + BWTOccValue(bwt, l, c);// + 1;
266 (*saIndexRight) = bwt->cumulativeFreq[c] + BWTOccValue(bwt, r + 1, c) - 1;
269 void BWTSARangeBackward_Bidirection(Idx2BWT * idx2BWT, const unsigned char c,
270 unsigned long long *saIndexLeft, unsigned long long *saIndexRight,
271 unsigned long long *rev_saIndexLeft, unsigned long long *rev_saIndexRight) {
273 BWT * bwt = idx2BWT->bwt;
274 BWT * rev_bwt = idx2BWT->rev_bwt;
276 unsigned long long oL[ALPHABET_SIZE];
277 unsigned long long oR[ALPHABET_SIZE];
278 unsigned long long oCount[ALPHABET_SIZE];
279 unsigned long long l = (*saIndexLeft);
280 unsigned long long r = (*saIndexRight);
281 unsigned long long rev_l = (*rev_saIndexLeft);
282 unsigned long long rev_r = (*rev_saIndexRight);
283 int k;
285 BWTAllOccValue(bwt,l,oL);
286 BWTAllOccValue(bwt,r + 1,oR);
287 oCount[ALPHABET_SIZE-1]=0;
288 for (k=ALPHABET_SIZE-2;k>=0;k--) {
289 oCount[k]=oCount[k+1]+oR[k+1]-oL[k+1];
292 l = bwt->cumulativeFreq[c] + oL[c] + 1;
293 r = bwt->cumulativeFreq[c] + oR[c];
294 rev_r = rev_r - oCount[c];
295 rev_l = rev_r - (r-l);
297 l--;
298 r--;
300 (*saIndexLeft) = l;
301 (*saIndexRight) = r;
302 (*rev_saIndexLeft) = rev_l;
303 (*rev_saIndexRight) = rev_r;
306 void BWTSARangeForward_Bidirection(Idx2BWT * idx2BWT, const unsigned char c,
307 unsigned long long *saIndexLeft, unsigned long long *saIndexRight,
308 unsigned long long *rev_saIndexLeft, unsigned long long *rev_saIndexRight) {
310 BWT * bwt = idx2BWT->bwt;
311 BWT * rev_bwt = idx2BWT->rev_bwt;
313 unsigned long long oL[ALPHABET_SIZE];
314 unsigned long long oR[ALPHABET_SIZE];
315 unsigned long long oCount[ALPHABET_SIZE];
316 unsigned long long l = (*saIndexLeft);
317 unsigned long long r = (*saIndexRight);
318 unsigned long long rev_l = (*rev_saIndexLeft);
319 unsigned long long rev_r = (*rev_saIndexRight);
320 int k;
322 BWTAllOccValue(rev_bwt,rev_l,oL);
323 BWTAllOccValue(rev_bwt,rev_r + 1,oR);
324 oCount[ALPHABET_SIZE-1]=0;
325 for (k=ALPHABET_SIZE-2;k>=0;k--) {
326 oCount[k]=oCount[k+1]+oR[k+1]-oL[k+1];
329 rev_l = bwt->cumulativeFreq[c] + oL[c] + 1;
330 rev_r = bwt->cumulativeFreq[c] + oR[c];
331 r = r - oCount[c];
332 l = r - (rev_r-rev_l);
334 rev_r--;
335 rev_l--;
337 (*saIndexLeft) = l;
338 (*saIndexRight) = r;
339 (*rev_saIndexLeft) = rev_l;
340 (*rev_saIndexRight) = rev_r;
343 void BWTAllSARangesBackward(Idx2BWT * idx2BWT,
344 const unsigned long long saIndexLeft, const unsigned long long saIndexRight,
345 unsigned long long *resultSaIndexesLeft, unsigned long long *resultSaIndexesRight) {
347 BWT * bwt = idx2BWT->bwt;
348 BWT * rev_bwt = idx2BWT->rev_bwt;
350 unsigned long long oL[ALPHABET_SIZE];
351 unsigned long long oR[ALPHABET_SIZE];
352 unsigned long long oCount[ALPHABET_SIZE];
353 unsigned long long l = saIndexLeft;
354 unsigned long long r = saIndexRight;
355 int k;
357 BWTAllOccValue(bwt,l,oL);
358 BWTAllOccValue(bwt,r + 1,oR);
360 for (k=0;k<ALPHABET_SIZE;k++) {
361 resultSaIndexesLeft[k] = bwt->cumulativeFreq[k] + oL[k] + 1 - 1;
362 resultSaIndexesRight[k] = bwt->cumulativeFreq[k] + oR[k] - 1;
368 void BWTAllSARangesBackward_Bidirection(Idx2BWT * idx2BWT,
369 const unsigned long long saIndexLeft, const unsigned long long saIndexRight,
370 const unsigned long long rev_saIndexLeft, const unsigned long long rev_saIndexRight,
371 unsigned long long *resultSaIndexLeft, unsigned long long *resultSaIndexRight,
372 unsigned long long *rev_resultSaIndexLeft, unsigned long long *rev_resultSaIndexRight) {
374 BWT * bwt = idx2BWT->bwt;
375 BWT * rev_bwt = idx2BWT->rev_bwt;
377 unsigned long long oL[ALPHABET_SIZE];
378 unsigned long long oR[ALPHABET_SIZE];
379 unsigned long long oCount[ALPHABET_SIZE];
380 unsigned long long l = saIndexLeft;
381 unsigned long long r = saIndexRight;
382 unsigned long long rev_l = rev_saIndexLeft;
383 unsigned long long rev_r = rev_saIndexRight;
384 int k;
386 BWTAllOccValue(bwt,l,oL);
387 BWTAllOccValue(bwt,r + 1,oR);
388 oCount[ALPHABET_SIZE-1]=0;
389 for (k=ALPHABET_SIZE-2;k>=0;k--) {
390 oCount[k]=oCount[k+1]+oR[k+1]-oL[k+1];
393 for (k=0;k<ALPHABET_SIZE;k++) {
395 resultSaIndexLeft[k] = bwt->cumulativeFreq[k] + oL[k] + 1 - 1;
396 resultSaIndexRight[k] = bwt->cumulativeFreq[k] + oR[k] - 1;
397 rev_resultSaIndexRight[k] = rev_r - oCount[k] - 1;
398 rev_resultSaIndexLeft[k] = rev_resultSaIndexRight[k] -
399 (resultSaIndexRight[k]-resultSaIndexLeft[k]) - 1;
405 void BWTAllSARangesForward_Bidirection(Idx2BWT * idx2BWT,
406 const unsigned long long saIndexLeft, const unsigned long long saIndexRight,
407 const unsigned long long rev_saIndexLeft, const unsigned long long rev_saIndexRight,
408 unsigned long long *resultSaIndexLeft, unsigned long long *resultSaIndexRight,
409 unsigned long long *rev_resultSaIndexLeft, unsigned long long *rev_resultSaIndexRight) {
411 BWT * bwt = idx2BWT->bwt;
412 BWT * rev_bwt = idx2BWT->rev_bwt;
414 unsigned long long oL[ALPHABET_SIZE];
415 unsigned long long oR[ALPHABET_SIZE];
416 unsigned long long oCount[ALPHABET_SIZE];
417 unsigned long long l = saIndexLeft;
418 unsigned long long r = saIndexRight;
419 unsigned long long rev_l = rev_saIndexLeft;
420 unsigned long long rev_r = rev_saIndexRight;
421 int k;
423 BWTAllOccValue(rev_bwt,rev_l,oL);
424 BWTAllOccValue(rev_bwt,rev_r + 1,oR);
425 oCount[ALPHABET_SIZE-1]=0;
426 for (k=ALPHABET_SIZE-2;k>=0;k--) {
427 oCount[k]=oCount[k+1]+oR[k+1]-oL[k+1];
430 for (k=0;k<ALPHABET_SIZE;k++) {
432 rev_resultSaIndexLeft[k] = bwt->cumulativeFreq[k] + oL[k] + 1 - 1;
433 rev_resultSaIndexRight[k] = bwt->cumulativeFreq[k] + oR[k] - 1;
434 resultSaIndexRight[k] = r - oCount[k] - 1;
435 resultSaIndexLeft[k] = resultSaIndexRight[k] -
436 (rev_resultSaIndexRight[k]-rev_resultSaIndexLeft[k]) - 1;
441 void BWTRetrievePositionFromSAIndex(Idx2BWT * idx2BWT,
442 unsigned long long saIndex,
443 unsigned int * sequenceId, unsigned long long * offset) {
444 BWT * bwt = idx2BWT->bwt;
445 BWT * rev_bwt = idx2BWT->rev_bwt;
446 HSP * hsp = idx2BWT->hsp;
447 unsigned short * ambiguityMap = hsp->ambiguityMap;
448 Translate * translate = hsp->translate;
450 unsigned long long ambPosition = BWTSaValue(bwt,saIndex);
451 unsigned long long approxIndex = ambPosition>>GRID_SAMPLING_FACTOR_2_POWER;
452 unsigned long long approxValue = ambiguityMap[approxIndex];
453 while (translate[approxValue].startPos>ambPosition) {
454 approxValue--;
456 ambPosition-=translate[approxValue].correction;
458 (*sequenceId) = translate[approxValue].chrID;
459 (*offset) = ambPosition;