modified: makefile
[GalaxyCodeBases.git] / BGI / soap_src / soap_builder / 2BWT-Builder.c
blob99b59432815235ce9a3e2aa1c7f10f7559578434
1 /*
3 BWTFormatdb.c Build index for FASTA database
5 This program builds index for FASTA database for use of BWTBlastn.
7 Copyright (C) 2006, Wong Chi Kwong.
9 This program is free software; you can redistribute it and/or
10 modify it under the terms of the GNU General Public License
11 as published by the Free Software Foundation; either version 2
12 of the License, or (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include "TypeNLimit.h"
28 #include "BWTConstruct.h"
29 #include "MiscUtilities.h"
30 #include "DNACount.h"
31 #include "TextConverter.h"
32 #include "MemManager.h"
33 #include "iniparser.h"
34 #include "HSP.h"
35 #include "Timing.h"
36 #include "lookupBuilder.h"
37 #include "highOccBuilder.h"
39 #define MAX_FILENAME_LEN 1024
41 // Database and ini
42 dictionary *ParseInput(int argc, char** argv);
43 void ParseIniFile(char *iniFileName);
44 void ProcessIni();
45 void ValidateIni();
46 void PrintIni();
47 void PrintShortDesc();
48 void PrintHelp();
50 void ProcessFileName(char *outputFileName, const char *inputFileName, const char *databaseName);
51 int buildLookupTable(char * PackedDNAFileName, char * LookupTableFileName, int lookupTableSize);
53 // Parameters
54 char IniFileName[MAX_FILENAME_LEN+1];
55 int Confirmation;
57 // BuildTasks parameters
58 int ParseFASTA = TRUE;
59 int BuildBWT = TRUE;
60 int BuildSaValue = TRUE;
61 int BuildSaIndex = TRUE;
62 int BuildLookUp = TRUE;
63 int BuildHOT = TRUE;
65 // Memory parameters
66 unsigned int PoolSize = 2097152; // 2M - fixed; not configurable through ini
68 // Display parameters
69 int ShowProgress = FALSE;
71 // Database parameters
72 char FASTAFileName[MAX_FILENAME_LEN+1] = "";
73 char DatabaseName[MAX_FILENAME_LEN+1] = "";
74 char AnnotationFileName[MAX_FILENAME_LEN+1] = "*.index.ann";
75 char AmbiguityFileName[MAX_FILENAME_LEN+1] = "*.index.amb";
76 char RepeatFileName[MAX_FILENAME_LEN+1] = "*.index.rep";
77 char PackedDNAFileName[MAX_FILENAME_LEN+1] = "*.index.pac";
78 char BWTCodeFileName[MAX_FILENAME_LEN+1] = "*.index.bwt";
79 char BWTOccValueFileName[MAX_FILENAME_LEN+1] = "*.index.fmv";
80 char SaValueFileName[MAX_FILENAME_LEN+1] = "*.index.sa";
81 char SaIndexFileName[MAX_FILENAME_LEN+1] = "*.index.sai";
83 char RevPackedDNAFileName[MAX_FILENAME_LEN+1] = "*.index.rev.pac";
84 char RevBWTCodeFileName[MAX_FILENAME_LEN+1] = "*.index.rev.bwt";
85 char RevBWTOccValueFileName[MAX_FILENAME_LEN+1] = "*.index.rev.fmv";
87 char LookupTableFileName[MAX_FILENAME_LEN+1] = "*.index.lkt";
88 char RevLookupTableFileName[MAX_FILENAME_LEN+1] = "*.index.rev.lkt";
90 char HighOccHashTableFileName[MAX_FILENAME_LEN+1] = "*.index.hot";
91 // Parse FASTA parameters
92 unsigned int FASTARandomSeed = 0;
93 int MaskLowerCase = FALSE;
95 // Build BWT parameters
96 unsigned int OccValueFreq = 256;
97 float TargetNBit = 2.5;
98 unsigned int InitialMaxBuildSize = 10000000;
99 unsigned int IncMaxBuildSize = 10000000;
101 // Build SA value parameters
102 unsigned int SaValueFreq = 8;
104 // Build SA index parameters
105 unsigned int SaIndexNumOfChar = 12;
107 //Look Up Table parameters
108 unsigned int LookUpTableSize=13;
109 unsigned int ReversedLookUpTableSize=13;
111 //High Occurrences Pattern Hash Table parameters
112 unsigned int HashPatternLength=35;
113 unsigned int HashOccThreshold=4;
115 void ExtractionHighOccPattern(const BWT *bwt, int index, unsigned int l, unsigned int r) {
116 if (l<=r && r-l+1>=HashOccThreshold) {
117 if (index>0) {
118 //If the packing is not finished
119 unsigned int new_l,new_r;
121 unsigned int occCount_pstart[4];
122 unsigned int occCount_pend[4];
123 BWTAllOccValue(bwt,l,occCount_pstart);
124 BWTAllOccValue(bwt,r + 1,occCount_pend);
126 unsigned char ec;
127 for (ec=0;ec<4;ec++) {
128 new_l = bwt->cumulativeFreq[ec] + occCount_pstart[ec] + 1;
129 new_r = bwt->cumulativeFreq[ec] + occCount_pend[ec];
131 ExtractionHighOccPattern(bwt,index-1,new_l,new_r);
133 } else if (index==0) {
134 //If the packing pattern reaches the maximum length
135 //l,r
136 ho_append(l,r);
141 void ho_writetofile(const char * fileName,BWT * bwt) {
142 //fprintf(stderr,"Writing Hash Table to File..\n");
143 unsigned int acc_index=0;
144 unsigned int acc_occIndex=0;
145 unsigned int i;
147 FILE * outFile;
148 if(!(outFile = fopen(fileName, "w"))) return;
150 fwrite(&ho_hashTableSize,sizeof(unsigned int),1,outFile);
151 fwrite(&ho_hash_a,sizeof(unsigned int),1,outFile);
152 fwrite(&ho_hash_b,sizeof(unsigned int),1,outFile);
153 fwrite(&ho_prime,sizeof(unsigned int),1,outFile);
154 fwrite(&ho_ttlItem,sizeof(unsigned int),1,outFile);
155 fwrite(&ho_ttlOccurrence,sizeof(unsigned int),1,outFile);
157 //fprintf(stderr,"Writing Hash Table to File..Phrase One\n");
158 for (i=0;i<ho_hashTableSize;i++) {
159 if (ho_hashtable[i].count==0) {
160 char c = 0;
161 fwrite(&c,1,1,outFile);
162 } else {
163 char c = 1;
164 fwrite(&c,1,1,outFile);
165 fwrite(&(acc_index),sizeof(unsigned int),1,outFile);
166 fwrite(&(ho_hashtable[i].count),sizeof(unsigned int),1,outFile);
167 acc_index+=ho_hashtable[i].count;
170 //fprintf(stderr,"Writing Hash Table to File..Phrase Two\n");
171 for (i=0;i<ho_hashTableSize;i++) {
172 hashItemPtr node = ho_hashtable[i].item;
173 while (node!=NULL) {
174 fwrite(&node->sa_l,sizeof(unsigned int),1,outFile);
175 fwrite(&node->sa_r,sizeof(unsigned int),1,outFile);
176 fwrite(&(acc_occIndex),sizeof(unsigned int),1,outFile);
177 acc_occIndex+=(node->sa_r)-(node->sa_l)+1;
178 node=node->next;
182 //fprintf(stderr,"Writing Hash Table to File..Phrase Three\n");
183 //unsigned int * buffer = malloc(sizeof(unsigned int)*1024);
184 // unsigned int bufferIndex =0 ;
185 for (i=0;i<ho_hashTableSize;i++) {
186 hashItemPtr node = ho_hashtable[i].item;
187 while (node!=NULL) {
188 unsigned int j;
189 for (j=node->sa_l;j<=node->sa_r;j++) {
190 unsigned int tp = BWTSaValue(bwt,j);
191 fwrite(&tp,sizeof(unsigned int),1,outFile);
192 // buffer[bufferIndex++]=tp;
193 // if (bufferIndex>=1024) {
194 //fwrite(buffer,sizeof(unsigned int)*1024,1,outFile);
195 // bufferIndex=0;
196 // }
198 node=node->next;
201 //free(buffer);
202 fclose(outFile);//*/
203 //fprintf(stderr,"Finished Writing Hash Table to File.\n");
206 void ho_buildandpopulate(BWT * bwt) {
208 //fprintf(stderr,"Build and Populate Hash Table.\n");
209 //Initialize the Hash Table of size doubled the number of pattern
210 ho_hashTableSize=ho_ttlItem*2;
211 ho_hashtable=malloc(sizeof(struct hashTyp)*ho_hashTableSize);
212 unsigned int i;
213 for (i=0;i<ho_hashTableSize;i++) {
214 ho_hashtable[i].count=0;
215 ho_hashtable[i].item=NULL;
218 ho_cellSpacePtr node = ho_root;
219 while (node!=NULL) {
220 for (i=0;i<node->count;i++) {
221 unsigned int index = ho_hash(node->saL[i]);
222 hashItemPtr newItem = malloc(sizeof(struct hashItem));
223 newItem->sa_l=node->saL[i];
224 newItem->sa_r=node->saR[i];
225 newItem->next=ho_hashtable[index].item;
226 ho_hashtable[index].item=newItem;
228 ho_hashtable[index].count++;
230 node=node->next;
233 ho_writetofile(HighOccHashTableFileName,bwt);
236 void buildReversePacked(const char *inputFileName, unsigned int *textLength, const unsigned int convertToWordPacked) {
238 FILE *inputFile;
239 FILE *outputFile;
240 unsigned char tempChar[4];
241 unsigned int writeBuffer=0;
242 unsigned int writeBufferIndex=0;
243 unsigned int *packedText;
244 unsigned int packedFileLen;
245 unsigned char lastByteLength;
246 unsigned int wordToProcess;
247 long long i;
248 int j,k;
250 inputFile = (FILE*)fopen(inputFileName, "rb");
251 outputFile = (FILE*)fopen(RevPackedDNAFileName, "wb");
253 if (inputFile == NULL) {
254 fprintf(stderr, "buildReversePacked() : Cannot open inputFileName!\n");
255 exit(1);
258 fseek(inputFile, -1, SEEK_END);
259 packedFileLen = ftell(inputFile);
260 if ((int)packedFileLen < 0) {
261 fprintf(stderr, "buildReversePacked(): Cannot determine file length!\n");
262 exit(1);
264 fread(&lastByteLength, sizeof(unsigned char), 1, inputFile);
266 *textLength = (packedFileLen - 1) * 4 + lastByteLength;
268 wordToProcess = (*textLength + 16 - 1) / 16;
269 packedText = MMUnitAllocate((wordToProcess + 1) * sizeof(unsigned int)); // allocate 1 more word at end
270 if (packedText == NULL) {
271 fprintf(stderr, "buildReversePacked(): Cannot MMUnitAllocate packedText\n");
272 exit(0);
274 packedText[wordToProcess - 1] = 0;
275 packedText[wordToProcess] = 0;
277 fseek(inputFile, 0L, SEEK_SET);
278 fread(packedText, 1, packedFileLen, inputFile);
279 fclose(inputFile);
282 //printf("lastByteLength = %u\n",lastByteLength);
283 long long currentLocation = (wordToProcess)*16;
284 //printf("currentLocation = %u\n",currentLocation);
285 //printf("*textLength = %u\n",*textLength);
286 for (i=wordToProcess-1; i>=0; i--) {
287 *(unsigned int*)tempChar = packedText[i];
288 packedText[i] = (tempChar[0] << 24) | (tempChar[1] << 16) | (tempChar[2] << 8) | tempChar[3];
290 unsigned int mk = packedText[i];
291 for (j=0;j<16;j++) {
292 if (writeBufferIndex>=16) {
293 *(unsigned int*)tempChar = writeBuffer;
294 writeBuffer = (tempChar[0] << 24) | (tempChar[1] << 16) | (tempChar[2] << 8) | tempChar[3];
295 fwrite(&writeBuffer,sizeof(unsigned int),1,outputFile);
296 writeBufferIndex=0;
298 writeBuffer<<=2;
299 unsigned char c = mk & 3;
300 mk>>=2;
301 currentLocation--;
302 if (currentLocation>=*textLength) {continue;}
303 writeBuffer|=c;writeBufferIndex++;
307 //*/
311 MMUnitFree(packedText, ((wordToProcess + 1) * sizeof(unsigned int)));
312 //printf("Finished main loop..\n");
313 if (writeBufferIndex>0) {
314 printf("Wiping..%d\n",writeBufferIndex);
315 for (k=writeBufferIndex;k<16;k++) writeBuffer<<=2;
316 *(unsigned int*)tempChar = writeBuffer;
317 if (writeBufferIndex<4) {
318 fwrite(&tempChar[3],sizeof(unsigned char),1,outputFile);
319 } else if (writeBufferIndex<8) {
320 fwrite(&tempChar[3],sizeof(unsigned char),1,outputFile);
321 fwrite(&tempChar[2],sizeof(unsigned char),1,outputFile);
322 } else if (writeBufferIndex<12) {
323 fwrite(&tempChar[3],sizeof(unsigned char),1,outputFile);
324 fwrite(&tempChar[2],sizeof(unsigned char),1,outputFile);
325 fwrite(&tempChar[1],sizeof(unsigned char),1,outputFile);
326 } else if (writeBufferIndex<16) {
327 fwrite(&tempChar[3],sizeof(unsigned char),1,outputFile);
328 fwrite(&tempChar[2],sizeof(unsigned char),1,outputFile);
329 fwrite(&tempChar[1],sizeof(unsigned char),1,outputFile);
330 fwrite(&tempChar[0],sizeof(unsigned char),1,outputFile);
333 fwrite(&lastByteLength,sizeof(unsigned char),1,outputFile);
334 fclose(outputFile);
335 //free(packedText);
336 return;
339 int main(int argc, char** argv) {
341 char c;
342 MMPool *mmPool;
343 dictionary *programInput;
344 double startTime;
345 double elapsedTime = 0, totalElapsedTime = 0;
347 char filename[MAX_FILENAME_LEN+1];
348 BWT *bwt = NULL;
349 unsigned int textLength = 0;
350 unsigned int numSeq;
352 BWTInc *bwtInc = NULL;
354 // Program input
355 programInput = ParseInput(argc, argv);
356 PrintShortDesc();
358 // Ini
359 if (strcmp(argv[0] + strlen(argv[0]) - 4, ".exe") == 0) {
360 *(argv[0] + strlen(argv[0]) - 4) = '\0';
362 sprintf(filename, "%s.ini", argv[0]);
363 ParseIniFile(filename);
364 //printf("\n");
365 ProcessIni();
366 ValidateIni();
367 PrintIni();
369 if (Confirmation == TRUE) {
370 printf("Press Y to go or N to cancel. ");
371 c = (char)getchar();
372 while (c != 'y' && c != 'Y' && c != 'n' && c!= 'N') {
373 c = (char)getchar();
375 if (c == 'n' || c == 'N') {
376 exit(0);
380 startTime = setStartTime();
382 MMMasterInitialize(1, 0, FALSE, NULL);
383 mmPool = MMPoolCreate(PoolSize);
385 // Parse FASTA file to produce packed DNA and annotation file
386 if (ParseFASTA == TRUE) {
388 printf("Parsing FASTA file..\n");
390 numSeq = HSPParseFASTAToPacked(FASTAFileName, AnnotationFileName, PackedDNAFileName, AmbiguityFileName, RepeatFileName, FASTARandomSeed, MaskLowerCase);
393 //Parse packed DNA to construct the packed reversed DNA
395 unsigned int textLen;
396 buildReversePacked(PackedDNAFileName,&textLen,TRUE);
397 //printf("Reversed Packed DNA generated..\n");
398 printf("Finished. Parsed %u sequences.\n", numSeq);
401 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
402 printf("Elapsed time = ");
403 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
404 totalElapsedTime += elapsedTime;
405 printf("\n");
409 if (BuildLookUp == TRUE) {
410 //Construct Look-Up Table
411 printf("Building Look-Up..\n");
412 buildLookupTable(PackedDNAFileName,LookupTableFileName,LookUpTableSize);
413 //printf("Look-Up Table is built.\n");
414 buildLookupTable(RevPackedDNAFileName,RevLookupTableFileName,ReversedLookUpTableSize);
415 //printf("Reversed Look-Up Table is built.\n");
417 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
418 printf("Finished.\nElapsed time = ");
419 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
420 totalElapsedTime += elapsedTime;
421 printf("\n");
424 // Construct BWTInc from text
425 if (BuildBWT == TRUE) {
427 printf("Building BWT..\n");
429 bwtInc = BWTIncConstructFromPacked(mmPool, PackedDNAFileName, ShowProgress,
430 TargetNBit, InitialMaxBuildSize, IncMaxBuildSize);
432 printf("Finished constructing BWT in %u iterations. ", bwtInc->numberOfIterationDone);
434 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
435 printf("Elapsed time = ");
436 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
437 totalElapsedTime += elapsedTime;
438 printf("\n");
440 printf("Saving BWT..\n");
441 BWTSaveBwtCodeAndOcc(bwtInc->bwt, BWTCodeFileName, BWTOccValueFileName);
442 printf("Finished saving BWT. ");
443 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
444 printf("Elapsed time = ");
445 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
446 totalElapsedTime += elapsedTime;
447 printf("\n");
449 textLength = bwtInc->bwt->textLength;
451 BWTIncFree(mmPool, bwtInc);
455 //Building Reversed BWT
456 printf("Building Reversed BWT..\n");
458 bwtInc = BWTIncConstructFromPacked(mmPool, RevPackedDNAFileName, ShowProgress,
459 TargetNBit, InitialMaxBuildSize, IncMaxBuildSize);
461 printf("Finished constructing Reversed BWT in %u iterations. ", bwtInc->numberOfIterationDone);
463 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
464 printf("Elapsed time = ");
465 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
466 totalElapsedTime += elapsedTime;
467 printf("\n");
469 printf("Saving BWT..\n");
470 BWTSaveBwtCodeAndOcc(bwtInc->bwt, RevBWTCodeFileName, RevBWTOccValueFileName);
471 printf("Finished saving BWT. ");
472 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
473 printf("Elapsed time = ");
474 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
475 totalElapsedTime += elapsedTime;
476 printf("\n");
478 textLength = bwtInc->bwt->textLength;
480 BWTIncFree(mmPool, bwtInc);
484 // Load BWT
485 if (BuildSaValue || BuildSaIndex) {
487 printf("Loading BWT...\n");
489 bwt = BWTLoad(mmPool, BWTCodeFileName, BWTOccValueFileName, NULL, NULL, NULL, NULL);
490 //Use BWT to build the hash table
492 printf("Finished loading BWT. ");
494 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
495 printf("Elapsed time = ");
496 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
497 totalElapsedTime += elapsedTime;
498 printf("\n");
500 textLength = bwt->textLength;
502 } else if (BuildHOT) {
503 printf("Loading BWT and SA Values...\n");
505 bwt = BWTLoad(mmPool, BWTCodeFileName, BWTOccValueFileName, SaValueFileName, NULL, NULL, NULL);
506 //Use BWT to build the hash table
508 printf("Finished loading BWT and SA Values. ");
510 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
511 printf("Elapsed time = ");
512 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
513 totalElapsedTime += elapsedTime;
514 printf("\n");
516 textLength = bwt->textLength;
521 if (BuildSaValue) {
523 printf("Building SA value...\n");
525 if (ShowProgress) {
526 BWTGenerateSaValue(mmPool, bwt, SaValueFreq, bwt->textLength / SaValueFreq / 10);
527 } else {
528 BWTGenerateSaValue(mmPool, bwt, SaValueFreq, 0);
530 BWTSaveSaValue(bwt, SaValueFileName);
532 printf("Finished building SA value. ");
534 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
535 printf("Elapsed time = ");
536 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
537 totalElapsedTime += elapsedTime;
538 printf("\n");
542 //Build High Occurrences Pattern Hash Table
543 if (BuildHOT) {
544 printf("Building High-Occ Hash Table...\n");
545 ho_initialize();
546 //printf("# of Occ = %u\n",ho_ttlOccurrence);
547 unsigned int l = 0;
548 unsigned int r = bwt->textLength;
549 ExtractionHighOccPattern(bwt,HashPatternLength,l,r);
550 //printf("# of SA = %u\n",ho_count());
551 //printf("# of Occ = %u\n",ho_ttlOccurrence);
552 ho_buildandpopulate(bwt);
553 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
554 printf("Finished.\nElapsed time = ");
555 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
556 totalElapsedTime += elapsedTime;
557 printf("\n");
558 ho_free();
561 if (BuildSaIndex) {
563 printf("Building SA index...\n");
565 BWTGenerateSaRangeTable(bwt, SaIndexNumOfChar, SaIndexFileName);
567 printf("Finished building SA index. ");
569 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
570 printf("Elapsed time = ");
571 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
572 totalElapsedTime += elapsedTime;
573 printf("\n");
577 // Free BWT
578 if (BuildSaValue || BuildSaIndex) {
579 BWTFree(mmPool, bwt);
582 // Finished all construction tasks
583 printf("Index building is completed.\n");
584 totalElapsedTime = getElapsedTime(startTime);
585 printf("Total elapsed time = ");
586 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, totalElapsedTime);
587 printf("\n");
589 //MMMasterPrintReport(stdout, FALSE, FALSE, FALSE);
590 if (BuildSaValue) {
591 //fprintf(stdout, "Number of char : %u\n", textLength);
592 //fprintf(stdout, "Bit per char : %.2f\n", (float)MMMasterMaxTotalByteDispatched() * BITS_IN_BYTE / textLength);
593 //printf("\n");
596 MMPoolFree(mmPool);
598 iniparser_freedict(programInput);
600 return 0;
604 dictionary *ParseInput(int argc, char** argv) {
606 dictionary *programInput;
607 char t1[3] = "-c"; // specify that this is a boolean type parameter
608 char t2[3] = "-U"; // specify that this is a boolean type parameter
609 char *d[2];
611 d[0] = t1;
612 d[1] = t2;
614 programInput = paraparser_load(argc, argv, 2, d); // 2 boolean type parameters
616 // Get database name
617 if (!iniparser_find_entry(programInput, "argument:1")) {
618 PrintHelp();
619 exit(1);
621 iniparser_copystring(programInput, "argument:1", DatabaseName, DatabaseName, MAX_FILENAME_LEN);
622 if (strlen(DatabaseName) + 4 > MAX_FILENAME_LEN) {
623 PrintHelp();
624 exit(1);
627 // Get FASTA file name
628 iniparser_copystring(programInput, "argument:2", FASTAFileName, DatabaseName, MAX_FILENAME_LEN);
629 if (strlen(FASTAFileName) > MAX_FILENAME_LEN) {
630 PrintHelp();
631 exit(1);
635 // Whether confirmation is needed
636 Confirmation = iniparser_find_entry(programInput, "parameter:-c");
638 MaskLowerCase = iniparser_find_entry(programInput, "parameter:-U");
640 return programInput;
644 void ParseIniFile(char *iniFileName) {
646 dictionary *ini;
648 //printf("Loading %s ..", iniFileName);
649 ini = iniparser_load(iniFileName, FALSE);
650 if (ini == NULL) {
651 // printf("not found.\n");
652 return;
654 //printf("done.\n");
656 // BuildTasks parameters
657 ParseFASTA = iniparser_getboolean(ini, "BuildTasks:ParseFASTA", ParseFASTA);
658 BuildBWT = iniparser_getboolean(ini, "BuildTasks:BuildBWT", BuildBWT);
659 BuildSaValue = iniparser_getboolean(ini, "BuildTasks:BuildSaValue", BuildSaValue);
660 BuildLookUp = iniparser_getboolean(ini, "BuildTasks:BuildLookUp", BuildLookUp);
661 BuildSaIndex = iniparser_getboolean(ini, "BuildTasks:BuildSaIndex", BuildSaIndex);
662 BuildHOT = iniparser_getboolean(ini, "BuildTasks:BuildHOT", BuildHOT);
664 // Display parameters
665 ShowProgress = iniparser_getboolean(ini, "Display:ShowProgress", ShowProgress);
667 // Parse FASTA parameters
668 FASTARandomSeed = iniparser_getint(ini, "ParseFASTA:RandomSeed", FASTARandomSeed);
669 if (FASTARandomSeed == 0) {
670 FASTARandomSeed = getRandomSeed();
673 // Build BWT parameters
674 OccValueFreq = iniparser_getint(ini, "BuildBWT:OccValueFreq", OccValueFreq);
675 TargetNBit = (float)iniparser_getdouble(ini, "BuildBWT:TargetNBit", TargetNBit);
676 InitialMaxBuildSize = iniparser_getint(ini, "BuildBWT:InitialMaxBuildSize", InitialMaxBuildSize);
677 IncMaxBuildSize = iniparser_getint(ini, "BuildBWT:IncMaxBuildSize", IncMaxBuildSize);
679 // Build SA value parameters
680 SaValueFreq = iniparser_getint(ini, "BuildSAValue:SaValueFreq", SaValueFreq);
682 // Build SA index parameters
683 SaIndexNumOfChar = iniparser_getint(ini, "BuildSAIndex:SaIndexNumOfChar", SaIndexNumOfChar);
685 // Build Look Up parameters
686 LookUpTableSize = iniparser_getint(ini, "BuildLookUp:LookUpTableSize", LookUpTableSize);
687 ReversedLookUpTableSize = iniparser_getint(ini, "BuildLookUp:ReversedLookUpTableSize", ReversedLookUpTableSize);
690 // Build High Occurrences Pattern Hash Table parameters
691 HashPatternLength = iniparser_getint(ini, "BuildHOT:PatternLength", HashPatternLength);
692 HashOccThreshold = iniparser_getint(ini, "BuildHOT:OccurrencesThreshold", HashOccThreshold);
693 // Database parameters
694 iniparser_copystring(ini, "Database:AnnotationFileName", AnnotationFileName, AnnotationFileName, MAX_FILENAME_LEN);
695 iniparser_copystring(ini, "Database:AmbiguityFileName", AmbiguityFileName, AmbiguityFileName, MAX_FILENAME_LEN);
696 iniparser_copystring(ini, "Database:PackedDNAFileName", PackedDNAFileName, PackedDNAFileName, MAX_FILENAME_LEN);
697 iniparser_copystring(ini, "Database:BWTCodeFileName", BWTCodeFileName, BWTCodeFileName, MAX_FILENAME_LEN);
698 iniparser_copystring(ini, "Database:BWTOccValueFileName", BWTOccValueFileName, BWTOccValueFileName, MAX_FILENAME_LEN);
699 iniparser_copystring(ini, "Database:SaValueFileName", SaValueFileName, SaValueFileName, MAX_FILENAME_LEN);
700 iniparser_copystring(ini, "Database:SaIndexFileName", SaIndexFileName, SaIndexFileName, MAX_FILENAME_LEN);
702 iniparser_copystring(ini, "Database:RevPackedDNAFileName", RevPackedDNAFileName, RevPackedDNAFileName, MAX_FILENAME_LEN);
703 iniparser_copystring(ini, "Database:RevBWTCodeFileName", RevBWTCodeFileName, RevBWTCodeFileName, MAX_FILENAME_LEN);
704 iniparser_copystring(ini, "Database:RevBWTOccValueFileName", RevBWTOccValueFileName, RevBWTOccValueFileName, MAX_FILENAME_LEN);
706 iniparser_copystring(ini, "Database:LookupTableFileName", LookupTableFileName, LookupTableFileName, MAX_FILENAME_LEN);
707 iniparser_copystring(ini, "Database:RevLookupTableFileName", RevLookupTableFileName, RevLookupTableFileName, MAX_FILENAME_LEN);
708 iniparser_copystring(ini, "Database:HighOccHashTableFileName", HighOccHashTableFileName, HighOccHashTableFileName, MAX_FILENAME_LEN);
709 iniparser_freedict(ini);
713 void ProcessIni() {
715 ProcessFileName(AnnotationFileName, AnnotationFileName, DatabaseName);
716 ProcessFileName(AmbiguityFileName, AmbiguityFileName, DatabaseName);
717 ProcessFileName(PackedDNAFileName, PackedDNAFileName, DatabaseName);
718 ProcessFileName(RevPackedDNAFileName, RevPackedDNAFileName, DatabaseName);
719 ProcessFileName(BWTCodeFileName, BWTCodeFileName, DatabaseName);
720 ProcessFileName(RevBWTCodeFileName, RevBWTCodeFileName, DatabaseName);
721 ProcessFileName(BWTOccValueFileName, BWTOccValueFileName, DatabaseName);
722 ProcessFileName(RevBWTOccValueFileName, RevBWTOccValueFileName, DatabaseName);
723 ProcessFileName(SaValueFileName, SaValueFileName, DatabaseName);
724 ProcessFileName(SaIndexFileName, SaIndexFileName, DatabaseName);
726 ProcessFileName(LookupTableFileName, LookupTableFileName, DatabaseName);
727 ProcessFileName(RevLookupTableFileName, RevLookupTableFileName, DatabaseName);
729 ProcessFileName(HighOccHashTableFileName, HighOccHashTableFileName, DatabaseName);
733 void ValidateIni() {
735 if (!ParseFASTA && !BuildBWT && !BuildSaValue && !BuildSaIndex && !BuildLookUp && !BuildHOT) {
736 fprintf(stderr, "No action is specified!\n");
737 exit(1);
739 if (BuildLookUp) {
740 if (PackedDNAFileName[0] == '\0') {
741 fprintf(stderr, "Packed DNA file name is not specified!\n");
742 exit(1);
744 if (RevPackedDNAFileName[0] == '\0') {
745 fprintf(stderr, "Reversed Packed DNA file name is not specified!\n");
746 exit(1);
749 if (ParseFASTA) {
750 if (PackedDNAFileName[0] == '\0') {
751 fprintf(stderr, "Packed DNA file name is not specified!\n");
752 exit(1);
754 if (AnnotationFileName[0] == '\0') {
755 fprintf(stderr, "Annotation file name is not specified!\n");
756 exit(1);
758 if (AmbiguityFileName[0] == '\0') {
759 fprintf(stderr, "Ambiguity file name is not specified!\n");
760 exit(1);
763 if (BuildBWT) {
764 if (PackedDNAFileName[0] == '\0') {
765 fprintf(stderr, "Packed DNA file is not specified!\n");
766 exit(1);
768 if (BWTCodeFileName[0] == '\0') {
769 fprintf(stderr, "BWT code file name is not specified!\n");
770 exit(1);
772 if (BWTOccValueFileName[0] == '\0') {
773 fprintf(stderr, "BWT Occ value file name is not specified!\n");
774 exit(1);
776 if (TargetNBit < 2.5) {
777 fprintf(stderr, "Target NBit should be at least 2.5!\n");
778 exit(1);
781 if (BuildSaValue) {
782 if (BWTCodeFileName[0] == '\0') {
783 fprintf(stderr, "BWT code file is not specified!\n");
784 exit(1);
786 if (BWTOccValueFileName[0] == '\0') {
787 fprintf(stderr, "BWT Occ value file is not specified!\n");
788 exit(1);
790 if (SaValueFileName[0] == '\0') {
791 fprintf(stderr, "SA value file name is not specified!\n");
792 exit(1);
794 if (SaValueFreq <= 0) {
795 fprintf(stderr, "SA value frequency must > 0!\n");
796 exit(1);
800 if (BuildSaIndex) {
801 if (BWTCodeFileName[0] == '\0') {
802 fprintf(stderr, "BWT code file is not specified!\n");
803 exit(1);
805 if (BWTOccValueFileName[0] == '\0') {
806 fprintf(stderr, "BWT Occ value file is not specified!\n");
807 exit(1);
809 if (SaIndexFileName[0] == '\0') {
810 fprintf(stderr, "SA index file name is not specified!\n");
811 exit(1);
813 if (SaIndexNumOfChar <= 0) {
814 fprintf(stderr, "SA index number of character must > 0!\n");
815 exit(1);
817 if (SaIndexNumOfChar > 13) {
818 fprintf(stderr, "SA index number of character must <= 13!\n");
819 exit(1);
823 if (BuildHOT) {
824 if (BWTCodeFileName[0] == '\0') {
825 fprintf(stderr, "BWT code file is not specified!\n");
826 exit(1);
828 if (BWTOccValueFileName[0] == '\0') {
829 fprintf(stderr, "BWT Occ value file is not specified!\n");
830 exit(1);
832 if (RevBWTCodeFileName[0] == '\0') {
833 fprintf(stderr, "Reversed BWT code file is not specified!\n");
834 exit(1);
836 if (RevBWTOccValueFileName[0] == '\0') {
837 fprintf(stderr, "Reversed BWT Occ value file is not specified!\n");
838 exit(1);
844 void PrintIni() {
846 char boolean[2];
848 boolean[0] = 'N';
849 boolean[1] = 'Y';
851 /*printf("Parse FASTA file : %c\n", boolean[ParseFASTA]);
852 printf("Build BWT : %c\n", boolean[BuildBWT]);
853 printf("Build SA value : %c\n", boolean[BuildSaValue]);
854 printf("Build SA index : %c\n", boolean[BuildSaIndex]);
855 printf("\n");
857 printf("Show progress : %c\n", boolean[ShowProgress]);
858 printf("\n");
860 if (ParseFASTA) {
861 printf("Parse FASTA :\n");
862 printf("Mask lower case : %c\n", boolean[MaskLowerCase]);
863 printf("Random seed : %u\n", FASTARandomSeed);
864 printf("\n");
867 if (BuildBWT) {
868 printf("Build BWT :\n");
869 printf("Target N Bits : %.2f\n", TargetNBit);
870 printf("Occ value frequency : %u\n", OccValueFreq);
871 printf("Initial Max Build Size : %u Inc Max Build Size : %u\n",
872 InitialMaxBuildSize, IncMaxBuildSize);
873 printf("\n");
876 if (BuildSaValue) {
877 printf("Build SA value :\n");
878 printf("SA value frequency : %u\n", SaValueFreq);
879 printf("\n");
882 if (BuildSaIndex) {
883 printf("Build SA index :\n");
884 printf("SA index no. of char : %u\n", SaIndexNumOfChar);
885 printf("\n");
888 printf("Annotation file : %s\n", AnnotationFileName);
889 printf("Ambigurity file : %s\n", AmbiguityFileName);
890 printf("Packed DNA file : %s\n", PackedDNAFileName);
891 printf("BWT Code file : %s\n", BWTCodeFileName);
892 printf("BWT Occ value file : %s\n", BWTOccValueFileName);
893 printf("SA value file : %s\n", SaValueFileName);
894 printf("SA index file : %s\n", SaIndexFileName);
895 printf("\n");
896 printf("Reversed Packed DNA file : %s\n", RevPackedDNAFileName);
897 printf("Reversed BWT Code file : %s\n", RevBWTCodeFileName);
898 printf("Reversed BWT Occ value file : %s\n", RevBWTOccValueFileName);
899 printf("\n");
900 printf("Look-Up Table file : %s\n", LookupTableFileName);
901 printf("Reversed Look-Up Table file : %s\n", RevLookupTableFileName);
902 printf("\n");
903 printf("High Occ Hash Table file : %s\n", HighOccHashTableFileName);
904 printf("\n");*/
908 void PrintShortDesc() {
910 /*printf("BWTFormatdb v1.0, Copyright (C) 2006, Wong Chi Kwong.\n");
911 printf("BWTFormatdb comes with ABSOLUTELY NO WARRENTY.\n");
912 printf("BWTFormatdb is free software, and you are welcome to\n");
913 printf("redistribute it under certain conditions.\n");
914 printf("For details type BWTFormatdb.\n");
915 printf("\n");*/
919 void PrintHelp() {
921 /*printf("BWTFormatdb v1.0, Copyright (C) 2006, Wong Chi Kwong.\n");
922 printf("\n");
924 printf("This program is free software; you can redistribute it and/or\n");
925 printf("modify it under the terms of the GNU General Public License\n");
926 printf("as published by the Free Software Foundation; either version 2\n");
927 printf("of the License, or (at your option) any later version.\n");
928 printf("\n");
930 printf("This program is distributed in the hope that it will be useful,\n");
931 printf("but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
932 printf("MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
933 printf("GNU General Public License for more details.\n");
934 printf("\n");
936 printf("You should have received a copy of the GNU General Public License\n");
937 printf("along with this program; if not, write to the Free Software\n");
938 printf("Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n");
939 printf("\n");*/
941 printf("Syntax: 2bwt-builder <sequence file>\n");
942 printf("Compile Time: " COMPILE_TIME"\n");
945 void ProcessFileName(char *outputFileName, const char *inputFileName, const char *databaseName) {
947 char tempChar[MAX_FILENAME_LEN];
948 unsigned int i;
950 if (inputFileName == NULL) {
951 if (outputFileName != inputFileName) {
952 outputFileName[0] = '\0';
954 return;
957 if (strlen(databaseName) + strlen(inputFileName) > MAX_FILENAME_LEN) {
958 fprintf(stderr, "File length is too long!\n");
959 exit(1);
962 strncpy(tempChar, inputFileName, MAX_FILENAME_LEN);
964 // locate the *
965 for (i=0; i<MAX_FILENAME_LEN; i++) {
966 if (tempChar[i] == '*') {
967 break;
970 if (i<MAX_FILENAME_LEN) {
971 tempChar[i] = '\0';
972 sprintf(outputFileName, "%s%s%s", tempChar, databaseName, tempChar + i + 1);
973 } else {
974 sprintf(outputFileName, "%s", tempChar);