3 BWTFormatdb.c Build index for FASTA database
5 This program builds index for FASTA database for use of BWTBlastn.
7 Copyright (C) 2006, Wong Chi Kwong.
9 This program is free software; you can redistribute it and/or
10 modify it under the terms of the GNU General Public License
11 as published by the Free Software Foundation; either version 2
12 of the License, or (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
27 #include "TypeNLimit.h"
28 #include "BWTConstruct.h"
29 #include "MiscUtilities.h"
31 #include "TextConverter.h"
32 #include "MemManager.h"
33 #include "iniparser.h"
36 #include "lookupBuilder.h"
37 #include "highOccBuilder.h"
39 #define MAX_FILENAME_LEN 1024
42 dictionary
*ParseInput(int argc
, char** argv
);
43 void ParseIniFile(char *iniFileName
);
47 void PrintShortDesc();
50 void ProcessFileName(char *outputFileName
, const char *inputFileName
, const char *databaseName
);
51 int buildLookupTable(char * PackedDNAFileName
, char * LookupTableFileName
, int lookupTableSize
);
54 char IniFileName
[MAX_FILENAME_LEN
+1];
57 // BuildTasks parameters
58 int ParseFASTA
= TRUE
;
60 int BuildSaValue
= TRUE
;
61 int BuildSaIndex
= TRUE
;
62 int BuildLookUp
= TRUE
;
66 unsigned int PoolSize
= 2097152; // 2M - fixed; not configurable through ini
69 int ShowProgress
= FALSE
;
71 // Database parameters
72 char FASTAFileName
[MAX_FILENAME_LEN
+1] = "";
73 char DatabaseName
[MAX_FILENAME_LEN
+1] = "";
74 char AnnotationFileName
[MAX_FILENAME_LEN
+1] = "*.index.ann";
75 char AmbiguityFileName
[MAX_FILENAME_LEN
+1] = "*.index.amb";
76 char RepeatFileName
[MAX_FILENAME_LEN
+1] = "*.index.rep";
77 char PackedDNAFileName
[MAX_FILENAME_LEN
+1] = "*.index.pac";
78 char BWTCodeFileName
[MAX_FILENAME_LEN
+1] = "*.index.bwt";
79 char BWTOccValueFileName
[MAX_FILENAME_LEN
+1] = "*.index.fmv";
80 char SaValueFileName
[MAX_FILENAME_LEN
+1] = "*.index.sa";
81 char SaIndexFileName
[MAX_FILENAME_LEN
+1] = "*.index.sai";
83 char RevPackedDNAFileName
[MAX_FILENAME_LEN
+1] = "*.index.rev.pac";
84 char RevBWTCodeFileName
[MAX_FILENAME_LEN
+1] = "*.index.rev.bwt";
85 char RevBWTOccValueFileName
[MAX_FILENAME_LEN
+1] = "*.index.rev.fmv";
87 char LookupTableFileName
[MAX_FILENAME_LEN
+1] = "*.index.lkt";
88 char RevLookupTableFileName
[MAX_FILENAME_LEN
+1] = "*.index.rev.lkt";
90 char HighOccHashTableFileName
[MAX_FILENAME_LEN
+1] = "*.index.hot";
91 // Parse FASTA parameters
92 unsigned int FASTARandomSeed
= 0;
93 int MaskLowerCase
= FALSE
;
95 // Build BWT parameters
96 unsigned int OccValueFreq
= 256;
97 float TargetNBit
= 2.5;
98 unsigned int InitialMaxBuildSize
= 10000000;
99 unsigned int IncMaxBuildSize
= 10000000;
101 // Build SA value parameters
102 unsigned int SaValueFreq
= 8;
104 // Build SA index parameters
105 unsigned int SaIndexNumOfChar
= 12;
107 //Look Up Table parameters
108 unsigned int LookUpTableSize
=13;
109 unsigned int ReversedLookUpTableSize
=13;
111 //High Occurrences Pattern Hash Table parameters
112 unsigned int HashPatternLength
=35;
113 unsigned int HashOccThreshold
=4;
115 void ExtractionHighOccPattern(const BWT
*bwt
, int index
, unsigned int l
, unsigned int r
) {
116 if (l
<=r
&& r
-l
+1>=HashOccThreshold
) {
118 //If the packing is not finished
119 unsigned int new_l
,new_r
;
121 unsigned int occCount_pstart
[4];
122 unsigned int occCount_pend
[4];
123 BWTAllOccValue(bwt
,l
,occCount_pstart
);
124 BWTAllOccValue(bwt
,r
+ 1,occCount_pend
);
127 for (ec
=0;ec
<4;ec
++) {
128 new_l
= bwt
->cumulativeFreq
[ec
] + occCount_pstart
[ec
] + 1;
129 new_r
= bwt
->cumulativeFreq
[ec
] + occCount_pend
[ec
];
131 ExtractionHighOccPattern(bwt
,index
-1,new_l
,new_r
);
133 } else if (index
==0) {
134 //If the packing pattern reaches the maximum length
141 void ho_writetofile(const char * fileName
,BWT
* bwt
) {
142 //fprintf(stderr,"Writing Hash Table to File..\n");
143 unsigned int acc_index
=0;
144 unsigned int acc_occIndex
=0;
148 if(!(outFile
= fopen(fileName
, "w"))) return;
150 fwrite(&ho_hashTableSize
,sizeof(unsigned int),1,outFile
);
151 fwrite(&ho_hash_a
,sizeof(unsigned int),1,outFile
);
152 fwrite(&ho_hash_b
,sizeof(unsigned int),1,outFile
);
153 fwrite(&ho_prime
,sizeof(unsigned int),1,outFile
);
154 fwrite(&ho_ttlItem
,sizeof(unsigned int),1,outFile
);
155 fwrite(&ho_ttlOccurrence
,sizeof(unsigned int),1,outFile
);
157 //fprintf(stderr,"Writing Hash Table to File..Phrase One\n");
158 for (i
=0;i
<ho_hashTableSize
;i
++) {
159 if (ho_hashtable
[i
].count
==0) {
161 fwrite(&c
,1,1,outFile
);
164 fwrite(&c
,1,1,outFile
);
165 fwrite(&(acc_index
),sizeof(unsigned int),1,outFile
);
166 fwrite(&(ho_hashtable
[i
].count
),sizeof(unsigned int),1,outFile
);
167 acc_index
+=ho_hashtable
[i
].count
;
170 //fprintf(stderr,"Writing Hash Table to File..Phrase Two\n");
171 for (i
=0;i
<ho_hashTableSize
;i
++) {
172 hashItemPtr node
= ho_hashtable
[i
].item
;
174 fwrite(&node
->sa_l
,sizeof(unsigned int),1,outFile
);
175 fwrite(&node
->sa_r
,sizeof(unsigned int),1,outFile
);
176 fwrite(&(acc_occIndex
),sizeof(unsigned int),1,outFile
);
177 acc_occIndex
+=(node
->sa_r
)-(node
->sa_l
)+1;
182 //fprintf(stderr,"Writing Hash Table to File..Phrase Three\n");
183 //unsigned int * buffer = malloc(sizeof(unsigned int)*1024);
184 // unsigned int bufferIndex =0 ;
185 for (i
=0;i
<ho_hashTableSize
;i
++) {
186 hashItemPtr node
= ho_hashtable
[i
].item
;
189 for (j
=node
->sa_l
;j
<=node
->sa_r
;j
++) {
190 unsigned int tp
= BWTSaValue(bwt
,j
);
191 fwrite(&tp
,sizeof(unsigned int),1,outFile
);
192 // buffer[bufferIndex++]=tp;
193 // if (bufferIndex>=1024) {
194 //fwrite(buffer,sizeof(unsigned int)*1024,1,outFile);
203 //fprintf(stderr,"Finished Writing Hash Table to File.\n");
206 void ho_buildandpopulate(BWT
* bwt
) {
208 //fprintf(stderr,"Build and Populate Hash Table.\n");
209 //Initialize the Hash Table of size doubled the number of pattern
210 ho_hashTableSize
=ho_ttlItem
*2;
211 ho_hashtable
=malloc(sizeof(struct hashTyp
)*ho_hashTableSize
);
213 for (i
=0;i
<ho_hashTableSize
;i
++) {
214 ho_hashtable
[i
].count
=0;
215 ho_hashtable
[i
].item
=NULL
;
218 ho_cellSpacePtr node
= ho_root
;
220 for (i
=0;i
<node
->count
;i
++) {
221 unsigned int index
= ho_hash(node
->saL
[i
]);
222 hashItemPtr newItem
= malloc(sizeof(struct hashItem
));
223 newItem
->sa_l
=node
->saL
[i
];
224 newItem
->sa_r
=node
->saR
[i
];
225 newItem
->next
=ho_hashtable
[index
].item
;
226 ho_hashtable
[index
].item
=newItem
;
228 ho_hashtable
[index
].count
++;
233 ho_writetofile(HighOccHashTableFileName
,bwt
);
236 void buildReversePacked(const char *inputFileName
, unsigned int *textLength
, const unsigned int convertToWordPacked
) {
240 unsigned char tempChar
[4];
241 unsigned int writeBuffer
=0;
242 unsigned int writeBufferIndex
=0;
243 unsigned int *packedText
;
244 unsigned int packedFileLen
;
245 unsigned char lastByteLength
;
246 unsigned int wordToProcess
;
250 inputFile
= (FILE*)fopen(inputFileName
, "rb");
251 outputFile
= (FILE*)fopen(RevPackedDNAFileName
, "wb");
253 if (inputFile
== NULL
) {
254 fprintf(stderr
, "buildReversePacked() : Cannot open inputFileName!\n");
258 fseek(inputFile
, -1, SEEK_END
);
259 packedFileLen
= ftell(inputFile
);
260 if ((int)packedFileLen
< 0) {
261 fprintf(stderr
, "buildReversePacked(): Cannot determine file length!\n");
264 fread(&lastByteLength
, sizeof(unsigned char), 1, inputFile
);
266 *textLength
= (packedFileLen
- 1) * 4 + lastByteLength
;
268 wordToProcess
= (*textLength
+ 16 - 1) / 16;
269 packedText
= MMUnitAllocate((wordToProcess
+ 1) * sizeof(unsigned int)); // allocate 1 more word at end
270 if (packedText
== NULL
) {
271 fprintf(stderr
, "buildReversePacked(): Cannot MMUnitAllocate packedText\n");
274 packedText
[wordToProcess
- 1] = 0;
275 packedText
[wordToProcess
] = 0;
277 fseek(inputFile
, 0L, SEEK_SET
);
278 fread(packedText
, 1, packedFileLen
, inputFile
);
282 //printf("lastByteLength = %u\n",lastByteLength);
283 long long currentLocation
= (wordToProcess
)*16;
284 //printf("currentLocation = %u\n",currentLocation);
285 //printf("*textLength = %u\n",*textLength);
286 for (i
=wordToProcess
-1; i
>=0; i
--) {
287 *(unsigned int*)tempChar
= packedText
[i
];
288 packedText
[i
] = (tempChar
[0] << 24) | (tempChar
[1] << 16) | (tempChar
[2] << 8) | tempChar
[3];
290 unsigned int mk
= packedText
[i
];
292 if (writeBufferIndex
>=16) {
293 *(unsigned int*)tempChar
= writeBuffer
;
294 writeBuffer
= (tempChar
[0] << 24) | (tempChar
[1] << 16) | (tempChar
[2] << 8) | tempChar
[3];
295 fwrite(&writeBuffer
,sizeof(unsigned int),1,outputFile
);
299 unsigned char c
= mk
& 3;
302 if (currentLocation
>=*textLength
) {continue;}
303 writeBuffer
|=c
;writeBufferIndex
++;
311 MMUnitFree(packedText
, ((wordToProcess
+ 1) * sizeof(unsigned int)));
312 //printf("Finished main loop..\n");
313 if (writeBufferIndex
>0) {
314 printf("Wiping..%d\n",writeBufferIndex
);
315 for (k
=writeBufferIndex
;k
<16;k
++) writeBuffer
<<=2;
316 *(unsigned int*)tempChar
= writeBuffer
;
317 if (writeBufferIndex
<4) {
318 fwrite(&tempChar
[3],sizeof(unsigned char),1,outputFile
);
319 } else if (writeBufferIndex
<8) {
320 fwrite(&tempChar
[3],sizeof(unsigned char),1,outputFile
);
321 fwrite(&tempChar
[2],sizeof(unsigned char),1,outputFile
);
322 } else if (writeBufferIndex
<12) {
323 fwrite(&tempChar
[3],sizeof(unsigned char),1,outputFile
);
324 fwrite(&tempChar
[2],sizeof(unsigned char),1,outputFile
);
325 fwrite(&tempChar
[1],sizeof(unsigned char),1,outputFile
);
326 } else if (writeBufferIndex
<16) {
327 fwrite(&tempChar
[3],sizeof(unsigned char),1,outputFile
);
328 fwrite(&tempChar
[2],sizeof(unsigned char),1,outputFile
);
329 fwrite(&tempChar
[1],sizeof(unsigned char),1,outputFile
);
330 fwrite(&tempChar
[0],sizeof(unsigned char),1,outputFile
);
333 fwrite(&lastByteLength
,sizeof(unsigned char),1,outputFile
);
339 int main(int argc
, char** argv
) {
343 dictionary
*programInput
;
345 double elapsedTime
= 0, totalElapsedTime
= 0;
347 char filename
[MAX_FILENAME_LEN
+1];
349 unsigned int textLength
= 0;
352 BWTInc
*bwtInc
= NULL
;
355 programInput
= ParseInput(argc
, argv
);
359 if (strcmp(argv
[0] + strlen(argv
[0]) - 4, ".exe") == 0) {
360 *(argv
[0] + strlen(argv
[0]) - 4) = '\0';
362 sprintf(filename
, "%s.ini", argv
[0]);
363 ParseIniFile(filename
);
369 if (Confirmation
== TRUE
) {
370 printf("Press Y to go or N to cancel. ");
372 while (c
!= 'y' && c
!= 'Y' && c
!= 'n' && c
!= 'N') {
375 if (c
== 'n' || c
== 'N') {
380 startTime
= setStartTime();
382 MMMasterInitialize(1, 0, FALSE
, NULL
);
383 mmPool
= MMPoolCreate(PoolSize
);
385 // Parse FASTA file to produce packed DNA and annotation file
386 if (ParseFASTA
== TRUE
) {
388 printf("Parsing FASTA file..\n");
390 numSeq
= HSPParseFASTAToPacked(FASTAFileName
, AnnotationFileName
, PackedDNAFileName
, AmbiguityFileName
, RepeatFileName
, FASTARandomSeed
, MaskLowerCase
);
393 //Parse packed DNA to construct the packed reversed DNA
395 unsigned int textLen
;
396 buildReversePacked(PackedDNAFileName
,&textLen
,TRUE
);
397 //printf("Reversed Packed DNA generated..\n");
398 printf("Finished. Parsed %u sequences.\n", numSeq
);
401 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
402 printf("Elapsed time = ");
403 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
404 totalElapsedTime
+= elapsedTime
;
409 if (BuildLookUp
== TRUE
) {
410 //Construct Look-Up Table
411 printf("Building Look-Up..\n");
412 buildLookupTable(PackedDNAFileName
,LookupTableFileName
,LookUpTableSize
);
413 //printf("Look-Up Table is built.\n");
414 buildLookupTable(RevPackedDNAFileName
,RevLookupTableFileName
,ReversedLookUpTableSize
);
415 //printf("Reversed Look-Up Table is built.\n");
417 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
418 printf("Finished.\nElapsed time = ");
419 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
420 totalElapsedTime
+= elapsedTime
;
424 // Construct BWTInc from text
425 if (BuildBWT
== TRUE
) {
427 printf("Building BWT..\n");
429 bwtInc
= BWTIncConstructFromPacked(mmPool
, PackedDNAFileName
, ShowProgress
,
430 TargetNBit
, InitialMaxBuildSize
, IncMaxBuildSize
);
432 printf("Finished constructing BWT in %u iterations. ", bwtInc
->numberOfIterationDone
);
434 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
435 printf("Elapsed time = ");
436 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
437 totalElapsedTime
+= elapsedTime
;
440 printf("Saving BWT..\n");
441 BWTSaveBwtCodeAndOcc(bwtInc
->bwt
, BWTCodeFileName
, BWTOccValueFileName
);
442 printf("Finished saving BWT. ");
443 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
444 printf("Elapsed time = ");
445 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
446 totalElapsedTime
+= elapsedTime
;
449 textLength
= bwtInc
->bwt
->textLength
;
451 BWTIncFree(mmPool
, bwtInc
);
455 //Building Reversed BWT
456 printf("Building Reversed BWT..\n");
458 bwtInc
= BWTIncConstructFromPacked(mmPool
, RevPackedDNAFileName
, ShowProgress
,
459 TargetNBit
, InitialMaxBuildSize
, IncMaxBuildSize
);
461 printf("Finished constructing Reversed BWT in %u iterations. ", bwtInc
->numberOfIterationDone
);
463 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
464 printf("Elapsed time = ");
465 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
466 totalElapsedTime
+= elapsedTime
;
469 printf("Saving BWT..\n");
470 BWTSaveBwtCodeAndOcc(bwtInc
->bwt
, RevBWTCodeFileName
, RevBWTOccValueFileName
);
471 printf("Finished saving BWT. ");
472 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
473 printf("Elapsed time = ");
474 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
475 totalElapsedTime
+= elapsedTime
;
478 textLength
= bwtInc
->bwt
->textLength
;
480 BWTIncFree(mmPool
, bwtInc
);
485 if (BuildSaValue
|| BuildSaIndex
) {
487 printf("Loading BWT...\n");
489 bwt
= BWTLoad(mmPool
, BWTCodeFileName
, BWTOccValueFileName
, NULL
, NULL
, NULL
, NULL
);
490 //Use BWT to build the hash table
492 printf("Finished loading BWT. ");
494 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
495 printf("Elapsed time = ");
496 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
497 totalElapsedTime
+= elapsedTime
;
500 textLength
= bwt
->textLength
;
502 } else if (BuildHOT
) {
503 printf("Loading BWT and SA Values...\n");
505 bwt
= BWTLoad(mmPool
, BWTCodeFileName
, BWTOccValueFileName
, SaValueFileName
, NULL
, NULL
, NULL
);
506 //Use BWT to build the hash table
508 printf("Finished loading BWT and SA Values. ");
510 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
511 printf("Elapsed time = ");
512 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
513 totalElapsedTime
+= elapsedTime
;
516 textLength
= bwt
->textLength
;
523 printf("Building SA value...\n");
526 BWTGenerateSaValue(mmPool
, bwt
, SaValueFreq
, bwt
->textLength
/ SaValueFreq
/ 10);
528 BWTGenerateSaValue(mmPool
, bwt
, SaValueFreq
, 0);
530 BWTSaveSaValue(bwt
, SaValueFileName
);
532 printf("Finished building SA value. ");
534 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
535 printf("Elapsed time = ");
536 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
537 totalElapsedTime
+= elapsedTime
;
542 //Build High Occurrences Pattern Hash Table
544 printf("Building High-Occ Hash Table...\n");
546 //printf("# of Occ = %u\n",ho_ttlOccurrence);
548 unsigned int r
= bwt
->textLength
;
549 ExtractionHighOccPattern(bwt
,HashPatternLength
,l
,r
);
550 //printf("# of SA = %u\n",ho_count());
551 //printf("# of Occ = %u\n",ho_ttlOccurrence);
552 ho_buildandpopulate(bwt
);
553 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
554 printf("Finished.\nElapsed time = ");
555 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
556 totalElapsedTime
+= elapsedTime
;
563 printf("Building SA index...\n");
565 BWTGenerateSaRangeTable(bwt
, SaIndexNumOfChar
, SaIndexFileName
);
567 printf("Finished building SA index. ");
569 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
570 printf("Elapsed time = ");
571 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
572 totalElapsedTime
+= elapsedTime
;
578 if (BuildSaValue
|| BuildSaIndex
) {
579 BWTFree(mmPool
, bwt
);
582 // Finished all construction tasks
583 printf("Index building is completed.\n");
584 totalElapsedTime
= getElapsedTime(startTime
);
585 printf("Total elapsed time = ");
586 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, totalElapsedTime
);
589 //MMMasterPrintReport(stdout, FALSE, FALSE, FALSE);
591 //fprintf(stdout, "Number of char : %u\n", textLength);
592 //fprintf(stdout, "Bit per char : %.2f\n", (float)MMMasterMaxTotalByteDispatched() * BITS_IN_BYTE / textLength);
598 iniparser_freedict(programInput
);
604 dictionary
*ParseInput(int argc
, char** argv
) {
606 dictionary
*programInput
;
607 char t1
[3] = "-c"; // specify that this is a boolean type parameter
608 char t2
[3] = "-U"; // specify that this is a boolean type parameter
614 programInput
= paraparser_load(argc
, argv
, 2, d
); // 2 boolean type parameters
617 if (!iniparser_find_entry(programInput
, "argument:1")) {
621 iniparser_copystring(programInput
, "argument:1", DatabaseName
, DatabaseName
, MAX_FILENAME_LEN
);
622 if (strlen(DatabaseName
) + 4 > MAX_FILENAME_LEN
) {
627 // Get FASTA file name
628 iniparser_copystring(programInput
, "argument:2", FASTAFileName
, DatabaseName
, MAX_FILENAME_LEN
);
629 if (strlen(FASTAFileName
) > MAX_FILENAME_LEN
) {
635 // Whether confirmation is needed
636 Confirmation
= iniparser_find_entry(programInput
, "parameter:-c");
638 MaskLowerCase
= iniparser_find_entry(programInput
, "parameter:-U");
644 void ParseIniFile(char *iniFileName
) {
648 //printf("Loading %s ..", iniFileName);
649 ini
= iniparser_load(iniFileName
, FALSE
);
651 // printf("not found.\n");
656 // BuildTasks parameters
657 ParseFASTA
= iniparser_getboolean(ini
, "BuildTasks:ParseFASTA", ParseFASTA
);
658 BuildBWT
= iniparser_getboolean(ini
, "BuildTasks:BuildBWT", BuildBWT
);
659 BuildSaValue
= iniparser_getboolean(ini
, "BuildTasks:BuildSaValue", BuildSaValue
);
660 BuildLookUp
= iniparser_getboolean(ini
, "BuildTasks:BuildLookUp", BuildLookUp
);
661 BuildSaIndex
= iniparser_getboolean(ini
, "BuildTasks:BuildSaIndex", BuildSaIndex
);
662 BuildHOT
= iniparser_getboolean(ini
, "BuildTasks:BuildHOT", BuildHOT
);
664 // Display parameters
665 ShowProgress
= iniparser_getboolean(ini
, "Display:ShowProgress", ShowProgress
);
667 // Parse FASTA parameters
668 FASTARandomSeed
= iniparser_getint(ini
, "ParseFASTA:RandomSeed", FASTARandomSeed
);
669 if (FASTARandomSeed
== 0) {
670 FASTARandomSeed
= getRandomSeed();
673 // Build BWT parameters
674 OccValueFreq
= iniparser_getint(ini
, "BuildBWT:OccValueFreq", OccValueFreq
);
675 TargetNBit
= (float)iniparser_getdouble(ini
, "BuildBWT:TargetNBit", TargetNBit
);
676 InitialMaxBuildSize
= iniparser_getint(ini
, "BuildBWT:InitialMaxBuildSize", InitialMaxBuildSize
);
677 IncMaxBuildSize
= iniparser_getint(ini
, "BuildBWT:IncMaxBuildSize", IncMaxBuildSize
);
679 // Build SA value parameters
680 SaValueFreq
= iniparser_getint(ini
, "BuildSAValue:SaValueFreq", SaValueFreq
);
682 // Build SA index parameters
683 SaIndexNumOfChar
= iniparser_getint(ini
, "BuildSAIndex:SaIndexNumOfChar", SaIndexNumOfChar
);
685 // Build Look Up parameters
686 LookUpTableSize
= iniparser_getint(ini
, "BuildLookUp:LookUpTableSize", LookUpTableSize
);
687 ReversedLookUpTableSize
= iniparser_getint(ini
, "BuildLookUp:ReversedLookUpTableSize", ReversedLookUpTableSize
);
690 // Build High Occurrences Pattern Hash Table parameters
691 HashPatternLength
= iniparser_getint(ini
, "BuildHOT:PatternLength", HashPatternLength
);
692 HashOccThreshold
= iniparser_getint(ini
, "BuildHOT:OccurrencesThreshold", HashOccThreshold
);
693 // Database parameters
694 iniparser_copystring(ini
, "Database:AnnotationFileName", AnnotationFileName
, AnnotationFileName
, MAX_FILENAME_LEN
);
695 iniparser_copystring(ini
, "Database:AmbiguityFileName", AmbiguityFileName
, AmbiguityFileName
, MAX_FILENAME_LEN
);
696 iniparser_copystring(ini
, "Database:PackedDNAFileName", PackedDNAFileName
, PackedDNAFileName
, MAX_FILENAME_LEN
);
697 iniparser_copystring(ini
, "Database:BWTCodeFileName", BWTCodeFileName
, BWTCodeFileName
, MAX_FILENAME_LEN
);
698 iniparser_copystring(ini
, "Database:BWTOccValueFileName", BWTOccValueFileName
, BWTOccValueFileName
, MAX_FILENAME_LEN
);
699 iniparser_copystring(ini
, "Database:SaValueFileName", SaValueFileName
, SaValueFileName
, MAX_FILENAME_LEN
);
700 iniparser_copystring(ini
, "Database:SaIndexFileName", SaIndexFileName
, SaIndexFileName
, MAX_FILENAME_LEN
);
702 iniparser_copystring(ini
, "Database:RevPackedDNAFileName", RevPackedDNAFileName
, RevPackedDNAFileName
, MAX_FILENAME_LEN
);
703 iniparser_copystring(ini
, "Database:RevBWTCodeFileName", RevBWTCodeFileName
, RevBWTCodeFileName
, MAX_FILENAME_LEN
);
704 iniparser_copystring(ini
, "Database:RevBWTOccValueFileName", RevBWTOccValueFileName
, RevBWTOccValueFileName
, MAX_FILENAME_LEN
);
706 iniparser_copystring(ini
, "Database:LookupTableFileName", LookupTableFileName
, LookupTableFileName
, MAX_FILENAME_LEN
);
707 iniparser_copystring(ini
, "Database:RevLookupTableFileName", RevLookupTableFileName
, RevLookupTableFileName
, MAX_FILENAME_LEN
);
708 iniparser_copystring(ini
, "Database:HighOccHashTableFileName", HighOccHashTableFileName
, HighOccHashTableFileName
, MAX_FILENAME_LEN
);
709 iniparser_freedict(ini
);
715 ProcessFileName(AnnotationFileName
, AnnotationFileName
, DatabaseName
);
716 ProcessFileName(AmbiguityFileName
, AmbiguityFileName
, DatabaseName
);
717 ProcessFileName(PackedDNAFileName
, PackedDNAFileName
, DatabaseName
);
718 ProcessFileName(RevPackedDNAFileName
, RevPackedDNAFileName
, DatabaseName
);
719 ProcessFileName(BWTCodeFileName
, BWTCodeFileName
, DatabaseName
);
720 ProcessFileName(RevBWTCodeFileName
, RevBWTCodeFileName
, DatabaseName
);
721 ProcessFileName(BWTOccValueFileName
, BWTOccValueFileName
, DatabaseName
);
722 ProcessFileName(RevBWTOccValueFileName
, RevBWTOccValueFileName
, DatabaseName
);
723 ProcessFileName(SaValueFileName
, SaValueFileName
, DatabaseName
);
724 ProcessFileName(SaIndexFileName
, SaIndexFileName
, DatabaseName
);
726 ProcessFileName(LookupTableFileName
, LookupTableFileName
, DatabaseName
);
727 ProcessFileName(RevLookupTableFileName
, RevLookupTableFileName
, DatabaseName
);
729 ProcessFileName(HighOccHashTableFileName
, HighOccHashTableFileName
, DatabaseName
);
735 if (!ParseFASTA
&& !BuildBWT
&& !BuildSaValue
&& !BuildSaIndex
&& !BuildLookUp
&& !BuildHOT
) {
736 fprintf(stderr
, "No action is specified!\n");
740 if (PackedDNAFileName
[0] == '\0') {
741 fprintf(stderr
, "Packed DNA file name is not specified!\n");
744 if (RevPackedDNAFileName
[0] == '\0') {
745 fprintf(stderr
, "Reversed Packed DNA file name is not specified!\n");
750 if (PackedDNAFileName
[0] == '\0') {
751 fprintf(stderr
, "Packed DNA file name is not specified!\n");
754 if (AnnotationFileName
[0] == '\0') {
755 fprintf(stderr
, "Annotation file name is not specified!\n");
758 if (AmbiguityFileName
[0] == '\0') {
759 fprintf(stderr
, "Ambiguity file name is not specified!\n");
764 if (PackedDNAFileName
[0] == '\0') {
765 fprintf(stderr
, "Packed DNA file is not specified!\n");
768 if (BWTCodeFileName
[0] == '\0') {
769 fprintf(stderr
, "BWT code file name is not specified!\n");
772 if (BWTOccValueFileName
[0] == '\0') {
773 fprintf(stderr
, "BWT Occ value file name is not specified!\n");
776 if (TargetNBit
< 2.5) {
777 fprintf(stderr
, "Target NBit should be at least 2.5!\n");
782 if (BWTCodeFileName
[0] == '\0') {
783 fprintf(stderr
, "BWT code file is not specified!\n");
786 if (BWTOccValueFileName
[0] == '\0') {
787 fprintf(stderr
, "BWT Occ value file is not specified!\n");
790 if (SaValueFileName
[0] == '\0') {
791 fprintf(stderr
, "SA value file name is not specified!\n");
794 if (SaValueFreq
<= 0) {
795 fprintf(stderr
, "SA value frequency must > 0!\n");
801 if (BWTCodeFileName
[0] == '\0') {
802 fprintf(stderr
, "BWT code file is not specified!\n");
805 if (BWTOccValueFileName
[0] == '\0') {
806 fprintf(stderr
, "BWT Occ value file is not specified!\n");
809 if (SaIndexFileName
[0] == '\0') {
810 fprintf(stderr
, "SA index file name is not specified!\n");
813 if (SaIndexNumOfChar
<= 0) {
814 fprintf(stderr
, "SA index number of character must > 0!\n");
817 if (SaIndexNumOfChar
> 13) {
818 fprintf(stderr
, "SA index number of character must <= 13!\n");
824 if (BWTCodeFileName
[0] == '\0') {
825 fprintf(stderr
, "BWT code file is not specified!\n");
828 if (BWTOccValueFileName
[0] == '\0') {
829 fprintf(stderr
, "BWT Occ value file is not specified!\n");
832 if (RevBWTCodeFileName
[0] == '\0') {
833 fprintf(stderr
, "Reversed BWT code file is not specified!\n");
836 if (RevBWTOccValueFileName
[0] == '\0') {
837 fprintf(stderr
, "Reversed BWT Occ value file is not specified!\n");
851 /*printf("Parse FASTA file : %c\n", boolean[ParseFASTA]);
852 printf("Build BWT : %c\n", boolean[BuildBWT]);
853 printf("Build SA value : %c\n", boolean[BuildSaValue]);
854 printf("Build SA index : %c\n", boolean[BuildSaIndex]);
857 printf("Show progress : %c\n", boolean[ShowProgress]);
861 printf("Parse FASTA :\n");
862 printf("Mask lower case : %c\n", boolean[MaskLowerCase]);
863 printf("Random seed : %u\n", FASTARandomSeed);
868 printf("Build BWT :\n");
869 printf("Target N Bits : %.2f\n", TargetNBit);
870 printf("Occ value frequency : %u\n", OccValueFreq);
871 printf("Initial Max Build Size : %u Inc Max Build Size : %u\n",
872 InitialMaxBuildSize, IncMaxBuildSize);
877 printf("Build SA value :\n");
878 printf("SA value frequency : %u\n", SaValueFreq);
883 printf("Build SA index :\n");
884 printf("SA index no. of char : %u\n", SaIndexNumOfChar);
888 printf("Annotation file : %s\n", AnnotationFileName);
889 printf("Ambigurity file : %s\n", AmbiguityFileName);
890 printf("Packed DNA file : %s\n", PackedDNAFileName);
891 printf("BWT Code file : %s\n", BWTCodeFileName);
892 printf("BWT Occ value file : %s\n", BWTOccValueFileName);
893 printf("SA value file : %s\n", SaValueFileName);
894 printf("SA index file : %s\n", SaIndexFileName);
896 printf("Reversed Packed DNA file : %s\n", RevPackedDNAFileName);
897 printf("Reversed BWT Code file : %s\n", RevBWTCodeFileName);
898 printf("Reversed BWT Occ value file : %s\n", RevBWTOccValueFileName);
900 printf("Look-Up Table file : %s\n", LookupTableFileName);
901 printf("Reversed Look-Up Table file : %s\n", RevLookupTableFileName);
903 printf("High Occ Hash Table file : %s\n", HighOccHashTableFileName);
908 void PrintShortDesc() {
910 /*printf("BWTFormatdb v1.0, Copyright (C) 2006, Wong Chi Kwong.\n");
911 printf("BWTFormatdb comes with ABSOLUTELY NO WARRENTY.\n");
912 printf("BWTFormatdb is free software, and you are welcome to\n");
913 printf("redistribute it under certain conditions.\n");
914 printf("For details type BWTFormatdb.\n");
921 /*printf("BWTFormatdb v1.0, Copyright (C) 2006, Wong Chi Kwong.\n");
924 printf("This program is free software; you can redistribute it and/or\n");
925 printf("modify it under the terms of the GNU General Public License\n");
926 printf("as published by the Free Software Foundation; either version 2\n");
927 printf("of the License, or (at your option) any later version.\n");
930 printf("This program is distributed in the hope that it will be useful,\n");
931 printf("but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
932 printf("MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
933 printf("GNU General Public License for more details.\n");
936 printf("You should have received a copy of the GNU General Public License\n");
937 printf("along with this program; if not, write to the Free Software\n");
938 printf("Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n");
941 printf("Syntax: 2bwt-builder <sequence file>\n");
942 printf("Compile Time: " COMPILE_TIME
"\n");
945 void ProcessFileName(char *outputFileName
, const char *inputFileName
, const char *databaseName
) {
947 char tempChar
[MAX_FILENAME_LEN
];
950 if (inputFileName
== NULL
) {
951 if (outputFileName
!= inputFileName
) {
952 outputFileName
[0] = '\0';
957 if (strlen(databaseName
) + strlen(inputFileName
) > MAX_FILENAME_LEN
) {
958 fprintf(stderr
, "File length is too long!\n");
962 strncpy(tempChar
, inputFileName
, MAX_FILENAME_LEN
);
965 for (i
=0; i
<MAX_FILENAME_LEN
; i
++) {
966 if (tempChar
[i
] == '*') {
970 if (i
<MAX_FILENAME_LEN
) {
972 sprintf(outputFileName
, "%s%s%s", tempChar
, databaseName
, tempChar
+ i
+ 1);
974 sprintf(outputFileName
, "%s", tempChar
);