3 BWTFormatdb.c Build index for FASTA database
5 This program builds index for FASTA database for use of BWTBlastn.
7 Copyright (C) 2006, Wong Chi Kwong.
9 This program is free software; you can redistribute it and/or
10 modify it under the terms of the GNU General Public License
11 as published by the Free Software Foundation; either version 2
12 of the License, or (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
27 #include "TypeNLimit.h"
28 #include "BWTConstruct.h"
29 #include "MiscUtilities.h"
31 #include "TextConverter.h"
32 #include "MemManager.h"
33 #include "iniparser.h"
38 dictionary
*ParseInput(int argc
, char** argv
);
39 void ParseIniFile(char *iniFileName
);
43 void PrintShortDesc();
46 void ProcessFileName(char *outputFileName
, const char *inputFileName
, const char *databaseName
);
50 char IniFileName
[MAX_FILENAME_LEN
+1];
53 // BuildTasks parameters
54 int ParseFASTA
= TRUE
;
56 int BuildSaValue
= TRUE
;
57 int BuildSaIndex
= TRUE
;
60 unsigned int PoolSize
= 2097152; // 2M - fixed; not configurable through ini
63 int ShowProgress
= FALSE
;
65 // Database parameters
66 char FASTAFileName
[MAX_FILENAME_LEN
+1] = "";
67 char DatabaseName
[MAX_FILENAME_LEN
+1] = "";
68 char AnnotationFileName
[MAX_FILENAME_LEN
+1] = "*.ann";
69 char AmbiguityFileName
[MAX_FILENAME_LEN
+1] = "*.amb";
70 char PackedDNAFileName
[MAX_FILENAME_LEN
+1] = "*.pac";
71 char BWTCodeFileName
[MAX_FILENAME_LEN
+1] = "*.bwt";
72 char BWTOccValueFileName
[MAX_FILENAME_LEN
+1] = "*.fmv";
73 char SaValueFileName
[MAX_FILENAME_LEN
+1] = "*.sa";
74 char SaIndexFileName
[MAX_FILENAME_LEN
+1] = "*.sai";
76 // Parse FASTA parameters
77 unsigned int FASTARandomSeed
= 0;
78 int MaskLowerCase
= FALSE
;
80 // Build BWT parameters
81 unsigned int OccValueFreq
= 256;
82 float TargetNBit
= 2.5;
83 unsigned int InitialMaxBuildSize
= 10000000;
84 unsigned int IncMaxBuildSize
= 10000000;
86 // Build SA value parameters
87 unsigned int SaValueFreq
= 8;
89 // Build SA index parameters
90 unsigned int SaIndexNumOfChar
= 12;
94 int main(int argc
, char** argv
) {
98 dictionary
*programInput
;
100 double elapsedTime
= 0, totalElapsedTime
= 0;
102 char filename
[MAX_FILENAME_LEN
+1];
104 unsigned int textLength
= 0;
107 BWTInc
*bwtInc
= NULL
;
110 programInput
= ParseInput(argc
, argv
);
114 if (strcmp(argv
[0] + strlen(argv
[0]) - 4, ".exe") == 0) {
115 *(argv
[0] + strlen(argv
[0]) - 4) = '\0';
117 sprintf(filename
, "%s.ini", argv
[0]);
118 ParseIniFile(filename
);
124 if (Confirmation
== TRUE
) {
125 printf("Press Y to go or N to cancel. ");
127 while (c
!= 'y' && c
!= 'Y' && c
!= 'n' && c
!= 'N') {
130 if (c
== 'n' || c
== 'N') {
135 startTime
= setStartTime();
137 MMMasterInitialize(1, 0, FALSE
, NULL
);
138 mmPool
= MMPoolCreate(PoolSize
);
140 // Parse FASTA file to produce packed DNA and annotation file
141 if (ParseFASTA
== TRUE
) {
143 printf("Parsing FASTA file..\n");
145 numSeq
= HSPParseFASTAToPacked(FASTAFileName
, AnnotationFileName
, PackedDNAFileName
, AmbiguityFileName
, FASTARandomSeed
, MaskLowerCase
);
147 printf("Finished. Parsed %u sequences.\n", numSeq
);
149 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
150 printf("Elapsed time = ");
151 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
152 totalElapsedTime
+= elapsedTime
;
157 // Construct BWTInc from text
158 if (BuildBWT
== TRUE
) {
160 printf("Building BWT..\n");
162 bwtInc
= BWTIncConstructFromPacked(mmPool
, PackedDNAFileName
, ShowProgress
,
163 TargetNBit
, InitialMaxBuildSize
, IncMaxBuildSize
);
165 printf("Finished constructing BWT in %u iterations. ", bwtInc
->numberOfIterationDone
);
166 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
167 printf("Elapsed time = ");
168 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
169 totalElapsedTime
+= elapsedTime
;
172 printf("Saving BWT..\n");
173 BWTSaveBwtCodeAndOcc(bwtInc
->bwt
, BWTCodeFileName
, BWTOccValueFileName
);
174 printf("Finished saving BWT. ");
175 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
176 printf("Elapsed time = ");
177 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
178 totalElapsedTime
+= elapsedTime
;
181 textLength
= bwtInc
->bwt
->textLength
;
183 BWTIncFree(mmPool
, bwtInc
);
188 if (BuildSaValue
|| BuildSaIndex
) {
190 printf("Loading BWT...\n");
192 bwt
= BWTLoad(mmPool
, BWTCodeFileName
, BWTOccValueFileName
, NULL
, NULL
, NULL
, NULL
);
194 printf("Finished loading BWT. ");
196 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
197 printf("Elapsed time = ");
198 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
199 totalElapsedTime
+= elapsedTime
;
202 textLength
= bwt
->textLength
;
208 printf("Building SA value...\n");
211 BWTGenerateSaValue(mmPool
, bwt
, SaValueFreq
, bwt
->textLength
/ SaValueFreq
/ 10);
213 BWTGenerateSaValue(mmPool
, bwt
, SaValueFreq
, 0);
215 BWTSaveSaValue(bwt
, SaValueFileName
);
217 printf("Finished building SA value. ");
219 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
220 printf("Elapsed time = ");
221 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
222 totalElapsedTime
+= elapsedTime
;
229 printf("Building SA index...\n");
231 BWTGenerateSaRangeTable(bwt
, SaIndexNumOfChar
, SaIndexFileName
);
233 printf("Finished building SA index. ");
235 elapsedTime
= getElapsedTime(startTime
) - totalElapsedTime
;
236 printf("Elapsed time = ");
237 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, elapsedTime
);
238 totalElapsedTime
+= elapsedTime
;
244 if (BuildSaValue
|| BuildSaIndex
) {
245 BWTFree(mmPool
, bwt
);
248 // Finished all construction tasks
249 printf("Finished all tasks. ");
250 totalElapsedTime
= getElapsedTime(startTime
);
251 printf("Total elapsed time = ");
252 printElapsedTime(stdout
, FALSE
, FALSE
, TRUE
, 2, totalElapsedTime
);
255 MMMasterPrintReport(stdout
, FALSE
, FALSE
, FALSE
);
257 fprintf(stdout
, "Number of char : %u\n", textLength
);
258 fprintf(stdout
, "Bit per char : %.2f\n", (float)MMMasterMaxTotalByteDispatched() * BITS_IN_BYTE
/ textLength
);
264 iniparser_freedict(programInput
);
270 dictionary
*ParseInput(int argc
, char** argv
) {
272 dictionary
*programInput
;
273 char t1
[3] = "-c"; // specify that this is a boolean type parameter
274 char t2
[3] = "-U"; // specify that this is a boolean type parameter
280 programInput
= paraparser_load(argc
, argv
, 2, d
); // 2 boolean type parameters
283 if (!iniparser_find_entry(programInput
, "argument:1")) {
287 iniparser_copystring(programInput
, "argument:1", DatabaseName
, DatabaseName
, MAX_FILENAME_LEN
);
288 if (strlen(DatabaseName
) + 4 > MAX_FILENAME_LEN
) {
293 // Get FASTA file name
294 iniparser_copystring(programInput
, "argument:2", FASTAFileName
, DatabaseName
, MAX_FILENAME_LEN
);
295 if (strlen(FASTAFileName
) > MAX_FILENAME_LEN
) {
301 // Whether confirmation is needed
302 Confirmation
= iniparser_find_entry(programInput
, "parameter:-c");
304 MaskLowerCase
= iniparser_find_entry(programInput
, "parameter:-U");
310 void ParseIniFile(char *iniFileName
) {
314 printf("Loading %s ..", iniFileName
);
315 ini
= iniparser_load(iniFileName
, FALSE
);
317 printf("not found.\n");
322 // BuildTasks parameters
323 ParseFASTA
= iniparser_getboolean(ini
, "BuildTasks:ParseFASTA", ParseFASTA
);
324 BuildBWT
= iniparser_getboolean(ini
, "BuildTasks:BuildBWT", BuildBWT
);
325 BuildSaValue
= iniparser_getboolean(ini
, "BuildTasks:BuildSaValue", BuildSaValue
);
326 BuildSaIndex
= iniparser_getboolean(ini
, "BuildTasks:BuildSaIndex", BuildSaIndex
);
328 // Display parameters
329 ShowProgress
= iniparser_getboolean(ini
, "Display:ShowProgress", ShowProgress
);
331 // Parse FASTA parameters
332 FASTARandomSeed
= iniparser_getint(ini
, "ParseFASTA:RandomSeed", FASTARandomSeed
);
333 if (FASTARandomSeed
== 0) {
334 FASTARandomSeed
= getRandomSeed();
337 // Build BWT parameters
338 OccValueFreq
= iniparser_getint(ini
, "BuildBWT:OccValueFreq", OccValueFreq
);
339 TargetNBit
= (float)iniparser_getdouble(ini
, "BuildBWT:TargetNBit", TargetNBit
);
340 InitialMaxBuildSize
= iniparser_getint(ini
, "BuildBWT:InitialMaxBuildSize", InitialMaxBuildSize
);
341 IncMaxBuildSize
= iniparser_getint(ini
, "BuildBWT:IncMaxBuildSize", IncMaxBuildSize
);
343 // Build SA value parameters
344 SaValueFreq
= iniparser_getint(ini
, "BuildSAValue:SaValueFreq", SaValueFreq
);
346 // Build SA index parameters
347 SaIndexNumOfChar
= iniparser_getint(ini
, "BuildSAIndex:SaIndexNumOfChar", SaIndexNumOfChar
);
349 // Database parameters
350 iniparser_copystring(ini
, "Database:AnnotationFileName", AnnotationFileName
, AnnotationFileName
, MAX_FILENAME_LEN
);
351 iniparser_copystring(ini
, "Database:AmbiguityFileName", AmbiguityFileName
, AmbiguityFileName
, MAX_FILENAME_LEN
);
352 iniparser_copystring(ini
, "Database:PackedDNAFileName", PackedDNAFileName
, PackedDNAFileName
, MAX_FILENAME_LEN
);
353 iniparser_copystring(ini
, "Database:BWTCodeFileName", BWTCodeFileName
, BWTCodeFileName
, MAX_FILENAME_LEN
);
354 iniparser_copystring(ini
, "Database:BWTOccValueFileName", BWTOccValueFileName
, BWTOccValueFileName
, MAX_FILENAME_LEN
);
355 iniparser_copystring(ini
, "Database:SaValueFileName", SaValueFileName
, SaValueFileName
, MAX_FILENAME_LEN
);
356 iniparser_copystring(ini
, "Database:SaIndexFileName", SaIndexFileName
, SaIndexFileName
, MAX_FILENAME_LEN
);
358 iniparser_freedict(ini
);
364 ProcessFileName(AnnotationFileName
, AnnotationFileName
, DatabaseName
);
365 ProcessFileName(AmbiguityFileName
, AmbiguityFileName
, DatabaseName
);
366 ProcessFileName(PackedDNAFileName
, PackedDNAFileName
, DatabaseName
);
367 ProcessFileName(BWTCodeFileName
, BWTCodeFileName
, DatabaseName
);
368 ProcessFileName(BWTOccValueFileName
, BWTOccValueFileName
, DatabaseName
);
369 ProcessFileName(SaValueFileName
, SaValueFileName
, DatabaseName
);
370 ProcessFileName(SaIndexFileName
, SaIndexFileName
, DatabaseName
);
376 if (!ParseFASTA
&& !BuildBWT
&& !BuildSaValue
&& !BuildSaIndex
) {
377 fprintf(stderr
, "No action is specified!\n");
381 if (PackedDNAFileName
[0] == '\0') {
382 fprintf(stderr
, "Packed DNA file name is not specified!\n");
385 if (AnnotationFileName
[0] == '\0') {
386 fprintf(stderr
, "Annotation file name is not specified!\n");
389 if (AmbiguityFileName
[0] == '\0') {
390 fprintf(stderr
, "Ambiguity file name is not specified!\n");
395 if (PackedDNAFileName
[0] == '\0') {
396 fprintf(stderr
, "Packed DNA file is not specified!\n");
399 if (BWTCodeFileName
[0] == '\0') {
400 fprintf(stderr
, "BWT code file name is not specified!\n");
403 if (BWTOccValueFileName
[0] == '\0') {
404 fprintf(stderr
, "BWT Occ value file name is not specified!\n");
407 if (TargetNBit
< 2.5) {
408 fprintf(stderr
, "Target NBit should be at least 2.5!\n");
413 if (BWTCodeFileName
[0] == '\0') {
414 fprintf(stderr
, "BWT code file is not specified!\n");
417 if (BWTOccValueFileName
[0] == '\0') {
418 fprintf(stderr
, "BWT Occ value file is not specified!\n");
421 if (SaValueFileName
[0] == '\0') {
422 fprintf(stderr
, "SA value file name is not specified!\n");
425 if (SaValueFreq
<= 0) {
426 fprintf(stderr
, "SA value frequency must > 0!\n");
432 if (BWTCodeFileName
[0] == '\0') {
433 fprintf(stderr
, "BWT code file is not specified!\n");
436 if (BWTOccValueFileName
[0] == '\0') {
437 fprintf(stderr
, "BWT Occ value file is not specified!\n");
440 if (SaIndexFileName
[0] == '\0') {
441 fprintf(stderr
, "SA index file name is not specified!\n");
444 if (SaIndexNumOfChar
<= 0) {
445 fprintf(stderr
, "SA index number of character must > 0!\n");
448 if (SaIndexNumOfChar
> 13) {
449 fprintf(stderr
, "SA index number of character must <= 13!\n");
464 printf("Parse FASTA file : %c\n", boolean
[ParseFASTA
]);
465 printf("Build BWT : %c\n", boolean
[BuildBWT
]);
466 printf("Build SA value : %c\n", boolean
[BuildSaValue
]);
467 printf("Build SA index : %c\n", boolean
[BuildSaIndex
]);
470 printf("Show progress : %c\n", boolean
[ShowProgress
]);
474 printf("Parse FASTA :\n");
475 printf("Mask lower case : %c\n", boolean
[MaskLowerCase
]);
476 printf("Random seed : %u\n", FASTARandomSeed
);
481 printf("Build BWT :\n");
482 printf("Target N Bits : %.2f\n", TargetNBit
);
483 printf("Occ value frequency : %u\n", OccValueFreq
);
484 printf("Initial Max Build Size : %u Inc Max Build Size : %u\n",
485 InitialMaxBuildSize
, IncMaxBuildSize
);
490 printf("Build SA value :\n");
491 printf("SA value frequency : %u\n", SaValueFreq
);
496 printf("Build SA index :\n");
497 printf("SA index no. of char : %u\n", SaIndexNumOfChar
);
501 printf("Annotation file : %s\n", AnnotationFileName
);
502 printf("Ambigurity file : %s\n", AmbiguityFileName
);
503 printf("Packed DNA file : %s\n", PackedDNAFileName
);
504 printf("BWT Code file : %s\n", BWTCodeFileName
);
505 printf("BWT Occ value file : %s\n", BWTOccValueFileName
);
506 printf("SA value file : %s\n", SaValueFileName
);
507 printf("SA index file : %s\n", SaIndexFileName
);
512 void PrintShortDesc() {
514 printf("BWTFormatdb v1.0, Copyright (C) 2006, Wong Chi Kwong.\n");
515 printf("BWTFormatdb comes with ABSOLUTELY NO WARRENTY.\n");
516 printf("BWTFormatdb is free software, and you are welcome to\n");
517 printf("redistribute it under certain conditions.\n");
518 printf("For details type BWTFormatdb.\n");
525 printf("BWTFormatdb v1.0, Copyright (C) 2006, Wong Chi Kwong.\n");
528 printf("This program is free software; you can redistribute it and/or\n");
529 printf("modify it under the terms of the GNU General Public License\n");
530 printf("as published by the Free Software Foundation; either version 2\n");
531 printf("of the License, or (at your option) any later version.\n");
534 printf("This program is distributed in the hope that it will be useful,\n");
535 printf("but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
536 printf("MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
537 printf("GNU General Public License for more details.\n");
540 printf("You should have received a copy of the GNU General Public License\n");
541 printf("along with this program; if not, write to the Free Software\n");
542 printf("Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n");
545 printf("Syntax: BWTFormatdb <database> [FASTA file; default = <database>]\n");
546 printf(" [-c Confirm]\n");
547 printf(" [-U Mask lower case]\n");
551 void ProcessFileName(char *outputFileName
, const char *inputFileName
, const char *databaseName
) {
553 char tempChar
[MAX_FILENAME_LEN
];
556 if (inputFileName
== NULL
) {
557 if (outputFileName
!= inputFileName
) {
558 outputFileName
[0] = '\0';
563 if (strlen(databaseName
) + strlen(inputFileName
) > MAX_FILENAME_LEN
) {
564 fprintf(stderr
, "File length is too long!\n");
568 strncpy(tempChar
, inputFileName
, MAX_FILENAME_LEN
);
571 for (i
=0; i
<MAX_FILENAME_LEN
; i
++) {
572 if (tempChar
[i
] == '*') {
576 if (i
<MAX_FILENAME_LEN
) {
578 sprintf(outputFileName
, "%s%s%s", tempChar
, databaseName
, tempChar
+ i
+ 1);
580 sprintf(outputFileName
, "%s", tempChar
);