modified: makefile
[GalaxyCodeBases.git] / BGI / soap_src / soap_builder / BWTFormatdb.c
blobc0e12a8ce76a81420c3c4d929305ac2fd51a2926
1 /*
3 BWTFormatdb.c Build index for FASTA database
5 This program builds index for FASTA database for use of BWTBlastn.
7 Copyright (C) 2006, Wong Chi Kwong.
9 This program is free software; you can redistribute it and/or
10 modify it under the terms of the GNU General Public License
11 as published by the Free Software Foundation; either version 2
12 of the License, or (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include "TypeNLimit.h"
28 #include "BWTConstruct.h"
29 #include "MiscUtilities.h"
30 #include "DNACount.h"
31 #include "TextConverter.h"
32 #include "MemManager.h"
33 #include "iniparser.h"
34 #include "HSP.h"
35 #include "Timing.h"
37 // Database and ini
38 dictionary *ParseInput(int argc, char** argv);
39 void ParseIniFile(char *iniFileName);
40 void ProcessIni();
41 void ValidateIni();
42 void PrintIni();
43 void PrintShortDesc();
44 void PrintHelp();
46 void ProcessFileName(char *outputFileName, const char *inputFileName, const char *databaseName);
49 // Parameters
50 char IniFileName[MAX_FILENAME_LEN+1];
51 int Confirmation;
53 // BuildTasks parameters
54 int ParseFASTA = TRUE;
55 int BuildBWT = TRUE;
56 int BuildSaValue = TRUE;
57 int BuildSaIndex = TRUE;
59 // Memory parameters
60 unsigned int PoolSize = 2097152; // 2M - fixed; not configurable through ini
62 // Display parameters
63 int ShowProgress = FALSE;
65 // Database parameters
66 char FASTAFileName[MAX_FILENAME_LEN+1] = "";
67 char DatabaseName[MAX_FILENAME_LEN+1] = "";
68 char AnnotationFileName[MAX_FILENAME_LEN+1] = "*.ann";
69 char AmbiguityFileName[MAX_FILENAME_LEN+1] = "*.amb";
70 char PackedDNAFileName[MAX_FILENAME_LEN+1] = "*.pac";
71 char BWTCodeFileName[MAX_FILENAME_LEN+1] = "*.bwt";
72 char BWTOccValueFileName[MAX_FILENAME_LEN+1] = "*.fmv";
73 char SaValueFileName[MAX_FILENAME_LEN+1] = "*.sa";
74 char SaIndexFileName[MAX_FILENAME_LEN+1] = "*.sai";
76 // Parse FASTA parameters
77 unsigned int FASTARandomSeed = 0;
78 int MaskLowerCase = FALSE;
80 // Build BWT parameters
81 unsigned int OccValueFreq = 256;
82 float TargetNBit = 2.5;
83 unsigned int InitialMaxBuildSize = 10000000;
84 unsigned int IncMaxBuildSize = 10000000;
86 // Build SA value parameters
87 unsigned int SaValueFreq = 8;
89 // Build SA index parameters
90 unsigned int SaIndexNumOfChar = 12;
94 int main(int argc, char** argv) {
96 char c;
97 MMPool *mmPool;
98 dictionary *programInput;
99 double startTime;
100 double elapsedTime = 0, totalElapsedTime = 0;
102 char filename[MAX_FILENAME_LEN+1];
103 BWT *bwt = NULL;
104 unsigned int textLength = 0;
105 unsigned int numSeq;
107 BWTInc *bwtInc = NULL;
109 // Program input
110 programInput = ParseInput(argc, argv);
111 PrintShortDesc();
113 // Ini
114 if (strcmp(argv[0] + strlen(argv[0]) - 4, ".exe") == 0) {
115 *(argv[0] + strlen(argv[0]) - 4) = '\0';
117 sprintf(filename, "%s.ini", argv[0]);
118 ParseIniFile(filename);
119 printf("\n");
120 ProcessIni();
121 ValidateIni();
122 PrintIni();
124 if (Confirmation == TRUE) {
125 printf("Press Y to go or N to cancel. ");
126 c = (char)getchar();
127 while (c != 'y' && c != 'Y' && c != 'n' && c!= 'N') {
128 c = (char)getchar();
130 if (c == 'n' || c == 'N') {
131 exit(0);
135 startTime = setStartTime();
137 MMMasterInitialize(1, 0, FALSE, NULL);
138 mmPool = MMPoolCreate(PoolSize);
140 // Parse FASTA file to produce packed DNA and annotation file
141 if (ParseFASTA == TRUE) {
143 printf("Parsing FASTA file..\n");
145 numSeq = HSPParseFASTAToPacked(FASTAFileName, AnnotationFileName, PackedDNAFileName, AmbiguityFileName, FASTARandomSeed, MaskLowerCase);
147 printf("Finished. Parsed %u sequences.\n", numSeq);
149 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
150 printf("Elapsed time = ");
151 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
152 totalElapsedTime += elapsedTime;
153 printf("\n");
157 // Construct BWTInc from text
158 if (BuildBWT == TRUE) {
160 printf("Building BWT..\n");
162 bwtInc = BWTIncConstructFromPacked(mmPool, PackedDNAFileName, ShowProgress,
163 TargetNBit, InitialMaxBuildSize, IncMaxBuildSize);
165 printf("Finished constructing BWT in %u iterations. ", bwtInc->numberOfIterationDone);
166 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
167 printf("Elapsed time = ");
168 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
169 totalElapsedTime += elapsedTime;
170 printf("\n");
172 printf("Saving BWT..\n");
173 BWTSaveBwtCodeAndOcc(bwtInc->bwt, BWTCodeFileName, BWTOccValueFileName);
174 printf("Finished saving BWT. ");
175 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
176 printf("Elapsed time = ");
177 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
178 totalElapsedTime += elapsedTime;
179 printf("\n");
181 textLength = bwtInc->bwt->textLength;
183 BWTIncFree(mmPool, bwtInc);
187 // Load BWT
188 if (BuildSaValue || BuildSaIndex) {
190 printf("Loading BWT...\n");
192 bwt = BWTLoad(mmPool, BWTCodeFileName, BWTOccValueFileName, NULL, NULL, NULL, NULL);
194 printf("Finished loading BWT. ");
196 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
197 printf("Elapsed time = ");
198 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
199 totalElapsedTime += elapsedTime;
200 printf("\n");
202 textLength = bwt->textLength;
206 if (BuildSaValue) {
208 printf("Building SA value...\n");
210 if (ShowProgress) {
211 BWTGenerateSaValue(mmPool, bwt, SaValueFreq, bwt->textLength / SaValueFreq / 10);
212 } else {
213 BWTGenerateSaValue(mmPool, bwt, SaValueFreq, 0);
215 BWTSaveSaValue(bwt, SaValueFileName);
217 printf("Finished building SA value. ");
219 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
220 printf("Elapsed time = ");
221 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
222 totalElapsedTime += elapsedTime;
223 printf("\n");
227 if (BuildSaIndex) {
229 printf("Building SA index...\n");
231 BWTGenerateSaRangeTable(bwt, SaIndexNumOfChar, SaIndexFileName);
233 printf("Finished building SA index. ");
235 elapsedTime = getElapsedTime(startTime) - totalElapsedTime;
236 printf("Elapsed time = ");
237 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, elapsedTime);
238 totalElapsedTime += elapsedTime;
239 printf("\n");
243 // Free BWT
244 if (BuildSaValue || BuildSaIndex) {
245 BWTFree(mmPool, bwt);
248 // Finished all construction tasks
249 printf("Finished all tasks. ");
250 totalElapsedTime = getElapsedTime(startTime);
251 printf("Total elapsed time = ");
252 printElapsedTime(stdout, FALSE, FALSE, TRUE, 2, totalElapsedTime);
253 printf("\n");
255 MMMasterPrintReport(stdout, FALSE, FALSE, FALSE);
256 if (BuildSaValue) {
257 fprintf(stdout, "Number of char : %u\n", textLength);
258 fprintf(stdout, "Bit per char : %.2f\n", (float)MMMasterMaxTotalByteDispatched() * BITS_IN_BYTE / textLength);
259 printf("\n");
262 MMPoolFree(mmPool);
264 iniparser_freedict(programInput);
266 return 0;
270 dictionary *ParseInput(int argc, char** argv) {
272 dictionary *programInput;
273 char t1[3] = "-c"; // specify that this is a boolean type parameter
274 char t2[3] = "-U"; // specify that this is a boolean type parameter
275 char *d[2];
277 d[0] = t1;
278 d[1] = t2;
280 programInput = paraparser_load(argc, argv, 2, d); // 2 boolean type parameters
282 // Get database name
283 if (!iniparser_find_entry(programInput, "argument:1")) {
284 PrintHelp();
285 exit(1);
287 iniparser_copystring(programInput, "argument:1", DatabaseName, DatabaseName, MAX_FILENAME_LEN);
288 if (strlen(DatabaseName) + 4 > MAX_FILENAME_LEN) {
289 PrintHelp();
290 exit(1);
293 // Get FASTA file name
294 iniparser_copystring(programInput, "argument:2", FASTAFileName, DatabaseName, MAX_FILENAME_LEN);
295 if (strlen(FASTAFileName) > MAX_FILENAME_LEN) {
296 PrintHelp();
297 exit(1);
301 // Whether confirmation is needed
302 Confirmation = iniparser_find_entry(programInput, "parameter:-c");
304 MaskLowerCase = iniparser_find_entry(programInput, "parameter:-U");
306 return programInput;
310 void ParseIniFile(char *iniFileName) {
312 dictionary *ini;
314 printf("Loading %s ..", iniFileName);
315 ini = iniparser_load(iniFileName, FALSE);
316 if (ini == NULL) {
317 printf("not found.\n");
318 return;
320 printf("done.\n");
322 // BuildTasks parameters
323 ParseFASTA = iniparser_getboolean(ini, "BuildTasks:ParseFASTA", ParseFASTA);
324 BuildBWT = iniparser_getboolean(ini, "BuildTasks:BuildBWT", BuildBWT);
325 BuildSaValue = iniparser_getboolean(ini, "BuildTasks:BuildSaValue", BuildSaValue);
326 BuildSaIndex = iniparser_getboolean(ini, "BuildTasks:BuildSaIndex", BuildSaIndex);
328 // Display parameters
329 ShowProgress = iniparser_getboolean(ini, "Display:ShowProgress", ShowProgress);
331 // Parse FASTA parameters
332 FASTARandomSeed = iniparser_getint(ini, "ParseFASTA:RandomSeed", FASTARandomSeed);
333 if (FASTARandomSeed == 0) {
334 FASTARandomSeed = getRandomSeed();
337 // Build BWT parameters
338 OccValueFreq = iniparser_getint(ini, "BuildBWT:OccValueFreq", OccValueFreq);
339 TargetNBit = (float)iniparser_getdouble(ini, "BuildBWT:TargetNBit", TargetNBit);
340 InitialMaxBuildSize = iniparser_getint(ini, "BuildBWT:InitialMaxBuildSize", InitialMaxBuildSize);
341 IncMaxBuildSize = iniparser_getint(ini, "BuildBWT:IncMaxBuildSize", IncMaxBuildSize);
343 // Build SA value parameters
344 SaValueFreq = iniparser_getint(ini, "BuildSAValue:SaValueFreq", SaValueFreq);
346 // Build SA index parameters
347 SaIndexNumOfChar = iniparser_getint(ini, "BuildSAIndex:SaIndexNumOfChar", SaIndexNumOfChar);
349 // Database parameters
350 iniparser_copystring(ini, "Database:AnnotationFileName", AnnotationFileName, AnnotationFileName, MAX_FILENAME_LEN);
351 iniparser_copystring(ini, "Database:AmbiguityFileName", AmbiguityFileName, AmbiguityFileName, MAX_FILENAME_LEN);
352 iniparser_copystring(ini, "Database:PackedDNAFileName", PackedDNAFileName, PackedDNAFileName, MAX_FILENAME_LEN);
353 iniparser_copystring(ini, "Database:BWTCodeFileName", BWTCodeFileName, BWTCodeFileName, MAX_FILENAME_LEN);
354 iniparser_copystring(ini, "Database:BWTOccValueFileName", BWTOccValueFileName, BWTOccValueFileName, MAX_FILENAME_LEN);
355 iniparser_copystring(ini, "Database:SaValueFileName", SaValueFileName, SaValueFileName, MAX_FILENAME_LEN);
356 iniparser_copystring(ini, "Database:SaIndexFileName", SaIndexFileName, SaIndexFileName, MAX_FILENAME_LEN);
358 iniparser_freedict(ini);
362 void ProcessIni() {
364 ProcessFileName(AnnotationFileName, AnnotationFileName, DatabaseName);
365 ProcessFileName(AmbiguityFileName, AmbiguityFileName, DatabaseName);
366 ProcessFileName(PackedDNAFileName, PackedDNAFileName, DatabaseName);
367 ProcessFileName(BWTCodeFileName, BWTCodeFileName, DatabaseName);
368 ProcessFileName(BWTOccValueFileName, BWTOccValueFileName, DatabaseName);
369 ProcessFileName(SaValueFileName, SaValueFileName, DatabaseName);
370 ProcessFileName(SaIndexFileName, SaIndexFileName, DatabaseName);
374 void ValidateIni() {
376 if (!ParseFASTA && !BuildBWT && !BuildSaValue && !BuildSaIndex) {
377 fprintf(stderr, "No action is specified!\n");
378 exit(1);
380 if (ParseFASTA) {
381 if (PackedDNAFileName[0] == '\0') {
382 fprintf(stderr, "Packed DNA file name is not specified!\n");
383 exit(1);
385 if (AnnotationFileName[0] == '\0') {
386 fprintf(stderr, "Annotation file name is not specified!\n");
387 exit(1);
389 if (AmbiguityFileName[0] == '\0') {
390 fprintf(stderr, "Ambiguity file name is not specified!\n");
391 exit(1);
394 if (BuildBWT) {
395 if (PackedDNAFileName[0] == '\0') {
396 fprintf(stderr, "Packed DNA file is not specified!\n");
397 exit(1);
399 if (BWTCodeFileName[0] == '\0') {
400 fprintf(stderr, "BWT code file name is not specified!\n");
401 exit(1);
403 if (BWTOccValueFileName[0] == '\0') {
404 fprintf(stderr, "BWT Occ value file name is not specified!\n");
405 exit(1);
407 if (TargetNBit < 2.5) {
408 fprintf(stderr, "Target NBit should be at least 2.5!\n");
409 exit(1);
412 if (BuildSaValue) {
413 if (BWTCodeFileName[0] == '\0') {
414 fprintf(stderr, "BWT code file is not specified!\n");
415 exit(1);
417 if (BWTOccValueFileName[0] == '\0') {
418 fprintf(stderr, "BWT Occ value file is not specified!\n");
419 exit(1);
421 if (SaValueFileName[0] == '\0') {
422 fprintf(stderr, "SA value file name is not specified!\n");
423 exit(1);
425 if (SaValueFreq <= 0) {
426 fprintf(stderr, "SA value frequency must > 0!\n");
427 exit(1);
431 if (BuildSaIndex) {
432 if (BWTCodeFileName[0] == '\0') {
433 fprintf(stderr, "BWT code file is not specified!\n");
434 exit(1);
436 if (BWTOccValueFileName[0] == '\0') {
437 fprintf(stderr, "BWT Occ value file is not specified!\n");
438 exit(1);
440 if (SaIndexFileName[0] == '\0') {
441 fprintf(stderr, "SA index file name is not specified!\n");
442 exit(1);
444 if (SaIndexNumOfChar <= 0) {
445 fprintf(stderr, "SA index number of character must > 0!\n");
446 exit(1);
448 if (SaIndexNumOfChar > 13) {
449 fprintf(stderr, "SA index number of character must <= 13!\n");
450 exit(1);
457 void PrintIni() {
459 char boolean[2];
461 boolean[0] = 'N';
462 boolean[1] = 'Y';
464 printf("Parse FASTA file : %c\n", boolean[ParseFASTA]);
465 printf("Build BWT : %c\n", boolean[BuildBWT]);
466 printf("Build SA value : %c\n", boolean[BuildSaValue]);
467 printf("Build SA index : %c\n", boolean[BuildSaIndex]);
468 printf("\n");
470 printf("Show progress : %c\n", boolean[ShowProgress]);
471 printf("\n");
473 if (ParseFASTA) {
474 printf("Parse FASTA :\n");
475 printf("Mask lower case : %c\n", boolean[MaskLowerCase]);
476 printf("Random seed : %u\n", FASTARandomSeed);
477 printf("\n");
480 if (BuildBWT) {
481 printf("Build BWT :\n");
482 printf("Target N Bits : %.2f\n", TargetNBit);
483 printf("Occ value frequency : %u\n", OccValueFreq);
484 printf("Initial Max Build Size : %u Inc Max Build Size : %u\n",
485 InitialMaxBuildSize, IncMaxBuildSize);
486 printf("\n");
489 if (BuildSaValue) {
490 printf("Build SA value :\n");
491 printf("SA value frequency : %u\n", SaValueFreq);
492 printf("\n");
495 if (BuildSaIndex) {
496 printf("Build SA index :\n");
497 printf("SA index no. of char : %u\n", SaIndexNumOfChar);
498 printf("\n");
501 printf("Annotation file : %s\n", AnnotationFileName);
502 printf("Ambigurity file : %s\n", AmbiguityFileName);
503 printf("Packed DNA file : %s\n", PackedDNAFileName);
504 printf("BWT Code file : %s\n", BWTCodeFileName);
505 printf("BWT Occ value file : %s\n", BWTOccValueFileName);
506 printf("SA value file : %s\n", SaValueFileName);
507 printf("SA index file : %s\n", SaIndexFileName);
508 printf("\n");
512 void PrintShortDesc() {
514 printf("BWTFormatdb v1.0, Copyright (C) 2006, Wong Chi Kwong.\n");
515 printf("BWTFormatdb comes with ABSOLUTELY NO WARRENTY.\n");
516 printf("BWTFormatdb is free software, and you are welcome to\n");
517 printf("redistribute it under certain conditions.\n");
518 printf("For details type BWTFormatdb.\n");
519 printf("\n");
523 void PrintHelp() {
525 printf("BWTFormatdb v1.0, Copyright (C) 2006, Wong Chi Kwong.\n");
526 printf("\n");
528 printf("This program is free software; you can redistribute it and/or\n");
529 printf("modify it under the terms of the GNU General Public License\n");
530 printf("as published by the Free Software Foundation; either version 2\n");
531 printf("of the License, or (at your option) any later version.\n");
532 printf("\n");
534 printf("This program is distributed in the hope that it will be useful,\n");
535 printf("but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
536 printf("MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n");
537 printf("GNU General Public License for more details.\n");
538 printf("\n");
540 printf("You should have received a copy of the GNU General Public License\n");
541 printf("along with this program; if not, write to the Free Software\n");
542 printf("Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n");
543 printf("\n");
545 printf("Syntax: BWTFormatdb <database> [FASTA file; default = <database>]\n");
546 printf(" [-c Confirm]\n");
547 printf(" [-U Mask lower case]\n");
551 void ProcessFileName(char *outputFileName, const char *inputFileName, const char *databaseName) {
553 char tempChar[MAX_FILENAME_LEN];
554 unsigned int i;
556 if (inputFileName == NULL) {
557 if (outputFileName != inputFileName) {
558 outputFileName[0] = '\0';
560 return;
563 if (strlen(databaseName) + strlen(inputFileName) > MAX_FILENAME_LEN) {
564 fprintf(stderr, "File length is too long!\n");
565 exit(1);
568 strncpy(tempChar, inputFileName, MAX_FILENAME_LEN);
570 // locate the *
571 for (i=0; i<MAX_FILENAME_LEN; i++) {
572 if (tempChar[i] == '*') {
573 break;
576 if (i<MAX_FILENAME_LEN) {
577 tempChar[i] = '\0';
578 sprintf(outputFileName, "%s%s%s", tempChar, databaseName, tempChar + i + 1);
579 } else {
580 sprintf(outputFileName, "%s", tempChar);