modified: src1/worker.c
[GalaxyCodeBases.git] / c_cpp / readscorr / main.c
blob0e9f6ffed2cecd0d13f7cc40df51dc570585d4b8
1 #define _GNU_SOURCE
2 #include <stdlib.h> //EXIT_FAILURE
3 #include <stdio.h>
4 #include <stdint.h>
5 #include <string.h>
6 //#include <errno.h>
7 #include <err.h>
8 #include <argp.h>
9 #include <math.h>
10 #include "MurmurHash3.h"
11 #include "getch.h"
12 #include "2bitarray.h"
13 #include "gFileIO.h"
14 #include "bihash.h"
15 #include "chrseq.h"
16 #include "cfgparser.h"
17 #include "timer.h"
18 #include "gtools.h"
20 #define SUBARRAY_MAX 8192u
22 const char *argp_program_version =
23 "readscorr 0.1 @"__TIME__ "," __DATE__;
24 const char *argp_program_bug_address =
25 "<huxuesong@genomics.org.cn>";
27 /* Program documentation. */
28 static char doc[] =
29 "a Solexa Reads Kmer Counter using simplied D-Left hashing"
30 #ifdef DEBUG
31 " (Debug Version)"
32 #endif
33 #ifdef TEST
34 " (Test Version)"
35 #endif
38 /* A description of the arguments we accept. */
39 static char args_doc[] = "input_config (of nfo & FASTA or FASTQ files)";
41 /* The options we understand. */
42 static struct argp_option options[] = {
43 {"interactive", 'i', 0, 0, "Prompt arguments before procedure"},
44 {"verbose", 'v', 0, 0, "Produce verbose output" },
45 {"quiet", 'q', 0, 0, "Don't produce any output" },
46 //{"silent", 's', 0, OPTION_ALIAS },
47 {"outprefix",'o', "./out",0, "Output to [./out.{dat,stat,log}]" },
48 {"kmersize", 'k', "21", 0, "K-mer size" },
49 {"countbit",'c', "9", 0, "length of kmer freq. Count in bit" },
50 {"arrayklen",'a', "512", 0, "Size in kilo(x1024) for Simplied D-Left Array" },
51 {"subarray",'s', "32", 0, "Size of single sub array of the SDLA (mod " CAT(SDL_SUBARRAY_UNIT) ")" },
52 {"remainkeybit",'r', "31", 0, "length of SDLA Remain-Key in bit.\nWill elongate to align(8) with countbit." },
53 {"example", 'e', 0, 0, "OVERWRITE an example to [input_config]"},
54 { 0 }
57 /* Used by main to communicate with parse_opt. */
58 struct arguments {
59 char *args[1]; /* arg1 */
60 uint_fast8_t silent, verbose, interactive, writexample; //_Bool is the same under amd64, as typedef unsigned char uint_fast8_t;
61 int kmersize;
62 unsigned char CountBit, rBit;
63 size_t ArraySizeK;
64 uint16_t SubItemCount;
65 char *outprefix;
68 /* Parse a single option. */
69 static error_t
70 parse_opt (int key, char *arg, struct argp_state *state) {
71 /* Get the input argument from argp_parse, which we
72 know is a pointer to our arguments structure. */
73 struct arguments *arguments = state->input;
75 int tmpArgValue;
76 switch (key) {
77 case 'q':// case 's':
78 arguments->silent = 1;
79 break;
80 case 'v':
81 arguments->verbose = 1;
82 break;
83 case 'e':
84 arguments->writexample = 1;
85 break;
86 case 'i':
87 arguments->interactive = 1;
88 break;
89 case 'o':
90 arguments->outprefix = arg;
91 break;
92 case 'k':
93 tmpArgValue = atoi(arg);
94 if (tmpArgValue>=2 && tmpArgValue <= MAX_KMER_LEN) {
95 arguments->kmersize = tmpArgValue;
96 } else {
97 errx(2,"-k \"%s\"=%i is not between [2,%d] !",arg,tmpArgValue,MAX_KMER_LEN);
99 break;
100 case 'c':
101 tmpArgValue = atoi(arg);
102 if (tmpArgValue>2 && tmpArgValue <= 64) {
103 arguments->CountBit = tmpArgValue;
104 } else {
105 errx(2,"-c \"%s\"=%i is not a integer of [3,64] !",arg,tmpArgValue);
107 break;
108 case 'a':
109 tmpArgValue = atoi(arg);
110 if (tmpArgValue>0 && tmpArgValue <= SIZE_MAX/1024) {
111 arguments->ArraySizeK = tmpArgValue;
112 } else {
113 errx(2,"-a \"%s\"=%i is not a integer of [1,%zu] !",arg,tmpArgValue,SIZE_MAX/1024);
115 break;
116 case 's':
117 tmpArgValue = atoi(arg);
118 if (tmpArgValue>0 && tmpArgValue <= SUBARRAY_MAX) {
119 arguments->SubItemCount = (tmpArgValue+SDL_SUBARRAY_UNIT-1)&(~(uint16_t)(SDL_SUBARRAY_UNIT-1));
120 } else {
121 errx(2,"-s \"%s\"=%i is not a integer of [1,%u] !",arg,tmpArgValue,SUBARRAY_MAX);
123 break;
124 case 'r':
125 tmpArgValue = atoi(arg);
126 if (tmpArgValue>5 && tmpArgValue <= 64) {
127 arguments->rBit = tmpArgValue;
128 } else {
129 errx(2,"-r \"%s\"=%i is not a integer of [6,64] !",arg,tmpArgValue);
131 break;
133 case ARGP_KEY_ARG:
134 if (state->arg_num >= 1)
135 /* Too many arguments. */
136 argp_usage (state);
137 arguments->args[state->arg_num] = arg;
138 break;
140 case ARGP_KEY_END:
141 if (state->arg_num < 1) /* Not enough arguments. */
142 //argp_usage (state);
143 argp_state_help (state,stderr,ARGP_HELP_STD_HELP);
144 break;
146 default:
147 return ARGP_ERR_UNKNOWN;
149 return 0;
152 /* Our argp parser. */
153 static struct argp argp = { options, parse_opt, args_doc, doc };
155 char *strlinker(char *main, char *suffix) {
156 char *outstr = malloc(strlen(main)+strlen(suffix)+1);
157 strcpy(outstr,main);
158 strcat(outstr,suffix);
159 return outstr;
162 int main (int argc, char **argv) {
163 struct arguments arguments;
165 // Default values.
166 arguments.silent = 0;
167 arguments.verbose = 0;
168 arguments.interactive = 0;
169 arguments.writexample = 0;
170 arguments.outprefix = "./out";
171 arguments.kmersize = 21;
172 arguments.CountBit = 9;
173 arguments.rBit = 31;
174 arguments.ArraySizeK = 512;
175 arguments.SubItemCount = 32;
177 // Parse our arguments; every option seen by parse_opt will be reflected in arguments.
178 argp_parse (&argp, argc, argv, 0, 0, &arguments);
180 if (arguments.writexample) {
181 printf("[!] Going to OVERWRITE an example to [%s] !\n",arguments.args[0]);
182 pressAnyKey();
183 write_example_cfg(arguments.args[0]);
184 exit(EXIT_SUCCESS);
186 if (arguments.interactive) {
187 unsigned char rBit = arguments.rBit;
188 uint16_t SubItemCount = arguments.SubItemCount;
189 unsigned char itemByte = GETitemByte_PADrBit_trimSubItemCount(arguments.CountBit,&rBit,&SubItemCount);
190 uint64_t ArraySize = arguments.ArraySizeK*1024;
191 uint64_t Capacity = ArraySize*SubItemCount;
192 uint64_t SDLsize = itemByte*Capacity;
193 double HashBit = rBit + log(ArraySize)/log(2);
194 printf ("input_config = %s\nOutputPrefix = %s\n"
195 "VERBOSE = %s\nSILENT = %s\n"
196 "SDLA { CountBit:%u. rBit:%u(%u), ArraySize:%lu(k), SubItemCount:%u(%u) }\n"
197 "kmersize = %i, SDLAsize = %.1f MiB (%s Bytes)\n"
198 "HashBit:%.3g, ErrRate:%.12g\nMaxCapacity: %s\n",
199 arguments.args[0],
200 arguments.outprefix,
201 arguments.verbose ? "yes" : "no",
202 arguments.silent ? "yes" : "no",
203 arguments.CountBit,rBit,arguments.rBit,arguments.ArraySizeK,SubItemCount,arguments.SubItemCount,
204 arguments.kmersize,(double)SDLsize/1048576.0,commaprint(SDLsize),
205 HashBit,pow(0.5,rBit)/ArraySize,commaprint(Capacity)
207 pressAnyKey();
209 char *outStat = strlinker(arguments.outprefix, ".stat");
210 char *outDat = strlinker(arguments.outprefix, ".dat");
211 char *outLog = strlinker(arguments.outprefix, ".log");
212 //printf("Out[%s][%s][%s]\n",outStat,outDat,outLog);
213 SDLConfig *psdlcfg=read_SDL_cfg((double)arguments.kmersize,arguments.args[0]);
214 ssize_t read;
215 char *line;
216 uint64_t insertedCount=0;
218 SDLeftArray_t *dleftp = dleft_arrayinit(arguments.CountBit,arguments.ArraySizeK*1024,arguments.SubItemCount,arguments.kmersize); // (9,31,5000000,48);
219 fputs("SDLA nfo: ", stderr);
220 fprintSDLAnfo(stderr,dleftp);
222 G_TIMER_START;
224 fputs("\nParsing Sequence Files:\n", stderr);
225 while ((read = get_next_seqfile(psdlcfg)) != -1) {
226 ssize_t readlength;
227 line=psdlcfg->seqfilename;
228 fprintf(stderr, " <%s> ...", line);
229 //sleep(1); // the Call ...
230 SeqFileObj *seqobj = inSeqFinit(line,GFIOCHRBASE);
231 if (seqobj) {
232 while ( (readlength = (*seqobj->getNextSeq)(seqobj)) >= 0 ) {
233 #ifdef DEBUG
234 puts(line);
235 printf("-ID:[%s,%s] %zu %zu\nSeq:[%s]\nQ:[%s] *%zx,%u\n",
236 seqobj->name,seqobj->comment,seqobj->readlength,seqobj->binMallocedQQWord,
237 seqobj->seq,seqobj->qual,(size_t)seqobj->seq,seqobj->type);
238 /*char *tmpseq;
239 //NormalizeChrSeq((char*)seqobj->seq);
240 printf("rc[%s]\n",tmpseq=ChrSeqRevComp(seqobj->seq,seqobj->readlength));
241 free(tmpseq);*/
242 #endif
243 size_t insertedKmer = dleft_insert_read(arguments.kmersize,seqobj->seq,seqobj->readlength,dleftp);
244 insertedCount += insertedKmer;
245 #ifdef DEBUG
246 printf("Inserted:[%zu of %lu]\n",insertedKmer,insertedCount);
247 #endif
249 } else continue;
250 fputs("\b\b\b\b, done !\n", stderr);
251 fprintf(stderr, "[!]Accumulated Inserted Kmer:[%lu] times\n", insertedCount);
252 inSeqFdestroy(seqobj);
255 rewind(fp);
256 fputs("\nSecond pass:\n", stderr);
257 while ((read = getline(&line, &len, fp)) != -1) {
258 if (*(line+read-1)=='\n') *(line+(--read))='\0';
259 printf("[%zu]+%zu <%s>\n", read, len, line);
262 fputs("\nHashing Done!\n", stderr);
263 //__clock_diff = clock() - __clock_start;
264 //fprintf(stderr,"\nHashing Done with CPU time: %zd.%06zd s\n", __clock_diff/CLOCKS_PER_SEC, __clock_diff%CLOCKS_PER_SEC);
266 destory_seqfile(psdlcfg);
267 fputs("SDLA nfo: ", stderr);
268 fprintSDLAnfo(stderr,dleftp);
269 FILE *fpstat, *fpdat;
270 fpstat = fopen(outStat, "w");
271 fpdat = fopen(outDat, "w");
272 fprintf(fpstat,"#KmerSize: %d\n#Kmer_theory_count: %.34Lg\n",arguments.kmersize,powl(4.0,(long double)arguments.kmersize)); // 34 from http://en.wikipedia.org/wiki/Quadruple_precision
274 #ifdef TEST /* in TEST mode, "out.stat" changed to subArray filling count */
275 SDLeftStat_t *SDLeftStat = dleft_stat(dleftp,fpstat,fpdat);
276 #else /* Normal out.stat */
277 SDLeftStat_t *SDLeftStat = dleft_stat(dleftp,fpstat);
279 SDLdumpHead *DatHead = malloc(sizeof(SDLdumpHead));
280 DatHead->kmersize = arguments.kmersize;
281 DatHead->HistMaxCntVal = SDLeftStat->HistMaxCntVal;
282 DatHead->HistMaxHistVal = SDLeftStat->HistMaxHistVal;
283 DatHead->HistMean = SDLeftStat->HistMean;
284 DatHead->HistSStd = SDLeftStat->HistSStd;
285 //free(SDLeftStat);
286 dleft_dump(dleftp,DatHead,fpdat);
287 free(DatHead);
288 #endif
289 fclose(fpstat);
290 fclose(fpdat);
292 free(SDLeftStat);
293 dleft_arraydestroy(dleftp);
294 free(outStat); free(outDat); free(outLog); // just filenames
296 G_TIMER_END;
297 G_TIMER_PRINT;
299 exit(EXIT_SUCCESS);