modified: src1/worker.c
[GalaxyCodeBases.git] / c_cpp / readscorr / gcount.c
blob116afa6350dcfd3d49ae87a09de1f3cee3b56ee4
1 #define _GNU_SOURCE
2 #include <stdlib.h> //EXIT_FAILURE
3 #include <stdio.h>
4 #include <stdint.h>
5 #include <string.h>
6 //#include <errno.h>
7 #include <err.h>
8 #include <argp.h>
9 #include "getch.h"
10 #include "gFileIO.h"
11 #include "chrseq.h"
12 #include "timer.h"
14 #define MAXHOMOPOLYLEN (2l*1024*1024)
15 #define BaseA 1
16 #define BaseT 3
17 #define BaseC 2
18 #define BaseG 4
19 #define BaseN 0
20 uint64_t HomoPoly[5][MAXHOMOPOLYLEN];
21 uint64_t maxHomoPoly[5];
22 unsigned char theBase;
24 const char *argp_program_version =
25 "HomoPolymer Counter 0.1 @"__TIME__ "," __DATE__;
26 const char *argp_program_bug_address =
27 "<huxuesong@genomics.org.cn>";
29 /* Program documentation. */
30 static char doc[] =
31 "HomoPolymer Counter"
32 #ifdef DEBUG
33 " (Debug Version)"
34 #endif
37 /* A description of the arguments we accept. */
38 static char args_doc[] = "input_files (FASTA or FASTQ)";
40 /* The options we understand. */
41 static struct argp_option options[] = {
42 {"interactive", 'i', 0, 0, "Prompt arguments before procedure"},
43 {"outfile", 'o', "./out.stat",0, "Output file" },
44 // {"countbase", 'c', 0, 0, "Also count bases" },
45 { 0 }
48 /* Used by main to communicate with parse_opt. */
49 struct arguments {
50 char **args;
51 uint_fast8_t countbase, interactive;
52 char *outfile;
55 /* Parse a single option. */
56 static error_t
57 parse_opt (int key, char *arg, struct argp_state *state) {
58 /* Get the input argument from argp_parse, which we
59 know is a pointer to our arguments structure. */
60 struct arguments *arguments = state->input;
62 switch (key) {
63 case 'i':
64 arguments->interactive = 1;
65 break;
66 case 'c':
67 arguments->countbase = 1;
68 break;
69 case 'o':
70 arguments->outfile = arg;
71 break;
73 case ARGP_KEY_ARG:
74 arguments->args[state->arg_num] = arg;
75 break;
77 case ARGP_KEY_END:
78 if (state->arg_num < 1) /* Not enough arguments. */
79 //argp_usage (state);
80 argp_state_help (state,stderr,ARGP_HELP_STD_HELP);
81 break;
83 default:
84 return ARGP_ERR_UNKNOWN;
86 return 0;
89 /* Our argp parser. */
90 static struct argp argp = { options, parse_opt, args_doc, doc };
92 int main (int argc, char **argv) {
93 struct arguments arguments;
94 arguments.args=calloc(sizeof(size_t),argc);
96 // Default values.
97 arguments.countbase = 0;
98 arguments.interactive = 0;
99 arguments.outfile = "./out.stat";
101 // Parse our arguments; every option seen by parse_opt will be reflected in arguments.
102 argp_parse (&argp, argc, argv, 0, 0, &arguments);
104 if (arguments.interactive) {
105 printf ("Output = %s\n",
106 arguments.outfile
108 pressAnyKey();
111 char lastbase;
112 ssize_t lastpos,polymerLen;
113 uint64_t allbases=0;
115 char *const*line = arguments.args-1;
117 G_TIMER_START;
119 fputs("\nParsing Sequence Files:\n", stderr);
120 while ( *(++line) ) {
121 ssize_t readlength;
122 fprintf(stderr, " <%s> ...", *line);
123 SeqFileObj *seqobj = inSeqFinit(*line,GFIOCHRBASE);
124 if (seqobj) {
125 while ( (readlength = (*seqobj->getNextSeq)(seqobj)) >= 0 ) {
126 #ifdef DEBUG
127 puts(*line);
128 printf("-ID:[%s,%s] %zu %zu %zd\nSeq:[%s]\nQ:[%s] *%zx,%u\n",
129 seqobj->name,seqobj->comment,seqobj->readlength,seqobj->binMallocedQQWord,readlength,
130 seqobj->seq,seqobj->qual,(size_t)seqobj->seq,seqobj->type);
131 /*char *tmpseq;
132 //NormalizeChrSeq((char*)seqobj->seq);
133 printf("rc[%s]\n",tmpseq=ChrSeqRevComp(seqobj->seq,seqobj->readlength));
134 free(tmpseq);*/
135 #endif
136 lastbase=seqobj->seq[0];
137 lastpos=0;
138 allbases += readlength;
139 for (ssize_t i=1;i<=readlength;i++) { // *(readlength+1)=='\0'
140 if (seqobj->seq[i] != lastbase) {
141 polymerLen = i-lastpos;
142 switch (lastbase) {
143 case 'A':
144 theBase=BaseA;
145 break;
146 case 'T':
147 theBase=BaseT;
148 break;
149 case 'C':
150 theBase=BaseC;
151 break;
152 case 'G':
153 theBase=BaseG;
154 break;
155 default:
156 theBase=BaseN;
157 break;
159 if (polymerLen<MAXHOMOPOLYLEN) {
160 ++HomoPoly[theBase][polymerLen];
161 if (maxHomoPoly[theBase]<polymerLen) maxHomoPoly[theBase]=polymerLen;
162 } else {
163 ++HomoPoly[theBase][0];
165 lastbase=seqobj->seq[i];
166 lastpos=i;
170 } else continue;
171 fputs("\b\b\b\b, done !\n", stderr);
172 inSeqFdestroy(seqobj);
174 free(arguments.args);
175 fputs("\nStat Done!\n", stderr);
177 FILE *fp = fopen(arguments.outfile, "w");
178 fprintf(fp,"#Total_Bases: %lu\n"
179 "\n#Base\tRepeat_Count\tTimes\n",allbases);
180 for (unsigned char base=0;base<5;base++) {
181 switch (base) {
182 case BaseA:
183 theBase='A';
184 break;
185 case BaseT:
186 theBase='T';
187 break;
188 case BaseC:
189 theBase='C';
190 break;
191 case BaseG:
192 theBase='G';
193 break;
194 default:
195 theBase='N';
196 break;
198 for (size_t polymerLen=1;polymerLen<=maxHomoPoly[base];polymerLen++) {
199 fprintf(fp,"%c\t%lu\t%lu\n",theBase,polymerLen,HomoPoly[base][polymerLen]);
201 if (HomoPoly[base][0]) {
202 fprintf(fp,"#%c\t>=%lu\t%lu\n",theBase,MAXHOMOPOLYLEN,HomoPoly[base][0]);
204 fprintf(fp,"#----------------------------------------------------------------\n");
206 fclose(fp);
208 G_TIMER_END;
209 G_TIMER_PRINT;
211 exit(EXIT_SUCCESS);