2 #include <stdlib.h> //EXIT_FAILURE
14 #define MAXHOMOPOLYLEN (2l*1024*1024)
20 uint64_t HomoPoly
[5][MAXHOMOPOLYLEN
];
21 uint64_t maxHomoPoly
[5];
22 unsigned char theBase
;
24 const char *argp_program_version
=
25 "HomoPolymer Counter 0.1 @"__TIME__
"," __DATE__
;
26 const char *argp_program_bug_address
=
27 "<huxuesong@genomics.org.cn>";
29 /* Program documentation. */
37 /* A description of the arguments we accept. */
38 static char args_doc
[] = "input_files (FASTA or FASTQ)";
40 /* The options we understand. */
41 static struct argp_option options
[] = {
42 {"interactive", 'i', 0, 0, "Prompt arguments before procedure"},
43 {"outfile", 'o', "./out.stat",0, "Output file" },
44 // {"countbase", 'c', 0, 0, "Also count bases" },
48 /* Used by main to communicate with parse_opt. */
51 uint_fast8_t countbase
, interactive
;
55 /* Parse a single option. */
57 parse_opt (int key
, char *arg
, struct argp_state
*state
) {
58 /* Get the input argument from argp_parse, which we
59 know is a pointer to our arguments structure. */
60 struct arguments
*arguments
= state
->input
;
64 arguments
->interactive
= 1;
67 arguments
->countbase
= 1;
70 arguments
->outfile
= arg
;
74 arguments
->args
[state
->arg_num
] = arg
;
78 if (state
->arg_num
< 1) /* Not enough arguments. */
80 argp_state_help (state
,stderr
,ARGP_HELP_STD_HELP
);
84 return ARGP_ERR_UNKNOWN
;
89 /* Our argp parser. */
90 static struct argp argp
= { options
, parse_opt
, args_doc
, doc
};
92 int main (int argc
, char **argv
) {
93 struct arguments arguments
;
94 arguments
.args
=calloc(sizeof(size_t),argc
);
97 arguments
.countbase
= 0;
98 arguments
.interactive
= 0;
99 arguments
.outfile
= "./out.stat";
101 // Parse our arguments; every option seen by parse_opt will be reflected in arguments.
102 argp_parse (&argp
, argc
, argv
, 0, 0, &arguments
);
104 if (arguments
.interactive
) {
105 printf ("Output = %s\n",
112 ssize_t lastpos
,polymerLen
;
115 char *const*line
= arguments
.args
-1;
119 fputs("\nParsing Sequence Files:\n", stderr
);
120 while ( *(++line
) ) {
122 fprintf(stderr
, " <%s> ...", *line
);
123 SeqFileObj
*seqobj
= inSeqFinit(*line
,GFIOCHRBASE
);
125 while ( (readlength
= (*seqobj
->getNextSeq
)(seqobj
)) >= 0 ) {
128 printf("-ID:[%s,%s] %zu %zu %zd\nSeq:[%s]\nQ:[%s] *%zx,%u\n",
129 seqobj
->name
,seqobj
->comment
,seqobj
->readlength
,seqobj
->binMallocedQQWord
,readlength
,
130 seqobj
->seq
,seqobj
->qual
,(size_t)seqobj
->seq
,seqobj
->type
);
132 //NormalizeChrSeq((char*)seqobj->seq);
133 printf("rc[%s]\n",tmpseq=ChrSeqRevComp(seqobj->seq,seqobj->readlength));
136 lastbase
=seqobj
->seq
[0];
138 allbases
+= readlength
;
139 for (ssize_t i
=1;i
<=readlength
;i
++) { // *(readlength+1)=='\0'
140 if (seqobj
->seq
[i
] != lastbase
) {
141 polymerLen
= i
-lastpos
;
159 if (polymerLen
<MAXHOMOPOLYLEN
) {
160 ++HomoPoly
[theBase
][polymerLen
];
161 if (maxHomoPoly
[theBase
]<polymerLen
) maxHomoPoly
[theBase
]=polymerLen
;
163 ++HomoPoly
[theBase
][0];
165 lastbase
=seqobj
->seq
[i
];
171 fputs("\b\b\b\b, done !\n", stderr
);
172 inSeqFdestroy(seqobj
);
174 free(arguments
.args
);
175 fputs("\nStat Done!\n", stderr
);
177 FILE *fp
= fopen(arguments
.outfile
, "w");
178 fprintf(fp
,"#Total_Bases: %lu\n"
179 "\n#Base\tRepeat_Count\tTimes\n",allbases
);
180 for (unsigned char base
=0;base
<5;base
++) {
198 for (size_t polymerLen
=1;polymerLen
<=maxHomoPoly
[base
];polymerLen
++) {
199 fprintf(fp
,"%c\t%lu\t%lu\n",theBase
,polymerLen
,HomoPoly
[base
][polymerLen
]);
201 if (HomoPoly
[base
][0]) {
202 fprintf(fp
,"#%c\t>=%lu\t%lu\n",theBase
,MAXHOMOPOLYLEN
,HomoPoly
[base
][0]);
204 fprintf(fp
,"#----------------------------------------------------------------\n");