modified: Makefile
[GalaxyCodeBases.git] / c_cpp / readsdedump / simple.c
blob8abcb96d882205d2c51aad9ae910213df22c2400
1 #define _GNU_SOURCE
2 #include <assert.h>
3 #include <limits.h>
4 #include <stdio.h>
5 #include <stdlib.h> //EXIT_FAILURE
6 #include "gkseq.h"
8 #include <stdint.h>
9 #include <string.h>
10 #include <err.h>
11 #include <argp.h>
12 #include <math.h>
13 #include "timer.h"
15 // 128k + 16k
16 #define READSCOUNT_INIT (128*1024)
17 #define READSCOUNT_INC (16*1024)
18 //#define READSCOUNT_INIT (16)
19 //#define READSCOUNT_INC (4)
21 const char *argp_program_version =
22 "readsdedump simple 0.1 @"__TIME__ "," __DATE__;
23 const char *argp_program_bug_address =
24 "<huxuesong@genomics.org.cn>";
26 /* Program documentation. */
27 static char doc[] =
28 "Simple readsdedump"
29 #ifdef DEBUG
30 " (Debug Version)"
31 #endif
32 #ifdef TEST
33 " (Test Version)"
34 #endif
37 /* A description of the arguments we accept. */
38 static char args_doc[] = "input_files (FASTA or FASTQ)";
40 /* The options we understand. */
41 static struct argp_option options[] = {
42 {"maxmismatch", 'm', "10" ,0, "Max mismatch to be PCR duplicate"},
43 {"outfile", 'o', "./out.stat" ,0, "Output file" },
44 { 0 }
47 /* Used by main to communicate with parse_opt. */
48 struct arguments {
49 uint_fast16_t mismatch;
50 char **args;
51 char *outfile;
54 /* Parse a single option. */
55 static error_t
56 parse_opt (int key, char *arg, struct argp_state *state) {
57 /* Get the input argument from argp_parse, which we
58 know is a pointer to our arguments structure. */
59 struct arguments *arguments = state->input;
60 int tmpArgValue;
61 switch (key) {
62 case 'o':
63 arguments->outfile = arg;
64 break;
65 case 'm':
66 tmpArgValue = atoi(arg);
67 if (tmpArgValue>=0 && tmpArgValue <= UINT16_MAX) {
68 arguments->mismatch = tmpArgValue;
69 } else {
70 errx(2,"-m \"%s\"=%i is not between [0,%d] !",arg,tmpArgValue,UINT16_MAX);
72 break;
74 case ARGP_KEY_ARG:
75 arguments->args[state->arg_num] = arg;
76 break;
78 case ARGP_KEY_END:
79 if (state->arg_num < 1) /* Not enough arguments. */
80 //argp_usage (state);
81 argp_state_help (state,stderr,ARGP_HELP_STD_HELP);
82 break;
84 default:
85 return ARGP_ERR_UNKNOWN;
87 return 0;
90 /* Our argp parser. */
91 static struct argp argp = { options, parse_opt, args_doc, doc };
93 static inline uint_fast16_t compseq(char const* strA, char const* strB, uint_fast16_t maxMismatch) {
94 uint_fast16_t mismatch=0;
95 #ifdef DEBUG
96 printf("[%s]-[%s]:\n",strA,strB);
97 #endif
98 while (*strA && *strB) {
99 #ifdef DEBUG
100 printf(" %.1s,%.1s:",strA,strB);
101 #endif
102 mismatch += (*strA++ != *strB++);
103 #ifdef DEBUG
104 printf("%d ",(int)mismatch);
105 #endif
106 if (mismatch > maxMismatch) {
107 #ifdef DEBUG
108 puts("");
109 #endif
110 return 0;
113 #ifdef DEBUG
114 puts("");
115 #endif
116 return mismatch;
119 int main (int argc, char **argv) {
120 struct arguments arguments;
121 arguments.args=calloc(sizeof(size_t),argc);
123 // Default values.
124 arguments.outfile = "./out.stat";
125 arguments.mismatch = 10;
126 char *const*line = arguments.args-1; // so that we can use *(++line)
128 // Parse our arguments; every option seen by parse_opt will be reflected in arguments.
129 argp_parse (&argp, argc, argv, 0, 0, &arguments);
131 fprintf(stderr,"Arguments: max_mismatch=%d, out_file=%s\nInput_files:",
132 (int)arguments.mismatch,arguments.outfile);
133 while ( *(++line) ) {
134 fprintf(stderr, " [%s]", *line);
136 fputs("\n", stderr);
137 line = arguments.args-1;
139 //uint8_t* pSeqMismatchArray = calloc(READSCOUNT_INIT,sizeof(uint8_t));
140 char** pSeqArray = malloc(READSCOUNT_INIT*sizeof(size_t));
141 size_t SeqArrayLength = READSCOUNT_INIT;
142 uint64_t* pMismatchCount = calloc(arguments.mismatch+1,sizeof(uint64_t));
144 uint64_t BasesCount=0;
145 uint64_t ReadsCount=0;
146 uint64_t MaxReadLength=0;
147 int_fast8_t diffRL=0;
149 fputs("\nParsing Sequence Files:\n", stderr);
150 G_TIMER_START;
152 kseq_t *kseq;
153 int_fast8_t ReadType;
155 while ( *(++line) ) {
156 fprintf(stderr, " <%s>", *line);
157 kseq = kseq_open(*line);
158 uint64_t RealSize,RealOffset;
159 size_t ReadLength;
160 if (kseq) {
161 RealSize = kseq->f->RealSize;
162 fprintf(stderr," RealSize:%ld ...",RealSize);
163 while ( (ReadType = kseq_read(kseq)) > 0 ) {
164 ReadLength = kseq->seq.l;
165 RealOffset = kseq->f->RealOffset;
166 #ifdef DEBUG
167 printf("\n-ID:[%s,%s] %zu %zu %zd %ld\nSeq:[%s]\nQ:[%s] *%zx,%d \n",
168 kseq->name.s,kseq->comment.s,kseq->seq.l,kseq->qual.l,ReadLength,kseq->f->RealOffset,
169 kseq->seq.s,kseq->qual.s,(size_t)&(kseq->seq.s),ReadType);
170 #endif
171 if (ReadLength>0) {
172 if (ReadLength>MaxReadLength) MaxReadLength=ReadLength;
173 if (ReadsCount+1>SeqArrayLength) {
174 pSeqArray = realloc(pSeqArray,(SeqArrayLength+READSCOUNT_INC)*sizeof(size_t));
175 //memset(pSeqSeqArray+SeqMismatchArrayLength,0,READSCOUNT_INC*sizeof(size_t));
176 SeqArrayLength += READSCOUNT_INC;
178 pSeqArray[ReadsCount]=malloc(ReadLength+1);
179 strcpy(pSeqArray[ReadsCount],kseq->seq.s);
180 ++ReadsCount;
181 BasesCount += ReadLength;
182 if (MaxReadLength != ReadLength) diffRL=1;
185 //printf(" lasttype:%d lastpos:%ld ------",ReadType,kseq->f->RealOffset);
186 } else {
187 fputs(": File not found !\n", stderr);
188 continue;
190 fputs("\b\b\b\b, done !\n", stderr);
191 kseq_destroy(kseq);
193 uint8_t* pSeqMismatchArray = calloc(ReadsCount,sizeof(uint8_t));
195 fprintf(stderr,"[!]Total_Reads: %ld\tMean_Read_Length: %f\n",ReadsCount,(double)BasesCount/(double)ReadsCount);
197 //uint64_t CmpCount=0; // CmpCount == ReadsCount*(ReadsCount-1)/2
198 uint64_t MisCount=0;
199 uint64_t MisChgCount=0;
200 for (size_t i=0;i<ReadsCount;++i) {
201 for (size_t j=i+1;j<ReadsCount;++j) {
202 //++CmpCount;
203 uint_fast16_t miscount=compseq(pSeqArray[i],pSeqArray[j],arguments.mismatch);
204 if (miscount) {
205 ++MisCount;
206 if (pSeqMismatchArray[i]==0 || pSeqMismatchArray[i]>miscount) {
207 pSeqMismatchArray[i]=miscount;
208 ++MisChgCount;
210 if (pSeqMismatchArray[j]==0 || pSeqMismatchArray[j]>miscount) {
211 pSeqMismatchArray[j]=miscount;
212 ++MisChgCount;
218 for (size_t i=0;i<ReadsCount;++i) {
219 #ifdef DEBUG
220 printf("%zd/%zd\t[%s] %d\n",i,ReadsCount,pSeqArray[i],(int)pSeqMismatchArray[i]);
221 #endif
222 ++pMismatchCount[pSeqMismatchArray[i]];
225 fprintf(stderr,"[!]CmpTimes:%ld, mismatch_OK:%ld, mismatch_Update:%ld\n",
226 ReadsCount*(ReadsCount-1)/2,MisCount,MisChgCount);
227 fputs("\nCount Done!\n\nMismatch\tReadsCount\n", stderr);
228 if (diffRL) fprintf(stderr,"[!]Different Read Length Found. Mismatch may be in correct.\n");
230 FILE *fp = fopen(arguments.outfile, "w");
231 fprintf(fp,"#Total_Reads: %ld\n#Mean_Read_Length: %f\n#Max_mismatch: %d\n\n#Mismatch\tReadsCount\n",
232 ReadsCount,(double)BasesCount/(double)ReadsCount,(int)arguments.mismatch);
233 for (uint_fast16_t i=0;i<=arguments.mismatch;++i) {
234 if (pMismatchCount[i]) {
235 fprintf(fp,"%d\t%ld\n",(int)i,pMismatchCount[i]);
236 fprintf(stderr,"%d\t%ld\n",(int)i,pMismatchCount[i]);
239 fprintf(fp,"\n# Mismatch=0 means unique.\n");
240 fclose(fp);
242 for (size_t i=0;i<ReadsCount;++i) {
243 free(pSeqArray[i]);
245 free(pSeqArray);
246 free(pSeqMismatchArray);
247 free(arguments.args);
249 G_TIMER_END;
250 G_TIMER_PRINT;
252 exit(EXIT_SUCCESS);