modified: makefile
[GalaxyCodeBases.git] / c_cpp / readsdedump / main.c
blob1115bf8ee3eed3d375590548d013e264fa7b43fc
1 #define _GNU_SOURCE
2 #include <assert.h>
3 #include <limits.h>
4 #include <stdio.h>
5 #include <stdlib.h> //EXIT_FAILURE
6 #include "gkseq.h"
8 #include <stdint.h>
9 #include <string.h>
10 #include <err.h>
11 #include <argp.h>
12 #include <math.h>
13 #include "timer.h"
15 #include "prb.h"
16 #include "prb-tree.h"
18 #define CAT0(x) #x
19 #define CAT(x) CAT0(x)
20 // 1048576 + 128k
21 //#define READSCOUNT_INIT (1048576)
22 //#define READSCOUNT_INC (128*1024)
23 #define READSCOUNT_INIT (16)
24 #define READSCOUNT_INC (4)
26 const char *argp_program_version =
27 "readsdedump simple 0.1 @"__TIME__ "," __DATE__;
28 const char *argp_program_bug_address =
29 "<huxuesong@genomics.org.cn>";
31 /* Program documentation. */
32 static char doc[] =
33 "Seeding readsdedump"
34 #ifdef DEBUG
35 " (Debug Version)"
36 #endif
37 #ifdef TEST
38 " (Test Version)"
39 #endif
42 /* A description of the arguments we accept. */
43 static char args_doc[] = "input_files (FASTA or FASTQ)";
45 /* The options we understand. */
46 static struct argp_option options[] = {
47 {"maxmismatch", 'm', "10" ,0, "Max mismatch to be PCR duplicate"},
48 {"seedlength", 's', "7" ,0, "Seed length"},
49 {"outfile", 'o', "./out.stat" ,0, "Output file" },
50 { 0 }
53 /* Used by main to communicate with parse_opt. */
54 struct arguments {
55 uint_fast16_t mismatch,seedLen;
56 char **args;
57 char *outfile;
60 /* Parse a single option. */
61 static error_t
62 parse_opt (int key, char *arg, struct argp_state *state) {
63 /* Get the input argument from argp_parse, which we
64 know is a pointer to our arguments structure. */
65 struct arguments *arguments = state->input;
66 int tmpArgValue;
67 switch (key) {
68 case 'o':
69 arguments->outfile = arg;
70 break;
71 case 'm':
72 tmpArgValue = atoi(arg);
73 if (tmpArgValue>=0 && tmpArgValue <= UINT16_MAX) {
74 arguments->mismatch = tmpArgValue;
75 } else {
76 errx(2,"-m \"%s\"=%i is not between [0,%d] !",arg,tmpArgValue,UINT16_MAX);
78 break;
79 case 's':
80 tmpArgValue = atoi(arg);
81 if (tmpArgValue>=0 && tmpArgValue <= UINT16_MAX) {
82 arguments->seedLen = tmpArgValue;
83 } else {
84 errx(2,"-s \"%s\"=%i is not between [0,%d] !",arg,tmpArgValue,UINT16_MAX);
86 break;
88 case ARGP_KEY_ARG:
89 arguments->args[state->arg_num] = arg;
90 break;
92 case ARGP_KEY_END:
93 if (state->arg_num < 1) /* Not enough arguments. */
94 //argp_usage (state);
95 argp_state_help (state,stderr,ARGP_HELP_STD_HELP);
96 break;
98 default:
99 return ARGP_ERR_UNKNOWN;
101 return 0;
104 /* Our argp parser. */
105 static struct argp argp = { options, parse_opt, args_doc, doc };
107 static inline uint_fast16_t compseq(char const* strA, char const* strB, uint_fast16_t maxMismatch) {
108 uint_fast16_t mismatch=0;
109 #ifdef DEBUG
110 printf("[%s]-[%s]:\n",strA,strB);
111 #endif
112 while (*strA && *strB) {
113 #ifdef DEBUG
114 printf(" %.1s,%.1s:",strA,strB);
115 #endif
116 mismatch += (*strA++ != *strB++);
117 #ifdef DEBUG
118 printf("%d ",(int)mismatch);
119 #endif
120 if (mismatch > maxMismatch) {
121 #ifdef DEBUG
122 puts("");
123 #endif
124 return 0;
127 #ifdef DEBUG
128 puts("");
129 #endif
130 return mismatch;
133 struct prb_table *MainKmerPosTree, *KmerCacheTree;
135 void insertSeeds(char const* seq, struct prb_table * tree, uint_fast16_t SeedLen) {
136 size_t ReadLength = strlen(seq);
137 #ifdef DEBUG
138 if (ReadLength<SeedLen) {
139 fprintf(stderr,"[!]Read_Length(%d) < Seed_Length(%d) !\n",
140 (unsigned int)ReadLength,(unsigned int)SeedLen);
141 exit(1);
143 #endif
144 for(uint_fast16_t i=0;i<=ReadLength-SeedLen;++i){
145 currentPos=i;
149 int main (int argc, char **argv) {
150 struct arguments arguments;
151 arguments.args=calloc(sizeof(size_t),argc);
153 // Default values.
154 arguments.outfile = "./out.stat";
155 arguments.mismatch = 10;
156 arguments.seedLen = 7;
157 char *const*line = arguments.args-1; // so that we can use *(++line)
159 // Parse our arguments; every option seen by parse_opt will be reflected in arguments.
160 argp_parse (&argp, argc, argv, 0, 0, &arguments);
162 fprintf(stderr,"Arguments: max_mismatch=%u, seed_length=%u, out_file=%s\nInput_files:",
163 (unsigned int)arguments.mismatch,(unsigned int)arguments.seedLen,arguments.outfile);
164 while ( *(++line) ) {
165 fprintf(stderr, " [%s]", *line);
167 fputs("\n", stderr);
168 line = arguments.args-1;
170 //uint8_t* pSeqMismatchArray = calloc(READSCOUNT_INIT,sizeof(uint8_t));
171 char** pSeqArray = malloc(READSCOUNT_INIT*sizeof(size_t));
172 size_t SeqArrayLength = READSCOUNT_INIT;
173 uint64_t* pMismatchCount = calloc(arguments.mismatch+1,sizeof(uint64_t));
175 uint64_t BasesCount=0;
176 uint64_t ReadsCount=0;
177 uint64_t MaxReadLength=0;
178 int_fast8_t diffRL=0;
179 int ReadsTooLong=0;
180 int ReadsTooShort=0;
182 fputs("\nParsing Sequence Files:\n", stderr);
183 G_TIMER_START;
185 kseq_t *kseq;
186 int_fast8_t ReadType;
188 while ( *(++line) ) {
189 fprintf(stderr, " <%s>", *line);
190 kseq = kseq_open(*line);
191 uint64_t RealSize,RealOffset;
192 size_t ReadLength;
193 if (kseq) {
194 RealSize = kseq->f->RealSize;
195 fprintf(stderr," RealSize:%ld ...",RealSize);
196 while ( (ReadType = kseq_read(kseq)) > 0 ) {
197 ReadLength = kseq->seq.l;
198 RealOffset = kseq->f->RealOffset;
199 #ifdef DEBUG
200 printf("\n-ID:[%s,%s] %zu %zu %zd %ld\nSeq:[%s]\nQ:[%s] *%zx,%d \n",
201 kseq->name.s,kseq->comment.s,kseq->seq.l,kseq->qual.l,ReadLength,kseq->f->RealOffset,
202 kseq->seq.s,kseq->qual.s,(size_t)&(kseq->seq.s),ReadType);
203 #endif
204 if (ReadLength>0) {
205 if (ReadLength>UINT16_MAX) {
206 ReadLength=UINT16_MAX;
207 ReadsTooLong = 1;
208 } else if (ReadLength<arguments.seedLen) {
209 ReadsTooShort = 1;
210 continue;
212 if (ReadLength>MaxReadLength) MaxReadLength=ReadLength;
213 if (ReadsCount+1>SeqArrayLength) {
214 pSeqArray = realloc(pSeqArray,(SeqArrayLength+READSCOUNT_INC)*sizeof(size_t));
215 //memset(pSeqSeqArray+SeqMismatchArrayLength,0,READSCOUNT_INC*sizeof(size_t));
216 SeqArrayLength += READSCOUNT_INC;
218 pSeqArray[ReadsCount]=malloc(ReadLength+1);
219 strncpy(pSeqArray[ReadsCount],kseq->seq.s,ReadLength);
220 pSeqArray[ReadsCount][ReadLength]='\0';
221 ++ReadsCount;
222 BasesCount += ReadLength;
223 if (MaxReadLength != ReadLength) diffRL=1;
226 //printf(" lasttype:%d lastpos:%ld ------",ReadType,kseq->f->RealOffset);
227 } else {
228 fputs(": File not found !\n", stderr);
229 continue;
231 fputs("\b\b\b\b, done !\n", stderr);
232 kseq_destroy(kseq);
234 uint8_t* pSeqMismatchArray = calloc(ReadsCount,sizeof(uint8_t));
236 fprintf(stderr,"[!]Total_Reads: %ld\tMean_Read_Length: %f\n",ReadsCount,(double)BasesCount/(double)ReadsCount);
238 KmerCacheTree = prb_create(compare_fixed_strings, &arguments.seedLen, NULL);
239 MainKmerPosTree = prb_create(compare_fixed_strings, &arguments.seedLen, NULL);
240 for(uint_fast32_t i=0;i<ReadsCount;++i) {
241 insertSeeds(pSeqArray[i],KmerCacheTree,arguments.seedLen);
244 //uint64_t CmpCount=0; // CmpCount == ReadsCount*(ReadsCount-1)/2
245 uint64_t MisCount=0;
246 uint64_t MisChgCount=0;
247 for (size_t i=0;i<ReadsCount;++i) {
248 for (size_t j=i+1;j<ReadsCount;++j) {
249 //++CmpCount;
250 uint_fast16_t miscount=compseq(pSeqArray[i],pSeqArray[j],arguments.mismatch);
251 if (miscount) {
252 ++MisCount;
253 if (pSeqMismatchArray[i]==0 || pSeqMismatchArray[i]>miscount) {
254 pSeqMismatchArray[i]=miscount;
255 ++MisChgCount;
257 if (pSeqMismatchArray[j]==0 || pSeqMismatchArray[j]>miscount) {
258 pSeqMismatchArray[j]=miscount;
259 ++MisChgCount;
265 for (size_t i=0;i<ReadsCount;++i) {
266 #ifdef DEBUG
267 printf("%zd/%zd\t[%s] %d\n",i,ReadsCount,pSeqArray[i],(int)pSeqMismatchArray[i]);
268 #endif
269 ++pMismatchCount[pSeqMismatchArray[i]];
272 fprintf(stderr,"[!]CmpTimes:%ld, mismatch_OK:%ld, mismatch_Update:%ld\n",
273 ReadsCount*(ReadsCount-1)/2,MisCount,MisChgCount);
274 fputs("\nCount Done!\n\nMismatch\tReadsCount\n", stderr);
275 if (diffRL) fprintf(stderr,"[!]Different Read Length Found. Mismatch may be in correct.\n");
277 FILE *fp = fopen(arguments.outfile, "w");
278 fprintf(fp,"#Total_Reads: %ld\n#Mean_Read_Length: %f\n#Max_mismatch: %d\n\n#Mismatch\tReadsCount\n",
279 ReadsCount,(double)BasesCount/(double)ReadsCount,(int)arguments.mismatch);
280 for (uint_fast16_t i=0;i<=arguments.mismatch;++i) {
281 if (pMismatchCount[i]) {
282 fprintf(fp,"%d\t%ld\n",(int)i,pMismatchCount[i]);
283 fprintf(stderr,"%d\t%ld\n",(int)i,pMismatchCount[i]);
286 fprintf(fp,"\n# Mismatch=0 means unique.\n");
287 if (ReadsTooLong) {
288 fputs("#There are reads longer than "CAT(UINT16_MAX)".\n", fp);
289 fputs("[!]There are reads longer than "CAT(UINT16_MAX)".\n", stderr);
291 if (ReadsTooShort) {
292 fprintf(fp, "#There are reads shorter than Seed_Length:%u.\n", (unsigned int)arguments.seedLen);
293 fprintf(stderr, "#There are reads shorter than Seed_Length:%u.\n", (unsigned int)arguments.seedLen);
295 fclose(fp);
297 for (size_t i=0;i<ReadsCount;++i) {
298 free(pSeqArray[i]);
300 free(pSeqArray);
301 free(pSeqMismatchArray);
302 free(arguments.args);
304 G_TIMER_END;
305 G_TIMER_PRINT;
307 exit(EXIT_SUCCESS);