5 #include <stdlib.h> //EXIT_FAILURE
19 #define CAT(x) CAT0(x)
21 //#define READSCOUNT_INIT (1048576)
22 //#define READSCOUNT_INC (128*1024)
23 #define READSCOUNT_INIT (16)
24 #define READSCOUNT_INC (4)
26 const char *argp_program_version
=
27 "readsdedump simple 0.1 @"__TIME__
"," __DATE__
;
28 const char *argp_program_bug_address
=
29 "<huxuesong@genomics.org.cn>";
31 /* Program documentation. */
42 /* A description of the arguments we accept. */
43 static char args_doc
[] = "input_files (FASTA or FASTQ)";
45 /* The options we understand. */
46 static struct argp_option options
[] = {
47 {"maxmismatch", 'm', "10" ,0, "Max mismatch to be PCR duplicate"},
48 {"seedlength", 's', "7" ,0, "Seed length"},
49 {"outfile", 'o', "./out.stat" ,0, "Output file" },
53 /* Used by main to communicate with parse_opt. */
55 uint_fast16_t mismatch
,seedLen
;
60 /* Parse a single option. */
62 parse_opt (int key
, char *arg
, struct argp_state
*state
) {
63 /* Get the input argument from argp_parse, which we
64 know is a pointer to our arguments structure. */
65 struct arguments
*arguments
= state
->input
;
69 arguments
->outfile
= arg
;
72 tmpArgValue
= atoi(arg
);
73 if (tmpArgValue
>=0 && tmpArgValue
<= UINT16_MAX
) {
74 arguments
->mismatch
= tmpArgValue
;
76 errx(2,"-m \"%s\"=%i is not between [0,%d] !",arg
,tmpArgValue
,UINT16_MAX
);
80 tmpArgValue
= atoi(arg
);
81 if (tmpArgValue
>=0 && tmpArgValue
<= UINT16_MAX
) {
82 arguments
->seedLen
= tmpArgValue
;
84 errx(2,"-s \"%s\"=%i is not between [0,%d] !",arg
,tmpArgValue
,UINT16_MAX
);
89 arguments
->args
[state
->arg_num
] = arg
;
93 if (state
->arg_num
< 1) /* Not enough arguments. */
95 argp_state_help (state
,stderr
,ARGP_HELP_STD_HELP
);
99 return ARGP_ERR_UNKNOWN
;
104 /* Our argp parser. */
105 static struct argp argp
= { options
, parse_opt
, args_doc
, doc
};
107 static inline uint_fast16_t compseq(char const* strA
, char const* strB
, uint_fast16_t maxMismatch
) {
108 uint_fast16_t mismatch
=0;
110 printf("[%s]-[%s]:\n",strA
,strB
);
112 while (*strA
&& *strB
) {
114 printf(" %.1s,%.1s:",strA
,strB
);
116 mismatch
+= (*strA
++ != *strB
++);
118 printf("%d ",(int)mismatch
);
120 if (mismatch
> maxMismatch
) {
133 struct prb_table
*MainKmerPosTree
, *KmerCacheTree
;
135 void insertSeeds(char const* seq
, struct prb_table
* tree
, uint_fast16_t SeedLen
) {
136 size_t ReadLength
= strlen(seq
);
138 if (ReadLength
<SeedLen
) {
139 fprintf(stderr
,"[!]Read_Length(%d) < Seed_Length(%d) !\n",
140 (unsigned int)ReadLength
,(unsigned int)SeedLen
);
144 for(uint_fast16_t i
=0;i
<=ReadLength
-SeedLen
;++i
){
149 int main (int argc
, char **argv
) {
150 struct arguments arguments
;
151 arguments
.args
=calloc(sizeof(size_t),argc
);
154 arguments
.outfile
= "./out.stat";
155 arguments
.mismatch
= 10;
156 arguments
.seedLen
= 7;
157 char *const*line
= arguments
.args
-1; // so that we can use *(++line)
159 // Parse our arguments; every option seen by parse_opt will be reflected in arguments.
160 argp_parse (&argp
, argc
, argv
, 0, 0, &arguments
);
162 fprintf(stderr
,"Arguments: max_mismatch=%u, seed_length=%u, out_file=%s\nInput_files:",
163 (unsigned int)arguments
.mismatch
,(unsigned int)arguments
.seedLen
,arguments
.outfile
);
164 while ( *(++line
) ) {
165 fprintf(stderr
, " [%s]", *line
);
168 line
= arguments
.args
-1;
170 //uint8_t* pSeqMismatchArray = calloc(READSCOUNT_INIT,sizeof(uint8_t));
171 char** pSeqArray
= malloc(READSCOUNT_INIT
*sizeof(size_t));
172 size_t SeqArrayLength
= READSCOUNT_INIT
;
173 uint64_t* pMismatchCount
= calloc(arguments
.mismatch
+1,sizeof(uint64_t));
175 uint64_t BasesCount
=0;
176 uint64_t ReadsCount
=0;
177 uint64_t MaxReadLength
=0;
178 int_fast8_t diffRL
=0;
182 fputs("\nParsing Sequence Files:\n", stderr
);
186 int_fast8_t ReadType
;
188 while ( *(++line
) ) {
189 fprintf(stderr
, " <%s>", *line
);
190 kseq
= kseq_open(*line
);
191 uint64_t RealSize
,RealOffset
;
194 RealSize
= kseq
->f
->RealSize
;
195 fprintf(stderr
," RealSize:%ld ...",RealSize
);
196 while ( (ReadType
= kseq_read(kseq
)) > 0 ) {
197 ReadLength
= kseq
->seq
.l
;
198 RealOffset
= kseq
->f
->RealOffset
;
200 printf("\n-ID:[%s,%s] %zu %zu %zd %ld\nSeq:[%s]\nQ:[%s] *%zx,%d \n",
201 kseq
->name
.s
,kseq
->comment
.s
,kseq
->seq
.l
,kseq
->qual
.l
,ReadLength
,kseq
->f
->RealOffset
,
202 kseq
->seq
.s
,kseq
->qual
.s
,(size_t)&(kseq
->seq
.s
),ReadType
);
205 if (ReadLength
>UINT16_MAX
) {
206 ReadLength
=UINT16_MAX
;
208 } else if (ReadLength
<arguments
.seedLen
) {
212 if (ReadLength
>MaxReadLength
) MaxReadLength
=ReadLength
;
213 if (ReadsCount
+1>SeqArrayLength
) {
214 pSeqArray
= realloc(pSeqArray
,(SeqArrayLength
+READSCOUNT_INC
)*sizeof(size_t));
215 //memset(pSeqSeqArray+SeqMismatchArrayLength,0,READSCOUNT_INC*sizeof(size_t));
216 SeqArrayLength
+= READSCOUNT_INC
;
218 pSeqArray
[ReadsCount
]=malloc(ReadLength
+1);
219 strncpy(pSeqArray
[ReadsCount
],kseq
->seq
.s
,ReadLength
);
220 pSeqArray
[ReadsCount
][ReadLength
]='\0';
222 BasesCount
+= ReadLength
;
223 if (MaxReadLength
!= ReadLength
) diffRL
=1;
226 //printf(" lasttype:%d lastpos:%ld ------",ReadType,kseq->f->RealOffset);
228 fputs(": File not found !\n", stderr
);
231 fputs("\b\b\b\b, done !\n", stderr
);
234 uint8_t* pSeqMismatchArray
= calloc(ReadsCount
,sizeof(uint8_t));
236 fprintf(stderr
,"[!]Total_Reads: %ld\tMean_Read_Length: %f\n",ReadsCount
,(double)BasesCount
/(double)ReadsCount
);
238 KmerCacheTree
= prb_create(compare_fixed_strings
, &arguments
.seedLen
, NULL
);
239 MainKmerPosTree
= prb_create(compare_fixed_strings
, &arguments
.seedLen
, NULL
);
240 for(uint_fast32_t i
=0;i
<ReadsCount
;++i
) {
241 insertSeeds(pSeqArray
[i
],KmerCacheTree
,arguments
.seedLen
);
244 //uint64_t CmpCount=0; // CmpCount == ReadsCount*(ReadsCount-1)/2
246 uint64_t MisChgCount
=0;
247 for (size_t i
=0;i
<ReadsCount
;++i
) {
248 for (size_t j
=i
+1;j
<ReadsCount
;++j
) {
250 uint_fast16_t miscount
=compseq(pSeqArray
[i
],pSeqArray
[j
],arguments
.mismatch
);
253 if (pSeqMismatchArray
[i
]==0 || pSeqMismatchArray
[i
]>miscount
) {
254 pSeqMismatchArray
[i
]=miscount
;
257 if (pSeqMismatchArray
[j
]==0 || pSeqMismatchArray
[j
]>miscount
) {
258 pSeqMismatchArray
[j
]=miscount
;
265 for (size_t i
=0;i
<ReadsCount
;++i
) {
267 printf("%zd/%zd\t[%s] %d\n",i
,ReadsCount
,pSeqArray
[i
],(int)pSeqMismatchArray
[i
]);
269 ++pMismatchCount
[pSeqMismatchArray
[i
]];
272 fprintf(stderr
,"[!]CmpTimes:%ld, mismatch_OK:%ld, mismatch_Update:%ld\n",
273 ReadsCount
*(ReadsCount
-1)/2,MisCount
,MisChgCount
);
274 fputs("\nCount Done!\n\nMismatch\tReadsCount\n", stderr
);
275 if (diffRL
) fprintf(stderr
,"[!]Different Read Length Found. Mismatch may be in correct.\n");
277 FILE *fp
= fopen(arguments
.outfile
, "w");
278 fprintf(fp
,"#Total_Reads: %ld\n#Mean_Read_Length: %f\n#Max_mismatch: %d\n\n#Mismatch\tReadsCount\n",
279 ReadsCount
,(double)BasesCount
/(double)ReadsCount
,(int)arguments
.mismatch
);
280 for (uint_fast16_t i
=0;i
<=arguments
.mismatch
;++i
) {
281 if (pMismatchCount
[i
]) {
282 fprintf(fp
,"%d\t%ld\n",(int)i
,pMismatchCount
[i
]);
283 fprintf(stderr
,"%d\t%ld\n",(int)i
,pMismatchCount
[i
]);
286 fprintf(fp
,"\n# Mismatch=0 means unique.\n");
288 fputs("#There are reads longer than "CAT(UINT16_MAX
)".\n", fp
);
289 fputs("[!]There are reads longer than "CAT(UINT16_MAX
)".\n", stderr
);
292 fprintf(fp
, "#There are reads shorter than Seed_Length:%u.\n", (unsigned int)arguments
.seedLen
);
293 fprintf(stderr
, "#There are reads shorter than Seed_Length:%u.\n", (unsigned int)arguments
.seedLen
);
297 for (size_t i
=0;i
<ReadsCount
;++i
) {
301 free(pSeqMismatchArray
);
302 free(arguments
.args
);