5 #include <stdlib.h> //EXIT_FAILURE
16 #define READSCOUNT_INIT (128*1024)
17 #define READSCOUNT_INC (16*1024)
18 //#define READSCOUNT_INIT (16)
19 //#define READSCOUNT_INC (4)
21 const char *argp_program_version
=
22 "readsdedump simple 0.1 @"__TIME__
"," __DATE__
;
23 const char *argp_program_bug_address
=
24 "<huxuesong@genomics.org.cn>";
26 /* Program documentation. */
37 /* A description of the arguments we accept. */
38 static char args_doc
[] = "input_files (FASTA or FASTQ)";
40 /* The options we understand. */
41 static struct argp_option options
[] = {
42 {"maxmismatch", 'm', "10" ,0, "Max mismatch to be PCR duplicate"},
43 {"outfile", 'o', "./out.stat" ,0, "Output file" },
47 /* Used by main to communicate with parse_opt. */
49 uint_fast16_t mismatch
;
54 /* Parse a single option. */
56 parse_opt (int key
, char *arg
, struct argp_state
*state
) {
57 /* Get the input argument from argp_parse, which we
58 know is a pointer to our arguments structure. */
59 struct arguments
*arguments
= state
->input
;
63 arguments
->outfile
= arg
;
66 tmpArgValue
= atoi(arg
);
67 if (tmpArgValue
>=0 && tmpArgValue
<= UINT16_MAX
) {
68 arguments
->mismatch
= tmpArgValue
;
70 errx(2,"-m \"%s\"=%i is not between [0,%d] !",arg
,tmpArgValue
,UINT16_MAX
);
75 arguments
->args
[state
->arg_num
] = arg
;
79 if (state
->arg_num
< 1) /* Not enough arguments. */
81 argp_state_help (state
,stderr
,ARGP_HELP_STD_HELP
);
85 return ARGP_ERR_UNKNOWN
;
90 /* Our argp parser. */
91 static struct argp argp
= { options
, parse_opt
, args_doc
, doc
};
93 static inline uint_fast16_t compseq(char const* strA
, char const* strB
, uint_fast16_t maxMismatch
) {
94 uint_fast16_t mismatch
=0;
96 printf("[%s]-[%s]:\n",strA
,strB
);
98 while (*strA
&& *strB
) {
100 printf(" %.1s,%.1s:",strA
,strB
);
102 mismatch
+= (*strA
++ != *strB
++);
104 printf("%d ",(int)mismatch
);
106 if (mismatch
> maxMismatch
) {
119 int main (int argc
, char **argv
) {
120 struct arguments arguments
;
121 arguments
.args
=calloc(sizeof(size_t),argc
);
124 arguments
.outfile
= "./out.stat";
125 arguments
.mismatch
= 10;
126 char *const*line
= arguments
.args
-1; // so that we can use *(++line)
128 // Parse our arguments; every option seen by parse_opt will be reflected in arguments.
129 argp_parse (&argp
, argc
, argv
, 0, 0, &arguments
);
131 fprintf(stderr
,"Arguments: max_mismatch=%d, out_file=%s\nInput_files:",
132 (int)arguments
.mismatch
,arguments
.outfile
);
133 while ( *(++line
) ) {
134 fprintf(stderr
, " [%s]", *line
);
137 line
= arguments
.args
-1;
139 //uint8_t* pSeqMismatchArray = calloc(READSCOUNT_INIT,sizeof(uint8_t));
140 char** pSeqArray
= malloc(READSCOUNT_INIT
*sizeof(size_t));
141 size_t SeqArrayLength
= READSCOUNT_INIT
;
142 uint64_t* pMismatchCount
= calloc(arguments
.mismatch
+1,sizeof(uint64_t));
144 uint64_t BasesCount
=0;
145 uint64_t ReadsCount
=0;
146 uint64_t MaxReadLength
=0;
147 int_fast8_t diffRL
=0;
149 fputs("\nParsing Sequence Files:\n", stderr
);
153 int_fast8_t ReadType
;
155 while ( *(++line
) ) {
156 fprintf(stderr
, " <%s>", *line
);
157 kseq
= kseq_open(*line
);
158 uint64_t RealSize
,RealOffset
;
161 RealSize
= kseq
->f
->RealSize
;
162 fprintf(stderr
," RealSize:%ld ...",RealSize
);
163 while ( (ReadType
= kseq_read(kseq
)) > 0 ) {
164 ReadLength
= kseq
->seq
.l
;
165 RealOffset
= kseq
->f
->RealOffset
;
167 printf("\n-ID:[%s,%s] %zu %zu %zd %ld\nSeq:[%s]\nQ:[%s] *%zx,%d \n",
168 kseq
->name
.s
,kseq
->comment
.s
,kseq
->seq
.l
,kseq
->qual
.l
,ReadLength
,kseq
->f
->RealOffset
,
169 kseq
->seq
.s
,kseq
->qual
.s
,(size_t)&(kseq
->seq
.s
),ReadType
);
172 if (ReadLength
>MaxReadLength
) MaxReadLength
=ReadLength
;
173 if (ReadsCount
+1>SeqArrayLength
) {
174 pSeqArray
= realloc(pSeqArray
,(SeqArrayLength
+READSCOUNT_INC
)*sizeof(size_t));
175 //memset(pSeqSeqArray+SeqMismatchArrayLength,0,READSCOUNT_INC*sizeof(size_t));
176 SeqArrayLength
+= READSCOUNT_INC
;
178 pSeqArray
[ReadsCount
]=malloc(ReadLength
+1);
179 strcpy(pSeqArray
[ReadsCount
],kseq
->seq
.s
);
181 BasesCount
+= ReadLength
;
182 if (MaxReadLength
!= ReadLength
) diffRL
=1;
185 //printf(" lasttype:%d lastpos:%ld ------",ReadType,kseq->f->RealOffset);
187 fputs(": File not found !\n", stderr
);
190 fputs("\b\b\b\b, done !\n", stderr
);
193 uint8_t* pSeqMismatchArray
= calloc(ReadsCount
,sizeof(uint8_t));
195 fprintf(stderr
,"[!]Total_Reads: %ld\tMean_Read_Length: %f\n",ReadsCount
,(double)BasesCount
/(double)ReadsCount
);
197 //uint64_t CmpCount=0; // CmpCount == ReadsCount*(ReadsCount-1)/2
199 uint64_t MisChgCount
=0;
200 for (size_t i
=0;i
<ReadsCount
;++i
) {
201 for (size_t j
=i
+1;j
<ReadsCount
;++j
) {
203 uint_fast16_t miscount
=compseq(pSeqArray
[i
],pSeqArray
[j
],arguments
.mismatch
);
206 if (pSeqMismatchArray
[i
]==0 || pSeqMismatchArray
[i
]>miscount
) {
207 pSeqMismatchArray
[i
]=miscount
;
210 if (pSeqMismatchArray
[j
]==0 || pSeqMismatchArray
[j
]>miscount
) {
211 pSeqMismatchArray
[j
]=miscount
;
218 for (size_t i
=0;i
<ReadsCount
;++i
) {
220 printf("%zd/%zd\t[%s] %d\n",i
,ReadsCount
,pSeqArray
[i
],(int)pSeqMismatchArray
[i
]);
222 ++pMismatchCount
[pSeqMismatchArray
[i
]];
225 fprintf(stderr
,"[!]CmpTimes:%ld, mismatch_OK:%ld, mismatch_Update:%ld\n",
226 ReadsCount
*(ReadsCount
-1)/2,MisCount
,MisChgCount
);
227 fputs("\nCount Done!\n\nMismatch\tReadsCount\n", stderr
);
228 if (diffRL
) fprintf(stderr
,"[!]Different Read Length Found. Mismatch may be in correct.\n");
230 FILE *fp
= fopen(arguments
.outfile
, "w");
231 fprintf(fp
,"#Total_Reads: %ld\n#Mean_Read_Length: %f\n#Max_mismatch: %d\n\n#Mismatch\tReadsCount\n",
232 ReadsCount
,(double)BasesCount
/(double)ReadsCount
,(int)arguments
.mismatch
);
233 for (uint_fast16_t i
=0;i
<=arguments
.mismatch
;++i
) {
234 if (pMismatchCount
[i
]) {
235 fprintf(fp
,"%d\t%ld\n",(int)i
,pMismatchCount
[i
]);
236 fprintf(stderr
,"%d\t%ld\n",(int)i
,pMismatchCount
[i
]);
239 fprintf(fp
,"\n# Mismatch=0 means unique.\n");
242 for (size_t i
=0;i
<ReadsCount
;++i
) {
246 free(pSeqMismatchArray
);
247 free(arguments
.args
);