2 From Heng Li <lh3@sanger.ac.uk>, as in bwa package.
3 Macro translated by Hu Xuesong.
6 20110817 > Add kseq_t *kseq_open(char *const filename);
8 kseq_t* kseq = kseq_open("filename");
9 // gzFile fp=gzopen("filename", "r"); kseq_t* kseq=kseq_init(fp);
11 kseq_destroy(kseq); // gzclose(fp);
12 > Add RealSize,RealOffset as kseq->f->*
24 #include <sys/types.h>
30 #define __GKSEQ_FILETYPE gzFile // Well, EP enough to use define here.
31 #define __GKSEQ_READFUNC gzread // Maybe "lh3" just want to make debug harder for his code ?
32 #define __GKSEQ_OPENFUNC gzopen
33 #define __GKSEQ_CLOSEFUNC gzclose
34 //#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
35 //#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
36 #define __GKSEQ_BUFSIZE (4096)
39 #define KROUNDUP32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
44 // from gzip-1.4/gzip.h, added __gkseq_ / __GKSEQ_ prefix.
45 typedef unsigned char __gkseq_uch
;
46 typedef unsigned short __gkseq_ush
;
47 typedef unsigned long __gkseq_ulg
;
48 /* Macros for getting two-byte and four-byte header values */
49 #define __GKSEQ_SH(p) ((__gkseq_ush)(__gkseq_uch)((p)[0]) | ((__gkseq_ush)(__gkseq_uch)((p)[1]) << 8))
50 #define __GKSEQ_LG(p) ((__gkseq_ulg)(__GKSEQ_SH(p)) | ((__gkseq_ulg)(__GKSEQ_SH((p)+2)) << 16))
51 // from gzip-1.4/gzip.h
55 //#74 "t.h" // append `KSEQ_INIT(gzFile, gzread)` to kseq.h, remove all `#include`, you will get "t.h".
57 #define KSTRING_T kstring_t
58 typedef struct __kstring_t
{
64 //#215 "t.h" // gcc -E t.h > gkseq.h && indent -kr -brf -l120 -nut t.h -o gkseqn.h
66 /* RealSize: original (uncompressed) size of sequence file
67 RealOffset: file offset in RealSize. Begin from 0, should pointing to '>' or '@'.
68 ReadBuffer: current buffer count. The first buffer is 0. (init value is -1)
70 typedef struct __kstream_t
{
72 int begin
, end
, is_eof
;
73 unsigned long RealSize
,RealOffset
;
79 kstring_t name
, comment
, seq
, qual
;
84 static inline kstream_t
*ks_init(__GKSEQ_FILETYPE f
) {
85 kstream_t
*ks
= (kstream_t
*) calloc(1, sizeof(kstream_t
));
87 ks
->buf
= (char *) malloc(__GKSEQ_BUFSIZE
);
92 static inline void ks_destroy(kstream_t
* ks
) {
95 __GKSEQ_CLOSEFUNC(ks
->f
);
100 static inline int ks_getc(kstream_t
* ks
) {
101 if (ks
->is_eof
&& ks
->begin
>= ks
->end
)
103 if (ks
->begin
>= ks
->end
) {
105 ks
->end
= __GKSEQ_READFUNC(ks
->f
, ks
->buf
, __GKSEQ_BUFSIZE
);
107 if (ks
->end
< __GKSEQ_BUFSIZE
)
112 return (int) ks
->buf
[ks
->begin
++];
115 static int ks_getuntil(kstream_t
* ks
, int delimiter
, kstring_t
* str
, int *dret
) {
119 if (ks
->begin
>= ks
->end
&& ks
->is_eof
)
123 if (ks
->begin
>= ks
->end
) {
126 ks
->end
= __GKSEQ_READFUNC(ks
->f
, ks
->buf
, __GKSEQ_BUFSIZE
);
128 if (ks
->end
< __GKSEQ_BUFSIZE
)
136 for (i
= ks
->begin
; i
< ks
->end
; ++i
)
137 if (ks
->buf
[i
] == delimiter
)
140 for (i
= ks
->begin
; i
< ks
->end
; ++i
)
141 if (isspace(ks
->buf
[i
]))
144 if (str
->m
- str
->l
< i
- ks
->begin
+ 1) {
145 str
->m
= str
->l
+ (i
- ks
->begin
) + 1;
147 str
->s
= (char *) realloc(str
->s
, str
->m
);
149 memcpy(str
->s
+ str
->l
, ks
->buf
+ ks
->begin
, i
- ks
->begin
);
150 str
->l
= str
->l
+ (i
- ks
->begin
);
158 str
->s
[str
->l
] = '\0';
162 static inline kseq_t
*kseq_init(__GKSEQ_FILETYPE fd
) {
163 kseq_t
*s
= (kseq_t
*) calloc(1, sizeof(kseq_t
));
165 /* realloc() may change the address,
166 thus, even we `malloc(32)` here and set `s->seq.m`,
167 `s->seq.s` still may be changed later on realloc().
171 static kseq_t
*kseq_open(char *const filename
) {
172 unsigned long realsize
=0;
173 int ifd
= open(filename
, O_RDONLY
);
174 if (ifd
==-1) return NULL
;
175 unsigned char buf
[4];
176 off_t bytes_in
= lseek(ifd
, (off_t
)(-4), SEEK_END
);
177 if ( bytes_in
!= -1L) {
179 if (read(ifd
, (char*)buf
, sizeof(buf
)) == sizeof(buf
)) {
180 realsize
= __GKSEQ_LG(buf
);
185 fp
= __GKSEQ_OPENFUNC(filename
, "r");
186 kseq_t
*s
= kseq_init(fp
);
187 s
->f
->RealSize
= realsize
;
191 static inline void kseq_rewind(kseq_t
* ks
) {
193 ks
->f
->is_eof
= ks
->f
->begin
= ks
->f
->end
= 0;
196 static inline void kseq_destroy(kseq_t
* ks
) {
198 free(ks
->name
.s
); free(ks
->comment
.s
); free(ks
->seq
.s
); free(ks
->qual
.s
);
204 // >=0 length of the sequence (normal)
206 3 FASTQ, both bases and Q
208 -2 truncated quality string
210 static int_fast8_t kseq_read(kseq_t
* seq
) { // better to be ssize_t
212 //uint_fast8_t with_last_chr = 0;
213 kstream_t
*ks
= seq
->f
;
214 if (seq
->last_char
== 0) { // then jump to the next header line
215 while ((c
= ks_getc(ks
)) != -1 && c
!= '>' && c
!= '@');
217 return -1; // end of file
219 } // the first header char has been read
220 seq
->comment
.l
= seq
->seq
.l
= seq
->qual
.l
= 0;
221 if (ks_getuntil(ks
, 0, &seq
->name
, &c
) < 0)
224 ks_getuntil(ks
, '\n', &seq
->comment
, 0);
225 while ((c
= ks_getc(ks
)) != -1 && c
!= '>' && c
!= '+' && c
!= '@') {
226 if (isgraph(c
)) { // printable non-space character
227 if (seq
->seq
.l
+ 1 >= seq
->seq
.m
) { // double the memory
228 seq
->seq
.m
= seq
->seq
.l
+ 2;
229 KROUNDUP32(seq
->seq
.m
); // rounded to next closest 2^k
230 seq
->seq
.s
= (char *) realloc(seq
->seq
.s
, seq
->seq
.m
);
232 seq
->seq
.s
[seq
->seq
.l
++] = (char) c
;
235 if (c
== '>' || c
== '@')
236 seq
->last_char
= c
; // the first header char has been read
238 seq
->seq
.s
[seq
->seq
.l
] = 0; // null terminated string
240 //seq->qual.s[0] = 0; // set Q to "". also bool, 0 for no Q. JUST TEMP, should change back later !!!
241 seq
->f
->RealOffset
= __GKSEQ_BUFSIZE
*(seq
->f
->ReadBuffer
) + seq
->f
->begin
- 1;//with_last_chr;
242 return 1; // FASTA, only bases
244 if (seq
->qual
.m
< seq
->seq
.m
) { // allocate enough memory
245 seq
->qual
.m
= seq
->seq
.m
;
246 seq
->qual
.s
= (char *) realloc(seq
->qual
.s
, seq
->qual
.m
);
248 while ((c
= ks_getc(ks
)) != -1 && c
!= '\n'); // skip the rest of '+' line
250 return -2; // we should not stop here
251 while ((c
= ks_getc(ks
)) != -1 && seq
->qual
.l
< seq
->seq
.l
)
252 if (c
>= 33 && c
<= 127)
253 seq
->qual
.s
[seq
->qual
.l
++] = (unsigned char) c
;
254 seq
->qual
.s
[seq
->qual
.l
] = 0; // null terminated string
255 seq
->last_char
= 0; // we have not come to the next header line
257 seq
->f
->RealOffset
= __GKSEQ_BUFSIZE
*(seq
->f
->ReadBuffer
)+seq
->f
->begin
;
258 if (seq
->seq
.l
!= seq
->qual
.l
)
259 return -2; // qual string is shorter than seq string
261 return 3; // both base and Q