modified: src1/input.c
[GalaxyCodeBases.git] / tools / bioawk / kseq.h
bloba5cec7c289aeda5fe36edb2a828a9f669741d3a3
1 /* The MIT License
3 Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 SOFTWARE.
26 /* Last Modified: 05MAR2012 */
28 #ifndef AC_KSEQ_H
29 #define AC_KSEQ_H
31 #include <ctype.h>
32 #include <string.h>
33 #include <stdlib.h>
/*
 * Delimiter classes understood by ks_getuntil()/ks_getuntil2().
 * Values above KS_SEP_MAX are taken as a literal delimiter character.
 */
#define KS_SEP_SPACE 0 /* any byte for which isspace() is true */
#define KS_SEP_TAB   1 /* isspace() bytes other than ' ' */
#define KS_SEP_LINE  2 /* '\n'; the '\r' of a "\r\n" ending is stripped as well */
#define KS_SEP_MAX   2 /* largest reserved delimiter class */
/* Declares kstream_t: a buffered reader wrapped around a handle of type type_t. */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		unsigned char *buf; /* read buffer */ \
		int begin;          /* index of the next unread byte in buf */ \
		int end;            /* number of bytes the last read put into buf */ \
		int is_eof;         /* non-zero once the source has been exhausted */ \
		type_t f;           /* underlying handle passed to the read function */ \
	} kstream_t;
/* Non-zero when the buffer is fully consumed and the source reported end of input. */
#define ks_eof(ks) ((ks)->begin >= (ks)->end && (ks)->is_eof)
/* Clears the buffer bookkeeping; evaluates to 0. Only the three fields are touched. */
#define ks_rewind(ks) ((ks)->end = 0, (ks)->begin = 0, (ks)->is_eof = 0)
/*
 * Generates the constructor/destructor pair for kstream_t.
 *
 * ks_init(f)     allocates a zeroed stream with a __bufsize-byte buffer around
 *                handle f. Returns NULL if either allocation fails, so no
 *                half-built stream is ever handed out.
 * ks_destroy(ks) frees the buffer and the stream; a NULL ks is ignored. The
 *                handle f is not closed here.
 */
#define __KS_BASIC(type_t, __bufsize) \
	static inline kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		if (ks == NULL) return NULL; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		if (ks->buf == NULL) { /* a stream without a buffer is unusable */ \
			free(ks); \
			return NULL; \
		} \
		ks->f = f; \
		return ks; \
	} \
	static inline void ks_destroy(kstream_t *ks) \
	{ \
		if (ks) { \
			free(ks->buf); \
			free(ks); \
		} \
	}
/*
 * Generates ks_getc(ks): returns the next byte of the stream as a
 * non-negative int, or -1 when no more data is available.
 *
 * The buffer is refilled with __read(f, buf, __bufsize) when empty. End of
 * input is recognised only by a refill that returns 0: a short, non-zero read
 * is normal data (pipes and sockets routinely deliver fewer bytes than
 * requested). A negative result (read error) also ends the stream instead of
 * being used as a buffer length.
 */
#define __KS_GETC(__read, __bufsize) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->begin >= ks->end) { \
			if (ks->is_eof) return -1; \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, __bufsize); \
			if (ks->end <= 0) { /* 0: end of input; <0: read error */ \
				ks->end = 0; /* keep begin/end consistent for ks_eof() */ \
				ks->is_eof = 1; \
				return -1; \
			} \
		} \
		return (int)ks->buf[ks->begin++]; \
	}
#ifndef KSTRING_T
#define KSTRING_T kstring_t
/* Growable character buffer filled by the stream readers. */
typedef struct __kstring_t {
	size_t l; /* bytes in use, terminator not counted */
	size_t m; /* bytes allocated for s */
	char *s;  /* heap buffer; may still be NULL before the first read */
} kstring_t;
#endif
#ifndef kroundup32
/*
 * Rounds x up to the next power of two, in place (x must be a modifiable
 * lvalue and is evaluated several times). It smears the highest set bit of
 * x-1 into all lower bits and then adds one. Shifts stop at 16, so only the
 * low 32 bits are covered.
 */
#define kroundup32(x) \
	(--(x), \
	 (x) |= (x) >> 1, (x) |= (x) >> 2, (x) |= (x) >> 4, \
	 (x) |= (x) >> 8, (x) |= (x) >> 16, \
	 ++(x))
#endif
/*
 * Generates ks_getuntil2() and its wrapper ks_getuntil().
 *
 * ks_getuntil2(ks, delimiter, str, dret, append) copies bytes from the stream
 * into str up to, but not including, the next delimiter and consumes that
 * delimiter.
 *   delimiter  KS_SEP_LINE, KS_SEP_SPACE, KS_SEP_TAB, or any other value,
 *              which is compared literally against each byte
 *   str        destination buffer, grown with realloc() as needed and
 *              NUL-terminated whenever the return value is non-negative
 *   dret       if non-NULL, receives the delimiter byte that ended the token,
 *              or 0 if the input ended first
 *   append     non-zero to append to str rather than overwrite it
 * Returns the resulting length of str, or -1 if the stream was already
 * exhausted and no data could be examined.
 *
 * End of input is detected only by a refill returning 0 (a short read is not
 * EOF); a negative refill result (read error) is treated the same way.
 * NOTE(review): the realloc()/calloc() results are not checked, as in the
 * original; an error path would need a return code callers can tell from EOF.
 *
 * ks_getuntil() is ks_getuntil2() with append == 0.
 */
#define __KS_GETUNTIL(__read, __bufsize) \
	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{ \
		int gotany = 0; /* set once buffered data has been examined */ \
		if (dret) *dret = 0; \
		str->l = append? str->l : 0; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { \
				if (ks->is_eof) break; \
				ks->begin = 0; \
				ks->end = __read(ks->f, ks->buf, __bufsize); \
				if (ks->end <= 0) { /* 0: end of input; <0: read error */ \
					ks->end = 0; \
					ks->is_eof = 1; \
					break; \
				} \
			} \
			if (delimiter == KS_SEP_LINE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == '\n') break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else { /* literal delimiter; an out-of-range value simply never matches */ \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { /* room for chunk + NUL */ \
				str->m = str->l + (i - ks->begin) + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			gotany = 1; \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; /* step over the delimiter (or past the buffer end) */ \
			if (i < ks->end) { /* delimiter found inside the buffer */ \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (!gotany) return -1; /* nothing left: do not report an empty token */ \
		if (str->s == 0) { \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0'; \
		return (int)str->l; \
	} \
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
/*
 * Instantiates the whole byte-stream layer for one handle type: the kstream_t
 * type first, then ks_init/ks_destroy, ks_getc and ks_getuntil/ks_getuntil2.
 * __read is called as __read(handle, buffer, __bufsize).
 */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(type_t, __bufsize) \
	__KS_GETC(__read, __bufsize) \
	__KS_GETUNTIL(__read, __bufsize)
/* Clears the parser's pending header character and its stream's buffer state; evaluates to 0. */
#define kseq_rewind(ks) \
	((ks)->f->end = 0, (ks)->f->begin = 0, (ks)->f->is_eof = 0, (ks)->last_char = 0)
/*
 * Generates the constructor/destructor pair for kseq_t with linkage SCOPE.
 *
 * kseq_init(fd)    allocates a zeroed parser and a stream around fd. Returns
 *                  NULL if the parser cannot be allocated or if ks_init()
 *                  returns NULL; nothing is leaked in either case.
 * kseq_destroy(ks) frees the four string buffers, the stream and the parser;
 *                  a NULL ks is ignored. The handle fd is not closed here.
 */
#define __KSEQ_BASIC(SCOPE, type_t) \
	SCOPE kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		if (s == NULL) return NULL; \
		s->f = ks_init(fd); \
		if (s->f == NULL) { /* no stream: release the half-built parser */ \
			free(s); \
			return NULL; \
		} \
		return s; \
	} \
	SCOPE void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}
/*
 * Generates kseq_read(seq): reads the next record from seq->f into
 * seq->name, seq->comment, seq->seq and, when a '+' line follows the
 * sequence, seq->qual.
 *
 * Return value:
 *   >=0  length of the sequence (normal)
 *   -1   end-of-file
 *   -2   truncated quality string
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		kstream_t *ks = seq->f; \
		int c; \
		if (seq->last_char == 0) { /* no header marker pending: scan forward to one */ \
			do { \
				c = ks_getc(ks); \
			} while (c != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* input exhausted */ \
			seq->last_char = c; \
		} /* otherwise the marker was already consumed by the previous call */ \
		seq->qual.l = 0; /* start the record with empty fields */ \
		seq->seq.l = 0; \
		seq->comment.l = 0; \
		if (ks_getuntil(ks, KS_SEP_SPACE, &seq->name, &c) < 0) return -1; /* regular EOF exit */ \
		if (c != '\n') { /* header continues after the name: keep it as the comment */ \
			ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); \
		} \
		if (seq->seq.s == 0) { /* allocate up front so the loop below can store one byte unchecked */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		for (;;) { \
			c = ks_getc(ks); \
			if (c == -1 || c == '>' || c == '+' || c == '@') break; \
			if (c == '\n') continue; /* blank line */ \
			seq->seq.s[seq->seq.l++] = c; /* one spare byte is always available here */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* append the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* next record's marker is already consumed */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* make sure the terminator written below fits */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0; /* NUL-terminate */ \
		if (c != '+') return seq->seq.l; /* no quality section follows */ \
		if (seq->qual.m < seq->seq.m) { /* quality buffer must be at least as large as seq's */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		do { /* discard the remainder of the '+' line */ \
			c = ks_getc(ks); \
		} while (c != -1 && c != '\n'); \
		if (c == -1) return -2; /* input ended before any quality line */ \
		for (;;) { /* append quality lines until as long as the sequence, or input ends */ \
			if (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) < 0) break; \
			if (seq->qual.l >= seq->seq.l) break; \
		} \
		seq->last_char = 0; /* the next header marker has not been read yet */ \
		if (seq->seq.l != seq->qual.l) return -2; /* quality length differs from sequence length */ \
		return seq->seq.l; \
	}
/* Declares kseq_t: one parsed record plus the parser state that produced it. */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name;    /* first whitespace-delimited word of the header */ \
		kstring_t comment; /* rest of the header line, if any */ \
		kstring_t seq;     /* sequence with line breaks removed */ \
		kstring_t qual;    /* quality string; length 0 when no '+' section was read */ \
		int last_char;     /* header marker already consumed for the next record, or 0 */ \
		kstream_t *f;      /* input stream */ \
	} kseq_t;
/*
 * Size in bytes of the read buffer used by streams created through
 * KSEQ_INIT/KSEQ_INIT2. Define KSEQ_BUFSIZE before including this header to
 * override the default.
 */
#ifndef KSEQ_BUFSIZE
#define KSEQ_BUFSIZE 16384
#endif

/*
 * Instantiates the stream layer and the record parser for handle type type_t.
 * SCOPE is the storage-class prefix of kseq_init/kseq_destroy/kseq_read;
 * __read is called as __read(handle, buffer, size).
 */
#define KSEQ_INIT2(SCOPE, type_t, __read) \
	KSTREAM_INIT(type_t, __read, KSEQ_BUFSIZE) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(SCOPE, type_t) \
	__KSEQ_READ(SCOPE)

/* KSEQ_INIT2 with the parser functions private to the including file. */
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
/*
 * Declares the kstream_t/kseq_t types and the three parser entry points
 * without defining them, for files that only need the prototypes.
 */
#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	extern void kseq_destroy(kseq_t *ks); \
	extern int kseq_read(kseq_t *seq);
235 #endif