modified: makesrc3.mk
[GalaxyCodeBases.git] / c_cpp / faststater / gkseq.h
blob933b5191878b1db7554e614189b5ccd287e75bf5
1 /* The MIT License
2 From Heng Li <lh3@sanger.ac.uk>, as in bwa package.
3 Macro translated by Hu Xuesong.
4 */
6 #ifndef _G_KSEQ_H
7 #define _G_KSEQ_H
9 #include <ctype.h>
10 #include <string.h>
11 #include <stdlib.h>
12 #include <zlib.h>
14 #define __GKSEQ_FILETYPE gzFile // Well, EP enough to use define here.
15 #define __GKSEQ_READFUNC gzread // Maybe "lh3" just want to make debug harder for his code ?
16 //#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
17 //#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
18 #define __GKSEQ_BUFSIZE 4096
20 #ifndef KROUNDUP32
21 #define KROUNDUP32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
22 #endif
24 #ifndef KSTRING_T
25 #define KSTRING_T kstring_t
26 typedef struct __kstring_t {
27 size_t l, m;
28 char *s;
29 } kstring_t;
30 #endif
32 typedef struct __kstream_t {
33 char *buf;
34 int begin, end, is_eof;
35 __GKSEQ_FILETYPE f;
36 } kstream_t;
38 typedef struct {
39 kstring_t name, comment, seq, qual;
40 int last_char;
41 kstream_t *f;
42 } kseq_t;
44 static inline kstream_t *ks_init(__GKSEQ_FILETYPE f) {
45 kstream_t *ks = (kstream_t *) calloc(1, sizeof(kstream_t));
46 ks->f = f;
47 ks->buf = (char *) malloc(__GKSEQ_BUFSIZE);
48 return ks;
51 static inline void ks_destroy(kstream_t * ks) {
52 if (ks) {
53 free(ks->buf);
54 free(ks);
58 static inline int ks_getc(kstream_t * ks) {
59 if (ks->is_eof && ks->begin >= ks->end)
60 return -1;
61 if (ks->begin >= ks->end) {
62 ks->begin = 0;
63 ks->end = __GKSEQ_READFUNC(ks->f, ks->buf, __GKSEQ_BUFSIZE);
64 if (ks->end < __GKSEQ_BUFSIZE)
65 ks->is_eof = 1;
66 if (ks->end == 0)
67 return -1;
69 return (int) ks->buf[ks->begin++];
72 static int ks_getuntil(kstream_t * ks, int delimiter, kstring_t * str, int *dret) {
73 if (dret)
74 *dret = 0;
75 str->l = 0;
76 if (ks->begin >= ks->end && ks->is_eof)
77 return -1;
78 for (;;) {
79 int i;
80 if (ks->begin >= ks->end) {
81 if (!ks->is_eof) {
82 ks->begin = 0;
83 ks->end = __GKSEQ_READFUNC(ks->f, ks->buf, __GKSEQ_BUFSIZE);
84 if (ks->end < __GKSEQ_BUFSIZE)
85 ks->is_eof = 1;
86 if (ks->end == 0)
87 break;
88 } else
89 break;
91 if (delimiter) {
92 for (i = ks->begin; i < ks->end; ++i)
93 if (ks->buf[i] == delimiter)
94 break;
95 } else {
96 for (i = ks->begin; i < ks->end; ++i)
97 if (isspace(ks->buf[i]))
98 break;
100 if (str->m - str->l < i - ks->begin + 1) {
101 str->m = str->l + (i - ks->begin) + 1;
102 KROUNDUP32(str->m);
103 str->s = (char *) realloc(str->s, str->m);
105 memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin);
106 str->l = str->l + (i - ks->begin);
107 ks->begin = i + 1;
108 if (i < ks->end) {
109 if (dret)
110 *dret = ks->buf[i];
111 break;
114 str->s[str->l] = '\0';
115 return str->l;
118 static inline kseq_t *kseq_init(__GKSEQ_FILETYPE fd) {
119 kseq_t *s = (kseq_t *) calloc(1, sizeof(kseq_t));
120 s->f = ks_init(fd);
121 return s;
124 static inline void kseq_rewind(kseq_t * ks) {
125 ks->last_char = 0;
126 ks->f->is_eof = ks->f->begin = ks->f->end = 0;
129 static inline void kseq_destroy(kseq_t * ks) {
130 if (!ks) return;
131 free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s);
132 ks_destroy(ks->f);
133 free(ks);
136 /* Return value:
137 // >=0 length of the sequence (normal)
138 1 FASTA, just bases
139 3 FASTQ, both bases and Q
140 -1 end-of-file
141 -2 truncated quality string
143 static int_fast8_t kseq_read(kseq_t * seq) {
144 int c;
145 kstream_t *ks = seq->f;
146 if (seq->last_char == 0) { // then jump to the next header line
147 while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@');
148 if (c == -1)
149 return -1; // end of file
150 seq->last_char = c;
151 } // the first header char has been read
152 seq->comment.l = seq->seq.l = seq->qual.l = 0;
153 if (ks_getuntil(ks, 0, &seq->name, &c) < 0)
154 return -1;
155 if (c != '\n')
156 ks_getuntil(ks, '\n', &seq->comment, 0);
157 while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') {
158 if (isgraph(c)) { // printable non-space character
159 if (seq->seq.l + 1 >= seq->seq.m) { // double the memory
160 seq->seq.m = seq->seq.l + 2;
161 KROUNDUP32(seq->seq.m); // rounded to next closest 2^k
162 seq->seq.s = (char *) realloc(seq->seq.s, seq->seq.m);
164 seq->seq.s[seq->seq.l++] = (char) c;
167 if (c == '>' || c == '@')
168 seq->last_char = c; // the first header char has been read
169 seq->seq.s[seq->seq.l] = 0; // null terminated string
170 if (c != '+') {
171 return 1; // FASTA, only bases
173 if (seq->qual.m < seq->seq.m) { // allocate enough memory
174 seq->qual.m = seq->seq.m;
175 seq->qual.s = (char *) realloc(seq->qual.s, seq->qual.m);
177 while ((c = ks_getc(ks)) != -1 && c != '\n'); // skip the rest of '+' line
178 if (c == -1)
179 return -2; // we should not stop here
180 while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l)
181 if (c >= 33 && c <= 127)
182 seq->qual.s[seq->qual.l++] = (unsigned char) c;
183 seq->qual.s[seq->qual.l] = 0; // null terminated string
184 seq->last_char = 0; // we have not come to the next header line
185 if (seq->seq.l != seq->qual.l)
186 return -2; // qual string is shorter than seq string
187 //return seq->seq.l;
188 return 3; // both base and Q
191 #endif