modified: src1/common.h
[GalaxyCodeBases.git] / c_cpp / libgFileIO / gkseq.h
blob1c6ebc16e800d8b9e36aaf3651b46a13684d7edf
1 /* The MIT License
2 From Heng Li <lh3@sanger.ac.uk>, as in bwa package.
3 Macro translated by Hu Xuesong.
4 */
6 #ifndef _G_KSEQ_H
7 #define _G_KSEQ_H
9 #include <ctype.h>
10 #include <string.h>
11 #include <stdlib.h>
12 #include <zlib.h>
14 #define __GKSEQ_FILETYPE gzFile // Well, EP enough to use define here.
15 #define __GKSEQ_READFUNC gzread // Maybe "lh3" just want to make debug harder for his code ?
16 //#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
17 //#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
18 #define __GKSEQ_BUFSIZE 4096
20 #ifndef KROUNDUP32
21 #define KROUNDUP32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
22 #endif
24 //#74 "t.h" // append `KSEQ_INIT(gzFile, gzread)` to kseq.h, remove all `#include`, you will get "t.h".
25 #ifndef KSTRING_T
26 #define KSTRING_T kstring_t
27 typedef struct __kstring_t {
28 size_t l, m;
29 char *s;
30 } kstring_t;
31 #endif
33 //#215 "t.h" // gcc -E t.h > gkseq.h && indent -kr -brf -l120 -nut t.h -o gkseqn.h
34 typedef struct __kstream_t {
35 char *buf;
36 int begin, end, is_eof;
37 __GKSEQ_FILETYPE f;
38 } kstream_t;
40 typedef struct {
41 kstring_t name, comment, seq, qual;
42 int last_char;
43 kstream_t *f;
44 } kseq_t;
46 static inline kstream_t *ks_init(__GKSEQ_FILETYPE f) {
47 kstream_t *ks = (kstream_t *) calloc(1, sizeof(kstream_t));
48 ks->f = f;
49 ks->buf = (char *) malloc(__GKSEQ_BUFSIZE);
50 return ks;
53 static inline void ks_destroy(kstream_t * ks) {
54 if (ks) {
55 free(ks->buf);
56 free(ks);
60 static inline int ks_getc(kstream_t * ks) {
61 if (ks->is_eof && ks->begin >= ks->end)
62 return -1;
63 if (ks->begin >= ks->end) {
64 ks->begin = 0;
65 ks->end = __GKSEQ_READFUNC(ks->f, ks->buf, __GKSEQ_BUFSIZE);
66 if (ks->end < __GKSEQ_BUFSIZE)
67 ks->is_eof = 1;
68 if (ks->end == 0)
69 return -1;
71 return (int) ks->buf[ks->begin++];
74 static int ks_getuntil(kstream_t * ks, int delimiter, kstring_t * str, int *dret) {
75 if (dret)
76 *dret = 0;
77 str->l = 0;
78 if (ks->begin >= ks->end && ks->is_eof)
79 return -1;
80 for (;;) {
81 int i;
82 if (ks->begin >= ks->end) {
83 if (!ks->is_eof) {
84 ks->begin = 0;
85 ks->end = __GKSEQ_READFUNC(ks->f, ks->buf, __GKSEQ_BUFSIZE);
86 if (ks->end < __GKSEQ_BUFSIZE)
87 ks->is_eof = 1;
88 if (ks->end == 0)
89 break;
90 } else
91 break;
93 if (delimiter) {
94 for (i = ks->begin; i < ks->end; ++i)
95 if (ks->buf[i] == delimiter)
96 break;
97 } else {
98 for (i = ks->begin; i < ks->end; ++i)
99 if (isspace(ks->buf[i]))
100 break;
102 if (str->m - str->l < i - ks->begin + 1) {
103 str->m = str->l + (i - ks->begin) + 1;
104 KROUNDUP32(str->m);
105 str->s = (char *) realloc(str->s, str->m);
107 memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin);
108 str->l = str->l + (i - ks->begin);
109 ks->begin = i + 1;
110 if (i < ks->end) {
111 if (dret)
112 *dret = ks->buf[i];
113 break;
116 str->s[str->l] = '\0';
117 return str->l;
120 static inline kseq_t *kseq_init(__GKSEQ_FILETYPE fd) {
121 kseq_t *s = (kseq_t *) calloc(1, sizeof(kseq_t));
122 s->f = ks_init(fd);
123 /* realloc() may change the address,
124 thus, even we `malloc(32)` here and set `s->seq.m`,
125 `s->seq.s` still may be changed later on realloc().
127 return s;
130 static inline void kseq_rewind(kseq_t * ks) {
131 ks->last_char = 0;
132 ks->f->is_eof = ks->f->begin = ks->f->end = 0;
135 static inline void kseq_destroy(kseq_t * ks) {
136 if (!ks) return;
137 free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s);
138 ks_destroy(ks->f);
139 free(ks);
142 /* Return value:
143 // >=0 length of the sequence (normal)
144 1 FASTA, just bases
145 3 FASTQ, both bases and Q
146 -1 end-of-file
147 -2 truncated quality string
149 static int_fast8_t kseq_read(kseq_t * seq) { // better to be ssize_t
150 int c;
151 kstream_t *ks = seq->f;
152 if (seq->last_char == 0) { // then jump to the next header line
153 while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@');
154 if (c == -1)
155 return -1; // end of file
156 seq->last_char = c;
157 } // the first header char has been read
158 seq->comment.l = seq->seq.l = seq->qual.l = 0;
159 if (ks_getuntil(ks, 0, &seq->name, &c) < 0)
160 return -1;
161 if (c != '\n')
162 ks_getuntil(ks, '\n', &seq->comment, 0);
163 while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') {
164 if (isgraph(c)) { // printable non-space character
165 if (seq->seq.l + 1 >= seq->seq.m) { // double the memory
166 seq->seq.m = seq->seq.l + 2;
167 KROUNDUP32(seq->seq.m); // rounded to next closest 2^k
168 seq->seq.s = (char *) realloc(seq->seq.s, seq->seq.m);
170 seq->seq.s[seq->seq.l++] = (char) c;
173 if (c == '>' || c == '@')
174 seq->last_char = c; // the first header char has been read
175 seq->seq.s[seq->seq.l] = 0; // null terminated string
176 if (c != '+') {
177 //seq->qual.s[0] = 0; // set Q to "". also bool, 0 for no Q. JUST TEMP, should change back later !!!
178 return 1; // FASTA, only bases
180 if (seq->qual.m < seq->seq.m) { // allocate enough memory
181 seq->qual.m = seq->seq.m;
182 seq->qual.s = (char *) realloc(seq->qual.s, seq->qual.m);
184 while ((c = ks_getc(ks)) != -1 && c != '\n'); // skip the rest of '+' line
185 if (c == -1)
186 return -2; // we should not stop here
187 while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l)
188 if (c >= 33 && c <= 127)
189 seq->qual.s[seq->qual.l++] = (unsigned char) c;
190 seq->qual.s[seq->qual.l] = 0; // null terminated string
191 seq->last_char = 0; // we have not come to the next header line
192 if (seq->seq.l != seq->qual.l)
193 return -2; // qual string is shorter than seq string
194 //return seq->seq.l;
195 return 3; // both base and Q
198 #endif