modified: src1/input.c
[GalaxyCodeBases.git] / c_cpp / lib / Galaxy / gkseq.h
blobf7b1c63b7ca88bb16ded997346d95294dd00c56b
1 /* The MIT License
2 From Heng Li <lh3@sanger.ac.uk>, as in bwa package.
3 Macro translated by Hu Xuesong.
5 Changelog:
6 20110817 > Add kseq_t *kseq_open(char *const filename);
7 To open:
8 kseq_t* kseq = kseq_open("filename");
9 // gzFile fp=gzopen("filename", "r"); kseq_t* kseq=kseq_init(fp);
10 To close:
11 kseq_destroy(kseq); // gzclose(fp);
12 > Add RealSize,RealOffset as kseq->f->*
15 #ifndef _G_KSEQ_H
16 #define _G_KSEQ_H
18 #include <ctype.h>
19 #include <stdint.h>
20 #include <string.h>
21 #include <stdlib.h>
22 #include <zlib.h>
23 //open
24 #include <sys/types.h>
25 #include <sys/stat.h>
26 #include <fcntl.h>
27 //lseek
28 #include <unistd.h>
30 #define __GKSEQ_FILETYPE gzFile // Well, EP enough to use define here.
31 #define __GKSEQ_READFUNC gzread // Maybe "lh3" just want to make debug harder for his code ?
32 #define __GKSEQ_OPENFUNC gzopen
33 #define __GKSEQ_CLOSEFUNC gzclose
34 //#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
35 //#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
36 #define __GKSEQ_BUFSIZE (4096)
38 #ifndef KROUNDUP32
39 #define KROUNDUP32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
40 #endif
44 // from gzip-1.4/gzip.h, added __gkseq_ / __GKSEQ_ prefix.
45 typedef unsigned char __gkseq_uch;
46 typedef unsigned short __gkseq_ush;
47 typedef unsigned long __gkseq_ulg;
48 /* Macros for getting two-byte and four-byte header values */
49 #define __GKSEQ_SH(p) ((__gkseq_ush)(__gkseq_uch)((p)[0]) | ((__gkseq_ush)(__gkseq_uch)((p)[1]) << 8))
50 #define __GKSEQ_LG(p) ((__gkseq_ulg)(__GKSEQ_SH(p)) | ((__gkseq_ulg)(__GKSEQ_SH((p)+2)) << 16))
51 // from gzip-1.4/gzip.h
55 //#74 "t.h" // append `KSEQ_INIT(gzFile, gzread)` to kseq.h, remove all `#include`, you will get "t.h".
56 #ifndef KSTRING_T
57 #define KSTRING_T kstring_t
58 typedef struct __kstring_t {
59 size_t l, m;
60 char *s;
61 } kstring_t;
62 #endif
64 //#215 "t.h" // gcc -E t.h > gkseq.h && indent -kr -brf -l120 -nut t.h -o gkseqn.h
66 /* RealSize: original (uncompressed) size of sequence file
67 RealOffset: file offset in RealSize. Begin from 0, should pointing to '>' or '@'.
68 ReadBuffer: current buffer count. The first buffer is 0. (init value is -1)
70 typedef struct __kstream_t {
71 char *buf;
72 int begin, end, is_eof;
73 unsigned long RealSize,RealOffset;
74 long ReadBuffer;
75 __GKSEQ_FILETYPE f;
76 } kstream_t;
78 typedef struct {
79 kstring_t name, comment, seq, qual;
80 int last_char;
81 kstream_t *f;
82 } kseq_t;
84 static inline kstream_t *ks_init(__GKSEQ_FILETYPE f) {
85 kstream_t *ks = (kstream_t *) calloc(1, sizeof(kstream_t));
86 ks->f = f;
87 ks->buf = (char *) malloc(__GKSEQ_BUFSIZE);
88 ks->ReadBuffer=-1L;
89 return ks;
92 static inline void ks_destroy(kstream_t * ks) {
93 if (ks) {
94 free(ks->buf);
95 __GKSEQ_CLOSEFUNC(ks->f);
96 free(ks);
100 static inline int ks_getc(kstream_t * ks) {
101 if (ks->is_eof && ks->begin >= ks->end)
102 return -1;
103 if (ks->begin >= ks->end) {
104 ks->begin = 0;
105 ks->end = __GKSEQ_READFUNC(ks->f, ks->buf, __GKSEQ_BUFSIZE);
106 ++ks->ReadBuffer;
107 if (ks->end < __GKSEQ_BUFSIZE)
108 ks->is_eof = 1;
109 if (ks->end == 0)
110 return -1;
112 return (int) ks->buf[ks->begin++];
115 static int ks_getuntil(kstream_t * ks, int delimiter, kstring_t * str, int *dret) {
116 if (dret)
117 *dret = 0;
118 str->l = 0;
119 if (ks->begin >= ks->end && ks->is_eof)
120 return -1;
121 for (;;) {
122 int i;
123 if (ks->begin >= ks->end) {
124 if (!ks->is_eof) {
125 ks->begin = 0;
126 ks->end = __GKSEQ_READFUNC(ks->f, ks->buf, __GKSEQ_BUFSIZE);
127 ++ks->ReadBuffer;
128 if (ks->end < __GKSEQ_BUFSIZE)
129 ks->is_eof = 1;
130 if (ks->end == 0)
131 break;
132 } else
133 break;
135 if (delimiter) {
136 for (i = ks->begin; i < ks->end; ++i)
137 if (ks->buf[i] == delimiter)
138 break;
139 } else {
140 for (i = ks->begin; i < ks->end; ++i)
141 if (isspace(ks->buf[i]))
142 break;
144 if (str->m - str->l < i - ks->begin + 1) {
145 str->m = str->l + (i - ks->begin) + 1;
146 KROUNDUP32(str->m);
147 str->s = (char *) realloc(str->s, str->m);
149 memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin);
150 str->l = str->l + (i - ks->begin);
151 ks->begin = i + 1;
152 if (i < ks->end) {
153 if (dret)
154 *dret = ks->buf[i];
155 break;
158 str->s[str->l] = '\0';
159 return str->l;
162 static inline kseq_t *kseq_init(__GKSEQ_FILETYPE fd) {
163 kseq_t *s = (kseq_t *) calloc(1, sizeof(kseq_t));
164 s->f = ks_init(fd);
165 /* realloc() may change the address,
166 thus, even we `malloc(32)` here and set `s->seq.m`,
167 `s->seq.s` still may be changed later on realloc().
169 return s;
171 static kseq_t *kseq_open(char *const filename) {
172 unsigned long realsize=0;
173 int ifd = open(filename, O_RDONLY);
174 if (ifd==-1) return NULL;
175 unsigned char buf[4];
176 off_t bytes_in = lseek(ifd, (off_t)(-4), SEEK_END);
177 if ( bytes_in != -1L) {
178 bytes_in += 4L;
179 if (read(ifd, (char*)buf, sizeof(buf)) == sizeof(buf)) {
180 realsize = __GKSEQ_LG(buf);
183 close(ifd);
184 __GKSEQ_FILETYPE fp;
185 fp = __GKSEQ_OPENFUNC(filename, "r");
186 kseq_t *s = kseq_init(fp);
187 s->f->RealSize = realsize;
188 return s;
191 static inline void kseq_rewind(kseq_t * ks) {
192 ks->last_char = 0;
193 ks->f->is_eof = ks->f->begin = ks->f->end = 0;
196 static inline void kseq_destroy(kseq_t * ks) {
197 if (!ks) return;
198 free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s);
199 ks_destroy(ks->f);
200 free(ks);
203 /* Return value:
204 // >=0 length of the sequence (normal)
205 1 FASTA, just bases
206 3 FASTQ, both bases and Q
207 -1 end-of-file
208 -2 truncated quality string
210 static int_fast8_t kseq_read(kseq_t * seq) { // better to be ssize_t
211 int c;
212 //uint_fast8_t with_last_chr = 0;
213 kstream_t *ks = seq->f;
214 if (seq->last_char == 0) { // then jump to the next header line
215 while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@');
216 if (c == -1)
217 return -1; // end of file
218 seq->last_char = c;
219 } // the first header char has been read
220 seq->comment.l = seq->seq.l = seq->qual.l = 0;
221 if (ks_getuntil(ks, 0, &seq->name, &c) < 0)
222 return -1;
223 if (c != '\n')
224 ks_getuntil(ks, '\n', &seq->comment, 0);
225 while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') {
226 if (isgraph(c)) { // printable non-space character
227 if (seq->seq.l + 1 >= seq->seq.m) { // double the memory
228 seq->seq.m = seq->seq.l + 2;
229 KROUNDUP32(seq->seq.m); // rounded to next closest 2^k
230 seq->seq.s = (char *) realloc(seq->seq.s, seq->seq.m);
232 seq->seq.s[seq->seq.l++] = (char) c;
235 if (c == '>' || c == '@')
236 seq->last_char = c; // the first header char has been read
237 //with_last_chr = 1;
238 seq->seq.s[seq->seq.l] = 0; // null terminated string
239 if (c != '+') {
240 //seq->qual.s[0] = 0; // set Q to "". also bool, 0 for no Q. JUST TEMP, should change back later !!!
241 seq->f->RealOffset = __GKSEQ_BUFSIZE*(seq->f->ReadBuffer) + seq->f->begin - 1;//with_last_chr;
242 return 1; // FASTA, only bases
244 if (seq->qual.m < seq->seq.m) { // allocate enough memory
245 seq->qual.m = seq->seq.m;
246 seq->qual.s = (char *) realloc(seq->qual.s, seq->qual.m);
248 while ((c = ks_getc(ks)) != -1 && c != '\n'); // skip the rest of '+' line
249 if (c == -1)
250 return -2; // we should not stop here
251 while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l)
252 if (c >= 33 && c <= 127)
253 seq->qual.s[seq->qual.l++] = (unsigned char) c;
254 seq->qual.s[seq->qual.l] = 0; // null terminated string
255 seq->last_char = 0; // we have not come to the next header line
256 //with_last_chr = 0;
257 seq->f->RealOffset = __GKSEQ_BUFSIZE*(seq->f->ReadBuffer)+seq->f->begin;
258 if (seq->seq.l != seq->qual.l)
259 return -2; // qual string is shorter than seq string
260 //return seq->seq.l;
261 return 3; // both base and Q
264 #endif