modified: makefile
[GalaxyCodeBases.git] / tools / bioawk / addon.c
bloba57223750c19643850afaec271c5e59d75d40ce8
1 #include <math.h>
2 #include <ctype.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include <stdlib.h>
6 #include "awk.h"
8 int bio_flag = 0, bio_fmt = BIO_NULL;
10 static const char *col_defs[][15] = { /* FIXME: this is convenient, but not memory efficient. Shouldn't matter. */
11 {"header", NULL},
12 {"bed", "chrom", "start", "end", "name", "score", "strand", "thickstart", "thickend", "rgb", "blockcount", "blocksizes", "blockstarts", NULL},
13 {"sam", "qname", "flag", "rname", "pos", "mapq", "cigar", "rnext", "pnext", "tlen", "seq", "qual", NULL},
14 {"vcf", "chrom", "pos", "id", "ref", "alt", "qual", "filter", "info", NULL},
15 {"gff", "seqname", "source", "feature", "start", "end", "score", "filter", "strand", "group", "attribute", NULL},
16 {"fastx", "name", "seq", "qual", "comment", NULL},
17 {NULL}
20 static const char *tab_delim = "nyyyyyn", *hdr_chr = "\0#@##\0\0";
22 /************************
23 * Setting column names *
24 ************************/
26 static void set_colnm_aux(const char *p, int col)
28 const char *q;
29 char *r = 0;
30 Cell *x;
31 for (q = p; *q; ++q) /* test if there are punctuations */
32 if (ispunct(*q) && *q != '_') break;
33 if (*q || isdigit(*p)) { /* there are punctuations or the first is digit */
34 char *qq;
35 r = malloc(strlen(p) + 2);
36 if (isdigit(*p)) {
37 *r = '_';
38 strcpy(r + 1, p);
39 } else strcpy(r, p);
40 for (qq = r; *qq; ++qq)
41 if (ispunct(*qq)) *qq = '_';
42 q = r;
43 } else q = p;
44 if ((x = lookup(q, symtab)) != NULL) /* do not add if not appear in the program */
45 setfval(x, (Awkfloat)col);
46 if (r) free(r);
49 int bio_get_fmt(const char *s)
51 int i, j;
52 if (strcmp(s, "hdr") == 0) return BIO_HDR;
53 for (i = 0; col_defs[i][0]; ++i)
54 if (strcmp(s, col_defs[i][0]) == 0) return i;
55 for (i = 1; col_defs[i][0]; ++i) {
56 printf("%s:\n\t", col_defs[i][0]);
57 for (j = 1; col_defs[i][j]; ++j)
58 printf("%d:%s ", j, col_defs[i][j]);
59 putchar('\n');
61 return BIO_NULL;
64 int bio_skip_hdr(const char *r)
66 if (bio_fmt <= BIO_HDR) return 0;
67 if (*r && *r == hdr_chr[bio_fmt]) {
68 if (bio_flag & BIO_SHOW_HDR) puts(r);
69 return 1;
70 } else return 0;
73 void bio_set_colnm()
75 int i;
76 if (bio_fmt == BIO_NULL) {
77 return;
78 } else if (bio_fmt == BIO_HDR) {
79 char *p, *q, c;
80 for (p = record; *p && isspace(*p); ++p); /* skip leading spaces */
81 for (i = 1, q = p; *q; ++q) {
82 if (!isspace(*q)) continue;
83 c = *q; /* backup the space */
84 *q = 0; /* terminate the field */
85 set_colnm_aux(p, i);
86 *q = c; /* change back */
87 ++i;
88 for (p = q + 1; *p && isspace(*p); ++p); /* skip contiguous spaces */
89 q = p;
91 set_colnm_aux(p, i); /* the last column */
92 } else {
93 for (i = 0; col_defs[bio_fmt][i] != NULL; ++i)
94 set_colnm_aux(col_defs[bio_fmt][i], i);
95 if (tab_delim[bio_fmt] == 'y') *FS = *OFS = "\t";
99 /**********************
100 * Built-in functions *
101 **********************/
103 static char comp_tab[] = {
104 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
105 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
106 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
107 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
108 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O',
109 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z', 91, 92, 93, 94, 95,
110 64, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o',
111 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
114 static float q_int2real[128];
116 #define tempfree(x) if (istemp(x)) tfree(x); else
118 Cell *bio_func(int f, Cell *x, Node **a)
120 Cell *y, *z;
121 y = gettemp();
122 if (f == BIO_FAND) {
123 if (a[1]->nnext == 0) {
124 WARNING("and requires two arguments; returning 0.0");
125 setfval(y, 0.0);
126 } else {
127 z = execute(a[1]->nnext);
128 setfval(y, (Awkfloat)((long)getfval(x) & (long)getfval(z))); /* FIXME: does (long) always work??? */
129 tempfree(z);
131 } else if (f == BIO_FOR) {
132 if (a[1]->nnext == 0) {
133 WARNING("or requires two arguments; returning 0.0");
134 setfval(y, 0.0);
135 } else {
136 z = execute(a[1]->nnext);
137 setfval(y, (Awkfloat)((long)getfval(x) | (long)getfval(z)));
138 tempfree(z);
140 } else if (f == BIO_FXOR) {
141 if (a[1]->nnext == 0) {
142 WARNING("xor requires two arguments; returning 0.0");
143 setfval(y, 0.0);
144 } else {
145 z = execute(a[1]->nnext);
146 setfval(y, (Awkfloat)((long)getfval(x) ^ (long)getfval(z)));
147 tempfree(z);
149 } else if (f == BIO_FREVERSE) {
150 char *buf = getsval(x);
151 int i, l, tmp;
152 l = strlen(buf);
153 for (i = 0; i < l>>1; ++i)
154 tmp = buf[i], buf[i] = buf[l-1-i], buf[l-1-i] = tmp;
155 setsval(y, buf);
156 } else if (f == BIO_FREVCOMP) {
157 char *buf;
158 int i, l, tmp;
159 buf = getsval(x);
160 l = strlen(buf);
161 for (i = 0; i < l>>1; ++i)
162 tmp = comp_tab[(int)buf[i]], buf[i] = comp_tab[(int)buf[l-1-i]], buf[l-1-i] = tmp;
163 if (l&1) buf[l>>1] = comp_tab[(int)buf[l>>1]];
164 setsval(y, buf);
165 } else if (f == BIO_FGC) {
166 char *buf;
167 int i, l, gc = 0;
168 buf = getsval(x);
169 l = strlen(buf);
170 if (l) { /* don't try for empty strings */
171 for (i = 0; i < l; ++i)
172 if (buf[i] == 'g' || buf[i] == 'c' ||
173 buf[i] == 'G' || buf[i] == 'C')
174 gc++;
175 setfval(y, (Awkfloat)gc / l);
177 } else if (f == BIO_FMEANQUAL) {
178 char *buf;
179 int i, l, total_qual = 0;
180 buf = getsval(x);
181 l = strlen(buf);
182 if (l) { /* don't try for empty strings */
183 for (i = 0; i < l; ++i)
184 total_qual += buf[i] - 33;
185 setfval(y, (Awkfloat)total_qual / l);
187 } else if (f == BIO_FTRIMQ) {
188 char *buf;
189 double thres = 0.05, s, max;
190 int i, l, tmp, beg, end;
191 Cell *u = 0, *v = 0;
192 if (a[1]->nnext) {
193 u = execute(a[1]->nnext); /* begin */
194 if (a[1]->nnext->nnext) {
195 v = execute(a[1]->nnext->nnext); /* end */
196 if (a[1]->nnext->nnext->nnext) {
197 z = execute(a[1]->nnext->nnext->nnext);
198 thres = getfval(z); /* user defined threshold */
199 tempfree(z);
203 buf = getsval(x);
204 l = strlen(buf);
205 if (q_int2real[0] == 0.) /* to initialize */
206 for (i = 0; i < 128; ++i)
207 q_int2real[i] = pow(10., -(i - 33) / 10.);
208 for (i = 0, beg = tmp = 0, end = l, s = max = 0.; i < l; ++i) {
209 int q = buf[i];
210 if (q < 36) q = 36;
211 if (q > 127) q = 127;
212 s += thres - q_int2real[q];
213 if (s > max) max = s, beg = tmp, end = i + 1;
214 if (s < 0) s = 0, tmp = i + 1;
216 if (u) { setfval(u, beg); tempfree(u); } /* 1-based position; as substr() is 1-based. */
217 if (v) { setfval(v, end); tempfree(v); }
218 } else if (f == BIO_FQUALCOUNT) {
219 if (a[1]->nnext == 0) {
220 WARNING("qualcount requires two arguments; returning 0.0");
221 setfval(y, 0.0);
222 } else {
223 char *buf;
224 int i, l, thres, cnt = 0;
225 buf = getsval(x);
226 l = strlen(buf);
227 z = execute(a[1]->nnext); /* threshold */
228 thres = (int)(getfval(z) + .499);
229 for (i = 0; i < l; ++i)
230 if (buf[i] - 33 >= thres) ++cnt;
231 setfval(y, (Awkfloat)cnt);
233 } /* else: never happens */
234 return y;
237 /************************
238 * getrec() replacement *
239 ************************/
241 #include <zlib.h> /* FIXME: it would be better to drop this dependency... */
242 #include "kseq.h"
243 KSEQ_INIT2(, gzFile, gzread)
245 static gzFile g_fp;
246 static kseq_t *g_kseq;
247 static int g_firsttime = 1, g_is_stdin = 0;
248 static kstring_t g_str;
250 int bio_getrec(char **pbuf, int *psize, int isrecord)
252 extern Awkfloat *ARGC;
253 extern int argno, recsize;
254 extern char *file;
255 extern Cell **fldtab;
257 int i, c, saveb0, dret, bufsize = *psize, savesize = *psize;
258 char *p, *buf = *pbuf;
259 if (g_firsttime) { /* mimicing initgetrec() in lib.c */
260 g_firsttime = 0;
261 for (i = 1; i < *ARGC; i++) {
262 p = getargv(i); /* find 1st real filename */
263 if (p == NULL || *p == '\0') { /* deleted or zapped */
264 argno++;
265 continue;
267 if (!isclvar(p)) {
268 setsval(lookup("FILENAME", symtab), p);
269 goto getrec_start;
271 setclvar(p); /* a commandline assignment before filename */
272 argno++;
274 g_fp = gzdopen(fileno(stdin), "r"); /* no filenames, so use stdin */
275 g_kseq = kseq_init(g_fp);
276 g_is_stdin = 1;
279 getrec_start:
280 if (isrecord) {
281 donefld = 0; /* these are defined in lib.c */
282 donerec = 1;
284 saveb0 = buf[0];
285 buf[0] = 0; /* this is effective at the end of file */
286 while (argno < *ARGC || g_is_stdin) {
287 if (g_kseq == 0) { /* have to open a new file */
288 file = getargv(argno);
289 if (file == NULL || *file == '\0') { /* deleted or zapped */
290 argno++;
291 continue;
293 if (isclvar(file)) { /* a var=value arg */
294 setclvar(file);
295 argno++;
296 continue;
298 *FILENAME = file;
299 if (*file == '-' && *(file+1) == '\0') {
300 g_fp = gzdopen(fileno(stdin), "r");
301 g_kseq = kseq_init(g_fp);
302 g_is_stdin = 1;
303 } else {
304 if ((g_fp = gzopen(file, "r")) == NULL)
305 FATAL("can't open file %s", file);
306 g_kseq = kseq_init(g_fp);
307 g_is_stdin = 0;
309 setfval(fnrloc, 0.0);
311 if (bio_fmt != BIO_FASTX) {
312 c = ks_getuntil(g_kseq->f, **RS, &g_str, &dret);
313 } else {
314 c = kseq_read(g_kseq);
315 if (c >= 0) {
316 g_str.l = 0;
317 g_str.m = g_kseq->name.l + g_kseq->comment.l + g_kseq->seq.l + g_kseq->qual.l + 4;
318 kroundup32(g_str.m);
319 g_str.s = (char*)realloc(g_str.s, g_str.m);
320 for (i = 0; i < g_kseq->name.l; ++i)
321 g_str.s[g_str.l++] = g_kseq->name.s[i];
322 g_str.s[g_str.l++] = '\t';
323 for (i = 0; i < g_kseq->seq.l; ++i)
324 g_str.s[g_str.l++] = g_kseq->seq.s[i];
325 g_str.s[g_str.l++] = '\t';
326 for (i = 0; i < g_kseq->qual.l; ++i)
327 g_str.s[g_str.l++] = g_kseq->qual.s[i];
328 g_str.s[g_str.l++] = '\t';
329 for (i = 0; i < g_kseq->comment.l; ++i)
330 g_str.s[g_str.l++] = g_kseq->comment.s[i];
331 g_str.s[g_str.l++] = '\0';
332 } else {
333 g_str.l = 0;
334 if (g_str.s) g_str.s[0] = '\0';
337 adjbuf(&buf, &bufsize, g_str.l + 1, recsize, 0, "bio_getrec");
338 memcpy(buf, g_str.s, g_str.l + 1);
339 if (c >= 0) { /* normal record */
340 if (isrecord) {
341 if (freeable(fldtab[0]))
342 xfree(fldtab[0]->sval);
343 fldtab[0]->sval = buf; /* buf == record */
344 fldtab[0]->tval = REC | STR | DONTFREE;
345 if (is_number(fldtab[0]->sval)) {
346 fldtab[0]->fval = atof(fldtab[0]->sval);
347 fldtab[0]->tval |= NUM;
350 setfval(nrloc, nrloc->fval+1);
351 setfval(fnrloc, fnrloc->fval+1);
352 *pbuf = buf;
353 *psize = bufsize;
354 return 1;
356 /* EOF arrived on this file; set up next */
357 if (!g_is_stdin) {
358 kseq_destroy(g_kseq);
359 gzclose(g_fp);
361 g_fp = 0; g_kseq = 0; g_is_stdin = 0;
362 argno++;
364 buf[0] = saveb0;
365 *pbuf = buf;
366 *psize = savesize;
367 return 0; /* true end of file */