8 int bio_flag
= 0, bio_fmt
= BIO_NULL
;
10 static const char *col_defs
[][15] = { /* FIXME: this is convenient, but not memory efficient. Shouldn't matter. */
12 {"bed", "chrom", "start", "end", "name", "score", "strand", "thickstart", "thickend", "rgb", "blockcount", "blocksizes", "blockstarts", NULL
},
13 {"sam", "qname", "flag", "rname", "pos", "mapq", "cigar", "rnext", "pnext", "tlen", "seq", "qual", NULL
},
14 {"vcf", "chrom", "pos", "id", "ref", "alt", "qual", "filter", "info", NULL
},
15 {"gff", "seqname", "source", "feature", "start", "end", "score", "filter", "strand", "group", "attribute", NULL
},
16 {"fastx", "name", "seq", "qual", "comment", NULL
},
20 static const char *tab_delim
= "nyyyyyn", *hdr_chr
= "\0#@##\0\0";
22 /************************
23 * Setting column names *
24 ************************/
26 static void set_colnm_aux(const char *p
, int col
)
31 for (q
= p
; *q
; ++q
) /* test if there are punctuations */
32 if (ispunct(*q
) && *q
!= '_') break;
33 if (*q
|| isdigit(*p
)) { /* there are punctuations or the first is digit */
35 r
= malloc(strlen(p
) + 2);
40 for (qq
= r
; *qq
; ++qq
)
41 if (ispunct(*qq
)) *qq
= '_';
44 if ((x
= lookup(q
, symtab
)) != NULL
) /* do not add if not appear in the program */
45 setfval(x
, (Awkfloat
)col
);
49 int bio_get_fmt(const char *s
)
52 if (strcmp(s
, "hdr") == 0) return BIO_HDR
;
53 for (i
= 0; col_defs
[i
][0]; ++i
)
54 if (strcmp(s
, col_defs
[i
][0]) == 0) return i
;
55 for (i
= 1; col_defs
[i
][0]; ++i
) {
56 printf("%s:\n\t", col_defs
[i
][0]);
57 for (j
= 1; col_defs
[i
][j
]; ++j
)
58 printf("%d:%s ", j
, col_defs
[i
][j
]);
64 int bio_skip_hdr(const char *r
)
66 if (bio_fmt
<= BIO_HDR
) return 0;
67 if (*r
&& *r
== hdr_chr
[bio_fmt
]) {
68 if (bio_flag
& BIO_SHOW_HDR
) puts(r
);
76 if (bio_fmt
== BIO_NULL
) {
78 } else if (bio_fmt
== BIO_HDR
) {
80 for (p
= record
; *p
&& isspace(*p
); ++p
); /* skip leading spaces */
81 for (i
= 1, q
= p
; *q
; ++q
) {
82 if (!isspace(*q
)) continue;
83 c
= *q
; /* backup the space */
84 *q
= 0; /* terminate the field */
86 *q
= c
; /* change back */
88 for (p
= q
+ 1; *p
&& isspace(*p
); ++p
); /* skip contiguous spaces */
91 set_colnm_aux(p
, i
); /* the last column */
93 for (i
= 0; col_defs
[bio_fmt
][i
] != NULL
; ++i
)
94 set_colnm_aux(col_defs
[bio_fmt
][i
], i
);
95 if (tab_delim
[bio_fmt
] == 'y') *FS
= *OFS
= "\t";
99 /**********************
100 * Built-in functions *
101 **********************/
103 static char comp_tab
[] = {
104 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
105 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
106 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
107 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
108 64, 'T', 'V', 'G', 'H', 'E', 'F', 'C', 'D', 'I', 'J', 'M', 'L', 'K', 'N', 'O',
109 'P', 'Q', 'Y', 'S', 'A', 'A', 'B', 'W', 'X', 'R', 'Z', 91, 92, 93, 94, 95,
110 64, 't', 'v', 'g', 'h', 'e', 'f', 'c', 'd', 'i', 'j', 'm', 'l', 'k', 'n', 'o',
111 'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
114 static float q_int2real
[128];
116 #define tempfree(x) if (istemp(x)) tfree(x); else
118 Cell
*bio_func(int f
, Cell
*x
, Node
**a
)
123 if (a
[1]->nnext
== 0) {
124 WARNING("and requires two arguments; returning 0.0");
127 z
= execute(a
[1]->nnext
);
128 setfval(y
, (Awkfloat
)((long)getfval(x
) & (long)getfval(z
))); /* FIXME: does (long) always work??? */
131 } else if (f
== BIO_FOR
) {
132 if (a
[1]->nnext
== 0) {
133 WARNING("or requires two arguments; returning 0.0");
136 z
= execute(a
[1]->nnext
);
137 setfval(y
, (Awkfloat
)((long)getfval(x
) | (long)getfval(z
)));
140 } else if (f
== BIO_FXOR
) {
141 if (a
[1]->nnext
== 0) {
142 WARNING("xor requires two arguments; returning 0.0");
145 z
= execute(a
[1]->nnext
);
146 setfval(y
, (Awkfloat
)((long)getfval(x
) ^ (long)getfval(z
)));
149 } else if (f
== BIO_FREVERSE
) {
150 char *buf
= getsval(x
);
153 for (i
= 0; i
< l
>>1; ++i
)
154 tmp
= buf
[i
], buf
[i
] = buf
[l
-1-i
], buf
[l
-1-i
] = tmp
;
156 } else if (f
== BIO_FREVCOMP
) {
161 for (i
= 0; i
< l
>>1; ++i
)
162 tmp
= comp_tab
[(int)buf
[i
]], buf
[i
] = comp_tab
[(int)buf
[l
-1-i
]], buf
[l
-1-i
] = tmp
;
163 if (l
&1) buf
[l
>>1] = comp_tab
[(int)buf
[l
>>1]];
165 } else if (f
== BIO_FGC
) {
170 if (l
) { /* don't try for empty strings */
171 for (i
= 0; i
< l
; ++i
)
172 if (buf
[i
] == 'g' || buf
[i
] == 'c' ||
173 buf
[i
] == 'G' || buf
[i
] == 'C')
175 setfval(y
, (Awkfloat
)gc
/ l
);
177 } else if (f
== BIO_FMEANQUAL
) {
179 int i
, l
, total_qual
= 0;
182 if (l
) { /* don't try for empty strings */
183 for (i
= 0; i
< l
; ++i
)
184 total_qual
+= buf
[i
] - 33;
185 setfval(y
, (Awkfloat
)total_qual
/ l
);
187 } else if (f
== BIO_FTRIMQ
) {
189 double thres
= 0.05, s
, max
;
190 int i
, l
, tmp
, beg
, end
;
193 u
= execute(a
[1]->nnext
); /* begin */
194 if (a
[1]->nnext
->nnext
) {
195 v
= execute(a
[1]->nnext
->nnext
); /* end */
196 if (a
[1]->nnext
->nnext
->nnext
) {
197 z
= execute(a
[1]->nnext
->nnext
->nnext
);
198 thres
= getfval(z
); /* user defined threshold */
205 if (q_int2real
[0] == 0.) /* to initialize */
206 for (i
= 0; i
< 128; ++i
)
207 q_int2real
[i
] = pow(10., -(i
- 33) / 10.);
208 for (i
= 0, beg
= tmp
= 0, end
= l
, s
= max
= 0.; i
< l
; ++i
) {
211 if (q
> 127) q
= 127;
212 s
+= thres
- q_int2real
[q
];
213 if (s
> max
) max
= s
, beg
= tmp
, end
= i
+ 1;
214 if (s
< 0) s
= 0, tmp
= i
+ 1;
216 if (u
) { setfval(u
, beg
); tempfree(u
); } /* 1-based position; as substr() is 1-based. */
217 if (v
) { setfval(v
, end
); tempfree(v
); }
218 } else if (f
== BIO_FQUALCOUNT
) {
219 if (a
[1]->nnext
== 0) {
220 WARNING("qualcount requires two arguments; returning 0.0");
224 int i
, l
, thres
, cnt
= 0;
227 z
= execute(a
[1]->nnext
); /* threshold */
228 thres
= (int)(getfval(z
) + .499);
229 for (i
= 0; i
< l
; ++i
)
230 if (buf
[i
] - 33 >= thres
) ++cnt
;
231 setfval(y
, (Awkfloat
)cnt
);
233 } /* else: never happens */
237 /************************
238 * getrec() replacement *
239 ************************/
241 #include <zlib.h> /* FIXME: it would be better to drop this dependency... */
243 KSEQ_INIT2(, gzFile
, gzread
)
246 static kseq_t
*g_kseq
;
247 static int g_firsttime
= 1, g_is_stdin
= 0;
248 static kstring_t g_str
;
250 int bio_getrec(char **pbuf
, int *psize
, int isrecord
)
252 extern Awkfloat
*ARGC
;
253 extern int argno
, recsize
;
255 extern Cell
**fldtab
;
257 int i
, c
, saveb0
, dret
, bufsize
= *psize
, savesize
= *psize
;
258 char *p
, *buf
= *pbuf
;
259 if (g_firsttime
) { /* mimicing initgetrec() in lib.c */
261 for (i
= 1; i
< *ARGC
; i
++) {
262 p
= getargv(i
); /* find 1st real filename */
263 if (p
== NULL
|| *p
== '\0') { /* deleted or zapped */
268 setsval(lookup("FILENAME", symtab
), p
);
271 setclvar(p
); /* a commandline assignment before filename */
274 g_fp
= gzdopen(fileno(stdin
), "r"); /* no filenames, so use stdin */
275 g_kseq
= kseq_init(g_fp
);
281 donefld
= 0; /* these are defined in lib.c */
285 buf
[0] = 0; /* this is effective at the end of file */
286 while (argno
< *ARGC
|| g_is_stdin
) {
287 if (g_kseq
== 0) { /* have to open a new file */
288 file
= getargv(argno
);
289 if (file
== NULL
|| *file
== '\0') { /* deleted or zapped */
293 if (isclvar(file
)) { /* a var=value arg */
299 if (*file
== '-' && *(file
+1) == '\0') {
300 g_fp
= gzdopen(fileno(stdin
), "r");
301 g_kseq
= kseq_init(g_fp
);
304 if ((g_fp
= gzopen(file
, "r")) == NULL
)
305 FATAL("can't open file %s", file
);
306 g_kseq
= kseq_init(g_fp
);
309 setfval(fnrloc
, 0.0);
311 if (bio_fmt
!= BIO_FASTX
) {
312 c
= ks_getuntil(g_kseq
->f
, **RS
, &g_str
, &dret
);
314 c
= kseq_read(g_kseq
);
317 g_str
.m
= g_kseq
->name
.l
+ g_kseq
->comment
.l
+ g_kseq
->seq
.l
+ g_kseq
->qual
.l
+ 4;
319 g_str
.s
= (char*)realloc(g_str
.s
, g_str
.m
);
320 for (i
= 0; i
< g_kseq
->name
.l
; ++i
)
321 g_str
.s
[g_str
.l
++] = g_kseq
->name
.s
[i
];
322 g_str
.s
[g_str
.l
++] = '\t';
323 for (i
= 0; i
< g_kseq
->seq
.l
; ++i
)
324 g_str
.s
[g_str
.l
++] = g_kseq
->seq
.s
[i
];
325 g_str
.s
[g_str
.l
++] = '\t';
326 for (i
= 0; i
< g_kseq
->qual
.l
; ++i
)
327 g_str
.s
[g_str
.l
++] = g_kseq
->qual
.s
[i
];
328 g_str
.s
[g_str
.l
++] = '\t';
329 for (i
= 0; i
< g_kseq
->comment
.l
; ++i
)
330 g_str
.s
[g_str
.l
++] = g_kseq
->comment
.s
[i
];
331 g_str
.s
[g_str
.l
++] = '\0';
334 if (g_str
.s
) g_str
.s
[0] = '\0';
337 adjbuf(&buf
, &bufsize
, g_str
.l
+ 1, recsize
, 0, "bio_getrec");
338 memcpy(buf
, g_str
.s
, g_str
.l
+ 1);
339 if (c
>= 0) { /* normal record */
341 if (freeable(fldtab
[0]))
342 xfree(fldtab
[0]->sval
);
343 fldtab
[0]->sval
= buf
; /* buf == record */
344 fldtab
[0]->tval
= REC
| STR
| DONTFREE
;
345 if (is_number(fldtab
[0]->sval
)) {
346 fldtab
[0]->fval
= atof(fldtab
[0]->sval
);
347 fldtab
[0]->tval
|= NUM
;
350 setfval(nrloc
, nrloc
->fval
+1);
351 setfval(fnrloc
, fnrloc
->fval
+1);
356 /* EOF arrived on this file; set up next */
358 kseq_destroy(g_kseq
);
361 g_fp
= 0; g_kseq
= 0; g_is_stdin
= 0;
367 return 0; /* true end of file */