modified: src1/input.c
[GalaxyCodeBases.git] / tools / bioawk / lex.c
blobd470364302d06f526d6825ce0fc76711b1e49213
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
/* Semantic value of the token being returned; the definition lives outside
 * this file.  yylex(), word(), string() and regexpr() fill it in. */
extern YYSTYPE yylval;
/* Defined outside this file.  word() and yylex() test it to decide whether an
 * identifier may be a function argument and whether `return`/`func` is legal. */
extern int infunc;

int lineno = 1;		/* current line: input() adds 1 on '\n', unput() takes it back */
int bracecnt = 0;	/* '{' seen minus '}' seen; yylex() reports "extra }" below zero */
int brackcnt = 0;	/* same bookkeeping for '[' and ']' */
int parencnt = 0;	/* same bookkeeping for '(' and ')' */
/*
 * One row of the reserved-word table searched by binsearch().
 * word() copies `sub` into yylval.i and returns `type` as the token.
 */
struct Keyword {
	const char *word;	/* spelling as it appears in the program text */
	int sub;		/* value handed to the parser through yylval.i */
	int type;		/* token code returned by the lexer */
};
typedef struct Keyword Keyword;
46 Keyword keywords[] ={ /* keep sorted: binary searched */
47 { "BEGIN", XBEGIN, XBEGIN },
48 { "END", XEND, XEND },
49 { "NF", VARNF, VARNF },
50 { "and", BIO_FAND, BLTIN },
51 { "atan2", FATAN, BLTIN },
52 { "break", BREAK, BREAK },
53 { "close", CLOSE, CLOSE },
54 { "continue", CONTINUE, CONTINUE },
55 { "cos", FCOS, BLTIN },
56 { "delete", DELETE, DELETE },
57 { "do", DO, DO },
58 { "else", ELSE, ELSE },
59 { "exit", EXIT, EXIT },
60 { "exp", FEXP, BLTIN },
61 { "fflush", FFLUSH, BLTIN },
62 { "for", FOR, FOR },
63 { "func", FUNC, FUNC },
64 { "function", FUNC, FUNC },
65 { "gc", BIO_FGC, BLTIN },
66 { "getline", GETLINE, GETLINE },
67 { "gsub", GSUB, GSUB },
68 { "if", IF, IF },
69 { "in", IN, IN },
70 { "index", INDEX, INDEX },
71 { "int", FINT, BLTIN },
72 { "length", FLENGTH, BLTIN },
73 { "log", FLOG, BLTIN },
74 { "match", MATCHFCN, MATCHFCN },
75 { "meanqual", BIO_FMEANQUAL, BLTIN },
76 { "next", NEXT, NEXT },
77 { "nextfile", NEXTFILE, NEXTFILE },
78 { "or", BIO_FOR, BLTIN },
79 { "print", PRINT, PRINT },
80 { "printf", PRINTF, PRINTF },
81 { "qualcount", BIO_FQUALCOUNT, BLTIN },
82 { "rand", FRAND, BLTIN },
83 { "return", RETURN, RETURN },
84 { "revcomp",BIO_FREVCOMP, BLTIN },
85 { "reverse",BIO_FREVERSE, BLTIN },
86 { "sin", FSIN, BLTIN },
87 { "split", SPLIT, SPLIT },
88 { "sprintf", SPRINTF, SPRINTF },
89 { "sqrt", FSQRT, BLTIN },
90 { "srand", FSRAND, BLTIN },
91 { "sub", SUB, SUB },
92 { "substr", SUBSTR, SUBSTR },
93 { "system", FSYSTEM, BLTIN },
94 { "tolower", FTOLOWER, BLTIN },
95 { "toupper", FTOUPPER, BLTIN },
96 { "trimq", BIO_FTRIMQ, BLTIN },
97 { "while", WHILE, WHILE },
98 { "xor", BIO_FXOR, BLTIN }
/*
 * RET(x): return token x from the enclosing function; when the debug flag
 * `dbg` is set, first print the token's name (via tokname) on stdout.
 * Written as do/while(0) so that `RET(x);` is always exactly one statement.
 */
#define	RET(x)	do { \
		if (dbg) \
			printf("lex %s\n", tokname(x)); \
		return (x); \
	} while (0)
/*
 * peek: report the next input character without consuming it.
 * The character is read with input() and immediately handed back with unput().
 */
int peek(void)
{
	int next;

	next = input();
	unput(next);
	return next;
}
/*
 * gettok: read the next raw token from input() into the buffer *pbuf.
 *
 *   pbuf, psz	address of the caller's buffer pointer and its size; both are
 *		written back when a name or number has been collected, because
 *		adjbuf() may have moved and enlarged the buffer.  The buffer
 *		must hold at least two bytes on entry.
 *
 * Returns
 *   0		at end of input;
 *   the char	for a character that is not alphanumeric, '.' or '_'
 *		(the buffer then holds just that character);
 *   'a'	for a name  [A-Za-z_][A-Za-z0-9_]*  (text in the buffer);
 *   '0'	for a number accepted by strtod() (text in the buffer);
 *   first char	when the text started like a number but strtod() rejected it.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for this character AND the closing NUL: when
			 * end of input follows, the loop exits without another
			 * size check and *bp = 0 below must still fit.
			 */
			if (bp-buf+1 >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same reasoning as above: character plus NUL */
			if (bp-buf+1 >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			/*
			 * Push the tail back BEFORE cutting the buffer: rem+1 is
			 * buf+1, so truncating first would push back an empty
			 * string and silently drop the characters already read.
			 */
			unputstr(rem+1);	/* put rest back for later */
			buf[1] = 0;	/* return one character as token */
			retc = buf[0];	/* character is its own type */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
/* Helpers defined further down in this file and called from yylex(). */
int word(char *);	/* classify an identifier: keyword, ARG, CALL or VAR */
int string(void);	/* read the rest of a "..." literal */
int regexpr(void);	/* read the rest of a /.../ regular expression */
/* One-shot flags tested at the top of yylex(); each is cleared when honoured. */
int sc = 0; /* 1 => return a } right now */
int reg = 0; /* 1 => return a REGEXPR now */
179 int yylex(void)
181 int c;
182 static char *buf = 0;
183 static int bufsize = 5; /* BUG: setting this small causes core dump! */
185 if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
186 FATAL( "out of space in yylex" );
187 if (sc) {
188 sc = 0;
189 RET('}');
191 if (reg) {
192 reg = 0;
193 return regexpr();
195 for (;;) {
196 c = gettok(&buf, &bufsize);
197 if (c == 0)
198 return 0;
199 if (isalpha(c) || c == '_')
200 return word(buf);
201 if (isdigit(c)) {
202 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
203 /* should this also have STR set? */
204 RET(NUMBER);
207 yylval.i = c;
208 switch (c) {
209 case '\n': /* {EOL} */
210 RET(NL);
211 case '\r': /* assume \n is coming */
212 case ' ': /* {WS}+ */
213 case '\t':
214 break;
215 case '#': /* #.* strip comments */
216 while ((c = input()) != '\n' && c != 0)
218 unput(c);
219 break;
220 case ';':
221 RET(';');
222 case '\\':
223 if (peek() == '\n') {
224 input();
225 } else if (peek() == '\r') {
226 input(); input(); /* \n */
227 lineno++;
228 } else {
229 RET(c);
231 break;
232 case '&':
233 if (peek() == '&') {
234 input(); RET(AND);
235 } else
236 RET('&');
237 case '|':
238 if (peek() == '|') {
239 input(); RET(BOR);
240 } else
241 RET('|');
242 case '!':
243 if (peek() == '=') {
244 input(); yylval.i = NE; RET(NE);
245 } else if (peek() == '~') {
246 input(); yylval.i = NOTMATCH; RET(MATCHOP);
247 } else
248 RET(NOT);
249 case '~':
250 yylval.i = MATCH;
251 RET(MATCHOP);
252 case '<':
253 if (peek() == '=') {
254 input(); yylval.i = LE; RET(LE);
255 } else {
256 yylval.i = LT; RET(LT);
258 case '=':
259 if (peek() == '=') {
260 input(); yylval.i = EQ; RET(EQ);
261 } else {
262 yylval.i = ASSIGN; RET(ASGNOP);
264 case '>':
265 if (peek() == '=') {
266 input(); yylval.i = GE; RET(GE);
267 } else if (peek() == '>') {
268 input(); yylval.i = APPEND; RET(APPEND);
269 } else {
270 yylval.i = GT; RET(GT);
272 case '+':
273 if (peek() == '+') {
274 input(); yylval.i = INCR; RET(INCR);
275 } else if (peek() == '=') {
276 input(); yylval.i = ADDEQ; RET(ASGNOP);
277 } else
278 RET('+');
279 case '-':
280 if (peek() == '-') {
281 input(); yylval.i = DECR; RET(DECR);
282 } else if (peek() == '=') {
283 input(); yylval.i = SUBEQ; RET(ASGNOP);
284 } else
285 RET('-');
286 case '*':
287 if (peek() == '=') { /* *= */
288 input(); yylval.i = MULTEQ; RET(ASGNOP);
289 } else if (peek() == '*') { /* ** or **= */
290 input(); /* eat 2nd * */
291 if (peek() == '=') {
292 input(); yylval.i = POWEQ; RET(ASGNOP);
293 } else {
294 RET(POWER);
296 } else
297 RET('*');
298 case '/':
299 RET('/');
300 case '%':
301 if (peek() == '=') {
302 input(); yylval.i = MODEQ; RET(ASGNOP);
303 } else
304 RET('%');
305 case '^':
306 if (peek() == '=') {
307 input(); yylval.i = POWEQ; RET(ASGNOP);
308 } else
309 RET(POWER);
311 case '$':
312 /* BUG: awkward, if not wrong */
313 c = gettok(&buf, &bufsize);
314 if (isalpha(c)) {
315 if (strcmp(buf, "NF") == 0) { /* very special */
316 unputstr("(NF)");
317 RET(INDIRECT);
319 c = peek();
320 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
321 unputstr(buf);
322 RET(INDIRECT);
324 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
325 RET(IVAR);
326 } else if (c == 0) { /* */
327 SYNTAX( "unexpected end of input after $" );
328 RET(';');
329 } else {
330 unputstr(buf);
331 RET(INDIRECT);
334 case '}':
335 if (--bracecnt < 0)
336 SYNTAX( "extra }" );
337 sc = 1;
338 RET(';');
339 case ']':
340 if (--brackcnt < 0)
341 SYNTAX( "extra ]" );
342 RET(']');
343 case ')':
344 if (--parencnt < 0)
345 SYNTAX( "extra )" );
346 RET(')');
347 case '{':
348 bracecnt++;
349 RET('{');
350 case '[':
351 brackcnt++;
352 RET('[');
353 case '(':
354 parencnt++;
355 RET('(');
357 case '"':
358 return string(); /* BUG: should be like tran.c ? */
360 default:
361 RET(c);
366 int string(void)
368 int c, n;
369 char *s, *bp;
370 static char *buf = 0;
371 static int bufsz = 500;
373 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
374 FATAL("out of space for strings");
375 for (bp = buf; (c = input()) != '"'; ) {
376 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
377 FATAL("out of space for string %.10s...", buf);
378 switch (c) {
379 case '\n':
380 case '\r':
381 case 0:
382 SYNTAX( "non-terminated string %.10s...", buf );
383 lineno++;
384 if (c == 0) /* hopeless */
385 FATAL( "giving up" );
386 break;
387 case '\\':
388 c = input();
389 switch (c) {
390 case '"': *bp++ = '"'; break;
391 case 'n': *bp++ = '\n'; break;
392 case 't': *bp++ = '\t'; break;
393 case 'f': *bp++ = '\f'; break;
394 case 'r': *bp++ = '\r'; break;
395 case 'b': *bp++ = '\b'; break;
396 case 'v': *bp++ = '\v'; break;
397 case 'a': *bp++ = '\007'; break;
398 case '\\': *bp++ = '\\'; break;
400 case '0': case '1': case '2': /* octal: \d \dd \ddd */
401 case '3': case '4': case '5': case '6': case '7':
402 n = c - '0';
403 if ((c = peek()) >= '0' && c < '8') {
404 n = 8 * n + input() - '0';
405 if ((c = peek()) >= '0' && c < '8')
406 n = 8 * n + input() - '0';
408 *bp++ = n;
409 break;
411 case 'x': /* hex \x0-9a-fA-F + */
412 { char xbuf[100], *px;
413 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
414 if (isdigit(c)
415 || (c >= 'a' && c <= 'f')
416 || (c >= 'A' && c <= 'F'))
417 *px++ = c;
418 else
419 break;
421 *px = 0;
422 unput(c);
423 sscanf(xbuf, "%x", (unsigned int *) &n);
424 *bp++ = n;
425 break;
428 default:
429 *bp++ = c;
430 break;
432 break;
433 default:
434 *bp++ = c;
435 break;
438 *bp = 0;
439 s = tostring(buf);
440 *bp++ = ' '; *bp++ = 0;
441 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
442 RET(STRING);
446 int binsearch(char *w, Keyword *kp, int n)
448 int cond, low, mid, high;
450 low = 0;
451 high = n - 1;
452 while (low <= high) {
453 mid = (low + high) / 2;
454 if ((cond = strcmp(w, kp[mid].word)) < 0)
455 high = mid - 1;
456 else if (cond > 0)
457 low = mid + 1;
458 else
459 return mid;
461 return -1;
464 int word(char *w)
466 Keyword *kp;
467 int c, n;
469 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
470 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
471 kp = keywords + n;
472 if (n != -1) { /* found in table */
473 yylval.i = kp->sub;
474 switch (kp->type) { /* special handling */
475 case BLTIN:
476 if (kp->sub == FSYSTEM && safe)
477 SYNTAX( "system is unsafe" );
478 RET(kp->type);
479 case FUNC:
480 if (infunc)
481 SYNTAX( "illegal nested function" );
482 RET(kp->type);
483 case RETURN:
484 if (!infunc)
485 SYNTAX( "return not in function" );
486 RET(kp->type);
487 case VARNF:
488 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
489 RET(VARNF);
490 default:
491 RET(kp->type);
494 c = peek(); /* look for '(' */
495 if (c != '(' && infunc && (n=isarg(w)) >= 0) {
496 yylval.i = n;
497 RET(ARG);
498 } else {
499 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
500 if (c == '(') {
501 RET(CALL);
502 } else {
503 RET(VAR);
508 void startreg(void) /* next call to yylex will return a regular expression */
510 reg = 1;
513 int regexpr(void)
515 int c;
516 static char *buf = 0;
517 static int bufsz = 500;
518 char *bp;
520 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
521 FATAL("out of space for rex expr");
522 bp = buf;
523 for ( ; (c = input()) != '/' && c != 0; ) {
524 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
525 FATAL("out of space for reg expr %.10s...", buf);
526 if (c == '\n') {
527 SYNTAX( "newline in regular expression %.10s...", buf );
528 unput('\n');
529 break;
530 } else if (c == '\\') {
531 *bp++ = '\\';
532 *bp++ = input();
533 } else {
534 *bp++ = c;
537 *bp = 0;
538 if (c == 0)
539 SYNTAX("non-terminated regular expression %.10s...", buf);
540 yylval.s = tostring(buf);
541 unput('/');
542 RET(REGEXPR);
/* low-level lexical stuff, sort of inherited from lex */

char ebuf[300];		/* circular record of the characters returned by input() */
char *ep = ebuf;	/* next slot in ebuf: advanced (with wrap) by input(), stepped back by unput() */
char yysbuf[100]; /* pushback buffer */
char *yysptr = yysbuf;	/* one past the newest pushed-back char; equals yysbuf when empty */
FILE *yyin = 0;		/* not referenced by the code visible in this file */
553 int input(void) /* get next lexical input character */
555 int c;
556 extern char *lexprog;
558 if (yysptr > yysbuf)
559 c = (uschar)*--yysptr;
560 else if (lexprog != NULL) { /* awk '...' */
561 if ((c = (uschar)*lexprog) != 0)
562 lexprog++;
563 } else /* awk -f ... */
564 c = pgetc();
565 if (c == '\n')
566 lineno++;
567 else if (c == EOF)
568 c = 0;
569 if (ep >= ebuf + sizeof ebuf)
570 ep = ebuf;
571 return *ep++ = c;
574 void unput(int c) /* put lexical character back on input */
576 if (c == '\n')
577 lineno--;
578 if (yysptr >= yysbuf + sizeof(yysbuf))
579 FATAL("pushed back too much: %.20s...", yysbuf);
580 *yysptr++ = c;
581 if (--ep < ebuf)
582 ep = ebuf + sizeof(ebuf) - 1;
/*
 * unputstr: push the whole string s back onto the input.
 * Characters are handed to unput() last-to-first, so that input() will
 * deliver them again in their original order.
 */
void unputstr(const char *s)
{
	int i = strlen(s);

	while (--i >= 0)
		unput(s[i]);
}