port of netbsd's tr
[minix.git] / commands / awk / lex.c
blob a7e225d7743999a240a861cb5513b7036b451b3e
1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
32 extern YYSTYPE yylval;
33 extern int infunc;
35 int lineno = 1;
36 int bracecnt = 0;
37 int brackcnt = 0;
38 int parencnt = 0;
/* One row of the reserved-word table that word() looks names up in. */
typedef struct Keyword {
	const char	*word;	/* spelling of the reserved word */
	int		sub;	/* value word() stores into yylval.i */
	int		type;	/* token type word() returns for it */
} Keyword;
46 Keyword keywords[] ={ /* keep sorted: binary searched */
47 { "BEGIN", XBEGIN, XBEGIN },
48 { "END", XEND, XEND },
49 { "NF", VARNF, VARNF },
50 { "atan2", FATAN, BLTIN },
51 { "break", BREAK, BREAK },
52 { "close", CLOSE, CLOSE },
53 { "continue", CONTINUE, CONTINUE },
54 { "cos", FCOS, BLTIN },
55 { "delete", DELETE, DELETE },
56 { "do", DO, DO },
57 { "else", ELSE, ELSE },
58 { "exit", EXIT, EXIT },
59 { "exp", FEXP, BLTIN },
60 { "fflush", FFLUSH, BLTIN },
61 { "for", FOR, FOR },
62 { "func", FUNC, FUNC },
63 { "function", FUNC, FUNC },
64 { "getline", GETLINE, GETLINE },
65 { "gsub", GSUB, GSUB },
66 { "if", IF, IF },
67 { "in", IN, IN },
68 { "index", INDEX, INDEX },
69 { "int", FINT, BLTIN },
70 { "length", FLENGTH, BLTIN },
71 { "log", FLOG, BLTIN },
72 { "match", MATCHFCN, MATCHFCN },
73 { "next", NEXT, NEXT },
74 { "nextfile", NEXTFILE, NEXTFILE },
75 { "print", PRINT, PRINT },
76 { "printf", PRINTF, PRINTF },
77 { "rand", FRAND, BLTIN },
78 { "return", RETURN, RETURN },
79 { "sin", FSIN, BLTIN },
80 { "split", SPLIT, SPLIT },
81 { "sprintf", SPRINTF, SPRINTF },
82 { "sqrt", FSQRT, BLTIN },
83 { "srand", FSRAND, BLTIN },
84 { "sub", SUB, SUB },
85 { "substr", SUBSTR, SUBSTR },
86 { "system", FSYSTEM, BLTIN },
87 { "tolower", FTOLOWER, BLTIN },
88 { "toupper", FTOUPPER, BLTIN },
89 { "while", WHILE, WHILE },
/*
 * RET(x): return token x from the enclosing function, printing its name
 * first when the dbg flag is set.
 *
 * Wrapped in do { ... } while (0) so that "RET(x);" is exactly one
 * statement and stays correct in an unbraced if/else.  Note that x is
 * evaluated twice when dbg is set, so pass expressions without side effects.
 */
#define RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
/* peek: report the next input character while leaving it unread. */
int peek(void)
{
	int next;

	next = input();
	unput(next);	/* hand it straight back to the pushback buffer */
	return next;
}
/*
 * gettok: read the next raw token from the lexical input.
 *
 * pbuf, psz: address of the caller's token buffer and of its size.  The
 *	buffer may be reallocated through adjbuf(), so both are written
 *	back before returning a name or a number.  The buffer is expected
 *	to hold at least two bytes on entry.
 *
 * Returns 0 at end of input, 'a' for a name (letters, digits, '_'),
 * '0' for a number, and otherwise the single character read.  The token
 * text is left NUL-terminated in the buffer.
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for this character AND the terminating NUL:
			 * the loop can end at end of input without coming back
			 * through this check before "*bp = 0" below.
			 */
			if (bp-buf+1 >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same capacity rule as for names: one char plus NUL */
			if (bp-buf+1 >= sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			retc = buf[0];	/* character is its own type */
			/*
			 * Push the rest back BEFORE truncating: once buf[1] is
			 * cleared, rem+1 is an empty string and the remaining
			 * characters would silently disappear from the input.
			 */
			unputstr(rem+1);	/* put rest back for later */
			buf[1] = 0;	/* return one character as token */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
/* Helpers defined further down that yylex() dispatches to. */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags examined at the top of yylex(). */
int	sc = 0;		/* 1 => return a } right now */
int	reg = 0;	/* 1 => return a REGEXPR now */
170 int yylex(void)
172 int c;
173 static char *buf = 0;
174 static int bufsize = 5; /* BUG: setting this small causes core dump! */
176 if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
177 FATAL( "out of space in yylex" );
178 if (sc) {
179 sc = 0;
180 RET('}');
182 if (reg) {
183 reg = 0;
184 return regexpr();
186 for (;;) {
187 c = gettok(&buf, &bufsize);
188 if (c == 0)
189 return 0;
190 if (isalpha(c) || c == '_')
191 return word(buf);
192 if (isdigit(c)) {
193 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
194 /* should this also have STR set? */
195 RET(NUMBER);
198 yylval.i = c;
199 switch (c) {
200 case '\n': /* {EOL} */
201 RET(NL);
202 case '\r': /* assume \n is coming */
203 case ' ': /* {WS}+ */
204 case '\t':
205 break;
206 case '#': /* #.* strip comments */
207 while ((c = input()) != '\n' && c != 0)
209 unput(c);
210 break;
211 case ';':
212 RET(';');
213 case '\\':
214 if (peek() == '\n') {
215 input();
216 } else if (peek() == '\r') {
217 input(); input(); /* \n */
218 lineno++;
219 } else {
220 RET(c);
222 break;
223 case '&':
224 if (peek() == '&') {
225 input(); RET(AND);
226 } else
227 RET('&');
228 case '|':
229 if (peek() == '|') {
230 input(); RET(BOR);
231 } else
232 RET('|');
233 case '!':
234 if (peek() == '=') {
235 input(); yylval.i = NE; RET(NE);
236 } else if (peek() == '~') {
237 input(); yylval.i = NOTMATCH; RET(MATCHOP);
238 } else
239 RET(NOT);
240 case '~':
241 yylval.i = MATCH;
242 RET(MATCHOP);
243 case '<':
244 if (peek() == '=') {
245 input(); yylval.i = LE; RET(LE);
246 } else {
247 yylval.i = LT; RET(LT);
249 case '=':
250 if (peek() == '=') {
251 input(); yylval.i = EQ; RET(EQ);
252 } else {
253 yylval.i = ASSIGN; RET(ASGNOP);
255 case '>':
256 if (peek() == '=') {
257 input(); yylval.i = GE; RET(GE);
258 } else if (peek() == '>') {
259 input(); yylval.i = APPEND; RET(APPEND);
260 } else {
261 yylval.i = GT; RET(GT);
263 case '+':
264 if (peek() == '+') {
265 input(); yylval.i = INCR; RET(INCR);
266 } else if (peek() == '=') {
267 input(); yylval.i = ADDEQ; RET(ASGNOP);
268 } else
269 RET('+');
270 case '-':
271 if (peek() == '-') {
272 input(); yylval.i = DECR; RET(DECR);
273 } else if (peek() == '=') {
274 input(); yylval.i = SUBEQ; RET(ASGNOP);
275 } else
276 RET('-');
277 case '*':
278 if (peek() == '=') { /* *= */
279 input(); yylval.i = MULTEQ; RET(ASGNOP);
280 } else if (peek() == '*') { /* ** or **= */
281 input(); /* eat 2nd * */
282 if (peek() == '=') {
283 input(); yylval.i = POWEQ; RET(ASGNOP);
284 } else {
285 RET(POWER);
287 } else
288 RET('*');
289 case '/':
290 RET('/');
291 case '%':
292 if (peek() == '=') {
293 input(); yylval.i = MODEQ; RET(ASGNOP);
294 } else
295 RET('%');
296 case '^':
297 if (peek() == '=') {
298 input(); yylval.i = POWEQ; RET(ASGNOP);
299 } else
300 RET(POWER);
302 case '$':
303 /* BUG: awkward, if not wrong */
304 c = gettok(&buf, &bufsize);
305 if (isalpha(c)) {
306 if (strcmp(buf, "NF") == 0) { /* very special */
307 unputstr("(NF)");
308 RET(INDIRECT);
310 c = peek();
311 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
312 unputstr(buf);
313 RET(INDIRECT);
315 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
316 RET(IVAR);
317 } else if (c == 0) { /* */
318 SYNTAX( "unexpected end of input after $" );
319 RET(';');
320 } else {
321 unputstr(buf);
322 RET(INDIRECT);
325 case '}':
326 if (--bracecnt < 0)
327 SYNTAX( "extra }" );
328 sc = 1;
329 RET(';');
330 case ']':
331 if (--brackcnt < 0)
332 SYNTAX( "extra ]" );
333 RET(']');
334 case ')':
335 if (--parencnt < 0)
336 SYNTAX( "extra )" );
337 RET(')');
338 case '{':
339 bracecnt++;
340 RET('{');
341 case '[':
342 brackcnt++;
343 RET('[');
344 case '(':
345 parencnt++;
346 RET('(');
348 case '"':
349 return string(); /* BUG: should be like tran.c ? */
351 default:
352 RET(c);
357 int string(void)
359 int c, n;
360 char *s, *bp;
361 static char *buf = 0;
362 static int bufsz = 500;
364 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
365 FATAL("out of space for strings");
366 for (bp = buf; (c = input()) != '"'; ) {
367 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
368 FATAL("out of space for string %.10s...", buf);
369 switch (c) {
370 case '\n':
371 case '\r':
372 case 0:
373 SYNTAX( "non-terminated string %.10s...", buf );
374 lineno++;
375 if (c == 0) /* hopeless */
376 FATAL( "giving up" );
377 break;
378 case '\\':
379 c = input();
380 switch (c) {
381 case '"': *bp++ = '"'; break;
382 case 'n': *bp++ = '\n'; break;
383 case 't': *bp++ = '\t'; break;
384 case 'f': *bp++ = '\f'; break;
385 case 'r': *bp++ = '\r'; break;
386 case 'b': *bp++ = '\b'; break;
387 case 'v': *bp++ = '\v'; break;
388 case 'a': *bp++ = '\007'; break;
389 case '\\': *bp++ = '\\'; break;
391 case '0': case '1': case '2': /* octal: \d \dd \ddd */
392 case '3': case '4': case '5': case '6': case '7':
393 n = c - '0';
394 if ((c = peek()) >= '0' && c < '8') {
395 n = 8 * n + input() - '0';
396 if ((c = peek()) >= '0' && c < '8')
397 n = 8 * n + input() - '0';
399 *bp++ = n;
400 break;
402 case 'x': /* hex \x0-9a-fA-F + */
403 { char xbuf[100], *px;
404 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
405 if (isdigit(c)
406 || (c >= 'a' && c <= 'f')
407 || (c >= 'A' && c <= 'F'))
408 *px++ = c;
409 else
410 break;
412 *px = 0;
413 unput(c);
414 sscanf(xbuf, "%x", &n);
415 *bp++ = n;
416 break;
419 default:
420 *bp++ = c;
421 break;
423 break;
424 default:
425 *bp++ = c;
426 break;
429 *bp = 0;
430 s = tostring(buf);
431 *bp++ = ' '; *bp++ = 0;
432 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
433 RET(STRING);
437 int binsearch(const char *w, const Keyword *kp, int n)
439 int cond, low, mid, high;
441 low = 0;
442 high = n - 1;
443 while (low <= high) {
444 mid = (low + high) / 2;
445 if ((cond = strcmp(w, kp[mid].word)) < 0)
446 high = mid - 1;
447 else if (cond > 0)
448 low = mid + 1;
449 else
450 return mid;
452 return -1;
455 int word(char *w)
457 Keyword *kp;
458 int c, n;
460 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
461 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
462 kp = keywords + n;
463 if (n != -1) { /* found in table */
464 yylval.i = kp->sub;
465 switch (kp->type) { /* special handling */
466 case BLTIN:
467 if (kp->sub == FSYSTEM && safe)
468 SYNTAX( "system is unsafe" );
469 RET(kp->type);
470 case FUNC:
471 if (infunc)
472 SYNTAX( "illegal nested function" );
473 RET(kp->type);
474 case RETURN:
475 if (!infunc)
476 SYNTAX( "return not in function" );
477 RET(kp->type);
478 case VARNF:
479 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
480 RET(VARNF);
481 default:
482 RET(kp->type);
485 c = peek(); /* look for '(' */
486 if (c != '(' && infunc && (n=isarg(w)) >= 0) {
487 yylval.i = n;
488 RET(ARG);
489 } else {
490 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
491 if (c == '(') {
492 RET(CALL);
493 } else {
494 RET(VAR);
499 void startreg(void) /* next call to yylex will return a regular expression */
501 reg = 1;
504 int regexpr(void)
506 int c;
507 static char *buf = 0;
508 static int bufsz = 500;
509 char *bp;
511 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
512 FATAL("out of space for rex expr");
513 bp = buf;
514 for ( ; (c = input()) != '/' && c != 0; ) {
515 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
516 FATAL("out of space for reg expr %.10s...", buf);
517 if (c == '\n') {
518 SYNTAX( "newline in regular expression %.10s...", buf );
519 unput('\n');
520 break;
521 } else if (c == '\\') {
522 *bp++ = '\\';
523 *bp++ = input();
524 } else {
525 *bp++ = c;
528 *bp = 0;
529 if (c == 0)
530 SYNTAX("non-terminated regular expression %.10s...", buf);
531 yylval.s = tostring(buf);
532 unput('/');
533 RET(REGEXPR);
/* low-level lexical stuff, sort of inherited from lex */

char	ebuf[300];		/* circular record of characters handed out by input() */
char	*ep = ebuf;		/* where input() stores the next character in ebuf */
char	yysbuf[100];		/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character */
FILE	*yyin = NULL;
544 int input(void) /* get next lexical input character */
546 int c;
547 extern char *lexprog;
549 if (yysptr > yysbuf)
550 c = (uschar)*--yysptr;
551 else if (lexprog != NULL) { /* awk '...' */
552 if ((c = (uschar)*lexprog) != 0)
553 lexprog++;
554 } else /* awk -f ... */
555 c = pgetc();
556 if (c == '\n')
557 lineno++;
558 else if (c == EOF)
559 c = 0;
560 if (ep >= ebuf + sizeof ebuf)
561 ep = ebuf;
562 return *ep++ = c;
565 void unput(int c) /* put lexical character back on input */
567 if (c == '\n')
568 lineno--;
569 if (yysptr >= yysbuf + sizeof(yysbuf))
570 FATAL("pushed back too much: %.20s...", yysbuf);
571 *yysptr++ = c;
572 if (--ep < ebuf)
573 ep = ebuf + sizeof(ebuf) - 1;
/*
 * unputstr: put the string s back on the input.  The characters are
 * pushed last-to-first so that input() hands them out in their
 * original order.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	int remaining = strlen(s);

	while (--remaining >= 0)
		unput(s[remaining]);
}