dist/nawk/lex.c

   1 /****************************************************************
   2 Copyright (C) Lucent Technologies 1997
   3 All Rights Reserved
   4
   5 Permission to use, copy, modify, and distribute this software and
   6 its documentation for any purpose and without fee is hereby
   7 granted, provided that the above copyright notice appear in all
   8 copies and that both that the copyright notice and this
   9 permission notice and warranty disclaimer appear in supporting
  10 documentation, and that the name Lucent Technologies or any of
  11 its entities not be used in advertising or publicity pertaining
  12 to distribution of the software without specific, written prior
  13 permission.
  14
  15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
  16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
  17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
  18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
  20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
  21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
  22 THIS SOFTWARE.
  23 ****************************************************************/
  24
  25 #if HAVE_NBTOOL_CONFIG_H
  26 #include "nbtool_config.h"
  27 #endif
  28
  29 #include <stdio.h>
  30 #include <stdlib.h>
  31 #include <string.h>
  32 #include <ctype.h>
  33 #include "awk.h"
  34 #include "awkgram.h"
  35
  36 extern YYSTYPE  yylval;
  37 extern int      infunc;
  38
  39 int     lineno  = 1;
  40 int     bracecnt = 0;
  41 int     brackcnt  = 0;
  42 int     parencnt = 0;
  43
  44 typedef struct Keyword {
  45         const char *word;
  46         int     sub;
  47         int     type;
  48 } Keyword;
  49
  50 int peek(void);
  51 int gettok(char **, int *);
  52 int binsearch(const char *, const Keyword *, int);
  53
  54 const Keyword keywords[] ={     /* keep sorted: binary searched */
  55         { "BEGIN",      XBEGIN,         XBEGIN },
  56         { "END",        XEND,           XEND },
  57         { "NF",         VARNF,          VARNF },
  58         { "atan2",      FATAN,          BLTIN },
  59         { "break",      BREAK,          BREAK },
  60         { "close",      CLOSE,          CLOSE },
  61         { "continue",   CONTINUE,       CONTINUE },
  62         { "cos",        FCOS,           BLTIN },
  63         { "delete",     DELETE,         DELETE },
  64         { "do",         DO,             DO },
  65         { "else",       ELSE,           ELSE },
  66         { "exit",       EXIT,           EXIT },
  67         { "exp",        FEXP,           BLTIN },
  68         { "fflush",     FFLUSH,         BLTIN },
  69         { "for",        FOR,            FOR },
  70         { "func",       FUNC,           FUNC },
  71         { "function",   FUNC,           FUNC },
  72         { "gensub",     GENSUB,         GENSUB },
  73         { "getline",    GETLINE,        GETLINE },
  74         { "gsub",       GSUB,           GSUB },
  75         { "if",         IF,             IF },
  76         { "in",         IN,             IN },
  77         { "index",      INDEX,          INDEX },
  78         { "int",        FINT,           BLTIN },
  79         { "length",     FLENGTH,        BLTIN },
  80         { "log",        FLOG,           BLTIN },
  81         { "match",      MATCHFCN,       MATCHFCN },
  82         { "next",       NEXT,           NEXT },
  83         { "nextfile",   NEXTFILE,       NEXTFILE },
  84         { "print",      PRINT,          PRINT },
  85         { "printf",     PRINTF,         PRINTF },
  86         { "rand",       FRAND,          BLTIN },
  87         { "return",     RETURN,         RETURN },
  88         { "sin",        FSIN,           BLTIN },
  89         { "split",      SPLIT,          SPLIT },
  90         { "sprintf",    SPRINTF,        SPRINTF },
  91         { "sqrt",       FSQRT,          BLTIN },
  92         { "srand",      FSRAND,         BLTIN },
  93         { "strftime",   FSTRFTIME,      BLTIN },
  94         { "sub",        SUB,            SUB },
  95         { "substr",     SUBSTR,         SUBSTR },
  96         { "system",     FSYSTEM,        BLTIN },
  97         { "systime",    FSYSTIME,       BLTIN },
  98         { "tolower",    FTOLOWER,       BLTIN },
  99         { "toupper",    FTOUPPER,       BLTIN },
 100         { "while",      WHILE,          WHILE },
 101 };
 102
 103 #define RET(x)  { if(dbg)printf("lex %s\n", tokname(x)); return(x); }
 104
 105 int peek(void)
 106 {
 107         int c = input();
 108         unput(c);
 109         return c;
 110 }
 111
 112 int gettok(char **pbuf, int *psz)       /* get next input token */
 113 {
 114         int c, retc;
 115         uschar *buf = (uschar *) *pbuf;
 116         int sz = *psz;
 117         uschar *bp = buf;
 118
 119         c = input();
 120         if (c == 0)
 121                 return 0;
 122         buf[0] = c;
 123         buf[1] = 0;
 124         if (!isalnum(c) && c != '.' && c != '_')
 125                 return c;
 126
 127         *bp++ = c;
 128         if (isalpha(c) || c == '_') {   /* it's a varname */
 129                 for ( ; (c = input()) != 0; ) {
 130                         if (bp-buf >= sz)
 131                                 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
 132                                         FATAL( "out of space for name %.10s...", buf );
 133                         if (isalnum(c) || c == '_')
 134                                 *bp++ = c;
 135                         else {
 136                                 *bp = 0;
 137                                 unput(c);
 138                                 break;
 139                         }
 140                 }
 141                 *bp = 0;
 142                 retc = 'a';     /* alphanumeric */
 143         } else {        /* maybe it's a number, but could be . */
 144                 char *rem;
 145                 /* read input until can't be a number */
 146                 for ( ; (c = input()) != 0; ) {
 147                         if (bp-buf >= sz)
 148                                 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
 149                                         FATAL( "out of space for number %.10s...", buf );
 150                         if (isdigit(c) || c == 'e' || c == 'E'
 151                           || c == '.' || c == '+' || c == '-')
 152                                 *bp++ = c;
 153                         else {
 154                                 unput(c);
 155                                 break;
 156                         }
 157                 }
 158                 *bp = 0;
 159                 strtod(buf, &rem);      /* parse the number */
 160                 if (rem == (char *)buf) {       /* it wasn't a valid number at all */
 161                         buf[1] = 0;     /* return one character as token */
 162                         retc = buf[0];  /* character is its own type */
 163                         unputstr(rem+1); /* put rest back for later */
 164                 } else {        /* some prefix was a number */
 165                         unputstr(rem);  /* put rest back for later */
 166                         rem[0] = 0;     /* truncate buf after number part */
 167                         retc = '0';     /* type is number */
 168                 }
 169         }
 170         *pbuf = buf;
 171         *psz = sz;
 172         return retc;
 173 }
 174
 175 int     word(char *);
 176 int     string(void);
 177 int     regexpr(void);
 178 int     sc      = 0;    /* 1 => return a } right now */
 179 int     reg     = 0;    /* 1 => return a REGEXPR now */
 180
 181 int yylex(void)
 182 {
 183         int c;
 184         static char *buf = 0;
 185         static int bufsize = 5; /* BUG: setting this small causes core dump! */
 186
 187         if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
 188                 FATAL( "out of space in yylex" );
 189         if (sc) {
 190                 sc = 0;
 191                 RET('}');
 192         }
 193         if (reg) {
 194                 reg = 0;
 195                 return regexpr();
 196         }
 197         for (;;) {
 198                 c = gettok(&buf, &bufsize);
 199                 if (c == 0)
 200                         return 0;
 201                 if (isalpha(c) || c == '_')
 202                         return word(buf);
 203                 if (isdigit(c)) {
 204                         yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
 205                         /* should this also have STR set? */
 206                         RET(NUMBER);
 207                 }
 208
 209                 yylval.i = c;
 210                 switch (c) {
 211                 case '\n':      /* {EOL} */
 212                         RET(NL);
 213                 case '\r':      /* assume \n is coming */
 214                 case ' ':       /* {WS}+ */
 215                 case '\t':
 216                         break;
 217                 case '#':       /* #.* strip comments */
 218                         while ((c = input()) != '\n' && c != 0)
 219                                 ;
 220                         unput(c);
 221                         break;
 222                 case ';':
 223                         RET(';');
 224                 case '\\':
 225                         if (peek() == '\n') {
 226                                 input();
 227                         } else if (peek() == '\r') {
 228                                 input(); input();       /* \n */
 229                                 lineno++;
 230                         } else {
 231                                 RET(c);
 232                         }
 233                         break;
 234                 case '&':
 235                         if (peek() == '&') {
 236                                 input(); RET(AND);
 237                         } else
 238                                 RET('&');
 239                 case '|':
 240                         if (peek() == '|') {
 241                                 input(); RET(BOR);
 242                         } else
 243                                 RET('|');
 244                 case '!':
 245                         if (peek() == '=') {
 246                                 input(); yylval.i = NE; RET(NE);
 247                         } else if (peek() == '~') {
 248                                 input(); yylval.i = NOTMATCH; RET(MATCHOP);
 249                         } else
 250                                 RET(NOT);
 251                 case '~':
 252                         yylval.i = MATCH;
 253                         RET(MATCHOP);
 254                 case '<':
 255                         if (peek() == '=') {
 256                                 input(); yylval.i = LE; RET(LE);
 257                         } else {
 258                                 yylval.i = LT; RET(LT);
 259                         }
 260                 case '=':
 261                         if (peek() == '=') {
 262                                 input(); yylval.i = EQ; RET(EQ);
 263                         } else {
 264                                 yylval.i = ASSIGN; RET(ASGNOP);
 265                         }
 266                 case '>':
 267                         if (peek() == '=') {
 268                                 input(); yylval.i = GE; RET(GE);
 269                         } else if (peek() == '>') {
 270                                 input(); yylval.i = APPEND; RET(APPEND);
 271                         } else {
 272                                 yylval.i = GT; RET(GT);
 273                         }
 274                 case '+':
 275                         if (peek() == '+') {
 276                                 input(); yylval.i = INCR; RET(INCR);
 277                         } else if (peek() == '=') {
 278                                 input(); yylval.i = ADDEQ; RET(ASGNOP);
 279                         } else
 280                                 RET('+');
 281                 case '-':
 282                         if (peek() == '-') {
 283                                 input(); yylval.i = DECR; RET(DECR);
 284                         } else if (peek() == '=') {
 285                                 input(); yylval.i = SUBEQ; RET(ASGNOP);
 286                         } else
 287                                 RET('-');
 288                 case '*':
 289                         if (peek() == '=') {    /* *= */
 290                                 input(); yylval.i = MULTEQ; RET(ASGNOP);
 291                         } else if (peek() == '*') {     /* ** or **= */
 292                                 input();        /* eat 2nd * */
 293                                 if (peek() == '=') {
 294                                         input(); yylval.i = POWEQ; RET(ASGNOP);
 295                                 } else {
 296                                         RET(POWER);
 297                                 }
 298                         } else
 299                                 RET('*');
 300                 case '/':
 301                         RET('/');
 302                 case '%':
 303                         if (peek() == '=') {
 304                                 input(); yylval.i = MODEQ; RET(ASGNOP);
 305                         } else
 306                                 RET('%');
 307                 case '^':
 308                         if (peek() == '=') {
 309                                 input(); yylval.i = POWEQ; RET(ASGNOP);
 310                         } else
 311                                 RET(POWER);
 312
 313                 case '$':
 314                         /* BUG: awkward, if not wrong */
 315                         c = gettok(&buf, &bufsize);
 316                         if (isalpha(c)) {
 317                                 if (strcmp(buf, "NF") == 0) {   /* very special */
 318                                         unputstr("(NF)");
 319                                         RET(INDIRECT);
 320                                 }
 321                                 c = peek();
 322                                 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
 323                                         unputstr(buf);
 324                                         RET(INDIRECT);
 325                                 }
 326                                 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
 327                                 RET(IVAR);
 328                         } else if (c == 0) {    /*  */
 329                                 SYNTAX( "unexpected end of input after $" );
 330                                 RET(';');
 331                         } else {
 332                                 unputstr(buf);
 333                                 RET(INDIRECT);
 334                         }
 335
 336                 case '}':
 337                         if (--bracecnt < 0)
 338                                 SYNTAX( "extra }" );
 339                         sc = 1;
 340                         RET(';');
 341                 case ']':
 342                         if (--brackcnt < 0)
 343                                 SYNTAX( "extra ]" );
 344                         RET(']');
 345                 case ')':
 346                         if (--parencnt < 0)
 347                                 SYNTAX( "extra )" );
 348                         RET(')');
 349                 case '{':
 350                         bracecnt++;
 351                         RET('{');
 352                 case '[':
 353                         brackcnt++;
 354                         RET('[');
 355                 case '(':
 356                         parencnt++;
 357                         RET('(');
 358
 359                 case '"':
 360                         return string();        /* BUG: should be like tran.c ? */
 361
 362                 default:
 363                         RET(c);
 364                 }
 365         }
 366 }
 367
 368 int string(void)
 369 {
 370         int c, n;
 371         uschar *s, *bp;
 372         static uschar *buf = 0;
 373         static int bufsz = 500;
 374
 375         if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
 376                 FATAL("out of space for strings");
 377         for (bp = buf; (c = input()) != '"'; ) {
 378                 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
 379                         FATAL("out of space for string %.10s...", buf);
 380                 switch (c) {
 381                 case '\n':
 382                 case '\r':
 383                 case 0:
 384                         SYNTAX( "non-terminated string %.10s...", buf );
 385                         lineno++;
 386                         if (c == 0)     /* hopeless */
 387                                 FATAL( "giving up" );
 388                         break;
 389                 case '\\':
 390                         c = input();
 391                         switch (c) {
 392                         case '\n': break;
 393                         case '"': *bp++ = '"'; break;
 394                         case 'n': *bp++ = '\n'; break;
 395                         case 't': *bp++ = '\t'; break;
 396                         case 'f': *bp++ = '\f'; break;
 397                         case 'r': *bp++ = '\r'; break;
 398                         case 'b': *bp++ = '\b'; break;
 399                         case 'v': *bp++ = '\v'; break;
 400                         case 'a': *bp++ = '\007'; break;
 401                         case '\\': *bp++ = '\\'; break;
 402
 403                         case '0': case '1': case '2': /* octal: \d \dd \ddd */
 404                         case '3': case '4': case '5': case '6': case '7':
 405                                 n = c - '0';
 406                                 if ((c = peek()) >= '0' && c < '8') {
 407                                         n = 8 * n + input() - '0';
 408                                         if ((c = peek()) >= '0' && c < '8')
 409                                                 n = 8 * n + input() - '0';
 410                                 }
 411                                 *bp++ = n;
 412                                 break;
 413
 414                         case 'x':       /* hex  \x0-9a-fA-F + */
 415                             {   char xbuf[100], *px;
 416                                 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
 417                                         if (isdigit(c)
 418                                          || (c >= 'a' && c <= 'f')
 419                                          || (c >= 'A' && c <= 'F'))
 420                                                 *px++ = c;
 421                                         else
 422                                                 break;
 423                                 }
 424                                 *px = 0;
 425                                 unput(c);
 426                                 sscanf(xbuf, "%x", &n);
 427                                 *bp++ = n;
 428                                 break;
 429                             }
 430
 431                         default:
 432                                 WARNING("warning: escape sequence `\\%c' "
 433                                     "treated as plain `%c'", c, c);
 434                                 *bp++ = c;
 435                                 break;
 436                         }
 437                         break;
 438                 default:
 439                         *bp++ = c;
 440                         break;
 441                 }
 442         }
 443         *bp = 0;
 444         s = tostring(buf);
 445         *bp++ = ' '; *bp++ = 0;
 446         yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
 447         RET(STRING);
 448 }
 449
 450
 451 int binsearch(const char *w, const Keyword *kp, int n)
 452 {
 453         int cond, low, mid, high;
 454
 455         low = 0;
 456         high = n - 1;
 457         while (low <= high) {
 458                 mid = (low + high) / 2;
 459                 if ((cond = strcmp(w, kp[mid].word)) < 0)
 460                         high = mid - 1;
 461                 else if (cond > 0)
 462                         low = mid + 1;
 463                 else
 464                         return mid;
 465         }
 466         return -1;
 467 }
 468
 469 int word(char *w)
 470 {
 471         const Keyword *kp;
 472         int c, n;
 473
 474         n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
 475 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
 476         kp = keywords + n;
 477         if (n != -1) {  /* found in table */
 478                 yylval.i = kp->sub;
 479                 switch (kp->type) {     /* special handling */
 480                 case BLTIN:
 481                         if (kp->sub == FSYSTEM && safe)
 482                                 SYNTAX( "system is unsafe" );
 483                         RET(kp->type);
 484                 case FUNC:
 485                         if (infunc)
 486                                 SYNTAX( "illegal nested function" );
 487                         RET(kp->type);
 488                 case RETURN:
 489                         if (!infunc)
 490                                 SYNTAX( "return not in function" );
 491                         RET(kp->type);
 492                 case VARNF:
 493                         yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
 494                         RET(VARNF);
 495                 default:
 496                         RET(kp->type);
 497                 }
 498         }
 499         c = peek();     /* look for '(' */
 500         if (c != '(' && infunc && (n=isarg(w)) >= 0) {
 501                 yylval.i = n;
 502                 RET(ARG);
 503         } else {
 504                 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
 505                 if (c == '(') {
 506                         RET(CALL);
 507                 } else {
 508                         RET(VAR);
 509                 }
 510         }
 511 }
 512
 513 void startreg(void)     /* next call to yylex will return a regular expression */
 514 {
 515         reg = 1;
 516 }
 517
 518 int regexpr(void)
 519 {
 520         int c;
 521         static uschar *buf = 0;
 522         static int bufsz = 500;
 523         uschar *bp;
 524
 525         if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
 526                 FATAL("out of space for rex expr");
 527         bp = buf;
 528         for ( ; (c = input()) != '/' && c != 0; ) {
 529                 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
 530                         FATAL("out of space for reg expr %.10s...", buf);
 531                 if (c == '\n') {
 532                         SYNTAX( "newline in regular expression %.10s...", buf );
 533                         unput('\n');
 534                         break;
 535                 } else if (c == '\\') {
 536                         *bp++ = '\\';
 537                         *bp++ = input();
 538                 } else {
 539                         *bp++ = c;
 540                 }
 541         }
 542         *bp = 0;
 543         if (c == 0)
 544                 SYNTAX("non-terminated regular expression %.10s...", buf);
 545         yylval.s = tostring(buf);
 546         unput('/');
 547         RET(REGEXPR);
 548 }
 549
 550 /* low-level lexical stuff, sort of inherited from lex */
 551
 552 char    ebuf[300];
 553 char    *ep = ebuf;
 554 char    yysbuf[100];    /* pushback buffer */
 555 char    *yysptr = yysbuf;
 556 FILE    *yyin = 0;
 557
 558 int input(void) /* get next lexical input character */
 559 {
 560         int c;
 561         extern char *lexprog;
 562
 563         if (yysptr > yysbuf)
 564                 c = (uschar)*--yysptr;
 565         else if (lexprog != NULL) {     /* awk '...' */
 566                 if ((c = (uschar)*lexprog) != 0)
 567                         lexprog++;
 568         } else                          /* awk -f ... */
 569                 c = pgetc();
 570         if (c == '\n')
 571                 lineno++;
 572         else if (c == EOF)
 573                 c = 0;
 574         if (ep >= ebuf + sizeof ebuf)
 575                 ep = ebuf;
 576         return *ep++ = c;
 577 }
 578
 579 void unput(int c)       /* put lexical character back on input */
 580 {
 581         if (c == '\n')
 582                 lineno--;
 583         if (yysptr >= yysbuf + sizeof(yysbuf))
 584                 FATAL("pushed back too much: %.20s...", yysbuf);
 585         *yysptr++ = c;
 586         if (--ep < ebuf)
 587                 ep = ebuf + sizeof(ebuf) - 1;
 588 }
 589
 590 void unputstr(const char *s)    /* put a string back on input */
 591 {
 592         int i;
 593
 594         for (i = strlen(s)-1; i >= 0; i--)
 595                 unput(s[i]);
 596 }