usr/src/tools/ndrgen/ndr_lex.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26
  27 #include <errno.h>
  28 #include <stdarg.h>
  29 #include "ndrgen.h"
  30 #include "y.tab.h"
  31
  32 /*
  33  * C-like lexical analysis.
  34  *
  35  * 1. Define a "struct node"
  36  * 2. Define a "struct symbol" that encapsulates a struct node.
  37  * 3. Define a "struct integer" that encapsulates a struct node.
  38  * 4. Set the YACC stack type in the grammar:
  39  *              %{
  40  *              #define YYSTYPE struct node *
  41  *              %}
  42  * 5. Define %token's in the grammar for IDENTIFIER, STRING and INTEGER.
  43  *    Using "_KW" as a suffix for keyword tokens, i.e. "struct" is
  44  *    "%token STRUCT_KW":
  45  *      // atomic values
  46  *      %token INTEGER STRING IDENTIFIER
  47  *      // keywords
  48  *      %token STRUCT_KW CASE_KW
  49  *      // operators
  50  *      %token PLUS MINUS ASSIGN ARROW
  51  *      // overloaded tokens (++ --, < > <= >=, == !=, += -= *= ...)
  52  *      %token INCOP RELOP EQUOP ASSOP
  53  * 6. It's easiest to use the yacc(1) generated token numbers for node
  54  *    labels.  For node labels that are not actually part of the grammar,
  55  *    use a %token with an L_ prefix:
  56  *      // node labels (can't be generated by lex)
  57  *      %token L_LT L_LTE L_GT L_GTE L_EQU L_NEQ
  58  * 7. Call set_lex_input() before parsing.
  59  */
  60
  61 #define SQ      '\''
  62 #define DQ      '"'
  63
  64 #define isquote(c) ((c) == SQ || (c) == DQ)
  65 #define iswhite(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\f')
  66
  67 #define is_between(c, l, u)  ((l) <= (c) && (c) <= (u))
  68 #define is_white(c)     ((c) == ' ' || c == '\r' || c == '\t' || c == '\f')
  69 #define is_lower(c)     is_between((c), 'a', 'z')
  70 #define is_upper(c)     is_between((c), 'A', 'Z')
  71 #define is_alpha(c)     (is_lower(c) || is_upper(c))
  72 #define is_digit(c)     is_between((c), '0', '9')
  73 #define is_sstart(c)    (is_alpha(c) || (c) == '_')
  74 #define is_sfollow(c)   (is_sstart(c) || is_digit(c))
  75 #define is_xdigit(c)    \
  76         (is_digit(c) || is_between((c), 'A', 'F') || is_between((c), 'a', 'f'))
  77
  78 ndr_symbol_t            *symbol_list;
  79 static ndr_integer_t    *integer_list;
  80 static FILE             *lex_infp;
  81 static ndr_symbol_t     *file_name;
  82 int                     line_number;
  83 int                     n_compile_error;
  84
  85 static int              lex_at_bol;
  86
  87 /* In yacc(1) generated parser */
  88 extern struct node      *yylval;
  89
  90 /*
  91  * The keywtab[] and optable[] could be external to this lex
  92  * and it would all still work.
  93  */
  94 static ndr_keyword_t keywtable[] = {
  95         { "struct",     STRUCT_KW,      0 },
  96         { "union",      UNION_KW,       0 },
  97         { "typedef",    TYPEDEF_KW,     0 },
  98
  99         { "interface",  INTERFACE_KW,   0 },
 100         { "uuid",       UUID_KW,        0 },
 101         { "_no_reorder", _NO_REORDER_KW, 0 },
 102         { "extern",     EXTERN_KW,      0 },
 103         { "reference",  REFERENCE_KW,   0 },
 104
 105         { "align",      ALIGN_KW,       0 },
 106         { "operation",  OPERATION_KW,   0 },
 107         { "in",         IN_KW,          0 },
 108         { "out",        OUT_KW,         0 },
 109
 110         { "string",     STRING_KW,      0 },
 111         { "size_is",    SIZE_IS_KW,     0 },
 112         { "length_is",  LENGTH_IS_KW,   0 },
 113
 114         { "switch_is",  SWITCH_IS_KW,   0 },
 115         { "case",       CASE_KW,        0 },
 116         { "default",    DEFAULT_KW,     0 },
 117
 118         { "transmit_as", TRANSMIT_AS_KW, 0 },
 119         { "arg_is",     ARG_IS_KW,      0 },
 120
 121         { "char",       BASIC_TYPE,     1 },
 122         { "uchar",      BASIC_TYPE,     1 },
 123         { "wchar",      BASIC_TYPE,     2 },
 124         { "short",      BASIC_TYPE,     2 },
 125         { "ushort",     BASIC_TYPE,     2 },
 126         { "long",       BASIC_TYPE,     4 },
 127         { "ulong",      BASIC_TYPE,     4 },
 128         {0}
 129 };
 130
 131 static ndr_keyword_t optable[] = {
 132         { "{",          LC,             0 },
 133         { "}",          RC,             0 },
 134         { "(",          LP,             0 },
 135         { ")",          RP,             0 },
 136         { "[",          LB,             0 },
 137         { "]",          RB,             0 },
 138         { "*",          STAR,           0 },
 139         { "/",          DIV,            0 },
 140         { "%",          MOD,            0 },
 141         { "-",          MINUS,          0 },
 142         { "+",          PLUS,           0 },
 143         { "&",          AND,            0 },
 144         { "|",          OR,             0 },
 145         { "^",          XOR,            0 },
 146         { ";",          SEMI,           0 },
 147         {0}
 148 };
 149
 150 static int getch(FILE *fp);
 151 static ndr_integer_t *int_enter(long);
 152 static ndr_symbol_t *sym_enter(char *);
 153 static ndr_symbol_t *sym_find(char *);
 154 static int str_to_sv(char *, char *sv[]);
 155
 156 /*
 157  * Enter the symbols for keyword.
 158  */
 159 static void
 160 keyw_tab_init(ndr_keyword_t kwtable[])
 161 {
 162         int                     i;
 163         ndr_keyword_t           *kw;
 164         ndr_symbol_t            *sym;
 165
 166         for (i = 0; kwtable[i].name; i++) {
 167                 kw = &kwtable[i];
 168
 169                 sym = sym_enter(kw->name);
 170                 sym->kw = kw;
 171         }
 172 }
 173
 174 void
 175 set_lex_input(FILE *fp, char *name)
 176 {
 177         keyw_tab_init(keywtable);
 178         keyw_tab_init(optable);
 179
 180         lex_infp = fp;
 181         file_name = sym_enter(name);
 182         line_number = 1;
 183         lex_at_bol = 1;
 184 }
 185
 186 static int
 187 getch(FILE *fp)
 188 {
 189         return (getc(fp));
 190 }
 191
 192 int
 193 yylex(void)
 194 {
 195         char            lexeme[512];
 196         char            *p = lexeme;
 197         FILE            *fp = lex_infp;
 198         int             c, xc;
 199         ndr_symbol_t    *sym;
 200         ndr_integer_t   *intg;
 201
 202 top:
 203         p = lexeme;
 204
 205         c = getch(fp);
 206         if (c == EOF)
 207                 return (EOF);
 208
 209         if (c == '\n') {
 210                 line_number++;
 211                 lex_at_bol = 1;
 212                 goto top;
 213         }
 214
 215         /*
 216          * Handle preprocessor lines. This just notes
 217          * which file we're processing.
 218          */
 219         if (c == '#' && lex_at_bol) {
 220                 char            *sv[10];
 221                 int             sc;
 222
 223                 while ((c = getch(fp)) != EOF && c != '\n')
 224                         *p++ = c;
 225
 226                 *p = 0;
 227                 /* note: no ungetc() of newline, we don't want to count it */
 228
 229                 if (*lexeme != ' ') {
 230                         /* not a line we know */
 231                         goto top;
 232                 }
 233
 234                 sc = str_to_sv(lexeme, sv);
 235                 if (sc < 2)
 236                         goto top;
 237
 238                 file_name = sym_enter(sv[1]);
 239                 line_number = atoi(sv[0]);      /* for next input line */
 240                 lex_at_bol = 1;
 241                 goto top;
 242         }
 243
 244         lex_at_bol = 0;
 245
 246         /*
 247          * Skip white space
 248          */
 249         if (is_white(c))
 250                 goto top;
 251
 252         /*
 253          * Symbol? Might be a keyword or just an identifier
 254          */
 255         if (is_sstart(c)) {
 256                 /* we got a symbol */
 257                 do {
 258                         *p++ = c;
 259                         c = getch(fp);
 260                 } while (is_sfollow(c));
 261                 (void) ungetc(c, fp);
 262                 *p = 0;
 263
 264                 sym = sym_enter(lexeme);
 265
 266                 yylval = &sym->s_node;
 267
 268                 if (sym->kw) {
 269                         return (sym->kw->token);
 270                 } else {
 271                         return (IDENTIFIER);
 272                 }
 273         }
 274
 275         /*
 276          * Integer constant?
 277          */
 278         if (is_digit(c)) {
 279                 /* we got a number */
 280                 *p++ = c;
 281                 if (c == '0') {
 282                         c = getch(fp);
 283                         if (c == 'x' || c == 'X') {
 284                                 /* handle hex specially */
 285                                 do {
 286                                         *p++ = c;
 287                                         c = getch(fp);
 288                                 } while (is_xdigit(c));
 289                                 goto convert_icon;
 290                         } else if (c == 'b' || c == 'B' ||
 291                             c == 'd' || c == 'D' ||
 292                             c == 'o' || c == 'O') {
 293                                 do {
 294                                         *p++ = c;
 295                                         c = getch(fp);
 296                                 } while (is_digit(c));
 297                                 goto convert_icon;
 298                         }
 299                         (void) ungetc(c, fp);
 300                 }
 301                 /* could be anything */
 302                 c = getch(fp);
 303                 while (is_digit(c)) {
 304                         *p++ = c;
 305                         c = getch(fp);
 306                 }
 307
 308 convert_icon:
 309                 *p = 0;
 310                 (void) ungetc(c, fp);
 311
 312                 intg = int_enter(strtol(lexeme, 0, 0));
 313                 yylval = &intg->s_node;
 314
 315                 return (INTEGER);
 316         }
 317
 318         /* Could handle strings. We don't seem to need them yet */
 319
 320         yylval = 0;             /* operator tokens have no value */
 321         xc = getch(fp);         /* get look-ahead for two-char lexemes */
 322
 323         lexeme[0] = c;
 324         lexeme[1] = xc;
 325         lexeme[2] = 0;
 326
 327         /*
 328          * Look for to-end-of-line comment
 329          */
 330         if (c == '/' && xc == '/') {
 331                 /* eat the comment */
 332                 while ((c = getch(fp)) != EOF && c != '\n')
 333                         ;
 334                 (void) ungetc(c, fp);           /* put back newline */
 335                 goto top;
 336         }
 337
 338         /*
 339          * Look for multi-line comment
 340          */
 341         if (c == '/' && xc == '*') {
 342                 /* eat the comment */
 343                 xc = -1;
 344                 while ((c = getch(fp)) != EOF) {
 345                         if (xc == '*' && c == '/') {
 346                                 /* that's it */
 347                                 break;
 348                         }
 349                         xc = c;
 350                         if (c == '\n')
 351                                 line_number++;
 352                 }
 353                 goto top;
 354         }
 355
 356         /*
 357          * Use symbol table lookup for two-character and
 358          * one character operator tokens.
 359          */
 360         sym = sym_find(lexeme);
 361         if (sym) {
 362                 /* there better be a keyword attached */
 363                 yylval = &sym->s_node;
 364                 return (sym->kw->token);
 365         }
 366
 367         /* Try a one-character form */
 368         (void) ungetc(xc, fp);
 369         lexeme[1] = 0;
 370         sym = sym_find(lexeme);
 371         if (sym) {
 372                 /* there better be a keyword attached */
 373                 yylval = &sym->s_node;
 374                 return (sym->kw->token);
 375         }
 376
 377         if (is_between(c, ' ', '~'))
 378                 compile_error("unrecognized character: 0x%02x (%c)", c, c);
 379         else
 380                 compile_error("unrecognized character: 0x%02x", c);
 381         goto top;
 382 }
 383
 384 static ndr_symbol_t *
 385 sym_find(char *name)
 386 {
 387         ndr_symbol_t            **pp;
 388         ndr_symbol_t            *p;
 389
 390         for (pp = &symbol_list; (p = *pp) != 0; pp = &p->next) {
 391                 if (strcmp(p->name, name) == 0)
 392                         return (p);
 393         }
 394
 395         return (0);
 396 }
 397
 398 static ndr_symbol_t *
 399 sym_enter(char *name)
 400 {
 401         ndr_symbol_t            **pp;
 402         ndr_symbol_t            *p;
 403
 404         for (pp = &symbol_list; (p = *pp) != 0; pp = &p->next) {
 405                 if (strcmp(p->name, name) == 0)
 406                         return (p);
 407         }
 408
 409         p = ndr_alloc(1, sizeof (ndr_symbol_t));
 410
 411         if ((p->name = strdup(name)) == NULL)
 412                 fatal_error("%s", strerror(ENOMEM));
 413
 414         p->s_node.label = IDENTIFIER;
 415         p->s_node.n_sym = p;
 416
 417         *pp = p;
 418
 419         return (p);
 420 }
 421
 422 static ndr_integer_t *
 423 int_enter(long value)
 424 {
 425         ndr_integer_t           **pp;
 426         ndr_integer_t           *p;
 427
 428         for (pp = &integer_list; (p = *pp) != 0; pp = &p->next) {
 429                 if (p->value == value)
 430                         return (p);
 431         }
 432
 433         p = ndr_alloc(1, sizeof (ndr_integer_t));
 434
 435         p->value = value;
 436         p->s_node.label = INTEGER;
 437         p->s_node.n_int = value;
 438
 439         *pp = p;
 440
 441         return (p);
 442 }
 443
 444 void *
 445 ndr_alloc(size_t nelem, size_t elsize)
 446 {
 447         void *p;
 448
 449         if ((p = calloc(nelem, elsize)) == NULL) {
 450                 fatal_error("%s", strerror(ENOMEM));
 451                 /* NOTREACHED */
 452         }
 453
 454         return (p);
 455 }
 456
 457 /*
 458  * The input context (filename, line number) is maintained by the
 459  * lexical analysis, and we generally want such info reported for
 460  * errors in a consistent manner.
 461  */
 462 void
 463 compile_error(const char *fmt, ...)
 464 {
 465         char    buf[NDLBUFSZ];
 466         va_list ap;
 467
 468         va_start(ap, fmt);
 469         (void) vsnprintf(buf, NDLBUFSZ, fmt, ap);
 470         va_end(ap);
 471
 472         (void) fprintf(stderr, "ndrgen: compile error: %s:%d: %s\n",
 473             file_name->name, line_number, buf);
 474
 475         n_compile_error++;
 476 }
 477
 478 void
 479 fatal_error(const char *fmt, ...)
 480 {
 481         char    buf[NDLBUFSZ];
 482         va_list ap;
 483
 484         va_start(ap, fmt);
 485         (void) vsnprintf(buf, NDLBUFSZ, fmt, ap);
 486         va_end(ap);
 487
 488         (void) fprintf(stderr, "ndrgen: fatal error: %s\n", buf);
 489         exit(1);
 490 }
 491
 492 /*
 493  * Setup nodes for the lexical analyzer.
 494  */
 495 struct node *
 496 n_cons(int label, ...)
 497 {
 498         ndr_node_t              *np;
 499         va_list ap;
 500
 501         np = ndr_alloc(1, sizeof (ndr_node_t));
 502
 503         va_start(ap, label);
 504         np->label = label;
 505         np->n_arg[0] = va_arg(ap, void *);
 506         np->n_arg[1] = va_arg(ap, void *);
 507         np->n_arg[2] = va_arg(ap, void *);
 508         va_end(ap);
 509
 510         np->line_number = line_number;
 511         np->file_name = file_name;
 512
 513         return (np);
 514 }
 515
 516 /*
 517  *      list:   item
 518  *      |       list item       ={ n_splice($1, $2); }
 519  *      ;
 520  */
 521 void
 522 n_splice(struct node *np1, struct node *np2)
 523 {
 524         while (np1->n_next)
 525                 np1 = np1->n_next;
 526
 527         np1->n_next = np2;
 528 }
 529
 530 /*
 531  * Convert a string of words to a vector of strings.
 532  * Returns the number of words.
 533  */
 534 static int
 535 str_to_sv(char *buf, char *sv[])
 536 {
 537         char            **pp = sv;
 538         char            *p = buf;
 539         char            *q = buf;
 540         int             in_word = 0;
 541         int             c;
 542
 543         for (;;) {
 544                 c = *p++;
 545                 if (c == 0)
 546                         break;
 547
 548                 if (!in_word) {
 549                         if (iswhite(c))
 550                                 continue;
 551
 552                         *pp++ = q;
 553                         in_word = 1;
 554                 }
 555
 556                 if (isquote(c)) {
 557                         int             qc = c;
 558
 559                         while (((c = *p++) != 0) && (c != qc))
 560                                 *q++ = c;
 561                         if (c == 0)
 562                                 break;
 563                 } else if (iswhite(c)) {
 564                         /* end of word */
 565                         *q++ = 0;
 566                         in_word = 0;
 567                 } else {
 568                         /* still inside word */
 569                         *q++ = c;
 570                 }
 571         }
 572
 573         if (in_word)
 574                 *q++ = 0;
 575
 576         *pp = (char *)0;
 577         return (pp - sv);
 578 }