compilers/bcc/linux86-0.16.17/unproto/tok_io.c

   1 /*++
   2 /* NAME
   3 /*      tok_io 3
   4 /* SUMMARY
   5 /*      token I/O
   6 /* PACKAGE
   7 /*      unproto
   8 /* SYNOPSIS
   9 /*      #include "token.h"
  10 /*
  11 /*      struct token *tok_get()
  12 /*
  13 /*      void tok_flush(t)
  14 /*      struct token *t;
  15 /*
  16 /*      void tok_show(t)
  17 /*      struct token *t;
  18 /*
  19 /*      void tok_show_ch(t)
  20 /*      struct token *t;
  21 /*
  22 /*      void put_str(s)
  23 /*      char *s;
  24 /*
  25 /*      void put_ch(c)
  26 /*      int c;
  27 /*
  28 /*      void put_nl()
  29 /*
  30 /*      char *in_path;
  31 /*      int in_line;
  32 /* DESCRIPTION
  33 /*      These functions read from stdin and write to stdout. The
  34 /*      tokenizer keeps track of where the token appeared in the input
  35 /*      stream; on output, this information is used to preserve correct
  36 /*      line number information (even after lots of token lookahead or
  37 /*      after function-header rewriting) so that diagnostics from the
  38 /*      next compiler stage make sense.
  39 /*
  40 /*      tok_get() reads the next token from standard input. It returns
  41 /*      a null pointer when the end of input is reached.
  42 /*
  43 /*      tok_show() displays the contents of a (possibly composite) token
  44 /*      on the standard output.
  45 /*
  46 /*      tok_show_ch() displays the contents of a single-character token
  47 /*      on the standard output. The character should not be a newline.
  48 /*
  49 /*      tok_flush() displays the contents of a (possibly composite) token
  50 /*      on the standard output and makes it available for re-use.
  51 /*
  52 /*      put_str() writes a null-terminated string to standard output.
  53 /*      There should be no newline characters in the string argument.
  54 /*
  55 /*      put_ch() writes one character to standard output. The character
  56 /*      should not be a newline.
  57 /*
  58 /*      put_nl() outputs a newline character and adjusts the program's idea of
  59 /*      the current output line.
  60 /*
  61 /*      The in_path and in_line variables contain the file name and
  62 /*      line number of the most recently read token.
  63 /* BUGS
  64 /*      The tokenizer is just good enough for the unproto filter.
  65 /*      As a benefit, it is quite fast.
  66 /* AUTHOR(S)
  67 /*      Wietse Venema
  68 /*      Eindhoven University of Technology
  69 /*      Department of Mathematics and Computer Science
  70 /*      Den Dolech 2, P.O. Box 513, 5600 MB Eindhoven, The Netherlands
  71 /* LAST MODIFICATION
  72 /*      92/01/15 21:52:59
  73 /* VERSION/RELEASE
  74 /*      1.3
  75 /*--*/
  76
  77 static char io_sccsid[] = "@(#) tok_io.c 1.3 92/01/15 21:52:59";
  78
  79 /* C library */
  80
  81 #include <stdio.h>
  82 #include <ctype.h>
  83
  84 extern char *strchr();
  85 extern char *malloc();
  86 extern char *realloc();
  87 extern char *strcpy();
  88
  89 /* Application-specific stuff */
  90
  91 #include "token.h"
  92 #include "vstring.h"
  93 #include "error.h"
  94
  95 extern char *strsave();                 /* XXX need include file */
  96
  97 /* Stuff to keep track of original source file name and position */
  98
  99 static char def_path[] = "";            /* default path name */
 100
 101 char   *in_path = def_path;             /* current input file name */
 102 int     in_line = 1;                    /* current input line number */
 103
 104 static char *out_path = def_path;       /* last name in output line control */
 105 static int out_line = 1;                /* current output line number */
 106 int     last_ch;                        /* type of last output */
 107
 108 /* Forward declarations */
 109
 110 static int read_quoted();
 111 static void read_comment();
 112 static int backslash_newline();
 113 static char *read_hex();
 114 static char *read_octal();
 115 static void fix_line_control();
 116
 117  /*
 118   * Character input with one level of pushback. The INPUT() macro recursively
 119   * strips backslash-newline pairs from the input stream. The UNPUT() macro
 120   * should be used only for characters obtained through the INPUT() macro.
 121   *
 122   * After skipping a backslash-newline pair, the input line counter is not
 123   * updated, and we continue with the same logical source line. We just
 124   * update a counter with the number of backslash-newline sequences that must
 125   * be accounted for (backslash_newline() updates the counter). At the end of
 126   * the logical source line, an appropriate number of newline characters is
 127   * pushed back (in tok_get()). I do not know how GCC handles this, but it
  128   * seems to produce the same output.
 129   *
 130   * Because backslash_newline() recursively calls itself (through the INPUT()
 131   * macro), we will run out of stack space, given a sufficiently long
 132   * sequence of backslash-newline pairs.
 133   */
 134
 135 static char in_char = 0;                /* push-back storage */
 136 static int in_flag = 0;                 /* pushback available */
 137 static int nl_compensate = 0;           /* line continuation kluge */
 138
 139 #define INPUT(c) (in_flag ? (in_flag = 0, c = in_char) : \
 140                     (c = getchar()) != '\\' ? c : \
 141                     (c = getchar()) != '\n' ? (ungetc(c, stdin), c = '\\') : \
 142                     (c = backslash_newline()))
 143 #define UNPUT(c) (in_flag = 1, in_char = c)
 144
 145 /* Directives that should be ignored. */
 146
 147 #ifdef IGNORE_DIRECTIVES
 148
 149 static char *ignore_directives[] = {
 150     IGNORE_DIRECTIVES,
 151     0,
 152 };
 153
 154 #endif
 155
 156 /* Modified string and ctype stuff. */
 157
 158 #define STREQUAL(x,y)   (*(x) == *(y) && strcmp((x),(y)) == 0)
 159
 160 #define ISALNUM(c)      (isalnum(c) || (c) == '_')
 161 #define ISALPHA(c)      (isalpha(c) || (c) == '_')
 162 #define ISSPACE(c)      (isspace(c) && c != '\n')
 163 #define ISDOT(c)        (c == '.')
 164 #define ISHEX(c)        (isdigit(c) || strchr("abcdefABCDEF", c) != 0)
 165 #define ISOCTAL(c)      (isdigit(c) && (c) != '8' && (c) != '9')
 166
 167 /* Collect all characters that satisfy one condition */
 168
 169 #define COLLECT(v,c,cond) { \
 170                                 register struct vstring *vs = v; \
 171                                 register char *cp = vs->str; \
 172                                 *cp++ = c; \
 173                                 while (INPUT(c) != EOF) { \
 174                                     if (cond) { \
 175                                         if (VS_ADDCH(vs, cp, c) == 0) \
 176                                             fatal("out of memory"); \
 177                                     } else { \
 178                                         UNPUT(c); \
 179                                         break; \
 180                                     } \
 181                                 } \
 182                                 *cp = 0; \
 183                             }
 184
 185 /* Ensure that output line information is correct */
 186
 187 #define CHECK_LINE_CONTROL(p,l) { if (out_path != (p) || out_line != (l)) \
 188                                         fix_line_control((p),(l)); }
 189
 190 /* do_control - parse control line */
 191
 192 static int do_control()
 193 {
 194     struct token *t;
 195     int     line;
 196     char   *path;
 197
 198     /* Make sure that the directive shows up in the right place. */
 199
 200     CHECK_LINE_CONTROL(in_path, in_line);
 201
 202     while (t = tok_get()) {
 203         switch (t->tokno) {
 204
 205         case TOK_WSPACE:
 206             /* Ignore blanks after "#" token. */
 207             tok_free(t);
 208             break;
 209
 210         case TOK_NUMBER:
 211
 212             /*
 213              * Line control is of the form: number pathname junk. Since we
 214              * have no idea what junk the preprocessor may generate, we copy
 215              * all line control tokens to stdout.
 216              */
 217
 218             put_str("# ");
 219             line = atoi(t->vstr->str);          /* extract line number */
 220             tok_flush(t);
 221             while ((t = tok_get()) && t->tokno == TOK_WSPACE)
 222                 tok_flush(t);                   /* copy white space */
 223             if (t) {                            /* extract path name */
 224                 path = (t->tokno == '"') ? strsave(t->vstr->str) : in_path;
 225                 do {
 226                     tok_flush(t);               /* copy until newline */
 227                 } while (t->tokno != '\n' && (t = tok_get()));
 228             }
 229             out_line = in_line = line;          /* synchronize */
 230             out_path = in_path = path;          /* synchronize */
 231             return;
 232
 233 #ifdef IGNORE_DIRECTIVES
 234
 235         case TOK_WORD:
 236
 237             /*
 238              * Optionally ignore other #directives. This is only a partial
 239              * solution, because the preprocessor will still see them.
 240              */
 241             {
 242                 char  **cpp;
 243                 char   *cp = t->vstr->str;
 244
 245                 for (cpp = ignore_directives; *cpp; cpp++) {
 246                     if (STREQUAL(cp, *cpp)) {
 247                         do {
 248                             tok_free(t);
 249                         } while (t->tokno != '\n' && (t = tok_get()));
 250                         return;
 251                     }
 252                 }
 253             }
 254             /* FALLTHROUGH */
 255 #endif
 256         default:
 257             /* Pass through. */
 258             put_ch('#');
 259             do {
 260                 tok_flush(t);
 261             } while (t->tokno != '\n' && (t = tok_get()));
 262             return;
 263
 264         case 0:
 265             /* Hit EOF, punt. */
 266             put_ch('#');
 267             return;
 268         }
 269     }
 270 }
 271
 272 /* backslash_newline - fix up things after reading a backslash-newline pair */
 273
 274 static int backslash_newline()
 275 {
 276     register int c;
 277
 278     nl_compensate++;
 279     return (INPUT(c));
 280 }
 281
 282 /* tok_get - get next token */
 283
 284 static int last_tokno = '\n';
 285
 286 struct token *tok_get()
 287 {
 288     register struct token *t;
 289     register int c;
 290     int     d;
 291
 292     /*
 293      * Get one from the pool and fill it in. The loop is here in case we hit
 294      * a preprocessor control line, which happens in a minority of all cases.
 295      * We update the token input path and line info *after* backslash-newline
 296      * processing or the newline compensation would go wrong.
 297      */
 298
 299     t = tok_alloc();
 300
 301     for (;;) {
 302         if ((INPUT(c)) == EOF) {
 303             tok_free(t);
 304             return (0);
 305         } else if ((t->line = in_line, t->path = in_path), !isascii(c)) {
 306             t->vstr->str[0] = c;
 307             t->vstr->str[1] = 0;
 308             t->tokno = TOK_OTHER;
 309             break;
 310         } else if (ISSPACE(c)) {
 311             COLLECT(t->vstr, c, ISSPACE(c));
 312             t->tokno = TOK_WSPACE;
 313             break;
 314         } else if (ISALPHA(c)) {
 315             COLLECT(t->vstr, c, ISALNUM(c));
 316             t->tokno = TOK_WORD;
 317             break;
 318         } else if (isdigit(c)) {
 319             COLLECT(t->vstr, c, isdigit(c));
 320             t->tokno = TOK_NUMBER;
 321             break;
 322         } else if (c == '"' || c == '\'') {
 323             t->tokno = read_quoted(t->vstr, c); /* detect missing end quote */
 324             break;
 325         } else if (ISDOT(c)) {
 326             COLLECT(t->vstr, c, ISDOT(c));
 327             t->tokno = TOK_OTHER;
 328             break;
 329         } else if (c == '#' && last_tokno == '\n') {
 330             do_control();
 331             continue;
 332         } else {
 333             t->vstr->str[0] = c;
 334             if (c == '\n') {
 335                 in_line++;
 336                 if (nl_compensate > 0) {        /* compensation for bs-nl */
 337                     UNPUT('\n');
 338                     nl_compensate--;
 339                 }
 340             } else if (c == '/') {
 341                 if ((INPUT(d)) == '*') {
 342                     t->vstr->str[1] = d;        /* comment */
 343                     read_comment(t->vstr);
 344                     t->tokno = TOK_WSPACE;
 345                     break;
 346                 } else {
 347                     if (d != EOF)
 348                         UNPUT(d);
 349                 }
 350             } else if (c == '\\') {
 351                 t->vstr->str[1] = (INPUT(c) == EOF ? 0 : c);
 352                 t->vstr->str[2] = 0;
 353                 t->tokno = TOK_OTHER;
 354                 break;
 355             }
 356             t->vstr->str[1] = 0;
 357             t->tokno = c;
 358             break;
 359         }
 360     }
 361     last_tokno = t->tokno;
 362     t->end_line = in_line;
 363     return (t);
 364 }
 365
 366 /* read_quoted - read string or character literal, canonicalize escapes */
 367
 368 static int read_quoted(vs, ch)
 369 register struct vstring *vs;
 370 int     ch;
 371 {
 372     register char *cp = vs->str;
 373     register int c;
 374     int     ret = TOK_OTHER;
 375
 376     *cp++ = ch;
 377
 378     /*
 379      * Clobber the token type in case of a premature newline or EOF. This
 380      * prevents us from attempting to concatenate string constants with
 381      * broken ones that have no closing quote.
 382      */
 383
 384     while (INPUT(c) != EOF) {
 385         if (c == '\n') {                        /* newline in string */
 386             UNPUT(c);
 387             break;
 388         }
 389         if (VS_ADDCH(vs, cp, c) == 0)           /* store character */
 390             fatal("out of memory");
 391         if (c == ch) {                          /* closing quote */
 392             ret = c;
 393             break;
 394         }
 395         if (c == '\\') {                        /* parse escape sequence */
 396             if ((INPUT(c)) == EOF) {            /* EOF, punt */
 397                 break;
 398             } else if (c == 'a') {              /* \a -> audible bell */
 399 #ifdef BELL
 400                 if ((cp = vs_strcpy(vs, cp, BELL)) == 0)
 401 #else
 402                 if ((cp = vs_strcpy(vs, cp, "\007")) == 0)
 403 #endif
 404                     fatal("out of memory");
 405             } else if (c == 'x') {              /* \xhh -> \nnn */
 406                 cp = read_hex(vs, cp);
 407             } else if (ISOCTAL(c) && ch != '\'') {
 408                 cp = read_octal(vs, cp, c);     /* canonicalize \octal */
 409             } else {
 410                 if (VS_ADDCH(vs, cp, c) == 0)   /* \other: leave alone */
 411                     fatal("out of memory");
 412             }
 413         }
 414     }
 415     *cp = 0;
 416     return (ret);
 417 }
 418
 419 /* read_comment - stuff a whole comment into one huge token */
 420
 421 static void read_comment(vs)
 422 register struct vstring *vs;
 423 {
 424     register char *cp = vs->str + 2;    /* skip slash star */
 425     register int c;
 426     register int d;
 427
 428     while (INPUT(c) != EOF) {
 429         if (VS_ADDCH(vs, cp, c) == 0)
 430             fatal("out of memory");
 431         if (c == '*') {
 432             if ((INPUT(d)) == '/') {
 433                 if (VS_ADDCH(vs, cp, d) == 0)
 434                     fatal("out of memory");
 435                 break;
 436             } else {
 437                 if (d != EOF)
 438                     UNPUT(d);
 439             }
 440         } else if (c == '\n') {
 441             in_line++;
 442         } else if (c == '\\') {
 443             if ((INPUT(d)) != EOF && VS_ADDCH(vs, cp, d) == 0)
 444                 fatal("out of memory");
 445         }
 446     }
 447     *cp = 0;
 448 }
 449
 450 /* read_hex - rewrite hex escape to three-digit octal escape */
 451
 452 static char *read_hex(vs, cp)
 453 struct vstring *vs;
 454 register char *cp;
 455 {
 456     register int c;
 457     register int i;
 458     char    buf[BUFSIZ];
 459     int     len;
 460     unsigned val;
 461
 462     /*
 463      * Eat up all subsequent hex digits. Complain later when there are too
 464      * many.
 465      */
 466
 467     for (i = 0; i < sizeof(buf) && (INPUT(c) != EOF) && ISHEX(c); i++)
 468         buf[i] = c;
 469     buf[i] = 0;
 470
 471     if (i < sizeof(buf) && c)
 472         UNPUT(c);
 473
 474     /*
 475      * Convert hex form to three-digit octal form. The three-digit form is
 476      * used so that strings can be concatenated without problems. Complain
 477      * about malformed input; truncate the result to at most three octal
 478      * digits.
 479      */
 480
 481     if (i == 0) {
 482         error("\\x escape sequence without hexadecimal digits");
 483         if (VS_ADDCH(vs, cp, 'x') == 0)
 484             fatal("out of memory");
 485     } else {
 486         (void) sscanf(buf, "%x", &val);
 487         sprintf(buf, "%03o", val);
 488         if ((len = strlen(buf)) > 3)
 489             error("\\x escape sequence yields non-character value");
 490         if ((cp = vs_strcpy(vs, cp, buf + len - 3)) == 0)
 491             fatal("out of memory");
 492     }
 493     return (cp);
 494 }
 495
 496 /* read_octal - convert octal escape to three-digit format */
 497
 498 static char obuf[] = "00123";
 499
 500 static char *read_octal(vs, cp, c)
 501 register struct vstring *vs;
 502 register char *cp;
 503 register int c;
 504 {
 505     register int i;
 506
 507 #define buf_input (obuf + 2)
 508
 509     /* Eat up at most three octal digits. */
 510
 511     buf_input[0] = c;
 512     for (i = 1; i < 3 && (INPUT(c) != EOF) && ISOCTAL(c); i++)
 513         buf_input[i] = c;
 514     buf_input[i] = 0;
 515
 516     if (i < 3 && c)
 517         UNPUT(c);
 518
 519     /*
 520      * Leave three-digit octal escapes alone. Convert one-digit and two-digit
 521      * octal escapes to three-digit form by prefixing them with a suitable
 522      * number of '0' characters. This is done so that strings can be
 523      * concatenated without problems.
 524      */
 525
 526     if ((cp = vs_strcpy(vs, cp, buf_input + i - 3)) == 0)
 527         fatal("out of memory");
 528     return (cp);
 529 }
 530
 531 /* put_nl - emit newline and adjust output line count */
 532
 533 void    put_nl()
 534 {
 535     put_ch('\n');
 536     out_line++;
 537 }
 538
 539 /* fix_line_control - to adjust path and/or line count info in output */
 540
 541 static void fix_line_control(path, line)
 542 register char *path;
 543 register int line;
 544 {
 545
 546     /*
 547      * This function is called sporadically, so it should not be a problem
 548      * that we repeat some of the tests that preceded this function call.
 549      *
 550      * Emit a newline if we are not at the start of a line.
 551      *
 552      * If we switch files, or if we jump backwards, emit line control. If we
 553      * jump forward, emit the proper number of newlines to compensate.
 554      */
 555
 556     if (last_ch != '\n')                        /* terminate open line */
 557         put_nl();
 558     if (path != out_path || line < out_line) {  /* file switch or back jump */
 559         printf("# %d %s\n", out_line = line, out_path = path);
 560         last_ch = '\n';
 561     } else {                                    /* forward jump */
 562         while (line > out_line)
 563             put_nl();
 564     }
 565 }
 566
 567 /* tok_show_ch - output single-character token (not newline) */
 568
 569 void    tok_show_ch(t)
 570 register struct token *t;
 571 {
 572     CHECK_LINE_CONTROL(t->path, t->line);
 573
 574     put_ch(t->tokno);                           /* show token contents */
 575 }
 576
 577 /* tok_show - output (possibly composite) token */
 578
 579 void    tok_show(t)
 580 register struct token *t;
 581 {
 582     register struct token *p;
 583
 584     if (t->tokno == TOK_LIST) {
 585         register struct token *s;
 586
 587         /*
 588          * This branch is completely in terms of tok_xxx() primitives, so
 589          * there is no need to check the line control information.
 590          */
 591
 592         for (s = t->head; s; s = s->next) {
 593             tok_show_ch(s);                     /* '(' or ',' or ')' */
 594             for (p = s->head; p; p = p->next)
 595                 tok_show(p);                    /* show list element */
 596         }
 597     } else {
 598         register char *cp = t->vstr->str;
 599
 600         /*
 601          * Measurements show that it pays off to give special treatment to
 602          * single-character tokens. Note that both types of token may cause a
 603          * change of output line number.
 604          */
 605
 606         CHECK_LINE_CONTROL(t->path, t->line);
 607         if (cp[1] == 0) {
 608             put_ch(*cp);                        /* single-character token */
 609         } else {
 610             put_str(cp);                        /* multi_character token */
 611         }
 612         out_line = t->end_line;                 /* may span multiple lines */
 613         for (p = t->head; p; p = p->next)
 614             tok_show(p);                        /* trailing blanks */
 615     }
 616 }