tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  * Permission is hereby granted, free of charge, to any person obtaining a copy
   9  * of this software and associated documentation files (the "Software"), to deal
  10  * in the Software without restriction, including without limitation the rights
  11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12  * copies of the Software, and to permit persons to whom the Software is
  13  * furnished to do so, subject to the following conditions:
  14  *
  15  * The above copyright notice and this permission notice shall be included in
  16  * all copies or substantial portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24  * THE SOFTWARE.
  25  */
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <stdarg.h>
  29 #include <stddef.h>
  30 #include <string.h>
  31 #include <ctype.h>
  32 #include <unistd.h>
  33 #include <stdint.h>
  34
  35 #include "lib.h"
  36 #include "allocate.h"
  37 #include "token.h"
  38 #include "symbol.h"
  39
/*
 * End-of-input marker returned by the character readers in this file
 * (nextchar_slow() returns it once no more input can be read).
 */
#define EOF (-1)

/* Number of entries of input_streams[] in use; init_stream() increments it. */
int input_stream_nr = 0;
/* Table of registered input streams, grown on demand by init_stream(). */
struct stream *input_streams;
/* Number of entries input_streams[] currently has room for. */
static int input_streams_allocated;
/* Width of a tab stop, used to advance the column when a '\t' is read. */
unsigned int tabstop = 8;
/* When non-zero, stream_pos() reports the fixed dummy line number 123456. */
int no_lineno = 0;

/* Maximum number of bytes requested per read() when refilling a stream buffer. */
#define BUFSIZE (8192)
  49
/*
 * Per-stream tokenizer state: the raw input buffer, the current source
 * position and the token list under construction.
 */
typedef struct {
        /* input descriptor (negative: nothing to read); read cursor and valid byte count in buffer */
        int fd, offset, size;
        /* current column and line; stream number copied into every position */
        int pos, line, nr;
        /* flags copied into the position of the next token by stream_pos() */
        int newline, whitespace;
        /* link where add_token() stores the next finished token */
        struct token **tokenlist;
        /* token currently being built, if any */
        struct token *token;
        /* input bytes; refilled by nextchar_slow(), at most BUFSIZE at a time */
        unsigned char *buffer;
} stream_t;
  58
  59 const char *stream_name(int stream)
  60 {
  61         if (stream < 0 || stream > input_stream_nr)
  62                 return "<bad stream>";
  63         return input_streams[stream].name;
  64 }
  65
  66 int stream_prev(int stream)
  67 {
  68         if (stream < 0 || stream > input_stream_nr)
  69                 return -1;
  70         stream = input_streams[stream].pos.stream;
  71         if (stream > input_stream_nr)
  72                 return -1;
  73         return stream;
  74 }
  75
  76 static struct position stream_pos(stream_t *stream)
  77 {
  78         struct position pos;
  79         pos.type = 0;
  80         pos.stream = stream->nr;
  81         pos.newline = stream->newline;
  82         pos.whitespace = stream->whitespace;
  83         pos.pos = stream->pos;
  84
  85         pos.line = stream->line;
  86         if (no_lineno)
  87                 pos.line = 123456;
  88
  89         pos.noexpand = 0;
  90         return pos;
  91 }
  92
  93 const char *show_special(int val)
  94 {
  95         static char buffer[4];
  96
  97         buffer[0] = val;
  98         buffer[1] = 0;
  99         if (val >= SPECIAL_BASE)
 100                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
 101         return buffer;
 102 }
 103
 104 const char *show_ident(const struct ident *ident)
 105 {
 106         static char buff[4][256];
 107         static int n;
 108         char *buffer;
 109
 110         if (!ident)
 111                 return "<noident>";
 112         buffer = buff[3 & ++n];
 113         sprintf(buffer, "%.*s", ident->len, ident->name);
 114         return buffer;
 115 }
 116
 117 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
 118 {
 119         if (isprint(c)) {
 120                 if (c == escape || c == '\\')
 121                         *ptr++ = '\\';
 122                 *ptr++ = c;
 123                 return ptr;
 124         }
 125         *ptr++ = '\\';
 126         switch (c) {
 127         case '\n':
 128                 *ptr++ = 'n';
 129                 return ptr;
 130         case '\t':
 131                 *ptr++ = 't';
 132                 return ptr;
 133         }
 134         if (!isdigit(next))
 135                 return ptr + sprintf(ptr, "%o", c);
 136
 137         return ptr + sprintf(ptr, "%03o", c);
 138 }
 139
 140 const char *show_string(const struct string *string)
 141 {
 142         static char buffer[4 * MAX_STRING + 3];
 143         char *ptr;
 144         int i;
 145
 146         if (!string || !string->length)
 147                 return "<bad_string>";
 148         ptr = buffer;
 149         *ptr++ = '"';
 150         for (i = 0; i < string->length-1; i++) {
 151                 const char *p = string->data + i;
 152                 ptr = charstr(ptr, p[0], '"', p[1]);
 153         }
 154         *ptr++ = '"';
 155         *ptr = '\0';
 156         return buffer;
 157 }
 158
 159 static const char *show_char(const char *s, size_t len, char prefix, char delim)
 160 {
 161         static char buffer[MAX_STRING + 4];
 162         char *p = buffer;
 163         if (prefix)
 164                 *p++ = prefix;
 165         *p++ = delim;
 166         memcpy(p, s, len);
 167         p += len;
 168         *p++ = delim;
 169         *p++ = '\0';
 170         return buffer;
 171 }
 172
 173 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
 174 {
 175         static char buffer[2*MAX_STRING + 6];
 176         size_t i;
 177         char *p = buffer;
 178         if (prefix)
 179                 *p++ = prefix;
 180         if (delim == '"')
 181                 *p++ = '\\';
 182         *p++ = delim;
 183         for (i = 0; i < len; i++) {
 184                 if (s[i] == '"' || s[i] == '\\')
 185                         *p++ = '\\';
 186                 *p++ = s[i];
 187         }
 188         if (delim == '"')
 189                 *p++ = '\\';
 190         *p++ = delim;
 191         *p++ = '\0';
 192         return buffer;
 193 }
 194
 195 const char *show_token(const struct token *token)
 196 {
 197         static char buffer[256];
 198
 199         if (!token)
 200                 return "<no token>";
 201         switch (token_type(token)) {
 202         case TOKEN_ERROR:
 203                 return "syntax error";
 204
 205         case TOKEN_EOF:
 206                 return "end-of-input";
 207
 208         case TOKEN_IDENT:
 209         case TOKEN_ZERO_IDENT:
 210                 return show_ident(token->ident);
 211
 212         case TOKEN_NUMBER:
 213                 return token->number;
 214
 215         case TOKEN_SPECIAL:
 216                 return show_special(token->special);
 217
 218         case TOKEN_CHAR:
 219                 return show_char(token->string->data,
 220                         token->string->length - 1, 0, '\'');
 221         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 222                 return show_char(token->embedded,
 223                         token_type(token) - TOKEN_CHAR, 0, '\'');
 224         case TOKEN_WIDE_CHAR:
 225                 return show_char(token->string->data,
 226                         token->string->length - 1, 'L', '\'');
 227         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 228                 return show_char(token->embedded,
 229                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 230         case TOKEN_STRING:
 231                 return show_char(token->string->data,
 232                         token->string->length - 1, 0, '"');
 233         case TOKEN_WIDE_STRING:
 234                 return show_char(token->string->data,
 235                         token->string->length - 1, 'L', '"');
 236
 237         case TOKEN_STREAMBEGIN:
 238                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 239                 return buffer;
 240
 241         case TOKEN_STREAMEND:
 242                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 243                 return buffer;
 244
 245         case TOKEN_UNTAINT:
 246                 sprintf(buffer, "<untaint>");
 247                 return buffer;
 248
 249         case TOKEN_ARG_COUNT:
 250                 sprintf(buffer, "<argcnt>");
 251                 return buffer;
 252
 253         default:
 254                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 255                 return buffer;
 256         }
 257 }
 258
 259 const char *quote_token(const struct token *token)
 260 {
 261         static char buffer[256];
 262
 263         switch (token_type(token)) {
 264         case TOKEN_ERROR:
 265                 return "syntax error";
 266
 267         case TOKEN_IDENT:
 268         case TOKEN_ZERO_IDENT:
 269                 return show_ident(token->ident);
 270
 271         case TOKEN_NUMBER:
 272                 return token->number;
 273
 274         case TOKEN_SPECIAL:
 275                 return show_special(token->special);
 276
 277         case TOKEN_CHAR:
 278                 return quote_char(token->string->data,
 279                         token->string->length - 1, 0, '\'');
 280         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 281                 return quote_char(token->embedded,
 282                         token_type(token) - TOKEN_CHAR, 0, '\'');
 283         case TOKEN_WIDE_CHAR:
 284                 return quote_char(token->string->data,
 285                         token->string->length - 1, 'L', '\'');
 286         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 287                 return quote_char(token->embedded,
 288                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 289         case TOKEN_STRING:
 290                 return quote_char(token->string->data,
 291                         token->string->length - 1, 0, '"');
 292         case TOKEN_WIDE_STRING:
 293                 return quote_char(token->string->data,
 294                         token->string->length - 1, 'L', '"');
 295         default:
 296                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 297                 return buffer;
 298         }
 299 }
 300
 301 #define HASHED_INPUT_BITS (6)
 302 #define HASHED_INPUT (1 << HASHED_INPUT_BITS)
 303 #define HASH_PRIME 0x9e370001UL
 304
 305 static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
 306
 307 int *hash_stream(const char *name)
 308 {
 309         uint32_t hash = 0;
 310         unsigned char c;
 311
 312         while ((c = *name++) != 0)
 313                 hash = (hash + (c << 4) + (c >> 4)) * 11;
 314
 315         hash *= HASH_PRIME;
 316         hash >>= 32 - HASHED_INPUT_BITS;
 317         return input_stream_hashes + hash;
 318 }
 319
 320 int init_stream(const struct position *pos, const char *name, int fd, const char **next_path)
 321 {
 322         int stream = input_stream_nr, *hash;
 323         struct stream *current;
 324
 325         if (stream >= input_streams_allocated) {
 326                 int newalloc = stream * 4 / 3 + 10;
 327                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 328                 if (!input_streams)
 329                         die("Unable to allocate more streams space");
 330                 input_streams_allocated = newalloc;
 331         }
 332         current = input_streams + stream;
 333         memset(current, 0, sizeof(*current));
 334         current->name = name;
 335         current->fd = fd;
 336         current->next_path = next_path;
 337         current->path = NULL;
 338         current->constant = CONSTANT_FILE_MAYBE;
 339         if (pos)
 340                 current->pos = *pos;
 341         else
 342                 current->pos.stream = -1;
 343         input_stream_nr = stream+1;
 344         hash = hash_stream(name);
 345         current->next_stream = *hash;
 346         *hash = stream;
 347         return stream;
 348 }
 349
 350 static struct token * alloc_token(stream_t *stream)
 351 {
 352         struct token *token = __alloc_token(0);
 353         token->pos = stream_pos(stream);
 354         return token;
 355 }
 356
 357 /*
 358  *  Argh...  That was surprisingly messy - handling '\r' complicates the
 359  *  things a _lot_.
 360  */
 361 static int nextchar_slow(stream_t *stream)
 362 {
 363         int offset = stream->offset;
 364         int size = stream->size;
 365         int c;
 366         int spliced = 0, had_cr, had_backslash;
 367
 368 restart:
 369         had_cr = had_backslash = 0;
 370
 371 repeat:
 372         if (offset >= size) {
 373                 if (stream->fd < 0)
 374                         goto got_eof;
 375                 size = read(stream->fd, stream->buffer, BUFSIZE);
 376                 if (size <= 0)
 377                         goto got_eof;
 378                 stream->size = size;
 379                 stream->offset = offset = 0;
 380         }
 381
 382         c = stream->buffer[offset++];
 383         if (had_cr)
 384                 goto check_lf;
 385
 386         if (c == '\r') {
 387                 had_cr = 1;
 388                 goto repeat;
 389         }
 390
 391 norm:
 392         if (!had_backslash) {
 393                 switch (c) {
 394                 case '\t':
 395                         stream->pos += tabstop - stream->pos % tabstop;
 396                         break;
 397                 case '\n':
 398                         stream->line++;
 399                         stream->pos = 0;
 400                         stream->newline = 1;
 401                         break;
 402                 case '\\':
 403                         had_backslash = 1;
 404                         stream->pos++;
 405                         goto repeat;
 406                 default:
 407                         stream->pos++;
 408                 }
 409         } else {
 410                 if (c == '\n') {
 411                         stream->line++;
 412                         stream->pos = 0;
 413                         spliced = 1;
 414                         goto restart;
 415                 }
 416                 offset--;
 417                 c = '\\';
 418         }
 419 out:
 420         stream->offset = offset;
 421
 422         return c;
 423
 424 check_lf:
 425         if (c != '\n')
 426                 offset--;
 427         c = '\n';
 428         goto norm;
 429
 430 got_eof:
 431         if (had_backslash) {
 432                 c = '\\';
 433                 goto out;
 434         }
 435         if (stream->pos & Wnewline_eof)
 436                 warning(stream_pos(stream), "no newline at end of file");
 437         else if (spliced)
 438                 warning(stream_pos(stream), "backslash-newline at end of file");
 439         return EOF;
 440 }
 441
 442 /*
 443  *  We want that as light as possible while covering all normal cases.
 444  *  Slow path (including the logics with line-splicing and EOF sanity
 445  *  checks) is in nextchar_slow().
 446  */
 447 static inline int nextchar(stream_t *stream)
 448 {
 449         int offset = stream->offset;
 450
 451         if (offset < stream->size) {
 452                 int c = stream->buffer[offset++];
 453                 static const char special[256] = {
 454                         ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 455                 };
 456                 if (!special[c]) {
 457                         stream->offset = offset;
 458                         stream->pos++;
 459                         return c;
 460                 }
 461         }
 462         return nextchar_slow(stream);
 463 }
 464
 465 struct token eof_token_entry;
 466
 467 static struct token *mark_eof(stream_t *stream)
 468 {
 469         struct token *end;
 470
 471         end = alloc_token(stream);
 472         eof_token_entry.pos = end->pos;
 473         token_type(end) = TOKEN_STREAMEND;
 474         end->pos.newline = 1;
 475
 476         eof_token_entry.next = &eof_token_entry;
 477         eof_token_entry.pos.newline = 1;
 478
 479         end->next =  &eof_token_entry;
 480         *stream->tokenlist = end;
 481         stream->tokenlist = NULL;
 482         return end;
 483 }
 484
 485 static void add_token(stream_t *stream)
 486 {
 487         struct token *token = stream->token;
 488
 489         stream->token = NULL;
 490         token->next = NULL;
 491         *stream->tokenlist = token;
 492         stream->tokenlist = &token->next;
 493 }
 494
 495 static void drop_token(stream_t *stream)
 496 {
 497         stream->newline |= stream->token->pos.newline;
 498         stream->whitespace |= stream->token->pos.whitespace;
 499         stream->token = NULL;
 500 }
 501
/* Character class bits stored in cclass[]; one character may have several. */
enum {
        Letter = 1,             /* may appear in an identifier (letters and '_') */
        Digit = 2,              /* decimal digit */
        Hex = 4,                /* hexadecimal digit */
        Exp = 8,                /* exponent letter: may be followed by a sign inside a number */
        Dot = 16,               /* '.' */
        ValidSecond = 32,       /* may be the second character of a two-character punctuator */
        Quote = 64,             /* starts a character or string literal */
};

/*
 * Class of every character, indexed by (character + 1) so that EOF (-1)
 * maps to entry 0, which - like every entry not listed here - is 0.
 */
static const char cclass[257] = {
        ['0' + 1 ... '9' + 1] = Digit | Hex,
        ['A' + 1 ... 'D' + 1] = Letter | Hex,
        ['E' + 1] = Letter | Hex | Exp, /* E<exp> */
        ['F' + 1] = Letter | Hex,
        ['G' + 1 ... 'O' + 1] = Letter,
        ['P' + 1] = Letter | Exp,       /* P<exp> */
        ['Q' + 1 ... 'Z' + 1] = Letter,
        ['a' + 1 ... 'd' + 1] = Letter | Hex,
        ['e' + 1] = Letter | Hex | Exp, /* e<exp> */
        ['f' + 1] = Letter | Hex,
        ['g' + 1 ... 'o' + 1] = Letter,
        ['p' + 1] = Letter | Exp,       /* p<exp> */
        ['q' + 1 ... 'z' + 1] = Letter,
        ['_' + 1] = Letter,
        ['.' + 1] = Dot | ValidSecond,
        ['=' + 1] = ValidSecond,
        ['+' + 1] = ValidSecond,
        ['-' + 1] = ValidSecond,
        ['>' + 1] = ValidSecond,
        ['<' + 1] = ValidSecond,
        ['&' + 1] = ValidSecond,
        ['|' + 1] = ValidSecond,
        ['#' + 1] = ValidSecond,
        ['\'' + 1] = Quote,
        ['"' + 1] = Quote,
};
 539
 540 /*
 541  * pp-number:
 542  *      digit
 543  *      . digit
 544  *      pp-number digit
 545  *      pp-number identifier-nodigit
 546  *      pp-number e sign
 547  *      pp-number E sign
 548  *      pp-number p sign
 549  *      pp-number P sign
 550  *      pp-number .
 551  */
 552 static int get_one_number(int c, int next, stream_t *stream)
 553 {
 554         struct token *token;
 555         static char buffer[4095];
 556         char *p = buffer, *buffer_end = buffer + sizeof (buffer);
 557
 558         *p++ = c;
 559         for (;;) {
 560                 long class =  cclass[next + 1];
 561                 if (!(class & (Dot | Digit | Letter)))
 562                         break;
 563                 if (p != buffer_end)
 564                         *p++ = next;
 565                 next = nextchar(stream);
 566                 if (class & Exp) {
 567                         if (next == '-' || next == '+') {
 568                                 if (p != buffer_end)
 569                                         *p++ = next;
 570                                 next = nextchar(stream);
 571                         }
 572                 }
 573         }
 574
 575         if (p == buffer_end) {
 576                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 577                       buffer_end - buffer);
 578                 // Pretend we saw just "1".
 579                 buffer[0] = '1';
 580                 p = buffer + 1;
 581         }
 582
 583         *p++ = 0;
 584         token = stream->token;
 585         token_type(token) = TOKEN_NUMBER;
 586         token->number = xmemdup(buffer, p - buffer);
 587         add_token(stream);
 588
 589         return next;
 590 }
 591
 592 static int eat_string(int next, stream_t *stream, enum token_type type)
 593 {
 594         static char buffer[MAX_STRING];
 595         struct string *string;
 596         struct token *token = stream->token;
 597         int len = 0;
 598         int escape;
 599         int want_hex = 0;
 600         char delim = type < TOKEN_STRING ? '\'' : '"';
 601
 602         for (escape = 0; escape || next != delim; next = nextchar(stream)) {
 603                 if (len < MAX_STRING)
 604                         buffer[len] = next;
 605                 len++;
 606                 if (next == '\n') {
 607                         warning(stream_pos(stream),
 608                                 "missing terminating %c character", delim);
 609                         /* assume delimiter is lost */
 610                         break;
 611                 }
 612                 if (next == EOF) {
 613                         warning(stream_pos(stream),
 614                                 "End of file in middle of string");
 615                         return next;
 616                 }
 617                 if (!escape) {
 618                         if (want_hex && !(cclass[next + 1] & Hex))
 619                                 warning(stream_pos(stream),
 620                                         "\\x used with no following hex digits");
 621                         want_hex = 0;
 622                         escape = next == '\\';
 623                 } else {
 624                         escape = 0;
 625                         want_hex = next == 'x';
 626                 }
 627         }
 628         if (want_hex)
 629                 warning(stream_pos(stream),
 630                         "\\x used with no following hex digits");
 631         if (len > MAX_STRING) {
 632                 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 633                 len = MAX_STRING;
 634         }
 635         if (delim == '\'' && len && len <= 4) {
 636                 token_type(token) = type + len;
 637                 memset(buffer + len, '\0', 4 - len);
 638                 memcpy(token->embedded, buffer, 4);
 639         } else {
 640                 token_type(token) = type;
 641                 string = __alloc_string(len+1);
 642                 memcpy(string->data, buffer, len);
 643                 string->data[len] = '\0';
 644                 string->length = len+1;
 645                 token->string = string;
 646         }
 647
 648         /* Pass it on.. */
 649         token = stream->token;
 650         add_token(stream);
 651         return nextchar(stream);
 652 }
 653
 654 static int drop_stream_eoln(stream_t *stream)
 655 {
 656         drop_token(stream);
 657         for (;;) {
 658                 switch (nextchar(stream)) {
 659                 case EOF:
 660                         return EOF;
 661                 case '\n':
 662                         return nextchar(stream);
 663                 }
 664         }
 665 }
 666
 667 static int drop_stream_comment(stream_t *stream)
 668 {
 669         int newline;
 670         int next;
 671         drop_token(stream);
 672         newline = stream->newline;
 673
 674         next = nextchar(stream);
 675         for (;;) {
 676                 int curr = next;
 677                 if (curr == EOF) {
 678                         warning(stream_pos(stream), "End of file in the middle of a comment");
 679                         return curr;
 680                 }
 681                 next = nextchar(stream);
 682                 if (curr == '*' && next == '/')
 683                         break;
 684         }
 685         stream->newline = newline;
 686         return nextchar(stream);
 687 }
 688
/*
 * Spellings of the multi-character punctuators, indexed by
 * (special value - SPECIAL_BASE); show_special() looks them up here.
 */
unsigned char combinations[][4] = COMBINATION_STRINGS;

/* Distance between SPECIAL_BASE and SPECIAL_ARG_SEPARATOR. */
#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)

/* hash function for two-character punctuators - all give unique values */
#define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)

/*
 * The two characters stored at each hash slot; get_one_special() compares
 * them with its input to confirm that a slot really matches.
 *
 * note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 */
static unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
        RES('+', '='), /* 00 */
        RES('/', '='), /* 01 */
        RES('^', '='), /* 05 */
        RES('&', '&'), /* 07 */
        RES('#', '#'), /* 08 */
        RES('<', '<'), /* 0a */
        RES('<', '='), /* 0c */
        RES('!', '='), /* 0e */
        RES('%', '='), /* 0f */
        RES('-', '-'), /* 10 */
        RES('-', '='), /* 11 */
        RES('-', '>'), /* 13 */
        RES('=', '='), /* 15 */
        RES('&', '='), /* 17 */
        RES('*', '='), /* 18 */
        RES('.', '.'), /* 1a */
        RES('+', '+'), /* 1b */
        RES('|', '='), /* 1c */
        RES('>', '='), /* 1d */
        RES('|', '|'), /* 1e */
        RES('>', '>')  /* 1f */
#undef RES
};
/* The SPECIAL_* value of the punctuator at each hash slot, parallel to hash_results[]. */
static int code[32] = {
#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
        CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
        CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
        CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
        CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
        CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
        CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
        CODE('<', '=', SPECIAL_LTE), /* 0c */
        CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
        CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
        CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
        CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
        CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
        CODE('=', '=', SPECIAL_EQUAL), /* 15 */
        CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
        CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
        CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
        CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
        CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
        CODE('>', '=', SPECIAL_GTE), /* 1d */
        CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
        CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
#undef CODE
};
 750
 751 static int get_one_special(int c, stream_t *stream)
 752 {
 753         struct token *token;
 754         int next, value, i;
 755
 756         next = nextchar(stream);
 757
 758         /*
 759          * Check for numbers, strings, character constants, and comments
 760          */
 761         switch (c) {
 762         case '.':
 763                 if (next >= '0' && next <= '9')
 764                         return get_one_number(c, next, stream);
 765                 break;
 766         case '"':
 767                 return eat_string(next, stream, TOKEN_STRING);
 768         case '\'':
 769                 return eat_string(next, stream, TOKEN_CHAR);
 770         case '/':
 771                 if (next == '/')
 772                         return drop_stream_eoln(stream);
 773                 if (next == '*')
 774                         return drop_stream_comment(stream);
 775         }
 776
 777         /*
 778          * Check for combinations
 779          */
 780         value = c;
 781         if (cclass[next + 1] & ValidSecond) {
 782                 i = special_hash(c, next);
 783                 if (hash_results[i][0] == c && hash_results[i][1] == next) {
 784                         value = code[i];
 785                         next = nextchar(stream);
 786                         if (value >= SPECIAL_LEFTSHIFT &&
 787                             next == "==."[value - SPECIAL_LEFTSHIFT]) {
 788                                 value += 3;
 789                                 next = nextchar(stream);
 790                         }
 791                 }
 792         }
 793
 794         /* Pass it on.. */
 795         token = stream->token;
 796         token_type(token) = TOKEN_SPECIAL;
 797         token->special = value;
 798         add_token(stream);
 799         return next;
 800 }
 801
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed it with the first character, fold in
 * each following character, then reduce the result to a bucket index.
 */
#define ident_hash_init(c)              (c)
#define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
#define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; the identifiers of a bucket are chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/*
 * Lookup counters: hits and misses are printed by show_identifier_stats();
 * idents counts the identifiers created by create_hashed_ident().
 */
static int ident_hit, ident_miss, idents;
 812
 813 void show_identifier_stats(void)
 814 {
 815         int i;
 816         int distribution[100];
 817
 818         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 819                 ident_hit, ident_miss);
 820
 821         for (i = 0; i < 100; i++)
 822                 distribution[i] = 0;
 823
 824         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 825                 struct ident * ident = hash_table[i];
 826                 int count = 0;
 827
 828                 while (ident) {
 829                         count++;
 830                         ident = ident->next;
 831                 }
 832                 if (count > 99)
 833                         count = 99;
 834                 distribution[count]++;
 835         }
 836
 837         for (i = 0; i < 100; i++) {
 838                 if (distribution[i])
 839                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 840         }
 841 }
 842
 843 struct ident *alloc_ident(const char *name, int len)
 844 {
 845         struct ident *ident = __alloc_ident(len);
 846         ident->symbols = NULL;
 847         ident->len = len;
 848         ident->tainted = 0;
 849         memcpy(ident->name, name, len);
 850         return ident;
 851 }
 852
 853 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 854 {
 855         ident->next = hash_table[hash];
 856         hash_table[hash] = ident;
 857         ident_miss++;
 858         return ident;
 859 }
 860
 861 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 862 {
 863         struct ident *ident;
 864         struct ident **p;
 865
 866         p = &hash_table[hash];
 867         while ((ident = *p) != NULL) {
 868                 if (ident->len == (unsigned char) len) {
 869                         if (strncmp(name, ident->name, len) != 0)
 870                                 goto next;
 871
 872                         ident_hit++;
 873                         return ident;
 874                 }
 875 next:
 876                 //misses++;
 877                 p = &ident->next;
 878         }
 879         ident = alloc_ident(name, len);
 880         *p = ident;
 881         ident->next = NULL;
 882         ident_miss++;
 883         idents++;
 884         return ident;
 885 }
 886
 887 static unsigned long hash_name(const char *name, int len)
 888 {
 889         unsigned long hash;
 890         const unsigned char *p = (const unsigned char *)name;
 891
 892         hash = ident_hash_init(*p++);
 893         while (--len) {
 894                 unsigned int i = *p++;
 895                 hash = ident_hash_add(hash, i);
 896         }
 897         return ident_hash_end(hash);
 898 }
 899
 900 struct ident *hash_ident(struct ident *ident)
 901 {
 902         return insert_hash(ident, hash_name(ident->name, ident->len));
 903 }
 904
 905 struct ident *built_in_ident(const char *name)
 906 {
 907         int len = strlen(name);
 908         return create_hashed_ident(name, len, hash_name(name, len));
 909 }
 910
 911 struct token *built_in_token(int stream, struct ident *ident)
 912 {
 913         struct token *token;
 914
 915         token = __alloc_token(0);
 916         token->pos.stream = stream;
 917         token_type(token) = TOKEN_IDENT;
 918         token->ident = ident;
 919         return token;
 920 }
 921
 922 static int get_one_identifier(int c, stream_t *stream)
 923 {
 924         struct token *token;
 925         struct ident *ident;
 926         unsigned long hash;
 927         char buf[256];
 928         int len = 1;
 929         int next;
 930
 931         hash = ident_hash_init(c);
 932         buf[0] = c;
 933         for (;;) {
 934                 next = nextchar(stream);
 935                 if (!(cclass[next + 1] & (Letter | Digit)))
 936                         break;
 937                 if (len >= sizeof(buf))
 938                         break;
 939                 hash = ident_hash_add(hash, next);
 940                 buf[len] = next;
 941                 len++;
 942         };
 943         if (cclass[next + 1] & Quote) {
 944                 if (len == 1 && (buf[0] == 'L' || buf[0] == 'u')) {
 945                         if (next == '\'')
 946                                 return eat_string(nextchar(stream), stream,
 947                                                         TOKEN_WIDE_CHAR);
 948                         else
 949                                 return eat_string(nextchar(stream), stream,
 950                                                         TOKEN_WIDE_STRING);
 951                 }
 952         }
 953         hash = ident_hash_end(hash);
 954         ident = create_hashed_ident(buf, len, hash);
 955
 956         /* Pass it on.. */
 957         token = stream->token;
 958         token_type(token) = TOKEN_IDENT;
 959         token->ident = ident;
 960         add_token(stream);
 961         return next;
 962 }
 963
 964 static int get_one_token(int c, stream_t *stream)
 965 {
 966         long class = cclass[c + 1];
 967         if (class & Digit)
 968                 return get_one_number(c, nextchar(stream), stream);
 969         if (class & Letter)
 970                 return get_one_identifier(c, stream);
 971         return get_one_special(c, stream);
 972 }
 973
 974 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 975         unsigned char *buf, unsigned int buf_size)
 976 {
 977         struct token *begin;
 978
 979         stream->nr = idx;
 980         stream->line = 1;
 981         stream->newline = 1;
 982         stream->whitespace = 0;
 983         stream->pos = 0;
 984
 985         stream->token = NULL;
 986         stream->fd = fd;
 987         stream->offset = 0;
 988         stream->size = buf_size;
 989         stream->buffer = buf;
 990
 991         begin = alloc_token(stream);
 992         token_type(begin) = TOKEN_STREAMBEGIN;
 993         stream->tokenlist = &begin->next;
 994         return begin;
 995 }
 996
 997 static struct token *tokenize_stream(stream_t *stream)
 998 {
 999         int c = nextchar(stream);
1000         while (c != EOF) {
1001                 if (!isspace(c)) {
1002                         struct token *token = alloc_token(stream);
1003                         stream->token = token;
1004                         stream->newline = 0;
1005                         stream->whitespace = 0;
1006                         c = get_one_token(c, stream);
1007                         continue;
1008                 }
1009                 stream->whitespace = 1;
1010                 c = nextchar(stream);
1011         }
1012         return mark_eof(stream);
1013 }
1014
1015 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1016 {
1017         stream_t stream;
1018         struct token *begin;
1019
1020         begin = setup_stream(&stream, 0, -1, buffer, size);
1021         *endtoken = tokenize_stream(&stream);
1022         return begin;
1023 }
1024
1025 struct token * tokenize(const struct position *pos, const char *name, int fd, struct token *endtoken, const char **next_path)
1026 {
1027         struct token *begin, *end;
1028         stream_t stream;
1029         unsigned char buffer[BUFSIZE];
1030         int idx;
1031
1032         idx = init_stream(pos, name, fd, next_path);
1033         if (idx < 0) {
1034                 // info(endtoken->pos, "File %s is const", name);
1035                 return endtoken;
1036         }
1037
1038         begin = setup_stream(&stream, idx, fd, buffer, 0);
1039         end = tokenize_stream(&stream);
1040         if (endtoken)
1041                 end->next = endtoken;
1042         return begin;
1043 }