src/st-lexer.c

   1 /*
   2  * st-lexer.c
   3  *
   4  * Copyright (C) 2008 Vincent Geddes
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23 */
  24
  25 /* Notes:
  26  *
  27  * we expand utf8-encoded text to ucs4 format and then lex it. Yes it's a bit
  28  * inefficient, but more straightforward than munging around with
  29  * multi-byte characters.
  30  *
  31  * Character input is supplied by the st_input object. It keeps track of
  32  * line/column numbers and has the ability to mark() and rewind() on the
  33  * input stream.
  34  *
  35  */
  36
  37 #include <config.h>
  38
  39 #include "st-lexer.h"
  40 #include "st-input.h"
  41 #include "st-utils.h"
  42
  43 #include <stdbool.h>
  44 #include <setjmp.h>
  45 #include <string.h>
  46 #include <stdio.h>
  47
  48 #include <stdlib.h>
  49 #include <limits.h>
  50 #include <ctype.h>
  51
  52 #define lookahead(self, k)   ((char) st_input_look_ahead (self->input, k))
  53 #define consume(self)        (st_input_consume (self->input))
  54 #define mark(self)           (st_input_mark (self->input))
  55 #define rewind(self)         (st_input_rewind (self->input))
  56
  57 typedef enum
  58 {
  59     ERROR_MISMATCHED_CHAR,
  60     ERROR_NO_VIABLE_ALT_FOR_CHAR,
  61     ERROR_ILLEGAL_CHAR,
  62     ERROR_UNTERMINATED_COMMENT,
  63     ERROR_UNTERMINATED_STRING_LITERAL,
  64     ERROR_INVALID_RADIX,
  65     ERROR_INVALID_CHAR_CONST,
  66     ERROR_NO_ALT_FOR_POUND,
  67
  68 } ErrorCode;
  69
  70 struct st_lexer
  71 {
  72     st_input *input;
  73
  74     bool filter_comments;
  75
  76     bool token_matched;
  77
  78     /* data for next token */
  79     st_uint line;
  80     st_uint column;
  81     st_uint start;
  82     st_token *token;
  83
  84     /* error control */
  85     bool    failed;
  86     jmp_buf main_loop;
  87
  88     /* last error information */
  89     ErrorCode error_code;
  90     st_uint   error_line;
  91     st_uint   error_column;
  92     char      error_char;
  93
  94     /* delayed deallocation */
  95     st_list *allocated_tokens;
  96 };
  97
  98 struct st_token
  99 {
 100     st_token_type type;
 101     int   line;
 102     int   column;
 103
 104     union {
 105         struct {
 106             char *text;
 107         };
 108         /* Number Token */
 109         struct {
 110             bool  negative;
 111             char *number;
 112             int   radix;
 113             int   exponent;
 114         };
 115     };
 116 };
 117
 118 static void
 119 make_token (st_lexer      *lexer,
 120             st_token_type  type,
 121             char        *text)
 122 {
 123     st_token *token;
 124
 125     token = st_new0 (st_token);
 126
 127     token->type   = type;
 128     token->text   = text ? text : st_strdup ("");
 129     token->type   = type;
 130     token->line   = lexer->line;
 131     token->column = lexer->column;
 132
 133     lexer->token = token;
 134     lexer->token_matched = true;
 135
 136
 137     lexer->allocated_tokens = st_list_prepend (lexer->allocated_tokens, token);
 138 }
 139
 140 static void
 141 make_number_token (st_lexer *lexer, int radix, int exponent, char *number, bool negative)
 142 {
 143     st_token *token;
 144
 145     token = st_new0 (st_token);
 146
 147     token->type   = ST_TOKEN_NUMBER_CONST;
 148     token->line   = lexer->line;
 149     token->column = lexer->column;
 150
 151     token->negative = negative;
 152     token->number   = number;
 153     token->radix    = radix;
 154     token->exponent = exponent;
 155
 156     lexer->token = token;
 157     lexer->token_matched = true;
 158
 159     lexer->allocated_tokens = st_list_prepend (lexer->allocated_tokens, token);
 160 }
 161
 162 static void
 163 raise_error (st_lexer   *lexer,
 164              ErrorCode  error_code,
 165              char      error_char)
 166 {
 167     lexer->failed = true;
 168
 169     lexer->error_code   = error_code;
 170     lexer->error_char   = error_char;
 171     lexer->error_line   = lexer->line;
 172     lexer->error_column = lexer->column;
 173
 174     /* create an token of type invalid */
 175     make_token (lexer, ST_TOKEN_INVALID, NULL);
 176
 177     /* hopefully recover after consuming char */
 178     consume (lexer);
 179
 180     /* go back to main loop */
 181     longjmp (lexer->main_loop, 0);
 182
 183 }
 184
 185 static void
 186 match_range (st_lexer *lexer, char a, char b)
 187 {
 188     if (lookahead (lexer, 1) < a || lookahead (lexer, 1) > b) {
 189         // mismatch error
 190         raise_error (lexer, ERROR_MISMATCHED_CHAR, lookahead (lexer, 1));
 191     }
 192     consume (lexer);
 193 }
 194
 195 static void
 196 match (st_lexer *lexer, char c)
 197 {
 198     if (lookahead (lexer, 1) != c) {
 199         // mismatch error
 200         raise_error (lexer, ERROR_MISMATCHED_CHAR, lookahead (lexer, 1));
 201     }
 202     consume (lexer);
 203 }
 204
 205 static bool
 206 is_special_char (char c)
 207 {
 208     switch (c) {
 209
 210     case '+': case '/': case '\\': case '*': case '~':
 211     case '<': case '>': case '=': case '@': case '%':
 212     case '|': case '&': case '?': case '!': case ',':
 213         return true;
 214
 215     default:
 216         return false;
 217
 218     }
 219 }
 220
 221 /* check if a char is valid numeral identifier for a given radix
 222  *
 223  * for example, 2r1010301 is an invalid number since the '3' is not within the radix.
 224  *
 225  **/
 226 static bool
 227 is_radix_numeral (st_uint radix, char c)
 228 {
 229     st_assert (radix >= 2 && radix <= 36);
 230
 231     if (radix > 10)
 232         return (c >= '0' && c <= '9') || (c >= 'A' && c <= ('A' - 1 + (radix - 10)));
 233     else
 234         return c >= '0' && c <= ('0' - 1 + radix);
 235 }
 236
 237 /* Numbers. We do just do basic matching here. Actual parsing and conversion can
 238  * be done in the parser.
 239  */
 240 static void
 241 match_number (st_lexer *lexer)
 242 {
 243     /* We don't match any leading '-'. The parser will resolve whether a '-'
 244      * specifies a negative number or a binary selector
 245      */
 246
 247     bool negative = false;
 248     long radix = 10;
 249     long exponent = 0;
 250     int k, j, l;
 251     char *string;
 252
 253     if (lookahead (lexer, 1) == '-') {
 254         negative = true;
 255         consume (lexer);
 256     }
 257
 258     k = st_input_index (lexer->input);
 259
 260     do {
 261         match_range (lexer, '0', '9');
 262     } while (isdigit (lookahead (lexer, 1)));
 263
 264     if (lookahead (lexer, 1) != 'r') {
 265
 266         j = st_input_index (lexer->input);
 267         goto out1;
 268
 269     } else {
 270
 271         string = st_input_range (lexer->input, k,
 272                                  st_input_index (lexer->input));
 273
 274         radix = strtol (string, NULL, 10);
 275         st_free (string);
 276         if (radix < 2 || radix > 36) {
 277             raise_error (lexer, ERROR_INVALID_RADIX, lookahead (lexer, 1));
 278         }
 279
 280         consume (lexer);
 281
 282     }
 283
 284     k = st_input_index (lexer->input);
 285
 286     if (lookahead (lexer, 1) == '-')
 287         raise_error (lexer, ERROR_NO_VIABLE_ALT_FOR_CHAR, lookahead (lexer, 1));
 288
 289 out1:
 290
 291     while (is_radix_numeral (radix, lookahead (lexer, 1)))
 292         consume (lexer);
 293
 294     if (lookahead (lexer, 1) == '.' && is_radix_numeral (radix, lookahead (lexer, 2))) {
 295         consume (lexer);
 296
 297         do {
 298             consume (lexer);
 299         } while (is_radix_numeral (radix, lookahead (lexer, 1)));
 300     }
 301
 302     j = st_input_index (lexer->input);
 303
 304     if (lookahead (lexer, 1) == 'e') {
 305
 306         consume (lexer);
 307
 308         l = st_input_index (lexer->input);
 309
 310         if (lookahead (lexer, 1) == '-' && isdigit (lookahead (lexer, 2)))
 311             consume (lexer);
 312
 313         while (isdigit (lookahead (lexer, 1)))
 314                 consume (lexer);
 315
 316         if (l == st_input_index (lexer->input))
 317             goto out2;
 318
 319         string = st_input_range (lexer->input, l,
 320                                  st_input_index (lexer->input));
 321         exponent = strtol (string, NULL, 10);
 322         st_free (string);
 323     }
 324
 325 out2:
 326
 327     make_number_token (lexer, radix, exponent,
 328                        st_input_range (lexer->input, k, j),
 329                        negative);
 330 }
 331
 332
 333 static void
 334 match_identifier (st_lexer *lexer, bool create_token)
 335 {
 336     if (isalpha (lookahead (lexer, 1)))
 337         consume (lexer);
 338     else {
 339         raise_error (lexer, ERROR_NO_VIABLE_ALT_FOR_CHAR, lookahead (lexer, 1));
 340     }
 341
 342     while (true) {
 343         if (isalpha (lookahead (lexer, 1)))
 344             consume (lexer);
 345         else if (lookahead (lexer, 1) >= '0' && lookahead (lexer, 1) <= '9')
 346             consume (lexer);
 347         else if (lookahead (lexer, 1) == '_')
 348             consume (lexer);
 349         else
 350             break;
 351     }
 352
 353     if (create_token) {
 354         make_token (lexer, ST_TOKEN_IDENTIFIER,
 355                     st_input_range (lexer->input, lexer->start,
 356                                     st_input_index (lexer->input)));
 357     }
 358 }
 359
 360 static void
 361 match_keyword_or_identifier (st_lexer *lexer, bool create_token)
 362 {
 363     if (isalpha (lookahead (lexer, 1)))
 364         consume (lexer);
 365     else {
 366         raise_error (lexer, ERROR_NO_VIABLE_ALT_FOR_CHAR, lookahead (lexer, 1));
 367     }
 368
 369     while (true) {
 370
 371         if (isalpha (lookahead (lexer, 1)))
 372             consume (lexer);
 373         else if (lookahead (lexer, 1) >= '0' && lookahead (lexer, 1) <= '9')
 374             consume (lexer);
 375         else if (lookahead (lexer, 1) == '_')
 376             consume (lexer);
 377         else
 378             break;
 379     }
 380
 381     st_token_type token_type;
 382
 383     if (lookahead (lexer, 1) == ':' && lookahead (lexer, 2) != '=') {
 384         consume (lexer);
 385         token_type = ST_TOKEN_KEYWORD_SELECTOR;
 386     } else {
 387         token_type = ST_TOKEN_IDENTIFIER;
 388     }
 389
 390     if (create_token) {
 391         char *text;
 392
 393         if (token_type == ST_TOKEN_KEYWORD_SELECTOR)
 394             text = st_input_range (lexer->input, lexer->start,
 395                                    st_input_index (lexer->input));
 396         else
 397             text = st_input_range (lexer->input, lexer->start,
 398                                    st_input_index (lexer->input));
 399
 400         make_token (lexer, token_type, text);
 401     }
 402
 403 }
 404
 405 static void
 406 match_string_constant (st_lexer *lexer)
 407 {
 408     mark (lexer);
 409
 410     match (lexer, '\'');
 411
 412     while (lookahead (lexer, 1) != '\'') {
 413         consume (lexer);
 414
 415         if (lookahead (lexer, 1) == ST_INPUT_EOF) {
 416             rewind (lexer);
 417             raise_error (lexer, ERROR_UNTERMINATED_STRING_LITERAL, lookahead (lexer, 1));
 418         }
 419     }
 420
 421     match (lexer, '\'');
 422
 423     char *string;
 424
 425     string = st_input_range (lexer->input,
 426                              lexer->start + 1,
 427                              st_input_index (lexer->input) - 1);
 428
 429     make_token (lexer, ST_TOKEN_STRING_CONST, string);
 430 }
 431
 432 static void
 433 match_comment (st_lexer *lexer)
 434 {
 435     mark (lexer);
 436
 437     match (lexer, '"');
 438
 439     while (lookahead (lexer, 1) != '"') {
 440         consume (lexer);
 441
 442         if (lookahead (lexer, 1) == ST_INPUT_EOF) {
 443             rewind (lexer);
 444             raise_error (lexer, ERROR_UNTERMINATED_COMMENT, lookahead (lexer, 1));
 445         }
 446     }
 447
 448     match (lexer, '"');
 449
 450     if (!lexer->filter_comments) {
 451
 452         char *comment;
 453
 454         comment = st_input_range (lexer->input,
 455                                   lexer->start + 1,
 456                                   st_input_index (lexer->input) - 1);
 457
 458         make_token (lexer, ST_TOKEN_COMMENT, comment);
 459     }
 460
 461 }
 462
 463 static void
 464 match_tuple_begin (st_lexer *lexer)
 465 {
 466     match (lexer, '#');
 467     match (lexer, '(');
 468
 469     make_token (lexer, ST_TOKEN_TUPLE_BEGIN, st_strdup ("#("));
 470 }
 471
 472 static void
 473 match_binary_selector (st_lexer *lexer, bool create_token)
 474 {
 475     if (lookahead (lexer, 1) == '-') {
 476         match (lexer, '-');
 477
 478         if (is_special_char (lookahead (lexer, 1)))
 479             match (lexer, lookahead (lexer, 1));
 480
 481     } else if (is_special_char (lookahead (lexer, 1))) {
 482         match (lexer, lookahead (lexer, 1));
 483
 484         if (is_special_char (lookahead (lexer, 1)))
 485             match (lexer, lookahead (lexer, 1));
 486
 487     } else {
 488         raise_error (lexer, ERROR_NO_VIABLE_ALT_FOR_CHAR, lookahead (lexer, 1));
 489     }
 490
 491     if (create_token) {
 492         make_token (lexer, ST_TOKEN_BINARY_SELECTOR,
 493                     st_input_range (lexer->input,
 494                                     lexer->start,
 495                                     st_input_index (lexer->input)));
 496     }
 497 }
 498
 499 static void
 500 match_symbol_constant (st_lexer *lexer)
 501 {
 502     match (lexer, '#');
 503
 504     if (isalpha (lookahead (lexer, 1))) {
 505
 506         do {
 507             match_keyword_or_identifier (lexer, false);
 508         } while (isalpha (lookahead (lexer, 1)));
 509
 510     } else if (lookahead (lexer, 1) == '-' || is_special_char (lookahead (lexer, 1))) {
 511         match_binary_selector (lexer, false);
 512     } else {
 513         raise_error (lexer, ERROR_NO_ALT_FOR_POUND, lookahead (lexer, 1));
 514     }
 515
 516     // discard #
 517     char *symbol_text = st_input_range (lexer->input,
 518                                         lexer->start + 1,
 519                                         st_input_index (lexer->input));
 520
 521     make_token (lexer, ST_TOKEN_SYMBOL_CONST, symbol_text);
 522 }
 523
 524 static void
 525 match_block_begin (st_lexer *lexer)
 526 {
 527     match (lexer, '[');
 528
 529     make_token (lexer, ST_TOKEN_BLOCK_BEGIN, NULL);
 530 }
 531
 532 static void
 533 match_block_end (st_lexer *lexer)
 534 {
 535     match (lexer, ']');
 536
 537     make_token (lexer, ST_TOKEN_BLOCK_END, NULL);
 538 }
 539
 540 static void
 541 match_lparen (st_lexer *lexer)
 542 {
 543     match (lexer, '(');
 544
 545     make_token (lexer, ST_TOKEN_LPAREN, NULL);
 546 }
 547
 548 static void
 549 match_rparen (st_lexer *lexer)
 550 {
 551     match (lexer, ')');
 552
 553     make_token (lexer, ST_TOKEN_RPAREN, NULL);
 554 }
 555
 556 static void
 557 match_char_constant (st_lexer *lexer)
 558 {
 559     char ch = 0;
 560     match (lexer, '$');
 561
 562     if (lookahead (lexer, 1) == '\\') {
 563
 564         if (lookahead (lexer, 2) == 't') {
 565             ch = '\t';
 566             consume (lexer);
 567             consume (lexer);
 568         } else if (lookahead (lexer, 2) == 'f') {
 569             ch = '\f';
 570             consume (lexer);
 571             consume (lexer);
 572         } else if (lookahead (lexer, 2) == 'n') {
 573             ch = '\n';
 574             consume (lexer);
 575             consume (lexer);
 576         } else if (lookahead (lexer, 2) == 'r') {
 577             ch = '\r';
 578             consume (lexer);
 579             consume (lexer);
 580         } else if (isxdigit (lookahead (lexer, 2))) {
 581             consume (lexer);
 582             int start = st_input_index (lexer->input);
 583
 584             do {
 585                 consume (lexer);
 586             } while (isxdigit (lookahead (lexer, 1)));
 587
 588             char *string = st_input_range (lexer->input, start, st_input_index (lexer->input));
 589             ch = strtol (string, NULL, 16);
 590             st_free (string);
 591
 592         } else {
 593             // just match the '\' char then
 594             ch = '\\';
 595             consume (lexer);
 596         }
 597
 598     } else if (isgraph (lookahead (lexer, 1))) {
 599         ch = lookahead (lexer, 1);
 600         consume (lexer);
 601     } else {
 602         raise_error (lexer, ERROR_INVALID_CHAR_CONST, lookahead (lexer, 1));
 603     }
 604
 605     char outbuf[6];
 606     st_unichar_to_utf8 (ch, outbuf);
 607     make_token (lexer, ST_TOKEN_CHARACTER_CONST, st_strdup (outbuf));
 608 }
 609
 610 static void
 611 match_eof (st_lexer *lexer)
 612 {
 613     match (lexer, ST_INPUT_EOF);
 614
 615     make_token (lexer, ST_TOKEN_EOF, NULL);
 616 }
 617
 618 static void
 619 match_white_space (st_lexer *lexer)
 620 {
 621     /* gobble up white space */
 622     while (true) {
 623         switch (lookahead (lexer, 1)) {
 624         case  ' ': case '\r':
 625         case '\n': case '\t': case '\f':
 626             consume (lexer);
 627             break;
 628         default:
 629             return;
 630         }
 631     }
 632 }
 633
 634 static void
 635 match_colon (st_lexer *lexer)
 636 {
 637     match (lexer, ':');
 638     make_token (lexer, ST_TOKEN_COLON, NULL);
 639 }
 640
 641 static void
 642 match_semicolon (st_lexer *lexer)
 643 {
 644     match (lexer, ';');
 645     make_token (lexer, ST_TOKEN_SEMICOLON, NULL);
 646 }
 647
 648 static void
 649 match_assign (st_lexer *lexer)
 650 {
 651     match (lexer, ':');
 652     match (lexer, '=');
 653     make_token (lexer, ST_TOKEN_ASSIGN, NULL);
 654 }
 655
 656 static void
 657 match_period (st_lexer *lexer)
 658 {
 659     match (lexer, '.');
 660     make_token (lexer, ST_TOKEN_PERIOD, NULL);
 661 }
 662
 663 static void
 664 match_return (st_lexer *lexer)
 665 {
 666     match (lexer, '^');
 667     make_token (lexer, ST_TOKEN_RETURN, NULL);
 668 }
 669
 670 /* st_lexer_next_token:
 671  * lexer: a st_lexer
 672  *
 673  * Returns the next matched token from the input stream. Caller takes
 674  * ownership of returned token.
 675  *
 676  * If the end of the input stream is reached, tokens of type ST_TOKEN_EOF
 677  * will be returned. Similarly, if there are matching errors, then tokens
 678  * of type ST_TOKEN_INVALID will be returned;
 679  *
 680  */
 681 st_token *
 682 st_lexer_next_token (st_lexer *lexer)
 683 {
 684     st_assert (lexer != NULL);
 685
 686     while (true) {
 687
 688         /* reset token and error state */
 689         lexer->failed = false;
 690         lexer->token_matched = false;
 691         lexer->line   = st_input_get_line (lexer->input);
 692         lexer->column = st_input_get_column (lexer->input);
 693         lexer->start  = st_input_index (lexer->input);
 694
 695         /* we return here on match errors and then goto out */
 696         if (setjmp (lexer->main_loop))
 697             goto out;
 698
 699         switch (lookahead (lexer, 1)) {
 700
 701         case ' ': case '\n': case '\r': case '\t': case '\f':
 702             match_white_space (lexer);
 703             break;
 704
 705         case '(':
 706             match_lparen (lexer);
 707             break;
 708
 709         case ')':
 710             match_rparen (lexer);
 711             break;
 712
 713         case '[':
 714             match_block_begin (lexer);
 715             break;
 716
 717         case ']':
 718             match_block_end (lexer);
 719             break;
 720
 721         case '^':
 722             match_return (lexer);
 723             break;
 724
 725         case '.':
 726             match_period (lexer);
 727             break;
 728
 729         case ';':
 730             match_semicolon (lexer);
 731             break;
 732
 733         case '+': case '/': case '\\':
 734         case '*': case '<': case '>': case '=':
 735         case '@': case '%': case '|': case '&':
 736         case '?': case '!': case '~': case ',':
 737             match_binary_selector (lexer, true);
 738             break;
 739
 740         case '$':
 741             match_char_constant (lexer);
 742             break;
 743
 744         case '"':
 745             match_comment (lexer);
 746             break;
 747
 748         case '\'':
 749             match_string_constant (lexer);
 750             break;
 751
 752         case ST_INPUT_EOF:
 753             match_eof (lexer);
 754             break;
 755
 756         default:
 757
 758             if (isalpha (lookahead (lexer, 1)))
 759                 match_keyword_or_identifier (lexer, true);
 760
 761             else if (lookahead (lexer, 1) == '-' && isdigit (lookahead (lexer, 2)))
 762                 match_number (lexer);
 763
 764             else if (isdigit (lookahead (lexer, 1)))
 765                 match_number (lexer);
 766
 767             else if (lookahead (lexer, 1) == '-')
 768                 match_binary_selector (lexer, true);
 769
 770             else if (lookahead (lexer, 1) == '#' && lookahead (lexer, 2) == '(')
 771                 match_tuple_begin (lexer);
 772
 773             else if (lookahead (lexer, 1) == '#')
 774                 match_symbol_constant (lexer);
 775
 776             // match assign or colon
 777             else if (lookahead (lexer, 1) == ':' && lookahead (lexer, 2) == '=')
 778                 match_assign (lexer);
 779
 780             else if (lookahead (lexer, 1) == ':')
 781                 match_colon (lexer);
 782
 783             else
 784                 raise_error (lexer, ERROR_ILLEGAL_CHAR, lookahead (lexer, 1));
 785         }
 786
 787       out:
 788
 789         // we return the matched token or an invalid token on error
 790         if (lexer->token_matched || lexer->failed)
 791             return lexer->token;
 792         else
 793             continue;
 794
 795     }
 796 }
 797
 798 static void
 799 lexer_initialize (st_lexer *lexer, st_input *input)
 800 {
 801     lexer->input = input;
 802     lexer->token = NULL;
 803     lexer->line = 1;
 804     lexer->column = 1;
 805     lexer->start = -1;
 806     lexer->error_code = 0;
 807     lexer->failed = false;
 808     lexer->filter_comments = true;
 809
 810     lexer->allocated_tokens = NULL;
 811 }
 812
 813 st_lexer *
 814 st_lexer_new (const char *string)
 815 {
 816     st_lexer *lexer;
 817     st_input *input;
 818
 819     st_assert (string != NULL);
 820
 821     lexer = st_new0 (st_lexer);
 822     input = st_input_new (string);
 823     if (!input)
 824         return NULL;
 825
 826     lexer_initialize (lexer, input);
 827
 828     return lexer;
 829 }
 830
 831 void
 832 destroy_token (st_token *token)
 833 {
 834     if (token->type != ST_TOKEN_NUMBER_CONST)
 835         st_free (token->text);
 836     else
 837         st_free (token->number);
 838
 839     st_free (token);
 840 }
 841
 842 void
 843 st_lexer_destroy (st_lexer *lexer)
 844 {
 845     st_assert (lexer != NULL);
 846
 847     st_input_destroy (lexer->input);
 848
 849     st_list_foreach (lexer->allocated_tokens, (st_list_foreach_func) destroy_token);
 850     st_list_destroy (lexer->allocated_tokens);
 851
 852     st_free (lexer);
 853 }
 854
 855 st_token_type
 856 st_token_get_type (st_token *token)
 857 {
 858     st_assert (token != NULL);
 859
 860     return token->type;
 861 }
 862
 863 char *
 864 st_token_get_text (st_token *token)
 865 {
 866     st_assert (token != NULL);
 867
 868     return token->text;
 869 }
 870
 871 st_uint
 872 st_token_get_line (st_token *token)
 873 {
 874     st_assert (token != NULL);
 875
 876     return token->line;
 877 }
 878
 879 st_uint
 880 st_token_get_column (st_token *token)
 881 {
 882     st_assert (token != NULL);
 883
 884     return token->column;
 885 }
 886
 887 st_uint
 888 st_lexer_error_line (st_lexer *lexer)
 889 {
 890     st_assert (lexer != NULL);
 891
 892     return lexer->error_line;
 893 }
 894
 895 st_uint
 896 st_lexer_error_column (st_lexer *lexer)
 897 {
 898     st_assert (lexer != NULL);
 899
 900     return lexer->error_column;
 901 }
 902
 903 char
 904 st_lexer_error_char (st_lexer *lexer)
 905 {
 906     st_assert (lexer != NULL);
 907
 908     return lexer->error_char;
 909 }
 910
 911 char *
 912 st_lexer_error_message (st_lexer *lexer)
 913 {
 914     st_assert (lexer != NULL);
 915
 916     static const char *msgformats[] = {
 917         "mismatched character \\%04X",
 918         "no viable alternative for character \\%04X",
 919         "illegal character \\%04X",
 920         "unterminated comment",
 921         "unterminated string literal",
 922         "invalid radix for number",
 923         "non-whitespace character expected after '$'",
 924         "expected '(' after '#'",
 925     };
 926
 927     switch (lexer->error_code) {
 928
 929     case ERROR_UNTERMINATED_COMMENT:
 930     case ERROR_UNTERMINATED_STRING_LITERAL:
 931     case ERROR_INVALID_RADIX:
 932     case ERROR_INVALID_CHAR_CONST:
 933     case ERROR_NO_ALT_FOR_POUND:
 934
 935         return st_strdup_printf (msgformats[lexer->error_code]);
 936
 937     case ERROR_MISMATCHED_CHAR:
 938     case ERROR_NO_VIABLE_ALT_FOR_CHAR:
 939     case ERROR_ILLEGAL_CHAR:
 940
 941         return st_strdup_printf (msgformats[lexer->error_code], lexer->error_char);
 942
 943     default:
 944         return NULL;
 945     }
 946 }
 947
 948 st_token *
 949 st_lexer_current_token (st_lexer *lexer)
 950 {
 951     return lexer->token;
 952 }
 953
 954 void
 955 st_lexer_filter_comments (st_lexer *lexer, bool filter)
 956 {
 957     lexer->filter_comments = filter;
 958 }
 959
 960 bool
 961 st_number_token_negative (st_token *token)
 962 {
 963     return token->negative;
 964 }
 965
 966 char *
 967 st_number_token_number (st_token *token)
 968 {
 969     return token->number;
 970 }
 971
 972 st_uint
 973 st_number_token_radix (st_token *token)
 974 {
 975     return token->radix;
 976 }
 977
 978 int
 979 st_number_token_exponent (st_token *token)
 980 {
 981     return token->exponent;
 982 }