4 * Copyright (C) 2008 Vincent Geddes
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
27 * we expand utf8-encoded text to ucs4 format and then lex it. Yes it's a bit
28 * inefficient, but more straightforward than munging around with
29 * multi-byte characters.
31 * Character input is supplied by the st_input object. It keeps track of
32 * line/column numbers and has the ability to mark() and rewind() on the
52 #define lookahead(self, k) ((char) st_input_look_ahead (self->input, k))
53 #define consume(self) (st_input_consume (self->input))
54 #define mark(self) (st_input_mark (self->input))
55 #define rewind(self) (st_input_rewind (self->input))
59 ERROR_MISMATCHED_CHAR
,
60 ERROR_NO_VIABLE_ALT_FOR_CHAR
,
62 ERROR_UNTERMINATED_COMMENT
,
63 ERROR_UNTERMINATED_STRING_LITERAL
,
65 ERROR_INVALID_CHAR_CONST
,
66 ERROR_NO_ALT_FOR_POUND
,
78 /* data for next token */
88 /* last error information */
94 /* delayed deallocation */
95 st_list
*allocated_tokens
;
119 make_token (st_lexer
*lexer
,
125 token
= st_new0 (st_token
);
128 token
->text
= text
? text
: st_strdup ("");
130 token
->line
= lexer
->line
;
131 token
->column
= lexer
->column
;
133 lexer
->token
= token
;
134 lexer
->token_matched
= true;
137 lexer
->allocated_tokens
= st_list_prepend (lexer
->allocated_tokens
, token
);
141 make_number_token (st_lexer
*lexer
, int radix
, int exponent
, char *number
, bool negative
)
145 token
= st_new0 (st_token
);
147 token
->type
= ST_TOKEN_NUMBER_CONST
;
148 token
->line
= lexer
->line
;
149 token
->column
= lexer
->column
;
151 token
->negative
= negative
;
152 token
->number
= number
;
153 token
->radix
= radix
;
154 token
->exponent
= exponent
;
156 lexer
->token
= token
;
157 lexer
->token_matched
= true;
159 lexer
->allocated_tokens
= st_list_prepend (lexer
->allocated_tokens
, token
);
163 raise_error (st_lexer
*lexer
,
164 ErrorCode error_code
,
167 lexer
->failed
= true;
169 lexer
->error_code
= error_code
;
170 lexer
->error_char
= error_char
;
171 lexer
->error_line
= lexer
->line
;
172 lexer
->error_column
= lexer
->column
;
174 /* create a token of type invalid */
175 make_token (lexer
, ST_TOKEN_INVALID
, NULL
);
177 /* hopefully recover after consuming char */
180 /* go back to main loop */
181 longjmp (lexer
->main_loop
, 0);
186 match_range (st_lexer
*lexer
, char a
, char b
)
188 if (lookahead (lexer
, 1) < a
|| lookahead (lexer
, 1) > b
) {
190 raise_error (lexer
, ERROR_MISMATCHED_CHAR
, lookahead (lexer
, 1));
196 match (st_lexer
*lexer
, char c
)
198 if (lookahead (lexer
, 1) != c
) {
200 raise_error (lexer
, ERROR_MISMATCHED_CHAR
, lookahead (lexer
, 1));
206 is_special_char (char c
)
210 case '+': case '/': case '\\': case '*': case '~':
211 case '<': case '>': case '=': case '@': case '%':
212 case '|': case '&': case '?': case '!': case ',':
221 /* check if a char is valid numeral identifier for a given radix
223 * for example, 2r1010301 is an invalid number since the '3' is not within the radix.
227 is_radix_numeral (st_uint radix
, char c
)
229 st_assert (radix
>= 2 && radix
<= 36);
232 return (c
>= '0' && c
<= '9') || (c
>= 'A' && c
<= ('A' - 1 + (radix
- 10)));
234 return c
>= '0' && c
<= ('0' - 1 + radix
);
237 /* Numbers. We just do basic matching here. Actual parsing and conversion can
238 * be done in the parser.
241 match_number (st_lexer
*lexer
)
243 /* We don't match any leading '-'. The parser will resolve whether a '-'
244 * specifies a negative number or a binary selector
247 bool negative
= false;
253 if (lookahead (lexer
, 1) == '-') {
258 k
= st_input_index (lexer
->input
);
261 match_range (lexer
, '0', '9');
262 } while (isdigit (lookahead (lexer
, 1)));
264 if (lookahead (lexer
, 1) != 'r') {
266 j
= st_input_index (lexer
->input
);
271 string
= st_input_range (lexer
->input
, k
,
272 st_input_index (lexer
->input
));
274 radix
= strtol (string
, NULL
, 10);
276 if (radix
< 2 || radix
> 36) {
277 raise_error (lexer
, ERROR_INVALID_RADIX
, lookahead (lexer
, 1));
284 k
= st_input_index (lexer
->input
);
286 if (lookahead (lexer
, 1) == '-')
287 raise_error (lexer
, ERROR_NO_VIABLE_ALT_FOR_CHAR
, lookahead (lexer
, 1));
291 while (is_radix_numeral (radix
, lookahead (lexer
, 1)))
294 if (lookahead (lexer
, 1) == '.' && is_radix_numeral (radix
, lookahead (lexer
, 2))) {
299 } while (is_radix_numeral (radix
, lookahead (lexer
, 1)));
302 j
= st_input_index (lexer
->input
);
304 if (lookahead (lexer
, 1) == 'e') {
308 l
= st_input_index (lexer
->input
);
310 if (lookahead (lexer
, 1) == '-' && isdigit (lookahead (lexer
, 2)))
313 while (isdigit (lookahead (lexer
, 1)))
316 if (l
== st_input_index (lexer
->input
))
319 string
= st_input_range (lexer
->input
, l
,
320 st_input_index (lexer
->input
));
321 exponent
= strtol (string
, NULL
, 10);
327 make_number_token (lexer
, radix
, exponent
,
328 st_input_range (lexer
->input
, k
, j
),
334 match_identifier (st_lexer
*lexer
, bool create_token
)
336 if (isalpha (lookahead (lexer
, 1)))
339 raise_error (lexer
, ERROR_NO_VIABLE_ALT_FOR_CHAR
, lookahead (lexer
, 1));
343 if (isalpha (lookahead (lexer
, 1)))
345 else if (lookahead (lexer
, 1) >= '0' && lookahead (lexer
, 1) <= '9')
347 else if (lookahead (lexer
, 1) == '_')
354 make_token (lexer
, ST_TOKEN_IDENTIFIER
,
355 st_input_range (lexer
->input
, lexer
->start
,
356 st_input_index (lexer
->input
)));
361 match_keyword_or_identifier (st_lexer
*lexer
, bool create_token
)
363 if (isalpha (lookahead (lexer
, 1)))
366 raise_error (lexer
, ERROR_NO_VIABLE_ALT_FOR_CHAR
, lookahead (lexer
, 1));
371 if (isalpha (lookahead (lexer
, 1)))
373 else if (lookahead (lexer
, 1) >= '0' && lookahead (lexer
, 1) <= '9')
375 else if (lookahead (lexer
, 1) == '_')
381 st_token_type token_type
;
383 if (lookahead (lexer
, 1) == ':' && lookahead (lexer
, 2) != '=') {
385 token_type
= ST_TOKEN_KEYWORD_SELECTOR
;
387 token_type
= ST_TOKEN_IDENTIFIER
;
393 if (token_type
== ST_TOKEN_KEYWORD_SELECTOR
)
394 text
= st_input_range (lexer
->input
, lexer
->start
,
395 st_input_index (lexer
->input
));
397 text
= st_input_range (lexer
->input
, lexer
->start
,
398 st_input_index (lexer
->input
));
400 make_token (lexer
, token_type
, text
);
406 match_string_constant (st_lexer
*lexer
)
412 while (lookahead (lexer
, 1) != '\'') {
415 if (lookahead (lexer
, 1) == ST_INPUT_EOF
) {
417 raise_error (lexer
, ERROR_UNTERMINATED_STRING_LITERAL
, lookahead (lexer
, 1));
425 string
= st_input_range (lexer
->input
,
427 st_input_index (lexer
->input
) - 1);
429 make_token (lexer
, ST_TOKEN_STRING_CONST
, string
);
433 match_comment (st_lexer
*lexer
)
439 while (lookahead (lexer
, 1) != '"') {
442 if (lookahead (lexer
, 1) == ST_INPUT_EOF
) {
444 raise_error (lexer
, ERROR_UNTERMINATED_COMMENT
, lookahead (lexer
, 1));
450 if (!lexer
->filter_comments
) {
454 comment
= st_input_range (lexer
->input
,
456 st_input_index (lexer
->input
) - 1);
458 make_token (lexer
, ST_TOKEN_COMMENT
, comment
);
464 match_tuple_begin (st_lexer
*lexer
)
469 make_token (lexer
, ST_TOKEN_TUPLE_BEGIN
, st_strdup ("#("));
473 match_binary_selector (st_lexer
*lexer
, bool create_token
)
475 if (lookahead (lexer
, 1) == '-') {
478 if (is_special_char (lookahead (lexer
, 1)))
479 match (lexer
, lookahead (lexer
, 1));
481 } else if (is_special_char (lookahead (lexer
, 1))) {
482 match (lexer
, lookahead (lexer
, 1));
484 if (is_special_char (lookahead (lexer
, 1)))
485 match (lexer
, lookahead (lexer
, 1));
488 raise_error (lexer
, ERROR_NO_VIABLE_ALT_FOR_CHAR
, lookahead (lexer
, 1));
492 make_token (lexer
, ST_TOKEN_BINARY_SELECTOR
,
493 st_input_range (lexer
->input
,
495 st_input_index (lexer
->input
)));
500 match_symbol_constant (st_lexer
*lexer
)
504 if (isalpha (lookahead (lexer
, 1))) {
507 match_keyword_or_identifier (lexer
, false);
508 } while (isalpha (lookahead (lexer
, 1)));
510 } else if (lookahead (lexer
, 1) == '-' || is_special_char (lookahead (lexer
, 1))) {
511 match_binary_selector (lexer
, false);
513 raise_error (lexer
, ERROR_NO_ALT_FOR_POUND
, lookahead (lexer
, 1));
517 char *symbol_text
= st_input_range (lexer
->input
,
519 st_input_index (lexer
->input
));
521 make_token (lexer
, ST_TOKEN_SYMBOL_CONST
, symbol_text
);
525 match_block_begin (st_lexer
*lexer
)
529 make_token (lexer
, ST_TOKEN_BLOCK_BEGIN
, NULL
);
533 match_block_end (st_lexer
*lexer
)
537 make_token (lexer
, ST_TOKEN_BLOCK_END
, NULL
);
541 match_lparen (st_lexer
*lexer
)
545 make_token (lexer
, ST_TOKEN_LPAREN
, NULL
);
549 match_rparen (st_lexer
*lexer
)
553 make_token (lexer
, ST_TOKEN_RPAREN
, NULL
);
557 match_char_constant (st_lexer
*lexer
)
562 if (lookahead (lexer
, 1) == '\\') {
564 if (lookahead (lexer
, 2) == 't') {
568 } else if (lookahead (lexer
, 2) == 'f') {
572 } else if (lookahead (lexer
, 2) == 'n') {
576 } else if (lookahead (lexer
, 2) == 'r') {
580 } else if (isxdigit (lookahead (lexer
, 2))) {
582 int start
= st_input_index (lexer
->input
);
586 } while (isxdigit (lookahead (lexer
, 1)));
588 char *string
= st_input_range (lexer
->input
, start
, st_input_index (lexer
->input
));
589 ch
= strtol (string
, NULL
, 16);
593 // just match the '\' char then
598 } else if (isgraph (lookahead (lexer
, 1))) {
599 ch
= lookahead (lexer
, 1);
602 raise_error (lexer
, ERROR_INVALID_CHAR_CONST
, lookahead (lexer
, 1));
606 st_unichar_to_utf8 (ch
, outbuf
);
607 make_token (lexer
, ST_TOKEN_CHARACTER_CONST
, st_strdup (outbuf
));
611 match_eof (st_lexer
*lexer
)
613 match (lexer
, ST_INPUT_EOF
);
615 make_token (lexer
, ST_TOKEN_EOF
, NULL
);
619 match_white_space (st_lexer
*lexer
)
621 /* gobble up white space */
623 switch (lookahead (lexer
, 1)) {
625 case '\n': case '\t': case '\f':
635 match_colon (st_lexer
*lexer
)
638 make_token (lexer
, ST_TOKEN_COLON
, NULL
);
642 match_semicolon (st_lexer
*lexer
)
645 make_token (lexer
, ST_TOKEN_SEMICOLON
, NULL
);
649 match_assign (st_lexer
*lexer
)
653 make_token (lexer
, ST_TOKEN_ASSIGN
, NULL
);
657 match_period (st_lexer
*lexer
)
660 make_token (lexer
, ST_TOKEN_PERIOD
, NULL
);
664 match_return (st_lexer
*lexer
)
667 make_token (lexer
, ST_TOKEN_RETURN
, NULL
);
670 /* st_lexer_next_token:
673 * Returns the next matched token from the input stream. Caller takes
674 * ownership of returned token.
676 * If the end of the input stream is reached, tokens of type ST_TOKEN_EOF
677 * will be returned. Similarly, if there are matching errors, then tokens
678 * of type ST_TOKEN_INVALID will be returned.
682 st_lexer_next_token (st_lexer
*lexer
)
684 st_assert (lexer
!= NULL
);
688 /* reset token and error state */
689 lexer
->failed
= false;
690 lexer
->token_matched
= false;
691 lexer
->line
= st_input_get_line (lexer
->input
);
692 lexer
->column
= st_input_get_column (lexer
->input
);
693 lexer
->start
= st_input_index (lexer
->input
);
695 /* we return here on match errors and then goto out */
696 if (setjmp (lexer
->main_loop
))
699 switch (lookahead (lexer
, 1)) {
701 case ' ': case '\n': case '\r': case '\t': case '\f':
702 match_white_space (lexer
);
706 match_lparen (lexer
);
710 match_rparen (lexer
);
714 match_block_begin (lexer
);
718 match_block_end (lexer
);
722 match_return (lexer
);
726 match_period (lexer
);
730 match_semicolon (lexer
);
733 case '+': case '/': case '\\':
734 case '*': case '<': case '>': case '=':
735 case '@': case '%': case '|': case '&':
736 case '?': case '!': case '~': case ',':
737 match_binary_selector (lexer
, true);
741 match_char_constant (lexer
);
745 match_comment (lexer
);
749 match_string_constant (lexer
);
758 if (isalpha (lookahead (lexer
, 1)))
759 match_keyword_or_identifier (lexer
, true);
761 else if (lookahead (lexer
, 1) == '-' && isdigit (lookahead (lexer
, 2)))
762 match_number (lexer
);
764 else if (isdigit (lookahead (lexer
, 1)))
765 match_number (lexer
);
767 else if (lookahead (lexer
, 1) == '-')
768 match_binary_selector (lexer
, true);
770 else if (lookahead (lexer
, 1) == '#' && lookahead (lexer
, 2) == '(')
771 match_tuple_begin (lexer
);
773 else if (lookahead (lexer
, 1) == '#')
774 match_symbol_constant (lexer
);
776 // match assign or colon
777 else if (lookahead (lexer
, 1) == ':' && lookahead (lexer
, 2) == '=')
778 match_assign (lexer
);
780 else if (lookahead (lexer
, 1) == ':')
784 raise_error (lexer
, ERROR_ILLEGAL_CHAR
, lookahead (lexer
, 1));
789 // we return the matched token or an invalid token on error
790 if (lexer
->token_matched
|| lexer
->failed
)
799 lexer_initialize (st_lexer
*lexer
, st_input
*input
)
801 lexer
->input
= input
;
806 lexer
->error_code
= 0;
807 lexer
->failed
= false;
808 lexer
->filter_comments
= true;
810 lexer
->allocated_tokens
= NULL
;
814 st_lexer_new (const char *string
)
819 st_assert (string
!= NULL
);
821 lexer
= st_new0 (st_lexer
);
822 input
= st_input_new (string
);
826 lexer_initialize (lexer
, input
);
832 destroy_token (st_token
*token
)
834 if (token
->type
!= ST_TOKEN_NUMBER_CONST
)
835 st_free (token
->text
);
837 st_free (token
->number
);
843 st_lexer_destroy (st_lexer
*lexer
)
845 st_assert (lexer
!= NULL
);
847 st_input_destroy (lexer
->input
);
849 st_list_foreach (lexer
->allocated_tokens
, (st_list_foreach_func
) destroy_token
);
850 st_list_destroy (lexer
->allocated_tokens
);
856 st_token_get_type (st_token
*token
)
858 st_assert (token
!= NULL
);
864 st_token_get_text (st_token
*token
)
866 st_assert (token
!= NULL
);
872 st_token_get_line (st_token
*token
)
874 st_assert (token
!= NULL
);
880 st_token_get_column (st_token
*token
)
882 st_assert (token
!= NULL
);
884 return token
->column
;
888 st_lexer_error_line (st_lexer
*lexer
)
890 st_assert (lexer
!= NULL
);
892 return lexer
->error_line
;
896 st_lexer_error_column (st_lexer
*lexer
)
898 st_assert (lexer
!= NULL
);
900 return lexer
->error_column
;
904 st_lexer_error_char (st_lexer
*lexer
)
906 st_assert (lexer
!= NULL
);
908 return lexer
->error_char
;
912 st_lexer_error_message (st_lexer
*lexer
)
914 st_assert (lexer
!= NULL
);
916 static const char *msgformats
[] = {
917 "mismatched character \\%04X",
918 "no viable alternative for character \\%04X",
919 "illegal character \\%04X",
920 "unterminated comment",
921 "unterminated string literal",
922 "invalid radix for number",
923 "non-whitespace character expected after '$'",
924 "expected '(' after '#'",
927 switch (lexer
->error_code
) {
929 case ERROR_UNTERMINATED_COMMENT
:
930 case ERROR_UNTERMINATED_STRING_LITERAL
:
931 case ERROR_INVALID_RADIX
:
932 case ERROR_INVALID_CHAR_CONST
:
933 case ERROR_NO_ALT_FOR_POUND
:
935 return st_strdup_printf (msgformats
[lexer
->error_code
]);
937 case ERROR_MISMATCHED_CHAR
:
938 case ERROR_NO_VIABLE_ALT_FOR_CHAR
:
939 case ERROR_ILLEGAL_CHAR
:
941 return st_strdup_printf (msgformats
[lexer
->error_code
], lexer
->error_char
);
949 st_lexer_current_token (st_lexer
*lexer
)
955 st_lexer_filter_comments (st_lexer
*lexer
, bool filter
)
957 lexer
->filter_comments
= filter
;
961 st_number_token_negative (st_token
*token
)
963 return token
->negative
;
967 st_number_token_number (st_token
*token
)
969 return token
->number
;
973 st_number_token_radix (st_token
*token
)
979 st_number_token_exponent (st_token
*token
)
981 return token
->exponent
;