Add methods for converting between Collections (asBag, asSet, etc)
[panda.git] / src / st-lexer.c
blob8df1e703fdf40cd329c420a4a96abddebdfdfae8
1 /*
2 * st-lexer.c
4 * Copyright (C) 2008 Vincent Geddes
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
25 /* Notes:
27 * we expand utf8-encoded text to ucs4 format and then lex it. Yes it's a bit
28 * inefficient, but more straightforward than munging around with
29 * multi-byte characters.
31 * Character input is supplied by the st_input object. It keeps track of
32 * line/column numbers and has the ability to mark() and rewind() on the
33 * input stream.
37 #include <config.h>
39 #include "st-lexer.h"
40 #include "st-input.h"
41 #include "st-utils.h"
43 #include <stdbool.h>
44 #include <setjmp.h>
45 #include <string.h>
46 #include <stdio.h>
48 #include <stdlib.h>
49 #include <limits.h>
50 #include <ctype.h>
/* Thin wrappers around the lexer's st_input stream.
 *
 * lookahead() narrows the looked-up character to `char'; the matchers in
 * this file compare it against plain character constants.
 *
 * The macro argument is parenthesized so that any pointer expression can be
 * passed, not just a bare identifier.
 *
 * NOTE(review): `rewind' shadows the <stdio.h> function of the same name for
 * the rest of this file.
 */
#define lookahead(self, k) ((char) st_input_look_ahead ((self)->input, (k)))
#define consume(self)      (st_input_consume ((self)->input))
#define mark(self)         (st_input_mark ((self)->input))
#define rewind(self)       (st_input_rewind ((self)->input))
/* Error kinds recorded by raise_error().
 *
 * The enumerator order is significant: st_lexer_error_message() uses the
 * value as an index into its table of message strings.
 */
typedef enum
{
    ERROR_MISMATCHED_CHAR,
    ERROR_NO_VIABLE_ALT_FOR_CHAR,
    ERROR_ILLEGAL_CHAR,
    ERROR_UNTERMINATED_COMMENT,
    ERROR_UNTERMINATED_STRING_LITERAL,
    ERROR_INVALID_RADIX,
    ERROR_INVALID_CHAR_CONST,
    ERROR_NO_ALT_FOR_POUND,

} ErrorCode;
/* Complete state of one lexer instance. */
struct st_lexer
{
    /* character source; destroyed together with the lexer */
    st_input *input;

    /* when true, match_comment() consumes comments without making a token */
    bool filter_comments;

    /* set by the token constructors; st_lexer_next_token() returns once set */
    bool token_matched;

    /* data for next token */
    st_uint line;
    st_uint column;
    st_uint start;      /* input index at which the current token began */
    st_token *token;    /* most recently created token */

    /* error control */
    bool failed;        /* set by raise_error() */
    jmp_buf main_loop;  /* armed in st_lexer_next_token(), target of raise_error() */

    /* last error information */
    ErrorCode error_code;
    st_uint error_line;
    st_uint error_column;
    char error_char;

    /* delayed deallocation: every token created, freed in st_lexer_destroy() */
    st_list *allocated_tokens;
};
/* A lexed token.
 *
 * The payload is an anonymous union: number tokens (ST_TOKEN_NUMBER_CONST)
 * use the second member, every other token type uses `text'.
 * destroy_token() relies on `type' to decide which string to free.
 */
struct st_token
{
    st_token_type type;
    int line;
    int column;

    union {
        struct {
            char *text;
        };
        /* Number Token */
        struct {
            bool negative;
            char *number;
            int radix;
            int exponent;
        };
    };
};
118 static void
119 make_token (st_lexer *lexer,
120 st_token_type type,
121 char *text)
123 st_token *token;
125 token = st_new0 (st_token);
127 token->type = type;
128 token->text = text ? text : st_strdup ("");
129 token->type = type;
130 token->line = lexer->line;
131 token->column = lexer->column;
133 lexer->token = token;
134 lexer->token_matched = true;
137 lexer->allocated_tokens = st_list_prepend (lexer->allocated_tokens, token);
140 static void
141 make_number_token (st_lexer *lexer, int radix, int exponent, char *number, bool negative)
143 st_token *token;
145 token = st_new0 (st_token);
147 token->type = ST_TOKEN_NUMBER_CONST;
148 token->line = lexer->line;
149 token->column = lexer->column;
151 token->negative = negative;
152 token->number = number;
153 token->radix = radix;
154 token->exponent = exponent;
156 lexer->token = token;
157 lexer->token_matched = true;
159 lexer->allocated_tokens = st_list_prepend (lexer->allocated_tokens, token);
162 static void
163 raise_error (st_lexer *lexer,
164 ErrorCode error_code,
165 char error_char)
167 lexer->failed = true;
169 lexer->error_code = error_code;
170 lexer->error_char = error_char;
171 lexer->error_line = lexer->line;
172 lexer->error_column = lexer->column;
174 /* create an token of type invalid */
175 make_token (lexer, ST_TOKEN_INVALID, NULL);
177 /* hopefully recover after consuming char */
178 consume (lexer);
180 /* go back to main loop */
181 longjmp (lexer->main_loop, 0);
185 static void
186 match_range (st_lexer *lexer, char a, char b)
188 if (lookahead (lexer, 1) < a || lookahead (lexer, 1) > b) {
189 // mismatch error
190 raise_error (lexer, ERROR_MISMATCHED_CHAR, lookahead (lexer, 1));
192 consume (lexer);
195 static void
196 match (st_lexer *lexer, char c)
198 if (lookahead (lexer, 1) != c) {
199 // mismatch error
200 raise_error (lexer, ERROR_MISMATCHED_CHAR, lookahead (lexer, 1));
202 consume (lexer);
/* Return true if @c is one of the characters that may form a binary
 * selector:  + / \ * ~ < > = @ % | & ? ! ,
 *
 * Note that '-' is not part of this set; callers test for it separately.
 */
static bool
is_special_char (char c)
{
    static const char specials[] = "+/\\*~<>=@%|&?!,";

    /* strchr() would also match the terminating NUL, so exclude it first */
    if (c == '\0')
        return false;

    return strchr (specials, c) != NULL;
}
221 /* check if a char is valid numeral identifier for a given radix
223 * for example, 2r1010301 is an invalid number since the '3' is not within the radix.
226 static bool
227 is_radix_numeral (st_uint radix, char c)
229 st_assert (radix >= 2 && radix <= 36);
231 if (radix > 10)
232 return (c >= '0' && c <= '9') || (c >= 'A' && c <= ('A' - 1 + (radix - 10)));
233 else
234 return c >= '0' && c <= ('0' - 1 + radix);
237 /* Numbers. We do just do basic matching here. Actual parsing and conversion can
238 * be done in the parser.
240 static void
241 match_number (st_lexer *lexer)
243 /* We don't match any leading '-'. The parser will resolve whether a '-'
244 * specifies a negative number or a binary selector
247 bool negative = false;
248 long radix = 10;
249 long exponent = 0;
250 int k, j, l;
251 char *string;
253 if (lookahead (lexer, 1) == '-') {
254 negative = true;
255 consume (lexer);
258 k = st_input_index (lexer->input);
260 do {
261 match_range (lexer, '0', '9');
262 } while (isdigit (lookahead (lexer, 1)));
264 if (lookahead (lexer, 1) != 'r') {
266 j = st_input_index (lexer->input);
267 goto out1;
269 } else {
271 string = st_input_range (lexer->input, k,
272 st_input_index (lexer->input));
274 radix = strtol (string, NULL, 10);
275 st_free (string);
276 if (radix < 2 || radix > 36) {
277 raise_error (lexer, ERROR_INVALID_RADIX, lookahead (lexer, 1));
280 consume (lexer);
284 k = st_input_index (lexer->input);
286 if (lookahead (lexer, 1) == '-')
287 raise_error (lexer, ERROR_NO_VIABLE_ALT_FOR_CHAR, lookahead (lexer, 1));
289 out1:
291 while (is_radix_numeral (radix, lookahead (lexer, 1)))
292 consume (lexer);
294 if (lookahead (lexer, 1) == '.' && is_radix_numeral (radix, lookahead (lexer, 2))) {
295 consume (lexer);
297 do {
298 consume (lexer);
299 } while (is_radix_numeral (radix, lookahead (lexer, 1)));
302 j = st_input_index (lexer->input);
304 if (lookahead (lexer, 1) == 'e') {
306 consume (lexer);
308 l = st_input_index (lexer->input);
310 if (lookahead (lexer, 1) == '-' && isdigit (lookahead (lexer, 2)))
311 consume (lexer);
313 while (isdigit (lookahead (lexer, 1)))
314 consume (lexer);
316 if (l == st_input_index (lexer->input))
317 goto out2;
319 string = st_input_range (lexer->input, l,
320 st_input_index (lexer->input));
321 exponent = strtol (string, NULL, 10);
322 st_free (string);
325 out2:
327 make_number_token (lexer, radix, exponent,
328 st_input_range (lexer->input, k, j),
329 negative);
333 static void
334 match_identifier (st_lexer *lexer, bool create_token)
336 if (isalpha (lookahead (lexer, 1)))
337 consume (lexer);
338 else {
339 raise_error (lexer, ERROR_NO_VIABLE_ALT_FOR_CHAR, lookahead (lexer, 1));
342 while (true) {
343 if (isalpha (lookahead (lexer, 1)))
344 consume (lexer);
345 else if (lookahead (lexer, 1) >= '0' && lookahead (lexer, 1) <= '9')
346 consume (lexer);
347 else if (lookahead (lexer, 1) == '_')
348 consume (lexer);
349 else
350 break;
353 if (create_token) {
354 make_token (lexer, ST_TOKEN_IDENTIFIER,
355 st_input_range (lexer->input, lexer->start,
356 st_input_index (lexer->input)));
360 static void
361 match_keyword_or_identifier (st_lexer *lexer, bool create_token)
363 if (isalpha (lookahead (lexer, 1)))
364 consume (lexer);
365 else {
366 raise_error (lexer, ERROR_NO_VIABLE_ALT_FOR_CHAR, lookahead (lexer, 1));
369 while (true) {
371 if (isalpha (lookahead (lexer, 1)))
372 consume (lexer);
373 else if (lookahead (lexer, 1) >= '0' && lookahead (lexer, 1) <= '9')
374 consume (lexer);
375 else if (lookahead (lexer, 1) == '_')
376 consume (lexer);
377 else
378 break;
381 st_token_type token_type;
383 if (lookahead (lexer, 1) == ':' && lookahead (lexer, 2) != '=') {
384 consume (lexer);
385 token_type = ST_TOKEN_KEYWORD_SELECTOR;
386 } else {
387 token_type = ST_TOKEN_IDENTIFIER;
390 if (create_token) {
391 char *text;
393 if (token_type == ST_TOKEN_KEYWORD_SELECTOR)
394 text = st_input_range (lexer->input, lexer->start,
395 st_input_index (lexer->input));
396 else
397 text = st_input_range (lexer->input, lexer->start,
398 st_input_index (lexer->input));
400 make_token (lexer, token_type, text);
405 static void
406 match_string_constant (st_lexer *lexer)
408 mark (lexer);
410 match (lexer, '\'');
412 while (lookahead (lexer, 1) != '\'') {
413 consume (lexer);
415 if (lookahead (lexer, 1) == ST_INPUT_EOF) {
416 rewind (lexer);
417 raise_error (lexer, ERROR_UNTERMINATED_STRING_LITERAL, lookahead (lexer, 1));
421 match (lexer, '\'');
423 char *string;
425 string = st_input_range (lexer->input,
426 lexer->start + 1,
427 st_input_index (lexer->input) - 1);
429 make_token (lexer, ST_TOKEN_STRING_CONST, string);
432 static void
433 match_comment (st_lexer *lexer)
435 mark (lexer);
437 match (lexer, '"');
439 while (lookahead (lexer, 1) != '"') {
440 consume (lexer);
442 if (lookahead (lexer, 1) == ST_INPUT_EOF) {
443 rewind (lexer);
444 raise_error (lexer, ERROR_UNTERMINATED_COMMENT, lookahead (lexer, 1));
448 match (lexer, '"');
450 if (!lexer->filter_comments) {
452 char *comment;
454 comment = st_input_range (lexer->input,
455 lexer->start + 1,
456 st_input_index (lexer->input) - 1);
458 make_token (lexer, ST_TOKEN_COMMENT, comment);
463 static void
464 match_tuple_begin (st_lexer *lexer)
466 match (lexer, '#');
467 match (lexer, '(');
469 make_token (lexer, ST_TOKEN_TUPLE_BEGIN, st_strdup ("#("));
472 static void
473 match_binary_selector (st_lexer *lexer, bool create_token)
475 if (lookahead (lexer, 1) == '-') {
476 match (lexer, '-');
478 if (is_special_char (lookahead (lexer, 1)))
479 match (lexer, lookahead (lexer, 1));
481 } else if (is_special_char (lookahead (lexer, 1))) {
482 match (lexer, lookahead (lexer, 1));
484 if (is_special_char (lookahead (lexer, 1)))
485 match (lexer, lookahead (lexer, 1));
487 } else {
488 raise_error (lexer, ERROR_NO_VIABLE_ALT_FOR_CHAR, lookahead (lexer, 1));
491 if (create_token) {
492 make_token (lexer, ST_TOKEN_BINARY_SELECTOR,
493 st_input_range (lexer->input,
494 lexer->start,
495 st_input_index (lexer->input)));
499 static void
500 match_symbol_constant (st_lexer *lexer)
502 match (lexer, '#');
504 if (isalpha (lookahead (lexer, 1))) {
506 do {
507 match_keyword_or_identifier (lexer, false);
508 } while (isalpha (lookahead (lexer, 1)));
510 } else if (lookahead (lexer, 1) == '-' || is_special_char (lookahead (lexer, 1))) {
511 match_binary_selector (lexer, false);
512 } else {
513 raise_error (lexer, ERROR_NO_ALT_FOR_POUND, lookahead (lexer, 1));
516 // discard #
517 char *symbol_text = st_input_range (lexer->input,
518 lexer->start + 1,
519 st_input_index (lexer->input));
521 make_token (lexer, ST_TOKEN_SYMBOL_CONST, symbol_text);
524 static void
525 match_block_begin (st_lexer *lexer)
527 match (lexer, '[');
529 make_token (lexer, ST_TOKEN_BLOCK_BEGIN, NULL);
532 static void
533 match_block_end (st_lexer *lexer)
535 match (lexer, ']');
537 make_token (lexer, ST_TOKEN_BLOCK_END, NULL);
540 static void
541 match_lparen (st_lexer *lexer)
543 match (lexer, '(');
545 make_token (lexer, ST_TOKEN_LPAREN, NULL);
548 static void
549 match_rparen (st_lexer *lexer)
551 match (lexer, ')');
553 make_token (lexer, ST_TOKEN_RPAREN, NULL);
556 static void
557 match_char_constant (st_lexer *lexer)
559 char ch = 0;
560 match (lexer, '$');
562 if (lookahead (lexer, 1) == '\\') {
564 if (lookahead (lexer, 2) == 't') {
565 ch = '\t';
566 consume (lexer);
567 consume (lexer);
568 } else if (lookahead (lexer, 2) == 'f') {
569 ch = '\f';
570 consume (lexer);
571 consume (lexer);
572 } else if (lookahead (lexer, 2) == 'n') {
573 ch = '\n';
574 consume (lexer);
575 consume (lexer);
576 } else if (lookahead (lexer, 2) == 'r') {
577 ch = '\r';
578 consume (lexer);
579 consume (lexer);
580 } else if (isxdigit (lookahead (lexer, 2))) {
581 consume (lexer);
582 int start = st_input_index (lexer->input);
584 do {
585 consume (lexer);
586 } while (isxdigit (lookahead (lexer, 1)));
588 char *string = st_input_range (lexer->input, start, st_input_index (lexer->input));
589 ch = strtol (string, NULL, 16);
590 st_free (string);
592 } else {
593 // just match the '\' char then
594 ch = '\\';
595 consume (lexer);
598 } else if (isgraph (lookahead (lexer, 1))) {
599 ch = lookahead (lexer, 1);
600 consume (lexer);
601 } else {
602 raise_error (lexer, ERROR_INVALID_CHAR_CONST, lookahead (lexer, 1));
605 char outbuf[6];
606 st_unichar_to_utf8 (ch, outbuf);
607 make_token (lexer, ST_TOKEN_CHARACTER_CONST, st_strdup (outbuf));
610 static void
611 match_eof (st_lexer *lexer)
613 match (lexer, ST_INPUT_EOF);
615 make_token (lexer, ST_TOKEN_EOF, NULL);
618 static void
619 match_white_space (st_lexer *lexer)
621 /* gobble up white space */
622 while (true) {
623 switch (lookahead (lexer, 1)) {
624 case ' ': case '\r':
625 case '\n': case '\t': case '\f':
626 consume (lexer);
627 break;
628 default:
629 return;
634 static void
635 match_colon (st_lexer *lexer)
637 match (lexer, ':');
638 make_token (lexer, ST_TOKEN_COLON, NULL);
641 static void
642 match_semicolon (st_lexer *lexer)
644 match (lexer, ';');
645 make_token (lexer, ST_TOKEN_SEMICOLON, NULL);
648 static void
649 match_assign (st_lexer *lexer)
651 match (lexer, ':');
652 match (lexer, '=');
653 make_token (lexer, ST_TOKEN_ASSIGN, NULL);
656 static void
657 match_period (st_lexer *lexer)
659 match (lexer, '.');
660 make_token (lexer, ST_TOKEN_PERIOD, NULL);
663 static void
664 match_return (st_lexer *lexer)
666 match (lexer, '^');
667 make_token (lexer, ST_TOKEN_RETURN, NULL);
670 /* st_lexer_next_token:
671 * lexer: a st_lexer
673 * Returns the next matched token from the input stream. Caller takes
674 * ownership of returned token.
676 * If the end of the input stream is reached, tokens of type ST_TOKEN_EOF
677 * will be returned. Similarly, if there are matching errors, then tokens
678 * of type ST_TOKEN_INVALID will be returned;
681 st_token *
682 st_lexer_next_token (st_lexer *lexer)
684 st_assert (lexer != NULL);
686 while (true) {
688 /* reset token and error state */
689 lexer->failed = false;
690 lexer->token_matched = false;
691 lexer->line = st_input_get_line (lexer->input);
692 lexer->column = st_input_get_column (lexer->input);
693 lexer->start = st_input_index (lexer->input);
695 /* we return here on match errors and then goto out */
696 if (setjmp (lexer->main_loop))
697 goto out;
699 switch (lookahead (lexer, 1)) {
701 case ' ': case '\n': case '\r': case '\t': case '\f':
702 match_white_space (lexer);
703 break;
705 case '(':
706 match_lparen (lexer);
707 break;
709 case ')':
710 match_rparen (lexer);
711 break;
713 case '[':
714 match_block_begin (lexer);
715 break;
717 case ']':
718 match_block_end (lexer);
719 break;
721 case '^':
722 match_return (lexer);
723 break;
725 case '.':
726 match_period (lexer);
727 break;
729 case ';':
730 match_semicolon (lexer);
731 break;
733 case '+': case '/': case '\\':
734 case '*': case '<': case '>': case '=':
735 case '@': case '%': case '|': case '&':
736 case '?': case '!': case '~': case ',':
737 match_binary_selector (lexer, true);
738 break;
740 case '$':
741 match_char_constant (lexer);
742 break;
744 case '"':
745 match_comment (lexer);
746 break;
748 case '\'':
749 match_string_constant (lexer);
750 break;
752 case ST_INPUT_EOF:
753 match_eof (lexer);
754 break;
756 default:
758 if (isalpha (lookahead (lexer, 1)))
759 match_keyword_or_identifier (lexer, true);
761 else if (lookahead (lexer, 1) == '-' && isdigit (lookahead (lexer, 2)))
762 match_number (lexer);
764 else if (isdigit (lookahead (lexer, 1)))
765 match_number (lexer);
767 else if (lookahead (lexer, 1) == '-')
768 match_binary_selector (lexer, true);
770 else if (lookahead (lexer, 1) == '#' && lookahead (lexer, 2) == '(')
771 match_tuple_begin (lexer);
773 else if (lookahead (lexer, 1) == '#')
774 match_symbol_constant (lexer);
776 // match assign or colon
777 else if (lookahead (lexer, 1) == ':' && lookahead (lexer, 2) == '=')
778 match_assign (lexer);
780 else if (lookahead (lexer, 1) == ':')
781 match_colon (lexer);
783 else
784 raise_error (lexer, ERROR_ILLEGAL_CHAR, lookahead (lexer, 1));
787 out:
789 // we return the matched token or an invalid token on error
790 if (lexer->token_matched || lexer->failed)
791 return lexer->token;
792 else
793 continue;
798 static void
799 lexer_initialize (st_lexer *lexer, st_input *input)
801 lexer->input = input;
802 lexer->token = NULL;
803 lexer->line = 1;
804 lexer->column = 1;
805 lexer->start = -1;
806 lexer->error_code = 0;
807 lexer->failed = false;
808 lexer->filter_comments = true;
810 lexer->allocated_tokens = NULL;
813 st_lexer *
814 st_lexer_new (const char *string)
816 st_lexer *lexer;
817 st_input *input;
819 st_assert (string != NULL);
821 lexer = st_new0 (st_lexer);
822 input = st_input_new (string);
823 if (!input)
824 return NULL;
826 lexer_initialize (lexer, input);
828 return lexer;
831 void
832 destroy_token (st_token *token)
834 if (token->type != ST_TOKEN_NUMBER_CONST)
835 st_free (token->text);
836 else
837 st_free (token->number);
839 st_free (token);
842 void
843 st_lexer_destroy (st_lexer *lexer)
845 st_assert (lexer != NULL);
847 st_input_destroy (lexer->input);
849 st_list_foreach (lexer->allocated_tokens, (st_list_foreach_func) destroy_token);
850 st_list_destroy (lexer->allocated_tokens);
852 st_free (lexer);
855 st_token_type
856 st_token_get_type (st_token *token)
858 st_assert (token != NULL);
860 return token->type;
863 char *
864 st_token_get_text (st_token *token)
866 st_assert (token != NULL);
868 return token->text;
871 st_uint
872 st_token_get_line (st_token *token)
874 st_assert (token != NULL);
876 return token->line;
879 st_uint
880 st_token_get_column (st_token *token)
882 st_assert (token != NULL);
884 return token->column;
887 st_uint
888 st_lexer_error_line (st_lexer *lexer)
890 st_assert (lexer != NULL);
892 return lexer->error_line;
895 st_uint
896 st_lexer_error_column (st_lexer *lexer)
898 st_assert (lexer != NULL);
900 return lexer->error_column;
903 char
904 st_lexer_error_char (st_lexer *lexer)
906 st_assert (lexer != NULL);
908 return lexer->error_char;
911 char *
912 st_lexer_error_message (st_lexer *lexer)
914 st_assert (lexer != NULL);
916 static const char *msgformats[] = {
917 "mismatched character \\%04X",
918 "no viable alternative for character \\%04X",
919 "illegal character \\%04X",
920 "unterminated comment",
921 "unterminated string literal",
922 "invalid radix for number",
923 "non-whitespace character expected after '$'",
924 "expected '(' after '#'",
927 switch (lexer->error_code) {
929 case ERROR_UNTERMINATED_COMMENT:
930 case ERROR_UNTERMINATED_STRING_LITERAL:
931 case ERROR_INVALID_RADIX:
932 case ERROR_INVALID_CHAR_CONST:
933 case ERROR_NO_ALT_FOR_POUND:
935 return st_strdup_printf (msgformats[lexer->error_code]);
937 case ERROR_MISMATCHED_CHAR:
938 case ERROR_NO_VIABLE_ALT_FOR_CHAR:
939 case ERROR_ILLEGAL_CHAR:
941 return st_strdup_printf (msgformats[lexer->error_code], lexer->error_char);
943 default:
944 return NULL;
948 st_token *
949 st_lexer_current_token (st_lexer *lexer)
951 return lexer->token;
954 void
955 st_lexer_filter_comments (st_lexer *lexer, bool filter)
957 lexer->filter_comments = filter;
960 bool
961 st_number_token_negative (st_token *token)
963 return token->negative;
966 char *
967 st_number_token_number (st_token *token)
969 return token->number;
972 st_uint
973 st_number_token_radix (st_token *token)
975 return token->radix;
979 st_number_token_exponent (st_token *token)
981 return token->exponent;