2 * Copyright (c) 2011 Jiri Svoboda
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 /** @file Lexer (lexical analyzer).
31 * Consumes a text file and produces a sequence of lexical elements (lems).
52 static void lex_touch(lex_t
*lex
);
53 static bool_t
lex_read_try(lex_t
*lex
);
55 static void lex_skip_comment(lex_t
*lex
);
56 static void lex_skip_ws(lex_t
*lex
);
57 static bool_t
is_wstart(char c
);
58 static bool_t
is_wcont(char c
);
59 static bool_t
is_digit(char c
);
60 static void lex_word(lex_t
*lex
);
61 static void lex_char(lex_t
*lex
);
62 static void lex_number(lex_t
*lex
);
63 static void lex_string(lex_t
*lex
);
64 static void lex_char_string_core(lex_t
*lex
, chr_str_t cs
);
65 static int digit_value(char c
);
67 /* Note: This imposes an implementation limit on identifier length. */
69 static char ident_buf
[IBUF_SIZE
+ 1];
71 /* XXX This imposes an implementation limit on string literal length. */
72 #define SLBUF_SIZE 128
73 static char strlit_buf
[SLBUF_SIZE
+ 1];
75 /** Lclass-string pair */
81 /** Keyword names. Used both for printing and recognition. */
82 static struct lc_name keywords
[] = {
86 { lc_break
, "break" },
87 { lc_builtin
, "builtin" },
89 { lc_class
, "class" },
90 { lc_deleg
, "deleg" },
96 { lc_except
, "except" },
97 { lc_false
, "false" },
98 { lc_finally
, "finally" },
105 { lc_interface
, "interface" },
111 { lc_override
, "override" },
112 { lc_packed
, "packed" },
113 { lc_private
, "private" },
115 { lc_protected
, "protected" },
116 { lc_public
, "public" },
117 { lc_raise
, "raise" },
118 { lc_resource
, "resource" },
119 { lc_return
, "return" },
122 { lc_static
, "static" },
123 { lc_string
, "string" },
124 { lc_struct
, "struct" },
125 { lc_switch
, "switch" },
132 { lc_while
, "while" },
133 { lc_yield
, "yield" },
138 /** Other simple lclasses. Only used for printing. */
139 static struct lc_name simple_lc
[] = {
140 { lc_invalid
, "INVALID" },
151 { lc_notequal
, "!=" },
154 { lc_lt_equal
, "<=" },
155 { lc_gt_equal
, ">=" },
160 { lc_increase
, "+=" },
170 /** Print lclass value.
172 * Prints lclass (lexical element class) value in human-readable form
175 * @param lclass Lclass value for display.
177 void lclass_print(lclass_t lclass
)
182 while (dp
->name
!= NULL
) {
183 if (dp
->lclass
== lclass
) {
184 printf("%s", dp
->name
);
191 while (dp
->name
!= NULL
) {
192 if (dp
->lclass
== lclass
) {
193 printf("%s", dp
->name
);
204 printf("int_literal");
207 printf("string_literal");
210 printf("<unknown?>");
215 /** Print lexical element.
217 * Prints lexical element in human-readable form (for debugging).
219 * @param lem Lexical element for display.
221 void lem_print(lem_t
*lem
)
223 lclass_print(lem
->lclass
);
225 switch (lem
->lclass
) {
227 printf("('%s')", strtab_get_str(lem
->u
.ident
.sid
));
231 bigint_print(&lem
->u
.lit_int
.value
);
235 printf("(\"%s\")", lem
->u
.lit_string
.value
);
241 /** Print lem coordinates.
243 * Print the coordinates (line number, column number) of a lexical element.
245 * @param lem Lexical element for coordinate printing.
247 void lem_print_coords(lem_t
*lem
)
249 cspan_print(lem
->cspan
);
252 /** Initialize lexer instance.
254 * @param lex Lexer object to initialize.
255 * @param input Input to associate with lexer.
257 void lex_init(lex_t
*lex
, struct input
*input
)
263 rc
= input_get_line(lex
->input
, &lex
->inbuf
);
265 printf("Error reading input.\n");
269 lex
->ibp
= lex
->inbuf
;
271 lex
->prev_valid
= b_false
;
272 lex
->current_valid
= b_true
;
275 /** Advance to next lexical element.
277 * The new element is read in lazily then it is actually accessed.
279 * @param lex Lexer object.
281 void lex_next(lex_t
*lex
)
283 /* Make sure the current lem has already been read in. */
286 /* Force a new lem to be read on next access. */
287 lex
->current_valid
= b_false
;
292 * The returned pointer is invalidated by next call to lex_next()
294 * @param lex Lexer object.
295 * @return Pointer to current lem. Owned by @a lex and only valid
296 * until next call to lex_xxx().
298 lem_t
*lex_get_current(lex_t
*lex
)
301 return &lex
->current
;
304 /** Get previous lem if valid.
306 * The returned pointer is invalidated by next call to lex_next()
308 * @param lex Lexer object.
309 * @return Pointer to previous lem. Owned by @a lex and only valid
310 * until next call to lex_xxx().
312 lem_t
*lex_peek_prev(lex_t
*lex
)
314 if (lex
->current_valid
== b_false
) {
316 * This means the head is advanced but next lem was not read.
317 * Thus the previous lem is still in @a current.
319 return &lex
->current
;
322 if (lex
->prev_valid
!= b_true
) {
323 /* Looks like we are still at the first lem. */
328 * Current lem has been read in. Thus the previous lem was moved to
334 /** Read in the current lexical element (unless already read in).
336 * @param lex Lexer object.
338 static void lex_touch(lex_t
*lex
)
342 if (lex
->current_valid
== b_true
)
345 /* Copy previous lem */
346 lex
->prev
= lex
->current
;
347 lex
->prev_valid
= b_true
;
350 got_lem
= lex_read_try(lex
);
351 } while (got_lem
== b_false
);
353 lex
->current_valid
= b_true
;
356 /** Try reading next lexical element.
358 * Attemps to read the next lexical element. In some cases (such as a comment)
359 * this function will need to give it another try and returns @c b_false
362 * @param lex Lexer object.
363 * @return @c b_true on success or @c b_false if it needs
364 * restarting. On success the lem is stored to
365 * the current lem in @a lex.
367 static bool_t
lex_read_try(lex_t
*lex
)
375 * Record lem coordinates. Line number we already have. For column
376 * number we start with position in the input buffer. This works
377 * for all characters except tab. Thus we keep track of tabs
378 * separately using col_adj.
380 line0
= input_get_line_no(lex
->input
);
381 col0
= 1 + lex
->col_adj
+ (lex
->ibp
- lex
->inbuf
);
383 lex
->current
.cspan
= cspan_new(lex
->input
, line0
, col0
, line0
, col0
);
390 lex
->current
.lclass
= lc_eof
;
394 if (is_wstart(bp
[0])) {
404 if (is_digit(bp
[0])) {
414 if (bp
[0] == '-' && bp
[1] == '-') {
415 lex_skip_comment(lex
);
417 /* Compute ending column number */
418 lex
->current
.cspan
->col1
= col0
+ (lex
->ibp
- lsp
) - 1;
426 lex
->current
.lclass
= lc_comma
;
430 lex
->current
.lclass
= lc_colon
;
434 lex
->current
.lclass
= lc_scolon
;
439 lex
->current
.lclass
= lc_period
;
443 lex
->current
.lclass
= lc_slash
;
447 lex
->current
.lclass
= lc_lparen
;
451 lex
->current
.lclass
= lc_rparen
;
455 lex
->current
.lclass
= lc_lsbr
;
459 lex
->current
.lclass
= lc_rsbr
;
465 lex
->current
.lclass
= lc_equal
;
469 lex
->current
.lclass
= lc_assign
;
475 lex
->current
.lclass
= lc_notequal
;
483 lex
->current
.lclass
= lc_increase
;
487 lex
->current
.lclass
= lc_plus
;
492 lex
->current
.lclass
= lc_minus
;
497 lex
->current
.lclass
= lc_mult
;
503 lex
->current
.lclass
= lc_lt_equal
;
507 lex
->current
.lclass
= lc_lt
;
513 lex
->current
.lclass
= lc_gt_equal
;
517 lex
->current
.lclass
= lc_gt
;
528 /* Compute ending column number */
529 lex
->current
.cspan
->col1
= col0
+ (lex
->ibp
- lsp
) - 1;
533 lex
->current
.lclass
= lc_invalid
;
540 /** Lex a word (identifier or keyword).
542 * Read in a word. This may later turn out to be a keyword or a regular
543 * identifier. It is stored in the current lem in @a lex.
545 * @param lex Lexer object.
547 static void lex_word(lex_t
*lex
)
554 ident_buf
[0] = bp
[0];
557 while (is_wcont(bp
[idx
])) {
558 if (idx
>= IBUF_SIZE
) {
559 printf("Error: Identifier too long.\n");
563 ident_buf
[idx
] = bp
[idx
];
569 ident_buf
[idx
] = '\0';
572 while (dp
->name
!= NULL
) {
573 if (os_str_cmp(ident_buf
, dp
->name
) == 0) {
575 lex
->current
.lclass
= dp
->lclass
;
581 /* No matching keyword -- it must be an identifier. */
582 lex
->current
.lclass
= lc_ident
;
583 lex
->current
.u
.ident
.sid
= strtab_get_sid(ident_buf
);
586 /** Lex a character literal.
588 * Reads in a character literal and stores it in the current lem in @a lex.
590 * @param lex Lexer object.
592 static void lex_char(lex_t
*lex
)
597 lex_char_string_core(lex
, cs_chr
);
599 len
= os_str_length(strlit_buf
);
601 printf("Character literal should contain one character, "
602 "but contains %u characters instead.\n", (unsigned) len
);
606 os_str_get_char(strlit_buf
, 0, &char_val
);
607 lex
->current
.lclass
= lc_lit_char
;
608 bigint_init(&lex
->current
.u
.lit_char
.value
, char_val
);
611 /** Lex a numeric literal.
613 * Reads in a numeric literal and stores it in the current lem in @a lex.
615 * @param lex Lexer object.
617 static void lex_number(lex_t
*lex
)
627 bigint_init(&value
, 0);
628 bigint_init(&base
, 10);
630 while (is_digit(*bp
)) {
631 bigint_mul(&value
, &base
, &tprod
);
632 bigint_init(&dgval
, digit_value(*bp
));
634 bigint_destroy(&value
);
635 bigint_add(&tprod
, &dgval
, &value
);
636 bigint_destroy(&tprod
);
637 bigint_destroy(&dgval
);
642 bigint_destroy(&base
);
646 lex
->current
.lclass
= lc_lit_int
;
647 bigint_shallow_copy(&value
, &lex
->current
.u
.lit_int
.value
);
650 /** Lex a string literal.
652 * Reads in a string literal and stores it in the current lem in @a lex.
654 * @param lex Lexer object.
656 static void lex_string(lex_t
*lex
)
658 lex_char_string_core(lex
, cs_str
);
660 lex
->current
.lclass
= lc_lit_string
;
661 lex
->current
.u
.lit_string
.value
= os_str_dup(strlit_buf
);
664 static void lex_char_string_core(lex_t
*lex
, chr_str_t cs
)
669 const char *descr
, *cap_descr
;
672 /* Make compiler happy */
681 cap_descr
= "Character";
686 cap_descr
= "String";
693 while (bp
[sidx
] != term
) {
694 if (didx
>= SLBUF_SIZE
) {
695 printf("Error: %s literal too long.\n", cap_descr
);
699 if (bp
[sidx
] == '\0') {
700 printf("Error: Unterminated %s literal.\n", descr
);
704 if (bp
[sidx
] == '\\') {
705 switch (bp
[sidx
+ 1]) {
722 printf("Error: Unknown character escape sequence.\n");
726 strlit_buf
[didx
] = spchar
;
730 strlit_buf
[didx
] = bp
[sidx
];
736 lex
->ibp
= bp
+ sidx
+ 1;
738 strlit_buf
[didx
] = '\0';
741 /** Lex a single-line comment.
743 * This does not produce any lem. The comment is just skipped.
745 * @param lex Lexer object.
747 static void lex_skip_comment(lex_t
*lex
)
753 while (*bp
!= '\n' && *bp
!= '\0') {
760 /** Skip whitespace characters.
762 * This does not produce any lem. The whitespace is just skipped.
764 * @param lex Lexer object.
766 static void lex_skip_ws(lex_t
*lex
)
774 while (*bp
== ' ' || *bp
== '\t') {
776 /* XXX This is too simplifed. */
777 lex
->col_adj
+= (TAB_WIDTH
- 1);
786 rc
= input_get_line(lex
->input
, &lex
->inbuf
);
788 printf("Error reading input.\n");
799 /** Determine if character can start a word.
801 * @param c Character.
802 * @return @c b_true if @a c can start a word, @c b_false otherwise.
804 static bool_t
is_wstart(char c
)
806 return ((c
>= 'a') && (c
<= 'z')) || ((c
>= 'A') && (c
<= 'Z')) ||
810 /** Determine if character can continue a word.
812 * @param c Character.
813 * @return @c b_true if @a c can start continue word, @c b_false
816 static bool_t
is_wcont(char c
)
818 return is_digit(c
) || is_wstart(c
);
821 /** Determine if character is a numeric digit.
823 * @param c Character.
824 * @return @c b_true if @a c is a numeric digit, @c b_false otherwise.
826 static bool_t
is_digit(char c
)
828 return ((c
>= '0') && (c
<= '9'));
831 /** Determine numeric value of digit character.
833 * @param c Character, must be a valid decimal digit.
834 * @return Value of the digit (0-9).
836 static int digit_value(char c
)