3 * Copyright (C) 2008-2009 Jürg Billeter
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * Jürg Billeter <j@bitron.ch>
27 * Lexical scanner for Vala source files.
29 public class Vala
.Scanner
{
// Source file associated with this scanner. Readable from outside the class;
// the setter is private (the constructor assigns it).
30 public SourceFile source_file
{ get; private set; }
// Creates a scanner for source_file and stores the file in the source_file
// property.
// NOTE(review): this view does not show every line of the constructor (no
// closing brace and no initialisation of current/line/column is visible);
// only the statements below are documented.
40 public Scanner (SourceFile source_file
) {
41 this
.source_file
= source_file
;
// begin is the start of the file's mapped contents.
43 char* begin
= source_file
.get_mapped_contents ();
// end = begin + mapped length; the scanning loops elsewhere in this class
// compare current against end before reading.
44 end
= begin
+ source_file
.get_mapped_length ();
// Returns true when c is alphanumeric or an underscore. The identifier loops
// in read_number () and read_token () use this to decide how far an identifier
// extends.
52 bool is_ident_char (char c
) {
53 return (c
.isalnum () || c
== '_');
// Classifies the text starting at begin: returns the TokenType of the keyword
// it matches, or TokenType.IDENTIFIER when none of the checks below returns.
//
// begin: pointer to the first character of the candidate word.
// len:   length of the candidate word as counted by the caller.
//
// NOTE(review): matches () compares only the first keyword.len () characters,
// so on its own it is a prefix test. The lines that select which checks run
// (presumably a switch on len and on leading characters) are not visible in
// this view -- confirm against the full file before relying on exact-length
// matching.
56 TokenType
get_identifier_or_keyword (char* begin
, int len
) {
// keywords of length 2
61 if (matches (begin
, "as")) return TokenType
.AS
;
64 if (matches (begin
, "do")) return TokenType
.DO
;
// keywords of length 3
81 if (matches (begin
, "for")) return TokenType
.FOR
;
84 if (matches (begin
, "get")) return TokenType
.GET
;
87 if (matches (begin
, "new")) return TokenType
.NEW
;
90 if (matches (begin
, "out")) return TokenType
.OUT
;
93 if (matches (begin
, "ref")) return TokenType
.REF
;
96 if (matches (begin
, "set")) return TokenType
.SET
;
99 if (matches (begin
, "try")) return TokenType
.TRY
;
102 if (matches (begin
, "var")) return TokenType
.VAR
;
// keywords of length 4
109 if (matches (begin
, "base")) return TokenType
.BASE
;
112 if (matches (begin
, "case")) return TokenType
.CASE
;
117 if (matches (begin
, "else")) return TokenType
.ELSE
;
120 if (matches (begin
, "enum")) return TokenType
.ENUM
;
125 if (matches (begin
, "lock")) return TokenType
.LOCK
;
128 if (matches (begin
, "null")) return TokenType
.NULL
;
133 if (matches (begin
, "this")) return TokenType
.THIS
;
136 if (matches (begin
, "true")) return TokenType
.TRUE
;
141 if (matches (begin
, "void")) return TokenType
.VOID
;
144 if (matches (begin
, "weak")) return TokenType
.WEAK
;
// keywords of length 5
151 if (matches (begin
, "break")) return TokenType
.BREAK
;
156 if (matches (begin
, "catch")) return TokenType
.CATCH
;
159 if (matches (begin
, "class")) return TokenType
.CLASS
;
162 if (matches (begin
, "const")) return TokenType
.CONST
;
167 if (matches (begin
, "false")) return TokenType
.FALSE
;
170 if (matches (begin
, "owned")) return TokenType
.OWNED
;
173 if (matches (begin
, "throw")) return TokenType
.THROW
;
176 if (matches (begin
, "using")) return TokenType
.USING
;
179 if (matches (begin
, "while")) return TokenType
.WHILE
;
182 if (matches (begin
, "yield")) return TokenType
.YIELD
;
// keywords of length 6
189 if (matches (begin
, "delete")) return TokenType
.DELETE
;
192 if (matches (begin
, "extern")) return TokenType
.EXTERN
;
195 if (matches (begin
, "inline")) return TokenType
.INLINE
;
200 if (matches (begin
, "params")) return TokenType
.PARAMS
;
203 if (matches (begin
, "public")) return TokenType
.PUBLIC
;
208 if (matches (begin
, "return")) return TokenType
.RETURN
;
215 if (matches (begin
, "signal")) return TokenType
.SIGNAL
;
218 if (matches (begin
, "sizeof")) return TokenType
.SIZEOF
;
225 if (matches (begin
, "static")) return TokenType
.STATIC
;
228 if (matches (begin
, "struct")) return TokenType
.STRUCT
;
233 if (matches (begin
, "switch")) return TokenType
.SWITCH
;
240 if (matches (begin
, "throws")) return TokenType
.THROWS
;
243 if (matches (begin
, "typeof")) return TokenType
.TYPEOF
;
248 if (matches (begin
, "yields")) return TokenType
.YIELDS
;
// keywords of length 7
257 if (matches (begin
, "default")) return TokenType
.DEFAULT
;
260 if (matches (begin
, "dynamic")) return TokenType
.DYNAMIC
;
265 if (matches (begin
, "ensures")) return TokenType
.ENSURES
;
270 if (matches (begin
, "finally")) return TokenType
.FINALLY
;
273 if (matches (begin
, "foreach")) return TokenType
.FOREACH
;
278 if (matches (begin
, "private")) return TokenType
.PRIVATE
;
281 if (matches (begin
, "unowned")) return TokenType
.UNOWNED
;
284 if (matches (begin
, "virtual")) return TokenType
.VIRTUAL
;
// keywords of length 8
291 if (matches (begin
, "abstract")) return TokenType
.ABSTRACT
;
294 if (matches (begin
, "continue")) return TokenType
.CONTINUE
;
297 if (matches (begin
, "delegate")) return TokenType
.DELEGATE
;
300 if (matches (begin
, "internal")) return TokenType
.INTERNAL
;
303 if (matches (begin
, "override")) return TokenType
.OVERRIDE
;
306 if (matches (begin
, "requires")) return TokenType
.REQUIRES
;
309 if (matches (begin
, "volatile")) return TokenType
.VOLATILE
;
// keywords of length 9
316 if (matches (begin
, "construct")) return TokenType
.CONSTRUCT
;
319 if (matches (begin
, "interface")) return TokenType
.INTERFACE
;
322 if (matches (begin
, "namespace")) return TokenType
.NAMESPACE
;
325 if (matches (begin
, "protected")) return TokenType
.PROTECTED
;
// keyword of length 11
330 if (matches (begin
, "errordomain")) return TokenType
.ERRORDOMAIN
;
// no keyword matched: plain identifier
333 return TokenType
.IDENTIFIER
;
// Scans a numeric literal starting at current and returns its TokenType.
// The type starts as INTEGER_LITERAL and is changed to REAL_LITERAL or
// IDENTIFIER by the stages below.
// NOTE(review): the statements that advance current, the else keywords and the
// case labels of the suffix switch are not visible in this view; the comments
// describe only the visible conditions.
336 TokenType
read_number () {
337 var type
= TokenType
.INTEGER_LITERAL
;
// Integer part. Hexadecimal form needs "0x" followed by at least one hex
// digit, hence the end - 2 bound before reading current[1] and current[2].
340 if (current
< end
- 2 && current
[0] == '0'
341 && current
[1] == 'x' && current
[2].isxdigit ()) {
342 // hexadecimal integer literal
344 while (current
< end
&& current
[0].isxdigit ()) {
// decimal digits (presumably the non-hexadecimal alternative -- confirm)
349 while (current
< end
&& current
[0].isdigit ()) {
// Fractional part: a '.' counts only when a digit follows it.
355 if (current
< end
- 1 && current
[0] == '.' && current
[1].isdigit ()) {
356 type
= TokenType
.REAL_LITERAL
;
358 while (current
< end
&& current
[0].isdigit ()) {
// Exponent part: 'e' or 'E', an optional sign, then digits.
364 if (current
< end
&& current
[0].tolower () == 'e') {
365 type
= TokenType
.REAL_LITERAL
;
367 if (current
< end
&& (current
[0] == '+' || current
[0] == '-')) {
370 while (current
< end
&& current
[0].isdigit ()) {
// Type suffix handling. real_literal records whether the literal was already
// real before any suffix is examined.
377 bool real_literal
= (type
== TokenType
.REAL_LITERAL
);
379 switch (current
[0]) {
// suffix branches that apply to integer literals only; they look for 'l'/'L'
382 if (type
== TokenType
.INTEGER_LITERAL
) {
384 if (current
< end
&& current
[0].tolower () == 'l') {
391 if (type
== TokenType
.INTEGER_LITERAL
) {
393 if (current
< end
&& current
[0].tolower () == 'l') {
395 if (current
< end
&& current
[0].tolower () == 'l') {
// a suffix branch that turns the literal into a real literal
405 type
= TokenType
.REAL_LITERAL
;
// Only literals that were not real before the suffix can become identifiers.
410 if (!real_literal
&& is_ident_char (current
[0])) {
411 // allow identifiers to start with a digit
412 // as long as they contain at least one char
413 while (current
< end
&& is_ident_char (current
[0])) {
416 type
= TokenType
.IDENTIFIER
;
// Reads one token from the input and returns its TokenType.
//
// token_begin: receives pos, line and column of the token's first character.
// token_end:   receives pos, line and column - 1 after the token was consumed.
//
// Lexical errors are reported through Report.error with a SourceReference
// built from source_file, line and column.
// NOTE(review): case labels, the statements that advance current, several
// else keywords and the final return are not visible in this view; comments
// below describe only what the visible lines establish.
423 public TokenType
read_token (out SourceLocation token_begin
, out SourceLocation token_end
) {
// remember where the token starts
427 char* begin
= current
;
428 token_begin
.pos
= begin
;
429 token_begin
.line
= line
;
430 token_begin
.column
= column
;
// -1 means "no explicit character count"; see the column update at the end
432 int token_length_in_chars
= -1;
// end of input
434 if (current
>= end
) {
435 type
= TokenType
.EOF
;
// word starting with a letter or underscore: keyword or identifier
436 } else if (current
[0].isalpha () || current
[0] == '_') {
438 while (current
< end
&& is_ident_char (current
[0])) {
442 type
= get_identifier_or_keyword (begin
, len
);
// '@'-prefixed word: always classified as IDENTIFIER, no keyword lookup
443 } else if (current
[0] == '@') {
444 token_begin
.pos
++; // @ is not part of the identifier
447 while (current
< end
&& is_ident_char (current
[0])) {
451 type
= TokenType
.IDENTIFIER
;
// leading digit: delegate to read_number ()
452 } else if (current
[0].isdigit ()) {
453 type
= read_number ();
// punctuation and operators: dispatch on the current character
// (the case labels are not visible in this view)
455 switch (current
[0]) {
457 type
= TokenType
.OPEN_BRACE
;
461 type
= TokenType
.CLOSE_BRACE
;
465 type
= TokenType
.OPEN_PARENS
;
469 type
= TokenType
.CLOSE_PARENS
;
473 type
= TokenType
.OPEN_BRACKET
;
477 type
= TokenType
.CLOSE_BRACKET
;
// DOT, upgraded to ELLIPSIS when two more '.' characters follow
481 type
= TokenType
.DOT
;
483 if (current
< end
- 1) {
484 if (current
[0] == '.' && current
[1] == '.') {
485 type
= TokenType
.ELLIPSIS
;
// COLON, upgraded to DOUBLE_COLON when another ':' follows
491 type
= TokenType
.COLON
;
493 if (current
< end
&& current
[0] == ':') {
494 type
= TokenType
.DOUBLE_COLON
;
499 type
= TokenType
.COMMA
;
503 type
= TokenType
.SEMICOLON
;
507 type
= TokenType
.HASH
;
511 type
= TokenType
.INTERR
;
// BITWISE_OR, refined by a further switch to ASSIGN_BITWISE_OR or OP_OR
515 type
= TokenType
.BITWISE_OR
;
518 switch (current
[0]) {
520 type
= TokenType
.ASSIGN_BITWISE_OR
;
524 type
= TokenType
.OP_OR
;
// BITWISE_AND, refined by a further switch to ASSIGN_BITWISE_AND or OP_AND
531 type
= TokenType
.BITWISE_AND
;
534 switch (current
[0]) {
536 type
= TokenType
.ASSIGN_BITWISE_AND
;
540 type
= TokenType
.OP_AND
;
// CARRET, upgraded to ASSIGN_BITWISE_XOR when '=' follows
547 type
= TokenType
.CARRET
;
549 if (current
< end
&& current
[0] == '=') {
550 type
= TokenType
.ASSIGN_BITWISE_XOR
;
555 type
= TokenType
.TILDE
;
// ASSIGN, refined by a further switch to OP_EQ or LAMBDA
559 type
= TokenType
.ASSIGN
;
562 switch (current
[0]) {
564 type
= TokenType
.OP_EQ
;
568 type
= TokenType
.LAMBDA
;
// OP_LT, refined to OP_LE or OP_SHIFT_LEFT; the shift becomes
// ASSIGN_SHIFT_LEFT when '=' follows
575 type
= TokenType
.OP_LT
;
578 switch (current
[0]) {
580 type
= TokenType
.OP_LE
;
584 type
= TokenType
.OP_SHIFT_LEFT
;
586 if (current
< end
&& current
[0] == '=') {
587 type
= TokenType
.ASSIGN_SHIFT_LEFT
;
// OP_GT, upgraded to OP_GE when '=' follows; no right-shift token is
// produced in the lines visible here
595 type
= TokenType
.OP_GT
;
597 if (current
< end
&& current
[0] == '=') {
598 type
= TokenType
.OP_GE
;
// OP_NEG, upgraded to OP_NE when '=' follows
603 type
= TokenType
.OP_NEG
;
605 if (current
< end
&& current
[0] == '=') {
606 type
= TokenType
.OP_NE
;
// PLUS, refined by a further switch to ASSIGN_ADD or OP_INC
611 type
= TokenType
.PLUS
;
614 switch (current
[0]) {
616 type
= TokenType
.ASSIGN_ADD
;
620 type
= TokenType
.OP_INC
;
// MINUS, refined by a further switch to ASSIGN_SUB, OP_DEC or OP_PTR
627 type
= TokenType
.MINUS
;
630 switch (current
[0]) {
632 type
= TokenType
.ASSIGN_SUB
;
636 type
= TokenType
.OP_DEC
;
640 type
= TokenType
.OP_PTR
;
// STAR, upgraded to ASSIGN_MUL when '=' follows
647 type
= TokenType
.STAR
;
649 if (current
< end
&& current
[0] == '=') {
650 type
= TokenType
.ASSIGN_MUL
;
// DIV, upgraded to ASSIGN_DIV when '=' follows
655 type
= TokenType
.DIV
;
657 if (current
< end
&& current
[0] == '=') {
658 type
= TokenType
.ASSIGN_DIV
;
// PERCENT, upgraded to ASSIGN_PERCENT when '=' follows
663 type
= TokenType
.PERCENT
;
665 if (current
< end
&& current
[0] == '=') {
666 type
= TokenType
.ASSIGN_PERCENT
;
// Quoted literals. begin[0] is the opening quote character: a single quote
// gives a character literal; otherwise two further double quotes at
// begin[1] and begin[2] open a verbatim string.
672 if (begin
[0] == '\'') {
673 type
= TokenType
.CHARACTER_LITERAL
;
674 } else if (current
< end
- 6 && begin
[1] == '"' && begin
[2] == '"') {
675 type
= TokenType
.VERBATIM_STRING_LITERAL
;
// explicit character count for the column update; presumably covers the
// opening and closing triple quotes -- confirm
676 token_length_in_chars
= 6;
// scan for the closing triple quote
678 while (current
< end
- 4) {
679 if (current
[0] == '"' && current
[1] == '"' && current
[2] == '"') {
// newline inside the verbatim string: the count is set to 3 below
// (line/column bookkeeping itself is not visible in this view)
681 } else if (current
[0] == '\n') {
685 token_length_in_chars
= 3;
// decode one UTF-8 character from the remaining input; (unichar) (-1)
// is treated as "invalid" and reported
687 unichar u
= ((string) current
).get_char_validated ((long) (end
- current
));
688 if (u
!= (unichar
) (-1)) {
// advance by the character's encoded length in bytes, count one character
689 current
+= u
.to_utf8 (null);
690 token_length_in_chars
++;
692 Report
.error (new
SourceReference (source_file
, line
, column
+ token_length_in_chars
, line
, column
+ token_length_in_chars
), "invalid UTF-8 character");
// after the loop: check whether the closing triple quote is present;
// the error below is reported when it is missing
696 if (current
[0] == '"' && current
[1] == '"' && current
[2] == '"') {
699 Report
.error (new
SourceReference (source_file
, line
, column
+ token_length_in_chars
, line
, column
+ token_length_in_chars
), "syntax error, expected \"\"\"");
// ordinary string literal
703 type
= TokenType
.STRING_LITERAL
;
// explicit character count; presumably the two quote characters -- confirm
705 token_length_in_chars
= 2;
// scan until the closing quote, i.e. the same character as begin[0]
707 while (current
< end
&& current
[0] != begin
[0]) {
// backslash escape sequence
708 if (current
[0] == '\\') {
710 token_length_in_chars
++;
711 if (current
< end
&& current
[0] == 'x') {
712 // hexadecimal escape character
714 token_length_in_chars
++;
715 while (current
< end
&& current
[0].isxdigit ()) {
717 token_length_in_chars
++;
721 token_length_in_chars
++;
// raw newline inside the literal (handling not visible in this view)
723 } else if (current
[0] == '\n') {
// any other content: decode one UTF-8 character, reporting invalid input
726 unichar u
= ((string) current
).get_char_validated ((long) (end
- current
));
727 if (u
!= (unichar
) (-1)) {
728 current
+= u
.to_utf8 (null);
729 token_length_in_chars
++;
731 Report
.error (new
SourceReference (source_file
, line
, column
+ token_length_in_chars
, line
, column
+ token_length_in_chars
), "invalid UTF-8 character");
// after the loop: the error below names the expected closing quote
735 if (current
< end
&& current
[0] != '\n') {
738 Report
.error (new
SourceReference (source_file
, line
, column
+ token_length_in_chars
, line
, column
+ token_length_in_chars
), "syntax error, expected %c".printf (begin
[0]));
// Fallback for a character no other branch handles: decode it, report it as
// unexpected (or as invalid UTF-8), then read the following token instead.
742 unichar u
= ((string) current
).get_char_validated ((long) (end
- current
));
743 if (u
!= (unichar
) (-1)) {
744 current
+= u
.to_utf8 (null);
745 Report
.error (new
SourceReference (source_file
, line
, column
, line
, column
), "syntax error, unexpected character");
748 Report
.error (new
SourceReference (source_file
, line
, column
, line
, column
), "invalid UTF-8 character");
751 return read_token (out token_begin
, out token_end
);
// Column update: with no explicit count, advance by the byte distance
// consumed; the explicit count is used by the other statement (its else
// keyword is not visible in this view).
755 if (token_length_in_chars
< 0) {
756 column
+= (int) (current
- begin
);
758 column
+= token_length_in_chars
;
// record where the token ends; column - 1 because column has already been
// advanced past the token
761 token_end
.pos
= current
;
762 token_end
.line
= line
;
763 token_end
.column
= column
- 1;
// Compares the characters at begin with keyword, one position at a time, over
// the first keyword.len () positions. Only that many characters of begin are
// inspected, so this is a prefix comparison.
// NOTE(review): the return statements are not visible in this view;
// presumably false on the first mismatch and true otherwise -- confirm.
768 bool matches (char* begin
, string keyword
) {
// view the keyword as a raw character array so it can be indexed like begin
769 char* keyword_array
= keyword
;
770 long len
= keyword
.len ();
771 for (int i
= 0; i
< len
; i
++) {
772 if (begin
[i
] != keyword_array
[i
]) {
// NOTE(review): the header of the enclosing method is not visible in this
// view; presumably this is the body of whitespace () -- confirm.
// Loops while the current character is whitespace and input remains; newlines
// are singled out (the line/column bookkeeping is not visible here).
781 while (current
< end
&& current
[0].isspace ()) {
782 if (current
[0] == '\n') {
// NOTE(review): the header of the enclosing method is not visible in this
// view; presumably this is the body of comment () -- confirm.
// Guard: at least two characters must remain and the second one must be '/'
// or '*' (a check of the first character is not visible in this view).
794 if (current
> end
- 2
796 || (current
[1] != '/' && current
[1] != '*')) {
800 if (current
[1] == '/') {
801 // single-line comment
803 char* begin
= current
;
804 // skip until end of line or end of file
805 while (current
< end
&& current
[0] != '\n') {
// store the text between begin and current; the second argument is true
// when the comment is on line 1
808 push_comment (((string) begin
).ndup ((long) (current
- begin
)), line
== 1);
// other comment form: scan for the closing "*/", remembering the starting line
812 char* begin
= current
;
813 int begin_line
= line
;
814 while (current
< end
- 1
815 && (current
[0] != '*' || current
[1] != '/')) {
816 if (current
[0] == '\n') {
// the scan stopped at the last character without finding "*/"
823 if (current
== end
- 1) {
824 Report
.error (new
SourceReference (source_file
, line
, column
, line
, column
), "syntax error, expected */");
// store the comment text; the second argument is true when the comment began
// on line 1
827 push_comment (((string) begin
).ndup ((long) (current
- begin
)), begin_line
== 1);
// NOTE(review): the header of the enclosing method is not visible in this view.
// Repeats for as long as whitespace () or comment () returns true; comment ()
// is called only when whitespace () returned false (short-circuit ||).
836 while (whitespace () || comment ()) {
// Adds comment_item to the pending comment text held in _comment.
//
// comment_item: text of the comment to add.
// file_comment: callers in this file pass true when the comment started on
//               line 1.
//
// NOTE(review): the else/if keywords between the statements below are not
// visible in this view. Presumably the printf join is the non-null branch and
// file_comment gates the source_file.comment assignment -- confirm.
840 void push_comment (string comment_item
, bool file_comment
) {
// first pending comment: store it as-is
841 if (_comment
== null) {
842 _comment
= comment_item
;
// join the existing text and the new item with a newline
844 _comment
= "%s\n%s".printf (_comment
, comment_item
);
// publish the accumulated text on the source file
847 source_file
.comment
= _comment
;
853 * Clears and returns the content of the comment stack.
855 * @return saved comment
// Returns the pending comment text with all tab characters removed; the
// return type is nullable.
// NOTE(review): the body of the null check, the declaration of index and any
// reset of _comment are not visible in this view.
857 public string?
pop_comment () {
// no pending comment
858 if (_comment
== null) {
// work on a mutable copy of the pending text
862 var result_builder
= new
StringBuilder (_comment
);
// remove tab characters one at a time: find a '\t', convert the pointer to
// a character offset, erase one character there
866 while ((index
= result_builder
.str
.chr (-1, '\t')) != null) {
867 result_builder
.erase (result_builder
.str
.pointer_to_offset (index
), 1);
870 return result_builder
.str
;