3 * Copyright (C) 2008 Jürg Billeter
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * Jürg Billeter <j@bitron.ch>
27 * Lexical scanner for Vala source files.
29 public class Vala
.Scanner
{
/**
 * The source file handled by this scanner.
 *
 * Readable from outside the class; the setter is private and the value is
 * assigned in the constructor.
 */
30 public SourceFile source_file
{ get; private set; }
/**
 * Creates a scanner for the given source file.
 *
 * Stores the file in the source_file property and computes the end pointer
 * from the mapped contents and the mapped length of the file.
 *
 * NOTE(review): the statements after the end computation are not visible in
 * this excerpt (the embedded listing numbers jump from 44 to 52); presumably
 * they initialise the remaining scanner state - confirm.
 *
 * @param source_file file whose mapped contents are scanned
 */
40 public Scanner (SourceFile source_file
) {
41 this
.source_file
= source_file
;
// begin is the start of the mapped contents
43 char* begin
= source_file
.get_mapped_contents ();
// end is begin advanced by the mapped length; the scanning loops in this
// class compare the current pointer against it
44 end
= begin
+ source_file
.get_mapped_length ();
/**
 * Tells whether a character may appear inside an identifier.
 *
 * @param c character to test
 * @return true if c is alphanumeric (per char.isalnum) or an underscore
 */
52 bool is_ident_char (char c
) {
53 return (c
.isalnum () || c
== '_');
/**
 * Classifies an identifier-like lexeme as a keyword token or as a plain
 * identifier.
 *
 * Each visible check compares the text at begin with one keyword through
 * matches () and returns the corresponding TokenType member. The last
 * visible statement returns TokenType.IDENTIFIER.
 *
 * NOTE(review): the statements between the checks are not visible in this
 * excerpt (the embedded listing numbers are not contiguous); presumably a
 * dispatch on len and on leading characters selects which check runs, and
 * further keywords may be handled there - confirm.
 *
 * @param begin pointer to the first character of the lexeme
 * @param len   length of the lexeme; not used by any visible statement,
 *              TODO confirm how the hidden lines use it
 * @return the TokenType of the matched keyword, or TokenType.IDENTIFIER
 */
56 TokenType
get_identifier_or_keyword (char* begin
, int len
) {
// keywords of two letters
61 if (matches (begin
, "as")) return TokenType
.AS
;
64 if (matches (begin
, "do")) return TokenType
.DO
;
// keywords of three letters
81 if (matches (begin
, "for")) return TokenType
.FOR
;
84 if (matches (begin
, "get")) return TokenType
.GET
;
87 if (matches (begin
, "new")) return TokenType
.NEW
;
90 if (matches (begin
, "out")) return TokenType
.OUT
;
93 if (matches (begin
, "ref")) return TokenType
.REF
;
96 if (matches (begin
, "set")) return TokenType
.SET
;
99 if (matches (begin
, "try")) return TokenType
.TRY
;
102 if (matches (begin
, "var")) return TokenType
.VAR
;
// keywords of four letters
109 if (matches (begin
, "base")) return TokenType
.BASE
;
112 if (matches (begin
, "case")) return TokenType
.CASE
;
117 if (matches (begin
, "else")) return TokenType
.ELSE
;
120 if (matches (begin
, "enum")) return TokenType
.ENUM
;
125 if (matches (begin
, "lock")) return TokenType
.LOCK
;
128 if (matches (begin
, "null")) return TokenType
.NULL
;
133 if (matches (begin
, "this")) return TokenType
.THIS
;
136 if (matches (begin
, "true")) return TokenType
.TRUE
;
141 if (matches (begin
, "void")) return TokenType
.VOID
;
144 if (matches (begin
, "weak")) return TokenType
.WEAK
;
// keywords of five letters
151 if (matches (begin
, "break")) return TokenType
.BREAK
;
156 if (matches (begin
, "catch")) return TokenType
.CATCH
;
159 if (matches (begin
, "class")) return TokenType
.CLASS
;
162 if (matches (begin
, "const")) return TokenType
.CONST
;
167 if (matches (begin
, "false")) return TokenType
.FALSE
;
170 if (matches (begin
, "owned")) return TokenType
.OWNED
;
173 if (matches (begin
, "throw")) return TokenType
.THROW
;
176 if (matches (begin
, "using")) return TokenType
.USING
;
179 if (matches (begin
, "while")) return TokenType
.WHILE
;
182 if (matches (begin
, "yield")) return TokenType
.YIELD
;
// keywords of six letters
189 if (matches (begin
, "delete")) return TokenType
.DELETE
;
192 if (matches (begin
, "extern")) return TokenType
.EXTERN
;
195 if (matches (begin
, "inline")) return TokenType
.INLINE
;
200 if (matches (begin
, "params")) return TokenType
.PARAMS
;
203 if (matches (begin
, "public")) return TokenType
.PUBLIC
;
208 if (matches (begin
, "return")) return TokenType
.RETURN
;
215 if (matches (begin
, "signal")) return TokenType
.SIGNAL
;
218 if (matches (begin
, "sizeof")) return TokenType
.SIZEOF
;
225 if (matches (begin
, "static")) return TokenType
.STATIC
;
228 if (matches (begin
, "struct")) return TokenType
.STRUCT
;
233 if (matches (begin
, "switch")) return TokenType
.SWITCH
;
240 if (matches (begin
, "throws")) return TokenType
.THROWS
;
243 if (matches (begin
, "typeof")) return TokenType
.TYPEOF
;
248 if (matches (begin
, "yields")) return TokenType
.YIELDS
;
// keywords of seven letters
257 if (matches (begin
, "default")) return TokenType
.DEFAULT
;
260 if (matches (begin
, "dynamic")) return TokenType
.DYNAMIC
;
265 if (matches (begin
, "ensures")) return TokenType
.ENSURES
;
270 if (matches (begin
, "finally")) return TokenType
.FINALLY
;
273 if (matches (begin
, "foreach")) return TokenType
.FOREACH
;
278 if (matches (begin
, "private")) return TokenType
.PRIVATE
;
281 if (matches (begin
, "unowned")) return TokenType
.UNOWNED
;
284 if (matches (begin
, "virtual")) return TokenType
.VIRTUAL
;
// keywords of eight letters
291 if (matches (begin
, "abstract")) return TokenType
.ABSTRACT
;
294 if (matches (begin
, "continue")) return TokenType
.CONTINUE
;
297 if (matches (begin
, "delegate")) return TokenType
.DELEGATE
;
300 if (matches (begin
, "internal")) return TokenType
.INTERNAL
;
303 if (matches (begin
, "override")) return TokenType
.OVERRIDE
;
306 if (matches (begin
, "requires")) return TokenType
.REQUIRES
;
309 if (matches (begin
, "volatile")) return TokenType
.VOLATILE
;
// keywords of nine letters
316 if (matches (begin
, "construct")) return TokenType
.CONSTRUCT
;
319 if (matches (begin
, "interface")) return TokenType
.INTERFACE
;
322 if (matches (begin
, "namespace")) return TokenType
.NAMESPACE
;
325 if (matches (begin
, "protected")) return TokenType
.PROTECTED
;
// keyword of eleven letters
330 if (matches (begin
, "errordomain")) return TokenType
.ERRORDOMAIN
;
// no keyword check returned: plain identifier
333 return TokenType
.IDENTIFIER
;
/**
 * Reads the next token from the input.
 *
 * Records pos, line and column of the token start in token_begin and of the
 * token end in token_end, and classifies the lexeme into a TokenType:
 * end of file, identifiers and keywords, identifiers prefixed with an
 * at-sign, integer, real and hexadecimal literals, punctuation and
 * operators, and character, string and verbatim string literals.
 * Malformed input is reported through Report.error ().
 *
 * NOTE(review): many original lines are missing from this excerpt (the
 * embedded listing numbers are not contiguous), including the case labels,
 * the pointer advances, the declaration of type and the final return.
 * The comments below describe only what the visible statements show.
 *
 * @param token_begin receives pos, line and column of the token start
 * @param token_end   receives pos, line and column of the token end
 * @return presumably the value assigned to type - return statement not
 *         visible, TODO confirm
 */
336 public TokenType
read_token (out SourceLocation token_begin
, out SourceLocation token_end
) {
// remember where the token starts
340 char* begin
= current
;
341 token_begin
.pos
= begin
;
342 token_begin
.line
= line
;
343 token_begin
.column
= column
;
// negative means no branch counted characters itself; the column update near
// the end of this method then uses the pointer distance current - begin
345 int token_length_in_chars
= -1;
// nothing left to read
347 if (current
>= end
) {
348 type
= TokenType
.EOF
;
// lexeme starting with a letter or an underscore: identifier or keyword
349 } else if (current
[0].isalpha () || current
[0] == '_') {
351 while (current
< end
&& is_ident_char (current
[0])) {
355 type
= get_identifier_or_keyword (begin
, len
);
// lexeme starting with an at-sign: classified as IDENTIFIER
356 } else if (current
[0] == '@') {
357 token_begin
.pos
++; // @ is not part of the identifier
360 while (current
< end
&& is_ident_char (current
[0])) {
364 type
= TokenType
.IDENTIFIER
;
// lexeme starting with a digit: numeric literal (or identifier, see below)
365 } else if (current
[0].isdigit ()) {
366 while (current
< end
&& current
[0].isdigit ()) {
369 type
= TokenType
.INTEGER_LITERAL
;
// case-insensitive suffix checks: l followed by l, or u followed by l and l
370 if (current
< end
&& current
[0].tolower () == 'l') {
372 if (current
< end
&& current
[0].tolower () == 'l') {
375 } else if (current
< end
&& current
[0].tolower () == 'u') {
377 if (current
< end
&& current
[0].tolower () == 'l') {
379 if (current
< end
&& current
[0].tolower () == 'l') {
// a dot directly followed by a digit starts the fractional part; the
// end - 1 bound keeps the read of current[1] inside the input
383 } else if (current
< end
- 1 && current
[0] == '.' && current
[1].isdigit ()) {
385 while (current
< end
&& current
[0].isdigit ()) {
// exponent check: e (case-insensitive), then a sign check, then digits
388 if (current
< end
&& current
[0].tolower () == 'e') {
390 if (current
< end
&& (current
[0] == '+' || current
[0] == '-')) {
393 while (current
< end
&& current
[0].isdigit ()) {
// suffix check: f (case-insensitive)
397 if (current
< end
&& current
[0].tolower () == 'f') {
400 type
= TokenType
.REAL_LITERAL
;
// exactly one digit consumed so far, and the lexeme starts with 0x followed
// by a hexadecimal digit
401 } else if (current
< end
&& current
== begin
+ 1
402 && begin
[0] == '0' && begin
[1] == 'x' && begin
[2].isxdigit ()) {
403 // hexadecimal integer literal
405 while (current
< end
&& current
[0].isxdigit ()) {
408 } else if (current
< end
&& is_ident_char (current
[0])) {
409 // allow identifiers to start with a digit
410 // as long as they contain at least one char
411 while (current
< end
&& is_ident_char (current
[0])) {
414 type
= TokenType
.IDENTIFIER
;
// punctuation and operators, selected by the current character; the case
// labels are not visible in this excerpt
417 switch (current
[0]) {
419 type
= TokenType
.OPEN_BRACE
;
423 type
= TokenType
.CLOSE_BRACE
;
427 type
= TokenType
.OPEN_PARENS
;
431 type
= TokenType
.CLOSE_PARENS
;
435 type
= TokenType
.OPEN_BRACKET
;
439 type
= TokenType
.CLOSE_BRACKET
;
443 type
= TokenType
.DOT
;
// becomes ELLIPSIS when the two characters at current are both dots; the
// end - 1 bound keeps the read of current[1] inside the input
445 if (current
< end
- 1) {
446 if (current
[0] == '.' && current
[1] == '.') {
447 type
= TokenType
.ELLIPSIS
;
453 type
= TokenType
.COLON
;
455 if (current
< end
&& current
[0] == ':') {
456 type
= TokenType
.DOUBLE_COLON
;
461 type
= TokenType
.COMMA
;
465 type
= TokenType
.SEMICOLON
;
469 type
= TokenType
.HASH
;
473 type
= TokenType
.INTERR
;
477 type
= TokenType
.BITWISE_OR
;
// inner switches refine an operator by the character at current
480 switch (current
[0]) {
482 type
= TokenType
.ASSIGN_BITWISE_OR
;
486 type
= TokenType
.OP_OR
;
493 type
= TokenType
.BITWISE_AND
;
496 switch (current
[0]) {
498 type
= TokenType
.ASSIGN_BITWISE_AND
;
502 type
= TokenType
.OP_AND
;
509 type
= TokenType
.CARRET
;
511 if (current
< end
&& current
[0] == '=') {
512 type
= TokenType
.ASSIGN_BITWISE_XOR
;
517 type
= TokenType
.TILDE
;
521 type
= TokenType
.ASSIGN
;
524 switch (current
[0]) {
526 type
= TokenType
.OP_EQ
;
530 type
= TokenType
.LAMBDA
;
537 type
= TokenType
.OP_LT
;
540 switch (current
[0]) {
542 type
= TokenType
.OP_LE
;
546 type
= TokenType
.OP_SHIFT_LEFT
;
548 if (current
< end
&& current
[0] == '=') {
549 type
= TokenType
.ASSIGN_SHIFT_LEFT
;
// for OP_GT the only visible refinement is the equals sign (OP_GE)
557 type
= TokenType
.OP_GT
;
559 if (current
< end
&& current
[0] == '=') {
560 type
= TokenType
.OP_GE
;
565 type
= TokenType
.OP_NEG
;
567 if (current
< end
&& current
[0] == '=') {
568 type
= TokenType
.OP_NE
;
573 type
= TokenType
.PLUS
;
576 switch (current
[0]) {
578 type
= TokenType
.ASSIGN_ADD
;
582 type
= TokenType
.OP_INC
;
589 type
= TokenType
.MINUS
;
592 switch (current
[0]) {
594 type
= TokenType
.ASSIGN_SUB
;
598 type
= TokenType
.OP_DEC
;
602 type
= TokenType
.OP_PTR
;
609 type
= TokenType
.STAR
;
611 if (current
< end
&& current
[0] == '=') {
612 type
= TokenType
.ASSIGN_MUL
;
617 type
= TokenType
.DIV
;
619 if (current
< end
&& current
[0] == '=') {
620 type
= TokenType
.ASSIGN_DIV
;
625 type
= TokenType
.PERCENT
;
627 if (current
< end
&& current
[0] == '=') {
628 type
= TokenType
.ASSIGN_PERCENT
;
// quoted literals; begin[0] is the opening quote character
634 if (begin
[0] == '\'') {
635 type
= TokenType
.CHARACTER_LITERAL
;
// two more double quotes after the opening one: verbatim string literal
636 } else if (current
< end
- 6 && begin
[1] == '"' && begin
[2] == '"') {
637 type
= TokenType
.VERBATIM_STRING_LITERAL
;
// NOTE(review): presumably 6 accounts for the opening and closing triple
// quotes - confirm
638 token_length_in_chars
= 6;
// scan for the closing triple quote
640 while (current
< end
- 4) {
641 if (current
[0] == '"' && current
[1] == '"' && current
[2] == '"') {
643 } else if (current
[0] == '\n') {
// NOTE(review): after a line feed the count restarts at 3, presumably for
// the closing triple quote on the new line - confirm
647 token_length_in_chars
= 3;
// validate one UTF-8 character within the remaining input; a valid one
// advances current by its encoded length and counts as one character
649 unichar u
= ((string) current
).get_char_validated ((long) (end
- current
));
650 if (u
!= (unichar
) (-1)) {
651 current
+= u
.to_utf8 (null);
652 token_length_in_chars
++;
654 Report
.error (new
SourceReference (source_file
, line
, column
+ token_length_in_chars
, line
, column
+ token_length_in_chars
), "invalid UTF-8 character");
// after the scan: check for the closing triple quote, otherwise report
658 if (current
[0] == '"' && current
[1] == '"' && current
[2] == '"') {
661 Report
.error (new
SourceReference (source_file
, line
, column
+ token_length_in_chars
, line
, column
+ token_length_in_chars
), "syntax error, expected \"\"\"");
665 type
= TokenType
.STRING_LITERAL
;
// NOTE(review): presumably 2 accounts for the two quote characters - confirm
667 token_length_in_chars
= 2;
// scan until the character that equals the opening quote
669 while (current
< end
&& current
[0] != begin
[0]) {
// backslash escape
670 if (current
[0] == '\\') {
672 token_length_in_chars
++;
673 if (current
< end
&& current
[0] == 'x') {
674 // hexadecimal escape character
676 token_length_in_chars
++;
677 while (current
< end
&& current
[0].isxdigit ()) {
679 token_length_in_chars
++;
683 token_length_in_chars
++;
685 } else if (current
[0] == '\n') {
// validate one UTF-8 character, as in the verbatim string branch
688 unichar u
= ((string) current
).get_char_validated ((long) (end
- current
));
689 if (u
!= (unichar
) (-1)) {
690 current
+= u
.to_utf8 (null);
691 token_length_in_chars
++;
693 Report
.error (new
SourceReference (source_file
, line
, column
+ token_length_in_chars
, line
, column
+ token_length_in_chars
), "invalid UTF-8 character");
697 if (current
< end
&& current
[0] != '\n') {
// the message names the expected closing quote, taken from begin[0]
700 Report
.error (new
SourceReference (source_file
, line
, column
+ token_length_in_chars
, line
, column
+ token_length_in_chars
), "syntax error, expected %c".printf (begin
[0]));
// any other character: validate it as UTF-8 and report it
704 unichar u
= ((string) current
).get_char_validated ((long) (end
- current
));
705 if (u
!= (unichar
) (-1)) {
706 current
+= u
.to_utf8 (null);
707 Report
.error (new
SourceReference (source_file
, line
, column
, line
, column
), "syntax error, unexpected character");
710 Report
.error (new
SourceReference (source_file
, line
, column
, line
, column
), "invalid UTF-8 character");
// after reporting, hand back whatever the next read produces
713 return read_token (out token_begin
, out token_end
);
// advance the column: by pointer distance when no branch counted characters
717 if (token_length_in_chars
< 0) {
718 column
+= (int) (current
- begin
);
720 column
+= token_length_in_chars
;
// column now lies past the token, so the token ends at column - 1
723 token_end
.pos
= current
;
724 token_end
.line
= line
;
725 token_end
.column
= column
- 1;
/**
 * Compares the characters at begin with keyword, position by position,
 * over the length reported by keyword.len ().
 *
 * No bound on begin is checked in the visible code, so the caller has to
 * make sure that many characters are readable.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably false on the first mismatch and true otherwise - confirm.
 *
 * @param begin   pointer to the text to compare
 * @param keyword keyword to compare against
 * @return presumably whether the text at begin starts with keyword
 */
730 bool matches (char* begin
, string keyword
) {
// raw pointer view of the keyword, so it can be indexed like begin
731 char* keyword_array
= keyword
;
732 long len
= keyword
.len ();
733 for (int i
= 0; i
< len
; i
++) {
734 if (begin
[i
] != keyword_array
[i
]) {
// NOTE(review): the enclosing method header is not visible in this excerpt;
// this loop is presumably the body of whitespace (), which is called
// elsewhere in this file - confirm.
// Walks over consecutive whitespace characters (per char.isspace) up to the
// end of the input, with a dedicated branch for line feeds whose body is
// not visible here.
743 while (current
< end
&& current
[0].isspace ()) {
744 if (current
[0] == '\n') {
// NOTE(review): the enclosing method header is not visible in this excerpt;
// this fragment is presumably the body of comment (), which is called
// elsewhere in this file - confirm.
// Recognises single-line and multi-line comments and passes their text to
// push_comment ().
// Guard: fewer than two characters are left, or the second character is
// neither a slash nor a star (a further condition on the line in between
// is not visible).
756 if (current
> end
- 2
758 || (current
[1] != '/' && current
[1] != '*')) {
762 if (current
[1] == '/') {
763 // single-line comment
765 char* begin
= current
;
766 // skip until end of line or end of file
767 while (current
< end
&& current
[0] != '\n') {
// copy the bytes between begin and current; the second argument is true
// when the comment is on line 1 and feeds the file_comment parameter
770 push_comment (((string) begin
).ndup ((long) (current
- begin
)), line
== 1);
// multi-line comment: remember the start position and the start line
774 char* begin
= current
;
775 int begin_line
= line
;
// advance until a star directly followed by a slash; the end - 1 bound
// keeps the read of current[1] inside the input
776 while (current
< end
- 1
777 && (current
[0] != '*' || current
[1] != '/')) {
778 if (current
[0] == '\n') {
// stopped on the last character without finding the terminator
785 if (current
== end
- 1) {
786 Report
.error (new
SourceReference (source_file
, line
, column
, line
, column
), "syntax error, expected */");
// uses the line where the comment began, since line may have changed
789 push_comment (((string) begin
).ndup ((long) (current
- begin
)), begin_line
== 1);
798 while (whitespace () || comment ()) {
/**
 * Adds a comment to the pending comment text held in _comment.
 *
 * The first comment is stored as is; a later one is appended to the
 * existing text after a newline. The visible code also assigns the
 * accumulated text to source_file.comment.
 *
 * NOTE(review): the lines between the visible statements are missing from
 * this excerpt; presumably the assignment to source_file.comment is guarded
 * by file_comment - confirm.
 *
 * @param comment_item text of the comment to add
 * @param file_comment not used by any visible statement; callers pass true
 *                     for comments that start on line 1
 */
802 void push_comment (string comment_item
, bool file_comment
) {
803 if (_comment
== null) {
804 _comment
= comment_item
;
// join with the text collected so far, separated by a newline
806 _comment
= "%s\n%s".printf (_comment
, comment_item
);
809 source_file
.comment
= _comment
;
815 * Clears and returns the content of the comment stack.
817 * @return saved comment
819 public string?
pop_comment () {
820 if (_comment
== null) {
824 var result
= new
StringBuilder (_comment
);
828 while ((index
= result
.str
.chr (-1, '\t')) != null) {
829 result
.erase (result
.str
.pointer_to_offset (index
), 1);