1 /* valageniescanner.vala
3 * Copyright (C) 2008 Jamie McCracken, Jürg Billeter
4 * Based on code by Jürg Billeter
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 * Jamie McCracken jamiemcc gnome org
28 * Lexical scanner for Genie source files.
30 public class Vala
.Genie
.Scanner
{
31 public SourceFile source_file
{ get; private set; }
33 public int indent_spaces
{ get; set;}
42 int current_indent_level
;
/**
 * Creates a scanner for the given source file.
 *
 * Stores the file, points begin at its mapped contents and end at
 * begin plus the mapped length, then resets the current indent level,
 * the parse_started flag and the last token type.
 *
 * @param source_file file whose mapped contents will be scanned
 */
51 public Scanner (SourceFile source_file
) {
52 this
.source_file
= source_file
;
54 begin
= source_file
.get_mapped_contents ();
55 end
= begin
+ source_file
.get_mapped_length ();
62 current_indent_level
= 0;
66 parse_started
= false;
67 last_token
= TokenType
.NONE
;
/* Tells whether c may appear inside an identifier: '_' or anything char.isalnum () accepts. */
bool is_ident_char (char c) {
	if (c == '_') {
		return true;
	}

	return c.isalnum ();
}
75 TokenType
get_identifier_or_keyword (char* begin
, int len
) {
80 if (matches (begin
, "as")) return TokenType
.AS
;
83 if (matches (begin
, "do")) return TokenType
.DO
;
96 if (matches (begin
, "of")) return TokenType
.OF
;
98 if (matches (begin
, "or")) return TokenType
.OP_OR
;
101 if (matches (begin
, "to")) return TokenType
.TO
;
108 if (matches (begin
, "and")) return TokenType
.OP_AND
;
111 if (matches (begin
, "def")) return TokenType
.DEF
;
114 if (matches (begin
, "for")) return TokenType
.FOR
;
117 if (matches (begin
, "get")) return TokenType
.GET
;
120 if (matches (begin
, "isa")) return TokenType
.ISA
;
125 if (matches (begin
, "new")) return TokenType
.NEW
;
128 if (matches (begin
, "not")) return TokenType
.OP_NEG
;
133 if (matches (begin
, "out")) return TokenType
.OUT
;
136 if (matches (begin
, "ref")) return TokenType
.REF
;
139 if (matches (begin
, "set")) return TokenType
.SET
;
142 if (matches (begin
, "try")) return TokenType
.TRY
;
145 if (matches (begin
, "var")) return TokenType
.VAR
;
152 if (matches (begin
, "case")) return TokenType
.CASE
;
155 if (matches (begin
, "dict")) return TokenType
.DICT
;
160 if (matches (begin
, "else")) return TokenType
.ELSE
;
163 if (matches (begin
, "enum")) return TokenType
.ENUM
;
168 if (matches (begin
, "init")) return TokenType
.INIT
;
173 if (matches (begin
, "list")) return TokenType
.LIST
;
176 if (matches (begin
, "lock")) return TokenType
.LOCK
;
182 if (matches (begin
, "null")) return TokenType
.NULL
;
187 if (matches (begin
, "pass")) return TokenType
.PASS
;
190 if (matches (begin
, "prop")) return TokenType
.PROP
;
195 if (matches (begin
, "self")) return TokenType
.THIS
;
198 if (matches (begin
, "true")) return TokenType
.TRUE
;
201 if (matches (begin
, "uses")) return TokenType
.USES
;
204 if (matches (begin
, "void")) return TokenType
.VOID
;
209 if (matches (begin
, "weak")) return TokenType
.WEAK
;
212 if (matches (begin
, "when")) return TokenType
.WHEN
;
221 if (matches (begin
, "array")) return TokenType
.ARRAY
;
224 if (matches (begin
, "break")) return TokenType
.BREAK
;
229 if (matches (begin
, "class")) return TokenType
.CLASS
;
232 if (matches (begin
, "const")) return TokenType
.CONST
;
237 if (matches (begin
, "event")) return TokenType
.EVENT
;
242 if (matches (begin
, "false")) return TokenType
.FALSE
;
245 if (matches (begin
, "final")) return TokenType
.FINAL
;
250 if (matches (begin
, "print")) return TokenType
.PRINT
;
253 if (matches (begin
, "super")) return TokenType
.SUPER
;
256 if (matches (begin
, "raise")) return TokenType
.RAISE
;
259 if (matches (begin
, "while")) return TokenType
.WHILE
;
266 if (matches (begin
, "assert")) return TokenType
.ASSERT
;
271 if (matches (begin
, "delete")) return TokenType
.DELETE
;
274 if (matches (begin
, "downto")) return TokenType
.DOWNTO
;
283 if (matches (begin
, "except")) return TokenType
.EXCEPT
;
286 if (matches (begin
, "extern")) return TokenType
.EXTERN
;
293 if (matches (begin
, "inline")) return TokenType
.INLINE
;
296 if (matches (begin
, "public")) return TokenType
.PUBLIC
;
301 if (matches (begin
, "raises")) return TokenType
.RAISES
;
304 if (matches (begin
, "return")) return TokenType
.RETURN
;
311 if (matches (begin
, "sizeof")) return TokenType
.SIZEOF
;
316 if (matches (begin
, "static")) return TokenType
.STATIC
;
319 if (matches (begin
, "struct")) return TokenType
.STRUCT
;
326 if (matches (begin
, "typeof")) return TokenType
.TYPEOF
;
335 if (matches (begin
, "default")) return TokenType
.DEFAULT
;
338 if (matches (begin
, "dynamic")) return TokenType
.DYNAMIC
;
343 if (matches (begin
, "ensures")) return TokenType
.ENSURES
;
348 if (matches (begin
, "finally")) return TokenType
.FINALLY
;
351 if (matches (begin
, "foreach")) return TokenType
.FOREACH
;
356 if (matches (begin
, "private")) return TokenType
.PRIVATE
;
359 if (matches (begin
, "virtual")) return TokenType
.VIRTUAL
;
366 if (matches (begin
, "abstract")) return TokenType
.ABSTRACT
;
369 if (matches (begin
, "continue")) return TokenType
.CONTINUE
;
372 if (matches (begin
, "delegate")) return TokenType
.DELEGATE
;
375 if (matches (begin
, "override")) return TokenType
.OVERRIDE
;
380 if (matches (begin
, "readonly")) return TokenType
.READONLY
;
383 if (matches (begin
, "requires")) return TokenType
.REQUIRES
;
388 if (matches (begin
, "volatile")) return TokenType
.VOLATILE
;
395 if (matches (begin
, "construct")) return TokenType
.CONSTRUCT
;
398 if (matches (begin
, "exception")) return TokenType
.ERRORDOMAIN
;
401 if (matches (begin
, "interface")) return TokenType
.INTERFACE
;
404 if (matches (begin
, "namespace")) return TokenType
.NAMESPACE
;
407 if (matches (begin
, "protected")) return TokenType
.PROTECTED
;
410 if (matches (begin
, "writeonly")) return TokenType
.WRITEONLY
;
417 if (matches (begin
, "implements")) return TokenType
.IMPLEMENTS
;
422 return TokenType
.IDENTIFIER
;
425 public TokenType
read_token (out SourceLocation token_begin
, out SourceLocation token_end
) {
426 /* emit dedents if outstanding before checking any other chars */
428 if (pending_dedents
> 0) {
433 token_begin
.pos
= current
;
434 token_begin
.line
= line
;
435 token_begin
.column
= column
;
437 token_end
.pos
= current
;
438 token_end
.line
= line
;
439 token_end
.column
= column
;
441 last_token
= TokenType
.DEDENT
;
443 return TokenType
.DEDENT
;
447 if ((_indent_spaces
== 0 ) || (last_token
!= TokenType
.EOL
)) {
448 /* scrub whitespace (excluding newlines) and comments */
452 /* handle line continuation (lines ending with \) */
453 while (current
< end
&& current
[0] == '\\' && current
[1] == '\n') {
459 /* handle non-consecutive new line once parsing is underway - EOL */
460 if (newline () && parse_started
&& last_token
!= TokenType
.EOL
&& last_token
!= TokenType
.SEMICOLON
) {
461 token_begin
.pos
= current
;
462 token_begin
.line
= line
;
463 token_begin
.column
= column
;
465 token_end
.pos
= current
;
466 token_end
.line
= line
;
467 token_end
.column
= column
;
469 last_token
= TokenType
.EOL
;
471 return TokenType
.EOL
;
475 while (skip_newlines ()) {
476 token_begin
.pos
= current
;
477 token_begin
.line
= line
;
478 token_begin
.column
= column
;
480 current_indent_level
= count_tabs ();
482 /* if it's an empty new line then ignore */
483 if (current_indent_level
== -1) {
487 if (current_indent_level
> indent_level
) {
488 indent_level
= current_indent_level
;
490 token_end
.pos
= current
;
491 token_end
.line
= line
;
492 token_end
.column
= column
;
494 last_token
= TokenType
.INDENT
;
496 return TokenType
.INDENT
;
497 } else if (current_indent_level
< indent_level
) {
500 pending_dedents
= (indent_level
- current_indent_level
);
502 token_end
.pos
= current
;
503 token_end
.line
= line
;
504 token_end
.column
= column
;
506 last_token
= TokenType
.DEDENT
;
508 return TokenType
.DEDENT
;
513 char* begin
= current
;
514 token_begin
.pos
= begin
;
515 token_begin
.line
= line
;
516 token_begin
.column
= column
;
518 int token_length_in_chars
= -1;
520 parse_started
= true;
522 if (current
>= end
) {
523 if (indent_level
> 0) {
526 pending_dedents
= indent_level
;
528 type
= TokenType
.DEDENT
;
530 type
= TokenType
.EOF
;
532 } else if (current
[0].isalpha () || current
[0] == '_') {
534 while (current
< end
&& is_ident_char (current
[0])) {
538 type
= get_identifier_or_keyword (begin
, len
);
539 } else if (current
[0] == '@') {
541 if (current
[1] == '@') {
542 token_begin
.pos
+= 2; // @@ is not part of the identifier
548 while (current
< end
&& is_ident_char (current
[0])) {
552 type
= TokenType
.IDENTIFIER
;
553 } else if (current
[0].isdigit ()) {
554 while (current
< end
&& current
[0].isdigit ()) {
557 type
= TokenType
.INTEGER_LITERAL
;
558 if (current
< end
&& current
[0].tolower () == 'l') {
560 if (current
< end
&& current
[0].tolower () == 'l') {
563 } else if (current
< end
&& current
[0].tolower () == 'u') {
565 if (current
< end
&& current
[0].tolower () == 'l') {
567 if (current
< end
&& current
[0].tolower () == 'l') {
571 } else if (current
< end
- 1 && current
[0] == '.' && current
[1].isdigit ()) {
573 while (current
< end
&& current
[0].isdigit ()) {
576 if (current
< end
&& current
[0].tolower () == 'e') {
578 if (current
< end
&& (current
[0] == '+' || current
[0] == '-')) {
581 while (current
< end
&& current
[0].isdigit ()) {
585 if (current
< end
&& current
[0].tolower () == 'f') {
588 type
= TokenType
.REAL_LITERAL
;
589 } else if (current
< end
&& current
== begin
+ 1
590 && begin
[0] == '0' && begin
[1] == 'x' && begin
[2].isxdigit ()) {
591 // hexadecimal integer literal
593 while (current
< end
&& current
[0].isxdigit ()) {
596 } else if (current
< end
&& is_ident_char (current
[0])) {
597 // allow identifiers to start with a digit
598 // as long as they contain at least one char
599 while (current
< end
&& is_ident_char (current
[0])) {
602 type
= TokenType
.IDENTIFIER
;
605 switch (current
[0]) {
607 type
= TokenType
.OPEN_BRACE
;
611 type
= TokenType
.CLOSE_BRACE
;
615 type
= TokenType
.OPEN_PARENS
;
619 type
= TokenType
.CLOSE_PARENS
;
623 type
= TokenType
.OPEN_BRACKET
;
627 type
= TokenType
.CLOSE_BRACKET
;
631 type
= TokenType
.DOT
;
633 if (current
< end
- 1) {
634 if (current
[0] == '.' && current
[1] == '.') {
635 type
= TokenType
.ELLIPSIS
;
641 type
= TokenType
.COLON
;
645 type
= TokenType
.COMMA
;
649 type
= TokenType
.SEMICOLON
;
653 type
= TokenType
.HASH
;
657 type
= TokenType
.INTERR
;
661 type
= TokenType
.BITWISE_OR
;
664 switch (current
[0]) {
666 type
= TokenType
.ASSIGN_BITWISE_OR
;
670 type
= TokenType
.OP_OR
;
677 type
= TokenType
.BITWISE_AND
;
680 switch (current
[0]) {
682 type
= TokenType
.ASSIGN_BITWISE_AND
;
686 type
= TokenType
.OP_AND
;
693 type
= TokenType
.CARRET
;
695 if (current
< end
&& current
[0] == '=') {
696 type
= TokenType
.ASSIGN_BITWISE_XOR
;
701 type
= TokenType
.TILDE
;
705 type
= TokenType
.ASSIGN
;
708 switch (current
[0]) {
710 type
= TokenType
.OP_EQ
;
714 type
= TokenType
.LAMBDA
;
721 type
= TokenType
.OP_LT
;
724 switch (current
[0]) {
726 type
= TokenType
.OP_LE
;
730 type
= TokenType
.OP_SHIFT_LEFT
;
732 if (current
< end
&& current
[0] == '=') {
733 type
= TokenType
.ASSIGN_SHIFT_LEFT
;
741 type
= TokenType
.OP_GT
;
743 if (current
< end
&& current
[0] == '=') {
744 type
= TokenType
.OP_GE
;
749 type
= TokenType
.OP_NEG
;
751 if (current
< end
&& current
[0] == '=') {
752 type
= TokenType
.OP_NE
;
757 type
= TokenType
.PLUS
;
760 switch (current
[0]) {
762 type
= TokenType
.ASSIGN_ADD
;
766 type
= TokenType
.OP_INC
;
773 type
= TokenType
.MINUS
;
776 switch (current
[0]) {
778 type
= TokenType
.ASSIGN_SUB
;
782 type
= TokenType
.OP_DEC
;
786 type
= TokenType
.OP_PTR
;
793 type
= TokenType
.STAR
;
795 if (current
< end
&& current
[0] == '=') {
796 type
= TokenType
.ASSIGN_MUL
;
801 type
= TokenType
.DIV
;
803 if (current
< end
&& current
[0] == '=') {
804 type
= TokenType
.ASSIGN_DIV
;
809 type
= TokenType
.PERCENT
;
811 if (current
< end
&& current
[0] == '=') {
812 type
= TokenType
.ASSIGN_PERCENT
;
818 if (begin
[0] == '\'') {
819 type
= TokenType
.CHARACTER_LITERAL
;
821 type
= TokenType
.STRING_LITERAL
;
823 token_length_in_chars
= 2;
825 while (current
< end
&& current
[0] != begin
[0]) {
826 if (current
[0] == '\\') {
828 token_length_in_chars
++;
829 if (current
< end
&& current
[0] == 'x') {
830 // hexadecimal escape character
832 token_length_in_chars
++;
833 while (current
< end
&& current
[0].isxdigit ()) {
835 token_length_in_chars
++;
839 token_length_in_chars
++;
841 } else if (current
[0] == '\n') {
844 unichar u
= ((string) current
).get_char_validated ((long) (end
- current
));
845 if (u
!= (unichar
) (-1)) {
846 current
+= u
.to_utf8 (null);
847 token_length_in_chars
++;
849 Report
.error (new
SourceReference (source_file
, line
, column
+ token_length_in_chars
, line
, column
+ token_length_in_chars
), "invalid UTF-8 character");
853 if (current
< end
&& current
[0] != '\n') {
856 Report
.error (new
SourceReference (source_file
, line
, column
+ token_length_in_chars
, line
, column
+ token_length_in_chars
), "syntax error, expected %c".printf (begin
[0]));
860 unichar u
= ((string) current
).get_char_validated ((long) (end
- current
));
861 if (u
!= (unichar
) (-1)) {
862 current
+= u
.to_utf8 (null);
863 Report
.error (new
SourceReference (source_file
, line
, column
, line
, column
), "syntax error, unexpected character");
866 Report
.error (new
SourceReference (source_file
, line
, column
, line
, column
), "invalid UTF-8 character");
869 last_token
= TokenType
.STRING_LITERAL
;
870 return read_token (out token_begin
, out token_end
);
874 if (token_length_in_chars
< 0) {
875 column
+= (int) (current
- begin
);
877 column
+= token_length_in_chars
;
880 token_end
.pos
= current
;
881 token_end
.line
= line
;
882 token_end
.column
= column
- 1;
895 if (_indent_spaces
== 0) {
896 while (current
< end
&& current
[0] == '\t') {
903 while (current
< end
&& current
[0] == ' ') {
909 tab_count
= space_count
/ _indent_spaces
;
913 /* ignore comments and whitespace and other lines that contain no code */
917 if ((current
< end
) && (current
[0] == '\n')) return -1;
/**
 * Compares the characters at begin with keyword, position by position,
 * for keyword.len () positions.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably false on the first mismatch and true otherwise - confirm.
 *
 * @param begin pointer to the first character of the candidate text
 * @param keyword keyword to compare against
 */
922 bool matches (char* begin
, string keyword
) {
923 char* keyword_array
= keyword
;
924 long len
= keyword
.len ();
925 for (int i
= 0; i
< len
; i
++) {
926 if (begin
[i
] != keyword_array
[i
]) {
935 while (current
< end
&& current
[0].isspace () && current
[0] != '\n' ) {
944 inline
bool newline () {
945 if (current
[0] == '\n') {
952 bool skip_newlines () {
953 bool new_lines
= false;
960 current_indent_level
= 0;
969 if (current
> end
- 2
971 || (current
[1] != '/' && current
[1] != '*')) {
975 if (current
[1] == '/') {
976 // single-line comment
978 char* begin
= current
;
979 // skip until end of line or end of file
980 while (current
< end
&& current
[0] != '\n') {
983 push_comment (((string) begin
).ndup ((long) (current
- begin
)), line
== 1);
985 if (current
[0] == '\n') {
989 current_indent_level
= 0;
994 char* begin
= current
;
995 int begin_line
= line
;
996 while (current
< end
- 1
997 && (current
[0] != '*' || current
[1] != '/')) {
998 if (current
[0] == '\n') {
1005 if (current
== end
- 1) {
1006 Report
.error (new
SourceReference (source_file
, line
, column
, line
, column
), "syntax error, expected */");
1009 push_comment (((string) begin
).ndup ((long) (current
- begin
)), begin_line
== 1);
1019 while (current
< end
&& current
[0] == '\t' ) {
1028 void skip_space_tabs () {
1029 while (whitespace () || skip_tabs () || comment () ) {
1035 while (whitespace () || comment ()) {
1042 void push_comment (string comment_item
, bool file_comment
) {
1043 if (_comment
== null) {
1044 _comment
= comment_item
;
1046 _comment
= "%s\n%s".printf (_comment
, comment_item
);
1049 source_file
.comment
= _comment
;
1055 * Clears and returns the content of the comment stack.
1057 * @return saved comment
1059 public string?
pop_comment () {
1060 if (_comment
== null) {
1064 var result
= new
StringBuilder (_comment
);
1068 while ((index
= result
.str
.chr (-1, '\t')) != null) {
1069 result
.erase (result
.str
.pointer_to_offset (index
), 1);