Do not free values returned via g_object_get prematurely, require
[vala-lang.git] / vala / valascanner.vala
blob9891d52df5151c4bfd952762f66f1772c536eb2b
/* valascanner.vala
 *
 * Copyright (C) 2008-2009  Jürg Billeter
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * Author:
 * 	Jürg Billeter <j@bitron.ch>
 */
23 using GLib;
24 using Gee;
/**
 * Lexical scanner for Vala source files.
 */
29 public class Vala.Scanner {
30 public SourceFile source_file { get; private set; }
32 char* current;
33 char* end;
35 int line;
36 int column;
38 string _comment;
/**
 * Creates a scanner positioned at the first byte of the mapped
 * contents of the given source file.
 *
 * @param source_file the file whose mapped contents are scanned
 */
public Scanner (SourceFile source_file) {
	this.source_file = source_file;

	// positions reported to callers are 1-based
	line = 1;
	column = 1;

	// [current, end) is the not yet consumed part of the mapped buffer
	current = source_file.get_mapped_contents ();
	end = current + source_file.get_mapped_length ();
}
/**
 * Tells whether the given character may appear inside an identifier,
 * i.e. whether it is alphanumeric or an underscore.
 */
bool is_ident_char (char c) {
	if (c == '_') {
		return true;
	}
	return c.isalnum ();
}
/**
 * Classifies the len characters starting at begin as either one of the
 * reserved words or a plain identifier.
 *
 * Candidates are grouped by length, so every call to matches () compares
 * exactly len characters and never reads beyond the scanned word.
 *
 * @param begin first character of the word
 * @param len   number of characters in the word
 * @return the keyword token type, or TokenType.IDENTIFIER if the word
 *         is not a keyword
 */
TokenType get_identifier_or_keyword (char* begin, int len) {
	switch (len) {
	case 2:
		if (matches (begin, "as")) return TokenType.AS;
		if (matches (begin, "do")) return TokenType.DO;
		if (matches (begin, "if")) return TokenType.IF;
		if (matches (begin, "in")) return TokenType.IN;
		if (matches (begin, "is")) return TokenType.IS;
		break;
	case 3:
		if (matches (begin, "for")) return TokenType.FOR;
		if (matches (begin, "get")) return TokenType.GET;
		if (matches (begin, "new")) return TokenType.NEW;
		if (matches (begin, "out")) return TokenType.OUT;
		if (matches (begin, "ref")) return TokenType.REF;
		if (matches (begin, "set")) return TokenType.SET;
		if (matches (begin, "try")) return TokenType.TRY;
		if (matches (begin, "var")) return TokenType.VAR;
		break;
	case 4:
		if (matches (begin, "base")) return TokenType.BASE;
		if (matches (begin, "case")) return TokenType.CASE;
		if (matches (begin, "else")) return TokenType.ELSE;
		if (matches (begin, "enum")) return TokenType.ENUM;
		if (matches (begin, "lock")) return TokenType.LOCK;
		if (matches (begin, "null")) return TokenType.NULL;
		if (matches (begin, "this")) return TokenType.THIS;
		if (matches (begin, "true")) return TokenType.TRUE;
		if (matches (begin, "void")) return TokenType.VOID;
		if (matches (begin, "weak")) return TokenType.WEAK;
		break;
	case 5:
		if (matches (begin, "break")) return TokenType.BREAK;
		if (matches (begin, "catch")) return TokenType.CATCH;
		if (matches (begin, "class")) return TokenType.CLASS;
		if (matches (begin, "const")) return TokenType.CONST;
		if (matches (begin, "false")) return TokenType.FALSE;
		if (matches (begin, "owned")) return TokenType.OWNED;
		if (matches (begin, "throw")) return TokenType.THROW;
		if (matches (begin, "using")) return TokenType.USING;
		if (matches (begin, "while")) return TokenType.WHILE;
		if (matches (begin, "yield")) return TokenType.YIELD;
		break;
	case 6:
		if (matches (begin, "delete")) return TokenType.DELETE;
		if (matches (begin, "extern")) return TokenType.EXTERN;
		if (matches (begin, "inline")) return TokenType.INLINE;
		if (matches (begin, "params")) return TokenType.PARAMS;
		if (matches (begin, "public")) return TokenType.PUBLIC;
		if (matches (begin, "return")) return TokenType.RETURN;
		if (matches (begin, "signal")) return TokenType.SIGNAL;
		if (matches (begin, "sizeof")) return TokenType.SIZEOF;
		if (matches (begin, "static")) return TokenType.STATIC;
		if (matches (begin, "struct")) return TokenType.STRUCT;
		if (matches (begin, "switch")) return TokenType.SWITCH;
		if (matches (begin, "throws")) return TokenType.THROWS;
		if (matches (begin, "typeof")) return TokenType.TYPEOF;
		if (matches (begin, "yields")) return TokenType.YIELDS;
		break;
	case 7:
		if (matches (begin, "default")) return TokenType.DEFAULT;
		if (matches (begin, "dynamic")) return TokenType.DYNAMIC;
		if (matches (begin, "ensures")) return TokenType.ENSURES;
		if (matches (begin, "finally")) return TokenType.FINALLY;
		if (matches (begin, "foreach")) return TokenType.FOREACH;
		if (matches (begin, "private")) return TokenType.PRIVATE;
		if (matches (begin, "unowned")) return TokenType.UNOWNED;
		if (matches (begin, "virtual")) return TokenType.VIRTUAL;
		break;
	case 8:
		if (matches (begin, "abstract")) return TokenType.ABSTRACT;
		if (matches (begin, "continue")) return TokenType.CONTINUE;
		if (matches (begin, "delegate")) return TokenType.DELEGATE;
		if (matches (begin, "internal")) return TokenType.INTERNAL;
		if (matches (begin, "override")) return TokenType.OVERRIDE;
		if (matches (begin, "requires")) return TokenType.REQUIRES;
		if (matches (begin, "volatile")) return TokenType.VOLATILE;
		break;
	case 9:
		if (matches (begin, "construct")) return TokenType.CONSTRUCT;
		if (matches (begin, "interface")) return TokenType.INTERFACE;
		if (matches (begin, "namespace")) return TokenType.NAMESPACE;
		if (matches (begin, "protected")) return TokenType.PROTECTED;
		break;
	case 11:
		if (matches (begin, "errordomain")) return TokenType.ERRORDOMAIN;
		break;
	}

	// no keyword of this length matched
	return TokenType.IDENTIFIER;
}
/**
 * Scans a numeric literal starting at the current position and advances
 * current past it.
 *
 * Handles hexadecimal integers (0x...), decimal integers, an optional
 * fractional part, an optional exponent and the type suffixes
 * l/ll/u/ul/ull/f/d (case-insensitive). A digit sequence directly
 * followed by identifier characters is scanned as an identifier.
 *
 * @return TokenType.INTEGER_LITERAL, TokenType.REAL_LITERAL or
 *         TokenType.IDENTIFIER
 */
TokenType read_number () {
	var type = TokenType.INTEGER_LITERAL;

	// integer part
	if (current < end - 2 && current[0] == '0'
	    && current[1] == 'x' && current[2].isxdigit ()) {
		// hexadecimal integer literal
		current += 2;
		while (current < end && current[0].isxdigit ()) {
			current++;
		}
	} else {
		// decimal number
		while (current < end && current[0].isdigit ()) {
			current++;
		}
	}

	// fractional part, only if the dot is followed by a digit
	if (current < end - 1 && current[0] == '.' && current[1].isdigit ()) {
		type = TokenType.REAL_LITERAL;
		current++;
		while (current < end && current[0].isdigit ()) {
			current++;
		}
	}

	// exponent part
	if (current < end && current[0].tolower () == 'e') {
		type = TokenType.REAL_LITERAL;
		current++;
		if (current < end && (current[0] == '+' || current[0] == '-')) {
			current++;
		}
		while (current < end && current[0].isdigit ()) {
			current++;
		}
	}

	// type suffix
	if (current < end) {
		// remember whether the literal was real before looking at the
		// suffix, an f/d suffix must not disable the identifier check
		bool real_literal = (type == TokenType.REAL_LITERAL);

		switch (current[0]) {
		case 'l':
		case 'L':
			if (type == TokenType.INTEGER_LITERAL) {
				current++;
				if (current < end && current[0].tolower () == 'l') {
					current++;
				}
			}
			break;
		case 'u':
		case 'U':
			if (type == TokenType.INTEGER_LITERAL) {
				current++;
				if (current < end && current[0].tolower () == 'l') {
					current++;
					if (current < end && current[0].tolower () == 'l') {
						current++;
					}
				}
			}
			break;
		case 'f':
		case 'F':
		case 'd':
		case 'D':
			type = TokenType.REAL_LITERAL;
			current++;
			break;
		}

		// the suffix may have consumed the last byte of the buffer, so
		// current has to be checked again before it is dereferenced
		if (!real_literal && current < end && is_ident_char (current[0])) {
			// allow identifiers to start with a digit
			// as long as they contain at least one char
			while (current < end && is_ident_char (current[0])) {
				current++;
			}
			type = TokenType.IDENTIFIER;
		}
	}

	return type;
}
/**
 * Reads the next token from the source buffer.
 *
 * Whitespace and comments in front of the token are skipped first.
 * Characters that cannot start a token are reported through
 * Report.error () and skipped, then scanning continues with the next
 * token.
 *
 * @param token_begin receives position, line and column of the first
 *                    character of the token
 * @param token_end   receives position, line and column of the end of
 *                    the token (column of its last character)
 * @return the type of the token, TokenType.EOF at the end of the buffer
 */
public TokenType read_token (out SourceLocation token_begin, out SourceLocation token_end) {
	space ();

	TokenType type;
	char* begin = current;
	token_begin.pos = begin;
	token_begin.line = line;
	token_begin.column = column;

	// number of columns covered by the token; -1 means "same as the
	// number of bytes", which holds for every pure ASCII token
	int token_length_in_chars = -1;

	if (current >= end) {
		type = TokenType.EOF;
	} else if (current[0].isalpha () || current[0] == '_') {
		int len = 0;
		while (current < end && is_ident_char (current[0])) {
			current++;
			len++;
		}
		type = get_identifier_or_keyword (begin, len);
	} else if (current[0] == '@') {
		token_begin.pos++; // @ is not part of the identifier
		current++;
		while (current < end && is_ident_char (current[0])) {
			current++;
		}
		// an @-prefixed word is always an identifier, never a keyword
		type = TokenType.IDENTIFIER;
	} else if (current[0].isdigit ()) {
		type = read_number ();
	} else {
		switch (current[0]) {
		case '{':
			type = TokenType.OPEN_BRACE;
			current++;
			break;
		case '}':
			type = TokenType.CLOSE_BRACE;
			current++;
			break;
		case '(':
			type = TokenType.OPEN_PARENS;
			current++;
			break;
		case ')':
			type = TokenType.CLOSE_PARENS;
			current++;
			break;
		case '[':
			type = TokenType.OPEN_BRACKET;
			current++;
			break;
		case ']':
			type = TokenType.CLOSE_BRACKET;
			current++;
			break;
		case '.':
			type = TokenType.DOT;
			current++;
			if (current < end - 1) {
				if (current[0] == '.' && current[1] == '.') {
					type = TokenType.ELLIPSIS;
					current += 2;
				}
			}
			break;
		case ':':
			type = TokenType.COLON;
			current++;
			if (current < end && current[0] == ':') {
				type = TokenType.DOUBLE_COLON;
				current++;
			}
			break;
		case ',':
			type = TokenType.COMMA;
			current++;
			break;
		case ';':
			type = TokenType.SEMICOLON;
			current++;
			break;
		case '#':
			type = TokenType.HASH;
			current++;
			break;
		case '?':
			type = TokenType.INTERR;
			current++;
			break;
		case '|':
			type = TokenType.BITWISE_OR;
			current++;
			if (current < end) {
				switch (current[0]) {
				case '=':
					type = TokenType.ASSIGN_BITWISE_OR;
					current++;
					break;
				case '|':
					type = TokenType.OP_OR;
					current++;
					break;
				}
			}
			break;
		case '&':
			type = TokenType.BITWISE_AND;
			current++;
			if (current < end) {
				switch (current[0]) {
				case '=':
					type = TokenType.ASSIGN_BITWISE_AND;
					current++;
					break;
				case '&':
					type = TokenType.OP_AND;
					current++;
					break;
				}
			}
			break;
		case '^':
			type = TokenType.CARRET;
			current++;
			if (current < end && current[0] == '=') {
				type = TokenType.ASSIGN_BITWISE_XOR;
				current++;
			}
			break;
		case '~':
			type = TokenType.TILDE;
			current++;
			break;
		case '=':
			type = TokenType.ASSIGN;
			current++;
			if (current < end) {
				switch (current[0]) {
				case '=':
					type = TokenType.OP_EQ;
					current++;
					break;
				case '>':
					type = TokenType.LAMBDA;
					current++;
					break;
				}
			}
			break;
		case '<':
			type = TokenType.OP_LT;
			current++;
			if (current < end) {
				switch (current[0]) {
				case '=':
					type = TokenType.OP_LE;
					current++;
					break;
				case '<':
					type = TokenType.OP_SHIFT_LEFT;
					current++;
					if (current < end && current[0] == '=') {
						type = TokenType.ASSIGN_SHIFT_LEFT;
						current++;
					}
					break;
				}
			}
			break;
		case '>':
			type = TokenType.OP_GT;
			current++;
			if (current < end && current[0] == '=') {
				type = TokenType.OP_GE;
				current++;
			}
			break;
		case '!':
			type = TokenType.OP_NEG;
			current++;
			if (current < end && current[0] == '=') {
				type = TokenType.OP_NE;
				current++;
			}
			break;
		case '+':
			type = TokenType.PLUS;
			current++;
			if (current < end) {
				switch (current[0]) {
				case '=':
					type = TokenType.ASSIGN_ADD;
					current++;
					break;
				case '+':
					type = TokenType.OP_INC;
					current++;
					break;
				}
			}
			break;
		case '-':
			type = TokenType.MINUS;
			current++;
			if (current < end) {
				switch (current[0]) {
				case '=':
					type = TokenType.ASSIGN_SUB;
					current++;
					break;
				case '-':
					type = TokenType.OP_DEC;
					current++;
					break;
				case '>':
					type = TokenType.OP_PTR;
					current++;
					break;
				}
			}
			break;
		case '*':
			type = TokenType.STAR;
			current++;
			if (current < end && current[0] == '=') {
				type = TokenType.ASSIGN_MUL;
				current++;
			}
			break;
		case '/':
			type = TokenType.DIV;
			current++;
			if (current < end && current[0] == '=') {
				type = TokenType.ASSIGN_DIV;
				current++;
			}
			break;
		case '%':
			type = TokenType.PERCENT;
			current++;
			if (current < end && current[0] == '=') {
				type = TokenType.ASSIGN_PERCENT;
				current++;
			}
			break;
		case '\'':
		case '"':
			if (begin[0] == '\'') {
				type = TokenType.CHARACTER_LITERAL;
			} else if (current < end - 6 && begin[1] == '"' && begin[2] == '"') {
				type = TokenType.VERBATIM_STRING_LITERAL;
				// opening and closing triple quotes
				token_length_in_chars = 6;
				current += 3;
				while (current < end - 4) {
					if (current[0] == '"' && current[1] == '"' && current[2] == '"') {
						break;
					} else if (current[0] == '\n') {
						current++;
						line++;
						column = 1;
						// only the closing quotes remain to be counted on the new line
						token_length_in_chars = 3;
					} else {
						unichar u = ((string) current).get_char_validated ((long) (end - current));
						if (u != (unichar) (-1)) {
							current += u.to_utf8 (null);
							token_length_in_chars++;
						} else {
							Report.error (new SourceReference (source_file, line, column + token_length_in_chars, line, column + token_length_in_chars), "invalid UTF-8 character");
							// skip the offending byte, otherwise the loop never terminates
							current++;
							token_length_in_chars++;
						}
					}
				}
				// a multi-byte character may have moved current closer than
				// three bytes to the end, so check the bounds before peeking
				if (current <= end - 3 && current[0] == '"' && current[1] == '"' && current[2] == '"') {
					current += 3;
				} else {
					Report.error (new SourceReference (source_file, line, column + token_length_in_chars, line, column + token_length_in_chars), "syntax error, expected \"\"\"");
				}
				break;
			} else {
				type = TokenType.STRING_LITERAL;
			}
			// character and string literals share the scanning code below,
			// begin[0] is the delimiter that has to close the literal
			token_length_in_chars = 2;
			current++;
			while (current < end && current[0] != begin[0]) {
				if (current[0] == '\\') {
					current++;
					token_length_in_chars++;
					if (current < end && current[0] == 'x') {
						// hexadecimal escape character
						current++;
						token_length_in_chars++;
						while (current < end && current[0].isxdigit ()) {
							current++;
							token_length_in_chars++;
						}
					} else if (current < end) {
						// any other escaped character; the bounds check keeps
						// a trailing backslash from moving current past end
						current++;
						token_length_in_chars++;
					}
				} else if (current[0] == '\n') {
					break;
				} else {
					unichar u = ((string) current).get_char_validated ((long) (end - current));
					if (u != (unichar) (-1)) {
						current += u.to_utf8 (null);
						token_length_in_chars++;
					} else {
						Report.error (new SourceReference (source_file, line, column + token_length_in_chars, line, column + token_length_in_chars), "invalid UTF-8 character");
						// skip the offending byte, otherwise the loop never terminates
						current++;
						token_length_in_chars++;
					}
				}
			}
			if (current < end && current[0] != '\n') {
				// consume the closing delimiter
				current++;
			} else {
				Report.error (new SourceReference (source_file, line, column + token_length_in_chars, line, column + token_length_in_chars), "syntax error, expected %c".printf (begin[0]));
			}
			break;
		default:
			unichar u = ((string) current).get_char_validated ((long) (end - current));
			if (u != (unichar) (-1)) {
				current += u.to_utf8 (null);
				Report.error (new SourceReference (source_file, line, column, line, column), "syntax error, unexpected character");
			} else {
				current++;
				Report.error (new SourceReference (source_file, line, column, line, column), "invalid UTF-8 character");
			}
			// skip the reported character and try again with the next token
			column++;
			return read_token (out token_begin, out token_end);
		}
	}

	if (token_length_in_chars < 0) {
		column += (int) (current - begin);
	} else {
		column += token_length_in_chars;
	}

	token_end.pos = current;
	token_end.line = line;
	token_end.column = column - 1;

	return type;
}
/**
 * Compares the characters at begin with the given keyword.
 *
 * Exactly keyword.len () characters are compared, the caller has to
 * make sure that many characters are readable at begin.
 *
 * @return true if all characters of keyword are found at begin
 */
bool matches (char* begin, string keyword) {
	char* expected = keyword;
	char* actual = begin;
	long remaining = keyword.len ();

	while (remaining > 0) {
		if (actual[0] != expected[0]) {
			return false;
		}
		actual++;
		expected++;
		remaining--;
	}

	return true;
}
/**
 * Skips a run of whitespace characters at the current position and
 * keeps line and column up to date.
 *
 * @return true if at least one whitespace character was skipped
 */
bool whitespace () {
	char* start = current;

	while (current < end && current[0].isspace ()) {
		if (current[0] == '\n') {
			// the character after a newline is in column 1 of the next line
			line++;
			column = 1;
		} else {
			column++;
		}
		current++;
	}

	return current != start;
}
/**
 * Skips one comment at the current position, if there is one.
 *
 * The text between the comment delimiters is handed to push_comment ().
 * A delimited comment that is not closed before the end of the buffer is
 * reported through Report.error ().
 *
 * @return true if the current position was at the start of a comment,
 *         false otherwise
 */
bool comment () {
	if (current > end - 2
	    || current[0] != '/'
	    || (current[1] != '/' && current[1] != '*')) {
		return false;
	}

	if (current[1] == '/') {
		// single-line comment
		current += 2;
		char* begin = current;
		// skip until end of line or end of file; column needs no update
		// here because the following newline resets it
		while (current < end && current[0] != '\n') {
			current++;
		}
		push_comment (((string) begin).ndup ((long) (current - begin)), line == 1);
	} else {
		// delimited comment
		current += 2;
		// account for the opening delimiter so that tokens following the
		// comment on the same line get the right column
		column += 2;
		char* begin = current;
		int begin_line = line;
		while (current < end - 1
		       && (current[0] != '*' || current[1] != '/')) {
			if (current[0] == '\n') {
				line++;
				column = 0;
			}
			current++;
			column++;
		}
		// fewer than two bytes left means the closing delimiter is missing;
		// >= also covers an opening delimiter in the last two bytes of the
		// buffer, where current already equals end
		if (current >= end - 1) {
			Report.error (new SourceReference (source_file, line, column, line, column), "syntax error, expected */");
			return true;
		}
		push_comment (((string) begin).ndup ((long) (current - begin)), begin_line == 1);
		current += 2;
		column += 2;
	}

	return true;
}
/**
 * Skips any mix of whitespace and comments at the current position.
 */
void space () {
	while (true) {
		if (whitespace ()) {
			continue;
		}
		if (comment ()) {
			continue;
		}
		// neither whitespace nor a comment was found
		break;
	}
}
/**
 * Appends a comment to the pending comment text, separated from earlier
 * comments by a newline.
 *
 * @param comment_item text of the comment
 * @param file_comment if true, the accumulated text is stored as the
 *                     comment of the source file and the pending text
 *                     is cleared
 */
void push_comment (string comment_item, bool file_comment) {
	if (_comment != null) {
		_comment = "%s\n%s".printf (_comment, comment_item);
	} else {
		_comment = comment_item;
	}

	if (!file_comment) {
		return;
	}

	source_file.comment = _comment;
	_comment = null;
}
/**
 * Clears and returns the content of the comment stack.
 *
 * All tab characters are removed from the returned text.
 *
 * @return saved comment, or null if no comment is pending
 */
public string? pop_comment () {
	if (_comment == null) {
		return null;
	}

	string pending = _comment;
	_comment = null;

	// copy byte by byte and leave out tabs; a tab is a single ASCII byte,
	// so multi-byte UTF-8 sequences are copied unchanged
	var stripped = new StringBuilder ();
	for (char* cursor = pending; cursor[0] != '\0'; cursor++) {
		if (cursor[0] != '\t') {
			stripped.append_c (cursor[0]);
		}
	}

	return stripped.str;
}