switched to GPLv3 ONLY, because i don't trust FSF anymore
[gaemu.git] / gaem / parser / lexer.d
blob3da0db8d204a9cd807cfff914466e6c3ea003aa3
1 /* GML parser
2 * coded by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
3 * Understanding is not required. Only obedience.
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, version 3 of the License ONLY.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 module gaem.parser.lexer is aliced;
19 import gaem.parser.tokens;
22 // ////////////////////////////////////////////////////////////////////////// //
/// Position inside a source text: file name, line, column and offset.
public struct Loc {
  string file; // source file name (may be empty)
  int line, col; // line and column numbers
  uint tpos; // offset in the source text

  /// Render as `file (line,col)`.
  string toString () const {
    import std.string : format;
    return format("%s (%s,%s)", file, line, col);
  }

  /// Render as `(line,col)`, omitting the file name.
  string toStringNoFile () const {
    import std.string : format;
    return format("(%s,%s)", line, col);
  }

  /// A location is valid when both line and column are positive.
  @property bool valid () const pure nothrow @safe @nogc { pragma(inline, true); return (col > 0 && line > 0); }
}
35 // ////////////////////////////////////////////////////////////////////////// //
/// Exception that optionally carries the source location of the problem.
public class ErrorAt : Exception {
  Loc loc; // where the error happened; stays `Loc.init` for the location-less constructor

  /// Create an error without location info.
  this (string msg, Throwable next=null, string file=__FILE__, usize line=__LINE__) pure nothrow @safe @nogc {
    super(msg, file, line, next);
  }

  /// Create an error bound to source location `aloc`.
  this (in Loc aloc, string msg, Throwable next=null, string file=__FILE__, usize line=__LINE__) pure nothrow @safe @nogc {
    super(msg, file, line, next);
    loc = aloc;
  }
}
44 // ////////////////////////////////////////////////////////////////////////// //
/// One lexeme: its type, its source span, its text and (for keywords/numbers) its payload.
public struct Token {
public:
  /// Token categories; a default-initialized token is `EOF`.
  enum Type {
    EOF = -1,
    Kw,
    Id,
    Str,
    Num,
    Spec,
  }

private:
  const(char)[] tkstr; // token text (filled in by the lexer)

public:
  Loc loc, eloc; // token start, token end
  Type type = Type.EOF; // token type
  // payload: keyword id or numeric value
  union {
    Keyword kw;
    float num;
  }

@safe:
  /// Throw `ErrorAt` (positioned at token start) unless the token has type `tp`.
  void mustbeType (Token.Type tp, string msg="identifier expected", string file=__FILE__, usize line=__LINE__) {
    pragma(inline, true);
    if (!(type == tp)) throw new ErrorAt(loc, msg, null, file, line);
  }

  /// Throw unless this is an identifier.
  void mustbeId (string msg="identifier expected", string file=__FILE__, usize line=__LINE__) {
    pragma(inline, true);
    mustbeType(Type.Id, msg, file, line);
  }

  /// Throw unless this is a string.
  void mustbeStr (string msg="string expected", string file=__FILE__, usize line=__LINE__) {
    pragma(inline, true);
    mustbeType(Type.Str, msg, file, line);
  }

  /// Throw unless this is a number.
  void mustbeNum (string msg="number expected", string file=__FILE__, usize line=__LINE__) {
    pragma(inline, true);
    mustbeType(Type.Num, msg, file, line);
  }

  /// Human-readable dump: `(line,col): ` followed by a type-specific description.
  string toString () const @trusted {
    import std.string : format;
    final switch (type) with (Type) {
      case EOF:
        return format("(%s,%d): <EOF>", loc.line, loc.col);
      case Kw:
        return format("(%s,%d): kw.%s <%s>", loc.line, loc.col, kw, tkstr);
      case Id:
        return format("(%s,%d): Id:%s", loc.line, loc.col, tkstr);
      case Str:
        return format("(%s,%d): Str:%s", loc.line, loc.col, Lexer.quote(tkstr));
      case Num:
        return format("(%s,%d): Num:%s", loc.line, loc.col, num);
      case Spec:
        return format("(%s,%d): Spec:<%s>", loc.line, loc.col, tkstr);
    }
    assert(0);
  }

nothrow:
  // token text as an immutable string
  // the text is copied with `.idup` so the result doesn't anchor the whole source string;
  // empty text yields `null`
  @property string istr () { pragma(inline, true); return (tkstr.length == 0 ? null : tkstr.idup); }

const pure nothrow @nogc:
  /// `tk == Keyword.X`: is this the given keyword?
  bool opEquals (Keyword akw) { pragma(inline, true); return (kw == akw && type == Type.Kw); }
  /// Is this the given keyword?
  bool isKw (Keyword akw) { pragma(inline, true); return (kw == akw && type == Type.Kw); }
  /// Is this any keyword?
  bool isKw () { pragma(inline, true); return (Type.Kw == type); }

@property:
  /// Token text (a slice, not a copy).
  const(char)[] str () { pragma(inline, true); return tkstr; }
  /// Keyword id, or `Keyword.NoKW` for non-keyword tokens.
  Keyword Kw () { pragma(inline, true); return (type != Type.Kw ? Keyword.NoKW : kw); }
  bool isId () { pragma(inline, true); return (Type.Id == type); }
  bool isStr () { pragma(inline, true); return (Type.Str == type); }
  bool isNum () { pragma(inline, true); return (Type.Num == type); }
  bool isSpec () { pragma(inline, true); return (Type.Spec == type); }
  bool isEOF () { pragma(inline, true); return (Type.EOF == type); }
}
111 // ////////////////////////////////////////////////////////////////////////// //
112 public final class Lexer {
113 private:
114 const(char)[] text;
115 uint tpos;
116 Loc cpos; // position for last `getChar()`
117 Loc pend; // end of previous token, for better error messages
118 bool eof;
119 bool lastWasEOL = true;
120 Token[] lookup;
121 Token tokeof; // will be fixed by `nextToken()`
123 public:
/// Create a lexer over `atext` and read the first token.
///
/// Params:
///   atext = source text to tokenize (not copied)
///   afname = optional file name used in locations; copied with `.idup`
///            unless it already is a `string`
///
/// `T` defaults to `string`, so the file name can be omitted:
/// the type of `afname` can't be deduced from its `null` default value.
///
/// May throw `ErrorAt` if the first token is malformed.
this(T=string) (const(char)[] atext, T afname=null) if (is(T : const(char)[])) {
  text = atext;
  if (afname.length > 0) {
    static if (is(T == string)) cpos.file = afname; else cpos.file = afname.idup;
  }
  tokeof.loc.file = cpos.file;
  nextToken();
  // "end of previous token" starts at the very beginning of the text;
  // it carries the file name too, so locations built from it are complete
  pend.file = cpos.file;
  pend.line = 1;
  pend.col = 1;
  pend.tpos = 0;
}
/// Throw `ErrorAt` positioned at the first buffered token
/// (or at `loc` when the lookahead buffer is empty).
void error (string msg, string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  throw new ErrorAt((lookup.length != 0 ? lookup[0].loc : loc), msg, null, file, line);
}

/// Throw `ErrorAt` positioned at the start of token `tk`.
static private void error (in ref Token tk, string msg, string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  throw new ErrorAt(tk.loc, msg, null, file, line);
}

/// Throw `ErrorAt` positioned at `loc`.
static private void error() (in auto ref Loc loc, string msg, string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  throw new ErrorAt(loc, msg, null, file, line);
}
/// Return the text of source line number `idx`, with trailing blanks
/// and control chars (everything `<= ' '`) stripped.
///
/// Line numbers start at 1; 0 is treated as 1.
/// Returns `null` when the line starts at or beyond the end of the text.
/// The result is a slice of the source text, not a copy.
const(char)[] line (uint idx) {
  if (idx == 0) ++idx;
  uint pos = 0;
  // skip `idx-1` lines
  while (--idx > 0) {
    while (pos < text.length && text.ptr[pos] != '\n') ++pos;
    ++pos;
    // ran off the text: no such line, and no reason to keep counting
    if (pos >= text.length) return null;
  }
  if (pos >= text.length) return null;
  // find the end of the line
  uint epos = pos;
  while (epos < text.length && text.ptr[epos] != '\n') ++epos;
  // drop trailing blanks
  while (epos > pos && text.ptr[epos-1] <= ' ') --epos;
  return text[pos..epos];
}
/// Drop the current token (if any) and read one more token from the source.
void popFront () {
  if (lookup.length != 0) {
    // remember where the dropped token ended
    pend = lookup.ptr[0].eloc;
    ++pend.col; // for better error messages
    ++pend.tpos; // to be consistent
    // shift the remaining lookahead down by one slot
    immutable usize last = lookup.length-1;
    foreach (immutable idx; 0..last) lookup.ptr[idx] = lookup.ptr[idx+1];
    lookup.length = last;
    lookup.assumeSafeAppend;
  }
  nextToken();
}
@property pure nothrow @safe @nogc {
  /// `true` when there are no buffered tokens.
  bool empty () const {
    pragma(inline, true);
    return (lookup.length < 1);
  }

  /// Current token: first buffered one, or the EOF token when the buffer is empty.
  ref inout(Token) front () inout {
    pragma(inline, true);
    return (lookup.length == 0 ? tokeof : lookup.ptr[0]);
  }

  /// Start of the current token.
  auto loc () const {
    pragma(inline, true);
    return front.loc;
  }

  /// End of the current token.
  auto eloc () const {
    pragma(inline, true);
    return front.eloc;
  }

  /// End of the previous token (as recorded by `popFront()`).
  auto peloc () const {
    pragma(inline, true);
    return pend;
  }

  // type checks for the current token
  bool isId () const {
    pragma(inline, true);
    return front.isId;
  }

  bool isStr () const {
    pragma(inline, true);
    return front.isStr;
  }

  bool isNum () const {
    pragma(inline, true);
    return front.isNum;
  }

  bool isSpec () const {
    pragma(inline, true);
    return front.isSpec;
  }
}
/// Is the current token the keyword `kw`?
bool isKw (Keyword kw) const pure nothrow @safe @nogc {
  pragma(inline, true);
  return front.isKw(kw);
}

/// Is the current token any keyword?
bool isKw () const pure nothrow @safe @nogc {
  pragma(inline, true);
  return front.isKw();
}

/// `lexer == Keyword.X`: compares the current token with the keyword.
bool opEquals (Keyword kw) const pure nothrow @safe @nogc {
  pragma(inline, true);
  return front.opEquals(kw);
}
/// Require the current token to be keyword `kw` and consume it.
/// Throws `ErrorAt` at the current token when it is something else.
void expect (Keyword kw, string file=__FILE__, usize line=__LINE__) {
  immutable bool matched = front.isKw(kw);
  if (!matched) {
    error(loc, "`"~keywordtext(kw)~"` expected", file, line);
  }
  popFront();
}
/// Require an identifier, consume it and return its text.
/// The text is copied via `.idup` (so the result doesn't anchor the whole
/// source string), use with caution!
/// Throws `ErrorAt` with `msg` when the current token is not an identifier.
string expectId (string msg="identifier expected", string file=__FILE__, usize line=__LINE__) {
  mustbeId(msg, file, line);
  string ident = lookup[0].istr;
  popFront();
  return ident;
}
/// Require a string token, consume it and return its contents.
/// The text is copied via `.idup` (so the result doesn't anchor the whole
/// source string), use with caution!
/// Throws `ErrorAt` with `msg` when the current token is not a string.
string expectStr (string msg="string expected", string file=__FILE__, usize line=__LINE__) {
  mustbeStr(msg, file, line);
  string contents = lookup[0].istr;
  popFront();
  return contents;
}
// the `mustbe*` family only checks the current token, it never consumes it;
// each one throws `ErrorAt` with `msg` on type mismatch

/// Check that the current token has type `tp`.
void mustbeType (Token.Type tp, string msg="identifier expected", string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  front.mustbeType(tp, msg, file, line);
}

/// Check that the current token is an identifier.
void mustbeId (string msg="identifier expected", string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  front.mustbeId(msg, file, line);
}

/// Check that the current token is a string.
void mustbeStr (string msg="string expected", string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  front.mustbeStr(msg, file, line);
}

/// Check that the current token is a number.
void mustbeNum (string msg="number expected", string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  front.mustbeNum(msg, file, line);
}
/// Consume the current token if it is keyword `kw`.
/// Returns `true` when the keyword was consumed, `false` (and no change) otherwise.
bool eatKw (Keyword kw) {
  immutable bool hit = isKw(kw);
  if (hit) popFront();
  return hit;
}
/// Look `dist` tokens ahead (0 is the current token), reading more tokens as needed.
/// Returns the EOF token when the source ends before that distance.
ref Token peek (uint dist) {
  for (;;) {
    if (eof || dist < lookup.length) break;
    nextToken();
  }
  if (dist >= lookup.length) return tokeof;
  return lookup.ptr[dist];
}
/// `lexer[dist]` is the same as `lexer.peek(dist)`.
ref Token opIndex (usize dist) {
  pragma(inline, true);
  return peek(dist);
}
/// Location that the next `getChar()` call will report
/// (computed from the current position; nothing is consumed).
Loc nextLoc () nothrow @safe @nogc {
  auto next = cpos;
  if (!lastWasEOL) {
    ++next.col;
  } else {
    // previous char ended the line: next one starts a new line
    ++next.line;
    next.col = 1;
  }
  return next;
}
/// Look at the char `dist` positions ahead without consuming anything.
/// Returns 0 past the end of the text; a zero char inside the text is reported as space.
char peekChar (uint dist=0) nothrow @trusted @nogc {
  pragma(inline, true);
  return (tpos+dist < text.length ? (text.ptr[tpos+dist] != 0 ? text.ptr[tpos+dist] : ' ') : '\0');
}
/// Consume and return the next char, updating `cpos`.
/// Returns 0 (and raises the `eof` flag) at the end of the text;
/// a zero char inside the text is returned as space.
char getChar () nothrow @trusted @nogc {
  if (tpos >= text.length) { eof = true; tpos = text.length; }
  if (eof) return '\0';
  immutable here = tpos++;
  cpos.tpos = here;
  // advance line/column: the char after a newline opens a new line
  if (lastWasEOL) {
    cpos.col = 1;
    ++cpos.line;
  } else {
    ++cpos.col;
  }
  char ch = text.ptr[here];
  if (ch == '\0') ch = ' ';
  lastWasEOL = (ch == '\n');
  return ch;
}
/// Skip whitespace (every char with code <= 32), `//` comments and `/* */` comments.
/// Stops at the first other char or at the end of the text.
/// Throws `ErrorAt` (positioned at the opening slash) for an unterminated `/* */` comment.
//TODO: make special "comment" token(s)?
void skipBlanks () @safe {
  for (;;) {
    immutable char cur = peekChar;
    if (cur == '/') {
      immutable char nxt = peekChar(1);
      if (nxt == '/') {
        // single-line comment: eat everything up to and including the newline
        for (;;) {
          immutable char c = getChar();
          if (c == 0 || c == '\n') break;
        }
        continue;
      }
      if (nxt == '*') {
        // multiline comment
        getChar(); // skip slash
        auto startLoc = cpos;
        getChar(); // skip star
        // `prev` starts as a blank, so the star of the opener can't close the comment
        char prev = ' ';
        for (;;) {
          immutable char c = getChar();
          if (c == 0) error(startLoc, "unterminated comment");
          if (c == '/' && prev == '*') break;
          prev = c;
        }
        continue;
      }
      // lone slash: not a comment, falls through and stops the skipping
    }
    if (cur == 0 || cur > 32) return;
    getChar();
  }
}
/// Read one token from the source and append it to the lookahead buffer.
///
/// Does nothing once `eof` is set. When only blanks/comments are left,
/// sets `eof` and fixes the location of the EOF token instead.
///
/// Recognizes, in this order: quoted strings (single or double quotes,
/// no escapes), `$`-prefixed hex numbers, decimal numbers (with optional
/// fraction and exponent), identifiers/keywords, and delimiters (the
/// longest keyword made of consecutive chars; any other char becomes `Spec`).
///
/// Token's `eloc` is the location of the last char consumed for the token.
///
/// Throws `ErrorAt` for unterminated strings/comments and malformed numbers.
private void nextToken () {
  if (eof) return;

  skipBlanks();
  if (peekChar == '\0') {
    eof = true;
    tokeof.loc = cpos;
    tokeof.eloc = cpos;
    return;
  }

  Token tk;
  auto tkspos = tpos;
  char ch = getChar();
  tk.loc = cpos;

  // quoted string
  if (ch == '"' || ch == '\'') {
    char ech = ch;
    tk.type = Token.Type.Str;
    ++tkspos; // skip quote
    for (;;) {
      ch = getChar();
      if (ch == 0) error(tk, "unterminated string");
      if (ch == ech) break;
    }
    tk.tkstr = text[tkspos..tpos-1]; // -1 due to eaten quote
    tk.eloc = cpos;
    lookup ~= tk;
    return;
  }

  // hex number
  if (ch == '$') {
    float n = 0;
    tk.type = Token.Type.Num;
    // the dollar is already eaten by the `getChar()` above,
    // so the next char must be the first hex digit
    int dv = digitValue(peekChar);
    if (dv < 0 || dv > 15) error(tk, "hex number expected");
    for (;;) {
      dv = digitValue(peekChar);
      if (dv < 0 || dv > 15) break;
      n = n*16+dv;
      getChar();
    }
    ch = peekChar;
    if (isIdChar(ch) || ch == '.') error(tk, "hex number expected");
    tk.num = n;
    tk.tkstr = text[tkspos..tpos];
    tk.eloc = cpos;
    lookup ~= tk;
    return;
  }

  // number
  if (isDigit(ch) || (ch == '.' && isDigit(peekChar))) {
    float n = 0;
    tk.type = Token.Type.Num;
    if (ch != '.') {
      // integral part
      n = ch-'0';
      for (;;) {
        if (!isDigit(peekChar)) break;
        ch = getChar();
        n = n*10+ch-'0';
      }
      if (peekChar == '.') ch = getChar();
    }
    if (ch == '.') {
      // fractional part
      if (!isDigit(peekChar)) error(tk, "real number expected");
      float div = 1;
      for (;;) {
        if (!isDigit(peekChar)) break;
        ch = getChar();
        div /= 10;
        n += div*(ch-'0');
      }
    }
    if (peekChar == 'e' || peekChar == 'E') {
      // exponent
      getChar();
      bool neg = false;
      if (peekChar == '+') getChar(); else if (peekChar == '-') { getChar(); neg = true; }
      if (!isDigit(peekChar)) error(tk, "invalid number");
      int e = 0;
      while (isDigit(peekChar)) {
        ch = getChar();
        // check before accumulating: a wrapped `int` may end up positive again
        if (e > (int.max-(ch-'0'))/10) error(tk, "invalid number (exponent overflow)");
        e = e*10+(ch-'0');
      }
      // `n` is never negative here; once it becomes zero or infinity
      // further scaling can't change it, so stop instead of spinning
      if (neg) {
        while (e-- > 0 && n != 0 && n != float.infinity) n = n/10;
      } else {
        while (e-- > 0 && n != 0 && n != float.infinity) n = n*10;
      }
    }
    tk.num = n;
    tk.tkstr = text[tkspos..tpos];
    tk.eloc = cpos;
    ch = peekChar;
    if (isIdChar(ch) || ch == '.') error(tk, "invalid number");
    lookup ~= tk;
    return;
  }

  // identifier
  if (isIdStart(ch)) {
    tk.type = Token.Type.Id;
    while (isIdChar(peekChar)) getChar();
    tk.tkstr = text[tkspos..tpos];
    tk.eloc = cpos;
    // identifiers found in the keyword table become keywords
    if (auto kw = tk.tkstr in keywords) {
      tk.type = Token.Type.Kw;
      tk.kw = *kw;
    }
    lookup ~= tk;
    return;
  }

  // delimiter
  char[5] dbuf;
  dbuf[0] = ch;
  if (auto xkw = dbuf[0..1] in keywords) {
    tk.type = Token.Type.Kw;
    tk.kw = *xkw;
    // extend the delimiter char by char while the result is still a keyword
    foreach (uint dpos; 1..dbuf.length) {
      dbuf[dpos] = peekChar;
      if (auto kw = dbuf[0..dpos+1] in keywords) {
        tk.type = Token.Type.Kw;
        tk.kw = *kw;
        getChar(); // eat token char
      } else {
        break;
      }
    }
  } else {
    tk.type = Token.Type.Spec;
  }
  tk.tkstr = text[tkspos..tpos];
  tk.eloc = cpos;
  lookup ~= tk;
}
/// Shortcut for `selectN` with distance 0, i.e. dispatching on the current token.
auto select(RetType, string mode="peek", A...) (scope A args) {
  pragma(inline, true);
  return selectN!(RetType, mode)(0, args);
}
/// Dispatch on the token `n` positions ahead.
///
/// `args` is a list of `selector, handler` pairs, optionally followed by
/// a default handler. A selector is either a `Keyword` or a `Token.Type`;
/// the first selector matching the token wins and its handler is called.
/// An argument in selector position that is neither `Keyword` nor
/// `Token.Type` is treated as the default handler and is always called
/// when reached.
///
/// A handler is a callable taking a `Token`, a `Loc`, a `Token.Type`,
/// a `Keyword`, or nothing. Its result is cast to `RetType`; for a `void`
/// handler `RetType.init` is returned.
///
/// `mode` controls `popFront()`:
///   "peek" = never pop;
///   "pop" = pop before calling any handler, default one included;
///   "pop-nondefault" = pop before calling a matched handler, but not the default one.
///
/// Throws `ErrorAt` ("selectN is out of nodes") when nothing matched
/// and there is no default handler.
auto selectN(RetType, string mode="peek", A...) (usize n, scope A args) {
  import std.traits : ReturnType;

  static assert(mode == "peek" || mode == "pop" || mode == "pop-nondefault", "selectN: invalid mode: '"~mode~"'");

  // is `args[idx]` a callable with exactly one parameter of type `T`?
  template isGoodDg(usize idx, T) {
    private import std.traits;
    static if (idx < A.length && isCallable!(A[idx]) && arity!(args[idx]) == 1) {
      enum isGoodDg = is(Parameters!(A[idx])[0] == T);
    } else {
      enum isGoodDg = false;
    }
  }

  // is `args[idx]` a callable without parameters?
  template isGoodArglessDg(usize idx) {
    private import std.traits;
    static if (idx < A.length && isCallable!(A[idx]) && arity!(args[idx]) == 0) {
      enum isGoodArglessDg = true;
    } else {
      enum isGoodArglessDg = false;
    }
  }

  // code that calls `args[xidx]` with `arg` and returns the result
  // sorry, but this has to be string mixin, due to possible empty `arg`
  enum DoCallDg(string arg) =
    "static if (!is(ReturnType!(A[xidx]) == void)) return cast(RetType)(args[xidx]("~arg~")); else { args[xidx]("~arg~"); return RetType.init; }";

  // code that picks the right argument for handler `xidx` by its parameter type
  // we can't have inner mixin templates, so... sorry, it's string again
  enum CallDg = q{
         static if (isGoodDg!(xidx, Token)) { mixin(DoCallDg!"tk"); }
    else static if (isGoodDg!(xidx, Loc)) { mixin(DoCallDg!"tk.loc"); }
    else static if (isGoodDg!(xidx, Token.Type)) { mixin(DoCallDg!"tk.type"); }
    else static if (isGoodDg!(xidx, Keyword)) { mixin(DoCallDg!"tk.Kw"); }
    else static if (isGoodArglessDg!(xidx)) { mixin(DoCallDg!""); }
    else static assert(0, "selectN: invalid delegate #"~xidx.stringof);
  };

  // `auto` makes a copy here, so handlers still get this token after `popFront()`
  auto tk = peek(n);
  bool found = false;
  foreach (immutable aidx, auto arg; args) {
    // only even positions are inspected; odd ones are handlers of the preceding selector
    static if (aidx%2 == 0) {
      static if (is(typeof(arg) == Keyword) || is(typeof(arg) == Token.Type)) {
        // selector: compare it with the token
             static if (is(typeof(arg) == Keyword)) found = (tk == arg);
        else static if (is(typeof(arg) == Token.Type)) found = (tk.type == arg);
        else static assert(0, "wtf?!");
        if (found) {
          // process `mode`
          static if (mode != "peek") popFront();
          // call delegate
          enum xidx = aidx+1;
          mixin(CallDg);
        }
      } else {
        // default
        // process `mode`
        static if (mode == "pop") popFront();
        // call delegate
        enum xidx = aidx;
        mixin(CallDg);
      }
    }
  }
  // nothing matched, and there was no default handler
  error(tk, "selectN is out of nodes");
  assert(0);
}
518 static:
// value of a char as a digit: '0'..'9' are 0..9, letters (any case) are 10..35, -1 for everything else
private immutable byte[256] digitValues = () {
  byte[256] tbl = -1;
  foreach (immutable ofs; 0..10) tbl['0'+ofs] = cast(byte)ofs;
  foreach (immutable ofs; 0..26) {
    tbl['A'+ofs] = cast(byte)(ofs+10);
    tbl['a'+ofs] = cast(byte)(ofs+10);
  }
  return tbl;
}();

// chars that can start an identifier: latin letters and underscore
private immutable bool[256] idStartChars = () {
  bool[256] tbl = false;
  tbl['_'] = true;
  foreach (immutable ofs; 0..26) {
    tbl['A'+ofs] = true;
    tbl['a'+ofs] = true;
  }
  return tbl;
}();

// chars that can continue an identifier: latin letters, decimal digits and underscore
private immutable bool[256] idChars = () {
  bool[256] tbl = false;
  tbl['_'] = true;
  foreach (immutable ofs; 0..10) tbl['0'+ofs] = true;
  foreach (immutable ofs; 0..26) {
    tbl['A'+ofs] = true;
    tbl['a'+ofs] = true;
  }
  return tbl;
}();

/// Is `ch` a decimal digit?
bool isDigit() (char ch) { pragma(inline, true); return !(ch < '0' || ch > '9'); }
/// Digit value of `ch` (0..35), or -1 when `ch` is neither a digit nor a latin letter.
int digitValue() (char ch) { pragma(inline, true); return digitValues[cast(ubyte)ch]; }
/// Can `ch` start an identifier?
bool isIdStart() (char ch) { pragma(inline, true); return idStartChars[cast(ubyte)ch]; }
/// Can `ch` continue an identifier?
bool isIdChar() (char ch) { pragma(inline, true); return idChars[cast(ubyte)ch]; }
/// Build an expression that yields `s`: runs of ordinary chars become
/// double-quoted literals, while control chars (code < 32), char 127 and
/// the double quote become `chr(code)` calls; the pieces are joined with `+`.
/// An empty string gives `""`. Allocates.
string gmlQuote (const(char)[] s) {
  import std.array : appender;
  import std.conv : to;
  auto outbuf = appender!string();
  bool anyOutput = false; // was anything emitted yet?
  bool inLiteral = false; // is a quoted literal currently open?
  foreach (char ch; s) {
    immutable bool special = (ch < ' ' || ch == 127 || ch == '"');
    if (special) {
      // close the open literal (if any) and chain the `chr()` call
      if (inLiteral) outbuf.put(`"+`);
      else if (anyOutput) outbuf.put(`+`);
      outbuf.put("chr(");
      outbuf.put(to!string(cast(uint)ch));
      outbuf.put(")");
      inLiteral = false;
    } else {
      // open a literal unless we are already inside of one
      if (!anyOutput) outbuf.put('"');
      else if (!inLiteral) outbuf.put(`+"`);
      outbuf.put(ch);
      inLiteral = true;
    }
    anyOutput = true;
  }
  if (!anyOutput) return `""`;
  if (inLiteral) outbuf.put('"');
  return outbuf.data;
}
/// quote string: append double quotes, screen all special chars;
/// so quoted string forms valid D string literal.
/// allocates.
///
/// Uses the documented compound format specifier `%(%s%)`, which escapes
/// string elements, instead of the undocumented Phobos helper `formatElement`.
string quote (const(char)[] s) {
  import std.format : format;
  // a one-element array: its only element is formatted as an escaped, quoted literal
  return format("%(%s%)", [s]);
}
595 version(gml_lexer_test) unittest {
596 import std.file;
597 import std.stdio;
598 auto s = readText("scrDrawHUD.gml");
599 auto lex = new Lexer(s, "scrDrawHUD.gml");
600 try {
601 while (!lex.empty) {
602 //if (lex == Keyword.RCurly) writeln("*******************");
603 auto v = lex.select!(int, "pop")(
604 Keyword.LCurly, (ref Token tk) => 1,
605 Keyword.RCurly, (Keyword kw) => 2,
606 Keyword.Semi, () => 6,
607 Keyword.Sub, (Loc loc) => 99,
608 Token.Type.Num, (ref Token tk) => 3,
609 (ref Token tk) => writeln(tk),
611 if (v) writeln("*** ", v);
612 //writeln(v, ": ", lex.front);
613 //lex.popFront();
615 } catch (ErrorAt e) {
616 writeln("PARSE ERROR: ", e.line);
617 writeln(e.loc);