formatting 90% done; encapsulated everything in the TinyJS namespace, and renamed...
[tinyjs-rewrite.git] / src / lexer.cpp
blob22a8ab588f12d39f226cd8b4fde33efe2047649e
2 #include "private.h"
4 namespace TinyJS
6 Lexer::Lexer(const std::string& input)
8 m_data = strncopy(input.c_str(), input.size());
9 m_dataOwned = true;
10 m_dataStart = 0;
11 m_dataEnd = int(input.size());
12 reset();
15 Lexer::Lexer(Lexer* owner, int startChar, int endChar)
17 m_data = owner->m_data;
18 m_dataOwned = false;
19 m_dataStart = startChar;
20 m_dataEnd = endChar;
21 reset();
24 Lexer::~Lexer(void)
26 if(m_dataOwned)
28 free((void*)m_data);
32 void Lexer::reset()
34 m_dataPos = m_dataStart;
35 m_tokenStart = 0;
36 m_tokenEnd = 0;
37 m_tokenLastEnd = 0;
38 m_tk = 0;
39 m_tkStr = "";
40 getNextCh();
41 getNextCh();
42 getNextToken();
45 void Lexer::match(int expected_tk)
47 if(m_tk != expected_tk)
49 std::stringstream errorString;
50 errorString << "Got " << getTokenStr(m_tk) << " expected " << getTokenStr(expected_tk)
51 << " at " << getPosition(m_tokenStart);
52 throw new RuntimeError(errorString.str());
54 getNextToken();
57 std::string Lexer::getTokenStr(int token)
59 if((token > 32) && (token < 128))
61 char buf[4] = "' '";
62 buf[1] = (char)token;
63 return buf;
65 switch(token)
67 case LEX_EOF :
68 return "EOF";
69 case LEX_ID :
70 return "ID";
71 case LEX_INT :
72 return "INT";
73 case LEX_FLOAT :
74 return "FLOAT";
75 case LEX_STR :
76 return "STRING";
77 case LEX_EQUAL :
78 return "==";
79 case LEX_TYPEEQUAL :
80 return "===";
81 case LEX_NEQUAL :
82 return "!=";
83 case LEX_NTYPEEQUAL :
84 return "!==";
85 case LEX_LEQUAL :
86 return "<=";
87 case LEX_LSHIFT :
88 return "<<";
89 case LEX_LSHIFTEQUAL :
90 return "<<=";
91 case LEX_GEQUAL :
92 return ">=";
93 case LEX_RSHIFT :
94 return ">>";
95 case LEX_RSHIFTUNSIGNED :
96 return ">>";
97 case LEX_RSHIFTEQUAL :
98 return ">>=";
99 case LEX_PLUSEQUAL :
100 return "+=";
101 case LEX_MINUSEQUAL :
102 return "-=";
103 case LEX_PLUSPLUS :
104 return "++";
105 case LEX_MINUSMINUS :
106 return "--";
107 case LEX_ANDEQUAL :
108 return "&=";
109 case LEX_ANDAND :
110 return "&&";
111 case LEX_OREQUAL :
112 return "|=";
113 case LEX_OROR :
114 return "||";
115 case LEX_XOREQUAL :
116 return "^=";
117 // reserved words
118 case LEX_R_IF :
119 return "if";
120 case LEX_R_ELSE :
121 return "else";
122 case LEX_R_DO :
123 return "do";
124 case LEX_R_WHILE :
125 return "while";
126 case LEX_R_FOR :
127 return "for";
128 case LEX_R_BREAK :
129 return "break";
130 case LEX_R_CONTINUE :
131 return "continue";
132 case LEX_R_FUNCTION :
133 return "function";
134 case LEX_R_RETURN :
135 return "return";
136 case LEX_R_VAR :
137 return "var";
138 case LEX_R_TRUE :
139 return "true";
140 case LEX_R_FALSE :
141 return "false";
142 case LEX_R_NULL :
143 return "null";
144 case LEX_R_UNDEFINED :
145 return "undefined";
146 case LEX_R_NEW :
147 return "new";
149 std::stringstream msg;
150 msg << "?[" << token << "]";
151 return msg.str();
154 void Lexer::getNextCh()
156 m_currCh = m_nextCh;
157 if(m_dataPos < m_dataEnd)
159 m_nextCh = m_data[m_dataPos];
161 else
163 m_nextCh = 0;
165 m_dataPos++;
168 void Lexer::getNextToken()
170 m_tk = LEX_EOF;
171 m_tkStr.clear();
172 while(m_currCh && isWhitespace(m_currCh))
174 getNextCh();
176 // newline comments
177 if((m_currCh == '/') && (m_nextCh == '/'))
179 while(m_currCh && (m_currCh != '\n'))
181 getNextCh();
183 getNextCh();
184 getNextToken();
185 return;
187 // block comments
188 if((m_currCh == '/') && (m_nextCh == '*'))
190 while(m_currCh && ((m_currCh != '*') || (m_nextCh != '/')))
192 getNextCh();
194 getNextCh();
195 getNextCh();
196 getNextToken();
197 return;
199 // record beginning of this token
200 m_tokenStart = (m_dataPos - 2);
201 // tokens
202 if(isAlpha(m_currCh)) // IDs
204 while(isAlpha(m_currCh) || isNumeric(m_currCh))
206 m_tkStr += m_currCh;
207 getNextCh();
209 m_tk = LEX_ID;
210 if(m_tkStr == "if")
212 m_tk = LEX_R_IF;
214 else if(m_tkStr == "else")
216 m_tk = LEX_R_ELSE;
218 else if(m_tkStr == "do")
220 m_tk = LEX_R_DO;
222 else if(m_tkStr == "while")
224 m_tk = LEX_R_WHILE;
226 else if(m_tkStr == "for")
228 m_tk = LEX_R_FOR;
230 else if(m_tkStr == "break")
232 m_tk = LEX_R_BREAK;
234 else if(m_tkStr == "continue")
236 m_tk = LEX_R_CONTINUE;
238 else if(m_tkStr == "function")
240 m_tk = LEX_R_FUNCTION;
242 else if(m_tkStr == "return")
244 m_tk = LEX_R_RETURN;
246 else if(m_tkStr == "var")
248 m_tk = LEX_R_VAR;
250 else if(m_tkStr == "true")
252 m_tk = LEX_R_TRUE;
254 else if(m_tkStr == "false")
256 m_tk = LEX_R_FALSE;
258 else if(m_tkStr == "null")
260 m_tk = LEX_R_NULL;
262 else if(m_tkStr == "undefined")
264 m_tk = LEX_R_UNDEFINED;
266 else if(m_tkStr == "new")
268 m_tk = LEX_R_NEW;
271 else if(isNumeric(m_currCh)) // Numbers
273 bool isHex = false;
274 if(m_currCh == '0')
276 m_tkStr += m_currCh;
277 getNextCh();
279 if(m_currCh == 'x')
281 isHex = true;
282 m_tkStr += m_currCh;
283 getNextCh();
285 m_tk = LEX_INT;
286 while(isNumeric(m_currCh) || (isHex && isHexadecimal(m_currCh)))
288 m_tkStr += m_currCh;
289 getNextCh();
291 if(!isHex && (m_currCh == '.'))
293 m_tk = LEX_FLOAT;
294 m_tkStr += '.';
295 getNextCh();
296 while(isNumeric(m_currCh))
298 m_tkStr += m_currCh;
299 getNextCh();
302 // do fancy e-style floating point
303 if(!isHex && ((m_currCh == 'e') || (m_currCh == 'E')))
305 m_tk = LEX_FLOAT;
306 m_tkStr += m_currCh;
307 getNextCh();
308 if(m_currCh == '-')
310 m_tkStr += m_currCh;
311 getNextCh();
313 while(isNumeric(m_currCh))
315 m_tkStr += m_currCh;
316 getNextCh();
320 else if(m_currCh == '"')
322 // strings...
323 getNextCh();
324 while(m_currCh && (m_currCh != '"'))
326 if(m_currCh == '\\')
328 getNextCh();
329 switch(m_currCh)
331 case 'n' :
332 m_tkStr += '\n';
333 break;
334 case '"' :
335 m_tkStr += '"';
336 break;
337 case '\\' :
338 m_tkStr += '\\';
339 break;
340 default:
341 m_tkStr += m_currCh;
344 else
346 m_tkStr += m_currCh;
348 getNextCh();
350 getNextCh();
351 m_tk = LEX_STR;
353 else if(m_currCh == '\'')
355 // strings again...
356 getNextCh();
357 while(m_currCh && (m_currCh != '\''))
359 if(m_currCh == '\\')
361 getNextCh();
362 switch(m_currCh)
364 case 'n' :
365 m_tkStr += '\n';
366 break;
367 case 'a' :
368 m_tkStr += '\a';
369 break;
370 case 'r' :
371 m_tkStr += '\r';
372 break;
373 case 't' :
374 m_tkStr += '\t';
375 break;
376 case '\'' :
377 m_tkStr += '\'';
378 break;
379 case '\\' :
380 m_tkStr += '\\';
381 break;
382 case 'x' : // hex digits
384 char buf[3] = "??";
385 getNextCh();
386 buf[0] = m_currCh;
387 getNextCh();
388 buf[1] = m_currCh;
389 m_tkStr += (char)strtol(buf,0,16);
391 break;
392 default:
393 if((m_currCh >= '0') && (m_currCh <= '7'))
395 // octal digits
396 char buf[4] = "???";
397 buf[0] = m_currCh;
398 getNextCh();
399 buf[1] = m_currCh;
400 getNextCh();
401 buf[2] = m_currCh;
402 m_tkStr += (char)strtol(buf,0,8);
404 else
406 m_tkStr += m_currCh;
410 else
412 m_tkStr += m_currCh;
414 getNextCh();
416 getNextCh();
417 m_tk = LEX_STR;
419 else
421 // single chars
422 m_tk = m_currCh;
423 if(m_currCh)
425 getNextCh();
427 if((m_tk == '=') && (m_currCh == '=')) // ==
429 m_tk = LEX_EQUAL;
430 getNextCh();
431 if(m_currCh == '=') // ===
433 m_tk = LEX_TYPEEQUAL;
434 getNextCh();
437 else if((m_tk == '!') && (m_currCh == '=')) // !=
439 m_tk = LEX_NEQUAL;
440 getNextCh();
441 if(m_currCh == '=') // !==
443 m_tk = LEX_NTYPEEQUAL;
444 getNextCh();
447 else if((m_tk == '<') && (m_currCh == '='))
449 m_tk = LEX_LEQUAL;
450 getNextCh();
452 else if((m_tk == '<') && (m_currCh == '<'))
454 m_tk = LEX_LSHIFT;
455 getNextCh();
456 if(m_currCh == '=') // <<=
458 m_tk = LEX_LSHIFTEQUAL;
459 getNextCh();
462 else if((m_tk == '>') && (m_currCh == '='))
464 m_tk = LEX_GEQUAL;
465 getNextCh();
467 else if((m_tk == '>') && (m_currCh == '>'))
469 m_tk = LEX_RSHIFT;
470 getNextCh();
471 if(m_currCh == '=') // >>=
473 m_tk = LEX_RSHIFTEQUAL;
474 getNextCh();
476 else if(m_currCh == '>') // >>>
478 m_tk = LEX_RSHIFTUNSIGNED;
479 getNextCh();
482 else if((m_tk == '+') && (m_currCh == '='))
484 m_tk = LEX_PLUSEQUAL;
485 getNextCh();
487 else if((m_tk == '-') && (m_currCh == '='))
489 m_tk = LEX_MINUSEQUAL;
490 getNextCh();
492 else if((m_tk == '+') && (m_currCh == '+'))
494 m_tk = LEX_PLUSPLUS;
495 getNextCh();
497 else if((m_tk == '-') && (m_currCh == '-'))
499 m_tk = LEX_MINUSMINUS;
500 getNextCh();
502 else if((m_tk == '&') && (m_currCh == '='))
504 m_tk = LEX_ANDEQUAL;
505 getNextCh();
507 else if((m_tk == '&') && (m_currCh == '&'))
509 m_tk = LEX_ANDAND;
510 getNextCh();
512 else if((m_tk == '|') && (m_currCh == '='))
514 m_tk = LEX_OREQUAL;
515 getNextCh();
517 else if((m_tk == '|') && (m_currCh == '|'))
519 m_tk = LEX_OROR;
520 getNextCh();
522 else if((m_tk == '^') && (m_currCh == '='))
524 m_tk = LEX_XOREQUAL;
525 getNextCh();
528 /* This isn't quite right yet */
529 m_tokenLastEnd = m_tokenEnd;
530 m_tokenEnd = (m_dataPos - 3);
533 std::string Lexer::getSubString(int lastPosition)
535 int lastCharIdx = (m_tokenLastEnd + 1);
536 if(lastCharIdx < m_dataEnd)
538 /* save a memory alloc by using our data array to create the string */
539 char old = m_data[lastCharIdx];
540 m_data[lastCharIdx] = 0;
541 std::string value = &m_data[lastPosition];
542 m_data[lastCharIdx] = old;
543 return value;
545 else
547 return std::string(&m_data[lastPosition]);
551 Lexer* Lexer::getSubLex(int lastPosition)
553 int lastCharIdx = (m_tokenLastEnd + 1);
554 if(lastCharIdx < m_dataEnd)
556 return new Lexer(this, lastPosition, lastCharIdx);
558 else
560 return new Lexer(this, lastPosition, m_dataEnd);
564 std::string Lexer::getPosition(int pos)
566 int line;
567 int col;
568 if(pos < 0)
570 pos = m_tokenLastEnd;
572 line = 1;
573 col = 1;
574 for(int i=0; i<pos; i++)
576 char ch;
577 if(i < m_dataEnd)
579 ch = m_data[i];
581 else
583 ch = 0;
585 col++;
586 if(ch == '\n')
588 line++;
589 col = 0;
592 char buf[256];
593 sprintf_s(buf, 256, "(line: %d, col: %d)", line, col);
594 return buf;