Merge remote-tracking branch 'redux/master' into sh4-pool
[tamarin-stm.git] / eval / eval-lex.h
blobfe28108b7f38bd412867ab274f7796927b4a5b09
1 /* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
2 /* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
3 /* ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
16 * The Original Code is [Open Source Virtual Machine.].
18 * The Initial Developer of the Original Code is
19 * Adobe System Incorporated.
20 * Portions created by the Initial Developer are Copyright (C) 2008
21 * the Initial Developer. All Rights Reserved.
23 * Contributor(s):
24 * Adobe AS3 Team
26 * Alternatively, the contents of this file may be used under the terms of
27 * either the GNU General Public License Version 2 or later (the "GPL"), or
28 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
38 * ***** END LICENSE BLOCK ***** */
40 // This file is included into eval.h
41 namespace avmplus {
42 namespace RTC {
// Token kinds returned by the Lexer entry points (lex(), regexp(), xmlAtom(), ...).
// The enumerators are grouped into numeric ranges: operators start at 0,
// punctuation at 100, keywords at 200, other tokens at 300, meta-tokens at 400.
44 enum Token {
45 // Operators
47 // The values assigned for operators are fixed; they are used
48 // to construct the table Compiler::opcodeMapping in eval-parse.cpp.
49 // If you add entries to the operators list you *must* extend that
50 // table.
52 // Keep them alphabetical.
54 T_As = 0,
55 T_Assign,
56 T_BitwiseAnd,
57 T_BitwiseAndAssign,
58 T_BitwiseNot,
59 T_BitwiseOr,
60 T_BitwiseOrAssign,
61 T_BitwiseXor,
62 T_BitwiseXorAssign,
63 T_Delete,
64 T_Divide,
65 T_DivideAssign,
66 T_Equal,
67 T_GreaterThan,
68 T_GreaterThanOrEqual,
69 T_In,
70 T_InstanceOf,
71 T_Is,
72 T_LeftShift,
73 T_LeftShiftAssign,
74 T_LessThan,
75 T_LessThanOrEqual,
76 T_LogicalAnd,
77 T_LogicalAndAssign,
78 T_LogicalOr,
79 T_LogicalOrAssign,
80 T_Minus,
81 T_MinusAssign,
82 T_MinusMinus,
83 T_Multiply,
84 T_MultiplyAssign,
85 T_Not,
86 T_NotEqual,
87 T_Plus,
88 T_PlusAssign,
89 T_PlusPlus,
90 T_Remainder,
91 T_RemainderAssign,
92 T_RightShift,
93 T_RightShiftAssign,
94 T_StrictEqual,
95 T_StrictNotEqual,
96 T_TypeOf,
97 T_UnsignedRightShift,
98 T_UnsignedRightShiftAssign,
99 T_Void,
101 T_OPERATOR_SENTINEL, // one past the last operator value (immediately follows T_Void)
103 // Sundry punctuation
105 T_LeftParen = 100,
106 T_RightParen,
107 T_Comma,
108 T_Dot,
109 T_DoubleDot,
110 T_TripleDot,
111 T_LeftDotAngle,
112 T_Colon,
113 T_DoubleColon,
114 T_Semicolon,
115 T_Question,
116 T_LeftBracket,
117 T_RightBracket,
118 T_LeftBrace,
119 T_RightBrace,
120 T_AtSign,
121 T_XmlLeftBrace,
122 T_XmlRightBrace,
123 T_XmlEquals,
124 T_XmlLeftAngle,
125 T_XmlRightAngle,
126 T_XmlLeftAngleSlash,
127 T_XmlSlashRightAngle,
129 // Keywords. Commented-out entries are operators, above.
131 /*T_As,*/
132 T_Break = 200,
133 T_Case,
134 T_Catch,
135 T_Class,
136 T_Const,
137 T_Continue,
138 T_Default,
139 /*T_Delete,*/
140 T_Do,
141 T_Dynamic,
142 T_Else,
143 T_False,
144 T_Final,
145 T_Finally,
146 T_For,
147 T_Function,
148 T_If,
149 T_Import,
150 /*T_In,*/
151 T_Include,
152 /*T_InstanceOf,*/
153 T_Interface,
154 T_Internal,
155 /*T_Is,*/
156 T_Namespace,
157 T_Native,
158 T_New,
159 T_Null,
160 T_Override,
161 T_Package,
162 T_Private,
163 T_Protected,
164 T_Public,
165 T_Return,
166 T_Static,
167 T_Super,
168 T_Switch,
169 T_This,
170 T_Throw,
171 T_True,
172 T_Try,
173 /*T_TypeOf,*/
174 T_Use,
175 T_Var,
176 /*T_Void,*/
177 T_While,
178 T_With,
180 // Sundry other tokens.
182 T_Identifier = 300,
183 T_IntLiteral,
184 T_UIntLiteral,
185 T_DoubleLiteral,
186 T_RegexpLiteral,
187 T_StringLiteral,
188 T_XmlCDATA, // "<![CDATA[...]]>" (including the punctuation, ditto for the three following tokens)
189 T_XmlComment, // "<!-- ... -->"
190 T_XmlProcessingInstruction, // "<? ... ?>"
191 T_XmlString, // '...' or "..."
192 T_XmlName, // string of XMLName characters
193 T_XmlWhitespace, // string of XMLWhitespace characters
194 T_XmlText, // string of characters that are not XMLName or XMLWhitespace
196 // Meta-tokens.
198 T_EOS = 400,
199 T_BreakSlash, // "/" seen and consumed
200 T_BreakRightAngle, // ">" seen and consumed
201 T_BreakLeftAngle, // "<" seen and consumed
203 // LAST also serves double duty as NONE
205 T_LAST = 500
208 // Value carrier for tokens that carry values.
// Each member is annotated with the token kinds for which it holds the value.
210 union TokenValue {
211 double d; // T_DoubleLiteral
212 int32_t i; // T_IntLiteral
213 uint32_t u; // T_UIntLiteral
214 Str *s; // T_StringLiteral, T_RegexpLiteral, T_Identifier; also the text of the XML atoms for which xmlAtom() sets valuep->s
219 * Lexical analysis.
221 * A client retrieves a stream of tokens from the lexer by calling
222 * lex() repeatedly. When the special tokens T_BreakSlash and
223 * T_BreakRightAngle are returned the client must disambiguate
224 * the context by calling divideOperator() or regexp() in the former
225 * case and rightAngle() or rightShiftOrRelationalOperator() in the latter.
227 * A few tokens carry values. These values are available through
228 * accessor functions on the lexer when the most recent call to
229 * the lexer returned the particular token in question. In debug
230 * builds there are checks to catch incorrect uses of these APIs.
232 * A line number is maintained by the lexer and made available
233 * through an accessor function. Following the return of a token,
234 * the line number corresponds to the line number of the last
235 * consumed character of the most recently consumed token. The only
236 * multi-line tokens are strings, regular expression literals, and
237 * identifiers containing \<newline> sequences.
240 class Lexer {
241 public:
243 * @param compiler The compiler structure, from which we take flags and allocator
244 * @param src The source text as a string with a trailing NUL; it may contain
245 * embedded NULs but the last is considered a terminator, not part
246 * of the input
247 * @param keyword_or_ident True iff this scanner is simply being used to check
248 * whether an identifier that contains a backslash sequence looks
249 * like a keyword.
251 Lexer(Compiler* compiler, const wchar* src, uint32_t srclen, bool keyword_or_ident=false);
253 Token lex(uint32_t* linep, TokenValue* valuep); // Lex a token
254 Token regexp(uint32_t* linep, TokenValue* valuep); // Following T_BreakSlash, to lex a regex literal
255 Token divideOperator(uint32_t* linep); // Following T_BreakSlash, to lex a division operator
256 Token rightAngle(uint32_t* linep); // Following T_BreakRightAngle, to lex '>' at the end of a type instantiator
257 Token rightShiftOrRelationalOperator(uint32_t* linep); // Following T_BreakRightAngle, to lex a shift or relational operator
258 Token leftShiftOrRelationalOperator(uint32_t* linep); // Following T_BreakLeftAngle, to lex a shift or relational operator
261 * Last consumed character must have been c; back up once
263 void xmlPushback(wchar c);
266 * Lex one XML atom.
267 * xmlAtom returns one of:
269 * XmlComment
270 * XmlCDATA
271 * XmlProcessingInstruction
272 * XmlName
273 * XmlWhitespace
274 * XmlText
275 * XmlString
276 * XmlLeftBrace
277 * XmlRightBrace
278 * XmlEquals
279 * XmlLeftAngle
280 * XmlRightAngle
281 * XmlLeftAngleSlash
282 * XmlSlashRightAngle
284 * For XmlComment, XmlCDATA, XmlProcessingInstruction, XmlName, XmlWhitespace, XmlText,
285 * and XmlString, valuep->s is set to the actual text.
287 Token xmlAtom(uint32_t* linep, TokenValue* valuep);
289 #ifdef DEBUG
290 void trace(); // enable tracing
291 bool getTrace() const; // retrieve the current tracing flag
292 #endif
294 private:
295 enum {
296 // Special spaces
297 UNICHAR_LS = 0x2028,
298 UNICHAR_PS = 0x2029,
300 // Various Zs characters
301 UNICHAR_Zs1 = 0x1680,
302 UNICHAR_Zs2 = 0x180E,
303 UNICHAR_Zs3 = 0x2000,
304 UNICHAR_Zs4 = 0x2001,
305 UNICHAR_Zs5 = 0x2002,
306 UNICHAR_Zs6 = 0x2003,
307 UNICHAR_Zs7 = 0x2004,
308 UNICHAR_Zs8 = 0x2005,
309 UNICHAR_Zs9 = 0x2006,
310 UNICHAR_Zs10 = 0x2007,
311 UNICHAR_Zs11 = 0x2008,
312 UNICHAR_Zs12 = 0x2009,
313 UNICHAR_Zs13 = 0x200A,
314 UNICHAR_Zs14 = 0x202F,
315 UNICHAR_Zs15 = 0x205F,
316 UNICHAR_Zs16 = 0x3000,
318 // Byte-order mark - we treat it like a space
319 UNICHAR_BOM = 0xFEFF,
322 enum {
323 // The character among the LS/PS, BOM, and Zs* with the lowest value
324 UNICHAR_LOWEST_ODDSPACE = 0x1680
327 // 8 bits available in the char_attrs table
328 enum {
329 CHAR_ATTR_OCTAL = 1,
330 CHAR_ATTR_DECIMAL = 2,
331 CHAR_ATTR_HEX = 4,
332 CHAR_ATTR_LETTER = 8,
333 CHAR_ATTR_UNDERBAR = 16,
334 CHAR_ATTR_DOLLAR = 32,
336 CHAR_ATTR_INITIAL = CHAR_ATTR_LETTER | CHAR_ATTR_UNDERBAR | CHAR_ATTR_DOLLAR,
337 CHAR_ATTR_SUBSEQUENT = CHAR_ATTR_INITIAL | CHAR_ATTR_DECIMAL
340 Token lexImpl();
341 Token regexpImpl();
342 Token divideOperatorImpl();
343 Token rightAngleImpl();
344 Token rightShiftOrRelationalOperatorImpl();
345 Token leftShiftOrRelationalOperatorImpl();
347 Token xmlAtomImpl();
348 Token xmlMarkup(Token t);
349 Token xmlWhitespace();
350 Token xmlName();
351 Token xmlString();
352 Token xmlText();
353 bool isXmlNameStart(wchar c);
354 bool isXmlNameSubsequent(wchar c);
356 void lineComment();
357 void blockComment();
359 Token identifier();
361 Token stringLiteral(int delimiter);
363 int escapeSequence();
364 int octalOrNulEscape();
365 int octalEscape(int n);
366 int hexEscape(int n);
367 int unicodeEscape();
369 Token numberLiteral();
370 Token integerLiteral(int base);
371 Token floatingLiteral();
372 void checkNextCharForNumber();
373 bool numberLiteralPrime();
374 void numberFraction(bool has_leading_digits);
375 void numberExponent();
376 bool octalDigits(int k);
377 bool decimalDigits(int k);
378 bool hexDigits(int k);
379 bool digits(int k, int mask);
380 double parseDouble();
381 double parseInt(int base);
383 bool notPartOfIdent(int c);
384 bool isUnicodeIdentifierStart(int c);
385 bool isUnicodeIdentifierPart(int c);
386 #ifdef DEBUG
387 void print(Token t, uint32_t l, TokenValue v);
388 #endif
390 Compiler * const compiler;
391 const wchar* src; // input
392 const wchar* limit; // one past end of input
393 const wchar* idx; // next char in input
394 const wchar* mark; // a remembered position, typically the start of a lexeme (not always valid)
395 uint32_t lineno; // line number of last char of last token returned
396 const bool keyword_or_ident;
397 // true if this lexer instance is just used for checking whether an
398 // identifier that contains a backslash sequence looks like a keyword
399 #ifdef DEBUG
400 Token last_token; // last token returned
401 bool traceflag; // true iff we're tracing
402 #endif
403 TokenValue val; // temporary slot
405 // Character attributes for the ASCII range, bit vectors of the CHAR_ATTR_ values above.
406 static const uint8_t char_attrs[128];