1 /* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
2 /* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
3 /* ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
16 * The Original Code is [Open Source Virtual Machine.].
18 * The Initial Developer of the Original Code is
19 * Adobe System Incorporated.
20 * Portions created by the Initial Developer are Copyright (C) 2008
21 * the Initial Developer. All Rights Reserved.
26 * Alternatively, the contents of this file may be used under the terms of
27 * either the GNU General Public License Version 2 or later (the "GPL"), or
28 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
38 * ***** END LICENSE BLOCK ***** */
40 // This file is included into eval.h
47 // The values assigned for operators are fixed; they are used
48 // to construct the table Compiler::opcodeMapping in eval-parse.cpp.
49 // If you add entries to the operators list you *must* extend that
52 // Keep them alphabetical.
98 T_UnsignedRightShiftAssign
,
103 // Sundry punctuation
127 T_XmlSlashRightAngle
,
129 // Keywords. Commented-out entries are operators, above.
180 // Sundry other tokens.
188 T_XmlCDATA
, // "<![CDATA[...]]>" (including the punctuation, ditto for the three following tokens)
189 T_XmlComment
, // "<!-- ... -->"
190 T_XmlProcessingInstruction
, // "<? ... ?>"
191 T_XmlString
, // '...' or "..."
192 T_XmlName
, // string of XMLName characters
193 T_XmlWhitespace
, // string of XMLWhitespace characters
194 T_XmlText
, // string of characters that are not XMLName or XMLWhitespace
199 T_BreakSlash
, // "/" seen and consumed
200 T_BreakRightAngle
, // ">" seen and consumed
201 T_BreakLeftAngle
, // "<" seen and consumed
203 // LAST also serves double duty as NONE
208 // Value carrier for tokens that carry values.
211 double d
; // T_DoubleLiteral
212 int32_t i
; // T_IntLiteral
213 uint32_t u
; // T_UintLiteral
214 Str
*s
; // T_StringLiteral, T_RegexpLiteral, T_Identifier
221 * A client retrieves a stream of tokens from the lexer by calling
222 * lex() repeatedly. When the special tokens T_BreakSlash and
223 * T_BreakRightAngle are returned the client must disambiguate
224 * the context by calling divideOperator() or regexp() in the former
225 * case and rightAngle() or rightShiftOrRelationalOperator() in the latter.
227 * A few tokens carry values. These values are available through
228 * accessor functions on the lexer when the most recent call to
229 * the lexer returned the particular token in question. In debug
230 * builds there are checks to catch incorrect uses of these APIs.
232 * A line number is maintained by the lexer and made available
233 * through an accessor function. Following the return of a token,
234 * the line number corresponds to the line number of the last
235 * consumed character of the most recently consumed token. The only
236 * multi-line tokens are strings, regular expression literals, and
237 * identifiers containing \<newline> sequences.
243 * @param compiler The compiler structure, from which we take flags and allocator
244 * @param src The source text as a string with a trailing NUL; it may contain
245 * embedded NULs but the last is considered a terminator, not part of the input
247 * @param keyword_or_ident True iff this scanner is simply being used to check
248 * whether an identifier that contains a backslash sequence looks like a keyword
// NOTE(review): srclen is not covered by the @param notes above; presumably the
// length of src in wchar units -- confirm whether it counts the trailing NUL.
251 Lexer(Compiler
* compiler
, const wchar
* src
, uint32_t srclen
, bool keyword_or_ident
=false);
// Token-producing entry points. lex() is the general entry point; each of the
// other functions is a follow-up call made after lex() has returned one of the
// T_Break* tokens, as noted on the individual declarations.
// NOTE(review): linep and valuep look like out-parameters for the token's line
// number and carried value -- confirm against the implementation.
253 Token
lex(uint32_t* linep
, TokenValue
* valuep
); // Lex a token
254 Token
regexp(uint32_t* linep
, TokenValue
* valuep
); // Following T_BreakSlash, to lex a regex literal
255 Token
divideOperator(uint32_t* linep
); // Following T_BreakSlash, to lex a division operator
256 Token
rightAngle(uint32_t* linep
); // Following T_BreakRightAngle, to lex '>' at the end of a type instantiator
257 Token
rightShiftOrRelationalOperator(uint32_t* linep
); // Following T_BreakRightAngle, to lex a shift or relational operator
258 Token
leftShiftOrRelationalOperator(uint32_t* linep
); // Following T_BreakLeftAngle, to lex a shift or relational operator
261 * Last consumed character must have been c; back up once
263 void xmlPushback(wchar c
);
267 * xmlAtom returns one of:
271 * XmlProcessingInstruction
284 * For XmlComment, XmlCDATA, XmlProcessingInstruction, XmlName, XmlWhitespace, XmlText,
285 * and XmlString, valuep->s is set to the actual text.
287 Token
xmlAtom(uint32_t* linep
, TokenValue
* valuep
);
290 void trace(); // enable tracing
291 bool getTrace() const; // retrieve the current tracing flag
300 // Various Zs characters
301 UNICHAR_Zs1
= 0x1680,
302 UNICHAR_Zs2
= 0x180E,
303 UNICHAR_Zs3
= 0x2000,
304 UNICHAR_Zs4
= 0x2001,
305 UNICHAR_Zs5
= 0x2002,
306 UNICHAR_Zs6
= 0x2003,
307 UNICHAR_Zs7
= 0x2004,
308 UNICHAR_Zs8
= 0x2005,
309 UNICHAR_Zs9
= 0x2006,
310 UNICHAR_Zs10
= 0x2007,
311 UNICHAR_Zs11
= 0x2008,
312 UNICHAR_Zs12
= 0x2009,
313 UNICHAR_Zs13
= 0x200A,
314 UNICHAR_Zs14
= 0x202F,
315 UNICHAR_Zs15
= 0x205F,
316 UNICHAR_Zs16
= 0x3000,
318 // Byte-order mark - we treat it like a space
319 UNICHAR_BOM
= 0xFEFF,
323 // The character among the LS/PS, BOM, and Zs* with the lowest value
// (same value as UNICHAR_Zs1; written as a literal, so it must be updated by hand
// if a lower-valued space character is ever added to this list)
324 UNICHAR_LOWEST_ODDSPACE
= 0x1680
327 // 8 bits available in the char_attrs table
330 CHAR_ATTR_DECIMAL
= 2,
332 CHAR_ATTR_LETTER
= 8,
333 CHAR_ATTR_UNDERBAR
= 16,
334 CHAR_ATTR_DOLLAR
= 32,
336 CHAR_ATTR_INITIAL
= CHAR_ATTR_LETTER
| CHAR_ATTR_UNDERBAR
| CHAR_ATTR_DOLLAR
,
337 CHAR_ATTR_SUBSEQUENT
= CHAR_ATTR_INITIAL
| CHAR_ATTR_DECIMAL
342 Token
divideOperatorImpl();
343 Token
rightAngleImpl();
344 Token
rightShiftOrRelationalOperatorImpl();
345 Token
leftShiftOrRelationalOperatorImpl();
348 Token
xmlMarkup(Token t
);
349 Token
xmlWhitespace();
353 bool isXmlNameStart(wchar c
);
354 bool isXmlNameSubsequent(wchar c
);
361 Token
stringLiteral(int delimiter
);
363 int escapeSequence();
364 int octalOrNulEscape();
365 int octalEscape(int n
);
366 int hexEscape(int n
);
369 Token
numberLiteral();
370 Token
integerLiteral(int base
);
371 Token
floatingLiteral();
372 void checkNextCharForNumber();
373 bool numberLiteralPrime();
374 void numberFraction(bool has_leading_digits
);
375 void numberExponent();
376 bool octalDigits(int k
);
377 bool decimalDigits(int k
);
378 bool hexDigits(int k
);
379 bool digits(int k
, int mask
);
380 double parseDouble();
381 double parseInt(int base
);
383 bool notPartOfIdent(int c
);
384 bool isUnicodeIdentifierStart(int c
);
385 bool isUnicodeIdentifierPart(int c
);
387 void print(Token t
, uint32_t l
, TokenValue v
);
390 Compiler
* const compiler
;
391 const wchar
* src
; // input
392 const wchar
* limit
; // one past end of input
393 const wchar
* idx
; // next char in input
394 const wchar
* mark
; // a remembered position, typically the start of a lexeme (not always valid)
395 uint32_t lineno
; // line number of last char of last token returned
396 const bool keyword_or_ident
;
397 // true if this lexer instance is just used for checking whether an
398 // identifier that contains a backslash sequence looks like a keyword
400 Token last_token
; // last token returned
401 bool traceflag
; // true iff we're tracing
403 TokenValue val
; // temporary slot
405 // Character attributes for the ASCII range, bit vectors of the CHAR_ATTR_ values above.
406 static const uint8_t char_attrs
[128];