eval/eval-lex.h

   1 /* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
   2 /* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
   3 /* ***** BEGIN LICENSE BLOCK *****
   4  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version
   7  * 1.1 (the "License"); you may not use this file except in compliance with
   8  * the License. You may obtain a copy of the License at
   9  * http://www.mozilla.org/MPL/
  10  *
  11  * Software distributed under the License is distributed on an "AS IS" basis,
  12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  13  * for the specific language governing rights and limitations under the
  14  * License.
  15  *
  16  * The Original Code is [Open Source Virtual Machine.].
  17  *
  18  * The Initial Developer of the Original Code is
  19  * Adobe System Incorporated.
  20  * Portions created by the Initial Developer are Copyright (C) 2008
  21  * the Initial Developer. All Rights Reserved.
  22  *
  23  * Contributor(s):
  24  *   Adobe AS3 Team
  25  *
  26  * Alternatively, the contents of this file may be used under the terms of
  27  * either the GNU General Public License Version 2 or later (the "GPL"), or
  28  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  29  * in which case the provisions of the GPL or the LGPL are applicable instead
  30  * of those above. If you wish to allow use of your version of this file only
  31  * under the terms of either the GPL or the LGPL, and not to allow others to
  32  * use your version of this file under the terms of the MPL, indicate your
  33  * decision by deleting the provisions above and replace them with the notice
  34  * and other provisions required by the GPL or the LGPL. If you do not delete
  35  * the provisions above, a recipient may use your version of this file under
  36  * the terms of any one of the MPL, the GPL or the LGPL.
  37  *
  38  * ***** END LICENSE BLOCK ***** */
  39
  40 // This file is included into eval.h
  41 namespace avmplus {
  42 namespace RTC {
  43
  44 enum Token {                    // Token kinds returned by the Lexer entry points; groups start at 0, 100, 200, 300 and 400
  45     // Operators
  46     //
  47     // The values assigned for operators are fixed; they are used
  48     // to construct the table Compiler::opcodeMapping in eval-parse.cpp.
  49     // If you add entries to the operators list you *must* extend that
  50     // table.
  51     //
  52     // Keep them alphabetical.
  53
  54     T_As = 0,
  55     T_Assign,
  56     T_BitwiseAnd,
  57     T_BitwiseAndAssign,
  58     T_BitwiseNot,
  59     T_BitwiseOr,
  60     T_BitwiseOrAssign,
  61     T_BitwiseXor,
  62     T_BitwiseXorAssign,
  63     T_Delete,
  64     T_Divide,
  65     T_DivideAssign,
  66     T_Equal,
  67     T_GreaterThan,
  68     T_GreaterThanOrEqual,
  69     T_In,
  70     T_InstanceOf,
  71     T_Is,
  72     T_LeftShift,
  73     T_LeftShiftAssign,
  74     T_LessThan,
  75     T_LessThanOrEqual,
  76     T_LogicalAnd,
  77     T_LogicalAndAssign,
  78     T_LogicalOr,
  79     T_LogicalOrAssign,
  80     T_Minus,
  81     T_MinusAssign,
  82     T_MinusMinus,
  83     T_Multiply,
  84     T_MultiplyAssign,
  85     T_Not,
  86     T_NotEqual,
  87     T_Plus,
  88     T_PlusAssign,
  89     T_PlusPlus,
  90     T_Remainder,
  91     T_RemainderAssign,
  92     T_RightShift,
  93     T_RightShiftAssign,
  94     T_StrictEqual,
  95     T_StrictNotEqual,
  96     T_TypeOf,
  97     T_UnsignedRightShift,
  98     T_UnsignedRightShiftAssign,
  99     T_Void,
 100
 101     T_OPERATOR_SENTINEL,        // one past T_Void; since T_As is 0 this equals the number of operator tokens
 102
 103     // Sundry punctuation
 104
 105     T_LeftParen = 100,
 106     T_RightParen,
 107     T_Comma,
 108     T_Dot,
 109     T_DoubleDot,
 110     T_TripleDot,
 111     T_LeftDotAngle,
 112     T_Colon,
 113     T_DoubleColon,
 114     T_Semicolon,
 115     T_Question,
 116     T_LeftBracket,
 117     T_RightBracket,
 118     T_LeftBrace,
 119     T_RightBrace,
 120     T_AtSign,
 121     T_XmlLeftBrace,
 122     T_XmlRightBrace,
 123     T_XmlEquals,
 124     T_XmlLeftAngle,
 125     T_XmlRightAngle,
 126     T_XmlLeftAngleSlash,
 127     T_XmlSlashRightAngle,
 128
 129     // Keywords.  Commented-out entries are operators, above.
 130
 131     /*T_As,*/
 132     T_Break = 200,
 133     T_Case,
 134     T_Catch,
 135     T_Class,
 136     T_Const,
 137     T_Continue,
 138     T_Default,
 139     /*T_Delete,*/
 140     T_Do,
 141     T_Dynamic,
 142     T_Else,
 143     T_False,
 144     T_Final,
 145     T_Finally,
 146     T_For,
 147     T_Function,
 148     T_If,
 149     T_Import,
 150     /*T_In,*/
 151     T_Include,
 152     /*T_InstanceOf,*/
 153     T_Interface,
 154     T_Internal,
 155     /*T_Is,*/
 156     T_Namespace,
 157     T_Native,
 158     T_New,
 159     T_Null,
 160     T_Override,
 161     T_Package,
 162     T_Private,
 163     T_Protected,
 164     T_Public,
 165     T_Return,
 166     T_Static,
 167     T_Super,
 168     T_Switch,
 169     T_This,
 170     T_Throw,
 171     T_True,
 172     T_Try,
 173     /*T_TypeOf,*/
 174     T_Use,
 175     T_Var,
 176     /*T_Void,*/
 177     T_While,
 178     T_With,
 179
 180     // Sundry other tokens.
 181
 182     T_Identifier = 300,
 183     T_IntLiteral,
 184     T_UIntLiteral,
 185     T_DoubleLiteral,
 186     T_RegexpLiteral,
 187     T_StringLiteral,
 188     T_XmlCDATA,                 //  "<![CDATA[...]]>"  (including the punctuation, ditto for the three following tokens)
 189     T_XmlComment,               //  "<!-- ... -->"
 190     T_XmlProcessingInstruction, //  "<? ... ?>"
 191     T_XmlString,                //  '...' or "..."
 192     T_XmlName,                  //  string of XMLName characters
 193     T_XmlWhitespace,            //  string of XMLWhitespace characters
 194     T_XmlText,                  //  string of characters that are not XMLName or XMLWhitespace
 195
 196     // Meta-tokens.
 197
 198     T_EOS = 400,
 199     T_BreakSlash,               // "/" seen and consumed
 200     T_BreakRightAngle,          // ">" seen and consumed
 201     T_BreakLeftAngle,           // "<" seen and consumed
 202
 203     // LAST also serves double duty as NONE
 204
 205     T_LAST = 500
 206 };
 207
 208 // Value carrier for tokens that carry values.
 209
 210 union TokenValue {              // The meaningful member is selected by the token kind named beside it
 211     double    d;                // T_DoubleLiteral
 212     int32_t   i;                // T_IntLiteral
 213     uint32_t  u;                // T_UIntLiteral
 214     Str      *s;                // T_StringLiteral, T_RegexpLiteral, T_Identifier, and the text-carrying T_Xml* tokens
 215 };
 216
 217
 218 /**
 219  * Lexical analysis.
 220  *
 221  * A client retrieves a stream of tokens from the lexer by calling
 222  * lex() repeatedly.  When the special tokens T_BreakSlash and
 223  * T_BreakRightAngle are returned the client must disambiguate
 224  * the context by calling divideOperator() or regexp() in the former
 225  * case and rightAngle() or rightShiftOrRelationalOperator() in the latter.  After T_BreakLeftAngle, call leftShiftOrRelationalOperator().
 226  *
 227  * A few tokens carry values.  These values are available through
 228  * accessor functions on the lexer when the most recent call to
 229  * the lexer returned the particular token in question.  In debug
 230  * builds there are checks to catch incorrect uses of these APIs.
 231  *
 232  * A line number is maintained by the lexer and made available
 233  * through an accessor function.  Following the return of a token,
 234  * the line number corresponds to the line number of the last
 235  * consumed character of the most recently consumed token.  The only
 236  * multi-line tokens are strings, regular expression literals, and
 237  * identifiers containing \<newline> sequences.
 238  */
 239
 240 class Lexer {
 241 public:
 242     /**
 243      * @param compiler  The compiler structure, from which we take flags and allocator
 244      * @param src  The source text as a string with a trailing NUL; it may contain
 245      *             embedded NULs but the last is considered a terminator, not part
 246      *             of the input
          * @param srclen  Length of src.  NOTE(review): whether this count includes
          *             the trailing NUL is not visible in this header; confirm
          *             against the constructor definition.
 247      * @param keyword_or_ident  True iff this scanner is simply being used to check
 248      *             whether an identifier that contains a backslash sequence looks
 249      *             like a keyword.
 250      */
 251     Lexer(Compiler* compiler, const wchar* src, uint32_t srclen, bool keyword_or_ident=false);
 252
         // NOTE(review): linep and valuep look like out-parameters (the xmlAtom comment
         // below states that valuep->s is set); presumably *linep receives the lexer's
         // line number.  Confirm against the definitions.
 253     Token lex(uint32_t* linep, TokenValue* valuep);         // Lex a token
 254     Token regexp(uint32_t* linep, TokenValue* valuep);      // Following T_BreakSlash, to lex a regex literal
 255     Token divideOperator(uint32_t* linep);                  // Following T_BreakSlash, to lex a division operator
 256     Token rightAngle(uint32_t* linep);                      // Following T_BreakRightAngle, to lex '>' at the end of a type instantiator
 257     Token rightShiftOrRelationalOperator(uint32_t* linep);  // Following T_BreakRightAngle, to lex a shift or relational operator
 258     Token leftShiftOrRelationalOperator(uint32_t* linep);   // Following T_BreakLeftAngle, to lex a shift or relational operator
 259
 260     /**
 261      *  Last consumed character must have been c; back up once
 262      */
 263     void xmlPushback(wchar c);
 264
 265     /**
 266      * Lex one XML atom.
 267      * xmlAtom returns one of:
 268      *
 269      *   XmlComment
 270      *   XmlCDATA
 271      *   XmlProcessingInstruction
 272      *   XmlName
 273      *   XmlWhitespace
 274      *   XmlText
 275      *   XmlString
 276      *   XmlLeftBrace
 277      *   XmlRightBrace
 278      *   XmlEquals
 279      *   XmlLeftAngle
 280      *   XmlRightAngle
 281      *   XmlLeftAngleSlash
 282      *   XmlSlashRightAngle
 283      *
 284      * For XmlComment, XmlCDATA, XmlProcessingInstruction, XmlName, XmlWhitespace, XmlText,
 285      * and XmlString, valuep->s is set to the actual text.
          *
          * The names above are the Token enumerators with the T_ prefix omitted.
 286      */
 287     Token xmlAtom(uint32_t* linep, TokenValue* valuep);
 288
 289 #ifdef DEBUG
 290     void trace();                                       // enable tracing
 291     bool getTrace() const;                              // retrieve the current tracing flag
 292 #endif
 293
 294 private:
 295     enum {
 296         // Special spaces
 297         UNICHAR_LS = 0x2028,
 298         UNICHAR_PS = 0x2029,
 299
 300         // Various Zs characters
 301         UNICHAR_Zs1 = 0x1680,
 302         UNICHAR_Zs2 = 0x180E,
 303         UNICHAR_Zs3 = 0x2000,
 304         UNICHAR_Zs4 = 0x2001,
 305         UNICHAR_Zs5 = 0x2002,
 306         UNICHAR_Zs6 = 0x2003,
 307         UNICHAR_Zs7 = 0x2004,
 308         UNICHAR_Zs8 = 0x2005,
 309         UNICHAR_Zs9 = 0x2006,
 310         UNICHAR_Zs10 = 0x2007,
 311         UNICHAR_Zs11 = 0x2008,
 312         UNICHAR_Zs12 = 0x2009,
 313         UNICHAR_Zs13 = 0x200A,
 314         UNICHAR_Zs14 = 0x202F,
 315         UNICHAR_Zs15 = 0x205F,
 316         UNICHAR_Zs16 = 0x3000,
 317
 318         // Byte-order mark - we treat it like a space
 319         UNICHAR_BOM = 0xFEFF,
 320     };
 321
 322     enum {
 323         // The character among the LS/PS, BOM, and Zs* with the lowest value
 324         UNICHAR_LOWEST_ODDSPACE = 0x1680    // same value as UNICHAR_Zs1; keep in sync if the list above changes
 325     };
 326
 327     // 8 bits available in the char_attrs table (its entries are uint8_t); six are assigned below
 328     enum {
 329         CHAR_ATTR_OCTAL = 1,
 330         CHAR_ATTR_DECIMAL = 2,
 331         CHAR_ATTR_HEX = 4,
 332         CHAR_ATTR_LETTER = 8,
 333         CHAR_ATTR_UNDERBAR = 16,
 334         CHAR_ATTR_DOLLAR = 32,
 335
             // Composite masks.  NOTE(review): presumably the attributes allowed for the first
             // and for later characters of an identifier; confirm against their uses.
 336         CHAR_ATTR_INITIAL = CHAR_ATTR_LETTER | CHAR_ATTR_UNDERBAR | CHAR_ATTR_DOLLAR,
 337         CHAR_ATTR_SUBSEQUENT = CHAR_ATTR_INITIAL | CHAR_ATTR_DECIMAL
 338     };
 339
         // One parameterless *Impl() for each public entry point of the matching name.
         // NOTE(review): presumably the public functions call these and then copy out
         // lineno and val; confirm in the implementation.
 340     Token lexImpl();
 341     Token regexpImpl();
 342     Token divideOperatorImpl();
 343     Token rightAngleImpl();
 344     Token rightShiftOrRelationalOperatorImpl();
 345     Token leftShiftOrRelationalOperatorImpl();
 346
 347     Token xmlAtomImpl();
 348     Token xmlMarkup(Token t);
 349     Token xmlWhitespace();
 350     Token xmlName();
 351     Token xmlString();
 352     Token xmlText();
 353     bool isXmlNameStart(wchar c);
 354     bool isXmlNameSubsequent(wchar c);
 355
 356     void lineComment();
 357     void blockComment();
 358
 359     Token identifier();
 360
 361     Token stringLiteral(int delimiter);
 362
 363     int escapeSequence();
 364     int octalOrNulEscape();
 365     int octalEscape(int n);
 366     int hexEscape(int n);
 367     int unicodeEscape();
 368
 369     Token numberLiteral();
 370     Token integerLiteral(int base);
 371     Token floatingLiteral();
 372     void checkNextCharForNumber();
 373     bool numberLiteralPrime();
 374     void numberFraction(bool has_leading_digits);
 375     void numberExponent();
 376     bool octalDigits(int k);
 377     bool decimalDigits(int k);
 378     bool hexDigits(int k);
 379     bool digits(int k, int mask);       // NOTE(review): mask is presumably a combination of CHAR_ATTR_* bits; confirm
 380     double parseDouble();
 381     double parseInt(int base);          // note: the result type is double, not an integer type
 382
 383     bool notPartOfIdent(int c);
 384     bool isUnicodeIdentifierStart(int c);
 385     bool isUnicodeIdentifierPart(int c);
 386 #ifdef DEBUG
 387     void print(Token t, uint32_t l, TokenValue v);
 388 #endif
 389
 390     Compiler * const    compiler;
 391     const wchar*        src;        // input
 392     const wchar*        limit;      // one past end of input
 393     const wchar*        idx;        // next char in input
 394     const wchar*        mark;       // a remembered position, typically the start of a lexeme (not always valid)
 395     uint32_t            lineno;     // line number of last char of last token returned
 396     const bool          keyword_or_ident;
 397                                     // true if this lexer instance is just used for checking whether an
 398                                     // identifier that contains a backslash sequence looks like a keyword
 399 #ifdef DEBUG
 400     Token               last_token; // last token returned
 401     bool                traceflag;  // true iff we're tracing
 402 #endif
 403     TokenValue          val;        // temporary slot
 404
 405     // Character attributes for the ASCII range, bit vectors of the CHAR_ATTR_ values above.
 406     static const uint8_t char_attrs[128];
 407 };
 408 }}