//===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This class represents the Lexer for tablegen files.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
#define LLVM_LIB_TABLEGEN_TGLEXER_H
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/SMLoc.h"
#include <cassert>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
namespace llvm {

template <typename T> class ArrayRef;
class SourceMgr;
class Twine;
namespace tgtok {
enum TokKind {
  // Tokens with no info.

  // Binary constant. Note that these are sized according to the number of
  // bits given.

  // Preprocessing tokens for internal usage by the lexer.
  // They are never returned as a result of Lex().

  // Reserved keywords. ('ElseKW' is named to distinguish it from the
  // existing 'Else' that means the preprocessor #else.)

  // Object start tokens.
  Assert = OBJECT_START_FIRST,
  OBJECT_START_LAST = MultiClass,

  // Bang operators.
  XConcat = BANG_OPERATOR_FIRST,
  BANG_OPERATOR_LAST = XRepr,

  // String valued tokens.
  Id = STRING_VALUE_FIRST,
  STRING_VALUE_LAST = CodeFragment,
};
/// isBangOperator - Return true if this is a bang operator.
static inline bool isBangOperator(tgtok::TokKind Kind) {
  return tgtok::BANG_OPERATOR_FIRST <= Kind && Kind <= BANG_OPERATOR_LAST;
}
/// isObjectStart - Return true if this is a valid first token for a statement.
static inline bool isObjectStart(tgtok::TokKind Kind) {
  return tgtok::OBJECT_START_FIRST <= Kind && Kind <= OBJECT_START_LAST;
}
/// isStringValue - Return true if this is a string value.
static inline bool isStringValue(tgtok::TokKind Kind) {
  return tgtok::STRING_VALUE_FIRST <= Kind && Kind <= STRING_VALUE_LAST;
}
} // namespace tgtok

/// TGLexer - TableGen Lexer class.
class TGLexer {
  StringRef CurBuf;
  const char *CurPtr = nullptr;
  // Information about the current token.
  const char *TokStart = nullptr;
  tgtok::TokKind CurCode = tgtok::TokKind::Eof;
  std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment
  int64_t CurIntVal = 0; // This is valid for IntVal.
  /// CurBuffer - This is the current buffer index we're lexing from as managed
  /// by the SourceMgr object.
  unsigned CurBuffer = 0;
  typedef std::set<std::string> DependenciesSetTy;

  /// Dependencies - This is the list of all included files.
  DependenciesSetTy Dependencies;
public:
  TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
  tgtok::TokKind Lex() {
    return CurCode = LexToken(CurPtr == CurBuf.begin());
  }
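
  // A minimal usage sketch (hypothetical driver code; SrcMgr is assumed to
  // already hold the main buffer):
  //
  //   TGLexer Lexer(SrcMgr, /*Macros=*/{});
  //   while (Lexer.Lex() != tgtok::Eof) {
  //     if (Lexer.getCode() == tgtok::Error)
  //       break;
  //     // Inspect Lexer.getCurStrVal() / Lexer.getCurIntVal() as needed.
  //   }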
  const DependenciesSetTy &getDependencies() const {
    return Dependencies;
  }
  tgtok::TokKind getCode() const { return CurCode; }
  const std::string &getCurStrVal() const {
    assert(tgtok::isStringValue(CurCode) &&
           "This token doesn't have a string value");
    return CurStrVal;
  }
  int64_t getCurIntVal() const {
    assert(CurCode == tgtok::IntVal && "This token isn't an integer");
    return CurIntVal;
  }
  std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
    assert(CurCode == tgtok::BinaryIntVal &&
           "This token isn't a binary integer");
    return std::make_pair(CurIntVal, (CurPtr - TokStart) - 2);
  }
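
  // For example (a sketch of the arithmetic above): for the token "0b1010",
  // TokStart points at the leading '0' and CurPtr points one past the last
  // digit, so the returned pair is {10, 4}: the value and the number of bits,
  // with the two characters of the "0b" prefix accounted for by the "- 2".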
  SMLoc getLoc() const;
  SMRange getLocRange() const;
private:
  /// LexToken - Read the next token and return its code.
  tgtok::TokKind LexToken(bool FileOrLineStart = false);
  tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
  tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
  int peekNextChar(int Index) const;
  void SkipBCPLComment();
  tgtok::TokKind LexIdentifier();
  bool LexInclude();
  tgtok::TokKind LexString();
  tgtok::TokKind LexVarName();
  tgtok::TokKind LexNumber();
  tgtok::TokKind LexBracket();
  tgtok::TokKind LexExclaim();
  // Process EOF encountered in LexToken().
  // If EOF is met in an include file, then the method will update
  // CurPtr, CurBuf and preprocessing include stack, and return true.
  // If EOF is met in the top-level file, then the method will
  // update and check the preprocessing include stack, and return false.
  bool processEOF();
  // *** Structures and methods for preprocessing support ***

  // A set of macro names that are defined either via command line or
  // by using the #define directive.
  StringSet<> DefinedMacros;
  // Each of #ifdef and #else directives has a descriptor associated
  // with it.
  //
  // An ordered list of preprocessing controls defined by #ifdef/#else
  // directives that are currently in effect is called the preprocessing
  // control stack. It is represented as a vector of PreprocessorControlDesc's.
  // The control stack is updated according to the following rules
  // (see the sketch below):
  //
  // For each #ifdef we add an element to the control stack.
  // For each #else we replace the top element with a descriptor
  // with an inverted IsDefined value.
  // For each #endif we pop the top element from the control stack.
  //
  // When CurPtr reaches the current buffer's end, the control stack
  // must be empty, i.e. #ifdef and the corresponding #endif
  // must be located in the same file.
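  //
  // For example (a sketch; NAME is an arbitrary macro name):
  //
  //   #ifdef NAME   // push {Kind: Ifdef, IsDefined: defined(NAME)}
  //   ...
  //   #else         // replace the top with {Kind: Else, IsDefined: !defined(NAME)}
  //   ...
  //   #endif        // pop the top element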
  struct PreprocessorControlDesc {
    // Either tgtok::Ifdef or tgtok::Else.
    tgtok::TokKind Kind;

    // True, if the condition for this directive is true, false otherwise:
    //   #ifdef NAME : true, if NAME is defined, false otherwise.
    //   #else       : false, if NAME is defined, true otherwise.
    bool IsDefined;
    // Pointer into CurBuf to the beginning of the preprocessing directive
    // word.
    SMLoc SrcPos;
  };
  // We want to disallow code like this:
  //
  //   file1.td:
  //     #ifdef NAME
  //     include "file2.td"
  //
  //   file2.td:
  //     #endif
  //
  // To do this, we clear the preprocessing control stack on entry
  // to each of the included files. PrepIncludeStack is used to store
  // preprocessing control stacks for the current file and all its
  // parent files. The back() element is the preprocessing control
  // stack for the current file.
  std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>>
      PrepIncludeStack;
  // Validate that the current preprocessing control stack is empty,
  // since we are about to exit a file, and pop the include stack.
  //
  // If IncludeStackMustBeEmpty is true, the include stack must be empty
  // after the popping, otherwise, the include stack must not be empty
  // after the popping. Basically, the include stack must be empty
  // only if we exit the "top-level" file (i.e. finish lexing).
  //
  // The method returns false, if the current preprocessing control stack
  // is not empty (e.g. there is an unterminated #ifdef/#else),
  // true - otherwise.
  bool prepExitInclude(bool IncludeStackMustBeEmpty);
  // Look ahead for a preprocessing directive starting from CurPtr. The caller
  // must only call this method, if *(CurPtr - 1) is '#'. If the method matches
  // a preprocessing directive word followed by a whitespace, then it returns
  // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
  //
  // CurPtr is not adjusted by this method.
  tgtok::TokKind prepIsDirective() const;
  // Given a preprocessing token kind, adjusts CurPtr to the end
  // of the preprocessing directive word. Returns true, unless
  // an unsupported token kind is passed in.
  //
  // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
  // to avoid adjusting CurPtr before we are sure that '#' is followed
  // by a preprocessing directive. If it is not, then we fall back to
  // the tgtok::paste interpretation of '#', as sketched below.
  bool prepEatPreprocessorDirective(tgtok::TokKind Kind);
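
  // For example (a sketch): in "a # b" the '#' is the paste operator and is
  // lexed as tgtok::paste, while in "#ifdef NAME" the look-ahead matches the
  // "ifdef" word, so prepEatPreprocessorDirective(tgtok::Ifdef) advances
  // CurPtr past it and control passes to lexPreprocessor().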
  // The main "exit" point from the token parsing to the preprocessor.
  //
  // The method is called for CurPtr, when prepIsDirective() detects a
  // preprocessing directive. The first parameter matches the result of
  // prepIsDirective(), denoting the actual preprocessor directive to be
  // processed.
  //
  // If the preprocessing directive disables the tokens processing, e.g.:
  //   #ifdef NAME   // NAME is undefined
  // then lexPreprocessor() enters the lines-skipping mode.
  // In this mode, it does not parse any tokens, because the code under
  // the #ifdef may not even be correct tablegen code. The preprocessor
  // looks for lines containing other preprocessing directives, which
  // may be prepended with whitespaces and C-style comments. If the line
  // does not contain a preprocessing directive, it is skipped completely.
  // Otherwise, the preprocessing directive is processed by recursively
  // calling lexPreprocessor(). The processing of the encountered
  // preprocessing directives includes updating the preprocessing control
  // stack and adding new macros into the DefinedMacros set.
  //
  // The second parameter controls whether lexPreprocessor() is called from
  // LexToken() (true) or recursively from lexPreprocessor() (false).
  //
  // If ReturnNextLiveToken is true, the method returns the next
  // LEX token following the current directive or following the end
  // of the disabled preprocessing region corresponding to this directive.
  // If ReturnNextLiveToken is false, the method returns the first parameter,
  // unless there were errors encountered in the disabled preprocessing
  // region - in this case, it returns tgtok::Error.
  tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
                                 bool ReturnNextLiveToken = true);
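
  // For example (a sketch), with NAME undefined:
  //
  //   #ifdef NAME
  //   this line is skipped without being lexed as tablegen tokens
  //   #endif
  //   class Foo;
  //
  // With ReturnNextLiveToken set to true, the call for the #ifdef returns
  // the first live token after the #endif (the "class" keyword here).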
  // Worker method for lexPreprocessor() to skip lines after some
  // preprocessing directive up to the buffer end or to the directive
  // that re-enables token processing. The method returns true
  // upon processing the next directive that re-enables token
  // processing. False is returned if an error was encountered.
  //
  // Note that prepSkipRegion() calls lexPreprocessor() to process
  // encountered preprocessing directives. In this case, the second
  // parameter to lexPreprocessor() is set to false. Being passed
  // false ReturnNextLiveToken, lexPreprocessor() must never call
  // prepSkipRegion(). We assert this by passing ReturnNextLiveToken
  // to prepSkipRegion() and checking that it is never set to false.
  bool prepSkipRegion(bool MustNeverBeFalse);
  // Lex name of the macro after either #ifdef or #define. We could have used
  // LexIdentifier(), but it has special handling of the "include" word, which
  // could result in awkward diagnostic errors. Consider:
  //
  //   #ifdef include
  //   class ...
  //
  // LexIdentifier() will engage LexInclude(), which will complain about a
  // missing file with name "class". Instead, prepLexMacroName() will treat
  // "include" as a normal macro name.
  //
  // On entry, CurPtr points to the end of a preprocessing directive word.
  // The method allows for whitespaces between the preprocessing directive
  // and the macro name. The allowed whitespaces are ' ' and '\t'.
  //
  // If the first non-whitespace symbol after the preprocessing directive
  // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
  // the method updates TokStart to the position of the first non-whitespace
  // symbol, sets CurPtr to the position of the macro name's last symbol,
  // and returns a string reference to the macro name. Otherwise,
  // TokStart is set to the first non-whitespace symbol after the preprocessing
  // directive, and the method returns an empty string reference.
  //
  // In all cases, TokStart may be used to point to the word following
  // the preprocessing directive.
  StringRef prepLexMacroName();
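
  // For example (a sketch of the positions described above; MY_MACRO is an
  // arbitrary name):
  //
  //   #ifdef   MY_MACRO
  //            ^ - TokStart (first non-whitespace symbol after the directive)
  //                   ^ - CurPtr (last symbol of the macro name)
  //
  // The returned StringRef would be "MY_MACRO".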
  // Skip any whitespaces starting from CurPtr. The method is used
  // only in the lines-skipping mode to find the first non-whitespace
  // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n'
  // and '\r'. The method skips C-style comments as well, because
  // it is used to find the beginning of the preprocessing directive.
  // If we do not handle C-style comments, the following code would
  // result in incorrect detection of a preprocessing directive:
  //
  //   /*
  //   #ifdef NAME
  //   */
  //
  // As long as we skip C-style comments, the following code is correctly
  // recognized as a preprocessing directive:
  //
  //   /* first line comment
  //      second line comment */ #ifdef NAME
  //
  // The method returns true upon reaching the first non-whitespace symbol
  // or EOF; CurPtr is set to point to this symbol. The method returns false,
  // if an error occurred during skipping of a C-style comment.
  bool prepSkipLineBegin();
  // Skip any whitespaces or comments after a preprocessing directive.
  // The method returns true upon reaching either end of the line
  // or end of the file. If there is a multiline C-style comment
  // after the preprocessing directive, the method skips
  // the comment, so the final CurPtr may point to one of the next lines.
  // The method returns false, if an error occurred during skipping
  // of a C- or C++-style comment, or a non-whitespace symbol appears
  // after the preprocessing directive.
  //
  // The method may be called both during lines-skipping and tokens
  // processing. It actually verifies that only whitespaces and/or
  // comments follow a preprocessing directive.
  //
  // After the execution of this method, CurPtr points either to the new line
  // symbol, the buffer end, or the non-whitespace symbol following the
  // preprocessing directive (see the sketch below).
  bool prepSkipDirectiveEnd();
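
  // For example (a sketch):
  //
  //   #define NAME /* trailing comment */   <- ok, only a comment follows
  //   #define NAME foo                      <- error, unexpected symbol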
  // Skip all symbols to the end of the line/file.
  // The method adjusts CurPtr, so that it points to either new line
  // symbol in the current line or the buffer end.
  void prepSkipToLineEnd();
  // Return true, if the current preprocessor control stack is such that
  // we should allow the lexer to process the next token, false - otherwise.
  //
  // In particular, the method returns true, if all the #ifdef/#else
  // controls on the stack have their IsDefined member set to true.
  bool prepIsProcessingEnabled();
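
  // For example (a sketch), while lexing the body of:
  //
  //   #ifdef A   // A is defined
  //   #ifdef B   // B is undefined
  //
  // the control stack holds {Ifdef, true} and {Ifdef, false}, so the method
  // returns false and token processing is disabled.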
  // Report an error, if we reach EOF with a non-empty preprocessing control
  // stack. This means there is no matching #endif for the previous
  // #ifdef or #else.
  void prepReportPreprocessorStackError();
};

} // end namespace llvm

#endif