llvm/lib/TableGen/TGLexer.cpp

   1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // Implement the Lexer for TableGen.
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #include "TGLexer.h"
  14 #include "llvm/ADT/ArrayRef.h"
  15 #include "llvm/ADT/StringExtras.h"
  16 #include "llvm/ADT/StringSwitch.h"
  17 #include "llvm/ADT/Twine.h"
  18 #include "llvm/Config/config.h" // for strtoull()/strtoll() define
  19 #include "llvm/Support/Compiler.h"
  20 #include "llvm/Support/MemoryBuffer.h"
  21 #include "llvm/Support/SourceMgr.h"
  22 #include "llvm/TableGen/Error.h"
  23 #include <algorithm>
  24 #include <cerrno>
  25 #include <cstdint>
  26 #include <cstdio>
  27 #include <cstdlib>
  28 #include <cstring>
  29
  30 using namespace llvm;
  31
  32 namespace {
  33 // A list of supported preprocessing directives with their
  34 // internal token kinds and names.
  35 struct PreprocessorDir {
  36   tgtok::TokKind Kind;
  37   StringRef Word;
  38 };
  39 } // end anonymous namespace
  40
  41 /// Returns true if `C` is a valid character in an identifier. If `First` is
  42 /// true, returns true if `C` is a valid first character of an identifier,
  43 /// else returns true if `C` is a valid non-first character of an identifier.
  44 /// Identifiers match the following regular expression:
  45 ///   [a-zA-Z_][0-9a-zA-Z_]*
  46 static bool isValidIDChar(char C, bool First) {
  47   if (C == '_' || isAlpha(C))
  48     return true;
  49   return !First && isDigit(C);
  50 }
  51
  52 constexpr PreprocessorDir PreprocessorDirs[] = {{tgtok::Ifdef, "ifdef"},
  53                                                 {tgtok::Ifndef, "ifndef"},
  54                                                 {tgtok::Else, "else"},
  55                                                 {tgtok::Endif, "endif"},
  56                                                 {tgtok::Define, "define"}};
  57
  58 // Returns a pointer past the end of a valid macro name at the start of `Str`.
  59 // Valid macro names match the regular expression [a-zA-Z_][0-9a-zA-Z_]*.
  60 static const char *lexMacroName(StringRef Str) {
  61   assert(!Str.empty());
  62
  63   // Macro names start with [a-zA-Z_].
  64   const char *Next = Str.begin();
  65   if (!isValidIDChar(*Next, /*First=*/true))
  66     return Next;
  67   // Eat the first character of the name.
  68   ++Next;
  69
  70   // Match the rest of the identifier regex: [0-9a-zA-Z_]*
  71   const char *End = Str.end();
  72   while (Next != End && isValidIDChar(*Next, /*First=*/false))
  73     ++Next;
  74   return Next;
  75 }
  76
  77 TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) {
  78   CurBuffer = SrcMgr.getMainFileID();
  79   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  80   CurPtr = CurBuf.begin();
  81   TokStart = nullptr;
  82
  83   // Pretend that we enter the "top-level" include file.
  84   PrepIncludeStack.emplace_back();
  85
  86   // Add all macros defined on the command line to the DefinedMacros set.
  87   // Check invalid macro names and print fatal error if we find one.
  88   for (StringRef MacroName : Macros) {
  89     const char *End = lexMacroName(MacroName);
  90     if (End != MacroName.end())
  91       PrintFatalError("invalid macro name `" + MacroName +
  92                       "` specified on command line");
  93
  94     DefinedMacros.insert(MacroName);
  95   }
  96 }
  97
  98 SMLoc TGLexer::getLoc() const {
  99   return SMLoc::getFromPointer(TokStart);
 100 }
 101
 102 SMRange TGLexer::getLocRange() const {
 103   return {getLoc(), SMLoc::getFromPointer(CurPtr)};
 104 }
 105
 106 /// ReturnError - Set the error to the specified string at the specified
 107 /// location.  This is defined to always return tgtok::Error.
 108 tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) {
 109   PrintError(Loc, Msg);
 110   return tgtok::Error;
 111 }
 112
 113 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
 114   return ReturnError(SMLoc::getFromPointer(Loc), Msg);
 115 }
 116
 117 bool TGLexer::processEOF() {
 118   SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
 119   if (ParentIncludeLoc != SMLoc()) {
 120     // If prepExitInclude() detects a problem with the preprocessing
 121     // control stack, it will return false.  Pretend that we reached
 122     // the final EOF and stop lexing more tokens by returning false
 123     // to LexToken().
 124     if (!prepExitInclude(false))
 125       return false;
 126
 127     CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
 128     CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
 129     CurPtr = ParentIncludeLoc.getPointer();
 130     // Make sure TokStart points into the parent file's buffer.
 131     // LexToken() assigns to it before calling getNextChar(),
 132     // so it is pointing into the included file now.
 133     TokStart = CurPtr;
 134     return true;
 135   }
 136
 137   // Pretend that we exit the "top-level" include file.
 138   // Note that in case of an error (e.g. control stack imbalance)
 139   // the routine will issue a fatal error.
 140   prepExitInclude(true);
 141   return false;
 142 }
 143
 144 int TGLexer::getNextChar() {
 145   char CurChar = *CurPtr++;
 146   switch (CurChar) {
 147   default:
 148     return (unsigned char)CurChar;
 149
 150   case 0: {
 151     // A NUL character in the stream is either the end of the current buffer or
 152     // a spurious NUL in the file.  Disambiguate that here.
 153     if (CurPtr - 1 == CurBuf.end()) {
 154       --CurPtr; // Arrange for another call to return EOF again.
 155       return EOF;
 156     }
 157     PrintError(getLoc(),
 158                "NUL character is invalid in source; treated as space");
 159     return ' ';
 160   }
 161
 162   case '\n':
 163   case '\r':
 164     // Handle the newline character by ignoring it and incrementing the line
 165     // count.  However, be careful about 'dos style' files with \n\r in them.
 166     // Only treat a \n\r or \r\n as a single line.
 167     if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
 168         *CurPtr != CurChar)
 169       ++CurPtr;  // Eat the two char newline sequence.
 170     return '\n';
 171   }
 172 }
 173
 174 int TGLexer::peekNextChar(int Index) const {
 175   return *(CurPtr + Index);
 176 }
 177
 178 tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
 179   TokStart = CurPtr;
 180   // This always consumes at least one character.
 181   int CurChar = getNextChar();
 182
 183   switch (CurChar) {
 184   default:
 185     // Handle letters: [a-zA-Z_]
 186     if (isValidIDChar(CurChar, /*First=*/true))
 187       return LexIdentifier();
 188
 189     // Unknown character, emit an error.
 190     return ReturnError(TokStart, "unexpected character");
 191   case EOF:
 192     // Lex next token, if we just left an include file.
 193     // Note that leaving an include file means that the next
 194     // symbol is located at the end of the 'include "..."'
 195     // construct, so LexToken() is called with default
 196     // false parameter.
 197     if (processEOF())
 198       return LexToken();
 199
 200     // Return EOF denoting the end of lexing.
 201     return tgtok::Eof;
 202
 203   case ':': return tgtok::colon;
 204   case ';': return tgtok::semi;
 205   case ',': return tgtok::comma;
 206   case '<': return tgtok::less;
 207   case '>': return tgtok::greater;
 208   case ']': return tgtok::r_square;
 209   case '{': return tgtok::l_brace;
 210   case '}': return tgtok::r_brace;
 211   case '(': return tgtok::l_paren;
 212   case ')': return tgtok::r_paren;
 213   case '=': return tgtok::equal;
 214   case '?': return tgtok::question;
 215   case '#':
 216     if (FileOrLineStart) {
 217       tgtok::TokKind Kind = prepIsDirective();
 218       if (Kind != tgtok::Error)
 219         return lexPreprocessor(Kind);
 220     }
 221
 222     return tgtok::paste;
 223
 224   // The period is a separate case so we can recognize the "..."
 225   // range punctuator.
 226   case '.':
 227     if (peekNextChar(0) == '.') {
 228       ++CurPtr; // Eat second dot.
 229       if (peekNextChar(0) == '.') {
 230         ++CurPtr; // Eat third dot.
 231         return tgtok::dotdotdot;
 232       }
 233       return ReturnError(TokStart, "invalid '..' punctuation");
 234     }
 235     return tgtok::dot;
 236
 237   case '\r':
 238     PrintFatalError("getNextChar() must never return '\r'");
 239     return tgtok::Error;
 240
 241   case ' ':
 242   case '\t':
 243     // Ignore whitespace.
 244     return LexToken(FileOrLineStart);
 245   case '\n':
 246     // Ignore whitespace, and identify the new line.
 247     return LexToken(true);
 248   case '/':
 249     // If this is the start of a // comment, skip until the end of the line or
 250     // the end of the buffer.
 251     if (*CurPtr == '/')
 252       SkipBCPLComment();
 253     else if (*CurPtr == '*') {
 254       if (SkipCComment())
 255         return tgtok::Error;
 256     } else // Otherwise, this is an error.
 257       return ReturnError(TokStart, "unexpected character");
 258     return LexToken(FileOrLineStart);
 259   case '-': case '+':
 260   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
 261   case '7': case '8': case '9': {
 262     int NextChar = 0;
 263     if (isDigit(CurChar)) {
 264       // Allow identifiers to start with a number if it is followed by
 265       // an identifier.  This can happen with paste operations like
 266       // foo#8i.
 267       int i = 0;
 268       do {
 269         NextChar = peekNextChar(i++);
 270       } while (isDigit(NextChar));
 271
 272       if (NextChar == 'x' || NextChar == 'b') {
 273         // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
 274         // likely a number.
 275         int NextNextChar = peekNextChar(i);
 276         switch (NextNextChar) {
 277         default:
 278           break;
 279         case '0': case '1':
 280           if (NextChar == 'b')
 281             return LexNumber();
 282           [[fallthrough]];
 283         case '2': case '3': case '4': case '5':
 284         case '6': case '7': case '8': case '9':
 285         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 286         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 287           if (NextChar == 'x')
 288             return LexNumber();
 289           break;
 290         }
 291       }
 292     }
 293
 294     if (isValidIDChar(NextChar, /*First=*/true))
 295       return LexIdentifier();
 296
 297     return LexNumber();
 298   }
 299   case '"': return LexString();
 300   case '$': return LexVarName();
 301   case '[': return LexBracket();
 302   case '!': return LexExclaim();
 303   }
 304 }
 305
 306 /// LexString - Lex "[^"]*"
 307 tgtok::TokKind TGLexer::LexString() {
 308   const char *StrStart = CurPtr;
 309
 310   CurStrVal = "";
 311
 312   while (*CurPtr != '"') {
 313     // If we hit the end of the buffer, report an error.
 314     if (*CurPtr == 0 && CurPtr == CurBuf.end())
 315       return ReturnError(StrStart, "end of file in string literal");
 316
 317     if (*CurPtr == '\n' || *CurPtr == '\r')
 318       return ReturnError(StrStart, "end of line in string literal");
 319
 320     if (*CurPtr != '\\') {
 321       CurStrVal += *CurPtr++;
 322       continue;
 323     }
 324
 325     ++CurPtr;
 326
 327     switch (*CurPtr) {
 328     case '\\': case '\'': case '"':
 329       // These turn into their literal character.
 330       CurStrVal += *CurPtr++;
 331       break;
 332     case 't':
 333       CurStrVal += '\t';
 334       ++CurPtr;
 335       break;
 336     case 'n':
 337       CurStrVal += '\n';
 338       ++CurPtr;
 339       break;
 340
 341     case '\n':
 342     case '\r':
 343       return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
 344
 345     // If we hit the end of the buffer, report an error.
 346     case '\0':
 347       if (CurPtr == CurBuf.end())
 348         return ReturnError(StrStart, "end of file in string literal");
 349       [[fallthrough]];
 350     default:
 351       return ReturnError(CurPtr, "invalid escape in string literal");
 352     }
 353   }
 354
 355   ++CurPtr;
 356   return tgtok::StrVal;
 357 }
 358
 359 tgtok::TokKind TGLexer::LexVarName() {
 360   if (!isValidIDChar(CurPtr[0], /*First=*/true))
 361     return ReturnError(TokStart, "invalid variable name");
 362
 363   // Otherwise, we're ok, consume the rest of the characters.
 364   const char *VarNameStart = CurPtr++;
 365
 366   while (isValidIDChar(*CurPtr, /*First=*/false))
 367     ++CurPtr;
 368
 369   CurStrVal.assign(VarNameStart, CurPtr);
 370   return tgtok::VarName;
 371 }
 372
 373 tgtok::TokKind TGLexer::LexIdentifier() {
 374   // The first letter is [a-zA-Z_].
 375   const char *IdentStart = TokStart;
 376
 377   // Match the rest of the identifier regex: [0-9a-zA-Z_]*
 378   while (isValidIDChar(*CurPtr, /*First=*/false))
 379     ++CurPtr;
 380
 381   // Check to see if this identifier is a reserved keyword.
 382   StringRef Str(IdentStart, CurPtr-IdentStart);
 383
 384   tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
 385                             .Case("int", tgtok::Int)
 386                             .Case("bit", tgtok::Bit)
 387                             .Case("bits", tgtok::Bits)
 388                             .Case("string", tgtok::String)
 389                             .Case("list", tgtok::List)
 390                             .Case("code", tgtok::Code)
 391                             .Case("dag", tgtok::Dag)
 392                             .Case("class", tgtok::Class)
 393                             .Case("def", tgtok::Def)
 394                             .Case("true", tgtok::TrueVal)
 395                             .Case("false", tgtok::FalseVal)
 396                             .Case("foreach", tgtok::Foreach)
 397                             .Case("defm", tgtok::Defm)
 398                             .Case("defset", tgtok::Defset)
 399                             .Case("deftype", tgtok::Deftype)
 400                             .Case("multiclass", tgtok::MultiClass)
 401                             .Case("field", tgtok::Field)
 402                             .Case("let", tgtok::Let)
 403                             .Case("in", tgtok::In)
 404                             .Case("defvar", tgtok::Defvar)
 405                             .Case("include", tgtok::Include)
 406                             .Case("if", tgtok::If)
 407                             .Case("then", tgtok::Then)
 408                             .Case("else", tgtok::ElseKW)
 409                             .Case("assert", tgtok::Assert)
 410                             .Case("dump", tgtok::Dump)
 411                             .Default(tgtok::Id);
 412
 413   // A couple of tokens require special processing.
 414   switch (Kind) {
 415     case tgtok::Include:
 416       if (LexInclude()) return tgtok::Error;
 417       return Lex();
 418     case tgtok::Id:
 419       CurStrVal.assign(Str.begin(), Str.end());
 420       break;
 421     default:
 422       break;
 423   }
 424
 425   return Kind;
 426 }
 427
 428 /// LexInclude - We just read the "include" token.  Get the string token that
 429 /// comes next and enter the include.
 430 bool TGLexer::LexInclude() {
 431   // The token after the include must be a string.
 432   tgtok::TokKind Tok = LexToken();
 433   if (Tok == tgtok::Error) return true;
 434   if (Tok != tgtok::StrVal) {
 435     PrintError(getLoc(), "expected filename after include");
 436     return true;
 437   }
 438
 439   // Get the string.
 440   std::string Filename = CurStrVal;
 441   std::string IncludedFile;
 442
 443   CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
 444                                     IncludedFile);
 445   if (!CurBuffer) {
 446     PrintError(getLoc(), "could not find include file '" + Filename + "'");
 447     return true;
 448   }
 449
 450   Dependencies.insert(IncludedFile);
 451   // Save the line number and lex buffer of the includer.
 452   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
 453   CurPtr = CurBuf.begin();
 454
 455   PrepIncludeStack.emplace_back();
 456   return false;
 457 }
 458
 459 /// SkipBCPLComment - Skip over the comment by finding the next CR or LF.
 460 /// Or we may end up at the end of the buffer.
 461 void TGLexer::SkipBCPLComment() {
 462   ++CurPtr;  // skip the second slash.
 463   auto EOLPos = CurBuf.find_first_of("\r\n", CurPtr - CurBuf.data());
 464   CurPtr = (EOLPos == StringRef::npos) ? CurBuf.end() : CurBuf.data() + EOLPos;
 465 }
 466
 467 /// SkipCComment - This skips C-style /**/ comments.  The only difference from C
 468 /// is that we allow nesting.
 469 bool TGLexer::SkipCComment() {
 470   ++CurPtr;  // skip the star.
 471   unsigned CommentDepth = 1;
 472
 473   while (true) {
 474     int CurChar = getNextChar();
 475     switch (CurChar) {
 476     case EOF:
 477       PrintError(TokStart, "unterminated comment");
 478       return true;
 479     case '*':
 480       // End of the comment?
 481       if (CurPtr[0] != '/') break;
 482
 483       ++CurPtr;   // End the */.
 484       if (--CommentDepth == 0)
 485         return false;
 486       break;
 487     case '/':
 488       // Start of a nested comment?
 489       if (CurPtr[0] != '*') break;
 490       ++CurPtr;
 491       ++CommentDepth;
 492       break;
 493     }
 494   }
 495 }
 496
 497 /// LexNumber - Lex:
 498 ///    [-+]?[0-9]+
 499 ///    0x[0-9a-fA-F]+
 500 ///    0b[01]+
 501 tgtok::TokKind TGLexer::LexNumber() {
 502   unsigned Base = 0;
 503   const char *NumStart;
 504
 505   // Check if it's a hex or a binary value.
 506   if (CurPtr[-1] == '0') {
 507     NumStart = CurPtr + 1;
 508     if (CurPtr[0] == 'x') {
 509       Base = 16;
 510       do
 511         ++CurPtr;
 512       while (isHexDigit(CurPtr[0]));
 513     } else if (CurPtr[0] == 'b') {
 514       Base = 2;
 515       do
 516         ++CurPtr;
 517       while (CurPtr[0] == '0' || CurPtr[0] == '1');
 518     }
 519   }
 520
 521   // For a hex or binary value, we always convert it to an unsigned value.
 522   bool IsMinus = false;
 523
 524   // Check if it's a decimal value.
 525   if (Base == 0) {
 526     // Check for a sign without a digit.
 527     if (!isDigit(CurPtr[0])) {
 528       if (CurPtr[-1] == '-')
 529         return tgtok::minus;
 530       else if (CurPtr[-1] == '+')
 531         return tgtok::plus;
 532     }
 533
 534     Base = 10;
 535     NumStart = TokStart;
 536     IsMinus = CurPtr[-1] == '-';
 537
 538     while (isDigit(CurPtr[0]))
 539       ++CurPtr;
 540   }
 541
 542   // Requires at least one digit.
 543   if (CurPtr == NumStart)
 544     return ReturnError(TokStart, "invalid number");
 545
 546   errno = 0;
 547   if (IsMinus)
 548     CurIntVal = strtoll(NumStart, nullptr, Base);
 549   else
 550     CurIntVal = strtoull(NumStart, nullptr, Base);
 551
 552   if (errno == EINVAL)
 553     return ReturnError(TokStart, "invalid number");
 554   if (errno == ERANGE)
 555     return ReturnError(TokStart, "number out of range");
 556
 557   return Base == 2 ? tgtok::BinaryIntVal : tgtok::IntVal;
 558 }
 559
 560 /// LexBracket - We just read '['.  If this is a code block, return it,
 561 /// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
 562 tgtok::TokKind TGLexer::LexBracket() {
 563   if (CurPtr[0] != '{')
 564     return tgtok::l_square;
 565   ++CurPtr;
 566   const char *CodeStart = CurPtr;
 567   while (true) {
 568     int Char = getNextChar();
 569     if (Char == EOF) break;
 570
 571     if (Char != '}') continue;
 572
 573     Char = getNextChar();
 574     if (Char == EOF) break;
 575     if (Char == ']') {
 576       CurStrVal.assign(CodeStart, CurPtr-2);
 577       return tgtok::CodeFragment;
 578     }
 579   }
 580
 581   return ReturnError(CodeStart - 2, "unterminated code block");
 582 }
 583
 584 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
 585 tgtok::TokKind TGLexer::LexExclaim() {
 586   if (!isAlpha(*CurPtr))
 587     return ReturnError(CurPtr - 1, "invalid \"!operator\"");
 588
 589   const char *Start = CurPtr++;
 590   while (isAlpha(*CurPtr))
 591     ++CurPtr;
 592
 593   // Check to see which operator this is.
 594   tgtok::TokKind Kind =
 595       StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
 596           .Case("eq", tgtok::XEq)
 597           .Case("ne", tgtok::XNe)
 598           .Case("le", tgtok::XLe)
 599           .Case("lt", tgtok::XLt)
 600           .Case("ge", tgtok::XGe)
 601           .Case("gt", tgtok::XGt)
 602           .Case("if", tgtok::XIf)
 603           .Case("cond", tgtok::XCond)
 604           .Case("isa", tgtok::XIsA)
 605           .Case("head", tgtok::XHead)
 606           .Case("tail", tgtok::XTail)
 607           .Case("size", tgtok::XSize)
 608           .Case("con", tgtok::XConcat)
 609           .Case("dag", tgtok::XDag)
 610           .Case("add", tgtok::XADD)
 611           .Case("sub", tgtok::XSUB)
 612           .Case("mul", tgtok::XMUL)
 613           .Case("div", tgtok::XDIV)
 614           .Case("not", tgtok::XNOT)
 615           .Case("logtwo", tgtok::XLOG2)
 616           .Case("and", tgtok::XAND)
 617           .Case("or", tgtok::XOR)
 618           .Case("xor", tgtok::XXOR)
 619           .Case("shl", tgtok::XSHL)
 620           .Case("sra", tgtok::XSRA)
 621           .Case("srl", tgtok::XSRL)
 622           .Case("cast", tgtok::XCast)
 623           .Case("empty", tgtok::XEmpty)
 624           .Case("subst", tgtok::XSubst)
 625           .Case("foldl", tgtok::XFoldl)
 626           .Case("foreach", tgtok::XForEach)
 627           .Case("filter", tgtok::XFilter)
 628           .Case("listconcat", tgtok::XListConcat)
 629           .Case("listflatten", tgtok::XListFlatten)
 630           .Case("listsplat", tgtok::XListSplat)
 631           .Case("listremove", tgtok::XListRemove)
 632           .Case("range", tgtok::XRange)
 633           .Case("strconcat", tgtok::XStrConcat)
 634           .Case("initialized", tgtok::XInitialized)
 635           .Case("interleave", tgtok::XInterleave)
 636           .Case("substr", tgtok::XSubstr)
 637           .Case("find", tgtok::XFind)
 638           .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated.
 639           .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated.
 640           .Case("getdagarg", tgtok::XGetDagArg)
 641           .Case("getdagname", tgtok::XGetDagName)
 642           .Case("setdagarg", tgtok::XSetDagArg)
 643           .Case("setdagname", tgtok::XSetDagName)
 644           .Case("exists", tgtok::XExists)
 645           .Case("tolower", tgtok::XToLower)
 646           .Case("toupper", tgtok::XToUpper)
 647           .Case("repr", tgtok::XRepr)
 648           .Default(tgtok::Error);
 649
 650   return Kind != tgtok::Error ? Kind
 651                               : ReturnError(Start - 1, "unknown operator");
 652 }
 653
 654 bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
 655   // Report an error, if preprocessor control stack for the current
 656   // file is not empty.
 657   if (!PrepIncludeStack.back().empty()) {
 658     prepReportPreprocessorStackError();
 659
 660     return false;
 661   }
 662
 663   // Pop the preprocessing controls from the include stack.
 664   PrepIncludeStack.pop_back();
 665
 666   if (IncludeStackMustBeEmpty) {
 667     if (!PrepIncludeStack.empty())
 668       PrintFatalError("preprocessor include stack is not empty");
 669   } else {
 670     if (PrepIncludeStack.empty())
 671       PrintFatalError("preprocessor include stack is empty");
 672   }
 673
 674   return true;
 675 }
 676
 677 tgtok::TokKind TGLexer::prepIsDirective() const {
 678   for (const auto [Kind, Word] : PreprocessorDirs) {
 679     if (StringRef(CurPtr, Word.size()) != Word)
 680       continue;
 681     int NextChar = peekNextChar(Word.size());
 682
 683     // Check for whitespace after the directive. If there is no whitespace,
 684     // then we do not recognize it as a preprocessing directive.
 685
 686     // New line and EOF may follow only #else/#endif. It will be reported
 687     // as an error for #ifdef/#define after the call to prepLexMacroName().
 688     if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF ||
 689         NextChar == '\n' ||
 690         // It looks like TableGen does not support '\r' as the actual
 691         // carriage return, e.g. getNextChar() treats a single '\r'
 692         // as '\n'.  So we do the same here.
 693         NextChar == '\r')
 694       return Kind;
 695
 696     // Allow comments after some directives, e.g.:
 697     //     #else// OR #else/**/
 698     //     #endif// OR #endif/**/
 699     //
 700     // Note that we do allow comments after #ifdef/#define here, e.g.
 701     //     #ifdef/**/ AND #ifdef//
 702     //     #define/**/ AND #define//
 703     //
 704     // These cases will be reported as incorrect after calling
 705     // prepLexMacroName().  We could have supported C-style comments
 706     // after #ifdef/#define, but this would complicate the code
 707     // for little benefit.
 708     if (NextChar == '/') {
 709       NextChar = peekNextChar(Word.size() + 1);
 710
 711       if (NextChar == '*' || NextChar == '/')
 712         return Kind;
 713
 714       // Pretend that we do not recognize the directive.
 715     }
 716   }
 717
 718   return tgtok::Error;
 719 }
 720
 721 bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
 722   TokStart = CurPtr;
 723
 724   for (const auto [PKind, PWord] : PreprocessorDirs)
 725     if (PKind == Kind) {
 726       // Advance CurPtr to the end of the preprocessing word.
 727       CurPtr += PWord.size();
 728       return true;
 729     }
 730
 731   PrintFatalError("unsupported preprocessing token in "
 732                   "prepEatPreprocessorDirective()");
 733   return false;
 734 }
 735
 736 tgtok::TokKind TGLexer::lexPreprocessor(tgtok::TokKind Kind,
 737                                         bool ReturnNextLiveToken) {
 738   // We must be looking at a preprocessing directive.  Eat it!
 739   if (!prepEatPreprocessorDirective(Kind))
 740     PrintFatalError("lexPreprocessor() called for unknown "
 741                     "preprocessor directive");
 742
 743   if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) {
 744     StringRef MacroName = prepLexMacroName();
 745     StringRef IfTokName = Kind == tgtok::Ifdef ? "#ifdef" : "#ifndef";
 746     if (MacroName.empty())
 747       return ReturnError(TokStart, "expected macro name after " + IfTokName);
 748
 749     bool MacroIsDefined = DefinedMacros.count(MacroName) != 0;
 750
 751     // Canonicalize ifndef's MacroIsDefined to its ifdef equivalent.
 752     if (Kind == tgtok::Ifndef)
 753       MacroIsDefined = !MacroIsDefined;
 754
 755     // Regardless of whether we are processing tokens or not,
 756     // we put the #ifdef control on stack.
 757     // Note that MacroIsDefined has been canonicalized against ifdef.
 758     PrepIncludeStack.back().push_back(
 759         {tgtok::Ifdef, MacroIsDefined, SMLoc::getFromPointer(TokStart)});
 760
 761     if (!prepSkipDirectiveEnd())
 762       return ReturnError(CurPtr, "only comments are supported after " +
 763                                      IfTokName + " NAME");
 764
 765     // If we were not processing tokens before this #ifdef,
 766     // then just return back to the lines skipping code.
 767     if (!ReturnNextLiveToken)
 768       return Kind;
 769
 770     // If we were processing tokens before this #ifdef,
 771     // and the macro is defined, then just return the next token.
 772     if (MacroIsDefined)
 773       return LexToken();
 774
 775     // We were processing tokens before this #ifdef, and the macro
 776     // is not defined, so we have to start skipping the lines.
 777     // If the skipping is successful, it will return the token following
 778     // either #else or #endif corresponding to this #ifdef.
 779     if (prepSkipRegion(ReturnNextLiveToken))
 780       return LexToken();
 781
 782     return tgtok::Error;
 783   } else if (Kind == tgtok::Else) {
 784     // Check if this #else is correct before calling prepSkipDirectiveEnd(),
 785     // which will move CurPtr away from the beginning of #else.
 786     if (PrepIncludeStack.back().empty())
 787       return ReturnError(TokStart, "#else without #ifdef or #ifndef");
 788
 789     PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back().back();
 790
 791     if (IfdefEntry.Kind != tgtok::Ifdef) {
 792       PrintError(TokStart, "double #else");
 793       return ReturnError(IfdefEntry.SrcPos, "previous #else is here");
 794     }
 795
 796     // Replace the corresponding #ifdef's control with its negation
 797     // on the control stack.
 798     PrepIncludeStack.back().back() = {Kind, !IfdefEntry.IsDefined,
 799                                       SMLoc::getFromPointer(TokStart)};
 800
 801     if (!prepSkipDirectiveEnd())
 802       return ReturnError(CurPtr, "only comments are supported after #else");
 803
 804     // If we were processing tokens before this #else,
 805     // we have to start skipping lines until the matching #endif.
 806     if (ReturnNextLiveToken) {
 807       if (prepSkipRegion(ReturnNextLiveToken))
 808         return LexToken();
 809
 810       return tgtok::Error;
 811     }
 812
 813     // Return to the lines skipping code.
 814     return Kind;
 815   } else if (Kind == tgtok::Endif) {
 816     // Check if this #endif is correct before calling prepSkipDirectiveEnd(),
 817     // which will move CurPtr away from the beginning of #endif.
 818     if (PrepIncludeStack.back().empty())
 819       return ReturnError(TokStart, "#endif without #ifdef");
 820
 821     auto &IfdefOrElseEntry = PrepIncludeStack.back().back();
 822
 823     if (IfdefOrElseEntry.Kind != tgtok::Ifdef &&
 824         IfdefOrElseEntry.Kind != tgtok::Else) {
 825       PrintFatalError("invalid preprocessor control on the stack");
 826       return tgtok::Error;
 827     }
 828
 829     if (!prepSkipDirectiveEnd())
 830       return ReturnError(CurPtr, "only comments are supported after #endif");
 831
 832     PrepIncludeStack.back().pop_back();
 833
 834     // If we were processing tokens before this #endif, then
 835     // we should continue it.
 836     if (ReturnNextLiveToken) {
 837       return LexToken();
 838     }
 839
 840     // Return to the lines skipping code.
 841     return Kind;
 842   } else if (Kind == tgtok::Define) {
 843     StringRef MacroName = prepLexMacroName();
 844     if (MacroName.empty())
 845       return ReturnError(TokStart, "expected macro name after #define");
 846
 847     if (!DefinedMacros.insert(MacroName).second)
 848       PrintWarning(getLoc(),
 849                    "duplicate definition of macro: " + Twine(MacroName));
 850
 851     if (!prepSkipDirectiveEnd())
 852       return ReturnError(CurPtr,
 853                          "only comments are supported after #define NAME");
 854
 855     if (!ReturnNextLiveToken) {
 856       PrintFatalError("#define must be ignored during the lines skipping");
 857       return tgtok::Error;
 858     }
 859
 860     return LexToken();
 861   }
 862
 863   PrintFatalError("preprocessing directive is not supported");
 864   return tgtok::Error;
 865 }
 866
 867 bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
 868   if (!MustNeverBeFalse)
 869     PrintFatalError("invalid recursion.");
 870
 871   do {
 872     // Skip all symbols to the line end.
 873     while (*CurPtr != '\n')
 874       ++CurPtr;
 875
 876     // Find the first non-whitespace symbol in the next line(s).
 877     if (!prepSkipLineBegin())
 878       return false;
 879
 880     // If the first non-blank/comment symbol on the line is '#',
 881     // it may be a start of preprocessing directive.
 882     //
 883     // If it is not '#' just go to the next line.
 884     if (*CurPtr == '#')
 885       ++CurPtr;
 886     else
 887       continue;
 888
 889     tgtok::TokKind Kind = prepIsDirective();
 890
 891     // If we did not find a preprocessing directive or it is #define,
 892     // then just skip to the next line.  We do not have to do anything
 893     // for #define in the line-skipping mode.
 894     if (Kind == tgtok::Error || Kind == tgtok::Define)
 895       continue;
 896
 897     tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, false);
 898
 899     // If lexPreprocessor() encountered an error during lexing this
 900     // preprocessor idiom, then return false to the calling lexPreprocessor().
 901     // This will force tgtok::Error to be returned to the tokens processing.
 902     if (ProcessedKind == tgtok::Error)
 903       return false;
 904
 905     if (Kind != ProcessedKind)
 906       PrintFatalError("prepIsDirective() and lexPreprocessor() "
 907                       "returned different token kinds");
 908
 909     // If this preprocessing directive enables tokens processing,
 910     // then return to the lexPreprocessor() and get to the next token.
 911     // We can move from line-skipping mode to processing tokens only
 912     // due to #else or #endif.
 913     if (prepIsProcessingEnabled()) {
 914       if (Kind != tgtok::Else && Kind != tgtok::Endif) {
 915         PrintFatalError("tokens processing was enabled by an unexpected "
 916                         "preprocessing directive");
 917         return false;
 918       }
 919
 920       return true;
 921     }
 922   } while (CurPtr != CurBuf.end());
 923
 924   // We have reached the end of the file, but never left the lines-skipping
 925   // mode.  This means there is no matching #endif.
 926   prepReportPreprocessorStackError();
 927   return false;
 928 }
 929
 930 StringRef TGLexer::prepLexMacroName() {
 931   // Skip whitespaces between the preprocessing directive and the macro name.
 932   while (*CurPtr == ' ' || *CurPtr == '\t')
 933     ++CurPtr;
 934
 935   TokStart = CurPtr;
 936   CurPtr = lexMacroName(StringRef(CurPtr, CurBuf.end() - CurPtr));
 937   return StringRef(TokStart, CurPtr - TokStart);
 938 }
 939
 940 bool TGLexer::prepSkipLineBegin() {
 941   while (CurPtr != CurBuf.end()) {
 942     switch (*CurPtr) {
 943     case ' ':
 944     case '\t':
 945     case '\n':
 946     case '\r':
 947       break;
 948
 949     case '/': {
 950       int NextChar = peekNextChar(1);
 951       if (NextChar == '*') {
 952         // Skip C-style comment.
 953         // Note that we do not care about skipping the C++-style comments.
 954         // If the line contains "//", it may not contain any processable
 955         // preprocessing directive.  Just return CurPtr pointing to
 956         // the first '/' in this case.  We also do not care about
 957         // incorrect symbols after the first '/' - we are in lines-skipping
 958         // mode, so incorrect code is allowed to some extent.
 959
 960         // Set TokStart to the beginning of the comment to enable proper
 961         // diagnostic printing in case of error in SkipCComment().
 962         TokStart = CurPtr;
 963
 964         // CurPtr must point to '*' before call to SkipCComment().
 965         ++CurPtr;
 966         if (SkipCComment())
 967           return false;
 968       } else {
 969         // CurPtr points to the non-whitespace '/'.
 970         return true;
 971       }
 972
 973       // We must not increment CurPtr after the comment was lexed.
 974       continue;
 975     }
 976
 977     default:
 978       return true;
 979     }
 980
 981     ++CurPtr;
 982   }
 983
 984   // We have reached the end of the file.  Return to the lines skipping
 985   // code, and allow it to handle the EOF as needed.
 986   return true;
 987 }
 988
 989 bool TGLexer::prepSkipDirectiveEnd() {
 990   while (CurPtr != CurBuf.end()) {
 991     switch (*CurPtr) {
 992     case ' ':
 993     case '\t':
 994       break;
 995
 996     case '\n':
 997     case '\r':
 998       return true;
 999
1000     case '/': {
1001       int NextChar = peekNextChar(1);
1002       if (NextChar == '/') {
1003         // Skip C++-style comment.
1004         // We may just return true now, but let's skip to the line/buffer end
1005         // to simplify the method specification.
1006         ++CurPtr;
1007         SkipBCPLComment();
1008       } else if (NextChar == '*') {
1009         // When we are skipping C-style comment at the end of a preprocessing
1010         // directive, we can skip several lines.  If any meaningful TD token
1011         // follows the end of the C-style comment on the same line, it will
1012         // be considered as an invalid usage of TD token.
1013         // For example, we want to forbid usages like this one:
1014         //     #define MACRO class Class {}
1015         // But with C-style comments we also disallow the following:
1016         //     #define MACRO /* This macro is used
1017         //                      to ... */ class Class {}
1018         // One can argue that this should be allowed, but it does not seem
1019         // to be worth of the complication.  Moreover, this matches
1020         // the C preprocessor behavior.
1021
1022         // Set TokStart to the beginning of the comment to enable proper
1023         // diagnostic printer in case of error in SkipCComment().
1024         TokStart = CurPtr;
1025         ++CurPtr;
1026         if (SkipCComment())
1027           return false;
1028       } else {
1029         TokStart = CurPtr;
1030         PrintError(CurPtr, "unexpected character");
1031         return false;
1032       }
1033
1034       // We must not increment CurPtr after the comment was lexed.
1035       continue;
1036     }
1037
1038     default:
1039       // Do not allow any non-whitespaces after the directive.
1040       TokStart = CurPtr;
1041       return false;
1042     }
1043
1044     ++CurPtr;
1045   }
1046
1047   return true;
1048 }
1049
1050 bool TGLexer::prepIsProcessingEnabled() {
1051   return all_of(PrepIncludeStack.back(),
1052                 [](const PreprocessorControlDesc &I) { return I.IsDefined; });
1053 }
1054
1055 void TGLexer::prepReportPreprocessorStackError() {
1056   if (PrepIncludeStack.back().empty())
1057     PrintFatalError("prepReportPreprocessorStackError() called with "
1058                     "empty control stack");
1059
1060   auto &PrepControl = PrepIncludeStack.back().back();
1061   PrintError(CurBuf.end(), "reached EOF without matching #endif");
1062   PrintError(PrepControl.SrcPos, "the latest preprocessor control is here");
1063
1064   TokStart = CurPtr;
1065 }