llvm/lib/TableGen/TGLexer.cpp

   1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // Implement the Lexer for TableGen.
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #include "TGLexer.h"
  14 #include "llvm/ADT/ArrayRef.h"
  15 #include "llvm/ADT/StringSwitch.h"
  16 #include "llvm/ADT/Twine.h"
  17 #include "llvm/Config/config.h" // for strtoull()/strtoll() define
  18 #include "llvm/Support/Compiler.h"
  19 #include "llvm/Support/MemoryBuffer.h"
  20 #include "llvm/Support/SourceMgr.h"
  21 #include "llvm/TableGen/Error.h"
  22 #include <algorithm>
  23 #include <cctype>
  24 #include <cerrno>
  25 #include <cstdint>
  26 #include <cstdio>
  27 #include <cstdlib>
  28 #include <cstring>
  29
  30 using namespace llvm;
  31
  32 namespace {
  33 // A list of supported preprocessing directives with their
  34 // internal token kinds and names.
  35 struct {
  36   tgtok::TokKind Kind;
  37   const char *Word;
  38 } PreprocessorDirs[] = {
  39   { tgtok::Ifdef, "ifdef" },
  40   { tgtok::Ifndef, "ifndef" },
  41   { tgtok::Else, "else" },
  42   { tgtok::Endif, "endif" },
  43   { tgtok::Define, "define" }
  44 };
  45 } // end anonymous namespace
  46
  47 TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) {
  48   CurBuffer = SrcMgr.getMainFileID();
  49   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  50   CurPtr = CurBuf.begin();
  51   TokStart = nullptr;
  52
  53   // Pretend that we enter the "top-level" include file.
  54   PrepIncludeStack.push_back(
  55       std::make_unique<std::vector<PreprocessorControlDesc>>());
  56
  57   // Put all macros defined in the command line into the DefinedMacros set.
  58   std::for_each(Macros.begin(), Macros.end(),
  59                 [this](const std::string &MacroName) {
  60                   DefinedMacros.insert(MacroName);
  61                 });
  62 }
  63
  64 SMLoc TGLexer::getLoc() const {
  65   return SMLoc::getFromPointer(TokStart);
  66 }
  67
  68 /// ReturnError - Set the error to the specified string at the specified
  69 /// location.  This is defined to always return tgtok::Error.
  70 tgtok::TokKind TGLexer::ReturnError(SMLoc Loc, const Twine &Msg) {
  71   PrintError(Loc, Msg);
  72   return tgtok::Error;
  73 }
  74
  75 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
  76   return ReturnError(SMLoc::getFromPointer(Loc), Msg);
  77 }
  78
  79 bool TGLexer::processEOF() {
  80   SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
  81   if (ParentIncludeLoc != SMLoc()) {
  82     // If prepExitInclude() detects a problem with the preprocessing
  83     // control stack, it will return false.  Pretend that we reached
  84     // the final EOF and stop lexing more tokens by returning false
  85     // to LexToken().
  86     if (!prepExitInclude(false))
  87       return false;
  88
  89     CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
  90     CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  91     CurPtr = ParentIncludeLoc.getPointer();
  92     // Make sure TokStart points into the parent file's buffer.
  93     // LexToken() assigns to it before calling getNextChar(),
  94     // so it is pointing into the included file now.
  95     TokStart = CurPtr;
  96     return true;
  97   }
  98
  99   // Pretend that we exit the "top-level" include file.
 100   // Note that in case of an error (e.g. control stack imbalance)
 101   // the routine will issue a fatal error.
 102   prepExitInclude(true);
 103   return false;
 104 }
 105
 106 int TGLexer::getNextChar() {
 107   char CurChar = *CurPtr++;
 108   switch (CurChar) {
 109   default:
 110     return (unsigned char)CurChar;
 111
 112   case 0: {
 113     // A NUL character in the stream is either the end of the current buffer or
 114     // a spurious NUL in the file.  Disambiguate that here.
 115     if (CurPtr - 1 == CurBuf.end()) {
 116       --CurPtr; // Arrange for another call to return EOF again.
 117       return EOF;
 118     }
 119     PrintError(getLoc(),
 120                "NUL character is invalid in source; treated as space");
 121     return ' ';
 122   }
 123
 124   case '\n':
 125   case '\r':
 126     // Handle the newline character by ignoring it and incrementing the line
 127     // count.  However, be careful about 'dos style' files with \n\r in them.
 128     // Only treat a \n\r or \r\n as a single line.
 129     if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
 130         *CurPtr != CurChar)
 131       ++CurPtr;  // Eat the two char newline sequence.
 132     return '\n';
 133   }
 134 }
 135
 136 int TGLexer::peekNextChar(int Index) const {
 137   return *(CurPtr + Index);
 138 }
 139
 140 tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
 141   TokStart = CurPtr;
 142   // This always consumes at least one character.
 143   int CurChar = getNextChar();
 144
 145   switch (CurChar) {
 146   default:
 147     // Handle letters: [a-zA-Z_]
 148     if (isalpha(CurChar) || CurChar == '_')
 149       return LexIdentifier();
 150
 151     // Unknown character, emit an error.
 152     return ReturnError(TokStart, "Unexpected character");
 153   case EOF:
 154     // Lex next token, if we just left an include file.
 155     // Note that leaving an include file means that the next
 156     // symbol is located at the end of the 'include "..."'
 157     // construct, so LexToken() is called with default
 158     // false parameter.
 159     if (processEOF())
 160       return LexToken();
 161
 162     // Return EOF denoting the end of lexing.
 163     return tgtok::Eof;
 164
 165   case ':': return tgtok::colon;
 166   case ';': return tgtok::semi;
 167   case ',': return tgtok::comma;
 168   case '<': return tgtok::less;
 169   case '>': return tgtok::greater;
 170   case ']': return tgtok::r_square;
 171   case '{': return tgtok::l_brace;
 172   case '}': return tgtok::r_brace;
 173   case '(': return tgtok::l_paren;
 174   case ')': return tgtok::r_paren;
 175   case '=': return tgtok::equal;
 176   case '?': return tgtok::question;
 177   case '#':
 178     if (FileOrLineStart) {
 179       tgtok::TokKind Kind = prepIsDirective();
 180       if (Kind != tgtok::Error)
 181         return lexPreprocessor(Kind);
 182     }
 183
 184     return tgtok::paste;
 185
 186   // The period is a separate case so we can recognize the "..."
 187   // range punctuator.
 188   case '.':
 189     if (peekNextChar(0) == '.') {
 190       ++CurPtr; // Eat second dot.
 191       if (peekNextChar(0) == '.') {
 192         ++CurPtr; // Eat third dot.
 193         return tgtok::dotdotdot;
 194       }
 195       return ReturnError(TokStart, "Invalid '..' punctuation");
 196     }
 197     return tgtok::dot;
 198
 199   case '\r':
 200     PrintFatalError("getNextChar() must never return '\r'");
 201     return tgtok::Error;
 202
 203   case ' ':
 204   case '\t':
 205     // Ignore whitespace.
 206     return LexToken(FileOrLineStart);
 207   case '\n':
 208     // Ignore whitespace, and identify the new line.
 209     return LexToken(true);
 210   case '/':
 211     // If this is the start of a // comment, skip until the end of the line or
 212     // the end of the buffer.
 213     if (*CurPtr == '/')
 214       SkipBCPLComment();
 215     else if (*CurPtr == '*') {
 216       if (SkipCComment())
 217         return tgtok::Error;
 218     } else // Otherwise, this is an error.
 219       return ReturnError(TokStart, "Unexpected character");
 220     return LexToken(FileOrLineStart);
 221   case '-': case '+':
 222   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
 223   case '7': case '8': case '9': {
 224     int NextChar = 0;
 225     if (isdigit(CurChar)) {
 226       // Allow identifiers to start with a number if it is followed by
 227       // an identifier.  This can happen with paste operations like
 228       // foo#8i.
 229       int i = 0;
 230       do {
 231         NextChar = peekNextChar(i++);
 232       } while (isdigit(NextChar));
 233
 234       if (NextChar == 'x' || NextChar == 'b') {
 235         // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
 236         // likely a number.
 237         int NextNextChar = peekNextChar(i);
 238         switch (NextNextChar) {
 239         default:
 240           break;
 241         case '0': case '1':
 242           if (NextChar == 'b')
 243             return LexNumber();
 244           LLVM_FALLTHROUGH;
 245         case '2': case '3': case '4': case '5':
 246         case '6': case '7': case '8': case '9':
 247         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 248         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 249           if (NextChar == 'x')
 250             return LexNumber();
 251           break;
 252         }
 253       }
 254     }
 255
 256     if (isalpha(NextChar) || NextChar == '_')
 257       return LexIdentifier();
 258
 259     return LexNumber();
 260   }
 261   case '"': return LexString();
 262   case '$': return LexVarName();
 263   case '[': return LexBracket();
 264   case '!': return LexExclaim();
 265   }
 266 }
 267
 268 /// LexString - Lex "[^"]*"
 269 tgtok::TokKind TGLexer::LexString() {
 270   const char *StrStart = CurPtr;
 271
 272   CurStrVal = "";
 273
 274   while (*CurPtr != '"') {
 275     // If we hit the end of the buffer, report an error.
 276     if (*CurPtr == 0 && CurPtr == CurBuf.end())
 277       return ReturnError(StrStart, "End of file in string literal");
 278
 279     if (*CurPtr == '\n' || *CurPtr == '\r')
 280       return ReturnError(StrStart, "End of line in string literal");
 281
 282     if (*CurPtr != '\\') {
 283       CurStrVal += *CurPtr++;
 284       continue;
 285     }
 286
 287     ++CurPtr;
 288
 289     switch (*CurPtr) {
 290     case '\\': case '\'': case '"':
 291       // These turn into their literal character.
 292       CurStrVal += *CurPtr++;
 293       break;
 294     case 't':
 295       CurStrVal += '\t';
 296       ++CurPtr;
 297       break;
 298     case 'n':
 299       CurStrVal += '\n';
 300       ++CurPtr;
 301       break;
 302
 303     case '\n':
 304     case '\r':
 305       return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
 306
 307     // If we hit the end of the buffer, report an error.
 308     case '\0':
 309       if (CurPtr == CurBuf.end())
 310         return ReturnError(StrStart, "End of file in string literal");
 311       LLVM_FALLTHROUGH;
 312     default:
 313       return ReturnError(CurPtr, "invalid escape in string literal");
 314     }
 315   }
 316
 317   ++CurPtr;
 318   return tgtok::StrVal;
 319 }
 320
 321 tgtok::TokKind TGLexer::LexVarName() {
 322   if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
 323     return ReturnError(TokStart, "Invalid variable name");
 324
 325   // Otherwise, we're ok, consume the rest of the characters.
 326   const char *VarNameStart = CurPtr++;
 327
 328   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
 329     ++CurPtr;
 330
 331   CurStrVal.assign(VarNameStart, CurPtr);
 332   return tgtok::VarName;
 333 }
 334
 335 tgtok::TokKind TGLexer::LexIdentifier() {
 336   // The first letter is [a-zA-Z_].
 337   const char *IdentStart = TokStart;
 338
 339   // Match the rest of the identifier regex: [0-9a-zA-Z_]*
 340   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
 341     ++CurPtr;
 342
 343   // Check to see if this identifier is a reserved keyword.
 344   StringRef Str(IdentStart, CurPtr-IdentStart);
 345
 346   tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
 347     .Case("int", tgtok::Int)
 348     .Case("bit", tgtok::Bit)
 349     .Case("bits", tgtok::Bits)
 350     .Case("string", tgtok::String)
 351     .Case("list", tgtok::List)
 352     .Case("code", tgtok::Code)
 353     .Case("dag", tgtok::Dag)
 354     .Case("class", tgtok::Class)
 355     .Case("def", tgtok::Def)
 356     .Case("true", tgtok::TrueVal)
 357     .Case("false", tgtok::FalseVal)
 358     .Case("foreach", tgtok::Foreach)
 359     .Case("defm", tgtok::Defm)
 360     .Case("defset", tgtok::Defset)
 361     .Case("multiclass", tgtok::MultiClass)
 362     .Case("field", tgtok::Field)
 363     .Case("let", tgtok::Let)
 364     .Case("in", tgtok::In)
 365     .Case("defvar", tgtok::Defvar)
 366     .Case("include", tgtok::Include)
 367     .Case("if", tgtok::If)
 368     .Case("then", tgtok::Then)
 369     .Case("else", tgtok::ElseKW)
 370     .Case("assert", tgtok::Assert)
 371     .Default(tgtok::Id);
 372
 373   // A couple of tokens require special processing.
 374   switch (Kind) {
 375     case tgtok::Include:
 376       if (LexInclude()) return tgtok::Error;
 377       return Lex();
 378     case tgtok::Id:
 379       CurStrVal.assign(Str.begin(), Str.end());
 380       break;
 381     default:
 382       break;
 383   }
 384
 385   return Kind;
 386 }
 387
 388 /// LexInclude - We just read the "include" token.  Get the string token that
 389 /// comes next and enter the include.
 390 bool TGLexer::LexInclude() {
 391   // The token after the include must be a string.
 392   tgtok::TokKind Tok = LexToken();
 393   if (Tok == tgtok::Error) return true;
 394   if (Tok != tgtok::StrVal) {
 395     PrintError(getLoc(), "Expected filename after include");
 396     return true;
 397   }
 398
 399   // Get the string.
 400   std::string Filename = CurStrVal;
 401   std::string IncludedFile;
 402
 403   CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
 404                                     IncludedFile);
 405   if (!CurBuffer) {
 406     PrintError(getLoc(), "Could not find include file '" + Filename + "'");
 407     return true;
 408   }
 409
 410   Dependencies.insert(IncludedFile);
 411   // Save the line number and lex buffer of the includer.
 412   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
 413   CurPtr = CurBuf.begin();
 414
 415   PrepIncludeStack.push_back(
 416       std::make_unique<std::vector<PreprocessorControlDesc>>());
 417   return false;
 418 }
 419
 420 /// SkipBCPLComment - Skip over the comment by finding the next CR or LF.
 421 /// Or we may end up at the end of the buffer.
 422 void TGLexer::SkipBCPLComment() {
 423   ++CurPtr;  // skip the second slash.
 424   auto EOLPos = CurBuf.find_first_of("\r\n", CurPtr - CurBuf.data());
 425   CurPtr = (EOLPos == StringRef::npos) ? CurBuf.end() : CurBuf.data() + EOLPos;
 426 }
 427
 428 /// SkipCComment - This skips C-style /**/ comments.  The only difference from C
 429 /// is that we allow nesting.
 430 bool TGLexer::SkipCComment() {
 431   ++CurPtr;  // skip the star.
 432   unsigned CommentDepth = 1;
 433
 434   while (true) {
 435     int CurChar = getNextChar();
 436     switch (CurChar) {
 437     case EOF:
 438       PrintError(TokStart, "Unterminated comment!");
 439       return true;
 440     case '*':
 441       // End of the comment?
 442       if (CurPtr[0] != '/') break;
 443
 444       ++CurPtr;   // End the */.
 445       if (--CommentDepth == 0)
 446         return false;
 447       break;
 448     case '/':
 449       // Start of a nested comment?
 450       if (CurPtr[0] != '*') break;
 451       ++CurPtr;
 452       ++CommentDepth;
 453       break;
 454     }
 455   }
 456 }
 457
 458 /// LexNumber - Lex:
 459 ///    [-+]?[0-9]+
 460 ///    0x[0-9a-fA-F]+
 461 ///    0b[01]+
 462 tgtok::TokKind TGLexer::LexNumber() {
 463   if (CurPtr[-1] == '0') {
 464     if (CurPtr[0] == 'x') {
 465       ++CurPtr;
 466       const char *NumStart = CurPtr;
 467       while (isxdigit(CurPtr[0]))
 468         ++CurPtr;
 469
 470       // Requires at least one hex digit.
 471       if (CurPtr == NumStart)
 472         return ReturnError(TokStart, "Invalid hexadecimal number");
 473
 474       errno = 0;
 475       CurIntVal = strtoll(NumStart, nullptr, 16);
 476       if (errno == EINVAL)
 477         return ReturnError(TokStart, "Invalid hexadecimal number");
 478       if (errno == ERANGE) {
 479         errno = 0;
 480         CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16);
 481         if (errno == EINVAL)
 482           return ReturnError(TokStart, "Invalid hexadecimal number");
 483         if (errno == ERANGE)
 484           return ReturnError(TokStart, "Hexadecimal number out of range");
 485       }
 486       return tgtok::IntVal;
 487     } else if (CurPtr[0] == 'b') {
 488       ++CurPtr;
 489       const char *NumStart = CurPtr;
 490       while (CurPtr[0] == '0' || CurPtr[0] == '1')
 491         ++CurPtr;
 492
 493       // Requires at least one binary digit.
 494       if (CurPtr == NumStart)
 495         return ReturnError(CurPtr-2, "Invalid binary number");
 496       CurIntVal = strtoll(NumStart, nullptr, 2);
 497       return tgtok::BinaryIntVal;
 498     }
 499   }
 500
 501   // Check for a sign without a digit.
 502   if (!isdigit(CurPtr[0])) {
 503     if (CurPtr[-1] == '-')
 504       return tgtok::minus;
 505     else if (CurPtr[-1] == '+')
 506       return tgtok::plus;
 507   }
 508
 509   while (isdigit(CurPtr[0]))
 510     ++CurPtr;
 511   CurIntVal = strtoll(TokStart, nullptr, 10);
 512   return tgtok::IntVal;
 513 }
 514
 515 /// LexBracket - We just read '['.  If this is a code block, return it,
 516 /// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
 517 tgtok::TokKind TGLexer::LexBracket() {
 518   if (CurPtr[0] != '{')
 519     return tgtok::l_square;
 520   ++CurPtr;
 521   const char *CodeStart = CurPtr;
 522   while (true) {
 523     int Char = getNextChar();
 524     if (Char == EOF) break;
 525
 526     if (Char != '}') continue;
 527
 528     Char = getNextChar();
 529     if (Char == EOF) break;
 530     if (Char == ']') {
 531       CurStrVal.assign(CodeStart, CurPtr-2);
 532       return tgtok::CodeFragment;
 533     }
 534   }
 535
 536   return ReturnError(CodeStart - 2, "Unterminated code block");
 537 }
 538
 539 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
 540 tgtok::TokKind TGLexer::LexExclaim() {
 541   if (!isalpha(*CurPtr))
 542     return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
 543
 544   const char *Start = CurPtr++;
 545   while (isalpha(*CurPtr))
 546     ++CurPtr;
 547
 548   // Check to see which operator this is.
 549   tgtok::TokKind Kind =
 550     StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
 551     .Case("eq", tgtok::XEq)
 552     .Case("ne", tgtok::XNe)
 553     .Case("le", tgtok::XLe)
 554     .Case("lt", tgtok::XLt)
 555     .Case("ge", tgtok::XGe)
 556     .Case("gt", tgtok::XGt)
 557     .Case("if", tgtok::XIf)
 558     .Case("cond", tgtok::XCond)
 559     .Case("isa", tgtok::XIsA)
 560     .Case("head", tgtok::XHead)
 561     .Case("tail", tgtok::XTail)
 562     .Case("size", tgtok::XSize)
 563     .Case("con", tgtok::XConcat)
 564     .Case("dag", tgtok::XDag)
 565     .Case("add", tgtok::XADD)
 566     .Case("sub", tgtok::XSUB)
 567     .Case("mul", tgtok::XMUL)
 568     .Case("not", tgtok::XNOT)
 569     .Case("and", tgtok::XAND)
 570     .Case("or", tgtok::XOR)
 571     .Case("xor", tgtok::XXOR)
 572     .Case("shl", tgtok::XSHL)
 573     .Case("sra", tgtok::XSRA)
 574     .Case("srl", tgtok::XSRL)
 575     .Case("cast", tgtok::XCast)
 576     .Case("empty", tgtok::XEmpty)
 577     .Case("subst", tgtok::XSubst)
 578     .Case("foldl", tgtok::XFoldl)
 579     .Case("foreach", tgtok::XForEach)
 580     .Case("filter", tgtok::XFilter)
 581     .Case("listconcat", tgtok::XListConcat)
 582     .Case("listsplat", tgtok::XListSplat)
 583     .Case("strconcat", tgtok::XStrConcat)
 584     .Case("interleave", tgtok::XInterleave)
 585     .Case("substr", tgtok::XSubstr)
 586     .Case("find", tgtok::XFind)
 587     .Cases("setdagop", "setop", tgtok::XSetDagOp) // !setop is deprecated.
 588     .Cases("getdagop", "getop", tgtok::XGetDagOp) // !getop is deprecated.
 589     .Default(tgtok::Error);
 590
 591   return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
 592 }
 593
 594 bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
 595   // Report an error, if preprocessor control stack for the current
 596   // file is not empty.
 597   if (!PrepIncludeStack.back()->empty()) {
 598     prepReportPreprocessorStackError();
 599
 600     return false;
 601   }
 602
 603   // Pop the preprocessing controls from the include stack.
 604   if (PrepIncludeStack.empty()) {
 605     PrintFatalError("Preprocessor include stack is empty");
 606   }
 607
 608   PrepIncludeStack.pop_back();
 609
 610   if (IncludeStackMustBeEmpty) {
 611     if (!PrepIncludeStack.empty())
 612       PrintFatalError("Preprocessor include stack is not empty");
 613   } else {
 614     if (PrepIncludeStack.empty())
 615       PrintFatalError("Preprocessor include stack is empty");
 616   }
 617
 618   return true;
 619 }
 620
 621 tgtok::TokKind TGLexer::prepIsDirective() const {
 622   for (const auto &PD : PreprocessorDirs) {
 623     int NextChar = *CurPtr;
 624     bool Match = true;
 625     unsigned I = 0;
 626     for (; I < strlen(PD.Word); ++I) {
 627       if (NextChar != PD.Word[I]) {
 628         Match = false;
 629         break;
 630       }
 631
 632       NextChar = peekNextChar(I + 1);
 633     }
 634
 635     // Check for whitespace after the directive.  If there is no whitespace,
 636     // then we do not recognize it as a preprocessing directive.
 637     if (Match) {
 638       tgtok::TokKind Kind = PD.Kind;
 639
 640       // New line and EOF may follow only #else/#endif.  It will be reported
 641       // as an error for #ifdef/#define after the call to prepLexMacroName().
 642       if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF ||
 643           NextChar == '\n' ||
 644           // It looks like TableGen does not support '\r' as the actual
 645           // carriage return, e.g. getNextChar() treats a single '\r'
 646           // as '\n'.  So we do the same here.
 647           NextChar == '\r')
 648         return Kind;
 649
 650       // Allow comments after some directives, e.g.:
 651       //     #else// OR #else/**/
 652       //     #endif// OR #endif/**/
 653       //
 654       // Note that we do allow comments after #ifdef/#define here, e.g.
 655       //     #ifdef/**/ AND #ifdef//
 656       //     #define/**/ AND #define//
 657       //
 658       // These cases will be reported as incorrect after calling
 659       // prepLexMacroName().  We could have supported C-style comments
 660       // after #ifdef/#define, but this would complicate the code
 661       // for little benefit.
 662       if (NextChar == '/') {
 663         NextChar = peekNextChar(I + 1);
 664
 665         if (NextChar == '*' || NextChar == '/')
 666           return Kind;
 667
 668         // Pretend that we do not recognize the directive.
 669       }
 670     }
 671   }
 672
 673   return tgtok::Error;
 674 }
 675
 676 bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
 677   TokStart = CurPtr;
 678
 679   for (const auto &PD : PreprocessorDirs)
 680     if (PD.Kind == Kind) {
 681       // Advance CurPtr to the end of the preprocessing word.
 682       CurPtr += strlen(PD.Word);
 683       return true;
 684     }
 685
 686   PrintFatalError("Unsupported preprocessing token in "
 687                   "prepEatPreprocessorDirective()");
 688   return false;
 689 }
 690
 691 tgtok::TokKind TGLexer::lexPreprocessor(
 692     tgtok::TokKind Kind, bool ReturnNextLiveToken) {
 693
 694   // We must be looking at a preprocessing directive.  Eat it!
 695   if (!prepEatPreprocessorDirective(Kind))
 696     PrintFatalError("lexPreprocessor() called for unknown "
 697                     "preprocessor directive");
 698
 699   if (Kind == tgtok::Ifdef || Kind == tgtok::Ifndef) {
 700     StringRef MacroName = prepLexMacroName();
 701     StringRef IfTokName = Kind == tgtok::Ifdef ? "#ifdef" : "#ifndef";
 702     if (MacroName.empty())
 703       return ReturnError(TokStart, "Expected macro name after " + IfTokName);
 704
 705     bool MacroIsDefined = DefinedMacros.count(MacroName) != 0;
 706
 707     // Canonicalize ifndef to ifdef equivalent
 708     if (Kind == tgtok::Ifndef) {
 709       MacroIsDefined = !MacroIsDefined;
 710       Kind = tgtok::Ifdef;
 711     }
 712
 713     // Regardless of whether we are processing tokens or not,
 714     // we put the #ifdef control on stack.
 715     PrepIncludeStack.back()->push_back(
 716         {Kind, MacroIsDefined, SMLoc::getFromPointer(TokStart)});
 717
 718     if (!prepSkipDirectiveEnd())
 719       return ReturnError(CurPtr, "Only comments are supported after " +
 720                                      IfTokName + " NAME");
 721
 722     // If we were not processing tokens before this #ifdef,
 723     // then just return back to the lines skipping code.
 724     if (!ReturnNextLiveToken)
 725       return Kind;
 726
 727     // If we were processing tokens before this #ifdef,
 728     // and the macro is defined, then just return the next token.
 729     if (MacroIsDefined)
 730       return LexToken();
 731
 732     // We were processing tokens before this #ifdef, and the macro
 733     // is not defined, so we have to start skipping the lines.
 734     // If the skipping is successful, it will return the token following
 735     // either #else or #endif corresponding to this #ifdef.
 736     if (prepSkipRegion(ReturnNextLiveToken))
 737       return LexToken();
 738
 739     return tgtok::Error;
 740   } else if (Kind == tgtok::Else) {
 741     // Check if this #else is correct before calling prepSkipDirectiveEnd(),
 742     // which will move CurPtr away from the beginning of #else.
 743     if (PrepIncludeStack.back()->empty())
 744       return ReturnError(TokStart, "#else without #ifdef or #ifndef");
 745
 746     PreprocessorControlDesc IfdefEntry = PrepIncludeStack.back()->back();
 747
 748     if (IfdefEntry.Kind != tgtok::Ifdef) {
 749       PrintError(TokStart, "double #else");
 750       return ReturnError(IfdefEntry.SrcPos, "Previous #else is here");
 751     }
 752
 753     // Replace the corresponding #ifdef's control with its negation
 754     // on the control stack.
 755     PrepIncludeStack.back()->pop_back();
 756     PrepIncludeStack.back()->push_back(
 757         {Kind, !IfdefEntry.IsDefined, SMLoc::getFromPointer(TokStart)});
 758
 759     if (!prepSkipDirectiveEnd())
 760       return ReturnError(CurPtr, "Only comments are supported after #else");
 761
 762     // If we were processing tokens before this #else,
 763     // we have to start skipping lines until the matching #endif.
 764     if (ReturnNextLiveToken) {
 765       if (prepSkipRegion(ReturnNextLiveToken))
 766         return LexToken();
 767
 768       return tgtok::Error;
 769     }
 770
 771     // Return to the lines skipping code.
 772     return Kind;
 773   } else if (Kind == tgtok::Endif) {
 774     // Check if this #endif is correct before calling prepSkipDirectiveEnd(),
 775     // which will move CurPtr away from the beginning of #endif.
 776     if (PrepIncludeStack.back()->empty())
 777       return ReturnError(TokStart, "#endif without #ifdef");
 778
 779     auto &IfdefOrElseEntry = PrepIncludeStack.back()->back();
 780
 781     if (IfdefOrElseEntry.Kind != tgtok::Ifdef &&
 782         IfdefOrElseEntry.Kind != tgtok::Else) {
 783       PrintFatalError("Invalid preprocessor control on the stack");
 784       return tgtok::Error;
 785     }
 786
 787     if (!prepSkipDirectiveEnd())
 788       return ReturnError(CurPtr, "Only comments are supported after #endif");
 789
 790     PrepIncludeStack.back()->pop_back();
 791
 792     // If we were processing tokens before this #endif, then
 793     // we should continue it.
 794     if (ReturnNextLiveToken) {
 795       return LexToken();
 796     }
 797
 798     // Return to the lines skipping code.
 799     return Kind;
 800   } else if (Kind == tgtok::Define) {
 801     StringRef MacroName = prepLexMacroName();
 802     if (MacroName.empty())
 803       return ReturnError(TokStart, "Expected macro name after #define");
 804
 805     if (!DefinedMacros.insert(MacroName).second)
 806       PrintWarning(getLoc(),
 807                    "Duplicate definition of macro: " + Twine(MacroName));
 808
 809     if (!prepSkipDirectiveEnd())
 810       return ReturnError(CurPtr,
 811                          "Only comments are supported after #define NAME");
 812
 813     if (!ReturnNextLiveToken) {
 814       PrintFatalError("#define must be ignored during the lines skipping");
 815       return tgtok::Error;
 816     }
 817
 818     return LexToken();
 819   }
 820
 821   PrintFatalError("Preprocessing directive is not supported");
 822   return tgtok::Error;
 823 }
 824
 825 bool TGLexer::prepSkipRegion(bool MustNeverBeFalse) {
 826   if (!MustNeverBeFalse)
 827     PrintFatalError("Invalid recursion.");
 828
 829   do {
 830     // Skip all symbols to the line end.
 831     prepSkipToLineEnd();
 832
 833     // Find the first non-whitespace symbol in the next line(s).
 834     if (!prepSkipLineBegin())
 835       return false;
 836
 837     // If the first non-blank/comment symbol on the line is '#',
 838     // it may be a start of preprocessing directive.
 839     //
 840     // If it is not '#' just go to the next line.
 841     if (*CurPtr == '#')
 842       ++CurPtr;
 843     else
 844       continue;
 845
 846     tgtok::TokKind Kind = prepIsDirective();
 847
 848     // If we did not find a preprocessing directive or it is #define,
 849     // then just skip to the next line.  We do not have to do anything
 850     // for #define in the line-skipping mode.
 851     if (Kind == tgtok::Error || Kind == tgtok::Define)
 852       continue;
 853
 854     tgtok::TokKind ProcessedKind = lexPreprocessor(Kind, false);
 855
 856     // If lexPreprocessor() encountered an error during lexing this
 857     // preprocessor idiom, then return false to the calling lexPreprocessor().
 858     // This will force tgtok::Error to be returned to the tokens processing.
 859     if (ProcessedKind == tgtok::Error)
 860       return false;
 861
 862     if (Kind != ProcessedKind)
 863       PrintFatalError("prepIsDirective() and lexPreprocessor() "
 864                       "returned different token kinds");
 865
 866     // If this preprocessing directive enables tokens processing,
 867     // then return to the lexPreprocessor() and get to the next token.
 868     // We can move from line-skipping mode to processing tokens only
 869     // due to #else or #endif.
 870     if (prepIsProcessingEnabled()) {
 871       if (Kind != tgtok::Else && Kind != tgtok::Endif) {
 872         PrintFatalError("Tokens processing was enabled by an unexpected "
 873                         "preprocessing directive");
 874         return false;
 875       }
 876
 877       return true;
 878     }
 879   } while (CurPtr != CurBuf.end());
 880
 881   // We have reached the end of the file, but never left the lines-skipping
 882   // mode.  This means there is no matching #endif.
 883   prepReportPreprocessorStackError();
 884   return false;
 885 }
 886
 887 StringRef TGLexer::prepLexMacroName() {
 888   // Skip whitespaces between the preprocessing directive and the macro name.
 889   while (*CurPtr == ' ' || *CurPtr == '\t')
 890     ++CurPtr;
 891
 892   TokStart = CurPtr;
 893   // Macro names start with [a-zA-Z_].
 894   if (*CurPtr != '_' && !isalpha(*CurPtr))
 895     return "";
 896
 897   // Match the rest of the identifier regex: [0-9a-zA-Z_]*
 898   while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
 899     ++CurPtr;
 900
 901   return StringRef(TokStart, CurPtr - TokStart);
 902 }
 903
 904 bool TGLexer::prepSkipLineBegin() {
 905   while (CurPtr != CurBuf.end()) {
 906     switch (*CurPtr) {
 907     case ' ':
 908     case '\t':
 909     case '\n':
 910     case '\r':
 911       break;
 912
 913     case '/': {
 914       int NextChar = peekNextChar(1);
 915       if (NextChar == '*') {
 916         // Skip C-style comment.
 917         // Note that we do not care about skipping the C++-style comments.
 918         // If the line contains "//", it may not contain any processable
 919         // preprocessing directive.  Just return CurPtr pointing to
 920         // the first '/' in this case.  We also do not care about
 921         // incorrect symbols after the first '/' - we are in lines-skipping
 922         // mode, so incorrect code is allowed to some extent.
 923
 924         // Set TokStart to the beginning of the comment to enable proper
 925         // diagnostic printing in case of error in SkipCComment().
 926         TokStart = CurPtr;
 927
 928         // CurPtr must point to '*' before call to SkipCComment().
 929         ++CurPtr;
 930         if (SkipCComment())
 931           return false;
 932       } else {
 933         // CurPtr points to the non-whitespace '/'.
 934         return true;
 935       }
 936
 937       // We must not increment CurPtr after the comment was lexed.
 938       continue;
 939     }
 940
 941     default:
 942       return true;
 943     }
 944
 945     ++CurPtr;
 946   }
 947
 948   // We have reached the end of the file.  Return to the lines skipping
 949   // code, and allow it to handle the EOF as needed.
 950   return true;
 951 }
 952
 953 bool TGLexer::prepSkipDirectiveEnd() {
 954   while (CurPtr != CurBuf.end()) {
 955     switch (*CurPtr) {
 956     case ' ':
 957     case '\t':
 958       break;
 959
 960     case '\n':
 961     case '\r':
 962       return true;
 963
 964     case '/': {
 965       int NextChar = peekNextChar(1);
 966       if (NextChar == '/') {
 967         // Skip C++-style comment.
 968         // We may just return true now, but let's skip to the line/buffer end
 969         // to simplify the method specification.
 970         ++CurPtr;
 971         SkipBCPLComment();
 972       } else if (NextChar == '*') {
 973         // When we are skipping C-style comment at the end of a preprocessing
 974         // directive, we can skip several lines.  If any meaningful TD token
 975         // follows the end of the C-style comment on the same line, it will
 976         // be considered as an invalid usage of TD token.
 977         // For example, we want to forbid usages like this one:
 978         //     #define MACRO class Class {}
 979         // But with C-style comments we also disallow the following:
 980         //     #define MACRO /* This macro is used
 981         //                      to ... */ class Class {}
 982         // One can argue that this should be allowed, but it does not seem
 983         // to be worth of the complication.  Moreover, this matches
 984         // the C preprocessor behavior.
 985
 986         // Set TokStart to the beginning of the comment to enable proper
 987         // diagnostic printer in case of error in SkipCComment().
 988         TokStart = CurPtr;
 989         ++CurPtr;
 990         if (SkipCComment())
 991           return false;
 992       } else {
 993         TokStart = CurPtr;
 994         PrintError(CurPtr, "Unexpected character");
 995         return false;
 996       }
 997
 998       // We must not increment CurPtr after the comment was lexed.
 999       continue;
1000     }
1001
1002     default:
1003       // Do not allow any non-whitespaces after the directive.
1004       TokStart = CurPtr;
1005       return false;
1006     }
1007
1008     ++CurPtr;
1009   }
1010
1011   return true;
1012 }
1013
1014 void TGLexer::prepSkipToLineEnd() {
1015   while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end())
1016     ++CurPtr;
1017 }
1018
1019 bool TGLexer::prepIsProcessingEnabled() {
1020   for (auto I = PrepIncludeStack.back()->rbegin(),
1021             E = PrepIncludeStack.back()->rend();
1022        I != E; ++I) {
1023     if (!I->IsDefined)
1024       return false;
1025   }
1026
1027   return true;
1028 }
1029
1030 void TGLexer::prepReportPreprocessorStackError() {
1031   if (PrepIncludeStack.back()->empty())
1032     PrintFatalError("prepReportPreprocessorStackError() called with "
1033                     "empty control stack");
1034
1035   auto &PrepControl = PrepIncludeStack.back()->back();
1036   PrintError(CurBuf.end(), "Reached EOF without matching #endif");
1037   PrintError(PrepControl.SrcPos, "The latest preprocessor control is here");
1038
1039   TokStart = CurPtr;
1040 }