clang/lib/Lex/DependencyDirectivesScanner.cpp

   1 //===- DependencyDirectivesScanner.cpp ------------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 ///
   9 /// \file
  10 /// This is the interface for scanning header and source files to get the
  11 /// minimum necessary preprocessor directives for evaluating includes. It
  12 /// reduces the source down to #define, #include, #import, @import, and any
  13 /// conditional preprocessor logic that contains one of those.
  14 ///
  15 //===----------------------------------------------------------------------===//
  16
  17 #include "clang/Lex/DependencyDirectivesScanner.h"
  18 #include "clang/Basic/CharInfo.h"
  19 #include "clang/Basic/Diagnostic.h"
  20 #include "clang/Lex/LexDiagnostic.h"
  21 #include "clang/Lex/Lexer.h"
  22 #include "clang/Lex/Pragma.h"
  23 #include "llvm/ADT/ScopeExit.h"
  24 #include "llvm/ADT/SmallString.h"
  25 #include "llvm/ADT/StringMap.h"
  26 #include "llvm/ADT/StringSwitch.h"
  27 #include <optional>
  28
  29 using namespace clang;
  30 using namespace clang::dependency_directives_scan;
  31 using namespace llvm;
  32
  33 namespace {
  34
/// A lexed directive together with the number of tokens that belong to it.
struct DirectiveWithTokens {
  /// The kind of directive that was recognized.
  DirectiveKind Kind;
  /// How many tokens this directive contributed to the scanner's token vector.
  unsigned NumTokens;

  DirectiveWithTokens(DirectiveKind Kind, unsigned NumTokens)
      : Kind(Kind), NumTokens(NumTokens) {}
};
  42
  43 /// Does an efficient "scan" of the sources to detect the presence of
  44 /// preprocessor (or module import) directives and collects the raw lexed tokens
  45 /// for those directives so that the \p Lexer can "replay" them when the file is
  46 /// included.
  47 ///
  48 /// Note that the behavior of the raw lexer is affected by the language mode,
  49 /// while at this point we want to do a scan and collect tokens once,
  50 /// irrespective of the language mode that the file will get included in. To
  51 /// compensate for that the \p Lexer, while "replaying", will adjust a token
  52 /// where appropriate, when it could affect the preprocessor's state.
  53 /// For example in a directive like
  54 ///
  55 /// \code
  56 ///   #if __has_cpp_attribute(clang::fallthrough)
  57 /// \endcode
  58 ///
  59 /// The preprocessor needs to see '::' as 'tok::coloncolon' instead of 2
  60 /// 'tok::colon'. The \p Lexer will adjust if it sees consecutive 'tok::colon'
  61 /// while in C++ mode.
struct Scanner {
  /// Creates a scanner over \p Input.
  ///
  /// \param Input          The source text to scan.
  /// \param Tokens         Output vector receiving the tokens of every
  ///                       committed directive.
  /// \param Diags          Optional diagnostics engine; may be null, in which
  ///                       case errors are not reported (see \c reportError).
  /// \param InputSourceLoc Location of the start of \p Input, used as the base
  ///                       for diagnostic locations.
  Scanner(StringRef Input,
          SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
          DiagnosticsEngine *Diags, SourceLocation InputSourceLoc)
      : Input(Input), Tokens(Tokens), Diags(Diags),
        InputSourceLoc(InputSourceLoc), LangOpts(getLangOptsForDepScanning()),
        TheLexer(InputSourceLoc, LangOpts, Input.begin(), Input.begin(),
                 Input.end()) {}

  /// Builds the language options the raw lexer is run with during scanning.
  static LangOptions getLangOptsForDepScanning() {
    LangOptions LangOpts;
    // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
    LangOpts.ObjC = true;
    LangOpts.LineComment = true;
    // FIXME: we do not enable C11 or C++11, so we are missing u/u8/U"" and
    // R"()" literals.
    return LangOpts;
  }

  /// Lex the provided source and emit the directive tokens.
  ///
  /// \returns True on error.
  bool scan(SmallVectorImpl<Directive> &Directives);

private:
  /// Lexes next token and advances \p First and the \p Lexer.
  [[nodiscard]] dependency_directives_scan::Token &
  lexToken(const char *&First, const char *const End);

  /// Like \c lexToken, but lexes the next token with the lexer's
  /// include-filename mode. The token is appended to \p CurDirToks.
  dependency_directives_scan::Token &lexIncludeFilename(const char *&First,
                                                        const char *const End);

  /// Advances \p First past the end of the current line, stepping over string
  /// literals and comments and following line continuations.
  void skipLine(const char *&First, const char *const End);
  /// Skips the rest of the directive called \p Name. "warning" and "error" are
  /// skipped without interpreting quotes or comments; everything else goes
  /// through \c skipLine.
  void skipDirective(StringRef Name, const char *&First, const char *const End);

  /// Returns the spelling of a string literal or identifier after performing
  /// any processing needed to handle \c clang::Token::NeedsCleaning.
  StringRef cleanStringIfNeeded(const dependency_directives_scan::Token &Tok);

  /// Lexes next token and if it is identifier returns its string, otherwise
  /// it skips the current line and returns \p std::nullopt.
  ///
  /// In any case (whatever the token kind) \p First and the \p Lexer will
  /// advance beyond the token.
  [[nodiscard]] std::optional<StringRef>
  tryLexIdentifierOrSkipLine(const char *&First, const char *const End);

  /// Used when it is certain that next token is an identifier.
  [[nodiscard]] StringRef lexIdentifier(const char *&First,
                                        const char *const End);

  /// Lexes next token and returns true iff it is an identifier that matches \p
  /// Id, otherwise it skips the current line and returns false.
  ///
  /// In any case (whatever the token kind) \p First and the \p Lexer will
  /// advance beyond the token.
  [[nodiscard]] bool isNextIdentifierOrSkipLine(StringRef Id,
                                                const char *&First,
                                                const char *const End);

  /// Lexes next token and returns true iff it matches the kind \p K.
  /// Otherwise it skips the current line and returns false.
  ///
  /// In any case (whatever the token kind) \p First and the \p Lexer will
  /// advance beyond the token.
  [[nodiscard]] bool isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
                                           const char *const End);

  /// Lexes next token and if it is string literal, returns its string.
  /// Otherwise, it skips the current line and returns \p std::nullopt.
  ///
  /// In any case (whatever the token kind) \p First and the \p Lexer will
  /// advance beyond the token.
  [[nodiscard]] std::optional<StringRef>
  tryLexStringLiteralOrSkipLine(const char *&First, const char *const End);

  // The lexing entry points below all return true on error and false
  // otherwise.

  /// Scans the whole range [First, End) line by line.
  [[nodiscard]] bool scanImpl(const char *First, const char *const End);
  /// Processes the single (logical) line starting at \p First.
  [[nodiscard]] bool lexPPLine(const char *&First, const char *const End);
  /// Handles a line starting with '@' (looking for "@import").
  [[nodiscard]] bool lexAt(const char *&First, const char *const End);
  /// Handles a line that may be a C++ module/import declaration.
  [[nodiscard]] bool lexModule(const char *&First, const char *const End);
  [[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First,
                               const char *const End);
  /// Handles the part of a pragma that follows the "pragma" keyword.
  [[nodiscard]] bool lexPragma(const char *&First, const char *const End);
  /// Handles the operand of a "_Pragma" operator.
  [[nodiscard]] bool lex_Pragma(const char *&First, const char *const End);
  /// Handles "#endif", dropping empty "#else" / "#ifdef" / "#ifndef" blocks.
  [[nodiscard]] bool lexEndif(const char *&First, const char *const End);
  /// Lexes the rest of the directive and commits it with kind \p Kind.
  [[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First,
                                const char *const End);
  /// Lexes tokens up to the terminating ';' of a module/import declaration
  /// and commits them with kind \p Kind.
  [[nodiscard]] bool lexModuleDirectiveBody(DirectiveKind Kind,
                                            const char *&First,
                                            const char *const End);
  /// Lexes tokens into \p CurDirToks until the end-of-directive token.
  void lexPPDirectiveBody(const char *&First, const char *const End);

  /// Commits the tokens collected in \p CurDirToks as a directive of kind
  /// \p Kind: they are appended to \p Tokens, the directive is recorded in
  /// \p DirsWithToks, and \p CurDirToks is cleared.
  DirectiveWithTokens &pushDirective(DirectiveKind Kind) {
    Tokens.append(CurDirToks);
    DirsWithToks.emplace_back(Kind, CurDirToks.size());
    CurDirToks.clear();
    return DirsWithToks.back();
  }
  /// Removes the most recently committed directive and its tokens.
  void popDirective() {
    Tokens.pop_back_n(DirsWithToks.pop_back_val().NumTokens);
  }
  /// Returns the kind of the most recently committed directive, or \c pp_none
  /// if none has been committed.
  DirectiveKind topDirective() const {
    return DirsWithToks.empty() ? pp_none : DirsWithToks.back().Kind;
  }

  /// Returns the offset of \p CurPtr from the start of \p Input.
  unsigned getOffsetAt(const char *CurPtr) const {
    return CurPtr - Input.data();
  }

  /// Reports a diagnostic if the diagnostic engine is provided. Always returns
  /// true at the end.
  bool reportError(const char *CurPtr, unsigned Err);

  /// Holds the cleaned spellings produced by \c cleanStringIfNeeded; the
  /// returned strings refer to the keys stored here.
  StringMap<char> SplitIds;
  /// The source text being scanned.
  StringRef Input;
  /// Output: tokens of all committed directives.
  SmallVectorImpl<dependency_directives_scan::Token> &Tokens;
  /// Optional diagnostics engine (may be null).
  DiagnosticsEngine *Diags;
  /// Location corresponding to the start of \p Input.
  SourceLocation InputSourceLoc;

  /// Position of the last piece of source seen by \c skipLine / \c lexPPLine;
  /// \c scan compares it against the last committed token to decide whether
  /// to emit \c tokens_present_before_eof.
  const char *LastTokenPtr = nullptr;
  /// Keeps track of the tokens for the currently lexed directive. Once a
  /// directive is fully lexed and "committed" then the tokens get appended to
  /// \p Tokens and \p CurDirToks is cleared for the next directive.
  SmallVector<dependency_directives_scan::Token, 32> CurDirToks;
  /// The directives that were lexed along with the number of tokens that each
  /// directive contains. The tokens of all the directives are kept in \p Tokens
  /// vector, in the same order as the directives order in \p DirsWithToks.
  SmallVector<DirectiveWithTokens, 64> DirsWithToks;
  /// Language options the raw lexer runs with; declared before \p TheLexer,
  /// which is constructed from it.
  LangOptions LangOpts;
  /// The raw lexer producing the tokens.
  Lexer TheLexer;
};
 193
 194 } // end anonymous namespace
 195
 196 bool Scanner::reportError(const char *CurPtr, unsigned Err) {
 197   if (!Diags)
 198     return true;
 199   assert(CurPtr >= Input.data() && "invalid buffer ptr");
 200   Diags->Report(InputSourceLoc.getLocWithOffset(getOffsetAt(CurPtr)), Err);
 201   return true;
 202 }
 203
 204 static void skipOverSpaces(const char *&First, const char *const End) {
 205   while (First != End && isHorizontalWhitespace(*First))
 206     ++First;
 207 }
 208
 209 [[nodiscard]] static bool isRawStringLiteral(const char *First,
 210                                              const char *Current) {
 211   assert(First <= Current);
 212
 213   // Check if we can even back up.
 214   if (*Current != '"' || First == Current)
 215     return false;
 216
 217   // Check for an "R".
 218   --Current;
 219   if (*Current != 'R')
 220     return false;
 221   if (First == Current || !isAsciiIdentifierContinue(*--Current))
 222     return true;
 223
 224   // Check for a prefix of "u", "U", or "L".
 225   if (*Current == 'u' || *Current == 'U' || *Current == 'L')
 226     return First == Current || !isAsciiIdentifierContinue(*--Current);
 227
 228   // Check for a prefix of "u8".
 229   if (*Current != '8' || First == Current || *Current-- != 'u')
 230     return false;
 231   return First == Current || !isAsciiIdentifierContinue(*--Current);
 232 }
 233
/// Skips over a raw string literal whose opening quote is at \p First (and
/// whose preceding character is 'R').
///
/// On return \p First is just past the closing quote, or at \p End if the
/// input ran out before the literal was closed.
static void skipRawString(const char *&First, const char *const End) {
  assert(First[0] == '"');
  assert(First[-1] == 'R');

  // Everything between the opening quote and the first '(' is the delimiter.
  const char *Last = ++First;
  while (Last != End && *Last != '(')
    ++Last;
  if (Last == End) {
    First = Last; // Hit the end... just give up.
    return;
  }

  // The literal ends at the first occurrence of: ')' Terminator '"'.
  StringRef Terminator(First, Last - First);
  for (;;) {
    // Move First to just past the next ")".
    First = Last;
    while (First != End && *First != ')')
      ++First;
    if (First == End)
      return;
    ++First;

    // Look ahead for the terminator sequence.
    Last = First;
    while (Last != End && size_t(Last - First) < Terminator.size() &&
           Terminator[Last - First] == *Last)
      ++Last;

    // Check if we hit it (or the end of the file).
    if (Last == End) {
      First = Last;
      return;
    }
    // Only a prefix of the delimiter matched: resume the search from Last.
    if (size_t(Last - First) < Terminator.size())
      continue;
    // The full delimiter matched but is not followed by the closing quote.
    if (*Last != '"')
      continue;
    First = Last + 1;
    return;
  }
}
 275
 276 // Returns the length of EOL, either 0 (no end-of-line), 1 (\n) or 2 (\r\n)
 277 static unsigned isEOL(const char *First, const char *const End) {
 278   if (First == End)
 279     return 0;
 280   if (End - First > 1 && isVerticalWhitespace(First[0]) &&
 281       isVerticalWhitespace(First[1]) && First[0] != First[1])
 282     return 2;
 283   return !!isVerticalWhitespace(First[0]);
 284 }
 285
/// Skips over a quoted literal starting at \p First: a string ('"'), a
/// character literal ('\''), or an angled sequence ('<' ... '>').
///
/// On return \p First is just past the closing terminator, or at the vertical
/// whitespace / \p End that cut the literal short.
static void skipString(const char *&First, const char *const End) {
  assert(*First == '\'' || *First == '"' || *First == '<');
  // An angled sequence closes with '>'; quotes close with the same character.
  const char Terminator = *First == '<' ? '>' : *First;
  for (++First; First != End && *First != Terminator; ++First) {
    // String and character literals don't extend past the end of the line.
    if (isVerticalWhitespace(*First))
      return;
    if (*First != '\\')
      continue;
    // Skip past backslash to the next character. This ensures that the
    // character right after it is skipped as well, which matters if it's
    // the terminator.
    if (++First == End)
      return;
    if (!isWhitespace(*First))
      continue;
    // Whitespace after the backslash might indicate a line continuation.
    const char *FirstAfterBackslashPastSpace = First;
    skipOverSpaces(FirstAfterBackslashPastSpace, End);
    if (unsigned NLSize = isEOL(FirstAfterBackslashPastSpace, End)) {
      // Advance the character pointer to the next line for the next
      // iteration.
      // (The "- 1" leaves First on the last newline character; the loop's
      // ++First then steps onto the next line.)
      First = FirstAfterBackslashPastSpace + NLSize - 1;
    }
  }
  if (First != End)
    ++First; // Finish off the string.
}
 314
 315 // Returns the length of the skipped newline
 316 static unsigned skipNewline(const char *&First, const char *End) {
 317   if (First == End)
 318     return 0;
 319   assert(isVerticalWhitespace(*First));
 320   unsigned Len = isEOL(First, End);
 321   assert(Len && "expected newline");
 322   First += Len;
 323   return Len;
 324 }
 325
 326 static bool wasLineContinuation(const char *First, unsigned EOLLen) {
 327   return *(First - (int)EOLLen - 1) == '\\';
 328 }
 329
 330 static void skipToNewlineRaw(const char *&First, const char *const End) {
 331   for (;;) {
 332     if (First == End)
 333       return;
 334
 335     unsigned Len = isEOL(First, End);
 336     if (Len)
 337       return;
 338
 339     do {
 340       if (++First == End)
 341         return;
 342       Len = isEOL(First, End);
 343     } while (!Len);
 344
 345     if (First[-1] != '\\')
 346       return;
 347
 348     First += Len;
 349     // Keep skipping lines...
 350   }
 351 }
 352
 353 static void skipLineComment(const char *&First, const char *const End) {
 354   assert(First[0] == '/' && First[1] == '/');
 355   First += 2;
 356   skipToNewlineRaw(First, End);
 357 }
 358
 359 static void skipBlockComment(const char *&First, const char *const End) {
 360   assert(First[0] == '/' && First[1] == '*');
 361   if (End - First < 4) {
 362     First = End;
 363     return;
 364   }
 365   for (First += 3; First != End; ++First)
 366     if (First[-1] == '*' && First[0] == '/') {
 367       ++First;
 368       return;
 369     }
 370 }
 371
 372 /// \returns True if the current single quotation mark character is a C++ 14
 373 /// digit separator.
 374 static bool isQuoteCppDigitSeparator(const char *const Start,
 375                                      const char *const Cur,
 376                                      const char *const End) {
 377   assert(*Cur == '\'' && "expected quotation character");
 378   // skipLine called in places where we don't expect a valid number
 379   // body before `start` on the same line, so always return false at the start.
 380   if (Start == Cur)
 381     return false;
 382   // The previous character must be a valid PP number character.
 383   // Make sure that the L, u, U, u8 prefixes don't get marked as a
 384   // separator though.
 385   char Prev = *(Cur - 1);
 386   if (Prev == 'L' || Prev == 'U' || Prev == 'u')
 387     return false;
 388   if (Prev == '8' && (Cur - 1 != Start) && *(Cur - 2) == 'u')
 389     return false;
 390   if (!isPreprocessingNumberBody(Prev))
 391     return false;
 392   // The next character should be a valid identifier body character.
 393   return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1));
 394 }
 395
/// Advances \p First past the end of the current line.
///
/// String/character literals and comments are stepped over as units so that
/// their contents are not mistaken for the end of the line, and a newline
/// preceded by a backslash continues onto the next line. \p LastTokenPtr is
/// updated as non-comment text is encountered.
void Scanner::skipLine(const char *&First, const char *const End) {
  for (;;) {
    assert(First <= End);
    if (First == End)
      return;

    // An empty line: just consume its newline.
    if (isVerticalWhitespace(*First)) {
      skipNewline(First, End);
      return;
    }
    // Remember where this physical line starts; the look-behind helpers below
    // never inspect anything before it.
    const char *Start = First;
    while (First != End && !isVerticalWhitespace(*First)) {
      // Iterate over strings correctly to avoid comments and newlines.
      if (*First == '"' ||
          (*First == '\'' && !isQuoteCppDigitSeparator(Start, First, End))) {
        LastTokenPtr = First;
        if (isRawStringLiteral(Start, First))
          skipRawString(First, End);
        else
          skipString(First, End);
        continue;
      }

      // Iterate over comments correctly.
      if (*First != '/' || End - First < 2) {
        LastTokenPtr = First;
        ++First;
        continue;
      }

      if (First[1] == '/') {
        // "//...".
        skipLineComment(First, End);
        continue;
      }

      // A lone '/' that does not start a comment.
      if (First[1] != '*') {
        LastTokenPtr = First;
        ++First;
        continue;
      }

      // "/*...*/".
      skipBlockComment(First, End);
    }
    if (First == End)
      return;

    // Skip over the newline.
    unsigned Len = skipNewline(First, End);
    if (!wasLineContinuation(First, Len)) // Continue past line-continuations.
      break;
  }
}
 450
 451 void Scanner::skipDirective(StringRef Name, const char *&First,
 452                             const char *const End) {
 453   if (llvm::StringSwitch<bool>(Name)
 454           .Case("warning", true)
 455           .Case("error", true)
 456           .Default(false))
 457     // Do not process quotes or comments.
 458     skipToNewlineRaw(First, End);
 459   else
 460     skipLine(First, End);
 461 }
 462
/// Advances \p First over horizontal whitespace, backslash-newline
/// continuations and comments.
///
/// Stops at the first other character, after skipping a "//" comment, or when
/// fewer than two characters remain.
static void skipWhitespace(const char *&First, const char *const End) {
  for (;;) {
    assert(First <= End);
    skipOverSpaces(First, End);

    // Every check below needs to look at two characters.
    if (End - First < 2)
      return;

    // A backslash followed by a newline: skip both and keep going.
    if (First[0] == '\\' && isVerticalWhitespace(First[1])) {
      skipNewline(++First, End);
      continue;
    }

    // Check for a non-comment character.
    if (First[0] != '/')
      return;

    // "// ...".
    if (First[1] == '/') {
      skipLineComment(First, End);
      return;
    }

    // Cannot be a comment.
    if (First[1] != '*')
      return;

    // "/*...*/".
    skipBlockComment(First, End);
  }
}
 494
 495 bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First,
 496                                      const char *const End) {
 497   const char *DirectiveLoc = Input.data() + CurDirToks.front().Offset;
 498   for (;;) {
 499     const dependency_directives_scan::Token &Tok = lexToken(First, End);
 500     if (Tok.is(tok::eof))
 501       return reportError(
 502           DirectiveLoc,
 503           diag::err_dep_source_scanner_missing_semi_after_at_import);
 504     if (Tok.is(tok::semi))
 505       break;
 506   }
 507   pushDirective(Kind);
 508   skipWhitespace(First, End);
 509   if (First == End)
 510     return false;
 511   if (!isVerticalWhitespace(*First))
 512     return reportError(
 513         DirectiveLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import);
 514   skipNewline(First, End);
 515   return false;
 516 }
 517
 518 dependency_directives_scan::Token &Scanner::lexToken(const char *&First,
 519                                                      const char *const End) {
 520   clang::Token Tok;
 521   TheLexer.LexFromRawLexer(Tok);
 522   First = Input.data() + TheLexer.getCurrentBufferOffset();
 523   assert(First <= End);
 524
 525   unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
 526   CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),
 527                           Tok.getFlags());
 528   return CurDirToks.back();
 529 }
 530
 531 dependency_directives_scan::Token &
 532 Scanner::lexIncludeFilename(const char *&First, const char *const End) {
 533   clang::Token Tok;
 534   TheLexer.LexIncludeFilename(Tok);
 535   First = Input.data() + TheLexer.getCurrentBufferOffset();
 536   assert(First <= End);
 537
 538   unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
 539   CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),
 540                           Tok.getFlags());
 541   return CurDirToks.back();
 542 }
 543
 544 void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
 545   while (true) {
 546     const dependency_directives_scan::Token &Tok = lexToken(First, End);
 547     if (Tok.is(tok::eod))
 548       break;
 549   }
 550 }
 551
/// Returns the spelling of \p Tok.
///
/// When the token is not flagged \c NeedsCleaning the spelling is the token's
/// slice of \p Input. Otherwise the characters are decoded one at a time with
/// \c Lexer::getCharAndSizeNoWarn and the result is stored in \p SplitIds so
/// that the returned reference stays valid.
StringRef
Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
  bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;
  if (LLVM_LIKELY(!NeedsCleaning))
    return Input.slice(Tok.Offset, Tok.getEnd());

  // The cleaned spelling is never longer than the raw token text.
  SmallString<64> Spelling;
  Spelling.resize(Tok.Length);

  // FIXME: C++11 raw string literals need special handling (see getSpellingSlow
  // in the Lexer). Currently we cannot see them due to our LangOpts.

  unsigned SpellingLength = 0;
  const char *BufPtr = Input.begin() + Tok.Offset;
  const char *AfterIdent = Input.begin() + Tok.getEnd();
  while (BufPtr < AfterIdent) {
    // Size is how many source characters the decoded Char occupied.
    auto [Char, Size] = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
    Spelling[SpellingLength++] = Char;
    BufPtr += Size;
  }

  // Spelling is a local buffer; hand back the copy kept as a key in SplitIds.
  return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0)
      .first->first();
}
 576
 577 std::optional<StringRef>
 578 Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
 579   const dependency_directives_scan::Token &Tok = lexToken(First, End);
 580   if (Tok.isNot(tok::raw_identifier)) {
 581     if (!Tok.is(tok::eod))
 582       skipLine(First, End);
 583     return std::nullopt;
 584   }
 585
 586   return cleanStringIfNeeded(Tok);
 587 }
 588
 589 StringRef Scanner::lexIdentifier(const char *&First, const char *const End) {
 590   std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End);
 591   assert(Id && "expected identifier token");
 592   return *Id;
 593 }
 594
 595 bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First,
 596                                          const char *const End) {
 597   if (std::optional<StringRef> FoundId =
 598           tryLexIdentifierOrSkipLine(First, End)) {
 599     if (*FoundId == Id)
 600       return true;
 601     skipLine(First, End);
 602   }
 603   return false;
 604 }
 605
 606 bool Scanner::isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
 607                                     const char *const End) {
 608   const dependency_directives_scan::Token &Tok = lexToken(First, End);
 609   if (Tok.is(K))
 610     return true;
 611   skipLine(First, End);
 612   return false;
 613 }
 614
 615 std::optional<StringRef>
 616 Scanner::tryLexStringLiteralOrSkipLine(const char *&First,
 617                                        const char *const End) {
 618   const dependency_directives_scan::Token &Tok = lexToken(First, End);
 619   if (!tok::isStringLiteral(Tok.Kind)) {
 620     if (!Tok.is(tok::eod))
 621       skipLine(First, End);
 622     return std::nullopt;
 623   }
 624
 625   return cleanStringIfNeeded(Tok);
 626 }
 627
 628 bool Scanner::lexAt(const char *&First, const char *const End) {
 629   // Handle "@import".
 630
 631   // Lex '@'.
 632   const dependency_directives_scan::Token &AtTok = lexToken(First, End);
 633   assert(AtTok.is(tok::at));
 634   (void)AtTok;
 635
 636   if (!isNextIdentifierOrSkipLine("import", First, End))
 637     return false;
 638   return lexModuleDirectiveBody(decl_at_import, First, End);
 639 }
 640
/// Handles a line that starts with an identifier which may introduce a C++
/// module declaration: "module", "import", or either of them preceded by
/// "export". Anything else causes the line to be skipped.
///
/// \returns True on error, false otherwise.
bool Scanner::lexModule(const char *&First, const char *const End) {
  StringRef Id = lexIdentifier(First, End);
  bool Export = false;
  if (Id == "export") {
    Export = true;
    std::optional<StringRef> NextId = tryLexIdentifierOrSkipLine(First, End);
    if (!NextId)
      return false;
    Id = *NextId;
  }

  if (Id != "module" && Id != "import") {
    skipLine(First, End);
    return false;
  }

  skipWhitespace(First, End);

  // Ignore this as a module directive if the next character can't be part of
  // an import.

  // NOTE(review): *First is read here without a First != End check;
  // presumably the input buffer is null-terminated so this is safe - confirm.
  switch (*First) {
  case ':':
  case '<':
  case '"':
    break;
  default:
    if (!isAsciiIdentifierContinue(*First)) {
      skipLine(First, End);
      return false;
    }
  }

  // skipWhitespace moved First without the lexer; bring the lexer back in
  // sync before lexing the body.
  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ false);

  DirectiveKind Kind;
  if (Id == "module")
    Kind = Export ? cxx_export_module_decl : cxx_module_decl;
  else
    Kind = Export ? cxx_export_import_decl : cxx_import_decl;

  return lexModuleDirectiveBody(Kind, First, End);
}
 684
/// Handles the operand of a "_Pragma" operator: '(' string-literal ')'.
///
/// The string contents are scanned with a nested \c Scanner running
/// \c lexPragma. If that recognizes a directive, the tokens collected by this
/// scanner are committed under the recognized kind; otherwise the rest of the
/// line is skipped.
///
/// \returns True on error, false otherwise.
bool Scanner::lex_Pragma(const char *&First, const char *const End) {
  if (!isNextTokenOrSkipLine(tok::l_paren, First, End))
    return false;

  std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(First, End);

  if (!Str || !isNextTokenOrSkipLine(tok::r_paren, First, End))
    return false;

  // Work on a private copy of the literal; prepare_PragmaString presumably
  // rewrites it in place into the pragma's text - verify against its
  // declaration.
  SmallString<64> Buffer(*Str);
  prepare_PragmaString(Buffer);

  // Use a new scanner instance since the tokens will be inside the allocated
  // string. We should already have captured all the relevant tokens in the
  // current scanner.
  SmallVector<dependency_directives_scan::Token> DiscardTokens;
  const char *Begin = Buffer.c_str();
  Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags,
                        InputSourceLoc};

  PragmaScanner.TheLexer.setParsingPreprocessorDirective(true);
  if (PragmaScanner.lexPragma(Begin, Buffer.end()))
    return true;

  // pp_none means the nested scanner did not commit any directive.
  DirectiveKind K = PragmaScanner.topDirective();
  if (K == pp_none) {
    skipLine(First, End);
    return false;
  }

  assert(Begin == Buffer.end());
  pushDirective(K);
  return false;
}
 719
 720 bool Scanner::lexPragma(const char *&First, const char *const End) {
 721   std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
 722   if (!FoundId)
 723     return false;
 724
 725   StringRef Id = *FoundId;
 726   auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
 727                   .Case("once", pp_pragma_once)
 728                   .Case("push_macro", pp_pragma_push_macro)
 729                   .Case("pop_macro", pp_pragma_pop_macro)
 730                   .Case("include_alias", pp_pragma_include_alias)
 731                   .Default(pp_none);
 732   if (Kind != pp_none) {
 733     lexPPDirectiveBody(First, End);
 734     pushDirective(Kind);
 735     return false;
 736   }
 737
 738   if (Id != "clang") {
 739     skipLine(First, End);
 740     return false;
 741   }
 742
 743   FoundId = tryLexIdentifierOrSkipLine(First, End);
 744   if (!FoundId)
 745     return false;
 746   Id = *FoundId;
 747
 748   // #pragma clang system_header
 749   if (Id == "system_header") {
 750     lexPPDirectiveBody(First, End);
 751     pushDirective(pp_pragma_system_header);
 752     return false;
 753   }
 754
 755   if (Id != "module") {
 756     skipLine(First, End);
 757     return false;
 758   }
 759
 760   // #pragma clang module.
 761   if (!isNextIdentifierOrSkipLine("import", First, End))
 762     return false;
 763
 764   // #pragma clang module import.
 765   lexPPDirectiveBody(First, End);
 766   pushDirective(pp_pragma_import);
 767   return false;
 768 }
 769
 770 bool Scanner::lexEndif(const char *&First, const char *const End) {
 771   // Strip out "#else" if it's empty.
 772   if (topDirective() == pp_else)
 773     popDirective();
 774
 775   // If "#ifdef" is empty, strip it and skip the "#endif".
 776   //
 777   // FIXME: Once/if Clang starts disallowing __has_include in macro expansions,
 778   // we can skip empty `#if` and `#elif` blocks as well after scanning for a
 779   // literal __has_include in the condition.  Even without that rule we could
 780   // drop the tokens if we scan for identifiers in the condition and find none.
 781   if (topDirective() == pp_ifdef || topDirective() == pp_ifndef) {
 782     popDirective();
 783     skipLine(First, End);
 784     return false;
 785   }
 786
 787   return lexDefault(pp_endif, First, End);
 788 }
 789
/// Lexes the remaining tokens of the current directive and commits them as a
/// directive of kind \p Kind.
///
/// \returns Always false (no error is reported from here).
bool Scanner::lexDefault(DirectiveKind Kind, const char *&First,
                         const char *const End) {
  lexPPDirectiveBody(First, End);
  pushDirective(Kind);
  return false;
}
 796
 797 static bool isStartOfRelevantLine(char First) {
 798   switch (First) {
 799   case '#':
 800   case '@':
 801   case 'i':
 802   case 'e':
 803   case 'm':
 804   case '_':
 805     return true;
 806   }
 807   return false;
 808 }
 809
/// Processes the line starting at \p First and advances \p First past it.
///
/// Lines that cannot be relevant are skipped. Otherwise the line is
/// dispatched on its first character: '@' to \c lexAt, 'i'/'e'/'m' to
/// \c lexModule, '_' to the "_Pragma" handling, and '#' to the preprocessing
/// directive handling below.
///
/// \returns True on error, false otherwise.
bool Scanner::lexPPLine(const char *&First, const char *const End) {
  assert(First != End);

  skipWhitespace(First, End);
  assert(First <= End);
  if (First == End)
    return false;

  if (!isStartOfRelevantLine(*First)) {
    skipLine(First, End);
    assert(First <= End);
    return false;
  }

  LastTokenPtr = First;

  // First was advanced by hand above; position the lexer at the same spot.
  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ true);

  auto ScEx1 = make_scope_exit([&]() {
    /// Clear Scanner's CurDirToks before returning, in case we didn't push a
    /// new directive.
    CurDirToks.clear();
  });

  // Handle "@import".
  if (*First == '@')
    return lexAt(First, End);

  // Possibly "import", "export" or "module".
  if (*First == 'i' || *First == 'e' || *First == 'm')
    return lexModule(First, End);

  if (*First == '_') {
    if (isNextIdentifierOrSkipLine("_Pragma", First, End))
      return lex_Pragma(First, End);
    return false;
  }

  // Handle preprocessing directives.

  // Directive mode is switched on only for the duration of this line; ScEx2
  // switches it off again on every exit path.
  TheLexer.setParsingPreprocessorDirective(true);
  auto ScEx2 = make_scope_exit(
      [&]() { TheLexer.setParsingPreprocessorDirective(false); });

  // Lex '#'.
  const dependency_directives_scan::Token &HashTok = lexToken(First, End);
  if (HashTok.is(tok::hashhash)) {
    // A \p tok::hashhash at this location is passed by the preprocessor to the
    // parser to interpret, like any other token. So for dependency scanning
    // skip it like a normal token not affecting the preprocessor.
    skipLine(First, End);
    assert(First <= End);
    return false;
  }
  assert(HashTok.is(tok::hash));
  (void)HashTok;

  std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
  if (!FoundId)
    return false;

  StringRef Id = *FoundId;

  if (Id == "pragma")
    return lexPragma(First, End);

  auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
                  .Case("include", pp_include)
                  .Case("__include_macros", pp___include_macros)
                  .Case("define", pp_define)
                  .Case("undef", pp_undef)
                  .Case("import", pp_import)
                  .Case("include_next", pp_include_next)
                  .Case("if", pp_if)
                  .Case("ifdef", pp_ifdef)
                  .Case("ifndef", pp_ifndef)
                  .Case("elif", pp_elif)
                  .Case("elifdef", pp_elifdef)
                  .Case("elifndef", pp_elifndef)
                  .Case("else", pp_else)
                  .Case("endif", pp_endif)
                  .Default(pp_none);
  // A directive that is not tracked: skip it.
  if (Kind == pp_none) {
    skipDirective(Id, First, End);
    return false;
  }

  if (Kind == pp_endif)
    return lexEndif(First, End);

  // The include-like directives lex their filename operand in the lexer's
  // include-filename mode before the rest of the line is lexed.
  switch (Kind) {
  case pp_include:
  case pp___include_macros:
  case pp_include_next:
  case pp_import:
    lexIncludeFilename(First, End);
    break;
  default:
    break;
  }

  // Everything else.
  return lexDefault(Kind, First, End);
}
 913
 914 static void skipUTF8ByteOrderMark(const char *&First, const char *const End) {
 915   if ((End - First) >= 3 && First[0] == '\xef' && First[1] == '\xbb' &&
 916       First[2] == '\xbf')
 917     First += 3;
 918 }
 919
 920 bool Scanner::scanImpl(const char *First, const char *const End) {
 921   skipUTF8ByteOrderMark(First, End);
 922   while (First != End)
 923     if (lexPPLine(First, End))
 924       return true;
 925   return false;
 926 }
 927
 928 bool Scanner::scan(SmallVectorImpl<Directive> &Directives) {
 929   bool Error = scanImpl(Input.begin(), Input.end());
 930
 931   if (!Error) {
 932     // Add an EOF on success.
 933     if (LastTokenPtr &&
 934         (Tokens.empty() || LastTokenPtr > Input.begin() + Tokens.back().Offset))
 935       pushDirective(tokens_present_before_eof);
 936     pushDirective(pp_eof);
 937   }
 938
 939   ArrayRef<dependency_directives_scan::Token> RemainingTokens = Tokens;
 940   for (const DirectiveWithTokens &DirWithToks : DirsWithToks) {
 941     assert(RemainingTokens.size() >= DirWithToks.NumTokens);
 942     Directives.emplace_back(DirWithToks.Kind,
 943                             RemainingTokens.take_front(DirWithToks.NumTokens));
 944     RemainingTokens = RemainingTokens.drop_front(DirWithToks.NumTokens);
 945   }
 946   assert(RemainingTokens.empty());
 947
 948   return Error;
 949 }
 950
 951 bool clang::scanSourceForDependencyDirectives(
 952     StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
 953     SmallVectorImpl<Directive> &Directives, DiagnosticsEngine *Diags,
 954     SourceLocation InputSourceLoc) {
 955   return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives);
 956 }
 957
 958 void clang::printDependencyDirectivesAsSource(
 959     StringRef Source,
 960     ArrayRef<dependency_directives_scan::Directive> Directives,
 961     llvm::raw_ostream &OS) {
 962   // Add a space separator where it is convenient for testing purposes.
 963   auto needsSpaceSeparator =
 964       [](tok::TokenKind Prev,
 965          const dependency_directives_scan::Token &Tok) -> bool {
 966     if (Prev == Tok.Kind)
 967       return !Tok.isOneOf(tok::l_paren, tok::r_paren, tok::l_square,
 968                           tok::r_square);
 969     if (Prev == tok::raw_identifier &&
 970         Tok.isOneOf(tok::hash, tok::numeric_constant, tok::string_literal,
 971                     tok::char_constant, tok::header_name))
 972       return true;
 973     if (Prev == tok::r_paren &&
 974         Tok.isOneOf(tok::raw_identifier, tok::hash, tok::string_literal,
 975                     tok::char_constant, tok::unknown))
 976       return true;
 977     if (Prev == tok::comma &&
 978         Tok.isOneOf(tok::l_paren, tok::string_literal, tok::less))
 979       return true;
 980     return false;
 981   };
 982
 983   for (const dependency_directives_scan::Directive &Directive : Directives) {
 984     if (Directive.Kind == tokens_present_before_eof)
 985       OS << "<TokBeforeEOF>";
 986     std::optional<tok::TokenKind> PrevTokenKind;
 987     for (const dependency_directives_scan::Token &Tok : Directive.Tokens) {
 988       if (PrevTokenKind && needsSpaceSeparator(*PrevTokenKind, Tok))
 989         OS << ' ';
 990       PrevTokenKind = Tok.Kind;
 991       OS << Source.slice(Tok.Offset, Tok.getEnd());
 992     }
 993   }
 994 }