clang/lib/Lex/Lexer.cpp

   1 //===- Lexer.cpp - C Language Family Lexer --------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 //  This file implements the Lexer and Token interfaces.
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #include "clang/Lex/Lexer.h"
  14 #include "UnicodeCharSets.h"
  15 #include "clang/Basic/CharInfo.h"
  16 #include "clang/Basic/Diagnostic.h"
  17 #include "clang/Basic/IdentifierTable.h"
  18 #include "clang/Basic/LLVM.h"
  19 #include "clang/Basic/LangOptions.h"
  20 #include "clang/Basic/SourceLocation.h"
  21 #include "clang/Basic/SourceManager.h"
  22 #include "clang/Basic/TokenKinds.h"
  23 #include "clang/Lex/LexDiagnostic.h"
  24 #include "clang/Lex/LiteralSupport.h"
  25 #include "clang/Lex/MultipleIncludeOpt.h"
  26 #include "clang/Lex/Preprocessor.h"
  27 #include "clang/Lex/PreprocessorOptions.h"
  28 #include "clang/Lex/Token.h"
  29 #include "llvm/ADT/STLExtras.h"
  30 #include "llvm/ADT/StringExtras.h"
  31 #include "llvm/ADT/StringRef.h"
  32 #include "llvm/ADT/StringSwitch.h"
  33 #include "llvm/Support/Compiler.h"
  34 #include "llvm/Support/ConvertUTF.h"
  35 #include "llvm/Support/MathExtras.h"
  36 #include "llvm/Support/MemoryBufferRef.h"
  37 #include "llvm/Support/NativeFormatting.h"
  38 #include "llvm/Support/Unicode.h"
  39 #include "llvm/Support/UnicodeCharRanges.h"
  40 #include <algorithm>
  41 #include <cassert>
  42 #include <cstddef>
  43 #include <cstdint>
  44 #include <cstring>
  45 #include <optional>
  46 #include <string>
  47 #include <tuple>
  48 #include <utility>
  49
  50 #ifdef __SSE4_2__
  51 #include <nmmintrin.h>
  52 #endif
  53
  54 using namespace clang;
  55
  56 //===----------------------------------------------------------------------===//
  57 // Token Class Implementation
  58 //===----------------------------------------------------------------------===//
  59
  60 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
  61 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
  62   if (isAnnotation())
  63     return false;
  64   if (const IdentifierInfo *II = getIdentifierInfo())
  65     return II->getObjCKeywordID() == objcKey;
  66   return false;
  67 }
  68
  69 /// getObjCKeywordID - Return the ObjC keyword kind.
  70 tok::ObjCKeywordKind Token::getObjCKeywordID() const {
  71   if (isAnnotation())
  72     return tok::objc_not_keyword;
  73   const IdentifierInfo *specId = getIdentifierInfo();
  74   return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
  75 }
  76
  77 //===----------------------------------------------------------------------===//
  78 // Lexer Class Implementation
  79 //===----------------------------------------------------------------------===//
  80
// Intentionally empty out-of-line definition.
// NOTE(review): presumably the usual "anchor" method that gives the class a
// home translation unit -- confirm against the declaration in Lexer.h.
void Lexer::anchor() {}
  82
  83 void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
  84                       const char *BufEnd) {
  85   BufferStart = BufStart;
  86   BufferPtr = BufPtr;
  87   BufferEnd = BufEnd;
  88
  89   assert(BufEnd[0] == 0 &&
  90          "We assume that the input buffer has a null character at the end"
  91          " to simplify lexing!");
  92
  93   // Check whether we have a BOM in the beginning of the buffer. If yes - act
  94   // accordingly. Right now we support only UTF-8 with and without BOM, so, just
  95   // skip the UTF-8 BOM if it's present.
  96   if (BufferStart == BufferPtr) {
  97     // Determine the size of the BOM.
  98     StringRef Buf(BufferStart, BufferEnd - BufferStart);
  99     size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
 100       .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
 101       .Default(0);
 102
 103     // Skip the BOM.
 104     BufferPtr += BOMLength;
 105   }
 106
 107   Is_PragmaLexer = false;
 108   CurrentConflictMarkerState = CMK_None;
 109
 110   // Start of the file is a start of line.
 111   IsAtStartOfLine = true;
 112   IsAtPhysicalStartOfLine = true;
 113
 114   HasLeadingSpace = false;
 115   HasLeadingEmptyMacro = false;
 116
 117   // We are not after parsing a #.
 118   ParsingPreprocessorDirective = false;
 119
 120   // We are not after parsing #include.
 121   ParsingFilename = false;
 122
 123   // We are not in raw mode.  Raw mode disables diagnostics and interpretation
 124   // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
 125   // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
 126   // or otherwise skipping over tokens.
 127   LexingRawMode = false;
 128
 129   // Default to not keeping comments.
 130   ExtendedTokenMode = 0;
 131
 132   NewLinePtr = nullptr;
 133 }
 134
 135 /// Lexer constructor - Create a new lexer object for the specified buffer
 136 /// with the specified preprocessor managing the lexing process.  This lexer
 137 /// assumes that the associated file buffer and Preprocessor objects will
 138 /// outlive it, so it doesn't take ownership of either of them.
 139 Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
 140              Preprocessor &PP, bool IsFirstIncludeOfFile)
 141     : PreprocessorLexer(&PP, FID),
 142       FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
 143       LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
 144       IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
 145   InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
 146             InputFile.getBufferEnd());
 147
 148   resetExtendedTokenMode();
 149 }
 150
/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
///
/// \param fileloc              stored as FileLoc.
/// \param langOpts             language options used while lexing.
/// \param BufStart             first byte of the buffer.
/// \param BufPtr               position at which lexing starts.
/// \param BufEnd               end of the buffer (InitLexer asserts that it
///                             points at a NUL byte).
/// \param IsFirstIncludeOfFile stored as IsFirstTimeLexingFile.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd,
             bool IsFirstIncludeOfFile)
    : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  // InitLexer clears LexingRawMode, so the flag has to be set afterwards.
  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode.
  LexingRawMode = true;
}
 164
/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
///
/// Convenience overload: delegates to the pointer-based raw constructor, using
/// the start-of-file location of \p FID as the file location and lexing
/// \p FromFile from its first byte to its end.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
             const SourceManager &SM, const LangOptions &langOpts,
             bool IsFirstIncludeOfFile)
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
            FromFile.getBufferStart(), FromFile.getBufferEnd(),
            IsFirstIncludeOfFile) {}
 174
 175 void Lexer::resetExtendedTokenMode() {
 176   assert(PP && "Cannot reset token mode without a preprocessor");
 177   if (LangOpts.TraditionalCPP)
 178     SetKeepWhitespaceMode(true);
 179   else
 180     SetCommentRetentionState(PP->getCommentRetentionState());
 181 }
 182
 183 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
 184 /// _Pragma expansion.  This has a variety of magic semantics that this method
 185 /// sets up.  It returns a new'd Lexer that must be delete'd when done.
 186 ///
 187 /// On entrance to this routine, TokStartLoc is a macro location which has a
 188 /// spelling loc that indicates the bytes to be lexed for the token and an
 189 /// expansion location that indicates where all lexed tokens should be
 190 /// "expanded from".
 191 ///
 192 /// TODO: It would really be nice to make _Pragma just be a wrapper around a
 193 /// normal lexer that remaps tokens as they fly by.  This would require making
 194 /// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
 195 /// interface that could handle this stuff.  This would pull GetMappedTokenLoc
 196 /// out of the critical path of the lexer!
 197 ///
 198 Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
 199                                  SourceLocation ExpansionLocStart,
 200                                  SourceLocation ExpansionLocEnd,
 201                                  unsigned TokLen, Preprocessor &PP) {
 202   SourceManager &SM = PP.getSourceManager();
 203
 204   // Create the lexer as if we were going to lex the file normally.
 205   FileID SpellingFID = SM.getFileID(SpellingLoc);
 206   llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
 207   Lexer *L = new Lexer(SpellingFID, InputFile, PP);
 208
 209   // Now that the lexer is created, change the start/end locations so that we
 210   // just lex the subsection of the file that we want.  This is lexing from a
 211   // scratch buffer.
 212   const char *StrData = SM.getCharacterData(SpellingLoc);
 213
 214   L->BufferPtr = StrData;
 215   L->BufferEnd = StrData+TokLen;
 216   assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
 217
 218   // Set the SourceLocation with the remapping information.  This ensures that
 219   // GetMappedTokenLoc will remap the tokens as they are lexed.
 220   L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
 221                                      ExpansionLocStart,
 222                                      ExpansionLocEnd, TokLen);
 223
 224   // Ensure that the lexer thinks it is inside a directive, so that end \n will
 225   // return an EOD token.
 226   L->ParsingPreprocessorDirective = true;
 227
 228   // This lexer really is for _Pragma.
 229   L->Is_PragmaLexer = true;
 230   return L;
 231 }
 232
 233 void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
 234   this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
 235   this->IsAtStartOfLine = IsAtStartOfLine;
 236   assert((BufferStart + Offset) <= BufferEnd);
 237   BufferPtr = BufferStart + Offset;
 238 }
 239
 240 template <typename T> static void StringifyImpl(T &Str, char Quote) {
 241   typename T::size_type i = 0, e = Str.size();
 242   while (i < e) {
 243     if (Str[i] == '\\' || Str[i] == Quote) {
 244       Str.insert(Str.begin() + i, '\\');
 245       i += 2;
 246       ++e;
 247     } else if (Str[i] == '\n' || Str[i] == '\r') {
 248       // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
 249       if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
 250           Str[i] != Str[i + 1]) {
 251         Str[i] = '\\';
 252         Str[i + 1] = 'n';
 253       } else {
 254         // Replace '\n' and '\r' to '\\' followed by 'n'.
 255         Str[i] = '\\';
 256         Str.insert(Str.begin() + i + 1, 'n');
 257         ++e;
 258       }
 259       i += 2;
 260     } else
 261       ++i;
 262   }
 263 }
 264
 265 std::string Lexer::Stringify(StringRef Str, bool Charify) {
 266   std::string Result = std::string(Str);
 267   char Quote = Charify ? '\'' : '"';
 268   StringifyImpl(Result, Quote);
 269   return Result;
 270 }
 271
 272 void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
 273
 274 //===----------------------------------------------------------------------===//
 275 // Token Spelling
 276 //===----------------------------------------------------------------------===//
 277
 278 /// Slow case of getSpelling. Extract the characters comprising the
 279 /// spelling of this token from the provided input buffer.
 280 static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
 281                               const LangOptions &LangOpts, char *Spelling) {
 282   assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
 283
 284   size_t Length = 0;
 285   const char *BufEnd = BufPtr + Tok.getLength();
 286
 287   if (tok::isStringLiteral(Tok.getKind())) {
 288     // Munch the encoding-prefix and opening double-quote.
 289     while (BufPtr < BufEnd) {
 290       auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
 291       Spelling[Length++] = CharAndSize.Char;
 292       BufPtr += CharAndSize.Size;
 293
 294       if (Spelling[Length - 1] == '"')
 295         break;
 296     }
 297
 298     // Raw string literals need special handling; trigraph expansion and line
 299     // splicing do not occur within their d-char-sequence nor within their
 300     // r-char-sequence.
 301     if (Length >= 2 &&
 302         Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
 303       // Search backwards from the end of the token to find the matching closing
 304       // quote.
 305       const char *RawEnd = BufEnd;
 306       do --RawEnd; while (*RawEnd != '"');
 307       size_t RawLength = RawEnd - BufPtr + 1;
 308
 309       // Everything between the quotes is included verbatim in the spelling.
 310       memcpy(Spelling + Length, BufPtr, RawLength);
 311       Length += RawLength;
 312       BufPtr += RawLength;
 313
 314       // The rest of the token is lexed normally.
 315     }
 316   }
 317
 318   while (BufPtr < BufEnd) {
 319     auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
 320     Spelling[Length++] = CharAndSize.Char;
 321     BufPtr += CharAndSize.Size;
 322   }
 323
 324   assert(Length < Tok.getLength() &&
 325          "NeedsCleaning flag set on token that didn't need cleaning!");
 326   return Length;
 327 }
 328
 329 /// getSpelling() - Return the 'spelling' of this token.  The spelling of a
 330 /// token are the characters used to represent the token in the source file
 331 /// after trigraph expansion and escaped-newline folding.  In particular, this
 332 /// wants to get the true, uncanonicalized, spelling of things like digraphs
 333 /// UCNs, etc.
 334 StringRef Lexer::getSpelling(SourceLocation loc,
 335                              SmallVectorImpl<char> &buffer,
 336                              const SourceManager &SM,
 337                              const LangOptions &options,
 338                              bool *invalid) {
 339   // Break down the source location.
 340   std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
 341
 342   // Try to the load the file buffer.
 343   bool invalidTemp = false;
 344   StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
 345   if (invalidTemp) {
 346     if (invalid) *invalid = true;
 347     return {};
 348   }
 349
 350   const char *tokenBegin = file.data() + locInfo.second;
 351
 352   // Lex from the start of the given location.
 353   Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
 354               file.begin(), tokenBegin, file.end());
 355   Token token;
 356   lexer.LexFromRawLexer(token);
 357
 358   unsigned length = token.getLength();
 359
 360   // Common case:  no need for cleaning.
 361   if (!token.needsCleaning())
 362     return StringRef(tokenBegin, length);
 363
 364   // Hard case, we need to relex the characters into the string.
 365   buffer.resize(length);
 366   buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
 367   return StringRef(buffer.data(), buffer.size());
 368 }
 369
 370 /// getSpelling() - Return the 'spelling' of this token.  The spelling of a
 371 /// token are the characters used to represent the token in the source file
 372 /// after trigraph expansion and escaped-newline folding.  In particular, this
 373 /// wants to get the true, uncanonicalized, spelling of things like digraphs
 374 /// UCNs, etc.
 375 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
 376                                const LangOptions &LangOpts, bool *Invalid) {
 377   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
 378
 379   bool CharDataInvalid = false;
 380   const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
 381                                                     &CharDataInvalid);
 382   if (Invalid)
 383     *Invalid = CharDataInvalid;
 384   if (CharDataInvalid)
 385     return {};
 386
 387   // If this token contains nothing interesting, return it directly.
 388   if (!Tok.needsCleaning())
 389     return std::string(TokStart, TokStart + Tok.getLength());
 390
 391   std::string Result;
 392   Result.resize(Tok.getLength());
 393   Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
 394   return Result;
 395 }
 396
 397 /// getSpelling - This method is used to get the spelling of a token into a
 398 /// preallocated buffer, instead of as an std::string.  The caller is required
 399 /// to allocate enough space for the token, which is guaranteed to be at least
 400 /// Tok.getLength() bytes long.  The actual length of the token is returned.
 401 ///
 402 /// Note that this method may do two possible things: it may either fill in
 403 /// the buffer specified with characters, or it may *change the input pointer*
 404 /// to point to a constant buffer with the data already in it (avoiding a
 405 /// copy).  The caller is not allowed to modify the returned buffer pointer
 406 /// if an internal buffer is returned.
 407 unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
 408                             const SourceManager &SourceMgr,
 409                             const LangOptions &LangOpts, bool *Invalid) {
 410   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
 411
 412   const char *TokStart = nullptr;
 413   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
 414   if (Tok.is(tok::raw_identifier))
 415     TokStart = Tok.getRawIdentifier().data();
 416   else if (!Tok.hasUCN()) {
 417     if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
 418       // Just return the string from the identifier table, which is very quick.
 419       Buffer = II->getNameStart();
 420       return II->getLength();
 421     }
 422   }
 423
 424   // NOTE: this can be checked even after testing for an IdentifierInfo.
 425   if (Tok.isLiteral())
 426     TokStart = Tok.getLiteralData();
 427
 428   if (!TokStart) {
 429     // Compute the start of the token in the input lexer buffer.
 430     bool CharDataInvalid = false;
 431     TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
 432     if (Invalid)
 433       *Invalid = CharDataInvalid;
 434     if (CharDataInvalid) {
 435       Buffer = "";
 436       return 0;
 437     }
 438   }
 439
 440   // If this token contains nothing interesting, return it directly.
 441   if (!Tok.needsCleaning()) {
 442     Buffer = TokStart;
 443     return Tok.getLength();
 444   }
 445
 446   // Otherwise, hard case, relex the characters into the string.
 447   return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
 448 }
 449
 450 /// MeasureTokenLength - Relex the token at the specified location and return
 451 /// its length in bytes in the input file.  If the token needs cleaning (e.g.
 452 /// includes a trigraph or an escaped newline) then this count includes bytes
 453 /// that are part of that.
 454 unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
 455                                    const SourceManager &SM,
 456                                    const LangOptions &LangOpts) {
 457   Token TheTok;
 458   if (getRawToken(Loc, TheTok, SM, LangOpts))
 459     return 0;
 460   return TheTok.getLength();
 461 }
 462
 463 /// Relex the token at the specified location.
 464 /// \returns true if there was a failure, false on success.
 465 bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
 466                         const SourceManager &SM,
 467                         const LangOptions &LangOpts,
 468                         bool IgnoreWhiteSpace) {
 469   // TODO: this could be special cased for common tokens like identifiers, ')',
 470   // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
 471   // all obviously single-char tokens.  This could use
 472   // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
 473   // something.
 474
 475   // If this comes from a macro expansion, we really do want the macro name, not
 476   // the token this macro expanded to.
 477   Loc = SM.getExpansionLoc(Loc);
 478   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
 479   bool Invalid = false;
 480   StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
 481   if (Invalid)
 482     return true;
 483
 484   const char *StrData = Buffer.data()+LocInfo.second;
 485
 486   if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
 487     return true;
 488
 489   // Create a lexer starting at the beginning of this token.
 490   Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
 491                  Buffer.begin(), StrData, Buffer.end());
 492   TheLexer.SetCommentRetentionState(true);
 493   TheLexer.LexFromRawLexer(Result);
 494   return false;
 495 }
 496
 497 /// Returns the pointer that points to the beginning of line that contains
 498 /// the given offset, or null if the offset if invalid.
 499 static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
 500   const char *BufStart = Buffer.data();
 501   if (Offset >= Buffer.size())
 502     return nullptr;
 503
 504   const char *LexStart = BufStart + Offset;
 505   for (; LexStart != BufStart; --LexStart) {
 506     if (isVerticalWhitespace(LexStart[0]) &&
 507         !Lexer::isNewLineEscaped(BufStart, LexStart)) {
 508       // LexStart should point at first character of logical line.
 509       ++LexStart;
 510       break;
 511     }
 512   }
 513   return LexStart;
 514 }
 515
 516 static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
 517                                               const SourceManager &SM,
 518                                               const LangOptions &LangOpts) {
 519   assert(Loc.isFileID());
 520   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
 521   if (LocInfo.first.isInvalid())
 522     return Loc;
 523
 524   bool Invalid = false;
 525   StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
 526   if (Invalid)
 527     return Loc;
 528
 529   // Back up from the current location until we hit the beginning of a line
 530   // (or the buffer). We'll relex from that point.
 531   const char *StrData = Buffer.data() + LocInfo.second;
 532   const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
 533   if (!LexStart || LexStart == StrData)
 534     return Loc;
 535
 536   // Create a lexer starting at the beginning of this token.
 537   SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
 538   Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
 539                  Buffer.end());
 540   TheLexer.SetCommentRetentionState(true);
 541
 542   // Lex tokens until we find the token that contains the source location.
 543   Token TheTok;
 544   do {
 545     TheLexer.LexFromRawLexer(TheTok);
 546
 547     if (TheLexer.getBufferLocation() > StrData) {
 548       // Lexing this token has taken the lexer past the source location we're
 549       // looking for. If the current token encompasses our source location,
 550       // return the beginning of that token.
 551       if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
 552         return TheTok.getLocation();
 553
 554       // We ended up skipping over the source location entirely, which means
 555       // that it points into whitespace. We're done here.
 556       break;
 557     }
 558   } while (TheTok.getKind() != tok::eof);
 559
 560   // We've passed our source location; just return the original source location.
 561   return Loc;
 562 }
 563
 564 SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
 565                                           const SourceManager &SM,
 566                                           const LangOptions &LangOpts) {
 567   if (Loc.isFileID())
 568     return getBeginningOfFileToken(Loc, SM, LangOpts);
 569
 570   if (!SM.isMacroArgExpansion(Loc))
 571     return Loc;
 572
 573   SourceLocation FileLoc = SM.getSpellingLoc(Loc);
 574   SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
 575   std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
 576   std::pair<FileID, unsigned> BeginFileLocInfo =
 577       SM.getDecomposedLoc(BeginFileLoc);
 578   assert(FileLocInfo.first == BeginFileLocInfo.first &&
 579          FileLocInfo.second >= BeginFileLocInfo.second);
 580   return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
 581 }
 582
namespace {

/// Classification of a preprocessor directive name, used by
/// Lexer::ComputePreamble while scanning for the end of the preamble.
enum PreambleDirectiveKind {
  /// A recognized directive; the scan skips over it and continues.
  PDK_Skipped,
  /// An unrecognized directive; the scan stops at its '#'.
  PDK_Unknown
};

} // namespace
 591
 592 PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
 593                                       const LangOptions &LangOpts,
 594                                       unsigned MaxLines) {
 595   // Create a lexer starting at the beginning of the file. Note that we use a
 596   // "fake" file source location at offset 1 so that the lexer will track our
 597   // position within the file.
 598   const SourceLocation::UIntTy StartOffset = 1;
 599   SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
 600   Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
 601                  Buffer.end());
 602   TheLexer.SetCommentRetentionState(true);
 603
 604   bool InPreprocessorDirective = false;
 605   Token TheTok;
 606   SourceLocation ActiveCommentLoc;
 607
 608   unsigned MaxLineOffset = 0;
 609   if (MaxLines) {
 610     const char *CurPtr = Buffer.begin();
 611     unsigned CurLine = 0;
 612     while (CurPtr != Buffer.end()) {
 613       char ch = *CurPtr++;
 614       if (ch == '\n') {
 615         ++CurLine;
 616         if (CurLine == MaxLines)
 617           break;
 618       }
 619     }
 620     if (CurPtr != Buffer.end())
 621       MaxLineOffset = CurPtr - Buffer.begin();
 622   }
 623
 624   do {
 625     TheLexer.LexFromRawLexer(TheTok);
 626
 627     if (InPreprocessorDirective) {
 628       // If we've hit the end of the file, we're done.
 629       if (TheTok.getKind() == tok::eof) {
 630         break;
 631       }
 632
 633       // If we haven't hit the end of the preprocessor directive, skip this
 634       // token.
 635       if (!TheTok.isAtStartOfLine())
 636         continue;
 637
 638       // We've passed the end of the preprocessor directive, and will look
 639       // at this token again below.
 640       InPreprocessorDirective = false;
 641     }
 642
 643     // Keep track of the # of lines in the preamble.
 644     if (TheTok.isAtStartOfLine()) {
 645       unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
 646
 647       // If we were asked to limit the number of lines in the preamble,
 648       // and we're about to exceed that limit, we're done.
 649       if (MaxLineOffset && TokOffset >= MaxLineOffset)
 650         break;
 651     }
 652
 653     // Comments are okay; skip over them.
 654     if (TheTok.getKind() == tok::comment) {
 655       if (ActiveCommentLoc.isInvalid())
 656         ActiveCommentLoc = TheTok.getLocation();
 657       continue;
 658     }
 659
 660     if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
 661       // This is the start of a preprocessor directive.
 662       Token HashTok = TheTok;
 663       InPreprocessorDirective = true;
 664       ActiveCommentLoc = SourceLocation();
 665
 666       // Figure out which directive this is. Since we're lexing raw tokens,
 667       // we don't have an identifier table available. Instead, just look at
 668       // the raw identifier to recognize and categorize preprocessor directives.
 669       TheLexer.LexFromRawLexer(TheTok);
 670       if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
 671         StringRef Keyword = TheTok.getRawIdentifier();
 672         PreambleDirectiveKind PDK
 673           = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
 674               .Case("include", PDK_Skipped)
 675               .Case("__include_macros", PDK_Skipped)
 676               .Case("define", PDK_Skipped)
 677               .Case("undef", PDK_Skipped)
 678               .Case("line", PDK_Skipped)
 679               .Case("error", PDK_Skipped)
 680               .Case("pragma", PDK_Skipped)
 681               .Case("import", PDK_Skipped)
 682               .Case("include_next", PDK_Skipped)
 683               .Case("warning", PDK_Skipped)
 684               .Case("ident", PDK_Skipped)
 685               .Case("sccs", PDK_Skipped)
 686               .Case("assert", PDK_Skipped)
 687               .Case("unassert", PDK_Skipped)
 688               .Case("if", PDK_Skipped)
 689               .Case("ifdef", PDK_Skipped)
 690               .Case("ifndef", PDK_Skipped)
 691               .Case("elif", PDK_Skipped)
 692               .Case("elifdef", PDK_Skipped)
 693               .Case("elifndef", PDK_Skipped)
 694               .Case("else", PDK_Skipped)
 695               .Case("endif", PDK_Skipped)
 696               .Default(PDK_Unknown);
 697
 698         switch (PDK) {
 699         case PDK_Skipped:
 700           continue;
 701
 702         case PDK_Unknown:
 703           // We don't know what this directive is; stop at the '#'.
 704           break;
 705         }
 706       }
 707
 708       // We only end up here if we didn't recognize the preprocessor
 709       // directive or it was one that can't occur in the preamble at this
 710       // point. Roll back the current token to the location of the '#'.
 711       TheTok = HashTok;
 712     } else if (TheTok.isAtStartOfLine() &&
 713                TheTok.getKind() == tok::raw_identifier &&
 714                TheTok.getRawIdentifier() == "module" &&
 715                LangOpts.CPlusPlusModules) {
 716       // The initial global module fragment introducer "module;" is part of
 717       // the preamble, which runs up to the module declaration "module foo;".
 718       Token ModuleTok = TheTok;
 719       do {
 720         TheLexer.LexFromRawLexer(TheTok);
 721       } while (TheTok.getKind() == tok::comment);
 722       if (TheTok.getKind() != tok::semi) {
 723         // Not global module fragment, roll back.
 724         TheTok = ModuleTok;
 725         break;
 726       }
 727       continue;
 728     }
 729
 730     // We hit a token that we don't recognize as being in the
 731     // "preprocessing only" part of the file, so we're no longer in
 732     // the preamble.
 733     break;
 734   } while (true);
 735
 736   SourceLocation End;
 737   if (ActiveCommentLoc.isValid())
 738     End = ActiveCommentLoc; // don't truncate a decl comment.
 739   else
 740     End = TheTok.getLocation();
 741
 742   return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
 743                         TheTok.isAtStartOfLine());
 744 }
 745
 746 unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
 747                                      const SourceManager &SM,
 748                                      const LangOptions &LangOpts) {
 749   // Figure out how many physical characters away the specified expansion
 750   // character is.  This needs to take into consideration newlines and
 751   // trigraphs.
 752   bool Invalid = false;
 753   const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
 754
 755   // If they request the first char of the token, we're trivially done.
 756   if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
 757     return 0;
 758
 759   unsigned PhysOffset = 0;
 760
 761   // The usual case is that tokens don't contain anything interesting.  Skip
 762   // over the uninteresting characters.  If a token only consists of simple
 763   // chars, this method is extremely fast.
 764   while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
 765     if (CharNo == 0)
 766       return PhysOffset;
 767     ++TokPtr;
 768     --CharNo;
 769     ++PhysOffset;
 770   }
 771
 772   // If we have a character that may be a trigraph or escaped newline, use a
 773   // lexer to parse it correctly.
 774   for (; CharNo; --CharNo) {
 775     auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
 776     TokPtr += CharAndSize.Size;
 777     PhysOffset += CharAndSize.Size;
 778   }
 779
 780   // Final detail: if we end up on an escaped newline, we want to return the
 781   // location of the actual byte of the token.  For example foo\<newline>bar
 782   // advanced by 3 should return the location of b, not of \\.  One compounding
 783   // detail of this is that the escape may be made by a trigraph.
 784   if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
 785     PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
 786
 787   return PhysOffset;
 788 }
 789
 790 /// Computes the source location just past the end of the
 791 /// token at this source location.
 792 ///
 793 /// This routine can be used to produce a source location that
 794 /// points just past the end of the token referenced by \p Loc, and
 795 /// is generally used when a diagnostic needs to point just after a
 796 /// token where it expected something different that it received. If
 797 /// the returned source location would not be meaningful (e.g., if
 798 /// it points into a macro), this routine returns an invalid
 799 /// source location.
 800 ///
 801 /// \param Offset an offset from the end of the token, where the source
 802 /// location should refer to. The default offset (0) produces a source
 803 /// location pointing just past the end of the token; an offset of 1 produces
 804 /// a source location pointing to the last character in the token, etc.
 805 SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
 806                                           const SourceManager &SM,
 807                                           const LangOptions &LangOpts) {
 808   if (Loc.isInvalid())
 809     return {};
 810
 811   if (Loc.isMacroID()) {
 812     if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
 813       return {}; // Points inside the macro expansion.
 814   }
 815
 816   unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
 817   if (Len > Offset)
 818     Len = Len - Offset;
 819   else
 820     return Loc;
 821
 822   return Loc.getLocWithOffset(Len);
 823 }
 824
 825 /// Returns true if the given MacroID location points at the first
 826 /// token of the macro expansion.
 827 bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
 828                                       const SourceManager &SM,
 829                                       const LangOptions &LangOpts,
 830                                       SourceLocation *MacroBegin) {
 831   assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
 832
 833   SourceLocation expansionLoc;
 834   if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
 835     return false;
 836
 837   if (expansionLoc.isFileID()) {
 838     // No other macro expansions, this is the first.
 839     if (MacroBegin)
 840       *MacroBegin = expansionLoc;
 841     return true;
 842   }
 843
 844   return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
 845 }
 846
 847 /// Returns true if the given MacroID location points at the last
 848 /// token of the macro expansion.
 849 bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
 850                                     const SourceManager &SM,
 851                                     const LangOptions &LangOpts,
 852                                     SourceLocation *MacroEnd) {
 853   assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
 854
 855   SourceLocation spellLoc = SM.getSpellingLoc(loc);
 856   unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
 857   if (tokLen == 0)
 858     return false;
 859
 860   SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
 861   SourceLocation expansionLoc;
 862   if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
 863     return false;
 864
 865   if (expansionLoc.isFileID()) {
 866     // No other macro expansions.
 867     if (MacroEnd)
 868       *MacroEnd = expansionLoc;
 869     return true;
 870   }
 871
 872   return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
 873 }
 874
 875 static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
 876                                              const SourceManager &SM,
 877                                              const LangOptions &LangOpts) {
 878   SourceLocation Begin = Range.getBegin();
 879   SourceLocation End = Range.getEnd();
 880   assert(Begin.isFileID() && End.isFileID());
 881   if (Range.isTokenRange()) {
 882     End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
 883     if (End.isInvalid())
 884       return {};
 885   }
 886
 887   // Break down the source locations.
 888   FileID FID;
 889   unsigned BeginOffs;
 890   std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
 891   if (FID.isInvalid())
 892     return {};
 893
 894   unsigned EndOffs;
 895   if (!SM.isInFileID(End, FID, &EndOffs) ||
 896       BeginOffs > EndOffs)
 897     return {};
 898
 899   return CharSourceRange::getCharRange(Begin, End);
 900 }
 901
 902 // Assumes that `Loc` is in an expansion.
 903 static bool isInExpansionTokenRange(const SourceLocation Loc,
 904                                     const SourceManager &SM) {
 905   return SM.getSLocEntry(SM.getFileID(Loc))
 906       .getExpansion()
 907       .isExpansionTokenRange();
 908 }
 909
 910 CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
 911                                          const SourceManager &SM,
 912                                          const LangOptions &LangOpts) {
 913   SourceLocation Begin = Range.getBegin();
 914   SourceLocation End = Range.getEnd();
 915   if (Begin.isInvalid() || End.isInvalid())
 916     return {};
 917
 918   if (Begin.isFileID() && End.isFileID())
 919     return makeRangeFromFileLocs(Range, SM, LangOpts);
 920
 921   if (Begin.isMacroID() && End.isFileID()) {
 922     if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
 923       return {};
 924     Range.setBegin(Begin);
 925     return makeRangeFromFileLocs(Range, SM, LangOpts);
 926   }
 927
 928   if (Begin.isFileID() && End.isMacroID()) {
 929     if (Range.isTokenRange()) {
 930       if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
 931         return {};
 932       // Use the *original* end, not the expanded one in `End`.
 933       Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
 934     } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
 935       return {};
 936     Range.setEnd(End);
 937     return makeRangeFromFileLocs(Range, SM, LangOpts);
 938   }
 939
 940   assert(Begin.isMacroID() && End.isMacroID());
 941   SourceLocation MacroBegin, MacroEnd;
 942   if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
 943       ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
 944                                                         &MacroEnd)) ||
 945        (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
 946                                                          &MacroEnd)))) {
 947     Range.setBegin(MacroBegin);
 948     Range.setEnd(MacroEnd);
 949     // Use the *original* `End`, not the expanded one in `MacroEnd`.
 950     if (Range.isTokenRange())
 951       Range.setTokenRange(isInExpansionTokenRange(End, SM));
 952     return makeRangeFromFileLocs(Range, SM, LangOpts);
 953   }
 954
 955   bool Invalid = false;
 956   const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
 957                                                         &Invalid);
 958   if (Invalid)
 959     return {};
 960
 961   if (BeginEntry.getExpansion().isMacroArgExpansion()) {
 962     const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
 963                                                         &Invalid);
 964     if (Invalid)
 965       return {};
 966
 967     if (EndEntry.getExpansion().isMacroArgExpansion() &&
 968         BeginEntry.getExpansion().getExpansionLocStart() ==
 969             EndEntry.getExpansion().getExpansionLocStart()) {
 970       Range.setBegin(SM.getImmediateSpellingLoc(Begin));
 971       Range.setEnd(SM.getImmediateSpellingLoc(End));
 972       return makeFileCharRange(Range, SM, LangOpts);
 973     }
 974   }
 975
 976   return {};
 977 }
 978
 979 StringRef Lexer::getSourceText(CharSourceRange Range,
 980                                const SourceManager &SM,
 981                                const LangOptions &LangOpts,
 982                                bool *Invalid) {
 983   Range = makeFileCharRange(Range, SM, LangOpts);
 984   if (Range.isInvalid()) {
 985     if (Invalid) *Invalid = true;
 986     return {};
 987   }
 988
 989   // Break down the source location.
 990   std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
 991   if (beginInfo.first.isInvalid()) {
 992     if (Invalid) *Invalid = true;
 993     return {};
 994   }
 995
 996   unsigned EndOffs;
 997   if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
 998       beginInfo.second > EndOffs) {
 999     if (Invalid) *Invalid = true;
1000     return {};
1001   }
1002
1003   // Try to the load the file buffer.
1004   bool invalidTemp = false;
1005   StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
1006   if (invalidTemp) {
1007     if (Invalid) *Invalid = true;
1008     return {};
1009   }
1010
1011   if (Invalid) *Invalid = false;
1012   return file.substr(beginInfo.second, EndOffs - beginInfo.second);
1013 }
1014
1015 StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
1016                                        const SourceManager &SM,
1017                                        const LangOptions &LangOpts) {
1018   assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1019
1020   // Find the location of the immediate macro expansion.
1021   while (true) {
1022     FileID FID = SM.getFileID(Loc);
1023     const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
1024     const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
1025     Loc = Expansion.getExpansionLocStart();
1026     if (!Expansion.isMacroArgExpansion())
1027       break;
1028
1029     // For macro arguments we need to check that the argument did not come
1030     // from an inner macro, e.g: "MAC1( MAC2(foo) )"
1031
1032     // Loc points to the argument id of the macro definition, move to the
1033     // macro expansion.
1034     Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1035     SourceLocation SpellLoc = Expansion.getSpellingLoc();
1036     if (SpellLoc.isFileID())
1037       break; // No inner macro.
1038
1039     // If spelling location resides in the same FileID as macro expansion
1040     // location, it means there is no inner macro.
1041     FileID MacroFID = SM.getFileID(Loc);
1042     if (SM.isInFileID(SpellLoc, MacroFID))
1043       break;
1044
1045     // Argument came from inner macro.
1046     Loc = SpellLoc;
1047   }
1048
1049   // Find the spelling location of the start of the non-argument expansion
1050   // range. This is where the macro name was spelled in order to begin
1051   // expanding this macro.
1052   Loc = SM.getSpellingLoc(Loc);
1053
1054   // Dig out the buffer where the macro name was spelled and the extents of the
1055   // name so that we can render it into the expansion note.
1056   std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1057   unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1058   StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1059   return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1060 }
1061
1062 StringRef Lexer::getImmediateMacroNameForDiagnostics(
1063     SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1064   assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1065   // Walk past macro argument expansions.
1066   while (SM.isMacroArgExpansion(Loc))
1067     Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1068
1069   // If the macro's spelling isn't FileID or from scratch space, then it's
1070   // actually a token paste or stringization (or similar) and not a macro at
1071   // all.
1072   SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
1073   if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
1074     return {};
1075
1076   // Find the spelling location of the start of the non-argument expansion
1077   // range. This is where the macro name was spelled in order to begin
1078   // expanding this macro.
1079   Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1080
1081   // Dig out the buffer where the macro name was spelled and the extents of the
1082   // name so that we can render it into the expansion note.
1083   std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1084   unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1085   StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1086   return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1087 }
1088
1089 bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
1090   return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
1091 }
1092
1093 bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1094   assert(isVerticalWhitespace(Str[0]));
1095   if (Str - 1 < BufferStart)
1096     return false;
1097
1098   if ((Str[0] == '\n' && Str[-1] == '\r') ||
1099       (Str[0] == '\r' && Str[-1] == '\n')) {
1100     if (Str - 2 < BufferStart)
1101       return false;
1102     --Str;
1103   }
1104   --Str;
1105
1106   // Rewind to first non-space character:
1107   while (Str > BufferStart && isHorizontalWhitespace(*Str))
1108     --Str;
1109
1110   return *Str == '\\';
1111 }
1112
1113 StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1114                                        const SourceManager &SM) {
1115   if (Loc.isInvalid() || Loc.isMacroID())
1116     return {};
1117   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1118   if (LocInfo.first.isInvalid())
1119     return {};
1120   bool Invalid = false;
1121   StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1122   if (Invalid)
1123     return {};
1124   const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1125   if (!Line)
1126     return {};
1127   StringRef Rest = Buffer.substr(Line - Buffer.data());
1128   size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1129   return NumWhitespaceChars == StringRef::npos
1130              ? ""
1131              : Rest.take_front(NumWhitespaceChars);
1132 }
1133
1134 //===----------------------------------------------------------------------===//
1135 // Diagnostics forwarding code.
1136 //===----------------------------------------------------------------------===//
1137
1138 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1139 /// lexer buffer was all expanded at a single point, perform the mapping.
1140 /// This is currently only used for _Pragma implementation, so it is the slow
1141 /// path of the hot getSourceLocation method.  Do not allow it to be inlined.
1142 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1143     Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1144 static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1145                                         SourceLocation FileLoc,
1146                                         unsigned CharNo, unsigned TokLen) {
1147   assert(FileLoc.isMacroID() && "Must be a macro expansion");
1148
1149   // Otherwise, we're lexing "mapped tokens".  This is used for things like
1150   // _Pragma handling.  Combine the expansion location of FileLoc with the
1151   // spelling location.
1152   SourceManager &SM = PP.getSourceManager();
1153
1154   // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1155   // characters come from spelling(FileLoc)+Offset.
1156   SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1157   SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1158
1159   // Figure out the expansion loc range, which is the range covered by the
1160   // original _Pragma(...) sequence.
1161   CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1162
1163   return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1164 }
1165
1166 /// getSourceLocation - Return a source location identifier for the specified
1167 /// offset in the current file.
1168 SourceLocation Lexer::getSourceLocation(const char *Loc,
1169                                         unsigned TokLen) const {
1170   assert(Loc >= BufferStart && Loc <= BufferEnd &&
1171          "Location out of range for this buffer!");
1172
1173   // In the normal case, we're just lexing from a simple file buffer, return
1174   // the file id from FileLoc with the offset specified.
1175   unsigned CharNo = Loc-BufferStart;
1176   if (FileLoc.isFileID())
1177     return FileLoc.getLocWithOffset(CharNo);
1178
1179   // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1180   // tokens are lexed from where the _Pragma was defined.
1181   assert(PP && "This doesn't work on raw lexers");
1182   return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1183 }
1184
1185 /// Diag - Forwarding function for diagnostics.  This translate a source
1186 /// position in the current buffer into a SourceLocation object for rendering.
1187 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1188   return PP->Diag(getSourceLocation(Loc), DiagID);
1189 }
1190
1191 //===----------------------------------------------------------------------===//
1192 // Trigraph and Escaped Newline Handling Code.
1193 //===----------------------------------------------------------------------===//
1194
/// GetTrigraphCharForLetter - Given the character that follows a ?? pair,
/// return the character the complete trigraph stands for, or '\0' if the
/// three characters do not form a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  struct TrigraphMapping {
    char Suffix;  // Third character of the trigraph.
    char Decoded; // Character the trigraph denotes.
  };
  static constexpr TrigraphMapping Mappings[] = {
      {'=', '#'},  {')', ']'}, {'(', '['}, {'!', '|'}, {'\'', '^'},
      {'>', '}'},  {'/', '\\'}, {'<', '{'}, {'-', '~'},
  };

  for (const TrigraphMapping &Mapping : Mappings)
    if (Mapping.Suffix == Letter)
      return Mapping.Decoded;
  return 0;
}
1211
1212 /// DecodeTrigraphChar - If the specified character is a legal trigraph when
1213 /// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
1214 /// return the result character.  Finally, emit a warning about trigraph use
1215 /// whether trigraphs are enabled or not.
1216 static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
1217   char Res = GetTrigraphCharForLetter(*CP);
1218   if (!Res)
1219     return Res;
1220
1221   if (!Trigraphs) {
1222     if (L && !L->isLexingRawMode())
1223       L->Diag(CP-2, diag::trigraph_ignored);
1224     return 0;
1225   }
1226
1227   if (L && !L->isLexingRawMode())
1228     L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1229   return Res;
1230 }
1231
1232 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
1233 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1234 /// trigraph equivalent on entry to this function.
1235 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1236   unsigned Size = 0;
1237   while (isWhitespace(Ptr[Size])) {
1238     ++Size;
1239
1240     if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1241       continue;
1242
1243     // If this is a \r\n or \n\r, skip the other half.
1244     if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1245         Ptr[Size-1] != Ptr[Size])
1246       ++Size;
1247
1248     return Size;
1249   }
1250
1251   // Not an escaped newline, must be a \t or something else.
1252   return 0;
1253 }
1254
1255 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1256 /// them), skip over them and return the first non-escaped-newline found,
1257 /// otherwise return P.
1258 const char *Lexer::SkipEscapedNewLines(const char *P) {
1259   while (true) {
1260     const char *AfterEscape;
1261     if (*P == '\\') {
1262       AfterEscape = P+1;
1263     } else if (*P == '?') {
1264       // If not a trigraph for escape, bail out.
1265       if (P[1] != '?' || P[2] != '/')
1266         return P;
1267       // FIXME: Take LangOpts into account; the language might not
1268       // support trigraphs.
1269       AfterEscape = P+3;
1270     } else {
1271       return P;
1272     }
1273
1274     unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1275     if (NewLineSize == 0) return P;
1276     P = AfterEscape+NewLineSize;
1277   }
1278 }
1279
1280 std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
1281                                           const SourceManager &SM,
1282                                           const LangOptions &LangOpts) {
1283   if (Loc.isMacroID()) {
1284     if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1285       return std::nullopt;
1286   }
1287   Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1288
1289   // Break down the source location.
1290   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1291
1292   // Try to load the file buffer.
1293   bool InvalidTemp = false;
1294   StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1295   if (InvalidTemp)
1296     return std::nullopt;
1297
1298   const char *TokenBegin = File.data() + LocInfo.second;
1299
1300   // Lex from the start of the given location.
1301   Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1302                                       TokenBegin, File.end());
1303   // Find the token.
1304   Token Tok;
1305   lexer.LexFromRawLexer(Tok);
1306   return Tok;
1307 }
1308
1309 /// Checks that the given token is the first token that occurs after the
1310 /// given location (this excludes comments and whitespace). Returns the location
1311 /// immediately after the specified token. If the token is not found or the
1312 /// location is inside a macro, the returned source location will be invalid.
1313 SourceLocation Lexer::findLocationAfterToken(
1314     SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1315     const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1316   std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1317   if (!Tok || Tok->isNot(TKind))
1318     return {};
1319   SourceLocation TokenLoc = Tok->getLocation();
1320
1321   // Calculate how much whitespace needs to be skipped if any.
1322   unsigned NumWhitespaceChars = 0;
1323   if (SkipTrailingWhitespaceAndNewLine) {
1324     const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1325     unsigned char C = *TokenEnd;
1326     while (isHorizontalWhitespace(C)) {
1327       C = *(++TokenEnd);
1328       NumWhitespaceChars++;
1329     }
1330
1331     // Skip \r, \n, \r\n, or \n\r
1332     if (C == '\n' || C == '\r') {
1333       char PrevC = C;
1334       C = *(++TokenEnd);
1335       NumWhitespaceChars++;
1336       if ((C == '\n' || C == '\r') && C != PrevC)
1337         NumWhitespaceChars++;
1338     }
1339   }
1340
1341   return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1342 }
1343
1344 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1345 /// get its size, and return it.  This is tricky in several cases:
1346 ///   1. If currently at the start of a trigraph, we warn about the trigraph,
1347 ///      then either return the trigraph (skipping 3 chars) or the '?',
1348 ///      depending on whether trigraphs are enabled or not.
1349 ///   2. If this is an escaped newline (potentially with whitespace between
1350 ///      the backslash and newline), implicitly skip the newline and return
1351 ///      the char after it.
1352 ///
1353 /// This handles the slow/uncommon case of the getCharAndSize method.  Here we
1354 /// know that we can accumulate into Size, and that we have already incremented
1355 /// Ptr by Size bytes.
1356 ///
1357 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1358 /// be updated to match.
1359 Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
1360   unsigned Size = 0;
1361   // If we have a slash, look for an escaped newline.
1362   if (Ptr[0] == '\\') {
1363     ++Size;
1364     ++Ptr;
1365 Slash:
1366     // Common case, backslash-char where the char is not whitespace.
1367     if (!isWhitespace(Ptr[0]))
1368       return {'\\', Size};
1369
1370     // See if we have optional whitespace characters between the slash and
1371     // newline.
1372     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1373       // Remember that this token needs to be cleaned.
1374       if (Tok) Tok->setFlag(Token::NeedsCleaning);
1375
1376       // Warn if there was whitespace between the backslash and newline.
1377       if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1378         Diag(Ptr, diag::backslash_newline_space);
1379
1380       // Found backslash<whitespace><newline>.  Parse the char after it.
1381       Size += EscapedNewLineSize;
1382       Ptr  += EscapedNewLineSize;
1383
1384       // Use slow version to accumulate a correct size field.
1385       auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
1386       CharAndSize.Size += Size;
1387       return CharAndSize;
1388     }
1389
1390     // Otherwise, this is not an escaped newline, just return the slash.
1391     return {'\\', Size};
1392   }
1393
1394   // If this is a trigraph, process it.
1395   if (Ptr[0] == '?' && Ptr[1] == '?') {
1396     // If this is actually a legal trigraph (not something like "??x"), emit
1397     // a trigraph warning.  If so, and if trigraphs are enabled, return it.
1398     if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
1399                                     LangOpts.Trigraphs)) {
1400       // Remember that this token needs to be cleaned.
1401       if (Tok) Tok->setFlag(Token::NeedsCleaning);
1402
1403       Ptr += 3;
1404       Size += 3;
1405       if (C == '\\') goto Slash;
1406       return {C, Size};
1407     }
1408   }
1409
1410   // If this is neither, return a single character.
1411   return {*Ptr, Size + 1u};
1412 }
1413
1414 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1415 /// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
1416 /// and that we have already incremented Ptr by Size bytes.
1417 ///
1418 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1419 /// be updated to match.
1420 Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
1421                                                  const LangOptions &LangOpts) {
1422
1423   unsigned Size = 0;
1424   // If we have a slash, look for an escaped newline.
1425   if (Ptr[0] == '\\') {
1426     ++Size;
1427     ++Ptr;
1428 Slash:
1429     // Common case, backslash-char where the char is not whitespace.
1430     if (!isWhitespace(Ptr[0]))
1431       return {'\\', Size};
1432
1433     // See if we have optional whitespace characters followed by a newline.
1434     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1435       // Found backslash<whitespace><newline>.  Parse the char after it.
1436       Size += EscapedNewLineSize;
1437       Ptr  += EscapedNewLineSize;
1438
1439       // Use slow version to accumulate a correct size field.
1440       auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
1441       CharAndSize.Size += Size;
1442       return CharAndSize;
1443     }
1444
1445     // Otherwise, this is not an escaped newline, just return the slash.
1446     return {'\\', Size};
1447   }
1448
1449   // If this is a trigraph, process it.
1450   if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1451     // If this is actually a legal trigraph (not something like "??x"), return
1452     // it.
1453     if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1454       Ptr += 3;
1455       Size += 3;
1456       if (C == '\\') goto Slash;
1457       return {C, Size};
1458     }
1459   }
1460
1461   // If this is neither, return a single character.
1462   return {*Ptr, Size + 1u};
1463 }
1464
1465 //===----------------------------------------------------------------------===//
1466 // Helper methods for lexing.
1467 //===----------------------------------------------------------------------===//
1468
1469 /// Routine that indiscriminately sets the offset into the source file.
1470 void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1471   BufferPtr = BufferStart + Offset;
1472   if (BufferPtr > BufferEnd)
1473     BufferPtr = BufferEnd;
1474   // FIXME: What exactly does the StartOfLine bit mean?  There are two
1475   // possible meanings for the "start" of the line: the first token on the
1476   // unexpanded line, or the first token on the expanded line.
1477   IsAtStartOfLine = StartOfLine;
1478   IsAtPhysicalStartOfLine = StartOfLine;
1479 }
1480
1481 static bool isUnicodeWhitespace(uint32_t Codepoint) {
1482   static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1483       UnicodeWhitespaceCharRanges);
1484   return UnicodeWhitespaceChars.contains(Codepoint);
1485 }
1486
1487 static llvm::SmallString<5> codepointAsHexString(uint32_t C) {
1488   llvm::SmallString<5> CharBuf;
1489   llvm::raw_svector_ostream CharOS(CharBuf);
1490   llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1491   return CharBuf;
1492 }
1493
1494 // To mitigate https://github.com/llvm/llvm-project/issues/54732,
1495 // we allow "Mathematical Notation Characters" in identifiers.
1496 // This is a proposed profile that extends the XID_Start/XID_continue
1497 // with mathematical symbols, superscipts and subscripts digits
1498 // found in some production software.
1499 // https://www.unicode.org/L2/L2022/22230-math-profile.pdf
1500 static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
1501                                       bool IsStart, bool &IsExtension) {
1502   static const llvm::sys::UnicodeCharSet MathStartChars(
1503       MathematicalNotationProfileIDStartRanges);
1504   static const llvm::sys::UnicodeCharSet MathContinueChars(
1505       MathematicalNotationProfileIDContinueRanges);
1506   if (MathStartChars.contains(C) ||
1507       (!IsStart && MathContinueChars.contains(C))) {
1508     IsExtension = true;
1509     return true;
1510   }
1511   return false;
1512 }
1513
1514 static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
1515                             bool &IsExtension) {
1516   if (LangOpts.AsmPreprocessor) {
1517     return false;
1518   } else if (LangOpts.DollarIdents && '$' == C) {
1519     return true;
1520   } else if (LangOpts.CPlusPlus || LangOpts.C23) {
1521     // A non-leading codepoint must have the XID_Continue property.
1522     // XIDContinueRanges doesn't contains characters also in XIDStartRanges,
1523     // so we need to check both tables.
1524     // '_' doesn't have the XID_Continue property but is allowed in C and C++.
1525     static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1526     static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1527     if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
1528       return true;
1529     return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
1530                                      IsExtension);
1531   } else if (LangOpts.C11) {
1532     static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1533         C11AllowedIDCharRanges);
1534     return C11AllowedIDChars.contains(C);
1535   } else {
1536     static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1537         C99AllowedIDCharRanges);
1538     return C99AllowedIDChars.contains(C);
1539   }
1540 }
1541
1542 static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
1543                                      bool &IsExtension) {
1544   assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1545   IsExtension = false;
1546   if (LangOpts.AsmPreprocessor) {
1547     return false;
1548   }
1549   if (LangOpts.CPlusPlus || LangOpts.C23) {
1550     static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1551     if (XIDStartChars.contains(C))
1552       return true;
1553     return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
1554                                      IsExtension);
1555   }
1556   if (!isAllowedIDChar(C, LangOpts, IsExtension))
1557     return false;
1558   if (LangOpts.C11) {
1559     static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1560         C11DisallowedInitialIDCharRanges);
1561     return !C11DisallowedInitialIDChars.contains(C);
1562   }
1563   static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1564       C99DisallowedInitialIDCharRanges);
1565   return !C99DisallowedInitialIDChars.contains(C);
1566 }
1567
1568 static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C,
1569                                           CharSourceRange Range) {
1570
1571   static const llvm::sys::UnicodeCharSet MathStartChars(
1572       MathematicalNotationProfileIDStartRanges);
1573   static const llvm::sys::UnicodeCharSet MathContinueChars(
1574       MathematicalNotationProfileIDContinueRanges);
1575
1576   (void)MathStartChars;
1577   (void)MathContinueChars;
1578   assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
1579          "Unexpected mathematical notation codepoint");
1580   Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
1581       << codepointAsHexString(C) << Range;
1582 }
1583
1584 static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1585                                             const char *End) {
1586   return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1587                                        L.getSourceLocation(End));
1588 }
1589
1590 static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1591                                       CharSourceRange Range, bool IsFirst) {
1592   // Check C99 compatibility.
1593   if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1594     enum {
1595       CannotAppearInIdentifier = 0,
1596       CannotStartIdentifier
1597     };
1598
1599     static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1600         C99AllowedIDCharRanges);
1601     static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1602         C99DisallowedInitialIDCharRanges);
1603     if (!C99AllowedIDChars.contains(C)) {
1604       Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1605         << Range
1606         << CannotAppearInIdentifier;
1607     } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1608       Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1609         << Range
1610         << CannotStartIdentifier;
1611     }
1612   }
1613 }
1614
1615 /// After encountering UTF-8 character C and interpreting it as an identifier
1616 /// character, check whether it's a homoglyph for a common non-identifier
1617 /// source character that is unlikely to be an intentional identifier
1618 /// character and warn if so.
1619 static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
1620                                        CharSourceRange Range) {
1621   // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1622   struct HomoglyphPair {
1623     uint32_t Character;
1624     char LooksLike;
1625     bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1626   };
1627   static constexpr HomoglyphPair SortedHomoglyphs[] = {
1628     {U'\u00ad', 0},   // SOFT HYPHEN
1629     {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1630     {U'\u037e', ';'}, // GREEK QUESTION MARK
1631     {U'\u200b', 0},   // ZERO WIDTH SPACE
1632     {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
1633     {U'\u200d', 0},   // ZERO WIDTH JOINER
1634     {U'\u2060', 0},   // WORD JOINER
1635     {U'\u2061', 0},   // FUNCTION APPLICATION
1636     {U'\u2062', 0},   // INVISIBLE TIMES
1637     {U'\u2063', 0},   // INVISIBLE SEPARATOR
1638     {U'\u2064', 0},   // INVISIBLE PLUS
1639     {U'\u2212', '-'}, // MINUS SIGN
1640     {U'\u2215', '/'}, // DIVISION SLASH
1641     {U'\u2216', '\\'}, // SET MINUS
1642     {U'\u2217', '*'}, // ASTERISK OPERATOR
1643     {U'\u2223', '|'}, // DIVIDES
1644     {U'\u2227', '^'}, // LOGICAL AND
1645     {U'\u2236', ':'}, // RATIO
1646     {U'\u223c', '~'}, // TILDE OPERATOR
1647     {U'\ua789', ':'}, // MODIFIER LETTER COLON
1648     {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
1649     {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1650     {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1651     {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1652     {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1653     {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1654     {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1655     {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1656     {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1657     {U'\uff0b', '+'}, // FULLWIDTH ASTERISK
1658     {U'\uff0c', ','}, // FULLWIDTH COMMA
1659     {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1660     {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1661     {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1662     {U'\uff1a', ':'}, // FULLWIDTH COLON
1663     {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1664     {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1665     {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1666     {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1667     {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1668     {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1669     {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1670     {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1671     {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1672     {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1673     {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1674     {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1675     {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1676     {U'\uff5e', '~'}, // FULLWIDTH TILDE
1677     {0, 0}
1678   };
1679   auto Homoglyph =
1680       std::lower_bound(std::begin(SortedHomoglyphs),
1681                        std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1682   if (Homoglyph->Character == C) {
1683     if (Homoglyph->LooksLike) {
1684       const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1685       Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1686           << Range << codepointAsHexString(C) << LooksLikeStr;
1687     } else {
1688       Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1689           << Range << codepointAsHexString(C);
1690     }
1691   }
1692 }
1693
1694 static void diagnoseInvalidUnicodeCodepointInIdentifier(
1695     DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1696     CharSourceRange Range, bool IsFirst) {
1697   if (isASCII(CodePoint))
1698     return;
1699
1700   bool IsExtension;
1701   bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
1702   bool IsIDContinue =
1703       IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
1704
1705   if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1706     return;
1707
1708   bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1709
1710   if (!IsFirst || InvalidOnlyAtStart) {
1711     Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1712         << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
1713         << FixItHint::CreateRemoval(Range);
1714   } else {
1715     Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1716         << Range << codepointAsHexString(CodePoint)
1717         << FixItHint::CreateRemoval(Range);
1718   }
1719 }
1720
/// Try to consume a universal-character-name (UCN) as part of an identifier.
///
/// \param CurPtr In/out cursor. The UCN is read starting at CurPtr + Size; on
///        success CurPtr is moved to the end of the UCN.
/// \param Size Offset added to CurPtr to find where tryReadUCN starts reading.
/// \param Result Token being lexed; Token::HasUCN is set on it on success.
/// \returns false if tryReadUCN yields 0, or if the code point is not an
///          allowed identifier character and is ASCII or Unicode whitespace;
///          true otherwise (including the recovery case where a disallowed
///          code point is diagnosed but still consumed).
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0) {
    return false;
  }
  bool IsExtension = false;
  if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CurPtr, UCNPtr),
          /*IsFirst=*/false);

    // We got a Unicode codepoint that is neither a space nor a valid
    // identifier part.
    // Carry on as if the codepoint was valid for recovery purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
      diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
                                    makeCharRange(*this, CurPtr, UCNPtr));

    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);
  }

  Result.setFlag(Token::HasUCN);
  // When the UCN occupies exactly 6 bytes with 'u' as its second byte, or
  // exactly 10 bytes with 'U', jump straight to its end. Otherwise step
  // through it one character at a time with getAndAdvanceChar, passing Result
  // along.
  if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}
1761
/// Try to consume one UTF-8 encoded code point as part of an identifier.
///
/// \param CurPtr In/out cursor; on success it is set to the end of the decoded
///        UTF-8 sequence.
/// \param Result Token being lexed; forwarded to ConsumeChar.
/// \returns false if strict UTF-8 decoding fails, or if the code point is not
///          an allowed identifier character and is ASCII or Unicode
///          whitespace; true otherwise (a disallowed code point may be
///          diagnosed and is then consumed for recovery).
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
  llvm::UTF32 CodePoint;

  // If a UTF-8 codepoint appears immediately after an escaped new line,
  // CurPtr may point to the splicing \ on the preceding line,
  // so we need to skip it.
  unsigned FirstCodeUnitSize;
  getCharAndSize(CurPtr, FirstCodeUnitSize);
  const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
  const char *UnicodePtr = CharStart;

  // Decode a single sequence; UnicodePtr is advanced past it by the call.
  llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
      (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
      &CodePoint, llvm::strictConversion);
  if (ConvResult != llvm::conversionOK)
    return false;

  bool IsExtension = false;
  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
                       IsExtension)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;

    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
    // We got a Unicode codepoint that is neither a space nor a valid
    // identifier part. Carry on as if the codepoint was valid for recovery
    // purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
      diagnoseExtensionInIdentifier(
          PP->getDiagnostics(), CodePoint,
          makeCharRange(*this, CharStart, UnicodePtr));
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CharStart, UnicodePtr),
                              /*IsFirst=*/false);
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CharStart, UnicodePtr));
  }

  // Once we successfully parsed some UTF-8,
  // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
  // being lexed, and that warnings about trailing spaces are emitted.
  ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
  CurPtr = UnicodePtr;
  return true;
}
1812
/// Lex a token that begins with the code point \p C, which was spelled either
/// as a non-ASCII character or as a UCN.
///
/// \param Result Token to fill in.
/// \param C Code point at the start of the token.
/// \param CurPtr Position just past that code point; [BufferPtr, CurPtr) is
///        used as its source range in diagnostics.
/// \returns The result of LexIdentifierContinue if \p C may start an
///          identifier. Otherwise false after diagnosing and skipping a stray
///          non-ASCII, non-whitespace character (BufferPtr is moved to
///          \p CurPtr and no token is formed), or true after forming a
///          tok::unknown token.
bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
                                      const char *CurPtr) {
  bool IsExtension = false;
  if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput()) {
      if (IsExtension)
        diagnoseExtensionInIdentifier(PP->getDiagnostics(), C,
                                      makeCharRange(*this, BufferPtr, CurPtr));
      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
                                makeCharRange(*this, BufferPtr, CurPtr),
                                /*IsFirst=*/true);
      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
                                 makeCharRange(*this, BufferPtr, CurPtr));
    }

    MIOpt.ReadToken();
    return LexIdentifierContinue(Result, CurPtr);
  }

  if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
      !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
      !isUnicodeWhitespace(C)) {
    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just drop the character.
    // Note that we can /only/ do this when the non-ASCII character is actually
    // spelled as Unicode, not written as a UCN. The standard requires that
    // we not throw away any possible preprocessor tokens, but there's a
    // loophole in the mapping of Unicode characters to basic character set
    // characters that allows us to map these particular characters to, say,
    // whitespace.
    diagnoseInvalidUnicodeCodepointInIdentifier(
        PP->getDiagnostics(), LangOpts, C,
        makeCharRange(*this, BufferPtr, CurPtr), /*IsFirst=*/true);
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, we have an explicit UCN or a character that's unlikely to show
  // up by accident.
  MIOpt.ReadToken();
  FormTokenWithChars(Result, CurPtr, tok::unknown);
  return true;
}
1858
1859 static const char *
1860 fastParseASCIIIdentifier(const char *CurPtr,
1861                          [[maybe_unused]] const char *BufferEnd) {
1862 #ifdef __SSE4_2__
1863   alignas(16) static constexpr char AsciiIdentifierRange[16] = {
1864       '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
1865   };
1866   constexpr ssize_t BytesPerRegister = 16;
1867
1868   __m128i AsciiIdentifierRangeV =
1869       _mm_load_si128((const __m128i *)AsciiIdentifierRange);
1870
1871   while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
1872     __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));
1873
1874     int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
1875                                 _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES |
1876                                     _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY);
1877     CurPtr += Consumed;
1878     if (Consumed == BytesPerRegister)
1879       continue;
1880     return CurPtr;
1881   }
1882 #endif
1883
1884   unsigned char C = *CurPtr;
1885   while (isAsciiIdentifierContinue(C))
1886     C = *++CurPtr;
1887   return CurPtr;
1888 }
1889
/// Lex the remainder of an identifier whose first character has already been
/// matched, and form a tok::raw_identifier token for it.
///
/// Outside raw mode the identifier is then looked up through the preprocessor,
/// turned into a code-completion token if the completion point sits at its
/// end, or handed to PP->HandleIdentifier when the identifier requires it.
///
/// \param Result Token to fill in.
/// \param CurPtr Position in the buffer from which to continue scanning.
/// \returns The result of PP->HandleIdentifier when that is called; true on
///          every other path.
bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched an identifier start.

  while (true) {

    // Fast path: skip a run of plain ASCII identifier characters.
    CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);

    unsigned Size;
    // Slow path: handle trigraph, unicode codepoints, UCNs.
    unsigned char C = getCharAndSize(CurPtr, Size);
    if (isAsciiIdentifierContinue(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents)
        break;
      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    // Both helpers advance CurPtr themselves when they succeed.
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      continue;
    if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
      continue;
    // Neither an expected Unicode codepoint nor a UCN.
    break;
  }

  // Remember the start before forming the token, so it can be stored as the
  // raw identifier data afterwards.
  const char *IdStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
  Result.setRawIdentifierData(IdStart);

  // If we are in raw mode, return this identifier raw.  There is no need to
  // look up identifier information or attempt to macro expand it.
  if (LexingRawMode)
    return true;

  // Fill in Result.IdentifierInfo and update the token kind,
  // looking up the identifier in the identifier table.
  const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
  // Note that we have to call PP->LookUpIdentifierInfo() even for code
  // completion, it writes IdentifierInfo into Result, and callers rely on it.

  // If the completion point is at the end of an identifier, we want to treat
  // the identifier as incomplete even if it resolves to a macro or a keyword.
  // This allows e.g. 'class^' to complete to 'classifier'.
  if (isCodeCompletionPoint(CurPtr)) {
    // Return the code-completion token.
    Result.setKind(tok::code_completion);
    // Skip the code-completion char and all immediate identifier characters.
    // This ensures we get consistent behavior when completing at any point in
    // an identifier (i.e. at the start, in the middle, at the end). Note that
    // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
    // simpler.
    assert(*CurPtr == 0 && "Completion character must be 0");
    ++CurPtr;
    // Note that code completion token is not added as a separate character
    // when the completion point is at the end of the buffer. Therefore, we need
    // to check if the buffer has ended.
    if (CurPtr < BufferEnd) {
      while (isAsciiIdentifierContinue(*CurPtr))
        ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // Finally, now that we know we have an identifier, pass this off to the
  // preprocessor, which may macro expand it or something.
  if (II->isHandleIdentifierCase())
    return PP->HandleIdentifier(Result);

  return true;
}
1968
1969 /// isHexaLiteral - Return true if Start points to a hex constant.
1970 /// in microsoft mode (where this is supposed to be several different tokens).
1971 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1972   auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
1973   char C1 = CharAndSize1.Char;
1974   if (C1 != '0')
1975     return false;
1976
1977   auto CharAndSize2 =
1978       Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
1979   char C2 = CharAndSize2.Char;
1980   return (C2 == 'x' || C2 == 'X');
1981 }
1982
/// LexNumericConstant - Lex the remainder of an integer or floating point
/// constant. \p CurPtr points at the next character to examine; the token
/// itself starts at BufferPtr.
///
/// The function calls itself to keep going after an exponent sign, a digit
/// separator, a UCN or a UTF-8 character. Once nothing more can be consumed
/// it forms a tok::numeric_constant token in \p Result and returns true.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  // Last character consumed by the loop below; stays 0 if none was.
  char PrevCh = 0;
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
    // not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      // Before C++17, an underscore anywhere in what has been lexed so far
      // rules out treating this as a hexfloat.
      else if (!LangOpts.CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
  if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
    // The quote only counts as a separator when an identifier-continue
    // character follows it; both characters are then consumed together.
    auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
    if (isAsciiIdentifierContinue(Next)) {
      if (!isLexingRawMode())
        Diag(CurPtr, LangOpts.CPlusPlus
                         ? diag::warn_cxx11_compat_digit_separator
                         : diag::warn_c23_compat_digit_separator);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
      return LexNumericConstant(Result, CurPtr);
    }
  }

  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
    return LexNumericConstant(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
  return true;
}
2047
2048 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
2049 /// in C++11, or warn on a ud-suffix in C++98.
2050 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
2051                                bool IsStringLiteral) {
2052   assert(LangOpts.CPlusPlus);
2053
2054   // Maximally munch an identifier.
2055   unsigned Size;
2056   char C = getCharAndSize(CurPtr, Size);
2057   bool Consumed = false;
2058
2059   if (!isAsciiIdentifierStart(C)) {
2060     if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2061       Consumed = true;
2062     else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2063       Consumed = true;
2064     else
2065       return CurPtr;
2066   }
2067
2068   if (!LangOpts.CPlusPlus11) {
2069     if (!isLexingRawMode())
2070       Diag(CurPtr,
2071            C == '_' ? diag::warn_cxx11_compat_user_defined_literal
2072                     : diag::warn_cxx11_compat_reserved_user_defined_literal)
2073         << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
2074     return CurPtr;
2075   }
2076
2077   // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
2078   // that does not start with an underscore is ill-formed. As a conforming
2079   // extension, we treat all such suffixes as if they had whitespace before
2080   // them. We assume a suffix beginning with a UCN or UTF-8 character is more
2081   // likely to be a ud-suffix than a macro, however, and accept that.
2082   if (!Consumed) {
2083     bool IsUDSuffix = false;
2084     if (C == '_')
2085       IsUDSuffix = true;
2086     else if (IsStringLiteral && LangOpts.CPlusPlus14) {
2087       // In C++1y, we need to look ahead a few characters to see if this is a
2088       // valid suffix for a string literal or a numeric literal (this could be
2089       // the 'operator""if' defining a numeric literal operator).
2090       const unsigned MaxStandardSuffixLength = 3;
2091       char Buffer[MaxStandardSuffixLength] = { C };
2092       unsigned Consumed = Size;
2093       unsigned Chars = 1;
2094       while (true) {
2095         auto [Next, NextSize] =
2096             getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
2097         if (!isAsciiIdentifierContinue(Next)) {
2098           // End of suffix. Check whether this is on the allowed list.
2099           const StringRef CompleteSuffix(Buffer, Chars);
2100           IsUDSuffix =
2101               StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
2102           break;
2103         }
2104
2105         if (Chars == MaxStandardSuffixLength)
2106           // Too long: can't be a standard suffix.
2107           break;
2108
2109         Buffer[Chars++] = Next;
2110         Consumed += NextSize;
2111       }
2112     }
2113
2114     if (!IsUDSuffix) {
2115       if (!isLexingRawMode())
2116         Diag(CurPtr, LangOpts.MSVCCompat
2117                          ? diag::ext_ms_reserved_user_defined_literal
2118                          : diag::ext_reserved_user_defined_literal)
2119             << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
2120       return CurPtr;
2121     }
2122
2123     CurPtr = ConsumeChar(CurPtr, Size, Result);
2124   }
2125
2126   Result.setFlag(Token::HasUDSuffix);
2127   while (true) {
2128     C = getCharAndSize(CurPtr, Size);
2129     if (isAsciiIdentifierContinue(C)) {
2130       CurPtr = ConsumeChar(CurPtr, Size, Result);
2131     } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2132     } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
2133     } else
2134       break;
2135   }
2136
2137   return CurPtr;
2138 }
2139
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".
///
/// \param Result Token to fill in.
/// \param CurPtr Position just after the opening quote.
/// \param Kind Token kind to give the literal when it is terminated properly.
/// \returns true on every path. An unterminated literal (newline or end of
///          file before the closing quote) and a literal cut short by the
///          code-completion point both produce a tok::unknown token instead
///          of a token of kind \p Kind.
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
                                       : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    // A nul that is not the end of the buffer: either the code-completion
    // point or a nul embedded in the literal.
    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      // Remember it so it can be diagnosed after the literal is complete.
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // In any C++ mode, let LexUDSuffix handle an optional ud-suffix (it
  // accepts one in C++11 and later, and only warns before that).
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2200
/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
///
/// \param Result Token to fill in.
/// \param CurPtr Position just after the opening quote, i.e. at the start of
///        the delimiter.
/// \param Kind Token kind to give the literal when it is well formed.
/// \returns true on every path. An invalid delimiter or an unterminated raw
///          string produces a tok::unknown token instead of a token of kind
///          \p Kind.
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  //  Between the initial and final double quote characters of the raw string,
  //  any transformations performed in phases 1 and 2 (trigraphs,
  //  universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  unsigned PrefixLen = 0;

  // Scan the delimiter, stopping after at most 16 characters.
  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        // Hit the end of the buffer: step back so the token ends there.
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // In any C++ mode, let LexUDSuffix handle an optional ud-suffix (it
  // accepts one in C++11 and later, and only warns before that).
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2280
2281 /// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2282 /// after having lexed the '<' character.  This is used for #include filenames.
2283 bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2284   // Does this string contain the \0 character?
2285   const char *NulCharacter = nullptr;
2286   const char *AfterLessPos = CurPtr;
2287   char C = getAndAdvanceChar(CurPtr, Result);
2288   while (C != '>') {
2289     // Skip escaped characters.  Escaped newlines will already be processed by
2290     // getAndAdvanceChar.
2291     if (C == '\\')
2292       C = getAndAdvanceChar(CurPtr, Result);
2293
2294     if (isVerticalWhitespace(C) ||               // Newline.
2295         (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
2296       // If the filename is unterminated, then it must just be a lone <
2297       // character.  Return this as such.
2298       FormTokenWithChars(Result, AfterLessPos, tok::less);
2299       return true;
2300     }
2301
2302     if (C == 0) {
2303       if (isCodeCompletionPoint(CurPtr - 1)) {
2304         codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2305         cutOffLexing();
2306         FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2307         return true;
2308       }
2309       NulCharacter = CurPtr-1;
2310     }
2311     C = getAndAdvanceChar(CurPtr, Result);
2312   }
2313
2314   // If a nul character existed in the string, warn about it.
2315   if (NulCharacter && !isLexingRawMode())
2316     Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2317
2318   // Update the location of token as well as BufferPtr.
2319   const char *TokStart = BufferPtr;
2320   FormTokenWithChars(Result, CurPtr, tok::header_name);
2321   Result.setLiteralData(TokStart);
2322   return true;
2323 }
2324
2325 void Lexer::codeCompleteIncludedFile(const char *PathStart,
2326                                      const char *CompletionPoint,
2327                                      bool IsAngled) {
2328   // Completion only applies to the filename, after the last slash.
2329   StringRef PartialPath(PathStart, CompletionPoint - PathStart);
2330   llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2331   auto Slash = PartialPath.find_last_of(SlashChars);
2332   StringRef Dir =
2333       (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
2334   const char *StartOfFilename =
2335       (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
2336   // Code completion filter range is the filename only, up to completion point.
2337   PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
2338       StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2339   // We should replace the characters up to the closing quote or closest slash,
2340   // if any.
2341   while (CompletionPoint < BufferEnd) {
2342     char Next = *(CompletionPoint + 1);
2343     if (Next == 0 || Next == '\r' || Next == '\n')
2344       break;
2345     ++CompletionPoint;
2346     if (Next == (IsAngled ? '>' : '"'))
2347       break;
2348     if (SlashChars.contains(Next))
2349       break;
2350   }
2351
2352   PP->setCodeCompletionTokenRange(
2353       FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2354       FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2355   PP->CodeCompleteIncludedFile(Dir, IsAngled);
2356 }
2357
2358 /// LexCharConstant - Lex the remainder of a character constant, after having
2359 /// lexed either ' or L' or u8' or u' or U'.
2360 bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2361                             tok::TokenKind Kind) {
2362   // Does this character contain the \0 character?
2363   const char *NulCharacter = nullptr;
2364
2365   if (!isLexingRawMode()) {
2366     if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
2367       Diag(BufferPtr, LangOpts.CPlusPlus
2368                           ? diag::warn_cxx98_compat_unicode_literal
2369                           : diag::warn_c99_compat_unicode_literal);
2370     else if (Kind == tok::utf8_char_constant)
2371       Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2372   }
2373
2374   char C = getAndAdvanceChar(CurPtr, Result);
2375   if (C == '\'') {
2376     if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2377       Diag(BufferPtr, diag::ext_empty_character);
2378     FormTokenWithChars(Result, CurPtr, tok::unknown);
2379     return true;
2380   }
2381
2382   while (C != '\'') {
2383     // Skip escaped characters.
2384     if (C == '\\')
2385       C = getAndAdvanceChar(CurPtr, Result);
2386
2387     if (C == '\n' || C == '\r' ||             // Newline.
2388         (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
2389       if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2390         Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2391       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2392       return true;
2393     }
2394
2395     if (C == 0) {
2396       if (isCodeCompletionPoint(CurPtr-1)) {
2397         PP->CodeCompleteNaturalLanguage();
2398         FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2399         cutOffLexing();
2400         return true;
2401       }
2402
2403       NulCharacter = CurPtr-1;
2404     }
2405     C = getAndAdvanceChar(CurPtr, Result);
2406   }
2407
2408   // If we are in C++11, lex the optional ud-suffix.
2409   if (LangOpts.CPlusPlus)
2410     CurPtr = LexUDSuffix(Result, CurPtr, false);
2411
2412   // If a nul character existed in the character, warn about it.
2413   if (NulCharacter && !isLexingRawMode())
2414     Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2415
2416   // Update the location of token as well as BufferPtr.
2417   const char *TokStart = BufferPtr;
2418   FormTokenWithChars(Result, CurPtr, Kind);
2419   Result.setLiteralData(TokStart);
2420   return true;
2421 }
2422
2423 /// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2424 /// Update BufferPtr to point to the next non-whitespace character and return.
2425 ///
2426 /// This method forms a token and returns true if KeepWhitespaceMode is enabled.
2427 bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2428                            bool &TokAtPhysicalStartOfLine) {
2429   // Whitespace - Skip it, then return the token after the whitespace.
2430   bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2431
2432   unsigned char Char = *CurPtr;
2433
2434   const char *lastNewLine = nullptr;
2435   auto setLastNewLine = [&](const char *Ptr) {
2436     lastNewLine = Ptr;
2437     if (!NewLinePtr)
2438       NewLinePtr = Ptr;
2439   };
2440   if (SawNewline)
2441     setLastNewLine(CurPtr - 1);
2442
2443   // Skip consecutive spaces efficiently.
2444   while (true) {
2445     // Skip horizontal whitespace very aggressively.
2446     while (isHorizontalWhitespace(Char))
2447       Char = *++CurPtr;
2448
2449     // Otherwise if we have something other than whitespace, we're done.
2450     if (!isVerticalWhitespace(Char))
2451       break;
2452
2453     if (ParsingPreprocessorDirective) {
2454       // End of preprocessor directive line, let LexTokenInternal handle this.
2455       BufferPtr = CurPtr;
2456       return false;
2457     }
2458
2459     // OK, but handle newline.
2460     if (*CurPtr == '\n')
2461       setLastNewLine(CurPtr);
2462     SawNewline = true;
2463     Char = *++CurPtr;
2464   }
2465
2466   // If the client wants us to return whitespace, return it now.
2467   if (isKeepWhitespaceMode()) {
2468     FormTokenWithChars(Result, CurPtr, tok::unknown);
2469     if (SawNewline) {
2470       IsAtStartOfLine = true;
2471       IsAtPhysicalStartOfLine = true;
2472     }
2473     // FIXME: The next token will not have LeadingSpace set.
2474     return true;
2475   }
2476
2477   // If this isn't immediately after a newline, there is leading space.
2478   char PrevChar = CurPtr[-1];
2479   bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2480
2481   Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2482   if (SawNewline) {
2483     Result.setFlag(Token::StartOfLine);
2484     TokAtPhysicalStartOfLine = true;
2485
2486     if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2487       if (auto *Handler = PP->getEmptylineHandler())
2488         Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2489                                              getSourceLocation(lastNewLine)));
2490     }
2491   }
2492
2493   BufferPtr = CurPtr;
2494   return false;
2495 }
2496
2497 /// We have just read the // characters from input.  Skip until we find the
2498 /// newline character that terminates the comment.  Then update BufferPtr and
2499 /// return.
2500 ///
2501 /// If we're in KeepCommentMode or any CommentHandler has inserted
2502 /// some tokens, this will store the first token and return true.
2503 bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2504                             bool &TokAtPhysicalStartOfLine) {
2505   // If Line comments aren't explicitly enabled for this language, emit an
2506   // extension warning.
2507   if (!LineComment) {
2508     if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
2509       Diag(BufferPtr, diag::ext_line_comment);
2510
2511     // Mark them enabled so we only emit one warning for this translation
2512     // unit.
2513     LineComment = true;
2514   }
2515
2516   // Scan over the body of the comment.  The common case, when scanning, is that
2517   // the comment contains normal ascii characters with nothing interesting in
2518   // them.  As such, optimize for this case with the inner loop.
2519   //
2520   // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2521   // character that ends the line comment.
2522
2523   // C++23 [lex.phases] p1
2524   // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2525   // diagnostic only once per entire ill-formed subsequence to avoid
2526   // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2527   bool UnicodeDecodingAlreadyDiagnosed = false;
2528
2529   char C;
2530   while (true) {
2531     C = *CurPtr;
2532     // Skip over characters in the fast loop.
2533     while (isASCII(C) && C != 0 &&   // Potentially EOF.
2534            C != '\n' && C != '\r') { // Newline or DOS-style newline.
2535       C = *++CurPtr;
2536       UnicodeDecodingAlreadyDiagnosed = false;
2537     }
2538
2539     if (!isASCII(C)) {
2540       unsigned Length = llvm::getUTF8SequenceSize(
2541           (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
2542       if (Length == 0) {
2543         if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2544           Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
2545         UnicodeDecodingAlreadyDiagnosed = true;
2546         ++CurPtr;
2547       } else {
2548         UnicodeDecodingAlreadyDiagnosed = false;
2549         CurPtr += Length;
2550       }
2551       continue;
2552     }
2553
2554     const char *NextLine = CurPtr;
2555     if (C != 0) {
2556       // We found a newline, see if it's escaped.
2557       const char *EscapePtr = CurPtr-1;
2558       bool HasSpace = false;
2559       while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2560         --EscapePtr;
2561         HasSpace = true;
2562       }
2563
2564       if (*EscapePtr == '\\')
2565         // Escaped newline.
2566         CurPtr = EscapePtr;
2567       else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2568                EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2569         // Trigraph-escaped newline.
2570         CurPtr = EscapePtr-2;
2571       else
2572         break; // This is a newline, we're done.
2573
2574       // If there was space between the backslash and newline, warn about it.
2575       if (HasSpace && !isLexingRawMode())
2576         Diag(EscapePtr, diag::backslash_newline_space);
2577     }
2578
2579     // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
2580     // properly decode the character.  Read it in raw mode to avoid emitting
2581     // diagnostics about things like trigraphs.  If we see an escaped newline,
2582     // we'll handle it below.
2583     const char *OldPtr = CurPtr;
2584     bool OldRawMode = isLexingRawMode();
2585     LexingRawMode = true;
2586     C = getAndAdvanceChar(CurPtr, Result);
2587     LexingRawMode = OldRawMode;
2588
2589     // If we only read only one character, then no special handling is needed.
2590     // We're done and can skip forward to the newline.
2591     if (C != 0 && CurPtr == OldPtr+1) {
2592       CurPtr = NextLine;
2593       break;
2594     }
2595
2596     // If we read multiple characters, and one of those characters was a \r or
2597     // \n, then we had an escaped newline within the comment.  Emit diagnostic
2598     // unless the next line is also a // comment.
2599     if (CurPtr != OldPtr + 1 && C != '/' &&
2600         (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2601       for (; OldPtr != CurPtr; ++OldPtr)
2602         if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2603           // Okay, we found a // comment that ends in a newline, if the next
2604           // line is also a // comment, but has spaces, don't emit a diagnostic.
2605           if (isWhitespace(C)) {
2606             const char *ForwardPtr = CurPtr;
2607             while (isWhitespace(*ForwardPtr))  // Skip whitespace.
2608               ++ForwardPtr;
2609             if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2610               break;
2611           }
2612
2613           if (!isLexingRawMode())
2614             Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2615           break;
2616         }
2617     }
2618
2619     if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2620       --CurPtr;
2621       break;
2622     }
2623
2624     if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2625       PP->CodeCompleteNaturalLanguage();
2626       cutOffLexing();
2627       return false;
2628     }
2629   }
2630
2631   // Found but did not consume the newline.  Notify comment handlers about the
2632   // comment unless we're in a #if 0 block.
2633   if (PP && !isLexingRawMode() &&
2634       PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2635                                             getSourceLocation(CurPtr)))) {
2636     BufferPtr = CurPtr;
2637     return true; // A token has to be returned.
2638   }
2639
2640   // If we are returning comments as tokens, return this comment as a token.
2641   if (inKeepCommentMode())
2642     return SaveLineComment(Result, CurPtr);
2643
2644   // If we are inside a preprocessor directive and we see the end of line,
2645   // return immediately, so that the lexer can return this as an EOD token.
2646   if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2647     BufferPtr = CurPtr;
2648     return false;
2649   }
2650
2651   // Otherwise, eat the \n character.  We don't care if this is a \n\r or
2652   // \r\n sequence.  This is an efficiency hack (because we know the \n can't
2653   // contribute to another token), it isn't needed for correctness.  Note that
2654   // this is ok even in KeepWhitespaceMode, because we would have returned the
2655   // comment above in that mode.
2656   NewLinePtr = CurPtr++;
2657
2658   // The next returned token is at the start of the line.
2659   Result.setFlag(Token::StartOfLine);
2660   TokAtPhysicalStartOfLine = true;
2661   // No leading whitespace seen so far.
2662   Result.clearFlag(Token::LeadingSpace);
2663   BufferPtr = CurPtr;
2664   return false;
2665 }
2666
2667 /// If in save-comment mode, package up this Line comment in an appropriate
2668 /// way and return it.
2669 bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2670   // If we're not in a preprocessor directive, just return the // comment
2671   // directly.
2672   FormTokenWithChars(Result, CurPtr, tok::comment);
2673
2674   if (!ParsingPreprocessorDirective || LexingRawMode)
2675     return true;
2676
2677   // If this Line-style comment is in a macro definition, transmogrify it into
2678   // a C-style block comment.
2679   bool Invalid = false;
2680   std::string Spelling = PP->getSpelling(Result, &Invalid);
2681   if (Invalid)
2682     return true;
2683
2684   assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2685   Spelling[1] = '*';   // Change prefix to "/*".
2686   Spelling += "*/";    // add suffix.
2687
2688   Result.setKind(tok::comment);
2689   PP->CreateString(Spelling, Result,
2690                    Result.getLocation(), Result.getLocation());
2691   return true;
2692 }
2693
2694 /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2695 /// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
2696 /// a diagnostic if so.  We know that the newline is inside of a block comment.
2697 static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
2698                                                   bool Trigraphs) {
2699   assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2700
2701   // Position of the first trigraph in the ending sequence.
2702   const char *TrigraphPos = nullptr;
2703   // Position of the first whitespace after a '\' in the ending sequence.
2704   const char *SpacePos = nullptr;
2705
2706   while (true) {
2707     // Back up off the newline.
2708     --CurPtr;
2709
2710     // If this is a two-character newline sequence, skip the other character.
2711     if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2712       // \n\n or \r\r -> not escaped newline.
2713       if (CurPtr[0] == CurPtr[1])
2714         return false;
2715       // \n\r or \r\n -> skip the newline.
2716       --CurPtr;
2717     }
2718
2719     // If we have horizontal whitespace, skip over it.  We allow whitespace
2720     // between the slash and newline.
2721     while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2722       SpacePos = CurPtr;
2723       --CurPtr;
2724     }
2725
2726     // If we have a slash, this is an escaped newline.
2727     if (*CurPtr == '\\') {
2728       --CurPtr;
2729     } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
2730       // This is a trigraph encoding of a slash.
2731       TrigraphPos = CurPtr - 2;
2732       CurPtr -= 3;
2733     } else {
2734       return false;
2735     }
2736
2737     // If the character preceding the escaped newline is a '*', then after line
2738     // splicing we have a '*/' ending the comment.
2739     if (*CurPtr == '*')
2740       break;
2741
2742     if (*CurPtr != '\n' && *CurPtr != '\r')
2743       return false;
2744   }
2745
2746   if (TrigraphPos) {
2747     // If no trigraphs are enabled, warn that we ignored this trigraph and
2748     // ignore this * character.
2749     if (!Trigraphs) {
2750       if (!L->isLexingRawMode())
2751         L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
2752       return false;
2753     }
2754     if (!L->isLexingRawMode())
2755       L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
2756   }
2757
2758   // Warn about having an escaped newline between the */ characters.
2759   if (!L->isLexingRawMode())
2760     L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);
2761
2762   // If there was space between the backslash and newline, warn about it.
2763   if (SpacePos && !L->isLexingRawMode())
2764     L->Diag(SpacePos, diag::backslash_newline_space);
2765
2766   return true;
2767 }
2768
2769 #ifdef __SSE2__
2770 #include <emmintrin.h>
2771 #elif __ALTIVEC__
2772 #include <altivec.h>
2773 #undef bool
2774 #endif
2775
2776 /// We have just read from input the / and * characters that started a comment.
2777 /// Read until we find the * and / characters that terminate the comment.
2778 /// Note that we don't bother decoding trigraphs or escaped newlines in block
2779 /// comments, because they cannot cause the comment to end.  The only thing
2780 /// that can happen is the comment could end with an escaped newline between
2781 /// the terminating * and /.
2782 ///
2783 /// If we're in KeepCommentMode or any CommentHandler has inserted
2784 /// some tokens, this will store the first token and return true.
2785 bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2786                              bool &TokAtPhysicalStartOfLine) {
2787   // Scan one character past where we should, looking for a '/' character.  Once
2788   // we find it, check to see if it was preceded by a *.  This common
2789   // optimization helps people who like to put a lot of * characters in their
2790   // comments.
2791
2792   // The first character we get with newlines and trigraphs skipped to handle
2793   // the degenerate /*/ case below correctly if the * has an escaped newline
2794   // after it.
2795   unsigned CharSize;
2796   unsigned char C = getCharAndSize(CurPtr, CharSize);
2797   CurPtr += CharSize;
2798   if (C == 0 && CurPtr == BufferEnd+1) {
2799     if (!isLexingRawMode())
2800       Diag(BufferPtr, diag::err_unterminated_block_comment);
2801     --CurPtr;
2802
2803     // KeepWhitespaceMode should return this broken comment as a token.  Since
2804     // it isn't a well formed comment, just return it as an 'unknown' token.
2805     if (isKeepWhitespaceMode()) {
2806       FormTokenWithChars(Result, CurPtr, tok::unknown);
2807       return true;
2808     }
2809
2810     BufferPtr = CurPtr;
2811     return false;
2812   }
2813
2814   // Check to see if the first character after the '/*' is another /.  If so,
2815   // then this slash does not end the block comment, it is part of it.
2816   if (C == '/')
2817     C = *CurPtr++;
2818
2819   // C++23 [lex.phases] p1
2820   // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2821   // diagnostic only once per entire ill-formed subsequence to avoid
2822   // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2823   bool UnicodeDecodingAlreadyDiagnosed = false;
2824
2825   while (true) {
2826     // Skip over all non-interesting characters until we find end of buffer or a
2827     // (probably ending) '/' character.
2828     if (CurPtr + 24 < BufferEnd &&
2829         // If there is a code-completion point avoid the fast scan because it
2830         // doesn't check for '\0'.
2831         !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2832       // While not aligned to a 16-byte boundary.
2833       while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
2834         if (!isASCII(C))
2835           goto MultiByteUTF8;
2836         C = *CurPtr++;
2837       }
2838       if (C == '/') goto FoundSlash;
2839
2840 #ifdef __SSE2__
2841       __m128i Slashes = _mm_set1_epi8('/');
2842       while (CurPtr + 16 < BufferEnd) {
2843         int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
2844         if (LLVM_UNLIKELY(Mask != 0)) {
2845           goto MultiByteUTF8;
2846         }
2847         // look for slashes
2848         int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2849                                     Slashes));
2850         if (cmp != 0) {
2851           // Adjust the pointer to point directly after the first slash. It's
2852           // not necessary to set C here, it will be overwritten at the end of
2853           // the outer loop.
2854           CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
2855           goto FoundSlash;
2856         }
2857         CurPtr += 16;
2858       }
2859 #elif __ALTIVEC__
2860       __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2861                                         0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2862                                         0x80, 0x80, 0x80, 0x80};
2863       __vector unsigned char Slashes = {
2864         '/', '/', '/', '/',  '/', '/', '/', '/',
2865         '/', '/', '/', '/',  '/', '/', '/', '/'
2866       };
2867       while (CurPtr + 16 < BufferEnd) {
2868         if (LLVM_UNLIKELY(
2869                 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
2870           goto MultiByteUTF8;
2871         if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
2872           break;
2873         }
2874         CurPtr += 16;
2875       }
2876
2877 #else
2878       while (CurPtr + 16 < BufferEnd) {
2879         bool HasNonASCII = false;
2880         for (unsigned I = 0; I < 16; ++I)
2881           HasNonASCII |= !isASCII(CurPtr[I]);
2882
2883         if (LLVM_UNLIKELY(HasNonASCII))
2884           goto MultiByteUTF8;
2885
2886         bool HasSlash = false;
2887         for (unsigned I = 0; I < 16; ++I)
2888           HasSlash |= CurPtr[I] == '/';
2889         if (HasSlash)
2890           break;
2891         CurPtr += 16;
2892       }
2893 #endif
2894
2895       // It has to be one of the bytes scanned, increment to it and read one.
2896       C = *CurPtr++;
2897     }
2898
2899     // Loop to scan the remainder, warning on invalid UTF-8
2900     // if the corresponding warning is enabled, emitting a diagnostic only once
2901     // per sequence that cannot be decoded.
2902     while (C != '/' && C != '\0') {
2903       if (isASCII(C)) {
2904         UnicodeDecodingAlreadyDiagnosed = false;
2905         C = *CurPtr++;
2906         continue;
2907       }
2908     MultiByteUTF8:
2909       // CurPtr is 1 code unit past C, so to decode
2910       // the codepoint, we need to read from the previous position.
2911       unsigned Length = llvm::getUTF8SequenceSize(
2912           (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
2913       if (Length == 0) {
2914         if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2915           Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
2916         UnicodeDecodingAlreadyDiagnosed = true;
2917       } else {
2918         UnicodeDecodingAlreadyDiagnosed = false;
2919         CurPtr += Length - 1;
2920       }
2921       C = *CurPtr++;
2922     }
2923
2924     if (C == '/') {
2925   FoundSlash:
2926       if (CurPtr[-2] == '*')  // We found the final */.  We're done!
2927         break;
2928
2929       if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2930         if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
2931                                                   LangOpts.Trigraphs)) {
2932           // We found the final */, though it had an escaped newline between the
2933           // * and /.  We're done!
2934           break;
2935         }
2936       }
2937       if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2938         // If this is a /* inside of the comment, emit a warning.  Don't do this
2939         // if this is a /*/, which will end the comment.  This misses cases with
2940         // embedded escaped newlines, but oh well.
2941         if (!isLexingRawMode())
2942           Diag(CurPtr-1, diag::warn_nested_block_comment);
2943       }
2944     } else if (C == 0 && CurPtr == BufferEnd+1) {
2945       if (!isLexingRawMode())
2946         Diag(BufferPtr, diag::err_unterminated_block_comment);
2947       // Note: the user probably forgot a */.  We could continue immediately
2948       // after the /*, but this would involve lexing a lot of what really is the
2949       // comment, which surely would confuse the parser.
2950       --CurPtr;
2951
2952       // KeepWhitespaceMode should return this broken comment as a token.  Since
2953       // it isn't a well formed comment, just return it as an 'unknown' token.
2954       if (isKeepWhitespaceMode()) {
2955         FormTokenWithChars(Result, CurPtr, tok::unknown);
2956         return true;
2957       }
2958
2959       BufferPtr = CurPtr;
2960       return false;
2961     } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2962       PP->CodeCompleteNaturalLanguage();
2963       cutOffLexing();
2964       return false;
2965     }
2966
2967     C = *CurPtr++;
2968   }
2969
2970   // Notify comment handlers about the comment unless we're in a #if 0 block.
2971   if (PP && !isLexingRawMode() &&
2972       PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2973                                             getSourceLocation(CurPtr)))) {
2974     BufferPtr = CurPtr;
2975     return true; // A token has to be returned.
2976   }
2977
2978   // If we are returning comments as tokens, return this comment as a token.
2979   if (inKeepCommentMode()) {
2980     FormTokenWithChars(Result, CurPtr, tok::comment);
2981     return true;
2982   }
2983
2984   // It is common for the tokens immediately after a /**/ comment to be
2985   // whitespace.  Instead of going through the big switch, handle it
2986   // efficiently now.  This is safe even in KeepWhitespaceMode because we would
2987   // have already returned above with the comment as a token.
2988   if (isHorizontalWhitespace(*CurPtr)) {
2989     SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2990     return false;
2991   }
2992
2993   // Otherwise, just return so that the next character will be lexed as a token.
2994   BufferPtr = CurPtr;
2995   Result.setFlag(Token::LeadingSpace);
2996   return false;
2997 }
2998
2999 //===----------------------------------------------------------------------===//
3000 // Primary Lexing Entry Points
3001 //===----------------------------------------------------------------------===//
3002
3003 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
3004 /// uninterpreted string.  This switches the lexer out of directive mode.
3005 void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
3006   assert(ParsingPreprocessorDirective && ParsingFilename == false &&
3007          "Must be in a preprocessing directive!");
3008   Token Tmp;
3009   Tmp.startToken();
3010
3011   // CurPtr - Cache BufferPtr in an automatic variable.
3012   const char *CurPtr = BufferPtr;
3013   while (true) {
3014     char Char = getAndAdvanceChar(CurPtr, Tmp);
3015     switch (Char) {
3016     default:
3017       if (Result)
3018         Result->push_back(Char);
3019       break;
3020     case 0:  // Null.
3021       // Found end of file?
3022       if (CurPtr-1 != BufferEnd) {
3023         if (isCodeCompletionPoint(CurPtr-1)) {
3024           PP->CodeCompleteNaturalLanguage();
3025           cutOffLexing();
3026           return;
3027         }
3028
3029         // Nope, normal character, continue.
3030         if (Result)
3031           Result->push_back(Char);
3032         break;
3033       }
3034       // FALL THROUGH.
3035       [[fallthrough]];
3036     case '\r':
3037     case '\n':
3038       // Okay, we found the end of the line. First, back up past the \0, \r, \n.
3039       assert(CurPtr[-1] == Char && "Trigraphs for newline?");
3040       BufferPtr = CurPtr-1;
3041
3042       // Next, lex the character, which should handle the EOD transition.
3043       Lex(Tmp);
3044       if (Tmp.is(tok::code_completion)) {
3045         if (PP)
3046           PP->CodeCompleteNaturalLanguage();
3047         Lex(Tmp);
3048       }
3049       assert(Tmp.is(tok::eod) && "Unexpected token!");
3050
3051       // Finally, we're done;
3052       return;
3053     }
3054   }
3055 }
3056
3057 /// LexEndOfFile - CurPtr points to the end of this file.  Handle this
3058 /// condition, reporting diagnostics and handling other edge cases as required.
3059 /// This returns true if Result contains a token, false if PP.Lex should be
3060 /// called again.
3061 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
3062   // If we hit the end of the file while parsing a preprocessor directive,
3063   // end the preprocessor directive first.  The next token returned will
3064   // then be the end of file.
3065   if (ParsingPreprocessorDirective) {
3066     // Done parsing the "line".
3067     ParsingPreprocessorDirective = false;
3068     // Update the location of token as well as BufferPtr.
3069     FormTokenWithChars(Result, CurPtr, tok::eod);
3070
3071     // Restore comment saving mode, in case it was disabled for directive.
3072     if (PP)
3073       resetExtendedTokenMode();
3074     return true;  // Have a token.
3075   }
3076
3077   // If we are in raw mode, return this event as an EOF token.  Let the caller
3078   // that put us in raw mode handle the event.
3079   if (isLexingRawMode()) {
3080     Result.startToken();
3081     BufferPtr = BufferEnd;
3082     FormTokenWithChars(Result, BufferEnd, tok::eof);
3083     return true;
3084   }
3085
3086   if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
3087     PP->setRecordedPreambleConditionalStack(ConditionalStack);
3088     // If the preamble cuts off the end of a header guard, consider it guarded.
3089     // The guard is valid for the preamble content itself, and for tools the
3090     // most useful answer is "yes, this file has a header guard".
3091     if (!ConditionalStack.empty())
3092       MIOpt.ExitTopLevelConditional();
3093     ConditionalStack.clear();
3094   }
3095
3096   // Issue diagnostics for unterminated #if and missing newline.
3097
3098   // If we are in a #if directive, emit an error.
3099   while (!ConditionalStack.empty()) {
3100     if (PP->getCodeCompletionFileLoc() != FileLoc)
3101       PP->Diag(ConditionalStack.back().IfLoc,
3102                diag::err_pp_unterminated_conditional);
3103     ConditionalStack.pop_back();
3104   }
3105
3106   // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
3107   // a pedwarn.
3108   if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
3109     DiagnosticsEngine &Diags = PP->getDiagnostics();
3110     SourceLocation EndLoc = getSourceLocation(BufferEnd);
3111     unsigned DiagID;
3112
3113     if (LangOpts.CPlusPlus11) {
3114       // C++11 [lex.phases] 2.2 p2
3115       // Prefer the C++98 pedantic compatibility warning over the generic,
3116       // non-extension, user-requested "missing newline at EOF" warning.
3117       if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
3118         DiagID = diag::warn_cxx98_compat_no_newline_eof;
3119       } else {
3120         DiagID = diag::warn_no_newline_eof;
3121       }
3122     } else {
3123       DiagID = diag::ext_no_newline_eof;
3124     }
3125
3126     Diag(BufferEnd, DiagID)
3127       << FixItHint::CreateInsertion(EndLoc, "\n");
3128   }
3129
3130   BufferPtr = CurPtr;
3131
3132   // Finally, let the preprocessor handle this.
3133   return PP->HandleEndOfFile(Result, isPragmaLexer());
3134 }
3135
3136 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3137 /// the specified lexer will return a tok::l_paren token, 0 if it is something
3138 /// else and 2 if there are no more tokens in the buffer controlled by the
3139 /// lexer.
3140 unsigned Lexer::isNextPPTokenLParen() {
3141   assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
3142
3143   if (isDependencyDirectivesLexer()) {
3144     if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3145       return 2;
3146     return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
3147         tok::l_paren);
3148   }
3149
3150   // Switch to 'skipping' mode.  This will ensure that we can lex a token
3151   // without emitting diagnostics, disables macro expansion, and will cause EOF
3152   // to return an EOF token instead of popping the include stack.
3153   LexingRawMode = true;
3154
3155   // Save state that can be changed while lexing so that we can restore it.
3156   const char *TmpBufferPtr = BufferPtr;
3157   bool inPPDirectiveMode = ParsingPreprocessorDirective;
3158   bool atStartOfLine = IsAtStartOfLine;
3159   bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3160   bool leadingSpace = HasLeadingSpace;
3161
3162   Token Tok;
3163   Lex(Tok);
3164
3165   // Restore state that may have changed.
3166   BufferPtr = TmpBufferPtr;
3167   ParsingPreprocessorDirective = inPPDirectiveMode;
3168   HasLeadingSpace = leadingSpace;
3169   IsAtStartOfLine = atStartOfLine;
3170   IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3171
3172   // Restore the lexer back to non-skipping mode.
3173   LexingRawMode = false;
3174
3175   if (Tok.is(tok::eof))
3176     return 2;
3177   return Tok.is(tok::l_paren);
3178 }
3179
3180 /// Find the end of a version control conflict marker.
3181 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3182                                    ConflictMarkerKind CMK) {
3183   const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3184   size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
3185   auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3186   size_t Pos = RestOfBuffer.find(Terminator);
3187   while (Pos != StringRef::npos) {
3188     // Must occur at start of line.
3189     if (Pos == 0 ||
3190         (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3191       RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3192       Pos = RestOfBuffer.find(Terminator);
3193       continue;
3194     }
3195     return RestOfBuffer.data()+Pos;
3196   }
3197   return nullptr;
3198 }
3199
3200 /// IsStartOfConflictMarker - If the specified pointer is the start of a version
3201 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3202 /// and recover nicely.  This returns true if it is a conflict marker and false
3203 /// if not.
3204 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3205   // Only a conflict marker if it starts at the beginning of a line.
3206   if (CurPtr != BufferStart &&
3207       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3208     return false;
3209
3210   // Check to see if we have <<<<<<< or >>>>.
3211   if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
3212       !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
3213     return false;
3214
3215   // If we have a situation where we don't care about conflict markers, ignore
3216   // it.
3217   if (CurrentConflictMarkerState || isLexingRawMode())
3218     return false;
3219
3220   ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
3221
3222   // Check to see if there is an ending marker somewhere in the buffer at the
3223   // start of a line to terminate this conflict marker.
3224   if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
3225     // We found a match.  We are really in a conflict marker.
3226     // Diagnose this, and ignore to the end of line.
3227     Diag(CurPtr, diag::err_conflict_marker);
3228     CurrentConflictMarkerState = Kind;
3229
3230     // Skip ahead to the end of line.  We know this exists because the
3231     // end-of-conflict marker starts with \r or \n.
3232     while (*CurPtr != '\r' && *CurPtr != '\n') {
3233       assert(CurPtr != BufferEnd && "Didn't find end of line");
3234       ++CurPtr;
3235     }
3236     BufferPtr = CurPtr;
3237     return true;
3238   }
3239
3240   // No end of conflict marker found.
3241   return false;
3242 }
3243
3244 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3245 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3246 /// is the end of a conflict marker.  Handle it by ignoring up until the end of
3247 /// the line.  This returns true if it is a conflict marker and false if not.
3248 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3249   // Only a conflict marker if it starts at the beginning of a line.
3250   if (CurPtr != BufferStart &&
3251       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3252     return false;
3253
3254   // If we have a situation where we don't care about conflict markers, ignore
3255   // it.
3256   if (!CurrentConflictMarkerState || isLexingRawMode())
3257     return false;
3258
3259   // Check to see if we have the marker (4 characters in a row).
3260   for (unsigned i = 1; i != 4; ++i)
3261     if (CurPtr[i] != CurPtr[0])
3262       return false;
3263
3264   // If we do have it, search for the end of the conflict marker.  This could
3265   // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
3266   // be the end of conflict marker.
3267   if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3268                                         CurrentConflictMarkerState)) {
3269     CurPtr = End;
3270
3271     // Skip ahead to the end of line.
3272     while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3273       ++CurPtr;
3274
3275     BufferPtr = CurPtr;
3276
3277     // No longer in the conflict marker.
3278     CurrentConflictMarkerState = CMK_None;
3279     return true;
3280   }
3281
3282   return false;
3283 }
3284
/// Locate the "#>" sequence that closes an editor placeholder.
///
/// Candidate positions for the '#' are [\p CurPtr, \p BufferEnd - 1), so the
/// character following a candidate is always before \p BufferEnd.
///
/// \returns a pointer just past the first "#>" found, or nullptr if the range
/// is empty or contains no "#>".
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;

  // The last character before BufferEnd can only ever be the '>' of a match,
  // never its '#'.
  const char *const Limit = BufferEnd - 1;
  while (const void *Hit =
             std::memchr(CurPtr, '#', static_cast<size_t>(Limit - CurPtr))) {
    const char *Hash = static_cast<const char *>(Hit);
    if (Hash[1] == '>')
      return Hash + 2;
    // A lone '#': keep looking right after it.
    CurPtr = Hash + 1;
  }
  return nullptr;
}
3296
3297 bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3298   assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
3299   if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
3300     return false;
3301   const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
3302   if (!End)
3303     return false;
3304   const char *Start = CurPtr - 1;
3305   if (!LangOpts.AllowEditorPlaceholders)
3306     Diag(Start, diag::err_placeholder_in_source);
3307   Result.startToken();
3308   FormTokenWithChars(Result, End, tok::raw_identifier);
3309   Result.setRawIdentifierData(Start);
3310   PP->LookUpIdentifierInfo(Result);
3311   Result.setFlag(Token::IsEditorPlaceholder);
3312   BufferPtr = End;
3313   return true;
3314 }
3315
3316 bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3317   if (PP && PP->isCodeCompletionEnabled()) {
3318     SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3319     return Loc == PP->getCodeCompletionLoc();
3320   }
3321
3322   return false;
3323 }
3324
3325 std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
3326                                                  const char *SlashLoc,
3327                                                  Token *Result) {
3328   unsigned CharSize;
3329   char Kind = getCharAndSize(StartPtr, CharSize);
3330   assert((Kind == 'u' || Kind == 'U') && "expected a UCN");
3331
3332   unsigned NumHexDigits;
3333   if (Kind == 'u')
3334     NumHexDigits = 4;
3335   else if (Kind == 'U')
3336     NumHexDigits = 8;
3337
3338   bool Delimited = false;
3339   bool FoundEndDelimiter = false;
3340   unsigned Count = 0;
3341   bool Diagnose = Result && !isLexingRawMode();
3342
3343   if (!LangOpts.CPlusPlus && !LangOpts.C99) {
3344     if (Diagnose)
3345       Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3346     return std::nullopt;
3347   }
3348
3349   const char *CurPtr = StartPtr + CharSize;
3350   const char *KindLoc = &CurPtr[-1];
3351
3352   uint32_t CodePoint = 0;
3353   while (Count != NumHexDigits || Delimited) {
3354     char C = getCharAndSize(CurPtr, CharSize);
3355     if (!Delimited && Count == 0 && C == '{') {
3356       Delimited = true;
3357       CurPtr += CharSize;
3358       continue;
3359     }
3360
3361     if (Delimited && C == '}') {
3362       CurPtr += CharSize;
3363       FoundEndDelimiter = true;
3364       break;
3365     }
3366
3367     unsigned Value = llvm::hexDigitValue(C);
3368     if (Value == -1U) {
3369       if (!Delimited)
3370         break;
3371       if (Diagnose)
3372         Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
3373             << StringRef(KindLoc, 1);
3374       return std::nullopt;
3375     }
3376
3377     if (CodePoint & 0xF000'0000) {
3378       if (Diagnose)
3379         Diag(KindLoc, diag::err_escape_too_large) << 0;
3380       return std::nullopt;
3381     }
3382
3383     CodePoint <<= 4;
3384     CodePoint |= Value;
3385     CurPtr += CharSize;
3386     Count++;
3387   }
3388
3389   if (Count == 0) {
3390     if (Diagnose)
3391       Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3392                                        : diag::warn_ucn_escape_no_digits)
3393           << StringRef(KindLoc, 1);
3394     return std::nullopt;
3395   }
3396
3397   if (Delimited && Kind == 'U') {
3398     if (Diagnose)
3399       Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
3400     return std::nullopt;
3401   }
3402
3403   if (!Delimited && Count != NumHexDigits) {
3404     if (Diagnose) {
3405       Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3406       // If the user wrote \U1234, suggest a fixit to \u.
3407       if (Count == 4 && NumHexDigits == 8) {
3408         CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
3409         Diag(KindLoc, diag::note_ucn_four_not_eight)
3410             << FixItHint::CreateReplacement(URange, "u");
3411       }
3412     }
3413     return std::nullopt;
3414   }
3415
3416   if (Delimited && PP) {
3417     Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
3418                        ? diag::warn_cxx23_delimited_escape_sequence
3419                        : diag::ext_delimited_escape_sequence)
3420         << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3421   }
3422
3423   if (Result) {
3424     Result->setFlag(Token::HasUCN);
3425     // If the UCN contains either a trigraph or a line splicing,
3426     // we need to call getAndAdvanceChar again to set the appropriate flags
3427     // on Result.
3428     if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
3429       StartPtr = CurPtr;
3430     else
3431       while (StartPtr != CurPtr)
3432         (void)getAndAdvanceChar(StartPtr, *Result);
3433   } else {
3434     StartPtr = CurPtr;
3435   }
3436   return CodePoint;
3437 }
3438
3439 std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3440                                                const char *SlashLoc,
3441                                                Token *Result) {
3442   unsigned CharSize;
3443   bool Diagnose = Result && !isLexingRawMode();
3444
3445   char C = getCharAndSize(StartPtr, CharSize);
3446   assert(C == 'N' && "expected \\N{...}");
3447
3448   const char *CurPtr = StartPtr + CharSize;
3449   const char *KindLoc = &CurPtr[-1];
3450
3451   C = getCharAndSize(CurPtr, CharSize);
3452   if (C != '{') {
3453     if (Diagnose)
3454       Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3455     return std::nullopt;
3456   }
3457   CurPtr += CharSize;
3458   const char *StartName = CurPtr;
3459   bool FoundEndDelimiter = false;
3460   llvm::SmallVector<char, 30> Buffer;
3461   while (C) {
3462     C = getCharAndSize(CurPtr, CharSize);
3463     CurPtr += CharSize;
3464     if (C == '}') {
3465       FoundEndDelimiter = true;
3466       break;
3467     }
3468
3469     if (isVerticalWhitespace(C))
3470       break;
3471     Buffer.push_back(C);
3472   }
3473
3474   if (!FoundEndDelimiter || Buffer.empty()) {
3475     if (Diagnose)
3476       Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3477                                        : diag::warn_delimited_ucn_incomplete)
3478           << StringRef(KindLoc, 1);
3479     return std::nullopt;
3480   }
3481
3482   StringRef Name(Buffer.data(), Buffer.size());
3483   std::optional<char32_t> Match =
3484       llvm::sys::unicode::nameToCodepointStrict(Name);
3485   std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3486   if (!Match) {
3487     LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3488     if (Diagnose) {
3489       Diag(StartName, diag::err_invalid_ucn_name)
3490           << StringRef(Buffer.data(), Buffer.size())
3491           << makeCharRange(*this, StartName, CurPtr - CharSize);
3492       if (LooseMatch) {
3493         Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
3494             << FixItHint::CreateReplacement(
3495                    makeCharRange(*this, StartName, CurPtr - CharSize),
3496                    LooseMatch->Name);
3497       }
3498     }
3499     // We do not offer misspelled character names suggestions here
3500     // as the set of what would be a valid suggestion depends on context,
3501     // and we should not make invalid suggestions.
3502   }
3503
3504   if (Diagnose && Match)
3505     Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
3506                        ? diag::warn_cxx23_delimited_escape_sequence
3507                        : diag::ext_delimited_escape_sequence)
3508         << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3509
3510   // If no diagnostic has been emitted yet, likely because we are doing a
3511   // tentative lexing, we do not want to recover here to make sure the token
3512   // will not be incorrectly considered valid. This function will be called
3513   // again and a diagnostic emitted then.
3514   if (LooseMatch && Diagnose)
3515     Match = LooseMatch->CodePoint;
3516
3517   if (Result) {
3518     Result->setFlag(Token::HasUCN);
3519     // If the UCN contains either a trigraph or a line splicing,
3520     // we need to call getAndAdvanceChar again to set the appropriate flags
3521     // on Result.
3522     if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
3523       StartPtr = CurPtr;
3524     else
3525       while (StartPtr != CurPtr)
3526         (void)getAndAdvanceChar(StartPtr, *Result);
3527   } else {
3528     StartPtr = CurPtr;
3529   }
3530   return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
3531 }
3532
3533 uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3534                            Token *Result) {
3535
3536   unsigned CharSize;
3537   std::optional<uint32_t> CodePointOpt;
3538   char Kind = getCharAndSize(StartPtr, CharSize);
3539   if (Kind == 'u' || Kind == 'U')
3540     CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3541   else if (Kind == 'N')
3542     CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3543
3544   if (!CodePointOpt)
3545     return 0;
3546
3547   uint32_t CodePoint = *CodePointOpt;
3548
3549   // Don't apply C family restrictions to UCNs in assembly mode
3550   if (LangOpts.AsmPreprocessor)
3551     return CodePoint;
3552
3553   // C23 6.4.3p2: A universal character name shall not designate a code point
3554   // where the hexadecimal value is:
3555   // - in the range D800 through DFFF inclusive; or
3556   // - greater than 10FFFF.
3557   // A universal-character-name outside the c-char-sequence of a character
3558   // constant, or the s-char-sequence of a string-literal shall not designate
3559   // a control character or a character in the basic character set.
3560
3561   // C++11 [lex.charset]p2: If the hexadecimal value for a
3562   //   universal-character-name corresponds to a surrogate code point (in the
3563   //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3564   //   if the hexadecimal value for a universal-character-name outside the
3565   //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3566   //   string literal corresponds to a control character (in either of the
3567   //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3568   //   basic source character set, the program is ill-formed.
3569   if (CodePoint < 0xA0) {
3570     // We don't use isLexingRawMode() here because we need to warn about bad
3571     // UCNs even when skipping preprocessing tokens in a #if block.
3572     if (Result && PP) {
3573       if (CodePoint < 0x20 || CodePoint >= 0x7F)
3574         Diag(BufferPtr, diag::err_ucn_control_character);
3575       else {
3576         char C = static_cast<char>(CodePoint);
3577         Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3578       }
3579     }
3580
3581     return 0;
3582   } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3583     // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3584     // We don't use isLexingRawMode() here because we need to diagnose bad
3585     // UCNs even when skipping preprocessing tokens in a #if block.
3586     if (Result && PP) {
3587       if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3588         Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3589       else
3590         Diag(BufferPtr, diag::err_ucn_escape_invalid);
3591     }
3592     return 0;
3593   }
3594
3595   return CodePoint;
3596 }
3597
3598 bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3599                                    const char *CurPtr) {
3600   if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3601       isUnicodeWhitespace(C)) {
3602     Diag(BufferPtr, diag::ext_unicode_whitespace)
3603       << makeCharRange(*this, BufferPtr, CurPtr);
3604
3605     Result.setFlag(Token::LeadingSpace);
3606     return true;
3607   }
3608   return false;
3609 }
3610
3611 void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3612   IsAtStartOfLine = Result.isAtStartOfLine();
3613   HasLeadingSpace = Result.hasLeadingSpace();
3614   HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3615   // Note that this doesn't affect IsAtPhysicalStartOfLine.
3616 }
3617
3618 bool Lexer::Lex(Token &Result) {
3619   assert(!isDependencyDirectivesLexer());
3620
3621   // Start a new token.
3622   Result.startToken();
3623
3624   // Set up misc whitespace flags for LexTokenInternal.
3625   if (IsAtStartOfLine) {
3626     Result.setFlag(Token::StartOfLine);
3627     IsAtStartOfLine = false;
3628   }
3629
3630   if (HasLeadingSpace) {
3631     Result.setFlag(Token::LeadingSpace);
3632     HasLeadingSpace = false;
3633   }
3634
3635   if (HasLeadingEmptyMacro) {
3636     Result.setFlag(Token::LeadingEmptyMacro);
3637     HasLeadingEmptyMacro = false;
3638   }
3639
3640   bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3641   IsAtPhysicalStartOfLine = false;
3642   bool isRawLex = isLexingRawMode();
3643   (void) isRawLex;
3644   bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3645   // (After the LexTokenInternal call, the lexer might be destroyed.)
3646   assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3647   return returnedToken;
3648 }
3649
3650 /// LexTokenInternal - This implements a simple C family lexer.  It is an
3651 /// extremely performance critical piece of code.  This assumes that the buffer
3652 /// has a null character at the end of the file.  This returns a preprocessing
3653 /// token, not a normal token, as such, it is an internal interface.  It assumes
3654 /// that the Flags of result have been cleared before calling this.
3655 bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3656 LexStart:
3657   assert(!Result.needsCleaning() && "Result needs cleaning");
3658   assert(!Result.hasPtrData() && "Result has not been reset");
3659
3660   // CurPtr - Cache BufferPtr in an automatic variable.
3661   const char *CurPtr = BufferPtr;
3662
3663   // Small amounts of horizontal whitespace is very common between tokens.
3664   if (isHorizontalWhitespace(*CurPtr)) {
3665     do {
3666       ++CurPtr;
3667     } while (isHorizontalWhitespace(*CurPtr));
3668
3669     // If we are keeping whitespace and other tokens, just return what we just
3670     // skipped.  The next lexer invocation will return the token after the
3671     // whitespace.
3672     if (isKeepWhitespaceMode()) {
3673       FormTokenWithChars(Result, CurPtr, tok::unknown);
3674       // FIXME: The next token will not have LeadingSpace set.
3675       return true;
3676     }
3677
3678     BufferPtr = CurPtr;
3679     Result.setFlag(Token::LeadingSpace);
3680   }
3681
3682   unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
3683
3684   // Read a character, advancing over it.
3685   char Char = getAndAdvanceChar(CurPtr, Result);
3686   tok::TokenKind Kind;
3687
3688   if (!isVerticalWhitespace(Char))
3689     NewLinePtr = nullptr;
3690
3691   switch (Char) {
3692   case 0:  // Null.
3693     // Found end of file?
3694     if (CurPtr-1 == BufferEnd)
3695       return LexEndOfFile(Result, CurPtr-1);
3696
3697     // Check if we are performing code completion.
3698     if (isCodeCompletionPoint(CurPtr-1)) {
3699       // Return the code-completion token.
3700       Result.startToken();
3701       FormTokenWithChars(Result, CurPtr, tok::code_completion);
3702       return true;
3703     }
3704
3705     if (!isLexingRawMode())
3706       Diag(CurPtr-1, diag::null_in_file);
3707     Result.setFlag(Token::LeadingSpace);
3708     if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3709       return true; // KeepWhitespaceMode
3710
3711     // We know the lexer hasn't changed, so just try again with this lexer.
3712     // (We manually eliminate the tail call to avoid recursion.)
3713     goto LexNextToken;
3714
3715   case 26:  // DOS & CP/M EOF: "^Z".
3716     // If we're in Microsoft extensions mode, treat this as end of file.
3717     if (LangOpts.MicrosoftExt) {
3718       if (!isLexingRawMode())
3719         Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3720       return LexEndOfFile(Result, CurPtr-1);
3721     }
3722
3723     // If Microsoft extensions are disabled, this is just random garbage.
3724     Kind = tok::unknown;
3725     break;
3726
3727   case '\r':
3728     if (CurPtr[0] == '\n')
3729       (void)getAndAdvanceChar(CurPtr, Result);
3730     [[fallthrough]];
3731   case '\n':
3732     // If we are inside a preprocessor directive and we see the end of line,
3733     // we know we are done with the directive, so return an EOD token.
3734     if (ParsingPreprocessorDirective) {
3735       // Done parsing the "line".
3736       ParsingPreprocessorDirective = false;
3737
3738       // Restore comment saving mode, in case it was disabled for directive.
3739       if (PP)
3740         resetExtendedTokenMode();
3741
3742       // Since we consumed a newline, we are back at the start of a line.
3743       IsAtStartOfLine = true;
3744       IsAtPhysicalStartOfLine = true;
3745       NewLinePtr = CurPtr - 1;
3746
3747       Kind = tok::eod;
3748       break;
3749     }
3750
3751     // No leading whitespace seen so far.
3752     Result.clearFlag(Token::LeadingSpace);
3753
3754     if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3755       return true; // KeepWhitespaceMode
3756
3757     // We only saw whitespace, so just try again with this lexer.
3758     // (We manually eliminate the tail call to avoid recursion.)
3759     goto LexNextToken;
3760   case ' ':
3761   case '\t':
3762   case '\f':
3763   case '\v':
3764   SkipHorizontalWhitespace:
3765     Result.setFlag(Token::LeadingSpace);
3766     if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3767       return true; // KeepWhitespaceMode
3768
3769   SkipIgnoredUnits:
3770     CurPtr = BufferPtr;
3771
3772     // If the next token is obviously a // or /* */ comment, skip it efficiently
3773     // too (without going through the big switch stmt).
3774     if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3775         LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3776       if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3777         return true; // There is a token to return.
3778       goto SkipIgnoredUnits;
3779     } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3780       if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3781         return true; // There is a token to return.
3782       goto SkipIgnoredUnits;
3783     } else if (isHorizontalWhitespace(*CurPtr)) {
3784       goto SkipHorizontalWhitespace;
3785     }
3786     // We only saw whitespace, so just try again with this lexer.
3787     // (We manually eliminate the tail call to avoid recursion.)
3788     goto LexNextToken;
3789
3790   // C99 6.4.4.1: Integer Constants.
3791   // C99 6.4.4.2: Floating Constants.
3792   case '0': case '1': case '2': case '3': case '4':
3793   case '5': case '6': case '7': case '8': case '9':
3794     // Notify MIOpt that we read a non-whitespace/non-comment token.
3795     MIOpt.ReadToken();
3796     return LexNumericConstant(Result, CurPtr);
3797
3798   // Identifier (e.g., uber), or
3799   // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
3800   // UTF-8 or UTF-16 string literal (C11/C++11).
3801   case 'u':
3802     // Notify MIOpt that we read a non-whitespace/non-comment token.
3803     MIOpt.ReadToken();
3804
3805     if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3806       Char = getCharAndSize(CurPtr, SizeTmp);
3807
3808       // UTF-16 string literal
3809       if (Char == '"')
3810         return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3811                                 tok::utf16_string_literal);
3812
3813       // UTF-16 character constant
3814       if (Char == '\'')
3815         return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3816                                tok::utf16_char_constant);
3817
3818       // UTF-16 raw string literal
3819       if (Char == 'R' && LangOpts.CPlusPlus11 &&
3820           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3821         return LexRawStringLiteral(Result,
3822                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3823                                            SizeTmp2, Result),
3824                                tok::utf16_string_literal);
3825
3826       if (Char == '8') {
3827         char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3828
3829         // UTF-8 string literal
3830         if (Char2 == '"')
3831           return LexStringLiteral(Result,
3832                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3833                                            SizeTmp2, Result),
3834                                tok::utf8_string_literal);
3835         if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
3836           return LexCharConstant(
3837               Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3838                                   SizeTmp2, Result),
3839               tok::utf8_char_constant);
3840
3841         if (Char2 == 'R' && LangOpts.CPlusPlus11) {
3842           unsigned SizeTmp3;
3843           char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3844           // UTF-8 raw string literal
3845           if (Char3 == '"') {
3846             return LexRawStringLiteral(Result,
3847                    ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3848                                            SizeTmp2, Result),
3849                                SizeTmp3, Result),
3850                    tok::utf8_string_literal);
3851           }
3852         }
3853       }
3854     }
3855
3856     // treat u like the start of an identifier.
3857     return LexIdentifierContinue(Result, CurPtr);
3858
3859   case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
3860     // Notify MIOpt that we read a non-whitespace/non-comment token.
3861     MIOpt.ReadToken();
3862
3863     if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3864       Char = getCharAndSize(CurPtr, SizeTmp);
3865
3866       // UTF-32 string literal
3867       if (Char == '"')
3868         return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3869                                 tok::utf32_string_literal);
3870
3871       // UTF-32 character constant
3872       if (Char == '\'')
3873         return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3874                                tok::utf32_char_constant);
3875
3876       // UTF-32 raw string literal
3877       if (Char == 'R' && LangOpts.CPlusPlus11 &&
3878           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3879         return LexRawStringLiteral(Result,
3880                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3881                                            SizeTmp2, Result),
3882                                tok::utf32_string_literal);
3883     }
3884
3885     // treat U like the start of an identifier.
3886     return LexIdentifierContinue(Result, CurPtr);
3887
3888   case 'R': // Identifier or C++0x raw string literal
3889     // Notify MIOpt that we read a non-whitespace/non-comment token.
3890     MIOpt.ReadToken();
3891
3892     if (LangOpts.CPlusPlus11) {
3893       Char = getCharAndSize(CurPtr, SizeTmp);
3894
3895       if (Char == '"')
3896         return LexRawStringLiteral(Result,
3897                                    ConsumeChar(CurPtr, SizeTmp, Result),
3898                                    tok::string_literal);
3899     }
3900
3901     // treat R like the start of an identifier.
3902     return LexIdentifierContinue(Result, CurPtr);
3903
3904   case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
3905     // Notify MIOpt that we read a non-whitespace/non-comment token.
3906     MIOpt.ReadToken();
3907     Char = getCharAndSize(CurPtr, SizeTmp);
3908
3909     // Wide string literal.
3910     if (Char == '"')
3911       return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3912                               tok::wide_string_literal);
3913
3914     // Wide raw string literal.
3915     if (LangOpts.CPlusPlus11 && Char == 'R' &&
3916         getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3917       return LexRawStringLiteral(Result,
3918                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3919                                            SizeTmp2, Result),
3920                                tok::wide_string_literal);
3921
3922     // Wide character constant.
3923     if (Char == '\'')
3924       return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3925                              tok::wide_char_constant);
3926     // FALL THROUGH, treating L like the start of an identifier.
3927     [[fallthrough]];
3928
3929   // C99 6.4.2: Identifiers.
3930   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3931   case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
3932   case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
3933   case 'V': case 'W': case 'X': case 'Y': case 'Z':
3934   case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3935   case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3936   case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
3937   case 'v': case 'w': case 'x': case 'y': case 'z':
3938   case '_':
3939     // Notify MIOpt that we read a non-whitespace/non-comment token.
3940     MIOpt.ReadToken();
3941     return LexIdentifierContinue(Result, CurPtr);
3942
3943   case '$':   // $ in identifiers.
3944     if (LangOpts.DollarIdents) {
3945       if (!isLexingRawMode())
3946         Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3947       // Notify MIOpt that we read a non-whitespace/non-comment token.
3948       MIOpt.ReadToken();
3949       return LexIdentifierContinue(Result, CurPtr);
3950     }
3951
3952     Kind = tok::unknown;
3953     break;
3954
3955   // C99 6.4.4: Character Constants.
3956   case '\'':
3957     // Notify MIOpt that we read a non-whitespace/non-comment token.
3958     MIOpt.ReadToken();
3959     return LexCharConstant(Result, CurPtr, tok::char_constant);
3960
3961   // C99 6.4.5: String Literals.
3962   case '"':
3963     // Notify MIOpt that we read a non-whitespace/non-comment token.
3964     MIOpt.ReadToken();
3965     return LexStringLiteral(Result, CurPtr,
3966                             ParsingFilename ? tok::header_name
3967                                             : tok::string_literal);
3968
3969   // C99 6.4.6: Punctuators.
3970   case '?':
3971     Kind = tok::question;
3972     break;
3973   case '[':
3974     Kind = tok::l_square;
3975     break;
3976   case ']':
3977     Kind = tok::r_square;
3978     break;
3979   case '(':
3980     Kind = tok::l_paren;
3981     break;
3982   case ')':
3983     Kind = tok::r_paren;
3984     break;
3985   case '{':
3986     Kind = tok::l_brace;
3987     break;
3988   case '}':
3989     Kind = tok::r_brace;
3990     break;
3991   case '.':
3992     Char = getCharAndSize(CurPtr, SizeTmp);
3993     if (Char >= '0' && Char <= '9') {
3994       // Notify MIOpt that we read a non-whitespace/non-comment token.
3995       MIOpt.ReadToken();
3996
3997       return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3998     } else if (LangOpts.CPlusPlus && Char == '*') {
3999       Kind = tok::periodstar;
4000       CurPtr += SizeTmp;
4001     } else if (Char == '.' &&
4002                getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
4003       Kind = tok::ellipsis;
4004       CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4005                            SizeTmp2, Result);
4006     } else {
4007       Kind = tok::period;
4008     }
4009     break;
4010   case '&':
4011     Char = getCharAndSize(CurPtr, SizeTmp);
4012     if (Char == '&') {
4013       Kind = tok::ampamp;
4014       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4015     } else if (Char == '=') {
4016       Kind = tok::ampequal;
4017       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4018     } else {
4019       Kind = tok::amp;
4020     }
4021     break;
4022   case '*':
4023     if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4024       Kind = tok::starequal;
4025       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4026     } else {
4027       Kind = tok::star;
4028     }
4029     break;
4030   case '+':
4031     Char = getCharAndSize(CurPtr, SizeTmp);
4032     if (Char == '+') {
4033       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4034       Kind = tok::plusplus;
4035     } else if (Char == '=') {
4036       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4037       Kind = tok::plusequal;
4038     } else {
4039       Kind = tok::plus;
4040     }
4041     break;
4042   case '-':
4043     Char = getCharAndSize(CurPtr, SizeTmp);
4044     if (Char == '-') {      // --
4045       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4046       Kind = tok::minusminus;
4047     } else if (Char == '>' && LangOpts.CPlusPlus &&
4048                getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
4049       CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4050                            SizeTmp2, Result);
4051       Kind = tok::arrowstar;
4052     } else if (Char == '>') {   // ->
4053       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4054       Kind = tok::arrow;
4055     } else if (Char == '=') {   // -=
4056       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4057       Kind = tok::minusequal;
4058     } else {
4059       Kind = tok::minus;
4060     }
4061     break;
4062   case '~':
4063     Kind = tok::tilde;
4064     break;
4065   case '!':
4066     if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4067       Kind = tok::exclaimequal;
4068       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4069     } else {
4070       Kind = tok::exclaim;
4071     }
4072     break;
4073   case '/':
4074     // 6.4.9: Comments
4075     Char = getCharAndSize(CurPtr, SizeTmp);
4076     if (Char == '/') {         // Line comment.
4077       // Even if Line comments are disabled (e.g. in C89 mode), we generally
4078       // want to lex this as a comment.  There is one problem with this though,
4079       // that in one particular corner case, this can change the behavior of the
4080       // resultant program.  For example, In  "foo //**/ bar", C89 would lex
4081       // this as "foo / bar" and languages with Line comments would lex it as
4082       // "foo".  Check to see if the character after the second slash is a '*'.
4083       // If so, we will lex that as a "/" instead of the start of a comment.
4084       // However, we never do this if we are just preprocessing.
4085       bool TreatAsComment =
4086           LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
4087       if (!TreatAsComment)
4088         if (!(PP && PP->isPreprocessedOutput()))
4089           TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
4090
4091       if (TreatAsComment) {
4092         if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4093                             TokAtPhysicalStartOfLine))
4094           return true; // There is a token to return.
4095
4096         // It is common for the tokens immediately after a // comment to be
4097         // whitespace (indentation for the next line).  Instead of going through
4098         // the big switch, handle it efficiently now.
4099         goto SkipIgnoredUnits;
4100       }
4101     }
4102
4103     if (Char == '*') {  // /**/ comment.
4104       if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4105                            TokAtPhysicalStartOfLine))
4106         return true; // There is a token to return.
4107
4108       // We only saw whitespace, so just try again with this lexer.
4109       // (We manually eliminate the tail call to avoid recursion.)
4110       goto LexNextToken;
4111     }
4112
4113     if (Char == '=') {
4114       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4115       Kind = tok::slashequal;
4116     } else {
4117       Kind = tok::slash;
4118     }
4119     break;
4120   case '%':
4121     Char = getCharAndSize(CurPtr, SizeTmp);
4122     if (Char == '=') {
4123       Kind = tok::percentequal;
4124       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4125     } else if (LangOpts.Digraphs && Char == '>') {
4126       Kind = tok::r_brace;                             // '%>' -> '}'
4127       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4128     } else if (LangOpts.Digraphs && Char == ':') {
4129       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4130       Char = getCharAndSize(CurPtr, SizeTmp);
4131       if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
4132         Kind = tok::hashhash;                          // '%:%:' -> '##'
4133         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4134                              SizeTmp2, Result);
4135       } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
4136         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4137         if (!isLexingRawMode())
4138           Diag(BufferPtr, diag::ext_charize_microsoft);
4139         Kind = tok::hashat;
4140       } else {                                         // '%:' -> '#'
4141         // We parsed a # character.  If this occurs at the start of the line,
4142         // it's actually the start of a preprocessing directive.  Callback to
4143         // the preprocessor to handle it.
4144         // TODO: -fpreprocessed mode??
4145         if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4146           goto HandleDirective;
4147
4148         Kind = tok::hash;
4149       }
4150     } else {
4151       Kind = tok::percent;
4152     }
4153     break;
4154   case '<':
4155     Char = getCharAndSize(CurPtr, SizeTmp);
4156     if (ParsingFilename) {
4157       return LexAngledStringLiteral(Result, CurPtr);
4158     } else if (Char == '<') {
4159       char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4160       if (After == '=') {
4161         Kind = tok::lesslessequal;
4162         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4163                              SizeTmp2, Result);
4164       } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
4165         // If this is actually a '<<<<<<<' version control conflict marker,
4166         // recognize it as such and recover nicely.
4167         goto LexNextToken;
4168       } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
4169         // If this is '<<<<' and we're in a Perforce-style conflict marker,
4170         // ignore it.
4171         goto LexNextToken;
4172       } else if (LangOpts.CUDA && After == '<') {
4173         Kind = tok::lesslessless;
4174         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4175                              SizeTmp2, Result);
4176       } else {
4177         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4178         Kind = tok::lessless;
4179       }
4180     } else if (Char == '=') {
4181       char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4182       if (After == '>') {
4183         if (LangOpts.CPlusPlus20) {
4184           if (!isLexingRawMode())
4185             Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4186           CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4187                                SizeTmp2, Result);
4188           Kind = tok::spaceship;
4189           break;
4190         }
4191         // Suggest adding a space between the '<=' and the '>' to avoid a
4192         // change in semantics if this turns up in C++ <=17 mode.
4193         if (LangOpts.CPlusPlus && !isLexingRawMode()) {
4194           Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
4195             << FixItHint::CreateInsertion(
4196                    getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
4197         }
4198       }
4199       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4200       Kind = tok::lessequal;
4201     } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
4202       if (LangOpts.CPlusPlus11 &&
4203           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
4204         // C++0x [lex.pptoken]p3:
4205         //  Otherwise, if the next three characters are <:: and the subsequent
4206         //  character is neither : nor >, the < is treated as a preprocessor
4207         //  token by itself and not as the first character of the alternative
4208         //  token <:.
4209         unsigned SizeTmp3;
4210         char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
4211         if (After != ':' && After != '>') {
4212           Kind = tok::less;
4213           if (!isLexingRawMode())
4214             Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4215           break;
4216         }
4217       }
4218
4219       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4220       Kind = tok::l_square;
4221     } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
4222       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4223       Kind = tok::l_brace;
4224     } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
4225                lexEditorPlaceholder(Result, CurPtr)) {
4226       return true;
4227     } else {
4228       Kind = tok::less;
4229     }
4230     break;
4231   case '>':
4232     Char = getCharAndSize(CurPtr, SizeTmp);
4233     if (Char == '=') {
4234       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4235       Kind = tok::greaterequal;
4236     } else if (Char == '>') {
4237       char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4238       if (After == '=') {
4239         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4240                              SizeTmp2, Result);
4241         Kind = tok::greatergreaterequal;
4242       } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
4243         // If this is actually a '>>>>' conflict marker, recognize it as such
4244         // and recover nicely.
4245         goto LexNextToken;
4246       } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
4247         // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
4248         goto LexNextToken;
4249       } else if (LangOpts.CUDA && After == '>') {
4250         Kind = tok::greatergreatergreater;
4251         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4252                              SizeTmp2, Result);
4253       } else {
4254         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4255         Kind = tok::greatergreater;
4256       }
4257     } else {
4258       Kind = tok::greater;
4259     }
4260     break;
4261   case '^':
4262     Char = getCharAndSize(CurPtr, SizeTmp);
4263     if (Char == '=') {
4264       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4265       Kind = tok::caretequal;
4266     } else if (LangOpts.OpenCL && Char == '^') {
4267       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4268       Kind = tok::caretcaret;
4269     } else {
4270       Kind = tok::caret;
4271     }
4272     break;
4273   case '|':
4274     Char = getCharAndSize(CurPtr, SizeTmp);
4275     if (Char == '=') {
4276       Kind = tok::pipeequal;
4277       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4278     } else if (Char == '|') {
4279       // If this is '|||||||' and we're in a conflict marker, ignore it.
4280       if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
4281         goto LexNextToken;
4282       Kind = tok::pipepipe;
4283       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4284     } else {
4285       Kind = tok::pipe;
4286     }
4287     break;
4288   case ':':
4289     Char = getCharAndSize(CurPtr, SizeTmp);
4290     if (LangOpts.Digraphs && Char == '>') {
4291       Kind = tok::r_square; // ':>' -> ']'
4292       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4293     } else if (Char == ':') {
4294       Kind = tok::coloncolon;
4295       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4296     } else {
4297       Kind = tok::colon;
4298     }
4299     break;
4300   case ';':
4301     Kind = tok::semi;
4302     break;
4303   case '=':
4304     Char = getCharAndSize(CurPtr, SizeTmp);
4305     if (Char == '=') {
4306       // If this is '====' and we're in a conflict marker, ignore it.
4307       if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
4308         goto LexNextToken;
4309
4310       Kind = tok::equalequal;
4311       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4312     } else {
4313       Kind = tok::equal;
4314     }
4315     break;
4316   case ',':
4317     Kind = tok::comma;
4318     break;
4319   case '#':
4320     Char = getCharAndSize(CurPtr, SizeTmp);
4321     if (Char == '#') {
4322       Kind = tok::hashhash;
4323       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4324     } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
4325       Kind = tok::hashat;
4326       if (!isLexingRawMode())
4327         Diag(BufferPtr, diag::ext_charize_microsoft);
4328       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4329     } else {
4330       // We parsed a # character.  If this occurs at the start of the line,
4331       // it's actually the start of a preprocessing directive.  Callback to
4332       // the preprocessor to handle it.
4333       // TODO: -fpreprocessed mode??
4334       if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4335         goto HandleDirective;
4336
4337       Kind = tok::hash;
4338     }
4339     break;
4340
4341   case '@':
4342     // Objective C support.
4343     if (CurPtr[-1] == '@' && LangOpts.ObjC)
4344       Kind = tok::at;
4345     else
4346       Kind = tok::unknown;
4347     break;
4348
4349   // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
4350   case '\\':
4351     if (!LangOpts.AsmPreprocessor) {
4352       if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
4353         if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4354           if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4355             return true; // KeepWhitespaceMode
4356
4357           // We only saw whitespace, so just try again with this lexer.
4358           // (We manually eliminate the tail call to avoid recursion.)
4359           goto LexNextToken;
4360         }
4361
4362         return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4363       }
4364     }
4365
4366     Kind = tok::unknown;
4367     break;
4368
4369   default: {
4370     if (isASCII(Char)) {
4371       Kind = tok::unknown;
4372       break;
4373     }
4374
4375     llvm::UTF32 CodePoint;
4376
4377     // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4378     // an escaped newline.
4379     --CurPtr;
4380     llvm::ConversionResult Status =
4381         llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4382                                   (const llvm::UTF8 *)BufferEnd,
4383                                   &CodePoint,
4384                                   llvm::strictConversion);
4385     if (Status == llvm::conversionOK) {
4386       if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4387         if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4388           return true; // KeepWhitespaceMode
4389
4390         // We only saw whitespace, so just try again with this lexer.
4391         // (We manually eliminate the tail call to avoid recursion.)
4392         goto LexNextToken;
4393       }
4394       return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4395     }
4396
4397     if (isLexingRawMode() || ParsingPreprocessorDirective ||
4398         PP->isPreprocessedOutput()) {
4399       ++CurPtr;
4400       Kind = tok::unknown;
4401       break;
4402     }
4403
4404     // Non-ASCII characters tend to creep into source code unintentionally.
4405     // Instead of letting the parser complain about the unknown token,
4406     // just diagnose the invalid UTF-8, then drop the character.
4407     Diag(CurPtr, diag::err_invalid_utf8);
4408
4409     BufferPtr = CurPtr+1;
4410     // We're pretending the character didn't exist, so just try again with
4411     // this lexer.
4412     // (We manually eliminate the tail call to avoid recursion.)
4413     goto LexNextToken;
4414   }
4415   }
4416
4417   // Notify MIOpt that we read a non-whitespace/non-comment token.
4418   MIOpt.ReadToken();
4419
4420   // Update the location of token as well as BufferPtr.
4421   FormTokenWithChars(Result, CurPtr, Kind);
4422   return true;
4423
4424 HandleDirective:
4425   // We parsed a # character and it's the start of a preprocessing directive.
4426
4427   FormTokenWithChars(Result, CurPtr, tok::hash);
4428   PP->HandleDirective(Result);
4429
4430   if (PP->hadModuleLoaderFatalFailure())
4431     // With a fatal failure in the module loader, we abort parsing.
4432     return true;
4433
4434   // We parsed the directive; lex a token with the new state.
4435   return false;
4436
4437 LexNextToken:
4438   Result.clearFlag(Token::NeedsCleaning);
4439   goto LexStart;
4440 }
4441
4442 const char *Lexer::convertDependencyDirectiveToken(
4443     const dependency_directives_scan::Token &DDTok, Token &Result) {
4444   const char *TokPtr = BufferStart + DDTok.Offset;
4445   Result.startToken();
4446   Result.setLocation(getSourceLocation(TokPtr));
4447   Result.setKind(DDTok.Kind);
4448   Result.setFlag((Token::TokenFlags)DDTok.Flags);
4449   Result.setLength(DDTok.Length);
4450   BufferPtr = TokPtr + DDTok.Length;
4451   return TokPtr;
4452 }
4453
4454 bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4455   assert(isDependencyDirectivesLexer());
4456
4457   using namespace dependency_directives_scan;
4458
4459   while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4460     if (DepDirectives.front().Kind == pp_eof)
4461       return LexEndOfFile(Result, BufferEnd);
4462     if (DepDirectives.front().Kind == tokens_present_before_eof)
4463       MIOpt.ReadToken();
4464     NextDepDirectiveTokenIndex = 0;
4465     DepDirectives = DepDirectives.drop_front();
4466   }
4467
4468   const dependency_directives_scan::Token &DDTok =
4469       DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
4470   if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
4471     // Read something other than a preprocessor directive hash.
4472     MIOpt.ReadToken();
4473   }
4474
4475   if (ParsingFilename && DDTok.is(tok::less)) {
4476     BufferPtr = BufferStart + DDTok.Offset;
4477     LexAngledStringLiteral(Result, BufferPtr + 1);
4478     if (Result.isNot(tok::header_name))
4479       return true;
4480     // Advance the index of lexed tokens.
4481     while (true) {
4482       const dependency_directives_scan::Token &NextTok =
4483           DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
4484       if (BufferStart + NextTok.Offset >= BufferPtr)
4485         break;
4486       ++NextDepDirectiveTokenIndex;
4487     }
4488     return true;
4489   }
4490
4491   const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4492
4493   if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
4494     PP->HandleDirective(Result);
4495     return false;
4496   }
4497   if (Result.is(tok::raw_identifier)) {
4498     Result.setRawIdentifierData(TokPtr);
4499     if (!isLexingRawMode()) {
4500       const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
4501       if (II->isHandleIdentifierCase())
4502         return PP->HandleIdentifier(Result);
4503     }
4504     return true;
4505   }
4506   if (Result.isLiteral()) {
4507     Result.setLiteralData(TokPtr);
4508     return true;
4509   }
4510   if (Result.is(tok::colon)) {
4511     // Convert consecutive colons to 'tok::coloncolon'.
4512     if (*BufferPtr == ':') {
4513       assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
4514           tok::colon));
4515       ++NextDepDirectiveTokenIndex;
4516       Result.setKind(tok::coloncolon);
4517     }
4518     return true;
4519   }
4520   if (Result.is(tok::eod))
4521     ParsingPreprocessorDirective = false;
4522
4523   return true;
4524 }
4525
4526 bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4527   assert(isDependencyDirectivesLexer());
4528
4529   using namespace dependency_directives_scan;
4530
4531   bool Stop = false;
4532   unsigned NestedIfs = 0;
4533   do {
4534     DepDirectives = DepDirectives.drop_front();
4535     switch (DepDirectives.front().Kind) {
4536     case pp_none:
4537       llvm_unreachable("unexpected 'pp_none'");
4538     case pp_include:
4539     case pp___include_macros:
4540     case pp_define:
4541     case pp_undef:
4542     case pp_import:
4543     case pp_pragma_import:
4544     case pp_pragma_once:
4545     case pp_pragma_push_macro:
4546     case pp_pragma_pop_macro:
4547     case pp_pragma_include_alias:
4548     case pp_pragma_system_header:
4549     case pp_include_next:
4550     case decl_at_import:
4551     case cxx_module_decl:
4552     case cxx_import_decl:
4553     case cxx_export_module_decl:
4554     case cxx_export_import_decl:
4555     case tokens_present_before_eof:
4556       break;
4557     case pp_if:
4558     case pp_ifdef:
4559     case pp_ifndef:
4560       ++NestedIfs;
4561       break;
4562     case pp_elif:
4563     case pp_elifdef:
4564     case pp_elifndef:
4565     case pp_else:
4566       if (!NestedIfs) {
4567         Stop = true;
4568       }
4569       break;
4570     case pp_endif:
4571       if (!NestedIfs) {
4572         Stop = true;
4573       } else {
4574         --NestedIfs;
4575       }
4576       break;
4577     case pp_eof:
4578       NextDepDirectiveTokenIndex = 0;
4579       return LexEndOfFile(Result, BufferEnd);
4580     }
4581   } while (!Stop);
4582
4583   const dependency_directives_scan::Token &DDTok =
4584       DepDirectives.front().Tokens.front();
4585   assert(DDTok.is(tok::hash));
4586   NextDepDirectiveTokenIndex = 1;
4587
4588   convertDependencyDirectiveToken(DDTok, Result);
4589   return false;
4590 }