clang/lib/Lex/LiteralSupport.cpp

   1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file implements the NumericLiteralParser, CharLiteralParser, and
  10 // StringLiteralParser interfaces.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "clang/Lex/LiteralSupport.h"
  15 #include "clang/Basic/CharInfo.h"
  16 #include "clang/Basic/LangOptions.h"
  17 #include "clang/Basic/SourceLocation.h"
  18 #include "clang/Basic/TargetInfo.h"
  19 #include "clang/Lex/LexDiagnostic.h"
  20 #include "clang/Lex/Lexer.h"
  21 #include "clang/Lex/Preprocessor.h"
  22 #include "clang/Lex/Token.h"
  23 #include "llvm/ADT/APInt.h"
  24 #include "llvm/ADT/SmallVector.h"
  25 #include "llvm/ADT/StringExtras.h"
  26 #include "llvm/ADT/StringSwitch.h"
  27 #include "llvm/Support/ConvertUTF.h"
  28 #include "llvm/Support/Error.h"
  29 #include "llvm/Support/ErrorHandling.h"
  30 #include "llvm/Support/Unicode.h"
  31 #include <algorithm>
  32 #include <cassert>
  33 #include <cstddef>
  34 #include <cstdint>
  35 #include <cstring>
  36 #include <string>
  37
  38 using namespace clang;
  39
  40 static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
  41   switch (kind) {
  42   default: llvm_unreachable("Unknown token type!");
  43   case tok::char_constant:
  44   case tok::string_literal:
  45   case tok::utf8_char_constant:
  46   case tok::utf8_string_literal:
  47     return Target.getCharWidth();
  48   case tok::wide_char_constant:
  49   case tok::wide_string_literal:
  50     return Target.getWCharWidth();
  51   case tok::utf16_char_constant:
  52   case tok::utf16_string_literal:
  53     return Target.getChar16Width();
  54   case tok::utf32_char_constant:
  55   case tok::utf32_string_literal:
  56     return Target.getChar32Width();
  57   }
  58 }
  59
  60 static unsigned getEncodingPrefixLen(tok::TokenKind kind) {
  61   switch (kind) {
  62   default:
  63     llvm_unreachable("Unknown token type!");
  64   case tok::char_constant:
  65   case tok::string_literal:
  66     return 0;
  67   case tok::utf8_char_constant:
  68   case tok::utf8_string_literal:
  69     return 2;
  70   case tok::wide_char_constant:
  71   case tok::wide_string_literal:
  72   case tok::utf16_char_constant:
  73   case tok::utf16_string_literal:
  74   case tok::utf32_char_constant:
  75   case tok::utf32_string_literal:
  76     return 1;
  77   }
  78 }
  79
  80 static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
  81                                            FullSourceLoc TokLoc,
  82                                            const char *TokBegin,
  83                                            const char *TokRangeBegin,
  84                                            const char *TokRangeEnd) {
  85   SourceLocation Begin =
  86     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
  87                                    TokLoc.getManager(), Features);
  88   SourceLocation End =
  89     Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
  90                                    TokLoc.getManager(), Features);
  91   return CharSourceRange::getCharRange(Begin, End);
  92 }
  93
  94 /// Produce a diagnostic highlighting some portion of a literal.
  95 ///
  96 /// Emits the diagnostic \p DiagID, highlighting the range of characters from
  97 /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
  98 /// a substring of a spelling buffer for the token beginning at \p TokBegin.
  99 static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
 100                               const LangOptions &Features, FullSourceLoc TokLoc,
 101                               const char *TokBegin, const char *TokRangeBegin,
 102                               const char *TokRangeEnd, unsigned DiagID) {
 103   SourceLocation Begin =
 104     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
 105                                    TokLoc.getManager(), Features);
 106   return Diags->Report(Begin, DiagID) <<
 107     MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
 108 }
 109
 110 static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {
 111   switch (Escape) {
 112   case '\'':
 113   case '"':
 114   case '?':
 115   case '\\':
 116   case 'a':
 117   case 'b':
 118   case 'f':
 119   case 'n':
 120   case 'r':
 121   case 't':
 122   case 'v':
 123     return true;
 124   }
 125   return false;
 126 }
 127
 128 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
 129 /// either a character or a string literal.
 130 static unsigned ProcessCharEscape(const char *ThisTokBegin,
 131                                   const char *&ThisTokBuf,
 132                                   const char *ThisTokEnd, bool &HadError,
 133                                   FullSourceLoc Loc, unsigned CharWidth,
 134                                   DiagnosticsEngine *Diags,
 135                                   const LangOptions &Features,
 136                                   StringLiteralEvalMethod EvalMethod) {
 137   const char *EscapeBegin = ThisTokBuf;
 138   bool Delimited = false;
 139   bool EndDelimiterFound = false;
 140
 141   // Skip the '\' char.
 142   ++ThisTokBuf;
 143
 144   // We know that this character can't be off the end of the buffer, because
 145   // that would have been \", which would not have been the end of string.
 146   unsigned ResultChar = *ThisTokBuf++;
 147   char Escape = ResultChar;
 148   switch (ResultChar) {
 149   // These map to themselves.
 150   case '\\': case '\'': case '"': case '?': break;
 151
 152     // These have fixed mappings.
 153   case 'a':
 154     // TODO: K&R: the meaning of '\\a' is different in traditional C
 155     ResultChar = 7;
 156     break;
 157   case 'b':
 158     ResultChar = 8;
 159     break;
 160   case 'e':
 161     if (Diags)
 162       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 163            diag::ext_nonstandard_escape) << "e";
 164     ResultChar = 27;
 165     break;
 166   case 'E':
 167     if (Diags)
 168       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 169            diag::ext_nonstandard_escape) << "E";
 170     ResultChar = 27;
 171     break;
 172   case 'f':
 173     ResultChar = 12;
 174     break;
 175   case 'n':
 176     ResultChar = 10;
 177     break;
 178   case 'r':
 179     ResultChar = 13;
 180     break;
 181   case 't':
 182     ResultChar = 9;
 183     break;
 184   case 'v':
 185     ResultChar = 11;
 186     break;
 187   case 'x': { // Hex escape.
 188     ResultChar = 0;
 189     if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
 190       Delimited = true;
 191       ThisTokBuf++;
 192       if (*ThisTokBuf == '}') {
 193         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 194              diag::err_delimited_escape_empty);
 195         return ResultChar;
 196       }
 197     } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
 198       if (Diags)
 199         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 200              diag::err_hex_escape_no_digits) << "x";
 201       return ResultChar;
 202     }
 203
 204     // Hex escapes are a maximal series of hex digits.
 205     bool Overflow = false;
 206     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
 207       if (Delimited && *ThisTokBuf == '}') {
 208         ThisTokBuf++;
 209         EndDelimiterFound = true;
 210         break;
 211       }
 212       int CharVal = llvm::hexDigitValue(*ThisTokBuf);
 213       if (CharVal == -1) {
 214         // Non delimited hex escape sequences stop at the first non-hex digit.
 215         if (!Delimited)
 216           break;
 217         HadError = true;
 218         if (Diags)
 219           Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 220                diag::err_delimited_escape_invalid)
 221               << StringRef(ThisTokBuf, 1);
 222         continue;
 223       }
 224       // About to shift out a digit?
 225       if (ResultChar & 0xF0000000)
 226         Overflow = true;
 227       ResultChar <<= 4;
 228       ResultChar |= CharVal;
 229     }
 230     // See if any bits will be truncated when evaluated as a character.
 231     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
 232       Overflow = true;
 233       ResultChar &= ~0U >> (32-CharWidth);
 234     }
 235
 236     // Check for overflow.
 237     if (!HadError && Overflow) { // Too many digits to fit in
 238       HadError = true;
 239       if (Diags)
 240         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 241              diag::err_escape_too_large)
 242             << 0;
 243     }
 244     break;
 245   }
 246   case '0': case '1': case '2': case '3':
 247   case '4': case '5': case '6': case '7': {
 248     // Octal escapes.
 249     --ThisTokBuf;
 250     ResultChar = 0;
 251
 252     // Octal escapes are a series of octal digits with maximum length 3.
 253     // "\0123" is a two digit sequence equal to "\012" "3".
 254     unsigned NumDigits = 0;
 255     do {
 256       ResultChar <<= 3;
 257       ResultChar |= *ThisTokBuf++ - '0';
 258       ++NumDigits;
 259     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
 260              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
 261
 262     // Check for overflow.  Reject '\777', but not L'\777'.
 263     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
 264       if (Diags)
 265         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 266              diag::err_escape_too_large) << 1;
 267       ResultChar &= ~0U >> (32-CharWidth);
 268     }
 269     break;
 270   }
 271   case 'o': {
 272     bool Overflow = false;
 273     if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
 274       HadError = true;
 275       if (Diags)
 276         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 277              diag::err_delimited_escape_missing_brace)
 278             << "o";
 279
 280       break;
 281     }
 282     ResultChar = 0;
 283     Delimited = true;
 284     ++ThisTokBuf;
 285     if (*ThisTokBuf == '}') {
 286       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 287            diag::err_delimited_escape_empty);
 288       return ResultChar;
 289     }
 290
 291     while (ThisTokBuf != ThisTokEnd) {
 292       if (*ThisTokBuf == '}') {
 293         EndDelimiterFound = true;
 294         ThisTokBuf++;
 295         break;
 296       }
 297       if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
 298         HadError = true;
 299         if (Diags)
 300           Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 301                diag::err_delimited_escape_invalid)
 302               << StringRef(ThisTokBuf, 1);
 303         ThisTokBuf++;
 304         continue;
 305       }
 306       // Check if one of the top three bits is set before shifting them out.
 307       if (ResultChar & 0xE0000000)
 308         Overflow = true;
 309
 310       ResultChar <<= 3;
 311       ResultChar |= *ThisTokBuf++ - '0';
 312     }
 313     // Check for overflow.  Reject '\777', but not L'\777'.
 314     if (!HadError &&
 315         (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {
 316       HadError = true;
 317       if (Diags)
 318         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 319              diag::err_escape_too_large)
 320             << 1;
 321       ResultChar &= ~0U >> (32 - CharWidth);
 322     }
 323     break;
 324   }
 325     // Otherwise, these are not valid escapes.
 326   case '(': case '{': case '[': case '%':
 327     // GCC accepts these as extensions.  We warn about them as such though.
 328     if (Diags)
 329       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 330            diag::ext_nonstandard_escape)
 331         << std::string(1, ResultChar);
 332     break;
 333   default:
 334     if (!Diags)
 335       break;
 336
 337     if (isPrintable(ResultChar))
 338       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 339            diag::ext_unknown_escape)
 340         << std::string(1, ResultChar);
 341     else
 342       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 343            diag::ext_unknown_escape)
 344         << "x" + llvm::utohexstr(ResultChar);
 345     break;
 346   }
 347
 348   if (Delimited && Diags) {
 349     if (!EndDelimiterFound)
 350       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 351            diag::err_expected)
 352           << tok::r_brace;
 353     else if (!HadError) {
 354       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 355            Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
 356                                 : diag::ext_delimited_escape_sequence)
 357           << /*delimited*/ 0 << (Features.CPlusPlus ? 1 : 0);
 358     }
 359   }
 360
 361   if (EvalMethod == StringLiteralEvalMethod::Unevaluated &&
 362       !IsEscapeValidInUnevaluatedStringLiteral(Escape)) {
 363     Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 364          diag::err_unevaluated_string_invalid_escape_sequence)
 365         << StringRef(EscapeBegin, ThisTokBuf - EscapeBegin);
 366     HadError = true;
 367   }
 368
 369   return ResultChar;
 370 }
 371
 372 static void appendCodePoint(unsigned Codepoint,
 373                             llvm::SmallVectorImpl<char> &Str) {
 374   char ResultBuf[4];
 375   char *ResultPtr = ResultBuf;
 376   if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr))
 377     Str.append(ResultBuf, ResultPtr);
 378 }
 379
 380 void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
 381   for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
 382     if (*I != '\\') {
 383       Buf.push_back(*I);
 384       continue;
 385     }
 386
 387     ++I;
 388     char Kind = *I;
 389     ++I;
 390
 391     assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
 392     uint32_t CodePoint = 0;
 393
 394     if (Kind == 'u' && *I == '{') {
 395       for (++I; *I != '}'; ++I) {
 396         unsigned Value = llvm::hexDigitValue(*I);
 397         assert(Value != -1U);
 398         CodePoint <<= 4;
 399         CodePoint += Value;
 400       }
 401       appendCodePoint(CodePoint, Buf);
 402       continue;
 403     }
 404
 405     if (Kind == 'N') {
 406       assert(*I == '{');
 407       ++I;
 408       auto Delim = std::find(I, Input.end(), '}');
 409       assert(Delim != Input.end());
 410       StringRef Name(I, std::distance(I, Delim));
 411       std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
 412           llvm::sys::unicode::nameToCodepointLooseMatching(Name);
 413       assert(Res && "could not find a codepoint that was previously found");
 414       CodePoint = Res->CodePoint;
 415       assert(CodePoint != 0xFFFFFFFF);
 416       appendCodePoint(CodePoint, Buf);
 417       I = Delim;
 418       continue;
 419     }
 420
 421     unsigned NumHexDigits;
 422     if (Kind == 'u')
 423       NumHexDigits = 4;
 424     else
 425       NumHexDigits = 8;
 426
 427     assert(I + NumHexDigits <= E);
 428
 429     for (; NumHexDigits != 0; ++I, --NumHexDigits) {
 430       unsigned Value = llvm::hexDigitValue(*I);
 431       assert(Value != -1U);
 432
 433       CodePoint <<= 4;
 434       CodePoint += Value;
 435     }
 436
 437     appendCodePoint(CodePoint, Buf);
 438     --I;
 439   }
 440 }
 441
 442 bool clang::isFunctionLocalStringLiteralMacro(tok::TokenKind K,
 443                                               const LangOptions &LO) {
 444   return LO.MicrosoftExt &&
 445          (K == tok::kw___FUNCTION__ || K == tok::kw_L__FUNCTION__ ||
 446           K == tok::kw___FUNCSIG__ || K == tok::kw_L__FUNCSIG__ ||
 447           K == tok::kw___FUNCDNAME__);
 448 }
 449
 450 bool clang::tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO) {
 451   return tok::isStringLiteral(Tok.getKind()) ||
 452          isFunctionLocalStringLiteralMacro(Tok.getKind(), LO);
 453 }
 454
 455 static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
 456                                     const char *&ThisTokBuf,
 457                                     const char *ThisTokEnd, uint32_t &UcnVal,
 458                                     unsigned short &UcnLen, bool &Delimited,
 459                                     FullSourceLoc Loc, DiagnosticsEngine *Diags,
 460                                     const LangOptions &Features,
 461                                     bool in_char_string_literal = false) {
 462   const char *UcnBegin = ThisTokBuf;
 463   bool HasError = false;
 464   bool EndDelimiterFound = false;
 465
 466   // Skip the '\u' char's.
 467   ThisTokBuf += 2;
 468   Delimited = false;
 469   if (UcnBegin[1] == 'u' && in_char_string_literal &&
 470       ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
 471     Delimited = true;
 472     ThisTokBuf++;
 473   } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
 474     if (Diags)
 475       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 476            diag::err_hex_escape_no_digits)
 477           << StringRef(&ThisTokBuf[-1], 1);
 478     return false;
 479   }
 480   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
 481
 482   bool Overflow = false;
 483   unsigned short Count = 0;
 484   for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
 485        ++ThisTokBuf) {
 486     if (Delimited && *ThisTokBuf == '}') {
 487       ++ThisTokBuf;
 488       EndDelimiterFound = true;
 489       break;
 490     }
 491     int CharVal = llvm::hexDigitValue(*ThisTokBuf);
 492     if (CharVal == -1) {
 493       HasError = true;
 494       if (!Delimited)
 495         break;
 496       if (Diags) {
 497         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 498              diag::err_delimited_escape_invalid)
 499             << StringRef(ThisTokBuf, 1);
 500       }
 501       Count++;
 502       continue;
 503     }
 504     if (UcnVal & 0xF0000000) {
 505       Overflow = true;
 506       continue;
 507     }
 508     UcnVal <<= 4;
 509     UcnVal |= CharVal;
 510     Count++;
 511   }
 512
 513   if (Overflow) {
 514     if (Diags)
 515       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 516            diag::err_escape_too_large)
 517           << 0;
 518     return false;
 519   }
 520
 521   if (Delimited && !EndDelimiterFound) {
 522     if (Diags) {
 523       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 524            diag::err_expected)
 525           << tok::r_brace;
 526     }
 527     return false;
 528   }
 529
 530   // If we didn't consume the proper number of digits, there is a problem.
 531   if (Count == 0 || (!Delimited && Count != UcnLen)) {
 532     if (Diags)
 533       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 534            Delimited ? diag::err_delimited_escape_empty
 535                      : diag::err_ucn_escape_incomplete);
 536     return false;
 537   }
 538   return !HasError;
 539 }
 540
 541 static void DiagnoseInvalidUnicodeCharacterName(
 542     DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
 543     const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
 544     llvm::StringRef Name) {
 545
 546   Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
 547        diag::err_invalid_ucn_name)
 548       << Name;
 549
 550   namespace u = llvm::sys::unicode;
 551
 552   std::optional<u::LooseMatchingResult> Res =
 553       u::nameToCodepointLooseMatching(Name);
 554   if (Res) {
 555     Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
 556          diag::note_invalid_ucn_name_loose_matching)
 557         << FixItHint::CreateReplacement(
 558                MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
 559                                    TokRangeEnd),
 560                Res->Name);
 561     return;
 562   }
 563
 564   unsigned Distance = 0;
 565   SmallVector<u::MatchForCodepointName> Matches =
 566       u::nearestMatchesForCodepointName(Name, 5);
 567   assert(!Matches.empty() && "No unicode characters found");
 568
 569   for (const auto &Match : Matches) {
 570     if (Distance == 0)
 571       Distance = Match.Distance;
 572     if (std::max(Distance, Match.Distance) -
 573             std::min(Distance, Match.Distance) >
 574         3)
 575       break;
 576     Distance = Match.Distance;
 577
 578     std::string Str;
 579     llvm::UTF32 V = Match.Value;
 580     bool Converted =
 581         llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);
 582     (void)Converted;
 583     assert(Converted && "Found a match wich is not a unicode character");
 584
 585     Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
 586          diag::note_invalid_ucn_name_candidate)
 587         << Match.Name << llvm::utohexstr(Match.Value)
 588         << Str // FIXME: Fix the rendering of non printable characters
 589         << FixItHint::CreateReplacement(
 590                MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
 591                                    TokRangeEnd),
 592                Match.Name);
 593   }
 594 }
 595
 596 static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
 597                                   const char *&ThisTokBuf,
 598                                   const char *ThisTokEnd, uint32_t &UcnVal,
 599                                   unsigned short &UcnLen, FullSourceLoc Loc,
 600                                   DiagnosticsEngine *Diags,
 601                                   const LangOptions &Features) {
 602   const char *UcnBegin = ThisTokBuf;
 603   assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
 604   ThisTokBuf += 2;
 605   if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
 606     if (Diags) {
 607       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 608            diag::err_delimited_escape_missing_brace)
 609           << StringRef(&ThisTokBuf[-1], 1);
 610     }
 611     return false;
 612   }
 613   ThisTokBuf++;
 614   const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) {
 615     return C == '}' || isVerticalWhitespace(C);
 616   });
 617   bool Incomplete = ClosingBrace == ThisTokEnd;
 618   bool Empty = ClosingBrace == ThisTokBuf;
 619   if (Incomplete || Empty) {
 620     if (Diags) {
 621       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 622            Incomplete ? diag::err_ucn_escape_incomplete
 623                       : diag::err_delimited_escape_empty)
 624           << StringRef(&UcnBegin[1], 1);
 625     }
 626     ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
 627     return false;
 628   }
 629   StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
 630   ThisTokBuf = ClosingBrace + 1;
 631   std::optional<char32_t> Res = llvm::sys::unicode::nameToCodepointStrict(Name);
 632   if (!Res) {
 633     if (Diags)
 634       DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
 635                                           &UcnBegin[3], ClosingBrace, Name);
 636     return false;
 637   }
 638   UcnVal = *Res;
 639   UcnLen = UcnVal > 0xFFFF ? 8 : 4;
 640   return true;
 641 }
 642
 643 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
 644 /// return the UTF32.
 645 static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
 646                              const char *ThisTokEnd, uint32_t &UcnVal,
 647                              unsigned short &UcnLen, FullSourceLoc Loc,
 648                              DiagnosticsEngine *Diags,
 649                              const LangOptions &Features,
 650                              bool in_char_string_literal = false) {
 651
 652   bool HasError;
 653   const char *UcnBegin = ThisTokBuf;
 654   bool IsDelimitedEscapeSequence = false;
 655   bool IsNamedEscapeSequence = false;
 656   if (ThisTokBuf[1] == 'N') {
 657     IsNamedEscapeSequence = true;
 658     HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
 659                                       UcnVal, UcnLen, Loc, Diags, Features);
 660   } else {
 661     HasError =
 662         !ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
 663                                  UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
 664                                  Features, in_char_string_literal);
 665   }
 666   if (HasError)
 667     return false;
 668
 669   // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
 670   if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
 671       UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
 672     if (Diags)
 673       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 674            diag::err_ucn_escape_invalid);
 675     return false;
 676   }
 677
 678   // C23 and C++11 allow UCNs that refer to control characters
 679   // and basic source characters inside character and string literals
 680   if (UcnVal < 0xa0 &&
 681       // $, @, ` are allowed in all language modes
 682       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {
 683     bool IsError =
 684         (!(Features.CPlusPlus11 || Features.C23) || !in_char_string_literal);
 685     if (Diags) {
 686       char BasicSCSChar = UcnVal;
 687       if (UcnVal >= 0x20 && UcnVal < 0x7f)
 688         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 689              IsError ? diag::err_ucn_escape_basic_scs
 690              : Features.CPlusPlus
 691                  ? diag::warn_cxx98_compat_literal_ucn_escape_basic_scs
 692                  : diag::warn_c23_compat_literal_ucn_escape_basic_scs)
 693             << StringRef(&BasicSCSChar, 1);
 694       else
 695         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 696              IsError ? diag::err_ucn_control_character
 697              : Features.CPlusPlus
 698                  ? diag::warn_cxx98_compat_literal_ucn_control_character
 699                  : diag::warn_c23_compat_literal_ucn_control_character);
 700     }
 701     if (IsError)
 702       return false;
 703   }
 704
 705   if (!Features.CPlusPlus && !Features.C99 && Diags)
 706     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 707          diag::warn_ucn_not_valid_in_c89_literal);
 708
 709   if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
 710     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 711          Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
 712                               : diag::ext_delimited_escape_sequence)
 713         << (IsNamedEscapeSequence ? 1 : 0) << (Features.CPlusPlus ? 1 : 0);
 714
 715   return true;
 716 }
 717
 718 /// MeasureUCNEscape - Determine the number of bytes within the resulting string
 719 /// which this UCN will occupy.
 720 static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
 721                             const char *ThisTokEnd, unsigned CharByteWidth,
 722                             const LangOptions &Features, bool &HadError) {
 723   // UTF-32: 4 bytes per escape.
 724   if (CharByteWidth == 4)
 725     return 4;
 726
 727   uint32_t UcnVal = 0;
 728   unsigned short UcnLen = 0;
 729   FullSourceLoc Loc;
 730
 731   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
 732                         UcnLen, Loc, nullptr, Features, true)) {
 733     HadError = true;
 734     return 0;
 735   }
 736
 737   // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
 738   if (CharByteWidth == 2)
 739     return UcnVal <= 0xFFFF ? 2 : 4;
 740
 741   // UTF-8.
 742   if (UcnVal < 0x80)
 743     return 1;
 744   if (UcnVal < 0x800)
 745     return 2;
 746   if (UcnVal < 0x10000)
 747     return 3;
 748   return 4;
 749 }
 750
 751 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
 752 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
 753 /// StringLiteralParser. When we decide to implement UCN's for identifiers,
 754 /// we will likely rework our support for UCN's.
 755 static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
 756                             const char *ThisTokEnd,
 757                             char *&ResultBuf, bool &HadError,
 758                             FullSourceLoc Loc, unsigned CharByteWidth,
 759                             DiagnosticsEngine *Diags,
 760                             const LangOptions &Features) {
 761   typedef uint32_t UTF32;
 762   UTF32 UcnVal = 0;
 763   unsigned short UcnLen = 0;
 764   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
 765                         Loc, Diags, Features, true)) {
 766     HadError = true;
 767     return;
 768   }
 769
 770   assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
 771          "only character widths of 1, 2, or 4 bytes supported");
 772
 773   (void)UcnLen;
 774   assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
 775
 776   if (CharByteWidth == 4) {
 777     // FIXME: Make the type of the result buffer correct instead of
 778     // using reinterpret_cast.
 779     llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
 780     *ResultPtr = UcnVal;
 781     ResultBuf += 4;
 782     return;
 783   }
 784
 785   if (CharByteWidth == 2) {
 786     // FIXME: Make the type of the result buffer correct instead of
 787     // using reinterpret_cast.
 788     llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
 789
 790     if (UcnVal <= (UTF32)0xFFFF) {
 791       *ResultPtr = UcnVal;
 792       ResultBuf += 2;
 793       return;
 794     }
 795
 796     // Convert to UTF16.
 797     UcnVal -= 0x10000;
 798     *ResultPtr     = 0xD800 + (UcnVal >> 10);
 799     *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
 800     ResultBuf += 4;
 801     return;
 802   }
 803
 804   assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
 805
 806   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
 807   // The conversion below was inspired by:
 808   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
 809   // First, we determine how many bytes the result will require.
 810   typedef uint8_t UTF8;
 811
 812   unsigned short bytesToWrite = 0;
 813   if (UcnVal < (UTF32)0x80)
 814     bytesToWrite = 1;
 815   else if (UcnVal < (UTF32)0x800)
 816     bytesToWrite = 2;
 817   else if (UcnVal < (UTF32)0x10000)
 818     bytesToWrite = 3;
 819   else
 820     bytesToWrite = 4;
 821
 822   const unsigned byteMask = 0xBF;
 823   const unsigned byteMark = 0x80;
 824
 825   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
 826   // into the first byte, depending on how many bytes follow.
 827   static const UTF8 firstByteMark[5] = {
 828     0x00, 0x00, 0xC0, 0xE0, 0xF0
 829   };
 830   // Finally, we write the bytes into ResultBuf.
 831   ResultBuf += bytesToWrite;
 832   switch (bytesToWrite) { // note: everything falls through.
 833   case 4:
 834     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 835     [[fallthrough]];
 836   case 3:
 837     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 838     [[fallthrough]];
 839   case 2:
 840     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 841     [[fallthrough]];
 842   case 1:
 843     *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
 844   }
 845   // Update the buffer.
 846   ResultBuf += bytesToWrite;
 847 }
 848
 849 ///       integer-constant: [C99 6.4.4.1]
 850 ///         decimal-constant integer-suffix
 851 ///         octal-constant integer-suffix
 852 ///         hexadecimal-constant integer-suffix
 853 ///         binary-literal integer-suffix [GNU, C++1y]
 854 ///       user-defined-integer-literal: [C++11 lex.ext]
 855 ///         decimal-literal ud-suffix
 856 ///         octal-literal ud-suffix
 857 ///         hexadecimal-literal ud-suffix
 858 ///         binary-literal ud-suffix [GNU, C++1y]
 859 ///       decimal-constant:
 860 ///         nonzero-digit
 861 ///         decimal-constant digit
 862 ///       octal-constant:
 863 ///         0
 864 ///         octal-constant octal-digit
 865 ///       hexadecimal-constant:
 866 ///         hexadecimal-prefix hexadecimal-digit
 867 ///         hexadecimal-constant hexadecimal-digit
 868 ///       hexadecimal-prefix: one of
 869 ///         0x 0X
 870 ///       binary-literal:
 871 ///         0b binary-digit
 872 ///         0B binary-digit
 873 ///         binary-literal binary-digit
 874 ///       integer-suffix:
 875 ///         unsigned-suffix [long-suffix]
 876 ///         unsigned-suffix [long-long-suffix]
 877 ///         long-suffix [unsigned-suffix]
 878 ///         long-long-suffix [unsigned-suffix]
 879 ///       nonzero-digit:
 880 ///         1 2 3 4 5 6 7 8 9
 881 ///       octal-digit:
 882 ///         0 1 2 3 4 5 6 7
 883 ///       hexadecimal-digit:
 884 ///         0 1 2 3 4 5 6 7 8 9
 885 ///         a b c d e f
 886 ///         A B C D E F
 887 ///       binary-digit:
 888 ///         0
 889 ///         1
 890 ///       unsigned-suffix: one of
 891 ///         u U
 892 ///       long-suffix: one of
 893 ///         l L
 894 ///       long-long-suffix: one of
 895 ///         ll LL
 896 ///
 897 ///       floating-constant: [C99 6.4.4.2]
 898 ///         TODO: add rules...
 899 ///
     /// Construct a parser over the spelling of one numeric token and classify
     /// it. The digits are scanned first (setting radix, saw_period and
     /// saw_exponent), then the remaining characters are consumed as a suffix and
     /// recorded in the is*/saw_* flags. Characters that do not form a builtin
     /// suffix are tried as a ud-suffix (saw_ud_suffix); failing that, an
     /// invalid-suffix error is reported and hadError is set.
     ///
     /// \param TokSpelling Spelling of the token. The byte at TokSpelling.end()
     ///        is read, so it has to be readable memory.
     /// \param TokLoc Location of the token, used to place diagnostics.
     /// \param SM Source manager, used when computing diagnostic locations.
     /// \param LangOpts Language options that gate the optional suffixes.
     /// \param Target Consulted for _Float16 support when parsing an 'f16' suffix.
     /// \param Diags Engine that receives the diagnostics.
 900 NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
 901                                            SourceLocation TokLoc,
 902                                            const SourceManager &SM,
 903                                            const LangOptions &LangOpts,
 904                                            const TargetInfo &Target,
 905                                            DiagnosticsEngine &Diags)
 906     : SM(SM), LangOpts(LangOpts), Diags(Diags),
 907       ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
 908
       // Start at the first character with every classification flag cleared.
 909   s = DigitsBegin = ThisTokBegin;
 910   saw_exponent = false;
 911   saw_period = false;
 912   saw_ud_suffix = false;
 913   saw_fixed_point_suffix = false;
 914   isLong = false;
 915   isUnsigned = false;
 916   isLongLong = false;
 917   isSizeT = false;
 918   isHalf = false;
 919   isFloat = false;
 920   isImaginary = false;
 921   isFloat16 = false;
 922   isFloat128 = false;
 923   MicrosoftInteger = 0;
 924   isFract = false;
 925   isAccum = false;
 926   hadError = false;
 927   isBitInt = false;
 928
 929   // This routine assumes that the range begin/end matches the regex for integer
 930   // and FP constants (specifically, the 'pp-number' regex), and assumes that
 931   // the byte at "*end" is both valid and not part of the regex.  Because of
 932   // this, it doesn't have to check for 'overscan' in various places.
 933   if (isPreprocessingNumberBody(*ThisTokEnd)) {
 934     Diags.Report(TokLoc, diag::err_lexing_numeric);
 935     hadError = true;
 936     return;
 937   }
 938
 939   if (*s == '0') { // parse radix
 940     ParseNumberStartingWithZero(TokLoc);
 941     if (hadError)
 942       return;
 943   } else { // the first digit is non-zero
 944     radix = 10;
 945     s = SkipDigits(s);
 946     if (s == ThisTokEnd) {
 947       // Done.
 948     } else {
 949       ParseDecimalOrOctalCommon(TokLoc);
 950       if (hadError)
 951         return;
 952     }
 953   }
 954
       // Everything from here to the end of the token is suffix text. A digit
       // separator directly in front of it is diagnosed.
 955   SuffixBegin = s;
 956   checkSeparator(TokLoc, s, CSK_AfterDigits);
 957
 958   // Initial scan to lookahead for fixed point suffix.
 959   if (LangOpts.FixedPoint) {
 960     for (const char *c = s; c != ThisTokEnd; ++c) {
 961       if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') {
 962         saw_fixed_point_suffix = true;
 963         break;
 964       }
 965     }
 966   }
 967
 968   // Parse the suffix.  At this point we can classify whether we have an FP or
 969   // integer constant.
 970   bool isFixedPointConstant = isFixedPointLiteral();
 971   bool isFPConstant = isFloatingLiteral();
       // HasSize records that a size-determining suffix was already consumed, so
       // that a second one is rejected.
 972   bool HasSize = false;
 973
 974   // Loop over all of the characters of the suffix.  If we see something bad,
 975   // we break out of the loop.
       // Inside the switch, 'continue' accepts the character and moves on, while
       // 'break' leaves the switch and reaches the loop-terminating break below.
 976   for (; s != ThisTokEnd; ++s) {
 977     switch (*s) {
 978     case 'R':
 979     case 'r':
 980       if (!LangOpts.FixedPoint)
 981         break;
 982       if (isFract || isAccum) break;
 983       if (!(saw_period || saw_exponent)) break;
 984       isFract = true;
 985       continue;
 986     case 'K':
 987     case 'k':
 988       if (!LangOpts.FixedPoint)
 989         break;
 990       if (isFract || isAccum) break;
 991       if (!(saw_period || saw_exponent)) break;
 992       isAccum = true;
 993       continue;
 994     case 'h':      // FP Suffix for "half".
 995     case 'H':
 996       // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
           // The suffix is only recognized when LangOpts.Half or
           // LangOpts.FixedPoint is set.
 997       if (!(LangOpts.Half || LangOpts.FixedPoint))
 998         break;
 999       if (isIntegerLiteral()) break;  // Error for integer constant.
1000       if (HasSize)
1001         break;
1002       HasSize = true;
1003       isHalf = true;
1004       continue;  // Success.
1005     case 'f':      // FP Suffix for "float"
1006     case 'F':
1007       if (!isFPConstant) break;  // Error for integer constant.
1008       if (HasSize)
1009         break;
1010       HasSize = true;
1011
1012       // CUDA host and device may have different _Float16 support, therefore
1013       // allows f16 literals to avoid false alarm.
1014       // When we compile for OpenMP target offloading on NVPTX, f16 suffix
1015       // should also be supported.
1016       // ToDo: more precise check for CUDA.
1017       // TODO: AMDGPU might also support it in the future.
1018       if ((Target.hasFloat16Type() || LangOpts.CUDA ||
1019            (LangOpts.OpenMPIsTargetDevice && Target.getTriple().isNVPTX())) &&
1020           s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') {
1021         s += 2; // success, eat up 2 characters.
1022         isFloat16 = true;
1023         continue;
1024       }
1025
1026       isFloat = true;
1027       continue;  // Success.
1028     case 'q':    // FP Suffix for "__float128"
1029     case 'Q':
1030       if (!isFPConstant) break;  // Error for integer constant.
1031       if (HasSize)
1032         break;
1033       HasSize = true;
1034       isFloat128 = true;
1035       continue;  // Success.
1036     case 'u':
1037     case 'U':
1038       if (isFPConstant) break;  // Error for floating constant.
1039       if (isUnsigned) break;    // Cannot be repeated.
1040       isUnsigned = true;
1041       continue;  // Success.
1042     case 'l':
1043     case 'L':
1044       if (HasSize)
1045         break;
1046       HasSize = true;
1047
1048       // Check for long long.  The L's need to be adjacent and the same case.
           // Reading s[1] relies on the readable end byte described above.
1049       if (s[1] == s[0]) {
1050         assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
1051         if (isFPConstant) break;        // long long invalid for floats.
1052         isLongLong = true;
1053         ++s;  // Eat both of them.
1054       } else {
1055         isLong = true;
1056       }
1057       continue; // Success.
1058     case 'z':
1059     case 'Z':
1060       if (isFPConstant)
1061         break; // Invalid for floats.
1062       if (HasSize)
1063         break;
1064       HasSize = true;
1065       isSizeT = true;
1066       continue;
1067     case 'i':
1068     case 'I':
1069       if (LangOpts.MicrosoftExt && !isFPConstant) {
1070         // Allow i8, i16, i32, and i64. First, look ahead and check if
1071         // suffixes are Microsoft integers and not the imaginary unit.
1072         uint8_t Bits = 0;
1073         size_t ToSkip = 0;
1074         switch (s[1]) {
1075         case '8': // i8 suffix
1076           Bits = 8;
1077           ToSkip = 2;
1078           break;
1079         case '1':
1080           if (s[2] == '6') { // i16 suffix
1081             Bits = 16;
1082             ToSkip = 3;
1083           }
1084           break;
1085         case '3':
1086           if (s[2] == '2') { // i32 suffix
1087             Bits = 32;
1088             ToSkip = 3;
1089           }
1090           break;
1091         case '6':
1092           if (s[2] == '4') { // i64 suffix
1093             Bits = 64;
1094             ToSkip = 3;
1095           }
1096           break;
1097         default:
1098           break;
1099         }
1100         if (Bits) {
               // Both breaks below leave the outer switch and therefore end the
               // suffix loop. On success s already points past the suffix, so
               // any trailing characters are caught by the s != ThisTokEnd check
               // after the loop.
1101           if (HasSize)
1102             break;
1103           HasSize = true;
1104           MicrosoftInteger = Bits;
1105           s += ToSkip;
1106           assert(s <= ThisTokEnd && "didn't maximally munch?");
1107           break;
1108         }
1109       }
           // Not a Microsoft integer suffix: handle 'i'/'I' like 'j'/'J'.
1110       [[fallthrough]];
1111     case 'j':
1112     case 'J':
1113       if (isImaginary) break;   // Cannot be repeated.
1114       isImaginary = true;
1115       continue;  // Success.
1116     case 'w':
1117     case 'W':
1118       if (isFPConstant)
1119         break; // Invalid for floats.
1120       if (HasSize)
1121         break; // Invalid if we already have a size for the literal.
1122
1123       // wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
1124       // explicitly do not support the suffix in C++ as an extension because a
1125       // library-based UDL that resolves to a library type may be more
1126       // appropriate there.
1127       if (!LangOpts.CPlusPlus && ((s[0] == 'w' && s[1] == 'b') ||
1128           (s[0] == 'W' && s[1] == 'B'))) {
1129         isBitInt = true;
1130         HasSize = true;
1131         ++s; // Skip both characters (2nd char skipped on continue).
1132         continue; // Success.
1133       }
           // Otherwise control runs off the end of the switch and the loop stops.
1134     }
1135     // If we reached here, there was an error or a ud-suffix.
1136     break;
1137   }
1138
1139   // "i", "if", and "il" are user-defined suffixes in C++1y.
       // That is why the ud-suffix check also runs when the whole suffix was
       // consumed but isImaginary is set.
1140   if (s != ThisTokEnd || isImaginary) {
1141     // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
1142     expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
1143     if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
           // When isImaginary is set the flags parsed so far are kept and
           // saw_ud_suffix is set in addition.
1144       if (!isImaginary) {
1145         // Any suffix pieces we might have parsed are actually part of the
1146         // ud-suffix.
1147         isLong = false;
1148         isUnsigned = false;
1149         isLongLong = false;
1150         isSizeT = false;
1151         isFloat = false;
1152         isFloat16 = false;
1153         isHalf = false;
1154         isImaginary = false;
1155         isBitInt = false;
1156         MicrosoftInteger = 0;
1157         saw_fixed_point_suffix = false;
1158         isFract = false;
1159         isAccum = false;
1160       }
1161
1162       saw_ud_suffix = true;
1163       return;
1164     }
1165
1166     if (s != ThisTokEnd) {
1167       // Report an error if there are any.
           // The diagnostic points at the start of the suffix and receives the
           // whole suffix text plus a selector: 2 for a fixed-point constant,
           // otherwise 1 for a floating constant and 0 for an integer.
1168       Diags.Report(Lexer::AdvanceToTokenCharacter(
1169                        TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
1170                    diag::err_invalid_suffix_constant)
1171           << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
1172           << (isFixedPointConstant ? 2 : isFPConstant);
1173       hadError = true;
1174     }
1175   }
1176
       // If the lookahead saw a fixed-point suffix letter and no error occurred,
       // the loop must have consumed it as 'r'/'R' or 'k'/'K'.
1177   if (!hadError && saw_fixed_point_suffix) {
1178     assert(isFract || isAccum);
1179   }
1180 }
1181
1182 /// ParseDecimalOrOctalCommon - This method is called for decimal or octal
1183 /// numbers. It issues an error for illegal digits, and handles floating point
1184 /// parsing. If it detects a floating point number, the radix is set to 10.
     ///
     /// In the callers visible in this file, \c s points just past the leading
     /// run of digits on entry. On success it is left after the optional
     /// fraction and exponent. On error a diagnostic is reported and hadError is
     /// set.
     ///
     /// \param TokLoc Location of the token start, used to place diagnostics.
1185 void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
1186   assert((radix == 8 || radix == 10) && "Unexpected radix");
1187
1188   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
1189   // the code is using an incorrect base.
       // No error is raised when the rest of the token is a valid ud-suffix.
       // The diagnostic receives the offending character and a selector that is
       // 1 for radix 8 and 0 otherwise.
1190   if (isHexDigit(*s) && *s != 'e' && *s != 'E' &&
1191       !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
1192     Diags.Report(
1193         Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
1194         diag::err_invalid_digit)
1195         << StringRef(s, 1) << (radix == 8 ? 1 : 0);
1196     hadError = true;
1197     return;
1198   }
1199
1200   if (*s == '.') {
         // A digit separator may not touch the period on either side.
1201     checkSeparator(TokLoc, s, CSK_AfterDigits);
1202     s++;
1203     radix = 10;
1204     saw_period = true;
1205     checkSeparator(TokLoc, s, CSK_BeforeDigits);
1206     s = SkipDigits(s); // Skip the digits after the period.
1207   }
1208   if (*s == 'e' || *s == 'E') { // exponent
1209     checkSeparator(TokLoc, s, CSK_AfterDigits);
         // Remember where the exponent letter is for the diagnostic below.
1210     const char *Exponent = s;
1211     s++;
1212     radix = 10;
1213     saw_exponent = true;
1214     if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
1215     const char *first_non_digit = SkipDigits(s);
1216     if (containsDigits(s, first_non_digit)) {
1217       checkSeparator(TokLoc, s, CSK_BeforeDigits);
1218       s = first_non_digit;
1219     } else {
           // Only report the missing exponent digits if nothing was diagnosed
           // earlier for this token.
1220       if (!hadError) {
1221         Diags.Report(Lexer::AdvanceToTokenCharacter(
1222                          TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1223                      diag::err_exponent_has_no_digits);
1224         hadError = true;
1225       }
1226       return;
1227     }
1228   }
1229 }
1230
1231 /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1232 /// suffixes as ud-suffixes, because the diagnostic experience is better if we
1233 /// treat it as an invalid suffix.
     ///
     /// \param LangOpts Language options; the C++11, C++14 and C++20 flags decide
     ///        which suffixes are accepted.
     /// \param Suffix Candidate suffix text.
     /// \returns true if \p Suffix is accepted as a ud-suffix. Always false
     ///          before C++11 and for an empty suffix.
1234 bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1235                                            StringRef Suffix) {
1236   if (!LangOpts.CPlusPlus11 || Suffix.empty())
1237     return false;
1238
1239   // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
1240   if (Suffix[0] == '_')
1241     return true;
1242
1243   // In C++11, there are no library suffixes.
1244   if (!LangOpts.CPlusPlus14)
1245     return false;
1246
1247   // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
1248   // Per tweaked N3660, "il", "i", and "if" are also used in the library.
1249   // In C++20, "d" and "y" are used in the library.
1250   return llvm::StringSwitch<bool>(Suffix)
1251       .Cases("h", "min", "s", true)
1252       .Cases("ms", "us", "ns", true)
1253       .Cases("il", "i", "if", true)
1254       .Cases("d", "y", LangOpts.CPlusPlus20)
1255       .Default(false);
1256 }
1257
     /// Diagnose a digit separator that sits next to \p Pos.
     ///
     /// \param TokLoc Location of the token start, used to place the diagnostic.
     /// \param Pos Position inside the token to check around.
     /// \param IsAfterDigits With CSK_AfterDigits the character just before
     ///        \p Pos is inspected, otherwise the character at \p Pos itself.
     ///
     /// When the inspected character is a digit separator,
     /// err_digit_separator_not_between_digits is reported at that character and
     /// hadError is set. Nothing is inspected when that character would lie
     /// before ThisTokBegin or at ThisTokEnd.
1258 void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
1259                                           const char *Pos,
1260                                           CheckSeparatorKind IsAfterDigits) {
1261   if (IsAfterDigits == CSK_AfterDigits) {
1262     if (Pos == ThisTokBegin)
1263       return;
         // Step back to the last character of the preceding digit run.
1264     --Pos;
1265   } else if (Pos == ThisTokEnd)
1266     return;
1267
1268   if (isDigitSeparator(*Pos)) {
1269     Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
1270                                                 LangOpts),
1271                  diag::err_digit_separator_not_between_digits)
1272         << IsAfterDigits;
1273     hadError = true;
1274   }
1275 }
1276
1277 /// ParseNumberStartingWithZero - This method is called when the first character
1278 /// of the number is found to be a zero.  This means it is either an octal
1279 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
1280 /// a floating point number (01239.123e4).  Eat the prefix, determining the
1281 /// radix etc.
     ///
     /// Sets radix and may advance DigitsBegin past the prefix. On error a
     /// diagnostic is reported and hadError is set.
     ///
     /// \param TokLoc Location of the token start, used to place diagnostics.
1282 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
1283   assert(s[0] == '0' && "Invalid method call");
1284   s++;
1285
       // The character after the leading zero selects the radix.
1286   int c1 = s[0];
1287
1288   // Handle a hex number like 0x1234.
       // The prefix only counts when a hex digit or a period follows it.
1289   if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
1290     s++;
1291     assert(s < ThisTokEnd && "didn't maximally munch?");
1292     radix = 16;
1293     DigitsBegin = s;
1294     s = SkipHexDigits(s);
1295     bool HasSignificandDigits = containsDigits(DigitsBegin, s);
1296     if (s == ThisTokEnd) {
1297       // Done.
1298     } else if (*s == '.') {
1299       s++;
1300       saw_period = true;
1301       const char *floatDigitsBegin = s;
1302       s = SkipHexDigits(s);
1303       if (containsDigits(floatDigitsBegin, s))
1304         HasSignificandDigits = true;
1305       if (HasSignificandDigits)
1306         checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
1307     }
1308
         // Digits are required on at least one side of the period.
1309     if (!HasSignificandDigits) {
1310       Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1311                                                   LangOpts),
1312                    diag::err_hex_constant_requires)
1313           << LangOpts.CPlusPlus << 1;
1314       hadError = true;
1315       return;
1316     }
1317
1318     // A binary exponent can appear with or without a '.'. If dotted, the
1319     // binary exponent is required.
1320     if (*s == 'p' || *s == 'P') {
1321       checkSeparator(TokLoc, s, CSK_AfterDigits);
           // Remember where the exponent letter is for the diagnostic below.
1322       const char *Exponent = s;
1323       s++;
1324       saw_exponent = true;
1325       if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
1326       const char *first_non_digit = SkipDigits(s);
1327       if (!containsDigits(s, first_non_digit)) {
             // Only report if nothing was diagnosed earlier for this token.
1328         if (!hadError) {
1329           Diags.Report(Lexer::AdvanceToTokenCharacter(
1330                            TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1331                        diag::err_exponent_has_no_digits);
1332           hadError = true;
1333         }
1334         return;
1335       }
1336       checkSeparator(TokLoc, s, CSK_BeforeDigits);
1337       s = first_non_digit;
1338
           // These reports do not set hadError.
1339       if (!LangOpts.HexFloats)
1340         Diags.Report(TokLoc, LangOpts.CPlusPlus
1341                                  ? diag::ext_hex_literal_invalid
1342                                  : diag::ext_hex_constant_invalid);
1343       else if (LangOpts.CPlusPlus17)
1344         Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
1345     } else if (saw_period) {
           // A period without a binary exponent is an error.
1346       Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1347                                                   LangOpts),
1348                    diag::err_hex_constant_requires)
1349           << LangOpts.CPlusPlus << 0;
1350       hadError = true;
1351     }
1352     return;
1353   }
1354
1355   // Handle simple binary numbers 0b01010
1356   if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
1357     // 0b101010 is a C++1y / GCC extension.
1358     Diags.Report(TokLoc, LangOpts.CPlusPlus14
1359                              ? diag::warn_cxx11_compat_binary_literal
1360                          : LangOpts.CPlusPlus ? diag::ext_binary_literal_cxx14
1361                                               : diag::ext_binary_literal);
1362     ++s;
1363     assert(s < ThisTokEnd && "didn't maximally munch?");
1364     radix = 2;
1365     DigitsBegin = s;
1366     s = SkipBinaryDigits(s);
1367     if (s == ThisTokEnd) {
1368       // Done.
1369     } else if (isHexDigit(*s) &&
1370                !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
           // A hex digit after the binary digits is an invalid digit unless the
           // rest of the token is a valid ud-suffix.
1371       Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1372                                                   LangOpts),
1373                    diag::err_invalid_digit)
1374           << StringRef(s, 1) << 2;
1375       hadError = true;
1376     }
1377     // Other suffixes will be diagnosed by the caller.
1378     return;
1379   }
1380
1381   // For now, the radix is set to 8. If we discover that we have a
1382   // floating point constant, the radix will change to 10. Octal floating
1383   // point constants are not permitted (only decimal and hexadecimal).
1384   radix = 8;
1385   const char *PossibleNewDigitStart = s;
1386   s = SkipOctalDigits(s);
1387   // When the value is 0 followed by a suffix (like 0wb), we want to leave 0
1388   // as the start of the digits. So if skipping octal digits does not skip
1389   // anything, we leave the digit start where it was.
1390   if (s != PossibleNewDigitStart)
1391     DigitsBegin = PossibleNewDigitStart;
1392
1393   if (s == ThisTokEnd)
1394     return; // Done, simple octal number like 01234
1395
1396   // If we have some other non-octal digit that *is* a decimal digit, see if
1397   // this is part of a floating point number like 094.123 or 09e1.
1398   if (isDigit(*s)) {
1399     const char *EndDecimal = SkipDigits(s);
1400     if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
1401       s = EndDecimal;
1402       radix = 10;
1403     }
1404   }
1405
       // The shared routine handles an invalid digit, fraction or exponent here.
1406   ParseDecimalOrOctalCommon(TokLoc);
1407 }
1408
     /// Decide from the digit count alone whether a number with \p NumDigits
     /// digits in base \p Radix is certain to fit in 64 bits. A false result
     /// only means that the count does not prove it.
     ///
     /// \param Radix One of 2, 8, 10 or 16; any other value is unreachable.
     /// \param NumDigits Number of digits in the number.
1409 static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
1410   switch (Radix) {
1411   case 2:
1412     return NumDigits <= 64;
1413   case 8:
1414     return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
1415   case 10:
1416     return NumDigits <= 19; // floor(log10(2^64))
1417   case 16:
1418     return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
1419   default:
1420     llvm_unreachable("impossible Radix");
1421   }
1422 }
1423
1424 /// GetIntegerValue - Convert this numeric literal value to an APInt that
1425 /// matches Val's input width.  If there is an overflow, set Val to the low bits
1426 /// of the result and return true.  Otherwise, return false.
     ///
     /// The digits between DigitsBegin and SuffixBegin are read in the current
     /// radix; digit separators are skipped.
1427 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
1428   // Fast path: Compute a conservative bound on the maximum number of
1429   // bits per digit in this radix. If we can't possibly overflow a
1430   // uint64 based on that bound then do the simple conversion to
1431   // integer. This avoids the expensive overflow checking below, and
1432   // handles the common cases that matter (small decimal integers and
1433   // hex/octal values which don't overflow).
       // NumDigits is a character count, so digit separators are included; that
       // only makes the bound more conservative.
1434   const unsigned NumDigits = SuffixBegin - DigitsBegin;
1435   if (alwaysFitsInto64Bits(radix, NumDigits)) {
1436     uint64_t N = 0;
1437     for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
1438       if (!isDigitSeparator(*Ptr))
1439         N = N * radix + llvm::hexDigitValue(*Ptr);
1440
1441     // This will truncate the value to Val's input width. Simply check
1442     // for overflow by comparing.
1443     Val = N;
1444     return Val.getZExtValue() != N;
1445   }
1446
       // Slow path: accumulate in Val's own width and check every step.
1447   Val = 0;
1448   const char *Ptr = DigitsBegin;
1449
1450   llvm::APInt RadixVal(Val.getBitWidth(), radix);
1451   llvm::APInt CharVal(Val.getBitWidth(), 0);
1452   llvm::APInt OldVal = Val;
1453
1454   bool OverflowOccurred = false;
1455   while (Ptr < SuffixBegin) {
1456     if (isDigitSeparator(*Ptr)) {
1457       ++Ptr;
1458       continue;
1459     }
1460
1461     unsigned C = llvm::hexDigitValue(*Ptr++);
1462
1463     // If this letter is out of bound for this radix, reject it.
1464     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1465
1466     CharVal = C;
1467
1468     // Add the digit to the value in the appropriate radix.  If adding in digits
1469     // made the value smaller, then this overflowed.
1470     OldVal = Val;
1471
1472     // Multiply by radix, did overflow occur on the multiply?
1473     Val *= RadixVal;
1474     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
1475
1476     // Add value, did overflow occur on the value?
1477     //   (a + b) ult b  <=> overflow
1478     Val += CharVal;
1479     OverflowOccurred |= Val.ult(CharVal);
1480   }
1481   return OverflowOccurred;
1482 }
1483
     /// Convert the literal, without its suffix and without digit separators,
     /// into \p Result using round-to-nearest, ties-to-even.
     ///
     /// \param Result Receives the converted value.
     /// \returns The status of the conversion, or APFloat::opInvalidOp when the
     ///          conversion produced an error.
1484 llvm::APFloat::opStatus
1485 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
1486   using llvm::APFloat;
1487
       // Length of the spelling up to the suffix, limited to the token length.
1488   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
1489
1490   llvm::SmallString<16> Buffer;
1491   StringRef Str(ThisTokBegin, n);
       // Only copy the spelling when there is a digit separator to drop.
1492   if (Str.contains('\'')) {
1493     Buffer.reserve(n);
1494     std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
1495                         &isDigitSeparator);
1496     Str = Buffer;
1497   }
1498
1499   auto StatusOrErr =
1500       Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
1501   assert(StatusOrErr && "Invalid floating point representation");
       // takeError() is called in either case; an error maps to opInvalidOp.
1502   return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
1503                                                : APFloat::opInvalidOp;
1504 }
1505
     /// Returns true if \p c is one of the letters 'p', 'P', 'e' or 'E'.
     /// NOTE(review): the radix is not taken into account, and 'e'/'E' are also
     /// hexadecimal digits, so on its own this cannot locate the exponent of a
     /// hexadecimal literal.
1506 static inline bool IsExponentPart(char c) {
1507   return c == 'p' || c == 'P' || c == 'e' || c == 'E';
1508 }
1509
     /// Convert this fixed-point literal into an unsigned integer that holds the
     /// literal's value multiplied by 2^Scale, i.e. with \p Scale fractional
     /// bits. Digits below the least significant bit are dropped by unsigned
     /// division.
     ///
     /// \param StoreVal Receives the result, truncated or zero-extended to the
     ///        bit width it already has.
     /// \param Scale Number of fractional bits in the result.
     /// \returns true if the value does not fit in StoreVal's width, or if the
     ///          exponent has too many digits or needs more bits than an
     ///          unsigned can describe; false otherwise.
1510 bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
1511   assert(radix == 16 || radix == 10);
1512
1513   // Find how many digits are needed to store the whole literal.
1514   unsigned NumDigits = SuffixBegin - DigitsBegin;
1515   if (saw_period) --NumDigits;
1516
1517   // Initial scan of the exponent if it exists
1518   bool ExpOverflowOccurred = false;
1519   bool NegativeExponent = false;
1520   const char *ExponentBegin = nullptr;
1521   uint64_t Exponent = 0;
1522   int64_t BaseShift = 0;
1523   if (saw_exponent) {
1524     const char *Ptr = DigitsBegin;
1525
         // Find the exponent letter. In a hexadecimal literal 'e'/'E' are
         // significand digits, so a hex digit must not end the scan there;
         // otherwise e.g. "0xe.8p0k" would stop at the leading 'e' and pass
         // ".8p0" to the decimal exponent parser below.
1526     while (!IsExponentPart(*Ptr) || (radix == 16 && isHexDigit(*Ptr)))
           ++Ptr;
1527     ExponentBegin = Ptr;
1528     ++Ptr;
1529     NegativeExponent = *Ptr == '-';
1530     if (NegativeExponent) ++Ptr;
1531
         // The exponent digits run up to the suffix. They are only parsed when
         // their count guarantees that they fit in 64 bits.
1532     unsigned NumExpDigits = SuffixBegin - Ptr;
1533     if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
1534       llvm::StringRef ExpStr(Ptr, NumExpDigits);
1535       llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
1536       Exponent = ExpInt.getZExtValue();
1537     } else {
1538       ExpOverflowOccurred = true;
1539     }
1540
1541     if (NegativeExponent) BaseShift -= Exponent;
1542     else BaseShift += Exponent;
1543   }
1544
1545   // Number of bits needed for decimal literal is
1546   //   ceil(NumDigits * log2(10))       Integral part
1547   // + Scale                            Fractional part
1548   // + ceil(Exponent * log2(10))        Exponent
1549   // --------------------------------------------------
1550   //   ceil((NumDigits + Exponent) * log2(10)) + Scale
1551   //
1552   // But for simplicity in handling integers, we can round up log2(10) to 4,
1553   // making:
1554   // 4 * (NumDigits + Exponent) + Scale
1555   //
1556   // Number of digits needed for hexadecimal literal is
1557   //   4 * NumDigits                    Integral part
1558   // + Scale                            Fractional part
1559   // + Exponent                         Exponent
1560   // --------------------------------------------------
1561   //   (4 * NumDigits) + Scale + Exponent
1562   uint64_t NumBitsNeeded;
1563   if (radix == 10)
1564     NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
1565   else
1566     NumBitsNeeded = 4 * NumDigits + Exponent + Scale;
1567
1568   if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
1569     ExpOverflowOccurred = true;
1570   llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);
1571
1572   bool FoundDecimal = false;
1573
       // Read the significand as an integer, ignoring the period, and count the
       // digits that follow the period.
1574   int64_t FractBaseShift = 0;
1575   const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
1576   for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
1577     if (*Ptr == '.') {
1578       FoundDecimal = true;
1579       continue;
1580     }
1581
1582     // Normal reading of an integer
1583     unsigned C = llvm::hexDigitValue(*Ptr);
1584     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1585
1586     Val *= radix;
1587     Val += C;
1588
1589     if (FoundDecimal)
1590       // Keep track of how much we will need to adjust this value by from the
1591       // number of digits past the radix point.
1592       --FractBaseShift;
1593   }
1594
1595   // For a radix of 16, we will be multiplying by 2 instead of 16.
1596   if (radix == 16) FractBaseShift *= 4;
1597   BaseShift += FractBaseShift;
1598
1599   Val <<= Scale;
1600
       // Apply the combined exponent and fraction shift in the exponent's base:
       // 2 for a hexadecimal literal, 10 for a decimal one.
1601   uint64_t Base = (radix == 16) ? 2 : 10;
1602   if (BaseShift > 0) {
1603     for (int64_t i = 0; i < BaseShift; ++i) {
1604       Val *= Base;
1605     }
1606   } else if (BaseShift < 0) {
1607     for (int64_t i = BaseShift; i < 0 && !Val.isZero(); ++i)
1608       Val = Val.udiv(Base);
1609   }
1610
       // Fit the result into StoreVal's width and note whether bits were lost.
1611   bool IntOverflowOccurred = false;
1612   auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
1613   if (Val.getBitWidth() > StoreVal.getBitWidth()) {
1614     IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth()));
1615     StoreVal = Val.trunc(StoreVal.getBitWidth());
1616   } else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
1617     IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
1618     StoreVal = Val.zext(StoreVal.getBitWidth());
1619   } else {
1620     StoreVal = Val;
1621   }
1622
1623   return IntOverflowOccurred || ExpOverflowOccurred;
1624 }
1625
1626 /// \verbatim
1627 ///       user-defined-character-literal: [C++11 lex.ext]
1628 ///         character-literal ud-suffix
1629 ///       ud-suffix:
1630 ///         identifier
1631 ///       character-literal: [C++11 lex.ccon]
1632 ///         ' c-char-sequence '
1633 ///         u' c-char-sequence '
1634 ///         U' c-char-sequence '
1635 ///         L' c-char-sequence '
1636 ///         u8' c-char-sequence ' [C++1z lex.ccon]
1637 ///       c-char-sequence:
1638 ///         c-char
1639 ///         c-char-sequence c-char
1640 ///       c-char:
1641 ///         any member of the source character set except the single-quote ',
1642 ///           backslash \, or new-line character
1643 ///         escape-sequence
1644 ///         universal-character-name
1645 ///       escape-sequence:
1646 ///         simple-escape-sequence
1647 ///         octal-escape-sequence
1648 ///         hexadecimal-escape-sequence
1649 ///       simple-escape-sequence:
1650 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1651 ///       octal-escape-sequence:
1652 ///         \ octal-digit
1653 ///         \ octal-digit octal-digit
1654 ///         \ octal-digit octal-digit octal-digit
1655 ///       hexadecimal-escape-sequence:
1656 ///         \x hexadecimal-digit
1657 ///         hexadecimal-escape-sequence hexadecimal-digit
1658 ///       universal-character-name: [C++11 lex.charset]
1659 ///         \u hex-quad
1660 ///         \U hex-quad hex-quad
1661 ///       hex-quad:
1662 ///         hex-digit hex-digit hex-digit hex-digit
1663 /// \endverbatim
1664 ///
1665 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
1666                                      SourceLocation Loc, Preprocessor &PP,
1667                                      tok::TokenKind kind) {
       // Decodes the spelling [begin, end) of a character-literal token of the
       // given kind. On return Value holds the literal's value, IsMultiChar says
       // whether more than one character was found, and a trailing ud-suffix (if
       // any) is in UDSuffixBuf with its offset in UDSuffixOffset. Problems are
       // reported through PP at Loc, and hard errors also set HadError.
       //
1668   // At this point we know that the character matches the regex
       // "(L|u|U|u8)?'.*'", possibly followed by a ud-suffix.
1669   HadError = false;
1670
1671   Kind = kind;
1672
1673   const char *TokBegin = begin;
1674
1675   // Skip over wide character determinant.
1676   if (Kind != tok::char_constant)
1677     ++begin;
       // The u8 prefix is two characters long, so skip one more.
1678   if (Kind == tok::utf8_char_constant)
1679     ++begin;
1680
1681   // Skip over the entry quote.
1682   if (begin[0] != '\'') {
1683     PP.Diag(Loc, diag::err_lexing_char);
1684     HadError = true;
1685     return;
1686   }
1687
1688   ++begin;
1689
1690   // Remove an optional ud-suffix.
1691   if (end[-1] != '\'') {
1692     const char *UDSuffixEnd = end;
         // Walk back until 'end' sits just past the closing quote.
1693     do {
1694       --end;
1695     } while (end[-1] != '\'');
1696     // FIXME: Don't bother with this if !tok.hasUCN().
1697     expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
1698     UDSuffixOffset = end - TokBegin;
1699   }
1700
1701   // Trim the ending quote.
1702   assert(end != begin && "Invalid token lexed");
1703   --end;
1704
1705   // FIXME: The "Value" is an uint64_t so we can handle char literals of
1706   // up to 64-bits.
1707   // FIXME: This extensively assumes that 'char' is 8-bits.
1708   assert(PP.getTargetInfo().getCharWidth() == 8 &&
1709          "Assumes char is 8 bits");
1710   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
1711          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
1712          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
1713   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
1714          "Assumes sizeof(wchar) on target is <= 64");
1715
       // One slot per source byte is an upper bound on the number of decoded
       // characters. data() is used instead of &front() so that an empty literal
       // body does not call front() on an empty vector.
1716   SmallVector<uint32_t, 4> codepoint_buffer;
1717   codepoint_buffer.resize(end - begin);
1718   uint32_t *buffer_begin = codepoint_buffer.data();
1719   uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
1720
1721   // Unicode escapes representing characters that cannot be correctly
1722   // represented in a single code unit are disallowed in character literals
1723   // by this implementation.
1724   uint32_t largest_character_for_kind;
1725   if (tok::wide_char_constant == Kind) {
         // Shifting a 32-bit value by 32 or more bits is undefined, and the assert
         // above allows wchar_t to be up to 64 bits wide, so saturate at the
         // largest value a uint32_t code point can hold.
         const unsigned WCharWidth = PP.getTargetInfo().getWCharWidth();
1726     largest_character_for_kind =
1727         WCharWidth >= 32 ? 0xFFFFFFFFu : 0xFFFFFFFFu >> (32 - WCharWidth);
1728   } else if (tok::utf8_char_constant == Kind) {
1729     largest_character_for_kind = 0x7F;
1730   } else if (tok::utf16_char_constant == Kind) {
1731     largest_character_for_kind = 0xFFFF;
1732   } else if (tok::utf32_char_constant == Kind) {
1733     largest_character_for_kind = 0x10FFFF;
1734   } else {
1735     largest_character_for_kind = 0x7Fu;
1736   }
1737
       // Decode the body; buffer_begin always points at the next free slot.
1738   while (begin != end) {
1739     // Is this a span of non-escape characters?
1740     if (begin[0] != '\\') {
1741       char const *start = begin;
1742       do {
1743         ++begin;
1744       } while (begin != end && *begin != '\\');
1745
           // Remember where this span started (input and output) so that the
           // fallback and the range check below can revisit it.
1746       char const *tmp_in_start = start;
1747       uint32_t *tmp_out_start = buffer_begin;
1748       llvm::ConversionResult res =
1749           llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
1750                              reinterpret_cast<llvm::UTF8 const *>(begin),
1751                              &buffer_begin, buffer_end, llvm::strictConversion);
1752       if (res != llvm::conversionOK) {
1753         // If we see bad encoding for unprefixed character literals, warn and
1754         // simply copy the byte values, for compatibility with gcc and
1755         // older versions of clang.
1756         bool NoErrorOnBadEncoding = isOrdinary();
1757         unsigned Msg = diag::err_bad_character_encoding;
1758         if (NoErrorOnBadEncoding)
1759           Msg = diag::warn_bad_character_encoding;
1760         PP.Diag(Loc, Msg);
1761         if (NoErrorOnBadEncoding) {
1762           start = tmp_in_start;
1763           buffer_begin = tmp_out_start;
1764           for (; start != begin; ++start, ++buffer_begin)
1765             *buffer_begin = static_cast<uint8_t>(*start);
1766         } else {
1767           HadError = true;
1768         }
1769       } else {
             // Every character decoded from this span must fit the literal kind.
1770         for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
1771           if (*tmp_out_start > largest_character_for_kind) {
1772             HadError = true;
1773             PP.Diag(Loc, diag::err_character_too_large);
1774           }
1775         }
1776       }
1777
1778       continue;
1779     }
1780     // Is this a Universal Character Name escape?
1781     if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
1782       unsigned short UcnLen = 0;
1783       if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
1784                             FullSourceLoc(Loc, PP.getSourceManager()),
1785                             &PP.getDiagnostics(), PP.getLangOpts(), true)) {
1786         HadError = true;
1787       } else if (*buffer_begin > largest_character_for_kind) {
1788         HadError = true;
1789         PP.Diag(Loc, diag::err_character_too_large);
1790       }
1791
1792       ++buffer_begin;
1793       continue;
1794     }
         // Any other escape is handled by ProcessCharEscape.
1795     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
1796     uint64_t result =
1797         ProcessCharEscape(TokBegin, begin, end, HadError,
1798                           FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
1799                           &PP.getDiagnostics(), PP.getLangOpts(),
1800                           StringLiteralEvalMethod::Evaluated);
1801     *buffer_begin++ = result;
1802   }
1803
1804   unsigned NumCharsSoFar = buffer_begin - codepoint_buffer.data();
1805
       // Multi-character literals are a warning for ordinary literals and an
       // error for every prefixed kind.
1806   if (NumCharsSoFar > 1) {
1807     if (isOrdinary() && NumCharsSoFar == 4)
1808       PP.Diag(Loc, diag::warn_four_char_character_literal);
1809     else if (isOrdinary())
1810       PP.Diag(Loc, diag::warn_multichar_character_literal);
1811     else {
1812       PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1);
1813       HadError = true;
1814     }
1815     IsMultiChar = true;
1816   } else {
1817     IsMultiChar = false;
1818   }
1819
1820   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1821
1822   // Narrow character literals act as though their value is concatenated
1823   // in this implementation, but warn on overflow.
1824   bool multi_char_too_long = false;
1825   if (isOrdinary() && isMultiChar()) {
1826     LitVal = 0;
1827     for (size_t i = 0; i < NumCharsSoFar; ++i) {
1828       // check for enough leading zeros to shift into
1829       multi_char_too_long |= (LitVal.countl_zero() < 8);
1830       LitVal <<= 8;
1831       LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1832     }
1833   } else if (NumCharsSoFar > 0) {
1834     // otherwise just take the last character
1835     LitVal = buffer_begin[-1];
1836   }
1837
1838   if (!HadError && multi_char_too_long) {
1839     PP.Diag(Loc, diag::warn_char_constant_too_large);
1840   }
1841
1842   // Transfer the value from APInt to uint64_t
1843   Value = LitVal.getZExtValue();
1844
1845   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1846   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
1847   // character constants are not sign extended in this implementation:
1848   // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
1849   if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&
1850       PP.getLangOpts().CharIsSigned)
1851     Value = (signed char)Value;
1852 }
1853
1854 /// \verbatim
1855 ///       string-literal: [C++0x lex.string]
1856 ///         encoding-prefix " [s-char-sequence] "
1857 ///         encoding-prefix R raw-string
1858 ///       encoding-prefix:
1859 ///         u8
1860 ///         u
1861 ///         U
1862 ///         L
1863 ///       s-char-sequence:
1864 ///         s-char
1865 ///         s-char-sequence s-char
1866 ///       s-char:
1867 ///         any member of the source character set except the double-quote ",
1868 ///           backslash \, or new-line character
1869 ///         escape-sequence
1870 ///         universal-character-name
1871 ///       raw-string:
1872 ///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
1873 ///       r-char-sequence:
1874 ///         r-char
1875 ///         r-char-sequence r-char
1876 ///       r-char:
1877 ///         any member of the source character set, except a right parenthesis )
1878 ///           followed by the initial d-char-sequence (which may be empty)
1879 ///           followed by a double quote ".
1880 ///       d-char-sequence:
1881 ///         d-char
1882 ///         d-char-sequence d-char
1883 ///       d-char:
1884 ///         any member of the basic source character set except:
1885 ///           space, the left parenthesis (, the right parenthesis ),
1886 ///           the backslash \, and the control characters representing horizontal
1887 ///           tab, vertical tab, form feed, and newline.
1888 ///       escape-sequence: [C++0x lex.ccon]
1889 ///         simple-escape-sequence
1890 ///         octal-escape-sequence
1891 ///         hexadecimal-escape-sequence
1892 ///       simple-escape-sequence:
1893 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1894 ///       octal-escape-sequence:
1895 ///         \ octal-digit
1896 ///         \ octal-digit octal-digit
1897 ///         \ octal-digit octal-digit octal-digit
1898 ///       hexadecimal-escape-sequence:
1899 ///         \x hexadecimal-digit
1900 ///         hexadecimal-escape-sequence hexadecimal-digit
1901 ///       universal-character-name:
1902 ///         \u hex-quad
1903 ///         \U hex-quad hex-quad
1904 ///       hex-quad:
1905 ///         hex-digit hex-digit hex-digit hex-digit
1906 /// \endverbatim
1907 ///
1908 StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
1909                                          Preprocessor &PP,
1910                                          StringLiteralEvalMethod EvalMethod)
         // The source manager, language options, target info and diagnostics
         // engine are all taken from PP, so Diags is never null on this path.
         // The remaining members only get neutral starting values here.
1911     : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1912       Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
1913       MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1914       ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),
1915       Pascal(false) {
       // All of the actual parsing of StringToks happens in init().
1916   init(StringToks);
1917 }
1918
1919 void StringLiteralParser::init(ArrayRef<Token> StringToks){
       // Concatenates the string-literal tokens in StringToks into ResultBuf,
       // storing each character at the width selected by the literals' encoding
       // prefix. Along the way this sets Kind, CharByteWidth, Pascal and the
       // ud-suffix members. Any failure sets hadError and, when Diags is
       // non-null, is reported.
       //
1920   // The literal token may have come from an invalid source location (e.g. due
1921   // to a PCH error), in which case the token length will be 0.
1922   if (StringToks.empty() || StringToks[0].getLength() < 2)
1923     return DiagnoseLexingError(SourceLocation());
1924
1925   // Scan all of the string portions, remember the max individual token length,
1926   // computing a bound on the concatenated string length, and see whether any
1927   // piece is a wide-string.  If any of the string portions is a wide-string
1928   // literal, the result is a wide-string literal [C99 6.4.5p4].
1929   assert(!StringToks.empty() && "expected at least one token");
1930   MaxTokenLength = StringToks[0].getLength();
1931   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
1932   SizeBound = StringToks[0].getLength() - 2; // -2 for "".
1933   hadError = false;
1934
1935   // Determines the kind of string from the prefix
1936   Kind = tok::string_literal;
1937
1938   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
1939   for (const Token &Tok : StringToks) {
1940     if (Tok.getLength() < 2)
1941       return DiagnoseLexingError(Tok.getLocation());
1942
1943     // The string could be shorter than this if it needs cleaning, but this is a
1944     // reasonable bound, which is all we need.
         // (StringToks[0] was already counted before the loop, so it contributes
         // twice; that only makes the bound looser.)
1945     assert(Tok.getLength() >= 2 && "literal token is invalid!");
1946     SizeBound += Tok.getLength() - 2; // -2 for "".
1947
1948     // Remember maximum string piece length.
1949     if (Tok.getLength() > MaxTokenLength)
1950       MaxTokenLength = Tok.getLength();
1951
1952     // Remember if we see any wide or utf-8/16/32 strings.
1953     // Also check for illegal concatenations.
1954     if (isUnevaluated() && Tok.getKind() != tok::string_literal) {
           // An encoding prefix on an unevaluated string: diagnosed with a fix-it
           // that removes the prefix. It is an error only in C++26 mode and a
           // warning otherwise.
1955       if (Diags) {
1956         SourceLocation PrefixEndLoc = Lexer::AdvanceToTokenCharacter(
1957             Tok.getLocation(), getEncodingPrefixLen(Tok.getKind()), SM,
1958             Features);
1959         CharSourceRange Range =
1960             CharSourceRange::getCharRange({Tok.getLocation(), PrefixEndLoc});
1961         StringRef Prefix(SM.getCharacterData(Tok.getLocation()),
1962                          getEncodingPrefixLen(Tok.getKind()));
1963         Diags->Report(Tok.getLocation(),
1964                       Features.CPlusPlus26
1965                           ? diag::err_unevaluated_string_prefix
1966                           : diag::warn_unevaluated_string_prefix)
1967             << Prefix << Features.CPlusPlus << FixItHint::CreateRemoval(Range);
1968       }
1969       if (Features.CPlusPlus26)
1970         hadError = true;
1971     } else if (Tok.isNot(Kind) && Tok.isNot(tok::string_literal)) {
           // The first prefixed token decides Kind; a later token with a
           // different prefix cannot be concatenated with it.
1972       if (isOrdinary()) {
1973         Kind = Tok.getKind();
1974       } else {
1975         if (Diags)
1976           Diags->Report(Tok.getLocation(), diag::err_unsupported_string_concat);
1977         hadError = true;
1978       }
1979     }
1980   }
1981
1982   // Include space for the null terminator.
1983   ++SizeBound;
1984
1985   // TODO: K&R warning: "traditional C rejects string constant concatenation"
1986
1987   // Get the width in bytes of char/wchar_t/char16_t/char32_t
       // (getCharWidth() yields bits, hence the division by 8 below.)
1988   CharByteWidth = getCharWidth(Kind, Target);
1989   assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
1990   CharByteWidth /= 8;
1991
1992   // The output buffer size needs to be large enough to hold wide characters.
1993   // This is a worst-case assumption which basically corresponds to L"" "long".
1994   SizeBound *= CharByteWidth;
1995
1996   // Size the temporary buffer to hold the result string data.
1997   ResultBuf.resize(SizeBound);
1998
1999   // Likewise, but for each string piece.
2000   SmallString<512> TokenBuf;
2001   TokenBuf.resize(MaxTokenLength);
2002
2003   // Loop over all the strings, getting their spelling, and expanding them to
2004   // wide strings as appropriate.
2005   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
2006
2007   Pascal = false;
2008
       // Location of the token that supplied the ud-suffix kept in UDSuffixBuf;
       // used when diagnosing a later, different suffix.
2009   SourceLocation UDSuffixTokLoc;
2010
2011   for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
2012     const char *ThisTokBuf = &TokenBuf[0];
2013     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
2014     // that ThisTokBuf points to a buffer that is big enough for the whole token
2015     // and 'spelled' tokens can only shrink.
2016     bool StringInvalid = false;
2017     unsigned ThisTokLen =
2018       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
2019                          &StringInvalid);
2020     if (StringInvalid)
2021       return DiagnoseLexingError(StringToks[i].getLocation());
2022
         // ThisTokBegin stays at the start of the spelling (for diagnostics and
         // the ud-suffix offset) while ThisTokBuf is advanced as a cursor.
2023     const char *ThisTokBegin = ThisTokBuf;
2024     const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
2025
2026     // Remove an optional ud-suffix.
2027     if (ThisTokEnd[-1] != '"') {
2028       const char *UDSuffixEnd = ThisTokEnd;
           // Walk back until ThisTokEnd sits just past the closing quote.
2029       do {
2030         --ThisTokEnd;
2031       } while (ThisTokEnd[-1] != '"');
2032
2033       StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
2034
           // Only the first ud-suffix seen is stored; later ones are compared
           // against it.
2035       if (UDSuffixBuf.empty()) {
2036         if (StringToks[i].hasUCN())
2037           expandUCNs(UDSuffixBuf, UDSuffix);
2038         else
2039           UDSuffixBuf.assign(UDSuffix);
2040         UDSuffixToken = i;
2041         UDSuffixOffset = ThisTokEnd - ThisTokBuf;
2042         UDSuffixTokLoc = StringToks[i].getLocation();
2043       } else {
2044         SmallString<32> ExpandedUDSuffix;
2045         if (StringToks[i].hasUCN()) {
2046           expandUCNs(ExpandedUDSuffix, UDSuffix);
2047           UDSuffix = ExpandedUDSuffix;
2048         }
2049
2050         // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
2051         // result of a concatenation involving at least one user-defined-string-
2052         // literal, all the participating user-defined-string-literals shall
2053         // have the same ud-suffix.
2054         bool UnevaluatedStringHasUDL = isUnevaluated() && !UDSuffix.empty();
2055         if (UDSuffixBuf != UDSuffix || UnevaluatedStringHasUDL) {
2056           if (Diags) {
2057             SourceLocation TokLoc = StringToks[i].getLocation();
2058             if (UnevaluatedStringHasUDL) {
2059               Diags->Report(TokLoc, diag::err_unevaluated_string_udl)
2060                   << SourceRange(TokLoc, TokLoc);
2061             } else {
2062               Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
2063                   << UDSuffixBuf << UDSuffix
2064                   << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc);
2065             }
2066           }
2067           hadError = true;
2068         }
2069       }
2070     }
2071
2072     // Strip the end quote.
2073     --ThisTokEnd;
2074
2075     // TODO: Input character set mapping support.
2076
2077     // Skip marker for wide or unicode strings.
2078     if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
2079       ++ThisTokBuf;
2080       // Skip 8 of u8 marker for utf8 strings.
2081       if (ThisTokBuf[0] == '8')
2082         ++ThisTokBuf;
2083     }
2084
2085     // Check for raw string
2086     if (ThisTokBuf[0] == 'R') {
2087       if (ThisTokBuf[1] != '"') {
2088         // The file may have come from PCH and then changed after loading the
2089         // PCH; Fail gracefully.
2090         return DiagnoseLexingError(StringToks[i].getLocation());
2091       }
2092       ThisTokBuf += 2; // skip R"
2093
2094       // C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
2095       // characters.
2096       constexpr unsigned MaxRawStrDelimLen = 16;
2097
           // Scan over the delimiter, giving up if no '(' shows up within the
           // permitted delimiter length.
2098       const char *Prefix = ThisTokBuf;
2099       while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
2100              ThisTokBuf[0] != '(')
2101         ++ThisTokBuf;
2102       if (ThisTokBuf[0] != '(')
2103         return DiagnoseLexingError(StringToks[i].getLocation());
2104       ++ThisTokBuf; // skip '('
2105
2106       // Remove same number of characters from the end
           // (ThisTokBuf - Prefix covers the delimiter and the '(', which matches
           // the ')' and delimiter that close the raw string.)
2107       ThisTokEnd -= ThisTokBuf - Prefix;
2108       if (ThisTokEnd < ThisTokBuf)
2109         return DiagnoseLexingError(StringToks[i].getLocation());
2110
2111       // C++14 [lex.string]p4: A source-file new-line in a raw string literal
2112       // results in a new-line in the resulting execution string-literal.
2113       StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
2114       while (!RemainingTokenSpan.empty()) {
2115         // Split the string literal on \r\n boundaries.
             // NOTE(review): when no "\r\n" is left this presumably relies on
             // StringRef::substr clamping an out-of-range start, so that
             // BeforeCRLF is the whole remainder and AfterCRLF is empty; confirm
             // against the StringRef documentation.
2116         size_t CRLFPos = RemainingTokenSpan.find("\r\n");
2117         StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
2118         StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
2119
2120         // Copy everything before the \r\n sequence into the string literal.
2121         if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
2122           hadError = true;
2123
2124         // Point into the \n inside the \r\n sequence and operate on the
2125         // remaining portion of the literal.
2126         RemainingTokenSpan = AfterCRLF.substr(1);
2127       }
2128     } else {
2129       if (ThisTokBuf[0] != '"') {
2130         // The file may have come from PCH and then changed after loading the
2131         // PCH; Fail gracefully.
2132         return DiagnoseLexingError(StringToks[i].getLocation());
2133       }
2134       ++ThisTokBuf; // skip "
2135
2136       // Check if this is a pascal string
2137       if (!isUnevaluated() && Features.PascalStrings &&
2138           ThisTokBuf + 1 != ThisTokEnd && ThisTokBuf[0] == '\\' &&
2139           ThisTokBuf[1] == 'p') {
2140
2141         // If the \p sequence is found in the first token, we have a pascal string
2142         // Otherwise, if we already have a pascal string, ignore the first \p
             // In the first token only the backslash is skipped: the 'p' is copied
             // like any other character and its slot is overwritten with the
             // length once all tokens have been processed.
2143         if (i == 0) {
2144           ++ThisTokBuf;
2145           Pascal = true;
2146         } else if (Pascal)
2147           ThisTokBuf += 2;
2148       }
2149
2150       while (ThisTokBuf != ThisTokEnd) {
2151         // Is this a span of non-escape characters?
2152         if (ThisTokBuf[0] != '\\') {
2153           const char *InStart = ThisTokBuf;
2154           do {
2155             ++ThisTokBuf;
2156           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
2157
2158           // Copy the character span over.
2159           if (CopyStringFragment(StringToks[i], ThisTokBegin,
2160                                  StringRef(InStart, ThisTokBuf - InStart)))
2161             hadError = true;
2162           continue;
2163         }
2164         // Is this a Universal Character Name escape?
2165         if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
2166             ThisTokBuf[1] == 'N') {
2167           EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
2168                           ResultPtr, hadError,
2169                           FullSourceLoc(StringToks[i].getLocation(), SM),
2170                           CharByteWidth, Diags, Features);
2171           continue;
2172         }
2173         // Otherwise, this is a non-UCN escape character.  Process it.
2174         unsigned ResultChar =
2175             ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
2176                               FullSourceLoc(StringToks[i].getLocation(), SM),
2177                               CharByteWidth * 8, Diags, Features, EvalMethod);
2178
             // Store the escape's value as one code unit of the literal's width,
             // truncating it for 1- and 2-byte characters.
2179         if (CharByteWidth == 4) {
2180           // FIXME: Make the type of the result buffer correct instead of
2181           // using reinterpret_cast.
2182           llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
2183           *ResultWidePtr = ResultChar;
2184           ResultPtr += 4;
2185         } else if (CharByteWidth == 2) {
2186           // FIXME: Make the type of the result buffer correct instead of
2187           // using reinterpret_cast.
2188           llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
2189           *ResultWidePtr = ResultChar & 0xFFFF;
2190           ResultPtr += 2;
2191         } else {
2192           assert(CharByteWidth == 1 && "Unexpected char width");
2193           *ResultPtr++ = ResultChar & 0xFF;
2194         }
2195       }
2196     }
2197   }
2198
2199   assert((!Pascal || !isUnevaluated()) &&
2200          "Pascal string in unevaluated context");
2201   if (Pascal) {
         // Overwrite the first character slot with the string's length, which
         // excludes that slot itself.
2202     if (CharByteWidth == 4) {
2203       // FIXME: Make the type of the result buffer correct instead of
2204       // using reinterpret_cast.
2205       llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
2206       ResultWidePtr[0] = GetNumStringChars() - 1;
2207     } else if (CharByteWidth == 2) {
2208       // FIXME: Make the type of the result buffer correct instead of
2209       // using reinterpret_cast.
2210       llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
2211       ResultWidePtr[0] = GetNumStringChars() - 1;
2212     } else {
2213       assert(CharByteWidth == 1 && "Unexpected char width");
2214       ResultBuf[0] = GetNumStringChars() - 1;
2215     }
2216
2217     // Verify that pascal strings aren't too large.
2218     if (GetStringLength() > 256) {
2219       if (Diags)
2220         Diags->Report(StringToks.front().getLocation(),
2221                       diag::err_pascal_string_too_long)
2222           << SourceRange(StringToks.front().getLocation(),
2223                          StringToks.back().getLocation());
2224       hadError = true;
2225       return;
2226     }
2227   } else if (Diags) {
2228     // Complain if this string literal has too many characters.
         // The limit depends on the language mode: 65536 for C++, 4095 for C99
         // and 509 otherwise. Exceeding it does not set hadError.
2229     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
2230
2231     if (GetNumStringChars() > MaxChars)
2232       Diags->Report(StringToks.front().getLocation(),
2233                     diag::ext_string_too_long)
2234         << GetNumStringChars() << MaxChars
2235         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
2236         << SourceRange(StringToks.front().getLocation(),
2237                        StringToks.back().getLocation());
2238   }
2239 }
2240
2241 static const char *resyncUTF8(const char *Err, const char *End) {
       // Err points at the start of a malformed UTF-8 sequence inside a buffer
       // that ends at End. Returns the position just past that sequence: at
       // least one byte is consumed, followed by continuation bytes (10xxxxxx)
       // up to the length announced by the lead byte or the end of the buffer.
2242   if (Err != End) {
2243     const char *Limit =
2244         Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End - Err);
2245     for (++Err; Err != Limit && (*Err & 0xC0) == 0x80; ++Err)
2246       ;
2247     return Err;
       }
       // Nothing left to skip.
       return End;
2248 }
2249
2250 /// Converts Fragment, a run of UTF-8 bytes inside Tok's spelling (which
2251 /// starts at TokBegin), to the literal's character width and appends the
2252 /// result at ResultPtr.
     ///
     /// Returns false when the fragment was copied without an error. Badly
     /// encoded input is an error (returns true) for every literal kind except
     /// ordinary ones, which get a warning and have their bytes copied verbatim.
2253 bool StringLiteralParser::CopyStringFragment(const Token &Tok,
2254                                              const char *TokBegin,
2255                                              StringRef Fragment) {
2256   const llvm::UTF8 *BadByte;
2257   if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, BadByte))
2258     return false;
2259
2260   // Ordinary (unprefixed) literals tolerate bad encoding: the raw bytes are
2261   // passed through unchanged, for compatibility with gcc and older versions
2262   // of clang.
2263   const bool Lenient = isOrdinary();
2264   if (Lenient) {
2265     memcpy(ResultPtr, Fragment.data(), Fragment.size());
2266     ResultPtr += Fragment.size();
2267   }
2268
2269   // Without a diagnostics engine there is nothing further to do.
2270   if (!Diags)
2271     return !Lenient;
2272
2273   const char *const FragmentEnd = Fragment.end();
2274   const char *ErrorPtr = reinterpret_cast<const char *>(BadByte);
2275   const char *NextStart = resyncUTF8(ErrorPtr, FragmentEnd);
2276
2277   // A single diagnostic points at the first bad sequence; every further bad
2278   // sequence in the fragment is attached to it as an extra source range.
2279   FullSourceLoc SourceLoc(Tok.getLocation(), SM);
2280   const unsigned DiagID = Lenient ? diag::warn_bad_string_encoding
2281                                   : diag::err_bad_string_encoding;
2282   const DiagnosticBuilder &Builder = Diag(Diags, Features, SourceLoc, TokBegin,
2283                                           ErrorPtr, NextStart, DiagID);
2284
2285   // Decode the rest into a scratch buffer purely to locate further errors.
2286   SmallString<512> Dummy;
2287   Dummy.reserve(Fragment.size() * CharByteWidth);
2288   char *Ptr = Dummy.data();
2289
2290   StringRef NextFragment(NextStart, FragmentEnd - NextStart);
2291   while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, BadByte)) {
2292     ErrorPtr = reinterpret_cast<const char *>(BadByte);
2293     NextStart = resyncUTF8(ErrorPtr, FragmentEnd);
2294     Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin, ErrorPtr,
2295                                    NextStart);
2296     NextFragment = StringRef(NextStart, FragmentEnd - NextStart);
       }
       return !Lenient;
     }
2297
2298 void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
       // Marks the literal as failed and, if a diagnostics engine is available
       // (Diags may be null), reports a string lexing error at Loc.
2299   hadError = true;
2300   if (!Diags)
         return;
2301   Diags->Report(Loc, diag::err_lexing_string);
2302 }
2303
2304 /// getOffsetOfStringByte - This function returns the offset of the
2305 /// specified byte of the string data represented by Token.  This handles
2306 /// advancing over escape sequences in the string.
     ///
     /// The offset is measured from the start of the token's spelling. 0 is
     /// returned if the spelling cannot be retrieved. Wide and UTF-16/32
     /// literals are not supported (asserted below); u8 literals are.
2307 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
2308                                                     unsigned ByteNo) const {
2309   // Get the spelling of the token.
2310   SmallString<32> SpellingBuffer;
2311   SpellingBuffer.resize(Tok.getLength());
2312
2313   bool StringInvalid = false;
2314   const char *SpellingPtr = &SpellingBuffer[0];
2315   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
2316                                        &StringInvalid);
2317   if (StringInvalid)
2318     return 0;
2319
       // SpellingStart stays fixed as the origin of the returned offset while
       // SpellingPtr is used as the cursor.
2320   const char *SpellingStart = SpellingPtr;
2321   const char *SpellingEnd = SpellingPtr+TokLen;
2322
2323   // Handle UTF-8 strings just like narrow strings.
2324   if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
2325     SpellingPtr += 2;
2326
2327   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
2328          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
2329
2330   // For raw string literals, this is easy.
       // Bytes after the opening '(' are taken to map one-to-one onto the
       // spelling, so ByteNo is simply added to the offset of the first byte.
2331   if (SpellingPtr[0] == 'R') {
2332     assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
2333     // Skip 'R"'.
2334     SpellingPtr += 2;
         // Skip the delimiter. The bound is only checked by the assert, so a
         // missing '(' is not caught when asserts are compiled out.
2335     while (*SpellingPtr != '(') {
2336       ++SpellingPtr;
2337       assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
2338     }
2339     // Skip '('.
2340     ++SpellingPtr;
2341     return SpellingPtr - SpellingStart + ByteNo;
2342   }
2343
2344   // Skip over the leading quote
2345   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
2346   ++SpellingPtr;
2347
2348   // Skip over bytes until we find the offset we're looking for.
2349   while (ByteNo) {
2350     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
2351
2352     // Step over non-escapes simply.
2353     if (*SpellingPtr != '\\') {
2354       ++SpellingPtr;
2355       --ByteNo;
2356       continue;
2357     }
2358
2359     // Otherwise, this is an escape character.  Advance over it.
2360     bool HadError = false;
2361     if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
2362         SpellingPtr[1] == 'N') {
           // Len is used as the number of string bytes the escape stands for.
           // NOTE(review): the literal 1 is presumably the character byte width,
           // and MeasureUCNEscape presumably advances SpellingPtr past the
           // escape; confirm against its definition.
2363       const char *EscapePtr = SpellingPtr;
2364       unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
2365                                       1, Features, HadError);
2366       if (Len > ByteNo) {
2367         // ByteNo is somewhere within the escape sequence.
             // Report the start of the escape itself.
2368         SpellingPtr = EscapePtr;
2369         break;
2370       }
2371       ByteNo -= Len;
2372     } else {
           // Any other escape counts as exactly one byte. This branch relies on
           // ProcessCharEscape moving SpellingPtr past the escape; its return
           // value is not needed here.
2373       ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
2374                         FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
2375                         Diags, Features, StringLiteralEvalMethod::Evaluated);
2376       --ByteNo;
2377     }
2378     assert(!HadError && "This method isn't valid on erroneous strings");
2379   }
2380
2381   return SpellingPtr-SpellingStart;
2382 }
2383
2384 /// Returns true if Suffix may be used as the ud-suffix of a string literal.
2385 /// Reserved suffixes are deliberately not accepted as ud-suffixes, because
2386 /// diagnosing them as invalid suffixes gives a better diagnostic experience.
     /// Everything NumericLiteralParser::isValidUDSuffix accepts is accepted
     /// here too, and so is "sv".
2387 bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
2388                                           StringRef Suffix) {
2389   if (NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix))
2390     return true;
       return Suffix == "sv";
2391 }