clang/lib/Lex/LiteralSupport.cpp

   1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file implements the NumericLiteralParser, CharLiteralParser, and
  10 // StringLiteralParser interfaces.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "clang/Lex/LiteralSupport.h"
  15 #include "clang/Basic/CharInfo.h"
  16 #include "clang/Basic/LangOptions.h"
  17 #include "clang/Basic/SourceLocation.h"
  18 #include "clang/Basic/TargetInfo.h"
  19 #include "clang/Lex/LexDiagnostic.h"
  20 #include "clang/Lex/Lexer.h"
  21 #include "clang/Lex/Preprocessor.h"
  22 #include "clang/Lex/Token.h"
  23 #include "llvm/ADT/APInt.h"
  24 #include "llvm/ADT/SmallVector.h"
  25 #include "llvm/ADT/StringExtras.h"
  26 #include "llvm/ADT/StringSwitch.h"
  27 #include "llvm/Support/ConvertUTF.h"
  28 #include "llvm/Support/Error.h"
  29 #include "llvm/Support/ErrorHandling.h"
  30 #include "llvm/Support/Unicode.h"
  31 #include <algorithm>
  32 #include <cassert>
  33 #include <cstddef>
  34 #include <cstdint>
  35 #include <cstring>
  36 #include <string>
  37
  38 using namespace clang;
  39
  40 static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
  41   switch (kind) {
  42   default: llvm_unreachable("Unknown token type!");
  43   case tok::char_constant:
  44   case tok::string_literal:
  45   case tok::utf8_char_constant:
  46   case tok::utf8_string_literal:
  47     return Target.getCharWidth();
  48   case tok::wide_char_constant:
  49   case tok::wide_string_literal:
  50     return Target.getWCharWidth();
  51   case tok::utf16_char_constant:
  52   case tok::utf16_string_literal:
  53     return Target.getChar16Width();
  54   case tok::utf32_char_constant:
  55   case tok::utf32_string_literal:
  56     return Target.getChar32Width();
  57   }
  58 }
  59
  60 static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
  61                                            FullSourceLoc TokLoc,
  62                                            const char *TokBegin,
  63                                            const char *TokRangeBegin,
  64                                            const char *TokRangeEnd) {
  65   SourceLocation Begin =
  66     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
  67                                    TokLoc.getManager(), Features);
  68   SourceLocation End =
  69     Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
  70                                    TokLoc.getManager(), Features);
  71   return CharSourceRange::getCharRange(Begin, End);
  72 }
  73
  74 /// Produce a diagnostic highlighting some portion of a literal.
  75 ///
  76 /// Emits the diagnostic \p DiagID, highlighting the range of characters from
  77 /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
  78 /// a substring of a spelling buffer for the token beginning at \p TokBegin.
  79 static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
  80                               const LangOptions &Features, FullSourceLoc TokLoc,
  81                               const char *TokBegin, const char *TokRangeBegin,
  82                               const char *TokRangeEnd, unsigned DiagID) {
  83   SourceLocation Begin =
  84     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
  85                                    TokLoc.getManager(), Features);
  86   return Diags->Report(Begin, DiagID) <<
  87     MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
  88 }
  89
  90 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
  91 /// either a character or a string literal.
  92 static unsigned ProcessCharEscape(const char *ThisTokBegin,
  93                                   const char *&ThisTokBuf,
  94                                   const char *ThisTokEnd, bool &HadError,
  95                                   FullSourceLoc Loc, unsigned CharWidth,
  96                                   DiagnosticsEngine *Diags,
  97                                   const LangOptions &Features) {
  98   const char *EscapeBegin = ThisTokBuf;
  99   bool Delimited = false;
 100   bool EndDelimiterFound = false;
 101
 102   // Skip the '\' char.
 103   ++ThisTokBuf;
 104
 105   // We know that this character can't be off the end of the buffer, because
 106   // that would have been \", which would not have been the end of string.
 107   unsigned ResultChar = *ThisTokBuf++;
 108   switch (ResultChar) {
 109   // These map to themselves.
 110   case '\\': case '\'': case '"': case '?': break;
 111
 112     // These have fixed mappings.
 113   case 'a':
 114     // TODO: K&R: the meaning of '\\a' is different in traditional C
 115     ResultChar = 7;
 116     break;
 117   case 'b':
 118     ResultChar = 8;
 119     break;
 120   case 'e':
 121     if (Diags)
 122       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 123            diag::ext_nonstandard_escape) << "e";
 124     ResultChar = 27;
 125     break;
 126   case 'E':
 127     if (Diags)
 128       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 129            diag::ext_nonstandard_escape) << "E";
 130     ResultChar = 27;
 131     break;
 132   case 'f':
 133     ResultChar = 12;
 134     break;
 135   case 'n':
 136     ResultChar = 10;
 137     break;
 138   case 'r':
 139     ResultChar = 13;
 140     break;
 141   case 't':
 142     ResultChar = 9;
 143     break;
 144   case 'v':
 145     ResultChar = 11;
 146     break;
 147   case 'x': { // Hex escape.
 148     ResultChar = 0;
 149     if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
 150       Delimited = true;
 151       ThisTokBuf++;
 152       if (*ThisTokBuf == '}') {
 153         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 154              diag::err_delimited_escape_empty);
 155         return ResultChar;
 156       }
 157     } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
 158       if (Diags)
 159         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 160              diag::err_hex_escape_no_digits) << "x";
 161       return ResultChar;
 162     }
 163
 164     // Hex escapes are a maximal series of hex digits.
 165     bool Overflow = false;
 166     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
 167       if (Delimited && *ThisTokBuf == '}') {
 168         ThisTokBuf++;
 169         EndDelimiterFound = true;
 170         break;
 171       }
 172       int CharVal = llvm::hexDigitValue(*ThisTokBuf);
 173       if (CharVal == -1) {
 174         // Non delimited hex escape sequences stop at the first non-hex digit.
 175         if (!Delimited)
 176           break;
 177         HadError = true;
 178         if (Diags)
 179           Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 180                diag::err_delimited_escape_invalid)
 181               << StringRef(ThisTokBuf, 1);
 182         continue;
 183       }
 184       // About to shift out a digit?
 185       if (ResultChar & 0xF0000000)
 186         Overflow = true;
 187       ResultChar <<= 4;
 188       ResultChar |= CharVal;
 189     }
 190     // See if any bits will be truncated when evaluated as a character.
 191     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
 192       Overflow = true;
 193       ResultChar &= ~0U >> (32-CharWidth);
 194     }
 195
 196     // Check for overflow.
 197     if (!HadError && Overflow) { // Too many digits to fit in
 198       HadError = true;
 199       if (Diags)
 200         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 201              diag::err_escape_too_large)
 202             << 0;
 203     }
 204     break;
 205   }
 206   case '0': case '1': case '2': case '3':
 207   case '4': case '5': case '6': case '7': {
 208     // Octal escapes.
 209     --ThisTokBuf;
 210     ResultChar = 0;
 211
 212     // Octal escapes are a series of octal digits with maximum length 3.
 213     // "\0123" is a two digit sequence equal to "\012" "3".
 214     unsigned NumDigits = 0;
 215     do {
 216       ResultChar <<= 3;
 217       ResultChar |= *ThisTokBuf++ - '0';
 218       ++NumDigits;
 219     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
 220              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
 221
 222     // Check for overflow.  Reject '\777', but not L'\777'.
 223     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
 224       if (Diags)
 225         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 226              diag::err_escape_too_large) << 1;
 227       ResultChar &= ~0U >> (32-CharWidth);
 228     }
 229     break;
 230   }
 231   case 'o': {
 232     bool Overflow = false;
 233     if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
 234       HadError = true;
 235       if (Diags)
 236         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 237              diag::err_delimited_escape_missing_brace)
 238             << "o";
 239
 240       break;
 241     }
 242     ResultChar = 0;
 243     Delimited = true;
 244     ++ThisTokBuf;
 245     if (*ThisTokBuf == '}') {
 246       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 247            diag::err_delimited_escape_empty);
 248       return ResultChar;
 249     }
 250
 251     while (ThisTokBuf != ThisTokEnd) {
 252       if (*ThisTokBuf == '}') {
 253         EndDelimiterFound = true;
 254         ThisTokBuf++;
 255         break;
 256       }
 257       if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
 258         HadError = true;
 259         if (Diags)
 260           Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 261                diag::err_delimited_escape_invalid)
 262               << StringRef(ThisTokBuf, 1);
 263         ThisTokBuf++;
 264         continue;
 265       }
 266       if (ResultChar & 0x020000000)
 267         Overflow = true;
 268
 269       ResultChar <<= 3;
 270       ResultChar |= *ThisTokBuf++ - '0';
 271     }
 272     // Check for overflow.  Reject '\777', but not L'\777'.
 273     if (!HadError &&
 274         (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {
 275       HadError = true;
 276       if (Diags)
 277         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 278              diag::err_escape_too_large)
 279             << 1;
 280       ResultChar &= ~0U >> (32 - CharWidth);
 281     }
 282     break;
 283   }
 284     // Otherwise, these are not valid escapes.
 285   case '(': case '{': case '[': case '%':
 286     // GCC accepts these as extensions.  We warn about them as such though.
 287     if (Diags)
 288       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 289            diag::ext_nonstandard_escape)
 290         << std::string(1, ResultChar);
 291     break;
 292   default:
 293     if (!Diags)
 294       break;
 295
 296     if (isPrintable(ResultChar))
 297       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 298            diag::ext_unknown_escape)
 299         << std::string(1, ResultChar);
 300     else
 301       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 302            diag::ext_unknown_escape)
 303         << "x" + llvm::utohexstr(ResultChar);
 304     break;
 305   }
 306
 307   if (Delimited && Diags) {
 308     if (!EndDelimiterFound)
 309       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 310            diag::err_expected)
 311           << tok::r_brace;
 312     else if (!HadError) {
 313       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
 314            Features.CPlusPlus2b ? diag::warn_cxx2b_delimited_escape_sequence
 315                                 : diag::ext_delimited_escape_sequence)
 316           << /*delimited*/ 0 << (Features.CPlusPlus ? 1 : 0);
 317     }
 318   }
 319
 320   return ResultChar;
 321 }
 322
 323 static void appendCodePoint(unsigned Codepoint,
 324                             llvm::SmallVectorImpl<char> &Str) {
 325   char ResultBuf[4];
 326   char *ResultPtr = ResultBuf;
 327   if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr))
 328     Str.append(ResultBuf, ResultPtr);
 329 }
 330
 331 void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
 332   for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
 333     if (*I != '\\') {
 334       Buf.push_back(*I);
 335       continue;
 336     }
 337
 338     ++I;
 339     char Kind = *I;
 340     ++I;
 341
 342     assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
 343     uint32_t CodePoint = 0;
 344
 345     if (Kind == 'u' && *I == '{') {
 346       for (++I; *I != '}'; ++I) {
 347         unsigned Value = llvm::hexDigitValue(*I);
 348         assert(Value != -1U);
 349         CodePoint <<= 4;
 350         CodePoint += Value;
 351       }
 352       appendCodePoint(CodePoint, Buf);
 353       continue;
 354     }
 355
 356     if (Kind == 'N') {
 357       assert(*I == '{');
 358       ++I;
 359       auto Delim = std::find(I, Input.end(), '}');
 360       assert(Delim != Input.end());
 361       llvm::Optional<llvm::sys::unicode::LooseMatchingResult> Res =
 362           llvm::sys::unicode::nameToCodepointLooseMatching(
 363               StringRef(I, std::distance(I, Delim)));
 364       assert(Res);
 365       CodePoint = Res->CodePoint;
 366       assert(CodePoint != 0xFFFFFFFF);
 367       appendCodePoint(CodePoint, Buf);
 368       I = Delim;
 369       continue;
 370     }
 371
 372     unsigned NumHexDigits;
 373     if (Kind == 'u')
 374       NumHexDigits = 4;
 375     else
 376       NumHexDigits = 8;
 377
 378     assert(I + NumHexDigits <= E);
 379
 380     for (; NumHexDigits != 0; ++I, --NumHexDigits) {
 381       unsigned Value = llvm::hexDigitValue(*I);
 382       assert(Value != -1U);
 383
 384       CodePoint <<= 4;
 385       CodePoint += Value;
 386     }
 387
 388     appendCodePoint(CodePoint, Buf);
 389     --I;
 390   }
 391 }
 392
 393 static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
 394                                     const char *&ThisTokBuf,
 395                                     const char *ThisTokEnd, uint32_t &UcnVal,
 396                                     unsigned short &UcnLen, bool &Delimited,
 397                                     FullSourceLoc Loc, DiagnosticsEngine *Diags,
 398                                     const LangOptions &Features,
 399                                     bool in_char_string_literal = false) {
 400   const char *UcnBegin = ThisTokBuf;
 401   bool HasError = false;
 402   bool EndDelimiterFound = false;
 403
 404   // Skip the '\u' char's.
 405   ThisTokBuf += 2;
 406   Delimited = false;
 407   if (UcnBegin[1] == 'u' && in_char_string_literal &&
 408       ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
 409     Delimited = true;
 410     ThisTokBuf++;
 411   } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
 412     if (Diags)
 413       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 414            diag::err_hex_escape_no_digits)
 415           << StringRef(&ThisTokBuf[-1], 1);
 416     return false;
 417   }
 418   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
 419
 420   bool Overflow = false;
 421   unsigned short Count = 0;
 422   for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
 423        ++ThisTokBuf) {
 424     if (Delimited && *ThisTokBuf == '}') {
 425       ++ThisTokBuf;
 426       EndDelimiterFound = true;
 427       break;
 428     }
 429     int CharVal = llvm::hexDigitValue(*ThisTokBuf);
 430     if (CharVal == -1) {
 431       HasError = true;
 432       if (!Delimited)
 433         break;
 434       if (Diags) {
 435         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 436              diag::err_delimited_escape_invalid)
 437             << StringRef(ThisTokBuf, 1);
 438       }
 439       Count++;
 440       continue;
 441     }
 442     if (UcnVal & 0xF0000000) {
 443       Overflow = true;
 444       continue;
 445     }
 446     UcnVal <<= 4;
 447     UcnVal |= CharVal;
 448     Count++;
 449   }
 450
 451   if (Overflow) {
 452     if (Diags)
 453       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 454            diag::err_escape_too_large)
 455           << 0;
 456     return false;
 457   }
 458
 459   if (Delimited && !EndDelimiterFound) {
 460     if (Diags) {
 461       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 462            diag::err_expected)
 463           << tok::r_brace;
 464     }
 465     return false;
 466   }
 467
 468   // If we didn't consume the proper number of digits, there is a problem.
 469   if (Count == 0 || (!Delimited && Count != UcnLen)) {
 470     if (Diags)
 471       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 472            Delimited ? diag::err_delimited_escape_empty
 473                      : diag::err_ucn_escape_incomplete);
 474     return false;
 475   }
 476   return !HasError;
 477 }
 478
 479 static void DiagnoseInvalidUnicodeCharacterName(
 480     DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
 481     const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
 482     llvm::StringRef Name) {
 483
 484   Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
 485        diag::err_invalid_ucn_name)
 486       << Name;
 487
 488   namespace u = llvm::sys::unicode;
 489
 490   llvm::Optional<u::LooseMatchingResult> Res =
 491       u::nameToCodepointLooseMatching(Name);
 492   if (Res) {
 493     Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
 494          diag::note_invalid_ucn_name_loose_matching)
 495         << FixItHint::CreateReplacement(
 496                MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
 497                                    TokRangeEnd),
 498                Res->Name);
 499     return;
 500   }
 501
 502   unsigned Distance = 0;
 503   SmallVector<u::MatchForCodepointName> Matches =
 504       u::nearestMatchesForCodepointName(Name, 5);
 505   assert(!Matches.empty() && "No unicode characters found");
 506
 507   for (const auto &Match : Matches) {
 508     if (Distance == 0)
 509       Distance = Match.Distance;
 510     if (std::max(Distance, Match.Distance) -
 511             std::min(Distance, Match.Distance) >
 512         3)
 513       break;
 514     Distance = Match.Distance;
 515
 516     std::string Str;
 517     llvm::UTF32 V = Match.Value;
 518     LLVM_ATTRIBUTE_UNUSED bool Converted =
 519         llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);
 520     assert(Converted && "Found a match wich is not a unicode character");
 521
 522     Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
 523          diag::note_invalid_ucn_name_candidate)
 524         << Match.Name << llvm::utohexstr(Match.Value)
 525         << Str // FIXME: Fix the rendering of non printable characters
 526         << FixItHint::CreateReplacement(
 527                MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
 528                                    TokRangeEnd),
 529                Match.Name);
 530   }
 531 }
 532
 533 static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
 534                                   const char *&ThisTokBuf,
 535                                   const char *ThisTokEnd, uint32_t &UcnVal,
 536                                   unsigned short &UcnLen, FullSourceLoc Loc,
 537                                   DiagnosticsEngine *Diags,
 538                                   const LangOptions &Features) {
 539   const char *UcnBegin = ThisTokBuf;
 540   assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
 541   ThisTokBuf += 2;
 542   if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
 543     if (Diags) {
 544       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 545            diag::err_delimited_escape_missing_brace)
 546           << StringRef(&ThisTokBuf[-1], 1);
 547     }
 548     ThisTokBuf++;
 549     return false;
 550   }
 551   ThisTokBuf++;
 552   const char *ClosingBrace =
 553       std::find_if_not(ThisTokBuf, ThisTokEnd, [](char C) {
 554         return llvm::isAlnum(C) || llvm::isSpace(C) || C == '_' || C == '-';
 555       });
 556   bool Incomplete = ClosingBrace == ThisTokEnd || *ClosingBrace != '}';
 557   bool Empty = ClosingBrace == ThisTokBuf;
 558   if (Incomplete || Empty) {
 559     if (Diags) {
 560       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 561            Incomplete ? diag::err_ucn_escape_incomplete
 562                       : diag::err_delimited_escape_empty)
 563           << StringRef(&UcnBegin[1], 1);
 564     }
 565     ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
 566     return false;
 567   }
 568   StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
 569   ThisTokBuf = ClosingBrace + 1;
 570   llvm::Optional<char32_t> Res =
 571       llvm::sys::unicode::nameToCodepointStrict(Name);
 572   if (!Res) {
 573     if (Diags)
 574       DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
 575                                           &UcnBegin[3], ClosingBrace, Name);
 576     return false;
 577   }
 578   UcnVal = *Res;
 579   UcnLen = UcnVal > 0xFFFF ? 8 : 4;
 580   return true;
 581 }
 582
 583 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
 584 /// return the UTF32.
 585 static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
 586                              const char *ThisTokEnd, uint32_t &UcnVal,
 587                              unsigned short &UcnLen, FullSourceLoc Loc,
 588                              DiagnosticsEngine *Diags,
 589                              const LangOptions &Features,
 590                              bool in_char_string_literal = false) {
 591
 592   bool HasError;
 593   const char *UcnBegin = ThisTokBuf;
 594   bool IsDelimitedEscapeSequence = false;
 595   bool IsNamedEscapeSequence = false;
 596   if (ThisTokBuf[1] == 'N') {
 597     IsNamedEscapeSequence = true;
 598     HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
 599                                       UcnVal, UcnLen, Loc, Diags, Features);
 600   } else {
 601     HasError =
 602         !ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
 603                                  UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
 604                                  Features, in_char_string_literal);
 605   }
 606   if (HasError)
 607     return false;
 608
 609   // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
 610   if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
 611       UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
 612     if (Diags)
 613       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 614            diag::err_ucn_escape_invalid);
 615     return false;
 616   }
 617
 618   // C++11 allows UCNs that refer to control characters and basic source
 619   // characters inside character and string literals
 620   if (UcnVal < 0xa0 &&
 621       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {  // $, @, `
 622     bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal);
 623     if (Diags) {
 624       char BasicSCSChar = UcnVal;
 625       if (UcnVal >= 0x20 && UcnVal < 0x7f)
 626         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 627              IsError ? diag::err_ucn_escape_basic_scs :
 628                        diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
 629             << StringRef(&BasicSCSChar, 1);
 630       else
 631         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 632              IsError ? diag::err_ucn_control_character :
 633                        diag::warn_cxx98_compat_literal_ucn_control_character);
 634     }
 635     if (IsError)
 636       return false;
 637   }
 638
 639   if (!Features.CPlusPlus && !Features.C99 && Diags)
 640     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 641          diag::warn_ucn_not_valid_in_c89_literal);
 642
 643   if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
 644     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
 645          Features.CPlusPlus2b ? diag::warn_cxx2b_delimited_escape_sequence
 646                               : diag::ext_delimited_escape_sequence)
 647         << (IsNamedEscapeSequence ? 1 : 0) << (Features.CPlusPlus ? 1 : 0);
 648
 649   return true;
 650 }
 651
 652 /// MeasureUCNEscape - Determine the number of bytes within the resulting string
 653 /// which this UCN will occupy.
 654 static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
 655                             const char *ThisTokEnd, unsigned CharByteWidth,
 656                             const LangOptions &Features, bool &HadError) {
 657   // UTF-32: 4 bytes per escape.
 658   if (CharByteWidth == 4)
 659     return 4;
 660
 661   uint32_t UcnVal = 0;
 662   unsigned short UcnLen = 0;
 663   FullSourceLoc Loc;
 664
 665   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
 666                         UcnLen, Loc, nullptr, Features, true)) {
 667     HadError = true;
 668     return 0;
 669   }
 670
 671   // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
 672   if (CharByteWidth == 2)
 673     return UcnVal <= 0xFFFF ? 2 : 4;
 674
 675   // UTF-8.
 676   if (UcnVal < 0x80)
 677     return 1;
 678   if (UcnVal < 0x800)
 679     return 2;
 680   if (UcnVal < 0x10000)
 681     return 3;
 682   return 4;
 683 }
 684
 685 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
 686 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
 687 /// StringLiteralParser. When we decide to implement UCN's for identifiers,
 688 /// we will likely rework our support for UCN's.
 689 static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
 690                             const char *ThisTokEnd,
 691                             char *&ResultBuf, bool &HadError,
 692                             FullSourceLoc Loc, unsigned CharByteWidth,
 693                             DiagnosticsEngine *Diags,
 694                             const LangOptions &Features) {
 695   typedef uint32_t UTF32;
 696   UTF32 UcnVal = 0;
 697   unsigned short UcnLen = 0;
 698   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
 699                         Loc, Diags, Features, true)) {
 700     HadError = true;
 701     return;
 702   }
 703
 704   assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
 705          "only character widths of 1, 2, or 4 bytes supported");
 706
 707   (void)UcnLen;
 708   assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
 709
 710   if (CharByteWidth == 4) {
 711     // FIXME: Make the type of the result buffer correct instead of
 712     // using reinterpret_cast.
 713     llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
 714     *ResultPtr = UcnVal;
 715     ResultBuf += 4;
 716     return;
 717   }
 718
 719   if (CharByteWidth == 2) {
 720     // FIXME: Make the type of the result buffer correct instead of
 721     // using reinterpret_cast.
 722     llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
 723
 724     if (UcnVal <= (UTF32)0xFFFF) {
 725       *ResultPtr = UcnVal;
 726       ResultBuf += 2;
 727       return;
 728     }
 729
 730     // Convert to UTF16.
 731     UcnVal -= 0x10000;
 732     *ResultPtr     = 0xD800 + (UcnVal >> 10);
 733     *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
 734     ResultBuf += 4;
 735     return;
 736   }
 737
 738   assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
 739
 740   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
 741   // The conversion below was inspired by:
 742   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
 743   // First, we determine how many bytes the result will require.
 744   typedef uint8_t UTF8;
 745
 746   unsigned short bytesToWrite = 0;
 747   if (UcnVal < (UTF32)0x80)
 748     bytesToWrite = 1;
 749   else if (UcnVal < (UTF32)0x800)
 750     bytesToWrite = 2;
 751   else if (UcnVal < (UTF32)0x10000)
 752     bytesToWrite = 3;
 753   else
 754     bytesToWrite = 4;
 755
 756   const unsigned byteMask = 0xBF;
 757   const unsigned byteMark = 0x80;
 758
 759   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
 760   // into the first byte, depending on how many bytes follow.
 761   static const UTF8 firstByteMark[5] = {
 762     0x00, 0x00, 0xC0, 0xE0, 0xF0
 763   };
 764   // Finally, we write the bytes into ResultBuf.
 765   ResultBuf += bytesToWrite;
 766   switch (bytesToWrite) { // note: everything falls through.
 767   case 4:
 768     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 769     [[fallthrough]];
 770   case 3:
 771     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 772     [[fallthrough]];
 773   case 2:
 774     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 775     [[fallthrough]];
 776   case 1:
 777     *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
 778   }
 779   // Update the buffer.
 780   ResultBuf += bytesToWrite;
 781 }
 782
 783 ///       integer-constant: [C99 6.4.4.1]
 784 ///         decimal-constant integer-suffix
 785 ///         octal-constant integer-suffix
 786 ///         hexadecimal-constant integer-suffix
 787 ///         binary-literal integer-suffix [GNU, C++1y]
 788 ///       user-defined-integer-literal: [C++11 lex.ext]
 789 ///         decimal-literal ud-suffix
 790 ///         octal-literal ud-suffix
 791 ///         hexadecimal-literal ud-suffix
 792 ///         binary-literal ud-suffix [GNU, C++1y]
 793 ///       decimal-constant:
 794 ///         nonzero-digit
 795 ///         decimal-constant digit
 796 ///       octal-constant:
 797 ///         0
 798 ///         octal-constant octal-digit
 799 ///       hexadecimal-constant:
 800 ///         hexadecimal-prefix hexadecimal-digit
 801 ///         hexadecimal-constant hexadecimal-digit
 802 ///       hexadecimal-prefix: one of
 803 ///         0x 0X
 804 ///       binary-literal:
 805 ///         0b binary-digit
 806 ///         0B binary-digit
 807 ///         binary-literal binary-digit
 808 ///       integer-suffix:
 809 ///         unsigned-suffix [long-suffix]
 810 ///         unsigned-suffix [long-long-suffix]
 811 ///         long-suffix [unsigned-suffix]
  812 ///         long-long-suffix [unsigned-suffix]
 813 ///       nonzero-digit:
 814 ///         1 2 3 4 5 6 7 8 9
 815 ///       octal-digit:
 816 ///         0 1 2 3 4 5 6 7
 817 ///       hexadecimal-digit:
 818 ///         0 1 2 3 4 5 6 7 8 9
 819 ///         a b c d e f
 820 ///         A B C D E F
 821 ///       binary-digit:
 822 ///         0
 823 ///         1
 824 ///       unsigned-suffix: one of
 825 ///         u U
 826 ///       long-suffix: one of
 827 ///         l L
 828 ///       long-long-suffix: one of
 829 ///         ll LL
 830 ///
 831 ///       floating-constant: [C99 6.4.4.2]
 832 ///         TODO: add rules...
 833 ///
 834 NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
 835                                            SourceLocation TokLoc,
 836                                            const SourceManager &SM,
 837                                            const LangOptions &LangOpts,
 838                                            const TargetInfo &Target,
 839                                            DiagnosticsEngine &Diags)
 840     : SM(SM), LangOpts(LangOpts), Diags(Diags),
 841       ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
 842
 843   s = DigitsBegin = ThisTokBegin;
 844   saw_exponent = false;
 845   saw_period = false;
 846   saw_ud_suffix = false;
 847   saw_fixed_point_suffix = false;
 848   isLong = false;
 849   isUnsigned = false;
 850   isLongLong = false;
 851   isSizeT = false;
 852   isHalf = false;
 853   isFloat = false;
 854   isImaginary = false;
 855   isFloat16 = false;
 856   isFloat128 = false;
 857   MicrosoftInteger = 0;
 858   isFract = false;
 859   isAccum = false;
 860   hadError = false;
 861   isBitInt = false;
 862
 863   // This routine assumes that the range begin/end matches the regex for integer
 864   // and FP constants (specifically, the 'pp-number' regex), and assumes that
 865   // the byte at "*end" is both valid and not part of the regex.  Because of
 866   // this, it doesn't have to check for 'overscan' in various places.
 867   if (isPreprocessingNumberBody(*ThisTokEnd)) {
 868     Diags.Report(TokLoc, diag::err_lexing_numeric);
 869     hadError = true;
 870     return;
 871   }
 872
 873   if (*s == '0') { // parse radix
 874     ParseNumberStartingWithZero(TokLoc);
 875     if (hadError)
 876       return;
 877   } else { // the first digit is non-zero
 878     radix = 10;
 879     s = SkipDigits(s);
 880     if (s == ThisTokEnd) {
 881       // Done.
 882     } else {
 883       ParseDecimalOrOctalCommon(TokLoc);
 884       if (hadError)
 885         return;
 886     }
 887   }
 888
 889   SuffixBegin = s;
 890   checkSeparator(TokLoc, s, CSK_AfterDigits);
 891
 892   // Initial scan to lookahead for fixed point suffix.
 893   if (LangOpts.FixedPoint) {
 894     for (const char *c = s; c != ThisTokEnd; ++c) {
 895       if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') {
 896         saw_fixed_point_suffix = true;
 897         break;
 898       }
 899     }
 900   }
 901
 902   // Parse the suffix.  At this point we can classify whether we have an FP or
 903   // integer constant.
 904   bool isFixedPointConstant = isFixedPointLiteral();
 905   bool isFPConstant = isFloatingLiteral();
 906   bool HasSize = false;
 907
 908   // Loop over all of the characters of the suffix.  If we see something bad,
 909   // we break out of the loop.
 910   for (; s != ThisTokEnd; ++s) {
 911     switch (*s) {
 912     case 'R':
 913     case 'r':
 914       if (!LangOpts.FixedPoint)
 915         break;
 916       if (isFract || isAccum) break;
 917       if (!(saw_period || saw_exponent)) break;
 918       isFract = true;
 919       continue;
 920     case 'K':
 921     case 'k':
 922       if (!LangOpts.FixedPoint)
 923         break;
 924       if (isFract || isAccum) break;
 925       if (!(saw_period || saw_exponent)) break;
 926       isAccum = true;
 927       continue;
 928     case 'h':      // FP Suffix for "half".
 929     case 'H':
 930       // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
 931       if (!(LangOpts.Half || LangOpts.FixedPoint))
 932         break;
 933       if (isIntegerLiteral()) break;  // Error for integer constant.
 934       if (HasSize)
 935         break;
 936       HasSize = true;
 937       isHalf = true;
 938       continue;  // Success.
 939     case 'f':      // FP Suffix for "float"
 940     case 'F':
 941       if (!isFPConstant) break;  // Error for integer constant.
 942       if (HasSize)
 943         break;
 944       HasSize = true;
 945
 946       // CUDA host and device may have different _Float16 support, therefore
 947       // allows f16 literals to avoid false alarm.
 948       // ToDo: more precise check for CUDA.
 949       if ((Target.hasFloat16Type() || LangOpts.CUDA) && s + 2 < ThisTokEnd &&
 950           s[1] == '1' && s[2] == '6') {
 951         s += 2; // success, eat up 2 characters.
 952         isFloat16 = true;
 953         continue;
 954       }
 955
 956       isFloat = true;
 957       continue;  // Success.
 958     case 'q':    // FP Suffix for "__float128"
 959     case 'Q':
 960       if (!isFPConstant) break;  // Error for integer constant.
 961       if (HasSize)
 962         break;
 963       HasSize = true;
 964       isFloat128 = true;
 965       continue;  // Success.
 966     case 'u':
 967     case 'U':
 968       if (isFPConstant) break;  // Error for floating constant.
 969       if (isUnsigned) break;    // Cannot be repeated.
 970       isUnsigned = true;
 971       continue;  // Success.
 972     case 'l':
 973     case 'L':
 974       if (HasSize)
 975         break;
 976       HasSize = true;
 977
 978       // Check for long long.  The L's need to be adjacent and the same case.
 979       if (s[1] == s[0]) {
 980         assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
 981         if (isFPConstant) break;        // long long invalid for floats.
 982         isLongLong = true;
 983         ++s;  // Eat both of them.
 984       } else {
 985         isLong = true;
 986       }
 987       continue; // Success.
 988     case 'z':
 989     case 'Z':
 990       if (isFPConstant)
 991         break; // Invalid for floats.
 992       if (HasSize)
 993         break;
 994       HasSize = true;
 995       isSizeT = true;
 996       continue;
 997     case 'i':
 998     case 'I':
 999       if (LangOpts.MicrosoftExt && !isFPConstant) {
1000         // Allow i8, i16, i32, and i64. First, look ahead and check if
1001         // suffixes are Microsoft integers and not the imaginary unit.
1002         uint8_t Bits = 0;
1003         size_t ToSkip = 0;
1004         switch (s[1]) {
1005         case '8': // i8 suffix
1006           Bits = 8;
1007           ToSkip = 2;
1008           break;
1009         case '1':
1010           if (s[2] == '6') { // i16 suffix
1011             Bits = 16;
1012             ToSkip = 3;
1013           }
1014           break;
1015         case '3':
1016           if (s[2] == '2') { // i32 suffix
1017             Bits = 32;
1018             ToSkip = 3;
1019           }
1020           break;
1021         case '6':
1022           if (s[2] == '4') { // i64 suffix
1023             Bits = 64;
1024             ToSkip = 3;
1025           }
1026           break;
1027         default:
1028           break;
1029         }
1030         if (Bits) {
1031           if (HasSize)
1032             break;
1033           HasSize = true;
1034           MicrosoftInteger = Bits;
1035           s += ToSkip;
1036           assert(s <= ThisTokEnd && "didn't maximally munch?");
1037           break;
1038         }
1039       }
1040       [[fallthrough]];
1041     case 'j':
1042     case 'J':
1043       if (isImaginary) break;   // Cannot be repeated.
1044       isImaginary = true;
1045       continue;  // Success.
1046     case 'w':
1047     case 'W':
1048       if (isFPConstant)
1049         break; // Invalid for floats.
1050       if (HasSize)
1051         break; // Invalid if we already have a size for the literal.
1052
1053       // wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
1054       // explicitly do not support the suffix in C++ as an extension because a
1055       // library-based UDL that resolves to a library type may be more
1056       // appropriate there.
1057       if (!LangOpts.CPlusPlus && ((s[0] == 'w' && s[1] == 'b') ||
1058           (s[0] == 'W' && s[1] == 'B'))) {
1059         isBitInt = true;
1060         HasSize = true;
1061         ++s; // Skip both characters (2nd char skipped on continue).
1062         continue; // Success.
1063       }
1064     }
1065     // If we reached here, there was an error or a ud-suffix.
1066     break;
1067   }
1068
1069   // "i", "if", and "il" are user-defined suffixes in C++1y.
1070   if (s != ThisTokEnd || isImaginary) {
1071     // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
1072     expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
1073     if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
1074       if (!isImaginary) {
1075         // Any suffix pieces we might have parsed are actually part of the
1076         // ud-suffix.
1077         isLong = false;
1078         isUnsigned = false;
1079         isLongLong = false;
1080         isSizeT = false;
1081         isFloat = false;
1082         isFloat16 = false;
1083         isHalf = false;
1084         isImaginary = false;
1085         isBitInt = false;
1086         MicrosoftInteger = 0;
1087         saw_fixed_point_suffix = false;
1088         isFract = false;
1089         isAccum = false;
1090       }
1091
1092       saw_ud_suffix = true;
1093       return;
1094     }
1095
1096     if (s != ThisTokEnd) {
1097       // Report an error if there are any.
1098       Diags.Report(Lexer::AdvanceToTokenCharacter(
1099                        TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
1100                    diag::err_invalid_suffix_constant)
1101           << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
1102           << (isFixedPointConstant ? 2 : isFPConstant);
1103       hadError = true;
1104     }
1105   }
1106
1107   if (!hadError && saw_fixed_point_suffix) {
1108     assert(isFract || isAccum);
1109   }
1110 }
1111
1112 /// ParseDecimalOrOctalCommon - This method is called for decimal or octal
1113 /// numbers. It issues an error for illegal digits, and handles floating point
1114 /// parsing. If it detects a floating point number, the radix is set to 10.
1115 void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
1116   assert((radix == 8 || radix == 10) && "Unexpected radix");
1117
1118   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
1119   // the code is using an incorrect base.
1120   if (isHexDigit(*s) && *s != 'e' && *s != 'E' &&
1121       !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
1122     Diags.Report(
1123         Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
1124         diag::err_invalid_digit)
1125         << StringRef(s, 1) << (radix == 8 ? 1 : 0);
1126     hadError = true;
1127     return;
1128   }
1129
1130   if (*s == '.') {
1131     checkSeparator(TokLoc, s, CSK_AfterDigits);
1132     s++;
1133     radix = 10;
1134     saw_period = true;
1135     checkSeparator(TokLoc, s, CSK_BeforeDigits);
1136     s = SkipDigits(s); // Skip suffix.
1137   }
1138   if (*s == 'e' || *s == 'E') { // exponent
1139     checkSeparator(TokLoc, s, CSK_AfterDigits);
1140     const char *Exponent = s;
1141     s++;
1142     radix = 10;
1143     saw_exponent = true;
1144     if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
1145     const char *first_non_digit = SkipDigits(s);
1146     if (containsDigits(s, first_non_digit)) {
1147       checkSeparator(TokLoc, s, CSK_BeforeDigits);
1148       s = first_non_digit;
1149     } else {
1150       if (!hadError) {
1151         Diags.Report(Lexer::AdvanceToTokenCharacter(
1152                          TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1153                      diag::err_exponent_has_no_digits);
1154         hadError = true;
1155       }
1156       return;
1157     }
1158   }
1159 }
1160
1161 /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1162 /// suffixes as ud-suffixes, because the diagnostic experience is better if we
1163 /// treat it as an invalid suffix.
1164 bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1165                                            StringRef Suffix) {
1166   if (!LangOpts.CPlusPlus11 || Suffix.empty())
1167     return false;
1168
1169   // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
1170   if (Suffix[0] == '_')
1171     return true;
1172
1173   // In C++11, there are no library suffixes.
1174   if (!LangOpts.CPlusPlus14)
1175     return false;
1176
1177   // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
1178   // Per tweaked N3660, "il", "i", and "if" are also used in the library.
1179   // In C++2a "d" and "y" are used in the library.
1180   return llvm::StringSwitch<bool>(Suffix)
1181       .Cases("h", "min", "s", true)
1182       .Cases("ms", "us", "ns", true)
1183       .Cases("il", "i", "if", true)
1184       .Cases("d", "y", LangOpts.CPlusPlus20)
1185       .Default(false);
1186 }
1187
1188 void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
1189                                           const char *Pos,
1190                                           CheckSeparatorKind IsAfterDigits) {
1191   if (IsAfterDigits == CSK_AfterDigits) {
1192     if (Pos == ThisTokBegin)
1193       return;
1194     --Pos;
1195   } else if (Pos == ThisTokEnd)
1196     return;
1197
1198   if (isDigitSeparator(*Pos)) {
1199     Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
1200                                                 LangOpts),
1201                  diag::err_digit_separator_not_between_digits)
1202         << IsAfterDigits;
1203     hadError = true;
1204   }
1205 }
1206
/// ParseNumberStartingWithZero - This method is called when the first character
/// of the number is found to be a zero.  This means it is either an octal
/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
/// a floating point number (01239.123e4).  Eat the prefix, determining the
/// radix etc.
///
/// On return `s` points just past the part of the literal that was consumed,
/// `radix` has been set, and DigitsBegin marks the first digit after any
/// prefix. Problems are reported through Diags and recorded in hadError.
///
/// \param TokLoc  Location of the token start; diagnostic locations are
///                derived from it.
void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
  assert(s[0] == '0' && "Invalid method call");
  s++;

  // The character following the leading zero selects the radix.
  int c1 = s[0];

  // Handle a hex number like 0x1234.
  // The prefix only counts when a hex digit or a '.' follows it.
  if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
    s++;
    assert(s < ThisTokEnd && "didn't maximally munch?");
    radix = 16;
    DigitsBegin = s;
    s = SkipHexDigits(s);
    bool HasSignificandDigits = containsDigits(DigitsBegin, s);
    if (s == ThisTokEnd) {
      // Done.
    } else if (*s == '.') {
      s++;
      saw_period = true;
      const char *floatDigitsBegin = s;
      s = SkipHexDigits(s);
      // Digits on either side of the '.' are enough for a significand.
      if (containsDigits(floatDigitsBegin, s))
        HasSignificandDigits = true;
      if (HasSignificandDigits)
        checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
    }

    if (!HasSignificandDigits) {
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
                                                  LangOpts),
                   diag::err_hex_constant_requires)
          << LangOpts.CPlusPlus << 1;
      hadError = true;
      return;
    }

    // A binary exponent can appear with or without a '.'. If dotted, the
    // binary exponent is required.
    if (*s == 'p' || *s == 'P') {
      checkSeparator(TokLoc, s, CSK_AfterDigits);
      const char *Exponent = s;
      s++;
      saw_exponent = true;
      if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
      const char *first_non_digit = SkipDigits(s);
      if (!containsDigits(s, first_non_digit)) {
        // Avoid stacking this diagnostic on top of an earlier one.
        if (!hadError) {
          Diags.Report(Lexer::AdvanceToTokenCharacter(
                           TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
                       diag::err_exponent_has_no_digits);
          hadError = true;
        }
        return;
      }
      checkSeparator(TokLoc, s, CSK_BeforeDigits);
      s = first_non_digit;

      // A hexadecimal floating literal was seen: diagnose it as an extension
      // when hex floats are not enabled, or emit the C++17 hex-literal
      // warning when they are enabled and the language is C++17.
      if (!LangOpts.HexFloats)
        Diags.Report(TokLoc, LangOpts.CPlusPlus
                                 ? diag::ext_hex_literal_invalid
                                 : diag::ext_hex_constant_invalid);
      else if (LangOpts.CPlusPlus17)
        Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
    } else if (saw_period) {
      // A '.' was seen but no binary exponent followed it.
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
                                                  LangOpts),
                   diag::err_hex_constant_requires)
          << LangOpts.CPlusPlus << 0;
      hadError = true;
    }
    return;
  }

  // Handle simple binary numbers 0b01010
  // The prefix only counts when a binary digit follows it.
  if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
    // 0b101010 is a C++1y / GCC extension.
    Diags.Report(TokLoc, LangOpts.CPlusPlus14
                             ? diag::warn_cxx11_compat_binary_literal
                         : LangOpts.CPlusPlus ? diag::ext_binary_literal_cxx14
                                              : diag::ext_binary_literal);
    ++s;
    assert(s < ThisTokEnd && "didn't maximally munch?");
    radix = 2;
    DigitsBegin = s;
    s = SkipBinaryDigits(s);
    if (s == ThisTokEnd) {
      // Done.
    } else if (isHexDigit(*s) &&
               !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
      // A hex digit that is not the start of a valid ud-suffix is reported as
      // an invalid digit for a binary literal.
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
                                                  LangOpts),
                   diag::err_invalid_digit)
          << StringRef(s, 1) << 2;
      hadError = true;
    }
    // Other suffixes will be diagnosed by the caller.
    return;
  }

  // For now, the radix is set to 8. If we discover that we have a
  // floating point constant, the radix will change to 10. Octal floating
  // point constants are not permitted (only decimal and hexadecimal).
  radix = 8;
  const char *PossibleNewDigitStart = s;
  s = SkipOctalDigits(s);
  // When the value is 0 followed by a suffix (like 0wb), we want to leave 0
  // as the start of the digits. So if skipping octal digits does not skip
  // anything, we leave the digit start where it was.
  if (s != PossibleNewDigitStart)
    DigitsBegin = PossibleNewDigitStart;

  if (s == ThisTokEnd)
    return; // Done, simple octal number like 01234

  // If we have some other non-octal digit that *is* a decimal digit, see if
  // this is part of a floating point number like 094.123 or 09e1.
  if (isDigit(*s)) {
    const char *EndDecimal = SkipDigits(s);
    if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
      s = EndDecimal;
      radix = 10;
    }
  }

  // The fraction, the exponent, and any invalid digit left at `s` are handled
  // by the shared decimal/octal routine.
  ParseDecimalOrOctalCommon(TokLoc);
}
1338
1339 static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
1340   switch (Radix) {
1341   case 2:
1342     return NumDigits <= 64;
1343   case 8:
1344     return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
1345   case 10:
1346     return NumDigits <= 19; // floor(log10(2^64))
1347   case 16:
1348     return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
1349   default:
1350     llvm_unreachable("impossible Radix");
1351   }
1352 }
1353
1354 /// GetIntegerValue - Convert this numeric literal value to an APInt that
1355 /// matches Val's input width.  If there is an overflow, set Val to the low bits
1356 /// of the result and return true.  Otherwise, return false.
1357 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
1358   // Fast path: Compute a conservative bound on the maximum number of
1359   // bits per digit in this radix. If we can't possibly overflow a
1360   // uint64 based on that bound then do the simple conversion to
1361   // integer. This avoids the expensive overflow checking below, and
1362   // handles the common cases that matter (small decimal integers and
1363   // hex/octal values which don't overflow).
1364   const unsigned NumDigits = SuffixBegin - DigitsBegin;
1365   if (alwaysFitsInto64Bits(radix, NumDigits)) {
1366     uint64_t N = 0;
1367     for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
1368       if (!isDigitSeparator(*Ptr))
1369         N = N * radix + llvm::hexDigitValue(*Ptr);
1370
1371     // This will truncate the value to Val's input width. Simply check
1372     // for overflow by comparing.
1373     Val = N;
1374     return Val.getZExtValue() != N;
1375   }
1376
1377   Val = 0;
1378   const char *Ptr = DigitsBegin;
1379
1380   llvm::APInt RadixVal(Val.getBitWidth(), radix);
1381   llvm::APInt CharVal(Val.getBitWidth(), 0);
1382   llvm::APInt OldVal = Val;
1383
1384   bool OverflowOccurred = false;
1385   while (Ptr < SuffixBegin) {
1386     if (isDigitSeparator(*Ptr)) {
1387       ++Ptr;
1388       continue;
1389     }
1390
1391     unsigned C = llvm::hexDigitValue(*Ptr++);
1392
1393     // If this letter is out of bound for this radix, reject it.
1394     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1395
1396     CharVal = C;
1397
1398     // Add the digit to the value in the appropriate radix.  If adding in digits
1399     // made the value smaller, then this overflowed.
1400     OldVal = Val;
1401
1402     // Multiply by radix, did overflow occur on the multiply?
1403     Val *= RadixVal;
1404     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
1405
1406     // Add value, did overflow occur on the value?
1407     //   (a + b) ult b  <=> overflow
1408     Val += CharVal;
1409     OverflowOccurred |= Val.ult(CharVal);
1410   }
1411   return OverflowOccurred;
1412 }
1413
1414 llvm::APFloat::opStatus
1415 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
1416   using llvm::APFloat;
1417
1418   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
1419
1420   llvm::SmallString<16> Buffer;
1421   StringRef Str(ThisTokBegin, n);
1422   if (Str.contains('\'')) {
1423     Buffer.reserve(n);
1424     std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
1425                         &isDigitSeparator);
1426     Str = Buffer;
1427   }
1428
1429   auto StatusOrErr =
1430       Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
1431   assert(StatusOrErr && "Invalid floating point representation");
1432   return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
1433                                                : APFloat::opInvalidOp;
1434 }
1435
1436 static inline bool IsExponentPart(char c) {
1437   return c == 'p' || c == 'P' || c == 'e' || c == 'E';
1438 }
1439
1440 bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
1441   assert(radix == 16 || radix == 10);
1442
1443   // Find how many digits are needed to store the whole literal.
1444   unsigned NumDigits = SuffixBegin - DigitsBegin;
1445   if (saw_period) --NumDigits;
1446
1447   // Initial scan of the exponent if it exists
1448   bool ExpOverflowOccurred = false;
1449   bool NegativeExponent = false;
1450   const char *ExponentBegin;
1451   uint64_t Exponent = 0;
1452   int64_t BaseShift = 0;
1453   if (saw_exponent) {
1454     const char *Ptr = DigitsBegin;
1455
1456     while (!IsExponentPart(*Ptr)) ++Ptr;
1457     ExponentBegin = Ptr;
1458     ++Ptr;
1459     NegativeExponent = *Ptr == '-';
1460     if (NegativeExponent) ++Ptr;
1461
1462     unsigned NumExpDigits = SuffixBegin - Ptr;
1463     if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
1464       llvm::StringRef ExpStr(Ptr, NumExpDigits);
1465       llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
1466       Exponent = ExpInt.getZExtValue();
1467     } else {
1468       ExpOverflowOccurred = true;
1469     }
1470
1471     if (NegativeExponent) BaseShift -= Exponent;
1472     else BaseShift += Exponent;
1473   }
1474
1475   // Number of bits needed for decimal literal is
1476   //   ceil(NumDigits * log2(10))       Integral part
1477   // + Scale                            Fractional part
1478   // + ceil(Exponent * log2(10))        Exponent
1479   // --------------------------------------------------
1480   //   ceil((NumDigits + Exponent) * log2(10)) + Scale
1481   //
1482   // But for simplicity in handling integers, we can round up log2(10) to 4,
1483   // making:
1484   // 4 * (NumDigits + Exponent) + Scale
1485   //
1486   // Number of digits needed for hexadecimal literal is
1487   //   4 * NumDigits                    Integral part
1488   // + Scale                            Fractional part
1489   // + Exponent                         Exponent
1490   // --------------------------------------------------
1491   //   (4 * NumDigits) + Scale + Exponent
1492   uint64_t NumBitsNeeded;
1493   if (radix == 10)
1494     NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
1495   else
1496     NumBitsNeeded = 4 * NumDigits + Exponent + Scale;
1497
1498   if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
1499     ExpOverflowOccurred = true;
1500   llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);
1501
1502   bool FoundDecimal = false;
1503
1504   int64_t FractBaseShift = 0;
1505   const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
1506   for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
1507     if (*Ptr == '.') {
1508       FoundDecimal = true;
1509       continue;
1510     }
1511
1512     // Normal reading of an integer
1513     unsigned C = llvm::hexDigitValue(*Ptr);
1514     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1515
1516     Val *= radix;
1517     Val += C;
1518
1519     if (FoundDecimal)
1520       // Keep track of how much we will need to adjust this value by from the
1521       // number of digits past the radix point.
1522       --FractBaseShift;
1523   }
1524
1525   // For a radix of 16, we will be multiplying by 2 instead of 16.
1526   if (radix == 16) FractBaseShift *= 4;
1527   BaseShift += FractBaseShift;
1528
1529   Val <<= Scale;
1530
1531   uint64_t Base = (radix == 16) ? 2 : 10;
1532   if (BaseShift > 0) {
1533     for (int64_t i = 0; i < BaseShift; ++i) {
1534       Val *= Base;
1535     }
1536   } else if (BaseShift < 0) {
1537     for (int64_t i = BaseShift; i < 0 && !Val.isZero(); ++i)
1538       Val = Val.udiv(Base);
1539   }
1540
1541   bool IntOverflowOccurred = false;
1542   auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
1543   if (Val.getBitWidth() > StoreVal.getBitWidth()) {
1544     IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth()));
1545     StoreVal = Val.trunc(StoreVal.getBitWidth());
1546   } else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
1547     IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
1548     StoreVal = Val.zext(StoreVal.getBitWidth());
1549   } else {
1550     StoreVal = Val;
1551   }
1552
1553   return IntOverflowOccurred || ExpOverflowOccurred;
1554 }
1555
1556 /// \verbatim
1557 ///       user-defined-character-literal: [C++11 lex.ext]
1558 ///         character-literal ud-suffix
1559 ///       ud-suffix:
1560 ///         identifier
1561 ///       character-literal: [C++11 lex.ccon]
1562 ///         ' c-char-sequence '
1563 ///         u' c-char-sequence '
1564 ///         U' c-char-sequence '
1565 ///         L' c-char-sequence '
1566 ///         u8' c-char-sequence ' [C++1z lex.ccon]
1567 ///       c-char-sequence:
1568 ///         c-char
1569 ///         c-char-sequence c-char
1570 ///       c-char:
1571 ///         any member of the source character set except the single-quote ',
1572 ///           backslash \, or new-line character
1573 ///         escape-sequence
1574 ///         universal-character-name
1575 ///       escape-sequence:
1576 ///         simple-escape-sequence
1577 ///         octal-escape-sequence
1578 ///         hexadecimal-escape-sequence
1579 ///       simple-escape-sequence:
1580 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1581 ///       octal-escape-sequence:
1582 ///         \ octal-digit
1583 ///         \ octal-digit octal-digit
1584 ///         \ octal-digit octal-digit octal-digit
1585 ///       hexadecimal-escape-sequence:
1586 ///         \x hexadecimal-digit
1587 ///         hexadecimal-escape-sequence hexadecimal-digit
1588 ///       universal-character-name: [C++11 lex.charset]
1589 ///         \u hex-quad
1590 ///         \U hex-quad hex-quad
1591 ///       hex-quad:
1592 ///         hex-digit hex-digit hex-digit hex-digit
1593 /// \endverbatim
1594 ///
1595 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
1596                                      SourceLocation Loc, Preprocessor &PP,
1597                                      tok::TokenKind kind) {
1598   // At this point we know that the character matches the regex "(L|u|U)?'.*'".
1599   HadError = false;
1600
1601   Kind = kind;
1602
1603   const char *TokBegin = begin;
1604
1605   // Skip over wide character determinant.
1606   if (Kind != tok::char_constant)
1607     ++begin;
1608   if (Kind == tok::utf8_char_constant)
1609     ++begin;
1610
1611   // Skip over the entry quote.
1612   if (begin[0] != '\'') {
1613     PP.Diag(Loc, diag::err_lexing_char);
1614     HadError = true;
1615     return;
1616   }
1617
1618   ++begin;
1619
1620   // Remove an optional ud-suffix.
1621   if (end[-1] != '\'') {
1622     const char *UDSuffixEnd = end;
1623     do {
1624       --end;
1625     } while (end[-1] != '\'');
1626     // FIXME: Don't bother with this if !tok.hasUCN().
1627     expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
1628     UDSuffixOffset = end - TokBegin;
1629   }
1630
1631   // Trim the ending quote.
1632   assert(end != begin && "Invalid token lexed");
1633   --end;
1634
1635   // FIXME: The "Value" is an uint64_t so we can handle char literals of
1636   // up to 64-bits.
1637   // FIXME: This extensively assumes that 'char' is 8-bits.
1638   assert(PP.getTargetInfo().getCharWidth() == 8 &&
1639          "Assumes char is 8 bits");
1640   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
1641          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
1642          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
1643   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
1644          "Assumes sizeof(wchar) on target is <= 64");
1645
1646   SmallVector<uint32_t, 4> codepoint_buffer;
1647   codepoint_buffer.resize(end - begin);
1648   uint32_t *buffer_begin = &codepoint_buffer.front();
1649   uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
1650
1651   // Unicode escapes representing characters that cannot be correctly
1652   // represented in a single code unit are disallowed in character literals
1653   // by this implementation.
1654   uint32_t largest_character_for_kind;
1655   if (tok::wide_char_constant == Kind) {
1656     largest_character_for_kind =
1657         0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
1658   } else if (tok::utf8_char_constant == Kind) {
1659     largest_character_for_kind = 0x7F;
1660   } else if (tok::utf16_char_constant == Kind) {
1661     largest_character_for_kind = 0xFFFF;
1662   } else if (tok::utf32_char_constant == Kind) {
1663     largest_character_for_kind = 0x10FFFF;
1664   } else {
1665     largest_character_for_kind = 0x7Fu;
1666   }
1667
1668   while (begin != end) {
1669     // Is this a span of non-escape characters?
1670     if (begin[0] != '\\') {
1671       char const *start = begin;
1672       do {
1673         ++begin;
1674       } while (begin != end && *begin != '\\');
1675
1676       char const *tmp_in_start = start;
1677       uint32_t *tmp_out_start = buffer_begin;
1678       llvm::ConversionResult res =
1679           llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
1680                              reinterpret_cast<llvm::UTF8 const *>(begin),
1681                              &buffer_begin, buffer_end, llvm::strictConversion);
1682       if (res != llvm::conversionOK) {
1683         // If we see bad encoding for unprefixed character literals, warn and
1684         // simply copy the byte values, for compatibility with gcc and
1685         // older versions of clang.
1686         bool NoErrorOnBadEncoding = isOrdinary();
1687         unsigned Msg = diag::err_bad_character_encoding;
1688         if (NoErrorOnBadEncoding)
1689           Msg = diag::warn_bad_character_encoding;
1690         PP.Diag(Loc, Msg);
1691         if (NoErrorOnBadEncoding) {
1692           start = tmp_in_start;
1693           buffer_begin = tmp_out_start;
1694           for (; start != begin; ++start, ++buffer_begin)
1695             *buffer_begin = static_cast<uint8_t>(*start);
1696         } else {
1697           HadError = true;
1698         }
1699       } else {
1700         for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
1701           if (*tmp_out_start > largest_character_for_kind) {
1702             HadError = true;
1703             PP.Diag(Loc, diag::err_character_too_large);
1704           }
1705         }
1706       }
1707
1708       continue;
1709     }
1710     // Is this a Universal Character Name escape?
1711     if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
1712       unsigned short UcnLen = 0;
1713       if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
1714                             FullSourceLoc(Loc, PP.getSourceManager()),
1715                             &PP.getDiagnostics(), PP.getLangOpts(), true)) {
1716         HadError = true;
1717       } else if (*buffer_begin > largest_character_for_kind) {
1718         HadError = true;
1719         PP.Diag(Loc, diag::err_character_too_large);
1720       }
1721
1722       ++buffer_begin;
1723       continue;
1724     }
1725     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
1726     uint64_t result =
1727       ProcessCharEscape(TokBegin, begin, end, HadError,
1728                         FullSourceLoc(Loc,PP.getSourceManager()),
1729                         CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
1730     *buffer_begin++ = result;
1731   }
1732
1733   unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
1734
1735   if (NumCharsSoFar > 1) {
1736     if (isOrdinary() && NumCharsSoFar == 4)
1737       PP.Diag(Loc, diag::warn_four_char_character_literal);
1738     else if (isOrdinary())
1739       PP.Diag(Loc, diag::warn_multichar_character_literal);
1740     else {
1741       PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1);
1742       HadError = true;
1743     }
1744     IsMultiChar = true;
1745   } else {
1746     IsMultiChar = false;
1747   }
1748
1749   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1750
1751   // Narrow character literals act as though their value is concatenated
1752   // in this implementation, but warn on overflow.
1753   bool multi_char_too_long = false;
1754   if (isOrdinary() && isMultiChar()) {
1755     LitVal = 0;
1756     for (size_t i = 0; i < NumCharsSoFar; ++i) {
1757       // check for enough leading zeros to shift into
1758       multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
1759       LitVal <<= 8;
1760       LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1761     }
1762   } else if (NumCharsSoFar > 0) {
1763     // otherwise just take the last character
1764     LitVal = buffer_begin[-1];
1765   }
1766
1767   if (!HadError && multi_char_too_long) {
1768     PP.Diag(Loc, diag::warn_char_constant_too_large);
1769   }
1770
1771   // Transfer the value from APInt to uint64_t
1772   Value = LitVal.getZExtValue();
1773
1774   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1775   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
1776   // character constants are not sign extended in this implementation:
1777   // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
1778   if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&
1779       PP.getLangOpts().CharIsSigned)
1780     Value = (signed char)Value;
1781 }
1782
1783 /// \verbatim
1784 ///       string-literal: [C++0x lex.string]
1785 ///         encoding-prefix " [s-char-sequence] "
1786 ///         encoding-prefix R raw-string
1787 ///       encoding-prefix:
1788 ///         u8
1789 ///         u
1790 ///         U
1791 ///         L
1792 ///       s-char-sequence:
1793 ///         s-char
1794 ///         s-char-sequence s-char
1795 ///       s-char:
1796 ///         any member of the source character set except the double-quote ",
1797 ///           backslash \, or new-line character
1798 ///         escape-sequence
1799 ///         universal-character-name
1800 ///       raw-string:
1801 ///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
1802 ///       r-char-sequence:
1803 ///         r-char
1804 ///         r-char-sequence r-char
1805 ///       r-char:
1806 ///         any member of the source character set, except a right parenthesis )
1807 ///           followed by the initial d-char-sequence (which may be empty)
1808 ///           followed by a double quote ".
1809 ///       d-char-sequence:
1810 ///         d-char
1811 ///         d-char-sequence d-char
1812 ///       d-char:
1813 ///         any member of the basic source character set except:
1814 ///           space, the left parenthesis (, the right parenthesis ),
1815 ///           the backslash \, and the control characters representing horizontal
1816 ///           tab, vertical tab, form feed, and newline.
1817 ///       escape-sequence: [C++0x lex.ccon]
1818 ///         simple-escape-sequence
1819 ///         octal-escape-sequence
1820 ///         hexadecimal-escape-sequence
1821 ///       simple-escape-sequence:
1822 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1823 ///       octal-escape-sequence:
1824 ///         \ octal-digit
1825 ///         \ octal-digit octal-digit
1826 ///         \ octal-digit octal-digit octal-digit
1827 ///       hexadecimal-escape-sequence:
1828 ///         \x hexadecimal-digit
1829 ///         hexadecimal-escape-sequence hexadecimal-digit
1830 ///       universal-character-name:
1831 ///         \u hex-quad
1832 ///         \U hex-quad hex-quad
1833 ///       hex-quad:
1834 ///         hex-digit hex-digit hex-digit hex-digit
1835 /// \endverbatim
1836 ///
1837 StringLiteralParser::
1838 StringLiteralParser(ArrayRef<Token> StringToks,
1839                     Preprocessor &PP)
1840   : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1841     Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
1842     MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1843     ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
1844   init(StringToks);
1845 }
1846
1847 void StringLiteralParser::init(ArrayRef<Token> StringToks){
1848   // The literal token may have come from an invalid source location (e.g. due
1849   // to a PCH error), in which case the token length will be 0.
1850   if (StringToks.empty() || StringToks[0].getLength() < 2)
1851     return DiagnoseLexingError(SourceLocation());
1852
1853   // Scan all of the string portions, remember the max individual token length,
1854   // computing a bound on the concatenated string length, and see whether any
1855   // piece is a wide-string.  If any of the string portions is a wide-string
1856   // literal, the result is a wide-string literal [C99 6.4.5p4].
1857   assert(!StringToks.empty() && "expected at least one token");
1858   MaxTokenLength = StringToks[0].getLength();
1859   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
1860   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
1861   Kind = StringToks[0].getKind();
1862
1863   hadError = false;
1864
1865   // Implement Translation Phase #6: concatenation of string literals
1866   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
1867   for (unsigned i = 1; i != StringToks.size(); ++i) {
1868     if (StringToks[i].getLength() < 2)
1869       return DiagnoseLexingError(StringToks[i].getLocation());
1870
1871     // The string could be shorter than this if it needs cleaning, but this is a
1872     // reasonable bound, which is all we need.
1873     assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
1874     SizeBound += StringToks[i].getLength()-2;  // -2 for "".
1875
1876     // Remember maximum string piece length.
1877     if (StringToks[i].getLength() > MaxTokenLength)
1878       MaxTokenLength = StringToks[i].getLength();
1879
1880     // Remember if we see any wide or utf-8/16/32 strings.
1881     // Also check for illegal concatenations.
1882     if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
1883       if (isOrdinary()) {
1884         Kind = StringToks[i].getKind();
1885       } else {
1886         if (Diags)
1887           Diags->Report(StringToks[i].getLocation(),
1888                         diag::err_unsupported_string_concat);
1889         hadError = true;
1890       }
1891     }
1892   }
1893
1894   // Include space for the null terminator.
1895   ++SizeBound;
1896
1897   // TODO: K&R warning: "traditional C rejects string constant concatenation"
1898
1899   // Get the width in bytes of char/wchar_t/char16_t/char32_t
1900   CharByteWidth = getCharWidth(Kind, Target);
1901   assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
1902   CharByteWidth /= 8;
1903
1904   // The output buffer size needs to be large enough to hold wide characters.
1905   // This is a worst-case assumption which basically corresponds to L"" "long".
1906   SizeBound *= CharByteWidth;
1907
1908   // Size the temporary buffer to hold the result string data.
1909   ResultBuf.resize(SizeBound);
1910
1911   // Likewise, but for each string piece.
1912   SmallString<512> TokenBuf;
1913   TokenBuf.resize(MaxTokenLength);
1914
1915   // Loop over all the strings, getting their spelling, and expanding them to
1916   // wide strings as appropriate.
1917   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
1918
1919   Pascal = false;
1920
1921   SourceLocation UDSuffixTokLoc;
1922
1923   for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
1924     const char *ThisTokBuf = &TokenBuf[0];
1925     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
1926     // that ThisTokBuf points to a buffer that is big enough for the whole token
1927     // and 'spelled' tokens can only shrink.
1928     bool StringInvalid = false;
1929     unsigned ThisTokLen =
1930       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
1931                          &StringInvalid);
1932     if (StringInvalid)
1933       return DiagnoseLexingError(StringToks[i].getLocation());
1934
1935     const char *ThisTokBegin = ThisTokBuf;
1936     const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
1937
1938     // Remove an optional ud-suffix.
1939     if (ThisTokEnd[-1] != '"') {
1940       const char *UDSuffixEnd = ThisTokEnd;
1941       do {
1942         --ThisTokEnd;
1943       } while (ThisTokEnd[-1] != '"');
1944
1945       StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
1946
1947       if (UDSuffixBuf.empty()) {
1948         if (StringToks[i].hasUCN())
1949           expandUCNs(UDSuffixBuf, UDSuffix);
1950         else
1951           UDSuffixBuf.assign(UDSuffix);
1952         UDSuffixToken = i;
1953         UDSuffixOffset = ThisTokEnd - ThisTokBuf;
1954         UDSuffixTokLoc = StringToks[i].getLocation();
1955       } else {
1956         SmallString<32> ExpandedUDSuffix;
1957         if (StringToks[i].hasUCN()) {
1958           expandUCNs(ExpandedUDSuffix, UDSuffix);
1959           UDSuffix = ExpandedUDSuffix;
1960         }
1961
1962         // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
1963         // result of a concatenation involving at least one user-defined-string-
1964         // literal, all the participating user-defined-string-literals shall
1965         // have the same ud-suffix.
1966         if (UDSuffixBuf != UDSuffix) {
1967           if (Diags) {
1968             SourceLocation TokLoc = StringToks[i].getLocation();
1969             Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
1970               << UDSuffixBuf << UDSuffix
1971               << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
1972               << SourceRange(TokLoc, TokLoc);
1973           }
1974           hadError = true;
1975         }
1976       }
1977     }
1978
1979     // Strip the end quote.
1980     --ThisTokEnd;
1981
1982     // TODO: Input character set mapping support.
1983
1984     // Skip marker for wide or unicode strings.
1985     if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
1986       ++ThisTokBuf;
1987       // Skip 8 of u8 marker for utf8 strings.
1988       if (ThisTokBuf[0] == '8')
1989         ++ThisTokBuf;
1990     }
1991
1992     // Check for raw string
1993     if (ThisTokBuf[0] == 'R') {
1994       if (ThisTokBuf[1] != '"') {
1995         // The file may have come from PCH and then changed after loading the
1996         // PCH; Fail gracefully.
1997         return DiagnoseLexingError(StringToks[i].getLocation());
1998       }
1999       ThisTokBuf += 2; // skip R"
2000
2001       // C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
2002       // characters.
2003       constexpr unsigned MaxRawStrDelimLen = 16;
2004
2005       const char *Prefix = ThisTokBuf;
2006       while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
2007              ThisTokBuf[0] != '(')
2008         ++ThisTokBuf;
2009       if (ThisTokBuf[0] != '(')
2010         return DiagnoseLexingError(StringToks[i].getLocation());
2011       ++ThisTokBuf; // skip '('
2012
2013       // Remove same number of characters from the end
2014       ThisTokEnd -= ThisTokBuf - Prefix;
2015       if (ThisTokEnd < ThisTokBuf)
2016         return DiagnoseLexingError(StringToks[i].getLocation());
2017
2018       // C++14 [lex.string]p4: A source-file new-line in a raw string literal
2019       // results in a new-line in the resulting execution string-literal.
2020       StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
2021       while (!RemainingTokenSpan.empty()) {
2022         // Split the string literal on \r\n boundaries.
2023         size_t CRLFPos = RemainingTokenSpan.find("\r\n");
2024         StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
2025         StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
2026
2027         // Copy everything before the \r\n sequence into the string literal.
2028         if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
2029           hadError = true;
2030
2031         // Point into the \n inside the \r\n sequence and operate on the
2032         // remaining portion of the literal.
2033         RemainingTokenSpan = AfterCRLF.substr(1);
2034       }
2035     } else {
2036       if (ThisTokBuf[0] != '"') {
2037         // The file may have come from PCH and then changed after loading the
2038         // PCH; Fail gracefully.
2039         return DiagnoseLexingError(StringToks[i].getLocation());
2040       }
2041       ++ThisTokBuf; // skip "
2042
2043       // Check if this is a pascal string
2044       if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
2045           ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
2046
2047         // If the \p sequence is found in the first token, we have a pascal string
2048         // Otherwise, if we already have a pascal string, ignore the first \p
2049         if (i == 0) {
2050           ++ThisTokBuf;
2051           Pascal = true;
2052         } else if (Pascal)
2053           ThisTokBuf += 2;
2054       }
2055
2056       while (ThisTokBuf != ThisTokEnd) {
2057         // Is this a span of non-escape characters?
2058         if (ThisTokBuf[0] != '\\') {
2059           const char *InStart = ThisTokBuf;
2060           do {
2061             ++ThisTokBuf;
2062           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
2063
2064           // Copy the character span over.
2065           if (CopyStringFragment(StringToks[i], ThisTokBegin,
2066                                  StringRef(InStart, ThisTokBuf - InStart)))
2067             hadError = true;
2068           continue;
2069         }
2070         // Is this a Universal Character Name escape?
2071         if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
2072             ThisTokBuf[1] == 'N') {
2073           EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
2074                           ResultPtr, hadError,
2075                           FullSourceLoc(StringToks[i].getLocation(), SM),
2076                           CharByteWidth, Diags, Features);
2077           continue;
2078         }
2079         // Otherwise, this is a non-UCN escape character.  Process it.
2080         unsigned ResultChar =
2081           ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
2082                             FullSourceLoc(StringToks[i].getLocation(), SM),
2083                             CharByteWidth*8, Diags, Features);
2084
2085         if (CharByteWidth == 4) {
2086           // FIXME: Make the type of the result buffer correct instead of
2087           // using reinterpret_cast.
2088           llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
2089           *ResultWidePtr = ResultChar;
2090           ResultPtr += 4;
2091         } else if (CharByteWidth == 2) {
2092           // FIXME: Make the type of the result buffer correct instead of
2093           // using reinterpret_cast.
2094           llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
2095           *ResultWidePtr = ResultChar & 0xFFFF;
2096           ResultPtr += 2;
2097         } else {
2098           assert(CharByteWidth == 1 && "Unexpected char width");
2099           *ResultPtr++ = ResultChar & 0xFF;
2100         }
2101       }
2102     }
2103   }
2104
2105   if (Pascal) {
2106     if (CharByteWidth == 4) {
2107       // FIXME: Make the type of the result buffer correct instead of
2108       // using reinterpret_cast.
2109       llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
2110       ResultWidePtr[0] = GetNumStringChars() - 1;
2111     } else if (CharByteWidth == 2) {
2112       // FIXME: Make the type of the result buffer correct instead of
2113       // using reinterpret_cast.
2114       llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
2115       ResultWidePtr[0] = GetNumStringChars() - 1;
2116     } else {
2117       assert(CharByteWidth == 1 && "Unexpected char width");
2118       ResultBuf[0] = GetNumStringChars() - 1;
2119     }
2120
2121     // Verify that pascal strings aren't too large.
2122     if (GetStringLength() > 256) {
2123       if (Diags)
2124         Diags->Report(StringToks.front().getLocation(),
2125                       diag::err_pascal_string_too_long)
2126           << SourceRange(StringToks.front().getLocation(),
2127                          StringToks.back().getLocation());
2128       hadError = true;
2129       return;
2130     }
2131   } else if (Diags) {
2132     // Complain if this string literal has too many characters.
2133     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
2134
2135     if (GetNumStringChars() > MaxChars)
2136       Diags->Report(StringToks.front().getLocation(),
2137                     diag::ext_string_too_long)
2138         << GetNumStringChars() << MaxChars
2139         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
2140         << SourceRange(StringToks.front().getLocation(),
2141                        StringToks.back().getLocation());
2142   }
2143 }
2144
2145 static const char *resyncUTF8(const char *Err, const char *End) {
2146   if (Err == End)
2147     return End;
2148   End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
2149   while (++Err != End && (*Err & 0xC0) == 0x80)
2150     ;
2151   return Err;
2152 }
2153
2154 /// This function copies from Fragment, which is a sequence of bytes
2155 /// within Tok's contents (which begin at TokBegin) into ResultPtr.
2156 /// Performs widening for multi-byte characters.
2157 bool StringLiteralParser::CopyStringFragment(const Token &Tok,
2158                                              const char *TokBegin,
2159                                              StringRef Fragment) {
2160   const llvm::UTF8 *ErrorPtrTmp;
2161   if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
2162     return false;
2163
2164   // If we see bad encoding for unprefixed string literals, warn and
2165   // simply copy the byte values, for compatibility with gcc and older
2166   // versions of clang.
2167   bool NoErrorOnBadEncoding = isOrdinary();
2168   if (NoErrorOnBadEncoding) {
2169     memcpy(ResultPtr, Fragment.data(), Fragment.size());
2170     ResultPtr += Fragment.size();
2171   }
2172
2173   if (Diags) {
2174     const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2175
2176     FullSourceLoc SourceLoc(Tok.getLocation(), SM);
2177     const DiagnosticBuilder &Builder =
2178       Diag(Diags, Features, SourceLoc, TokBegin,
2179            ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
2180            NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
2181                                 : diag::err_bad_string_encoding);
2182
2183     const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2184     StringRef NextFragment(NextStart, Fragment.end()-NextStart);
2185
2186     // Decode into a dummy buffer.
2187     SmallString<512> Dummy;
2188     Dummy.reserve(Fragment.size() * CharByteWidth);
2189     char *Ptr = Dummy.data();
2190
2191     while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
2192       const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2193       NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2194       Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
2195                                      ErrorPtr, NextStart);
2196       NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
2197     }
2198   }
2199   return !NoErrorOnBadEncoding;
2200 }
2201
2202 void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
2203   hadError = true;
2204   if (Diags)
2205     Diags->Report(Loc, diag::err_lexing_string);
2206 }
2207
2208 /// getOffsetOfStringByte - This function returns the offset of the
2209 /// specified byte of the string data represented by Token.  This handles
2210 /// advancing over escape sequences in the string.
2211 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
2212                                                     unsigned ByteNo) const {
2213   // Get the spelling of the token.
2214   SmallString<32> SpellingBuffer;
2215   SpellingBuffer.resize(Tok.getLength());
2216
2217   bool StringInvalid = false;
2218   const char *SpellingPtr = &SpellingBuffer[0];
2219   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
2220                                        &StringInvalid);
2221   if (StringInvalid)
2222     return 0;
2223
2224   const char *SpellingStart = SpellingPtr;
2225   const char *SpellingEnd = SpellingPtr+TokLen;
2226
2227   // Handle UTF-8 strings just like narrow strings.
2228   if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
2229     SpellingPtr += 2;
2230
2231   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
2232          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
2233
2234   // For raw string literals, this is easy.
2235   if (SpellingPtr[0] == 'R') {
2236     assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
2237     // Skip 'R"'.
2238     SpellingPtr += 2;
2239     while (*SpellingPtr != '(') {
2240       ++SpellingPtr;
2241       assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
2242     }
2243     // Skip '('.
2244     ++SpellingPtr;
2245     return SpellingPtr - SpellingStart + ByteNo;
2246   }
2247
2248   // Skip over the leading quote
2249   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
2250   ++SpellingPtr;
2251
2252   // Skip over bytes until we find the offset we're looking for.
2253   while (ByteNo) {
2254     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
2255
2256     // Step over non-escapes simply.
2257     if (*SpellingPtr != '\\') {
2258       ++SpellingPtr;
2259       --ByteNo;
2260       continue;
2261     }
2262
2263     // Otherwise, this is an escape character.  Advance over it.
2264     bool HadError = false;
2265     if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
2266         SpellingPtr[1] == 'N') {
2267       const char *EscapePtr = SpellingPtr;
2268       unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
2269                                       1, Features, HadError);
2270       if (Len > ByteNo) {
2271         // ByteNo is somewhere within the escape sequence.
2272         SpellingPtr = EscapePtr;
2273         break;
2274       }
2275       ByteNo -= Len;
2276     } else {
2277       ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
2278                         FullSourceLoc(Tok.getLocation(), SM),
2279                         CharByteWidth*8, Diags, Features);
2280       --ByteNo;
2281     }
2282     assert(!HadError && "This method isn't valid on erroneous strings");
2283   }
2284
2285   return SpellingPtr-SpellingStart;
2286 }
2287
2288 /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
2289 /// suffixes as ud-suffixes, because the diagnostic experience is better if we
2290 /// treat it as an invalid suffix.
2291 bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
2292                                           StringRef Suffix) {
2293   return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
2294          Suffix == "sv";
2295 }