1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file implements the NumericLiteralParser, CharLiteralParser, and
11 // StringLiteralParser interfaces.
13 //===----------------------------------------------------------------------===//
15 #include "clang/Lex/LiteralSupport.h"
16 #include "clang/Basic/CharInfo.h"
17 #include "clang/Basic/TargetInfo.h"
18 #include "clang/Lex/LexDiagnostic.h"
19 #include "clang/Lex/Preprocessor.h"
20 #include "llvm/ADT/StringExtras.h"
21 #include "llvm/Support/ConvertUTF.h"
22 #include "llvm/Support/ErrorHandling.h"
24 using namespace clang
;
/// Select a character width for a literal token kind by querying \p Target.
/// Ordinary and UTF-8 char/string kinds use Target.getCharWidth(), wide kinds
/// use Target.getWCharWidth(), UTF-16 kinds use Target.getChar16Width() and
/// UTF-32 kinds use Target.getChar32Width(). Any other kind reaches
/// llvm_unreachable("Unknown token type!").
26 static unsigned getCharWidth(tok::TokenKind kind
, const TargetInfo
&Target
) {
28 default: llvm_unreachable("Unknown token type!");
29 case tok::char_constant
:
30 case tok::string_literal
:
31 case tok::utf8_char_constant
:
32 case tok::utf8_string_literal
:
33 return Target
.getCharWidth();
34 case tok::wide_char_constant
:
35 case tok::wide_string_literal
:
36 return Target
.getWCharWidth();
37 case tok::utf16_char_constant
:
38 case tok::utf16_string_literal
:
39 return Target
.getChar16Width();
40 case tok::utf32_char_constant
:
41 case tok::utf32_string_literal
:
42 return Target
.getChar32Width();
/// Build a character source range for a sub-span of a token's spelling.
/// Begin is TokLoc advanced by (TokRangeBegin - TokBegin) characters; a second
/// AdvanceToTokenCharacter call advances from Begin by
/// (TokRangeEnd - TokRangeBegin) characters. The result is
/// CharSourceRange::getCharRange(Begin, End).
/// NOTE(review): the assignment of the second location to End is not visible
/// here; presumably End receives that second result.
46 static CharSourceRange
MakeCharSourceRange(const LangOptions
&Features
,
49 const char *TokRangeBegin
,
50 const char *TokRangeEnd
) {
51 SourceLocation Begin
=
52 Lexer::AdvanceToTokenCharacter(TokLoc
, TokRangeBegin
- TokBegin
,
53 TokLoc
.getManager(), Features
);
55 Lexer::AdvanceToTokenCharacter(Begin
, TokRangeEnd
- TokRangeBegin
,
56 TokLoc
.getManager(), Features
);
57 return CharSourceRange::getCharRange(Begin
, End
);
60 /// \brief Produce a diagnostic highlighting some portion of a literal.
62 /// Emits the diagnostic \p DiagID, highlighting the range of characters from
63 /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
64 /// a substring of a spelling buffer for the token beginning at \p TokBegin.
///
/// The diagnostic is reported at the location of \p TokRangeBegin and the
/// range produced by MakeCharSourceRange() is streamed into it. \p Diags is
/// dereferenced unconditionally, so callers must pass a non-null engine.
65 static DiagnosticBuilder
Diag(DiagnosticsEngine
*Diags
,
66 const LangOptions
&Features
, FullSourceLoc TokLoc
,
67 const char *TokBegin
, const char *TokRangeBegin
,
68 const char *TokRangeEnd
, unsigned DiagID
) {
69 SourceLocation Begin
=
70 Lexer::AdvanceToTokenCharacter(TokLoc
, TokRangeBegin
- TokBegin
,
71 TokLoc
.getManager(), Features
);
72 return Diags
->Report(Begin
, DiagID
) <<
73 MakeCharSourceRange(Features
, TokLoc
, TokBegin
, TokRangeBegin
, TokRangeEnd
);
76 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
77 /// either a character or a string literal.
///
/// \p ThisTokBuf is taken by reference and is advanced as the escape is read.
/// Hex and octal values that do not fit in \p CharWidth bits (when CharWidth
/// is not 32) are masked down to their low CharWidth bits. Diagnostics go
/// through the file-local Diag() helper, highlighting the span from the start
/// of the escape (EscapeBegin) to the current position.
78 static unsigned ProcessCharEscape(const char *ThisTokBegin
,
79 const char *&ThisTokBuf
,
80 const char *ThisTokEnd
, bool &HadError
,
81 FullSourceLoc Loc
, unsigned CharWidth
,
82 DiagnosticsEngine
*Diags
,
83 const LangOptions
&Features
) {
// Remember where the escape starts so diagnostics can highlight all of it.
84 const char *EscapeBegin
= ThisTokBuf
;
89 // We know that this character can't be off the end of the buffer, because
90 // that would have been \", which would not have been the end of string.
91 unsigned ResultChar
= *ThisTokBuf
++;
93 // These map to themselves.
94 case '\\': case '\'': case '"': case '?': break;
96 // These have fixed mappings.
98 // TODO: K&R: the meaning of '\\a' is different in traditional C
// 'e' and 'E' are reported as nonstandard escapes (ext_nonstandard_escape).
106 Diag(Diags
, Features
, Loc
, ThisTokBegin
, EscapeBegin
, ThisTokBuf
,
107 diag::ext_nonstandard_escape
) << "e";
112 Diag(Diags
, Features
, Loc
, ThisTokBegin
, EscapeBegin
, ThisTokBuf
,
113 diag::ext_nonstandard_escape
) << "E";
131 case 'x': { // Hex escape.
// "\x" must be followed by at least one hex digit.
133 if (ThisTokBuf
== ThisTokEnd
|| !isHexDigit(*ThisTokBuf
)) {
135 Diag(Diags
, Features
, Loc
, ThisTokBegin
, EscapeBegin
, ThisTokBuf
,
136 diag::err_hex_escape_no_digits
) << "x";
141 // Hex escapes are a maximal series of hex digits.
142 bool Overflow
= false;
143 for (; ThisTokBuf
!= ThisTokEnd
; ++ThisTokBuf
) {
144 int CharVal
= llvm::hexDigitValue(ThisTokBuf
[0]);
// hexDigitValue yields -1 for a non-hex character, which ends the escape.
145 if (CharVal
== -1) break;
146 // About to shift out a digit?
// Overflow is recorded when the top nibble of ResultChar is already in use.
147 Overflow
|= (ResultChar
& 0xF0000000) ? true : false;
149 ResultChar
|= CharVal
;
152 // See if any bits will be truncated when evaluated as a character.
153 if (CharWidth
!= 32 && (ResultChar
>> CharWidth
) != 0) {
// Keep only the low CharWidth bits.
155 ResultChar
&= ~0U >> (32-CharWidth
);
158 // Check for overflow.
159 if (Overflow
&& Diags
) // Too many digits to fit in
160 Diag(Diags
, Features
, Loc
, ThisTokBegin
, EscapeBegin
, ThisTokBuf
,
161 diag::err_hex_escape_too_large
);
164 case '0': case '1': case '2': case '3':
165 case '4': case '5': case '6': case '7': {
170 // Octal escapes are a series of octal digits with maximum length 3.
171 // "\0123" is a two digit sequence equal to "\012" "3".
172 unsigned NumDigits
= 0;
175 ResultChar
|= *ThisTokBuf
++ - '0';
177 } while (ThisTokBuf
!= ThisTokEnd
&& NumDigits
< 3 &&
178 ThisTokBuf
[0] >= '0' && ThisTokBuf
[0] <= '7');
180 // Check for overflow. Reject '\777', but not L'\777'.
181 if (CharWidth
!= 32 && (ResultChar
>> CharWidth
) != 0) {
183 Diag(Diags
, Features
, Loc
, ThisTokBegin
, EscapeBegin
, ThisTokBuf
,
184 diag::err_octal_escape_too_large
);
// Keep only the low CharWidth bits.
185 ResultChar
&= ~0U >> (32-CharWidth
);
190 // Otherwise, these are not valid escapes.
191 case '(': case '{': case '[': case '%':
192 // GCC accepts these as extensions. We warn about them as such though.
194 Diag(Diags
, Features
, Loc
, ThisTokBegin
, EscapeBegin
, ThisTokBuf
,
195 diag::ext_nonstandard_escape
)
196 << std::string(1, ResultChar
);
// Unknown escapes: printable characters are shown as-is in the diagnostic,
// anything else is shown as "x" followed by its hexadecimal value.
202 if (isPrintable(ResultChar
))
203 Diag(Diags
, Features
, Loc
, ThisTokBegin
, EscapeBegin
, ThisTokBuf
,
204 diag::ext_unknown_escape
)
205 << std::string(1, ResultChar
);
207 Diag(Diags
, Features
, Loc
, ThisTokBegin
, EscapeBegin
, ThisTokBuf
,
208 diag::ext_unknown_escape
)
209 << "x" + llvm::utohexstr(ResultChar
);
/// Append the UTF-8 encoding of \p Codepoint to \p Str.
/// llvm::ConvertCodePointToUTF8 writes into ResultBuf and advances ResultPtr;
/// the bytes in [ResultBuf, ResultPtr) are then appended. Conversion failure
/// is only checked by an assert.
216 static void appendCodePoint(unsigned Codepoint
,
217 llvm::SmallVectorImpl
<char> &Str
) {
219 char *ResultPtr
= ResultBuf
;
220 bool Res
= llvm::ConvertCodePointToUTF8(Codepoint
, ResultPtr
);
222 assert(Res
&& "Unexpected conversion failure");
223 Str
.append(ResultBuf
, ResultPtr
);
/// Walk \p Input and append to \p Buf, expanding universal character names.
/// For each UCN the visible code asserts that the introducer is 'u' or 'U',
/// asserts that NumHexDigits characters remain, reads that many hex digits
/// (each asserted to be valid), and appends the resulting code point via
/// appendCodePoint().
/// NOTE(review): how NumHexDigits is chosen and how the digits are folded
/// into CodePoint is not visible here.
226 void clang::expandUCNs(SmallVectorImpl
<char> &Buf
, StringRef Input
) {
227 for (StringRef::iterator I
= Input
.begin(), E
= Input
.end(); I
!= E
; ++I
) {
234 assert(*I
== 'u' || *I
== 'U');
236 unsigned NumHexDigits
;
// The input must contain all of the digits the UCN requires.
242 assert(I
+ NumHexDigits
<= E
);
244 uint32_t CodePoint
= 0;
245 for (++I
; NumHexDigits
!= 0; ++I
, --NumHexDigits
) {
246 unsigned Value
= llvm::hexDigitValue(*I
);
// hexDigitValue returns -1U for a character that is not a hex digit.
247 assert(Value
!= -1U);
253 appendCodePoint(CodePoint
, Buf
);
258 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
259 /// return the UTF32.
///
/// \p ThisTokBuf is advanced past the digits that are consumed. \p UcnLen is
/// set to 4 for a 'u' escape and 8 for a 'U' escape. Surrogate code points
/// (0xD800-0xDFFF) and values above 0x10FFFF are diagnosed with
/// err_ucn_escape_invalid. Callers in this file treat a false return as
/// failure.
260 static bool ProcessUCNEscape(const char *ThisTokBegin
, const char *&ThisTokBuf
,
261 const char *ThisTokEnd
,
262 uint32_t &UcnVal
, unsigned short &UcnLen
,
263 FullSourceLoc Loc
, DiagnosticsEngine
*Diags
,
264 const LangOptions
&Features
,
265 bool in_char_string_literal
= false) {
// Remember the start of the UCN so diagnostics can highlight all of it.
266 const char *UcnBegin
= ThisTokBuf
;
268 // Skip the '\u' characters.
// At least one hex digit must follow the introducer.
271 if (ThisTokBuf
== ThisTokEnd
|| !isHexDigit(*ThisTokBuf
)) {
273 Diag(Diags
, Features
, Loc
, ThisTokBegin
, UcnBegin
, ThisTokBuf
,
274 diag::err_hex_escape_no_digits
) << StringRef(&ThisTokBuf
[-1], 1);
// ThisTokBuf[-1] is the introducer letter: 'u' takes 4 digits, otherwise 8.
277 UcnLen
= (ThisTokBuf
[-1] == 'u' ? 4 : 8);
// UcnLenSave counts down the digits still expected.
278 unsigned short UcnLenSave
= UcnLen
;
279 for (; ThisTokBuf
!= ThisTokEnd
&& UcnLenSave
; ++ThisTokBuf
, UcnLenSave
--) {
280 int CharVal
= llvm::hexDigitValue(ThisTokBuf
[0]);
281 if (CharVal
== -1) break;
285 // If we didn't consume the proper number of digits, there is a problem.
288 Diag(Diags
, Features
, Loc
, ThisTokBegin
, UcnBegin
, ThisTokBuf
,
289 diag::err_ucn_escape_incomplete
);
293 // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
294 if ((0xD800 <= UcnVal
&& UcnVal
<= 0xDFFF) || // surrogate codepoints
295 UcnVal
> 0x10FFFF) { // maximum legal UTF32 value
297 Diag(Diags
, Features
, Loc
, ThisTokBegin
, UcnBegin
, ThisTokBuf
,
298 diag::err_ucn_escape_invalid
);
302 // C++11 allows UCNs that refer to control characters and basic source
303 // characters inside character and string literals
305 (UcnVal
!= 0x24 && UcnVal
!= 0x40 && UcnVal
!= 0x60)) { // $, @, `
// Only C++11 character/string literals get the compatibility warning;
// everywhere else this is an error.
306 bool IsError
= (!Features
.CPlusPlus11
|| !in_char_string_literal
);
308 char BasicSCSChar
= UcnVal
;
// Values in [0x20, 0x7f) are reported with the character itself; the
// remaining values are reported as control characters.
309 if (UcnVal
>= 0x20 && UcnVal
< 0x7f)
310 Diag(Diags
, Features
, Loc
, ThisTokBegin
, UcnBegin
, ThisTokBuf
,
311 IsError
? diag::err_ucn_escape_basic_scs
:
312 diag::warn_cxx98_compat_literal_ucn_escape_basic_scs
)
313 << StringRef(&BasicSCSChar
, 1);
315 Diag(Diags
, Features
, Loc
, ThisTokBegin
, UcnBegin
, ThisTokBuf
,
316 IsError
? diag::err_ucn_control_character
:
317 diag::warn_cxx98_compat_literal_ucn_control_character
);
// Neither C++ nor C99: warn that the UCN is not valid in a C89 literal.
323 if (!Features
.CPlusPlus
&& !Features
.C99
&& Diags
)
324 Diag(Diags
, Features
, Loc
, ThisTokBegin
, UcnBegin
, ThisTokBuf
,
325 diag::warn_ucn_not_valid_in_c89_literal
);
330 /// MeasureUCNEscape - Determine the number of bytes within the resulting string
331 /// which this UCN will occupy.
///
/// The UCN is parsed with ProcessUCNEscape(), passing nullptr as the
/// diagnostics engine. For a 2-byte character width the result is 2 when the
/// value is at most 0xFFFF and 4 otherwise.
332 static int MeasureUCNEscape(const char *ThisTokBegin
, const char *&ThisTokBuf
,
333 const char *ThisTokEnd
, unsigned CharByteWidth
,
334 const LangOptions
&Features
, bool &HadError
) {
335 // UTF-32: 4 bytes per escape.
336 if (CharByteWidth
== 4)
340 unsigned short UcnLen
= 0;
343 if (!ProcessUCNEscape(ThisTokBegin
, ThisTokBuf
, ThisTokEnd
, UcnVal
,
344 UcnLen
, Loc
, nullptr, Features
, true)) {
349 // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
350 if (CharByteWidth
== 2)
351 return UcnVal
<= 0xFFFF ? 2 : 4;
358 if (UcnVal
< 0x10000)
363 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
364 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
365 /// StringLiteralParser. When we decide to implement UCN's for identifiers,
366 /// we will likely rework our support for UCN's.
///
/// \p CharByteWidth must be 1, 2 or 4 (asserted). Width 4 writes through a
/// UTF32 pointer, width 2 writes UTF-16 code units and width 1 writes UTF-8
/// bytes. \p ResultBuf is taken by reference; on the UTF-8 path it ends up
/// just past the bytes written.
367 static void EncodeUCNEscape(const char *ThisTokBegin
, const char *&ThisTokBuf
,
368 const char *ThisTokEnd
,
369 char *&ResultBuf
, bool &HadError
,
370 FullSourceLoc Loc
, unsigned CharByteWidth
,
371 DiagnosticsEngine
*Diags
,
372 const LangOptions
&Features
) {
373 typedef uint32_t UTF32
;
375 unsigned short UcnLen
= 0;
376 if (!ProcessUCNEscape(ThisTokBegin
, ThisTokBuf
, ThisTokEnd
, UcnVal
, UcnLen
,
377 Loc
, Diags
, Features
, true)) {
382 assert((CharByteWidth
== 1 || CharByteWidth
== 2 || CharByteWidth
== 4) &&
383 "only character widths of 1, 2, or 4 bytes supported");
386 assert((UcnLen
== 4 || UcnLen
== 8) && "only ucn length of 4 or 8 supported");
388 if (CharByteWidth
== 4) {
389 // FIXME: Make the type of the result buffer correct instead of
390 // using reinterpret_cast.
391 UTF32
*ResultPtr
= reinterpret_cast<UTF32
*>(ResultBuf
);
397 if (CharByteWidth
== 2) {
398 // FIXME: Make the type of the result buffer correct instead of
399 // using reinterpret_cast.
400 UTF16
*ResultPtr
= reinterpret_cast<UTF16
*>(ResultBuf
);
// Values up to 0xFFFF take the first branch; larger values are written as
// two code units based at 0xD800 and 0xDC00.
402 if (UcnVal
<= (UTF32
)0xFFFF) {
410 *ResultPtr
= 0xD800 + (UcnVal
>> 10);
411 *(ResultPtr
+1) = 0xDC00 + (UcnVal
& 0x3FF);
416 assert(CharByteWidth
== 1 && "UTF-8 encoding is only for 1 byte characters");
418 // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
419 // The conversion below was inspired by:
420 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
421 // First, we determine how many bytes the result will require.
422 typedef uint8_t UTF8
;
424 unsigned short bytesToWrite
= 0;
425 if (UcnVal
< (UTF32
)0x80)
427 else if (UcnVal
< (UTF32
)0x800)
429 else if (UcnVal
< (UTF32
)0x10000)
434 const unsigned byteMask
= 0xBF;
435 const unsigned byteMark
= 0x80;
437 // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
438 // into the first byte, depending on how many bytes follow.
439 static const UTF8 firstByteMark
[5] = {
440 0x00, 0x00, 0xC0, 0xE0, 0xF0
442 // Finally, we write the bytes into ResultBuf.
// Jump to the end of the encoded sequence, then store bytes back to front.
443 ResultBuf
+= bytesToWrite
;
444 switch (bytesToWrite
) { // note: everything falls through.
445 case 4: *--ResultBuf
= (UTF8
)((UcnVal
| byteMark
) & byteMask
); UcnVal
>>= 6;
446 case 3: *--ResultBuf
= (UTF8
)((UcnVal
| byteMark
) & byteMask
); UcnVal
>>= 6;
447 case 2: *--ResultBuf
= (UTF8
)((UcnVal
| byteMark
) & byteMask
); UcnVal
>>= 6;
448 case 1: *--ResultBuf
= (UTF8
) (UcnVal
| firstByteMark
[bytesToWrite
]);
450 // Update the buffer.
451 ResultBuf
+= bytesToWrite
;
455 /// integer-constant: [C99 6.4.4.1]
456 /// decimal-constant integer-suffix
457 /// octal-constant integer-suffix
458 /// hexadecimal-constant integer-suffix
459 /// binary-literal integer-suffix [GNU, C++1y]
460 /// user-defined-integer-literal: [C++11 lex.ext]
461 /// decimal-literal ud-suffix
462 /// octal-literal ud-suffix
463 /// hexadecimal-literal ud-suffix
464 /// binary-literal ud-suffix [GNU, C++1y]
465 /// decimal-constant:
467 /// decimal-constant digit
470 /// octal-constant octal-digit
471 /// hexadecimal-constant:
472 /// hexadecimal-prefix hexadecimal-digit
473 /// hexadecimal-constant hexadecimal-digit
474 /// hexadecimal-prefix: one of
479 /// binary-literal binary-digit
481 /// unsigned-suffix [long-suffix]
482 /// unsigned-suffix [long-long-suffix]
483 /// long-suffix [unsigned-suffix]
484 /// long-long-suffix [unsigned-suffix]
486 /// 1 2 3 4 5 6 7 8 9
489 /// hexadecimal-digit:
490 /// 0 1 2 3 4 5 6 7 8 9
496 /// unsigned-suffix: one of
498 /// long-suffix: one of
500 /// long-long-suffix: one of
503 /// floating-constant: [C99 6.4.4.2]
504 /// TODO: add rules...
/// Construct a parser over the spelling \p TokSpelling of a numeric token
/// located at \p TokLoc. The visible code resets the parse state, hands
/// spellings that start with '0' to ParseNumberStartingWithZero(), scans an
/// optional exponent, and then walks the suffix characters. Characters left
/// over after the suffix loop are either accepted as a ud-suffix (see
/// isValidUDSuffix) or diagnosed as an invalid suffix.
506 NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling
,
507 SourceLocation TokLoc
,
509 : PP(PP
), ThisTokBegin(TokSpelling
.begin()), ThisTokEnd(TokSpelling
.end()) {
511 // This routine assumes that the range begin/end matches the regex for integer
512 // and FP constants (specifically, the 'pp-number' regex), and assumes that
513 // the byte at "*end" is both valid and not part of the regex. Because of
514 // this, it doesn't have to check for 'overscan' in various places.
515 assert(!isPreprocessingNumberBody(*ThisTokEnd
) && "didn't maximally munch?");
517 s
= DigitsBegin
= ThisTokBegin
;
518 saw_exponent
= false;
520 saw_ud_suffix
= false;
526 MicrosoftInteger
= 0;
529 if (*s
== '0') { // parse radix
530 ParseNumberStartingWithZero(TokLoc
);
533 } else { // the first digit is non-zero
536 if (s
== ThisTokEnd
) {
// A hex digit other than 'e'/'E' cannot appear in a decimal literal.
538 } else if (isHexDigit(*s
) && !(*s
== 'e' || *s
== 'E')) {
539 PP
.Diag(PP
.AdvanceToTokenCharacter(TokLoc
, s
- ThisTokBegin
),
540 diag::err_invalid_decimal_digit
) << StringRef(s
, 1);
543 } else if (*s
== '.') {
544 checkSeparator(TokLoc
, s
, CSK_AfterDigits
);
547 checkSeparator(TokLoc
, s
, CSK_BeforeDigits
);
550 if ((*s
== 'e' || *s
== 'E')) { // exponent
551 checkSeparator(TokLoc
, s
, CSK_AfterDigits
);
// Remember where the exponent starts for the "no digits" diagnostic.
552 const char *Exponent
= s
;
555 if (*s
== '+' || *s
== '-') s
++; // sign
556 checkSeparator(TokLoc
, s
, CSK_BeforeDigits
);
557 const char *first_non_digit
= SkipDigits(s
);
558 if (first_non_digit
!= s
) {
561 PP
.Diag(PP
.AdvanceToTokenCharacter(TokLoc
, Exponent
- ThisTokBegin
),
562 diag::err_exponent_has_no_digits
);
570 checkSeparator(TokLoc
, s
, CSK_AfterDigits
);
572 // Parse the suffix. At this point we can classify whether we have an FP or
574 bool isFPConstant
= isFloatingLiteral();
575 const char *ImaginarySuffixLoc
= nullptr;
577 // Loop over all of the characters of the suffix. If we see something bad,
578 // we break out of the loop.
579 for (; s
!= ThisTokEnd
; ++s
) {
581 case 'f': // FP Suffix for "float"
583 if (!isFPConstant
) break; // Error for integer constant.
584 if (isFloat
|| isLong
) break; // FF, LF invalid.
586 continue; // Success.
589 if (isFPConstant
) break; // Error for floating constant.
590 if (isUnsigned
) break; // Cannot be repeated.
592 continue; // Success.
595 if (isLong
|| isLongLong
) break; // Cannot be repeated.
596 if (isFloat
) break; // LF invalid.
598 // Check for long long. The L's need to be adjacent and the same case.
599 if (s
+1 != ThisTokEnd
&& s
[1] == s
[0]) {
600 if (isFPConstant
) break; // long long invalid for floats.
602 ++s
; // Eat both of them.
606 continue; // Success.
609 if (PP
.getLangOpts().MicrosoftExt
) {
// A Microsoft integer suffix cannot be combined with a long/long long
// suffix or with another Microsoft integer suffix.
610 if (isLong
|| isLongLong
|| MicrosoftInteger
)
613 // Allow i8, i16, i32, i64, and i128.
614 if (s
+ 1 != ThisTokEnd
) {
617 if (isFPConstant
) break;
619 MicrosoftInteger
= 8;
622 if (isFPConstant
) break;
623 if (s
+ 2 == ThisTokEnd
) break;
625 s
+= 3; // i16 suffix
626 MicrosoftInteger
= 16;
628 else if (s
[2] == '2') {
629 if (s
+ 3 == ThisTokEnd
) break;
631 s
+= 4; // i128 suffix
632 MicrosoftInteger
= 128;
637 if (isFPConstant
) break;
638 if (s
+ 2 == ThisTokEnd
) break;
640 s
+= 3; // i32 suffix
641 MicrosoftInteger
= 32;
645 if (isFPConstant
) break;
646 if (s
+ 2 == ThisTokEnd
) break;
648 s
+= 3; // i64 suffix
649 MicrosoftInteger
= 64;
655 if (MicrosoftInteger
)
659 // "i", "if", and "il" are user-defined suffixes in C++1y.
660 if (PP
.getLangOpts().CPlusPlus14
&& *s
== 'i')
665 if (isImaginary
) break; // Cannot be repeated.
667 ImaginarySuffixLoc
= s
;
668 continue; // Success.
670 // If we reached here, there was an error or a ud-suffix.
674 if (s
!= ThisTokEnd
) {
675 // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
676 expandUCNs(UDSuffixBuf
, StringRef(SuffixBegin
, ThisTokEnd
- SuffixBegin
));
677 if (isValidUDSuffix(PP
.getLangOpts(), UDSuffixBuf
)) {
678 // Any suffix pieces we might have parsed are actually part of the
685 MicrosoftInteger
= 0;
687 saw_ud_suffix
= true;
691 // Report an error if there are any.
692 PP
.Diag(PP
.AdvanceToTokenCharacter(TokLoc
, SuffixBegin
- ThisTokBegin
),
693 isFPConstant
? diag::err_invalid_suffix_float_constant
:
694 diag::err_invalid_suffix_integer_constant
)
695 << StringRef(SuffixBegin
, ThisTokEnd
-SuffixBegin
);
// Imaginary constants are reported as an extension at the suffix location.
701 PP
.Diag(PP
.AdvanceToTokenCharacter(TokLoc
,
702 ImaginarySuffixLoc
- ThisTokBegin
),
703 diag::ext_imaginary_constant
);
707 /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
708 /// suffixes as ud-suffixes, because the diagnostic experience is better if we
709 /// treat it as an invalid suffix.
///
/// The checks run in order: C++11 must be enabled and the suffix non-empty;
/// a leading '_' is handled next; before C++14 there are no library suffixes;
/// finally the suffix is matched against the library suffixes listed below.
710 bool NumericLiteralParser::isValidUDSuffix(const LangOptions
&LangOpts
,
712 if (!LangOpts
.CPlusPlus11
|| Suffix
.empty())
715 // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
716 if (Suffix
[0] == '_')
719 // In C++11, there are no library suffixes.
720 if (!LangOpts
.CPlusPlus14
)
723 // In C++1y, "s", "h", "min", "ms", "us", and "ns" are used in the library.
724 // Per tweaked N3660, "il", "i", and "if" are also used in the library.
725 return llvm::StringSwitch
<bool>(Suffix
)
726 .Cases("h", "min", "s", true)
727 .Cases("ms", "us", "ns", true)
728 .Cases("il", "i", "if", true)
/// Diagnose err_digit_separator_not_between_digits when the character
/// examined at \p Pos is a digit separator. \p IsAfterDigits selects which
/// side of a digit sequence is being checked. The token boundaries
/// (Pos == ThisTokBegin when checking after digits, Pos == ThisTokEnd
/// otherwise) are special-cased before the check.
/// NOTE(review): the handling of those boundary cases, and any adjustment of
/// Pos before it is dereferenced, is not visible here.
732 void NumericLiteralParser::checkSeparator(SourceLocation TokLoc
,
734 CheckSeparatorKind IsAfterDigits
) {
735 if (IsAfterDigits
== CSK_AfterDigits
) {
736 if (Pos
== ThisTokBegin
)
739 } else if (Pos
== ThisTokEnd
)
742 if (isDigitSeparator(*Pos
))
743 PP
.Diag(PP
.AdvanceToTokenCharacter(TokLoc
, Pos
- ThisTokBegin
),
744 diag::err_digit_separator_not_between_digits
)
748 /// ParseNumberStartingWithZero - This method is called when the first character
749 /// of the number is found to be a zero. This means it is either an octal
750 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
751 /// a floating point number (01239.123e4). Eat the prefix, determining the
///
/// The visible code handles three shapes in order: a hex literal ("0x"/"0X"
/// followed by a hex digit or '.'), a binary literal ("0b"/"0B" followed by
/// '0' or '1'), and otherwise an octal literal that may turn out to be a
/// decimal floating-point literal.
753 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc
) {
754 assert(s
[0] == '0' && "Invalid method call");
760 // Handle a hex number like 0x1234.
761 if ((c1
== 'x' || c1
== 'X') && (isHexDigit(c2
) || c2
== '.')) {
765 s
= SkipHexDigits(s
);
// True when no hex digits were consumed before a possible '.'.
766 bool noSignificand
= (s
== DigitsBegin
);
767 if (s
== ThisTokEnd
) {
769 } else if (*s
== '.') {
772 const char *floatDigitsBegin
= s
;
773 checkSeparator(TokLoc
, s
, CSK_BeforeDigits
);
774 s
= SkipHexDigits(s
);
// Stays true only if there were no digits after the '.' either.
775 noSignificand
&= (floatDigitsBegin
== s
);
779 PP
.Diag(PP
.AdvanceToTokenCharacter(TokLoc
, s
- ThisTokBegin
),
780 diag::err_hexconstant_requires_digits
);
785 // A binary exponent can appear with or without a '.'. If dotted, the
786 // binary exponent is required.
787 if (*s
== 'p' || *s
== 'P') {
788 checkSeparator(TokLoc
, s
, CSK_AfterDigits
);
// Remember where the exponent starts for the "no digits" diagnostic.
789 const char *Exponent
= s
;
792 if (*s
== '+' || *s
== '-') s
++; // sign
793 const char *first_non_digit
= SkipDigits(s
);
794 if (first_non_digit
== s
) {
795 PP
.Diag(PP
.AdvanceToTokenCharacter(TokLoc
, Exponent
-ThisTokBegin
),
796 diag::err_exponent_has_no_digits
);
800 checkSeparator(TokLoc
, s
, CSK_BeforeDigits
);
803 if (!PP
.getLangOpts().HexFloats
)
804 PP
.Diag(TokLoc
, diag::ext_hexconstant_invalid
);
805 } else if (saw_period
) {
806 PP
.Diag(PP
.AdvanceToTokenCharacter(TokLoc
, s
-ThisTokBegin
),
807 diag::err_hexconstant_requires_exponent
);
813 // Handle simple binary numbers 0b01010
814 if ((c1
== 'b' || c1
== 'B') && (c2
== '0' || c2
== '1')) {
815 // 0b101010 is a C++1y / GCC extension.
817 PP
.getLangOpts().CPlusPlus14
818 ? diag::warn_cxx11_compat_binary_literal
819 : PP
.getLangOpts().CPlusPlus
820 ? diag::ext_binary_literal_cxx14
821 : diag::ext_binary_literal
);
825 s
= SkipBinaryDigits(s
);
826 if (s
== ThisTokEnd
) {
// Any hex digit found right after the binary digits is an invalid binary
// digit.
828 } else if (isHexDigit(*s
)) {
829 PP
.Diag(PP
.AdvanceToTokenCharacter(TokLoc
, s
-ThisTokBegin
),
830 diag::err_invalid_binary_digit
) << StringRef(s
, 1);
833 // Other suffixes will be diagnosed by the caller.
837 // For now, the radix is set to 8. If we discover that we have a
838 // floating point constant, the radix will change to 10. Octal floating
839 // point constants are not permitted (only decimal and hexadecimal).
842 s
= SkipOctalDigits(s
);
844 return; // Done, simple octal number like 01234
846 // If we have some other non-octal digit that *is* a decimal digit, see if
847 // this is part of a floating point number like 094.123 or 09e1.
849 const char *EndDecimal
= SkipDigits(s
);
850 if (EndDecimal
[0] == '.' || EndDecimal
[0] == 'e' || EndDecimal
[0] == 'E') {
856 // If we have a hex digit other than 'e' (which denotes a FP exponent) then
857 // the code is using an incorrect base.
858 if (isHexDigit(*s
) && *s
!= 'e' && *s
!= 'E') {
859 PP
.Diag(PP
.AdvanceToTokenCharacter(TokLoc
, s
-ThisTokBegin
),
860 diag::err_invalid_octal_digit
) << StringRef(s
, 1);
869 checkSeparator(TokLoc
, s
, CSK_BeforeDigits
);
870 s
= SkipDigits(s
); // Skip suffix.
872 if (*s
== 'e' || *s
== 'E') { // exponent
873 checkSeparator(TokLoc
, s
, CSK_AfterDigits
);
// Remember where the exponent starts for the "no digits" diagnostic.
874 const char *Exponent
= s
;
878 if (*s
== '+' || *s
== '-') s
++; // sign
879 const char *first_non_digit
= SkipDigits(s
);
880 if (first_non_digit
!= s
) {
881 checkSeparator(TokLoc
, s
, CSK_BeforeDigits
);
884 PP
.Diag(PP
.AdvanceToTokenCharacter(TokLoc
, Exponent
-ThisTokBegin
),
885 diag::err_exponent_has_no_digits
);
/// Conservative digit-count test used by GetIntegerValue's fast path: returns
/// whether \p NumDigits digits in \p Radix can never overflow 64 bits.
/// The visible bounds are 64 digits, 64/3 digits, 19 digits and 64/4 digits.
/// NOTE(review): the case labels are not visible here; presumably these
/// bounds belong to radix 2, 8, 10 and 16 respectively. Any other radix
/// reaches llvm_unreachable.
892 static bool alwaysFitsInto64Bits(unsigned Radix
, unsigned NumDigits
) {
895 return NumDigits
<= 64;
897 return NumDigits
<= 64 / 3; // Digits are groups of 3 bits.
899 return NumDigits
<= 19; // floor(log10(2^64))
901 return NumDigits
<= 64 / 4; // Digits are groups of 4 bits.
903 llvm_unreachable("impossible Radix");
907 /// GetIntegerValue - Convert this numeric literal value to an APInt that
908 /// matches Val's input width. If there is an overflow, set Val to the low bits
909 /// of the result and return true. Otherwise, return false.
///
/// Digit separators between DigitsBegin and SuffixBegin are skipped on both
/// the fast path and the slow path.
910 bool NumericLiteralParser::GetIntegerValue(llvm::APInt
&Val
) {
911 // Fast path: Compute a conservative bound on the maximum number of
912 // bits per digit in this radix. If we can't possibly overflow a
913 // uint64 based on that bound then do the simple conversion to
914 // integer. This avoids the expensive overflow checking below, and
915 // handles the common cases that matter (small decimal integers and
916 // hex/octal values which don't overflow).
// NumDigits counts every character in the digit span, separators included,
// so it can only overestimate the real digit count.
917 const unsigned NumDigits
= SuffixBegin
- DigitsBegin
;
918 if (alwaysFitsInto64Bits(radix
, NumDigits
)) {
920 for (const char *Ptr
= DigitsBegin
; Ptr
!= SuffixBegin
; ++Ptr
)
921 if (!isDigitSeparator(*Ptr
))
922 N
= N
* radix
+ llvm::hexDigitValue(*Ptr
);
924 // This will truncate the value to Val's input width. Simply check
925 // for overflow by comparing.
927 return Val
.getZExtValue() != N
;
// Slow path: accumulate digit by digit in Val's width, checking each step.
931 const char *Ptr
= DigitsBegin
;
933 llvm::APInt
RadixVal(Val
.getBitWidth(), radix
);
934 llvm::APInt
CharVal(Val
.getBitWidth(), 0);
935 llvm::APInt OldVal
= Val
;
937 bool OverflowOccurred
= false;
938 while (Ptr
< SuffixBegin
) {
939 if (isDigitSeparator(*Ptr
)) {
944 unsigned C
= llvm::hexDigitValue(*Ptr
++);
946 // If this letter is out of bound for this radix, reject it.
947 assert(C
< radix
&& "NumericLiteralParser ctor should have rejected this");
951 // Add the digit to the value in the appropriate radix. If adding in digits
952 // made the value smaller, then this overflowed.
955 // Multiply by radix, did overflow occur on the multiply?
// Dividing the product by the radix must give back the previous value.
957 OverflowOccurred
|= Val
.udiv(RadixVal
) != OldVal
;
959 // Add value, did overflow occur on the value?
960 // (a + b) ult b <=> overflow
962 OverflowOccurred
|= Val
.ult(CharVal
);
964 return OverflowOccurred
;
/// Convert the literal's text to \p Result with
/// APFloat::rmNearestTiesToEven rounding and return the conversion status.
/// The text converted starts at ThisTokBegin and has length
/// min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin). When it
/// contains a ' character, it is first filtered into Buffer with
/// std::remove_copy_if.
/// NOTE(review): the filter predicate and the rebinding of Str to Buffer are
/// not visible here; presumably digit separators are removed.
967 llvm::APFloat::opStatus
968 NumericLiteralParser::GetFloatValue(llvm::APFloat
&Result
) {
971 unsigned n
= std::min(SuffixBegin
- ThisTokBegin
, ThisTokEnd
- ThisTokBegin
);
973 llvm::SmallString
<16> Buffer
;
974 StringRef
Str(ThisTokBegin
, n
);
975 if (Str
.find('\'') != StringRef::npos
) {
977 std::remove_copy_if(Str
.begin(), Str
.end(), std::back_inserter(Buffer
),
982 return Result
.convertFromString(Str
, APFloat::rmNearestTiesToEven
);
987 /// user-defined-character-literal: [C++11 lex.ext]
988 /// character-literal ud-suffix
991 /// character-literal: [C++11 lex.ccon]
992 /// ' c-char-sequence '
993 /// u' c-char-sequence '
994 /// U' c-char-sequence '
995 /// L' c-char-sequence '
998 /// c-char-sequence c-char
1000 /// any member of the source character set except the single-quote ',
1001 /// backslash \, or new-line character
1003 /// universal-character-name
1004 /// escape-sequence:
1005 /// simple-escape-sequence
1006 /// octal-escape-sequence
1007 /// hexadecimal-escape-sequence
1008 /// simple-escape-sequence:
1009 /// one of \' \" \? \\ \a \b \f \n \r \t \v
1010 /// octal-escape-sequence:
1012 /// \ octal-digit octal-digit
1013 /// \ octal-digit octal-digit octal-digit
1014 /// hexadecimal-escape-sequence:
1015 /// \x hexadecimal-digit
1016 /// hexadecimal-escape-sequence hexadecimal-digit
1017 /// universal-character-name: [C++11 lex.charset]
1019 /// \U hex-quad hex-quad
1021 /// hex-digit hex-digit hex-digit hex-digit
1024 CharLiteralParser::CharLiteralParser(const char *begin
, const char *end
,
1025 SourceLocation Loc
, Preprocessor
&PP
,
1026 tok::TokenKind kind
) {
1027 // At this point we know that the character matches the regex "(L|u|U)?'.*'".
1032 const char *TokBegin
= begin
;
1034 // Skip over wide character determinant.
1035 if (Kind
!= tok::char_constant
)
1037 if (Kind
== tok::utf8_char_constant
)
1040 // Skip over the entry quote.
1041 assert(begin
[0] == '\'' && "Invalid token lexed");
1044 // Remove an optional ud-suffix.
1045 if (end
[-1] != '\'') {
1046 const char *UDSuffixEnd
= end
;
1049 } while (end
[-1] != '\'');
1050 // FIXME: Don't bother with this if !tok.hasUCN().
1051 expandUCNs(UDSuffixBuf
, StringRef(end
, UDSuffixEnd
- end
));
1052 UDSuffixOffset
= end
- TokBegin
;
1055 // Trim the ending quote.
1056 assert(end
!= begin
&& "Invalid token lexed");
1059 // FIXME: The "Value" is an uint64_t so we can handle char literals of
1061 // FIXME: This extensively assumes that 'char' is 8-bits.
1062 assert(PP
.getTargetInfo().getCharWidth() == 8 &&
1063 "Assumes char is 8 bits");
1064 assert(PP
.getTargetInfo().getIntWidth() <= 64 &&
1065 (PP
.getTargetInfo().getIntWidth() & 7) == 0 &&
1066 "Assumes sizeof(int) on target is <= 64 and a multiple of char");
1067 assert(PP
.getTargetInfo().getWCharWidth() <= 64 &&
1068 "Assumes sizeof(wchar) on target is <= 64");
1070 SmallVector
<uint32_t, 4> codepoint_buffer
;
1071 codepoint_buffer
.resize(end
- begin
);
1072 uint32_t *buffer_begin
= &codepoint_buffer
.front();
1073 uint32_t *buffer_end
= buffer_begin
+ codepoint_buffer
.size();
1075 // Unicode escapes representing characters that cannot be correctly
1076 // represented in a single code unit are disallowed in character literals
1077 // by this implementation.
1078 uint32_t largest_character_for_kind
;
1079 if (tok::wide_char_constant
== Kind
) {
1080 largest_character_for_kind
=
1081 0xFFFFFFFFu
>> (32-PP
.getTargetInfo().getWCharWidth());
1082 } else if (tok::utf8_char_constant
== Kind
) {
1083 largest_character_for_kind
= 0x7F;
1084 } else if (tok::utf16_char_constant
== Kind
) {
1085 largest_character_for_kind
= 0xFFFF;
1086 } else if (tok::utf32_char_constant
== Kind
) {
1087 largest_character_for_kind
= 0x10FFFF;
1089 largest_character_for_kind
= 0x7Fu
;
1092 while (begin
!= end
) {
1093 // Is this a span of non-escape characters?
1094 if (begin
[0] != '\\') {
1095 char const *start
= begin
;
1098 } while (begin
!= end
&& *begin
!= '\\');
1100 char const *tmp_in_start
= start
;
1101 uint32_t *tmp_out_start
= buffer_begin
;
1102 ConversionResult res
=
1103 ConvertUTF8toUTF32(reinterpret_cast<UTF8
const **>(&start
),
1104 reinterpret_cast<UTF8
const *>(begin
),
1105 &buffer_begin
, buffer_end
, strictConversion
);
1106 if (res
!= conversionOK
) {
1107 // If we see bad encoding for unprefixed character literals, warn and
1108 // simply copy the byte values, for compatibility with gcc and
1109 // older versions of clang.
1110 bool NoErrorOnBadEncoding
= isAscii();
1111 unsigned Msg
= diag::err_bad_character_encoding
;
1112 if (NoErrorOnBadEncoding
)
1113 Msg
= diag::warn_bad_character_encoding
;
1115 if (NoErrorOnBadEncoding
) {
1116 start
= tmp_in_start
;
1117 buffer_begin
= tmp_out_start
;
1118 for (; start
!= begin
; ++start
, ++buffer_begin
)
1119 *buffer_begin
= static_cast<uint8_t>(*start
);
1124 for (; tmp_out_start
< buffer_begin
; ++tmp_out_start
) {
1125 if (*tmp_out_start
> largest_character_for_kind
) {
1127 PP
.Diag(Loc
, diag::err_character_too_large
);
1134 // Is this a Universal Character Name escape?
1135 if (begin
[1] == 'u' || begin
[1] == 'U') {
1136 unsigned short UcnLen
= 0;
1137 if (!ProcessUCNEscape(TokBegin
, begin
, end
, *buffer_begin
, UcnLen
,
1138 FullSourceLoc(Loc
, PP
.getSourceManager()),
1139 &PP
.getDiagnostics(), PP
.getLangOpts(), true)) {
1141 } else if (*buffer_begin
> largest_character_for_kind
) {
1143 PP
.Diag(Loc
, diag::err_character_too_large
);
1149 unsigned CharWidth
= getCharWidth(Kind
, PP
.getTargetInfo());
1151 ProcessCharEscape(TokBegin
, begin
, end
, HadError
,
1152 FullSourceLoc(Loc
,PP
.getSourceManager()),
1153 CharWidth
, &PP
.getDiagnostics(), PP
.getLangOpts());
1154 *buffer_begin
++ = result
;
1157 unsigned NumCharsSoFar
= buffer_begin
- &codepoint_buffer
.front();
1159 if (NumCharsSoFar
> 1) {
1161 PP
.Diag(Loc
, diag::warn_extraneous_char_constant
);
1162 else if (isAscii() && NumCharsSoFar
== 4)
1163 PP
.Diag(Loc
, diag::ext_four_char_character_literal
);
1165 PP
.Diag(Loc
, diag::ext_multichar_character_literal
);
1167 PP
.Diag(Loc
, diag::err_multichar_utf_character_literal
);
1170 IsMultiChar
= false;
1173 llvm::APInt
LitVal(PP
.getTargetInfo().getIntWidth(), 0);
1175 // Narrow character literals act as though their value is concatenated
1176 // in this implementation, but warn on overflow.
1177 bool multi_char_too_long
= false;
1178 if (isAscii() && isMultiChar()) {
1180 for (size_t i
= 0; i
< NumCharsSoFar
; ++i
) {
1181 // check for enough leading zeros to shift into
1182 multi_char_too_long
|= (LitVal
.countLeadingZeros() < 8);
1184 LitVal
= LitVal
+ (codepoint_buffer
[i
] & 0xFF);
1186 } else if (NumCharsSoFar
> 0) {
1187 // otherwise just take the last character
1188 LitVal
= buffer_begin
[-1];
1191 if (!HadError
&& multi_char_too_long
) {
1192 PP
.Diag(Loc
, diag::warn_char_constant_too_large
);
1195 // Transfer the value from APInt to uint64_t
1196 Value
= LitVal
.getZExtValue();
1198 // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1199 // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
1200 // character constants are not sign extended in this implementation:
1201 // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
1202 if (isAscii() && NumCharsSoFar
== 1 && (Value
& 128) &&
1203 PP
.getLangOpts().CharIsSigned
)
1204 Value
= (signed char)Value
;
1208 /// string-literal: [C++0x lex.string]
1209 ///   encoding-prefix " [s-char-sequence] "
1210 ///   encoding-prefix R raw-string
1211 /// encoding-prefix:
1212 ///   u8
1213 ///   u
1214 ///   U
1215 ///   L
1216 /// s-char-sequence:
1217 ///   s-char
1218 ///   s-char-sequence s-char
1219 /// s-char:
1220 ///   any member of the source character set except the double-quote ",
1221 ///     backslash \, or new-line character
1222 ///   escape-sequence
1223 ///   universal-character-name
1224 /// raw-string:
1225 ///   " d-char-sequence ( r-char-sequence ) d-char-sequence "
1226 /// r-char-sequence:
1227 ///   r-char
1228 ///   r-char-sequence r-char
1229 /// r-char:
1230 ///   any member of the source character set, except a right parenthesis )
1231 ///     followed by the initial d-char-sequence (which may be empty)
1232 ///     followed by a double quote ".
1233 /// d-char-sequence:
1234 ///   d-char
1235 ///   d-char-sequence d-char
1236 /// d-char:
1237 ///   any member of the basic source character set except:
1238 ///     space, the left parenthesis (, the right parenthesis ),
1239 ///     the backslash \, and the control characters representing horizontal
1240 ///     tab, vertical tab, form feed, and newline.
1241 /// escape-sequence: [C++0x lex.ccon]
1242 ///   simple-escape-sequence
1243 ///   octal-escape-sequence
1244 ///   hexadecimal-escape-sequence
1245 /// simple-escape-sequence:
1246 ///   one of \' \" \? \\ \a \b \f \n \r \t \v
1247 /// octal-escape-sequence:
1248 ///   \ octal-digit
1249 ///   \ octal-digit octal-digit
1250 ///   \ octal-digit octal-digit octal-digit
1251 /// hexadecimal-escape-sequence:
1252 ///   \x hexadecimal-digit
1253 ///   hexadecimal-escape-sequence hexadecimal-digit
1254 /// universal-character-name:
1255 ///   \u hex-quad
1256 ///   \U hex-quad hex-quad
1257 /// hex-quad:
1258 ///   hex-digit hex-digit hex-digit hex-digit
///
/// Construct a parser for the string tokens \p StringToks, taking the source
/// manager, language options and target info from \p PP.
///
/// \param StringToks the string literal tokens to be processed.
/// \param PP         preprocessor supplying SM, Features and Target.
/// \param Complain   when false, Diags is initialised to null instead of
///                   pointing at the diagnostics engine of \p PP.
///
/// NOTE(review): the constructor body is not visible in this listing;
/// presumably it hands StringToks to init() -- confirm.
1261 StringLiteralParser::
1262 StringLiteralParser(ArrayRef
<Token
> StringToks
,
1263 Preprocessor
&PP
, bool Complain
)
// Bind the collaborating objects first; Diags is null when not complaining.
1264 : SM(PP
.getSourceManager()), Features(PP
.getLangOpts()),
1265 Target(PP
.getTargetInfo()), Diags(Complain
? &PP
.getDiagnostics() :nullptr),
// All size bookkeeping starts at zero and the literal kind is unknown until
// the tokens have been scanned.
1266 MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown
),
1267 ResultPtr(ResultBuf
.data()), hadError(false), Pascal(false) {
/// Concatenate the string tokens in \p StringToks into ResultBuf.
///
/// Works in two passes over the tokens. The first pass computes
/// MaxTokenLength, a SizeBound for the result and the combined literal Kind.
/// The second pass spells each token, peels off any ud-suffix and encoding
/// prefix, and copies the contents (plain character spans, UCNs and other
/// escapes) to ResultPtr at the character width chosen for Kind. Tokens that
/// cannot be treated as string literals are reported through
/// DiagnoseLexingError(), which also ends processing.
1271 void StringLiteralParser::init(ArrayRef
<Token
> StringToks
){
1272 // The literal token may have come from an invalid source location (e.g. due
1273 // to a PCH error), in which case the token length will be 0.
1274 if (StringToks
.empty() || StringToks
[0].getLength() < 2)
1275 return DiagnoseLexingError(SourceLocation());
1277 // Scan all of the string portions, remember the max individual token length,
1278 // computing a bound on the concatenated string length, and see whether any
1279 // piece is a wide-string. If any of the string portions is a wide-string
1280 // literal, the result is a wide-string literal [C99 6.4.5p4].
1281 assert(!StringToks
.empty() && "expected at least one token");
// Seed the running maximum, the size bound and the kind from the first token.
1282 MaxTokenLength
= StringToks
[0].getLength();
1283 assert(StringToks
[0].getLength() >= 2 && "literal token is invalid!");
1284 SizeBound
= StringToks
[0].getLength()-2; // -2 for "".
1285 Kind
= StringToks
[0].getKind();
1289 // Implement Translation Phase #6: concatenation of string literals
1290 // (C99 5.1.1.2p1). The common case is only one string fragment.
1291 for (unsigned i
= 1; i
!= StringToks
.size(); ++i
) {
1292 if (StringToks
[i
].getLength() < 2)
1293 return DiagnoseLexingError(StringToks
[i
].getLocation());
1295 // The string could be shorter than this if it needs cleaning, but this is a
1296 // reasonable bound, which is all we need.
1297 assert(StringToks
[i
].getLength() >= 2 && "literal token is invalid!");
1298 SizeBound
+= StringToks
[i
].getLength()-2; // -2 for "".
1300 // Remember maximum string piece length.
1301 if (StringToks
[i
].getLength() > MaxTokenLength
)
1302 MaxTokenLength
= StringToks
[i
].getLength();
1304 // Remember if we see any wide or utf-8/16/32 strings.
1305 // Also check for illegal concatenations.
// Only tokens that differ from the current Kind and are not plain
// string_literal tokens are interesting here.
1306 if (StringToks
[i
].isNot(Kind
) && StringToks
[i
].isNot(tok::string_literal
)) {
1308 Kind
= StringToks
[i
].getKind();
1311 Diags
->Report(StringToks
[i
].getLocation(),
1312 diag::err_unsupported_string_concat
);
1318 // Include space for the null terminator.
1321 // TODO: K&R warning: "traditional C rejects string constant concatenation"
1323 // Get the width in bytes of char/wchar_t/char16_t/char32_t
// getCharWidth() forwards the matching Target.get*Width() value; the assert
// below checks that it is a whole number of 8-bit units.
// NOTE(review): later code treats CharByteWidth as a byte count (compared
// with 1/2/4 and multiplied by 8 for ProcessCharEscape); the statement doing
// that conversion is not visible in this listing -- confirm it follows the
// assert.
1324 CharByteWidth
= getCharWidth(Kind
, Target
);
1325 assert((CharByteWidth
& 7) == 0 && "Assumes character size is byte multiple");
1328 // The output buffer size needs to be large enough to hold wide characters.
1329 // This is a worst-case assumption which basically corresponds to L"" "long".
1330 SizeBound
*= CharByteWidth
;
1332 // Size the temporary buffer to hold the result string data.
1333 ResultBuf
.resize(SizeBound
);
1335 // Likewise, but for each string piece.
1336 SmallString
<512> TokenBuf
;
1337 TokenBuf
.resize(MaxTokenLength
);
1339 // Loop over all the strings, getting their spelling, and expanding them to
1340 // wide strings as appropriate.
1341 ResultPtr
= &ResultBuf
[0]; // Next byte to fill in.
// Location of the token that supplied the first ud-suffix; used when a later
// token carries a different suffix.
1345 SourceLocation UDSuffixTokLoc
;
1347 for (unsigned i
= 0, e
= StringToks
.size(); i
!= e
; ++i
) {
1348 const char *ThisTokBuf
= &TokenBuf
[0];
1349 // Get the spelling of the token, which eliminates trigraphs, etc. We know
1350 // that ThisTokBuf points to a buffer that is big enough for the whole token
1351 // and 'spelled' tokens can only shrink.
1352 bool StringInvalid
= false;
1353 unsigned ThisTokLen
=
1354 Lexer::getSpelling(StringToks
[i
], ThisTokBuf
, SM
, Features
,
1357 return DiagnoseLexingError(StringToks
[i
].getLocation());
// ThisTokBegin stays at the start of the spelling (used for diagnostics and
// offsets) while ThisTokBuf and ThisTokEnd are narrowed to the contents.
1359 const char *ThisTokBegin
= ThisTokBuf
;
1360 const char *ThisTokEnd
= ThisTokBuf
+ThisTokLen
;
1362 // Remove an optional ud-suffix.
1363 if (ThisTokEnd
[-1] != '"') {
1364 const char *UDSuffixEnd
= ThisTokEnd
;
// The loop ends with ThisTokEnd just past the closing double quote, so the
// range from ThisTokEnd to UDSuffixEnd is the suffix text.
1367 } while (ThisTokEnd
[-1] != '"');
1369 StringRef
UDSuffix(ThisTokEnd
, UDSuffixEnd
- ThisTokEnd
);
// First suffix seen: store it (with UCNs expanded when the token has any)
// together with its offset in the token and the token location.
1371 if (UDSuffixBuf
.empty()) {
1372 if (StringToks
[i
].hasUCN())
1373 expandUCNs(UDSuffixBuf
, UDSuffix
);
1375 UDSuffixBuf
.assign(UDSuffix
);
1377 UDSuffixOffset
= ThisTokEnd
- ThisTokBuf
;
1378 UDSuffixTokLoc
= StringToks
[i
].getLocation();
// A suffix is already recorded: expand this one the same way so that the two
// can be compared.
1380 SmallString
<32> ExpandedUDSuffix
;
1381 if (StringToks
[i
].hasUCN()) {
1382 expandUCNs(ExpandedUDSuffix
, UDSuffix
);
1383 UDSuffix
= ExpandedUDSuffix
;
1386 // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
1387 // result of a concatenation involving at least one user-defined-string-
1388 // literal, all the participating user-defined-string-literals shall
1389 // have the same ud-suffix.
1390 if (UDSuffixBuf
!= UDSuffix
) {
1392 SourceLocation TokLoc
= StringToks
[i
].getLocation();
// The diagnostic names both suffixes and highlights both tokens.
1393 Diags
->Report(TokLoc
, diag::err_string_concat_mixed_suffix
)
1394 << UDSuffixBuf
<< UDSuffix
1395 << SourceRange(UDSuffixTokLoc
, UDSuffixTokLoc
)
1396 << SourceRange(TokLoc
, TokLoc
);
1403 // Strip the end quote.
1406 // TODO: Input character set mapping support.
1408 // Skip marker for wide or unicode strings.
1409 if (ThisTokBuf
[0] == 'L' || ThisTokBuf
[0] == 'u' || ThisTokBuf
[0] == 'U') {
1411 // Skip 8 of u8 marker for utf8 strings.
1412 if (ThisTokBuf
[0] == '8')
1416 // Check for raw string
1417 if (ThisTokBuf
[0] == 'R') {
1418 ThisTokBuf
+= 2; // skip R"
// Prefix marks the start of the delimiter; scan forward to the opening
// parenthesis that ends it.
1420 const char *Prefix
= ThisTokBuf
;
1421 while (ThisTokBuf
[0] != '(')
1423 ++ThisTokBuf
; // skip '('
1425 // Remove same number of characters from the end
1426 ThisTokEnd
-= ThisTokBuf
- Prefix
;
1427 assert(ThisTokEnd
>= ThisTokBuf
&& "malformed raw string literal");
1429 // Copy the string over
1430 if (CopyStringFragment(StringToks
[i
], ThisTokBegin
,
1431 StringRef(ThisTokBuf
, ThisTokEnd
- ThisTokBuf
)))
1434 if (ThisTokBuf
[0] != '"') {
1435 // The file may have come from PCH and then changed after loading the
1436 // PCH; Fail gracefully.
1437 return DiagnoseLexingError(StringToks
[i
].getLocation());
1439 ++ThisTokBuf
; // skip "
1441 // Check if this is a pascal string
1442 if (Features
.PascalStrings
&& ThisTokBuf
+ 1 != ThisTokEnd
&&
1443 ThisTokBuf
[0] == '\\' && ThisTokBuf
[1] == 'p') {
1445 // If the \p sequence is found in the first token, we have a pascal string
1446 // Otherwise, if we already have a pascal string, ignore the first \p
// Consume the token contents, alternating between plain character spans and
// escape sequences.
1454 while (ThisTokBuf
!= ThisTokEnd
) {
1455 // Is this a span of non-escape characters?
1456 if (ThisTokBuf
[0] != '\\') {
1457 const char *InStart
= ThisTokBuf
;
1460 } while (ThisTokBuf
!= ThisTokEnd
&& ThisTokBuf
[0] != '\\');
1462 // Copy the character span over.
1463 if (CopyStringFragment(StringToks
[i
], ThisTokBegin
,
1464 StringRef(InStart
, ThisTokBuf
- InStart
)))
1468 // Is this a Universal Character Name escape?
1469 if (ThisTokBuf
[1] == 'u' || ThisTokBuf
[1] == 'U') {
1470 EncodeUCNEscape(ThisTokBegin
, ThisTokBuf
, ThisTokEnd
,
1471 ResultPtr
, hadError
,
1472 FullSourceLoc(StringToks
[i
].getLocation(), SM
),
1473 CharByteWidth
, Diags
, Features
);
1476 // Otherwise, this is a non-UCN escape character. Process it.
// The width argument is given in bits, hence CharByteWidth*8.
1477 unsigned ResultChar
=
1478 ProcessCharEscape(ThisTokBegin
, ThisTokBuf
, ThisTokEnd
, hadError
,
1479 FullSourceLoc(StringToks
[i
].getLocation(), SM
),
1480 CharByteWidth
*8, Diags
, Features
);
// Store the escaped value as one code unit of the selected width.
1482 if (CharByteWidth
== 4) {
1483 // FIXME: Make the type of the result buffer correct instead of
1484 // using reinterpret_cast.
1485 UTF32
*ResultWidePtr
= reinterpret_cast<UTF32
*>(ResultPtr
);
1486 *ResultWidePtr
= ResultChar
;
1488 } else if (CharByteWidth
== 2) {
1489 // FIXME: Make the type of the result buffer correct instead of
1490 // using reinterpret_cast.
1491 UTF16
*ResultWidePtr
= reinterpret_cast<UTF16
*>(ResultPtr
);
1492 *ResultWidePtr
= ResultChar
& 0xFFFF;
1495 assert(CharByteWidth
== 1 && "Unexpected char width");
1496 *ResultPtr
++ = ResultChar
& 0xFF;
// The first code unit of the result receives GetNumStringChars() - 1, written
// at the selected character width.
// NOTE(review): presumably this whole section runs only for pascal strings
// (see the size check below); the guarding condition is not visible in this
// listing -- confirm.
1503 if (CharByteWidth
== 4) {
1504 // FIXME: Make the type of the result buffer correct instead of
1505 // using reinterpret_cast.
1506 UTF32
*ResultWidePtr
= reinterpret_cast<UTF32
*>(ResultBuf
.data());
1507 ResultWidePtr
[0] = GetNumStringChars() - 1;
1508 } else if (CharByteWidth
== 2) {
1509 // FIXME: Make the type of the result buffer correct instead of
1510 // using reinterpret_cast.
1511 UTF16
*ResultWidePtr
= reinterpret_cast<UTF16
*>(ResultBuf
.data());
1512 ResultWidePtr
[0] = GetNumStringChars() - 1;
1514 assert(CharByteWidth
== 1 && "Unexpected char width");
1515 ResultBuf
[0] = GetNumStringChars() - 1;
1518 // Verify that pascal strings aren't too large.
1519 if (GetStringLength() > 256) {
1521 Diags
->Report(StringToks
.front().getLocation(),
1522 diag::err_pascal_string_too_long
)
1523 << SourceRange(StringToks
.front().getLocation(),
1524 StringToks
.back().getLocation());
1529 // Complain if this string literal has too many characters.
// Limits applied: 65536 characters for C++, 4095 for C99, 509 otherwise. The
// third diagnostic argument (2/1/0) encodes the same three-way choice.
1530 unsigned MaxChars
= Features
.CPlusPlus
? 65536 : Features
.C99
? 4095 : 509;
1532 if (GetNumStringChars() > MaxChars
)
1533 Diags
->Report(StringToks
.front().getLocation(),
1534 diag::ext_string_too_long
)
1535 << GetNumStringChars() << MaxChars
1536 << (Features
.CPlusPlus
? 2 : Features
.C99
? 1 : 0)
1537 << SourceRange(StringToks
.front().getLocation(),
1538 StringToks
.back().getLocation());
/// Find where decoding can resume after the ill-formed UTF-8 sequence that
/// starts at \p Err, looking no further than \p End.
///
/// The scan window is first shortened to the byte count that
/// getNumBytesForUTF8() reports for *Err, or to the bytes remaining before
/// \p End, whichever is smaller. Within that window the pointer steps over
/// bytes whose top two bits are 10 (the UTF-8 continuation byte pattern).
/// CopyStringFragment() uses the result both as the end of the highlighted
/// range and as the start of the next fragment to decode.
1542 static const char *resyncUTF8(const char *Err
, const char *End
) {
// Clamp End so the scan never leaves the sequence announced by the lead byte.
1545 End
= Err
+ std::min
<unsigned>(getNumBytesForUTF8(*Err
), End
-Err
);
// Always advance at least one byte, then keep going while the byte matches
// 10xxxxxx.
1546 while (++Err
!= End
&& (*Err
& 0xC0) == 0x80)
1551 /// \brief This function copies from Fragment, which is a sequence of bytes
1552 /// within Tok's contents (which begin at TokBegin) into ResultPtr.
1553 /// Performs widening for multi-byte characters.
///
/// If the UTF-8 conversion fails, unprefixed literals (isAscii()) have the raw
/// bytes of the fragment copied to ResultPtr and the problem is diagnosed with
/// warn_bad_string_encoding; every other kind is diagnosed with
/// err_bad_string_encoding. Each further ill-formed sequence found in the
/// fragment is added to that same diagnostic as an extra highlighted range.
///
/// \param Tok      token the fragment belongs to; supplies the location.
/// \param TokBegin start of the spelling of \p Tok, used to map byte
///                 positions to source ranges.
/// \param Fragment the bytes to convert and append.
/// \returns after a conversion failure, true exactly when the literal is not
/// an unprefixed one (the failure was an error rather than a warning).
/// NOTE(review): the return taken when the conversion succeeds is not visible
/// in this listing -- presumably false; confirm.
1554 bool StringLiteralParser::CopyStringFragment(const Token
&Tok
,
1555 const char *TokBegin
,
1556 StringRef Fragment
) {
// Receives the position of the first ill-formed sequence on failure.
1557 const UTF8
*ErrorPtrTmp
;
1558 if (ConvertUTF8toWide(CharByteWidth
, Fragment
, ResultPtr
, ErrorPtrTmp
))
1561 // If we see bad encoding for unprefixed string literals, warn and
1562 // simply copy the byte values, for compatibility with gcc and older
1563 // versions of clang.
1564 bool NoErrorOnBadEncoding
= isAscii();
1565 if (NoErrorOnBadEncoding
) {
1566 memcpy(ResultPtr
, Fragment
.data(), Fragment
.size());
1567 ResultPtr
+= Fragment
.size();
1571 const char *ErrorPtr
= reinterpret_cast<const char *>(ErrorPtrTmp
);
1573 FullSourceLoc
SourceLoc(Tok
.getLocation(), SM
);
// The first bad sequence, from ErrorPtr to its resync point, is the primary
// range of the diagnostic.
1574 const DiagnosticBuilder
&Builder
=
1575 Diag(Diags
, Features
, SourceLoc
, TokBegin
,
1576 ErrorPtr
, resyncUTF8(ErrorPtr
, Fragment
.end()),
1577 NoErrorOnBadEncoding
? diag::warn_bad_string_encoding
1578 : diag::err_bad_string_encoding
);
// Continue with whatever follows the first bad sequence.
1580 const char *NextStart
= resyncUTF8(ErrorPtr
, Fragment
.end());
1581 StringRef
NextFragment(NextStart
, Fragment
.end()-NextStart
);
1583 // Decode into a dummy buffer.
1584 SmallString
<512> Dummy
;
1585 Dummy
.reserve(Fragment
.size() * CharByteWidth
);
1586 char *Ptr
= Dummy
.data();
// Each failed conversion yields the next bad position: highlight it and retry
// from its resync point until the remainder converts cleanly.
1588 while (!ConvertUTF8toWide(CharByteWidth
, NextFragment
, Ptr
, ErrorPtrTmp
)) {
1589 const char *ErrorPtr
= reinterpret_cast<const char *>(ErrorPtrTmp
);
1590 NextStart
= resyncUTF8(ErrorPtr
, Fragment
.end());
1591 Builder
<< MakeCharSourceRange(Features
, SourceLoc
, TokBegin
,
1592 ErrorPtr
, NextStart
);
1593 NextFragment
= StringRef(NextStart
, Fragment
.end()-NextStart
);
1596 return !NoErrorOnBadEncoding
;
/// Report diag::err_lexing_string at \p Loc.
///
/// init() returns through this helper when a token cannot be processed as a
/// string literal, for example when its length is below 2 or its spelling
/// does not start with the expected double quote.
1599 void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc
) {
1602 Diags
->Report(Loc
, diag::err_lexing_string
);
1605 /// getOffsetOfStringByte - This function returns the offset of the
1606 /// specified byte of the string data represented by Token. This handles
1607 /// advancing over escape sequences in the string.
///
/// \param Tok    the string literal token to measure.
/// \param ByteNo index of the byte within the string data.
/// \returns an offset measured from the start of the spelling of \p Tok.
///
/// Only narrow and u8 literals are supported; a spelling that starts with
/// L, u (other than u8) or U trips the assert below. The method also asserts
/// that the string contains no erroneous escapes.
1608 unsigned StringLiteralParser::getOffsetOfStringByte(const Token
&Tok
,
1609 unsigned ByteNo
) const {
1610 // Get the spelling of the token.
1611 SmallString
<32> SpellingBuffer
;
1612 SpellingBuffer
.resize(Tok
.getLength());
1614 bool StringInvalid
= false;
1615 const char *SpellingPtr
= &SpellingBuffer
[0];
1616 unsigned TokLen
= Lexer::getSpelling(Tok
, SpellingPtr
, SM
, Features
,
// SpellingStart stays fixed so the final offset can be computed as a pointer
// difference; SpellingPtr is the moving cursor.
1621 const char *SpellingStart
= SpellingPtr
;
1622 const char *SpellingEnd
= SpellingPtr
+TokLen
;
1624 // Handle UTF-8 strings just like narrow strings.
1625 if (SpellingPtr
[0] == 'u' && SpellingPtr
[1] == '8')
1628 assert(SpellingPtr
[0] != 'L' && SpellingPtr
[0] != 'u' &&
1629 SpellingPtr
[0] != 'U' && "Doesn't handle wide or utf strings yet");
1631 // For raw string literals, this is easy.
1632 if (SpellingPtr
[0] == 'R') {
1633 assert(SpellingPtr
[1] == '"' && "Should be a raw string literal!");
// Walk to the opening parenthesis that ends the raw string delimiter.
1636 while (*SpellingPtr
!= '(') {
1638 assert(SpellingPtr
< SpellingEnd
&& "Missing ( for raw string literal");
// The requested byte sits ByteNo bytes beyond the cursor position.
1642 return SpellingPtr
- SpellingStart
+ ByteNo
;
1645 // Skip over the leading quote
1646 assert(SpellingPtr
[0] == '"' && "Should be a string literal!");
1649 // Skip over bytes until we find the offset we're looking for.
1651 assert(SpellingPtr
< SpellingEnd
&& "Didn't find byte offset!");
1653 // Step over non-escapes simply.
1654 if (*SpellingPtr
!= '\\') {
1660 // Otherwise, this is an escape character. Advance over it.
1661 bool HadError
= false;
1662 if (SpellingPtr
[1] == 'u' || SpellingPtr
[1] == 'U') {
// Remember where the escape starts so the cursor can be moved back if ByteNo
// falls inside the bytes produced by this UCN.
1663 const char *EscapePtr
= SpellingPtr
;
// MeasureUCNEscape is called with a character width of 1.
// NOTE(review): presumably Len is the number of bytes the UCN occupies in
// the narrow string data -- confirm against MeasureUCNEscape.
1664 unsigned Len
= MeasureUCNEscape(SpellingStart
, SpellingPtr
, SpellingEnd
,
1665 1, Features
, HadError
);
1667 // ByteNo is somewhere within the escape sequence.
1668 SpellingPtr
= EscapePtr
;
// Other escapes are stepped over with ProcessCharEscape, which advances
// SpellingPtr; the width argument is given in bits.
1673 ProcessCharEscape(SpellingStart
, SpellingPtr
, SpellingEnd
, HadError
,
1674 FullSourceLoc(Tok
.getLocation(), SM
),
1675 CharByteWidth
*8, Diags
, Features
);
1678 assert(!HadError
&& "This method isn't valid on erroneous strings");
1681 return SpellingPtr
-SpellingStart
;