1 //=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===---------------------------------------------------------------------===//
#include "llvm/Support/JSON.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Format.h"
#include <cerrno>
18 Value
&Object::operator[](const ObjectKey
&K
) {
19 return try_emplace(K
, nullptr).first
->getSecond();
21 Value
&Object::operator[](ObjectKey
&&K
) {
22 return try_emplace(std::move(K
), nullptr).first
->getSecond();
24 Value
*Object::get(StringRef K
) {
30 const Value
*Object::get(StringRef K
) const {
36 llvm::Optional
<std::nullptr_t
> Object::getNull(StringRef K
) const {
38 return V
->getAsNull();
41 llvm::Optional
<bool> Object::getBoolean(StringRef K
) const {
43 return V
->getAsBoolean();
46 llvm::Optional
<double> Object::getNumber(StringRef K
) const {
48 return V
->getAsNumber();
51 llvm::Optional
<int64_t> Object::getInteger(StringRef K
) const {
53 return V
->getAsInteger();
56 llvm::Optional
<llvm::StringRef
> Object::getString(StringRef K
) const {
58 return V
->getAsString();
61 const json::Object
*Object::getObject(StringRef K
) const {
63 return V
->getAsObject();
66 json::Object
*Object::getObject(StringRef K
) {
68 return V
->getAsObject();
71 const json::Array
*Object::getArray(StringRef K
) const {
73 return V
->getAsArray();
76 json::Array
*Object::getArray(StringRef K
) {
78 return V
->getAsArray();
81 bool operator==(const Object
&LHS
, const Object
&RHS
) {
82 if (LHS
.size() != RHS
.size())
84 for (const auto &L
: LHS
) {
85 auto R
= RHS
.find(L
.first
);
86 if (R
== RHS
.end() || L
.second
!= R
->second
)
92 Array::Array(std::initializer_list
<Value
> Elements
) {
93 V
.reserve(Elements
.size());
94 for (const Value
&V
: Elements
) {
95 emplace_back(nullptr);
96 back().moveFrom(std::move(V
));
100 Value::Value(std::initializer_list
<Value
> Elements
)
101 : Value(json::Array(Elements
)) {}
103 void Value::copyFrom(const Value
&M
) {
110 memcpy(Union
.buffer
, M
.Union
.buffer
, sizeof(Union
.buffer
));
113 create
<StringRef
>(M
.as
<StringRef
>());
116 create
<std::string
>(M
.as
<std::string
>());
119 create
<json::Object
>(M
.as
<json::Object
>());
122 create
<json::Array
>(M
.as
<json::Array
>());
127 void Value::moveFrom(const Value
&&M
) {
134 memcpy(Union
.buffer
, M
.Union
.buffer
, sizeof(Union
.buffer
));
137 create
<StringRef
>(M
.as
<StringRef
>());
140 create
<std::string
>(std::move(M
.as
<std::string
>()));
144 create
<json::Object
>(std::move(M
.as
<json::Object
>()));
148 create
<json::Array
>(std::move(M
.as
<json::Array
>()));
154 void Value::destroy() {
162 as
<StringRef
>().~StringRef();
165 as
<std::string
>().~basic_string();
168 as
<json::Object
>().~Object();
171 as
<json::Array
>().~Array();
176 bool operator==(const Value
&L
, const Value
&R
) {
177 if (L
.kind() != R
.kind())
181 return *L
.getAsNull() == *R
.getAsNull();
183 return *L
.getAsBoolean() == *R
.getAsBoolean();
185 return *L
.getAsNumber() == *R
.getAsNumber();
187 return *L
.getAsString() == *R
.getAsString();
189 return *L
.getAsArray() == *R
.getAsArray();
191 return *L
.getAsObject() == *R
.getAsObject();
193 llvm_unreachable("Unknown value kind");
197 // Simple recursive-descent JSON parser.
200 Parser(StringRef JSON
)
201 : Start(JSON
.begin()), P(JSON
.begin()), End(JSON
.end()) {}
205 if (isUTF8(StringRef(Start
, End
- Start
), &ErrOffset
))
207 P
= Start
+ ErrOffset
; // For line/column calculation.
208 return parseError("Invalid UTF-8 sequence");
211 bool parseValue(Value
&Out
);
217 return parseError("Text after end of document");
222 return std::move(*Err
);
226 void eatWhitespace() {
227 while (P
!= End
&& (*P
== ' ' || *P
== '\r' || *P
== '\n' || *P
== '\t'))
231 // On invalid syntax, parseX() functions return false and set Err.
232 bool parseNumber(char First
, Value
&Out
);
233 bool parseString(std::string
&Out
);
234 bool parseUnicode(std::string
&Out
);
235 bool parseError(const char *Msg
); // always returns false
237 char next() { return P
== End
? 0 : *P
++; }
238 char peek() { return P
== End
? 0 : *P
; }
239 static bool isNumber(char C
) {
240 return C
== '0' || C
== '1' || C
== '2' || C
== '3' || C
== '4' ||
241 C
== '5' || C
== '6' || C
== '7' || C
== '8' || C
== '9' ||
242 C
== 'e' || C
== 'E' || C
== '+' || C
== '-' || C
== '.';
246 const char *Start
, *P
, *End
;
249 bool Parser::parseValue(Value
&Out
) {
252 return parseError("Unexpected EOF");
253 switch (char C
= next()) {
254 // Bare null/true/false are easy - first char identifies them.
257 return (next() == 'u' && next() == 'l' && next() == 'l') ||
258 parseError("Invalid JSON value (null?)");
261 return (next() == 'r' && next() == 'u' && next() == 'e') ||
262 parseError("Invalid JSON value (true?)");
265 return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
266 parseError("Invalid JSON value (false?)");
269 if (parseString(S
)) {
277 Array
&A
= *Out
.getAsArray();
284 A
.emplace_back(nullptr);
285 if (!parseValue(A
.back()))
295 return parseError("Expected , or ] after array element");
301 Object
&O
= *Out
.getAsObject();
309 return parseError("Expected object key");
315 return parseError("Expected : after object key");
317 if (!parseValue(O
[std::move(K
)]))
327 return parseError("Expected , or } after object property");
333 return parseNumber(C
, Out
);
334 return parseError("Invalid JSON value");
338 bool Parser::parseNumber(char First
, Value
&Out
) {
339 // Read the number into a string. (Must be null-terminated for strto*).
342 while (isNumber(peek()))
345 // Try first to parse as integer, and if so preserve full 64 bits.
346 // strtoll returns long long >= 64 bits, so check it's in range too.
347 auto I
= std::strtoll(S
.c_str(), &End
, 10);
348 if (End
== S
.end() && I
>= std::numeric_limits
<int64_t>::min() &&
349 I
<= std::numeric_limits
<int64_t>::max()) {
// If it's not an integer, parse it as a double.
354 Out
= std::strtod(S
.c_str(), &End
);
355 return End
== S
.end() || parseError("Invalid JSON value (number?)");
358 bool Parser::parseString(std::string
&Out
) {
359 // leading quote was already consumed.
360 for (char C
= next(); C
!= '"'; C
= next()) {
361 if (LLVM_UNLIKELY(P
== End
))
362 return parseError("Unterminated string");
363 if (LLVM_UNLIKELY((C
& 0x1f) == C
))
364 return parseError("Control character in string");
365 if (LLVM_LIKELY(C
!= '\\')) {
369 // Handle escape sequence.
370 switch (C
= next()) {
392 if (!parseUnicode(Out
))
396 return parseError("Invalid escape sequence");
402 static void encodeUtf8(uint32_t Rune
, std::string
&Out
) {
404 Out
.push_back(Rune
& 0x7F);
405 } else if (Rune
< 0x800) {
406 uint8_t FirstByte
= 0xC0 | ((Rune
& 0x7C0) >> 6);
407 uint8_t SecondByte
= 0x80 | (Rune
& 0x3F);
408 Out
.push_back(FirstByte
);
409 Out
.push_back(SecondByte
);
410 } else if (Rune
< 0x10000) {
411 uint8_t FirstByte
= 0xE0 | ((Rune
& 0xF000) >> 12);
412 uint8_t SecondByte
= 0x80 | ((Rune
& 0xFC0) >> 6);
413 uint8_t ThirdByte
= 0x80 | (Rune
& 0x3F);
414 Out
.push_back(FirstByte
);
415 Out
.push_back(SecondByte
);
416 Out
.push_back(ThirdByte
);
417 } else if (Rune
< 0x110000) {
418 uint8_t FirstByte
= 0xF0 | ((Rune
& 0x1F0000) >> 18);
419 uint8_t SecondByte
= 0x80 | ((Rune
& 0x3F000) >> 12);
420 uint8_t ThirdByte
= 0x80 | ((Rune
& 0xFC0) >> 6);
421 uint8_t FourthByte
= 0x80 | (Rune
& 0x3F);
422 Out
.push_back(FirstByte
);
423 Out
.push_back(SecondByte
);
424 Out
.push_back(ThirdByte
);
425 Out
.push_back(FourthByte
);
427 llvm_unreachable("Invalid codepoint");
431 // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
432 // May parse several sequential escapes to ensure proper surrogate handling.
433 // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
434 // These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
435 bool Parser::parseUnicode(std::string
&Out
) {
// Invalid UTF is not a JSON error (RFC 8259§8.2). It gets replaced by U+FFFD.
437 auto Invalid
= [&] { Out
.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
438 // Decodes 4 hex digits from the stream into Out, returns false on error.
439 auto Parse4Hex
= [this](uint16_t &Out
) -> bool {
441 char Bytes
[] = {next(), next(), next(), next()};
442 for (unsigned char C
: Bytes
) {
443 if (!std::isxdigit(C
))
444 return parseError("Invalid \\u escape sequence");
446 Out
|= (C
> '9') ? (C
& ~0x20) - 'A' + 10 : (C
- '0');
450 uint16_t First
; // UTF-16 code unit from the first \u escape.
451 if (!Parse4Hex(First
))
454 // We loop to allow proper surrogate-pair error handling.
456 // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
457 if (LLVM_LIKELY(First
< 0xD800 || First
>= 0xE000)) {
458 encodeUtf8(First
, Out
);
462 // Case 2: it's an (unpaired) trailing surrogate.
463 if (LLVM_UNLIKELY(First
>= 0xDC00)) {
468 // Case 3: it's a leading surrogate. We expect a trailing one next.
469 // Case 3a: there's no trailing \u escape. Don't advance in the stream.
470 if (LLVM_UNLIKELY(P
+ 2 > End
|| *P
!= '\\' || *(P
+ 1) != 'u')) {
471 Invalid(); // Leading surrogate was unpaired.
476 if (!Parse4Hex(Second
))
478 // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
479 if (LLVM_UNLIKELY(Second
< 0xDC00 || Second
>= 0xE000)) {
480 Invalid(); // Leading surrogate was unpaired.
481 First
= Second
; // Second escape still needs to be processed.
484 // Case 3c: a valid surrogate pair encoding an astral codepoint.
485 encodeUtf8(0x10000 | ((First
- 0xD800) << 10) | (Second
- 0xDC00), Out
);
490 bool Parser::parseError(const char *Msg
) {
492 const char *StartOfLine
= Start
;
493 for (const char *X
= Start
; X
< P
; ++X
) {
500 llvm::make_unique
<ParseError
>(Msg
, Line
, P
- StartOfLine
, P
- Start
));
505 Expected
<Value
> parse(StringRef JSON
) {
512 return P
.takeError();
514 char ParseError::ID
= 0;
516 static std::vector
<const Object::value_type
*> sortedElements(const Object
&O
) {
517 std::vector
<const Object::value_type
*> Elements
;
518 for (const auto &E
: O
)
519 Elements
.push_back(&E
);
521 [](const Object::value_type
*L
, const Object::value_type
*R
) {
522 return L
->first
< R
->first
;
527 bool isUTF8(llvm::StringRef S
, size_t *ErrOffset
) {
528 // Fast-path for ASCII, which is valid UTF-8.
529 if (LLVM_LIKELY(isASCII(S
)))
532 const UTF8
*Data
= reinterpret_cast<const UTF8
*>(S
.data()), *Rest
= Data
;
533 if (LLVM_LIKELY(isLegalUTF8String(&Rest
, Data
+ S
.size())))
537 *ErrOffset
= Rest
- Data
;
541 std::string
fixUTF8(llvm::StringRef S
) {
542 // This isn't particularly efficient, but is only for error-recovery.
543 std::vector
<UTF32
> Codepoints(S
.size()); // 1 codepoint per byte suffices.
544 const UTF8
*In8
= reinterpret_cast<const UTF8
*>(S
.data());
545 UTF32
*Out32
= Codepoints
.data();
546 ConvertUTF8toUTF32(&In8
, In8
+ S
.size(), &Out32
, Out32
+ Codepoints
.size(),
548 Codepoints
.resize(Out32
- Codepoints
.data());
549 std::string
Res(4 * Codepoints
.size(), 0); // 4 bytes per codepoint suffice
550 const UTF32
*In32
= Codepoints
.data();
551 UTF8
*Out8
= reinterpret_cast<UTF8
*>(&Res
[0]);
552 ConvertUTF32toUTF8(&In32
, In32
+ Codepoints
.size(), &Out8
, Out8
+ Res
.size(),
554 Res
.resize(reinterpret_cast<char *>(Out8
) - Res
.data());
561 static void quote(llvm::raw_ostream
&OS
, llvm::StringRef S
) {
563 for (unsigned char C
: S
) {
564 if (C
== 0x22 || C
== 0x5C)
572 // A few characters are common enough to make short escapes worthwhile.
584 llvm::write_hex(OS
, C
, llvm::HexPrintStyle::Lower
, 4);
591 enum IndenterAction
{
598 // Prints JSON. The indenter can be used to control formatting.
599 template <typename Indenter
>
600 void llvm::json::Value::print(raw_ostream
&OS
, const Indenter
&I
) const {
606 OS
<< (as
<bool>() ? "true" : "false");
609 OS
<< format("%.*g", std::numeric_limits
<double>::max_digits10
,
616 quote(OS
, as
<StringRef
>());
619 quote(OS
, as
<std::string
>());
625 for (const auto *P
: sortedElements(as
<json::Object
>())) {
633 P
->second
.print(OS
, I
);
645 for (const auto &E
: as
<json::Array
>()) {
661 void llvm::format_provider
<llvm::json::Value
>::format(
662 const llvm::json::Value
&E
, raw_ostream
&OS
, StringRef Options
) {
663 if (Options
.empty()) {
667 unsigned IndentAmount
= 0;
668 if (Options
.getAsInteger(/*Radix=*/10, IndentAmount
))
669 llvm_unreachable("json::Value format options should be an integer");
670 unsigned IndentLevel
= 0;
671 E
.print(OS
, [&](IndenterAction A
) {
675 OS
.indent(IndentLevel
);
681 IndentLevel
+= IndentAmount
;
684 IndentLevel
-= IndentAmount
;
690 llvm::raw_ostream
&llvm::json::operator<<(raw_ostream
&OS
, const Value
&E
) {
691 E
.print(OS
, [](IndenterAction A
) { /*ignore*/ });