1 //=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===---------------------------------------------------------------------===//
9 #include "llvm/Support/JSON.h"
10 #include "llvm/Support/ConvertUTF.h"
11 #include "llvm/Support/Format.h"
17 Value
&Object::operator[](const ObjectKey
&K
) {
18 return try_emplace(K
, nullptr).first
->getSecond();
20 Value
&Object::operator[](ObjectKey
&&K
) {
21 return try_emplace(std::move(K
), nullptr).first
->getSecond();
23 Value
*Object::get(StringRef K
) {
29 const Value
*Object::get(StringRef K
) const {
35 llvm::Optional
<std::nullptr_t
> Object::getNull(StringRef K
) const {
37 return V
->getAsNull();
40 llvm::Optional
<bool> Object::getBoolean(StringRef K
) const {
42 return V
->getAsBoolean();
45 llvm::Optional
<double> Object::getNumber(StringRef K
) const {
47 return V
->getAsNumber();
50 llvm::Optional
<int64_t> Object::getInteger(StringRef K
) const {
52 return V
->getAsInteger();
55 llvm::Optional
<llvm::StringRef
> Object::getString(StringRef K
) const {
57 return V
->getAsString();
60 const json::Object
*Object::getObject(StringRef K
) const {
62 return V
->getAsObject();
65 json::Object
*Object::getObject(StringRef K
) {
67 return V
->getAsObject();
70 const json::Array
*Object::getArray(StringRef K
) const {
72 return V
->getAsArray();
75 json::Array
*Object::getArray(StringRef K
) {
77 return V
->getAsArray();
80 bool operator==(const Object
&LHS
, const Object
&RHS
) {
81 if (LHS
.size() != RHS
.size())
83 for (const auto &L
: LHS
) {
84 auto R
= RHS
.find(L
.first
);
85 if (R
== RHS
.end() || L
.second
!= R
->second
)
91 Array::Array(std::initializer_list
<Value
> Elements
) {
92 V
.reserve(Elements
.size());
93 for (const Value
&V
: Elements
) {
94 emplace_back(nullptr);
95 back().moveFrom(std::move(V
));
99 Value::Value(std::initializer_list
<Value
> Elements
)
100 : Value(json::Array(Elements
)) {}
102 void Value::copyFrom(const Value
&M
) {
109 memcpy(Union
.buffer
, M
.Union
.buffer
, sizeof(Union
.buffer
));
112 create
<StringRef
>(M
.as
<StringRef
>());
115 create
<std::string
>(M
.as
<std::string
>());
118 create
<json::Object
>(M
.as
<json::Object
>());
121 create
<json::Array
>(M
.as
<json::Array
>());
126 void Value::moveFrom(const Value
&&M
) {
133 memcpy(Union
.buffer
, M
.Union
.buffer
, sizeof(Union
.buffer
));
136 create
<StringRef
>(M
.as
<StringRef
>());
139 create
<std::string
>(std::move(M
.as
<std::string
>()));
143 create
<json::Object
>(std::move(M
.as
<json::Object
>()));
147 create
<json::Array
>(std::move(M
.as
<json::Array
>()));
153 void Value::destroy() {
161 as
<StringRef
>().~StringRef();
164 as
<std::string
>().~basic_string();
167 as
<json::Object
>().~Object();
170 as
<json::Array
>().~Array();
175 bool operator==(const Value
&L
, const Value
&R
) {
176 if (L
.kind() != R
.kind())
180 return *L
.getAsNull() == *R
.getAsNull();
182 return *L
.getAsBoolean() == *R
.getAsBoolean();
184 // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323
185 // The same integer must convert to the same double, per the standard.
186 // However we see 64-vs-80-bit precision comparisons with gcc-7 -O3 -m32.
187 // So we avoid floating point promotion for exact comparisons.
188 if (L
.Type
== Value::T_Integer
|| R
.Type
== Value::T_Integer
)
189 return L
.getAsInteger() == R
.getAsInteger();
190 return *L
.getAsNumber() == *R
.getAsNumber();
192 return *L
.getAsString() == *R
.getAsString();
194 return *L
.getAsArray() == *R
.getAsArray();
196 return *L
.getAsObject() == *R
.getAsObject();
198 llvm_unreachable("Unknown value kind");
202 // Simple recursive-descent JSON parser.
205 Parser(StringRef JSON
)
206 : Start(JSON
.begin()), P(JSON
.begin()), End(JSON
.end()) {}
210 if (isUTF8(StringRef(Start
, End
- Start
), &ErrOffset
))
212 P
= Start
+ ErrOffset
; // For line/column calculation.
213 return parseError("Invalid UTF-8 sequence");
216 bool parseValue(Value
&Out
);
222 return parseError("Text after end of document");
227 return std::move(*Err
);
231 void eatWhitespace() {
232 while (P
!= End
&& (*P
== ' ' || *P
== '\r' || *P
== '\n' || *P
== '\t'))
236 // On invalid syntax, parseX() functions return false and set Err.
237 bool parseNumber(char First
, Value
&Out
);
238 bool parseString(std::string
&Out
);
239 bool parseUnicode(std::string
&Out
);
240 bool parseError(const char *Msg
); // always returns false
242 char next() { return P
== End
? 0 : *P
++; }
243 char peek() { return P
== End
? 0 : *P
; }
244 static bool isNumber(char C
) {
245 return C
== '0' || C
== '1' || C
== '2' || C
== '3' || C
== '4' ||
246 C
== '5' || C
== '6' || C
== '7' || C
== '8' || C
== '9' ||
247 C
== 'e' || C
== 'E' || C
== '+' || C
== '-' || C
== '.';
251 const char *Start
, *P
, *End
;
254 bool Parser::parseValue(Value
&Out
) {
257 return parseError("Unexpected EOF");
258 switch (char C
= next()) {
259 // Bare null/true/false are easy - first char identifies them.
262 return (next() == 'u' && next() == 'l' && next() == 'l') ||
263 parseError("Invalid JSON value (null?)");
266 return (next() == 'r' && next() == 'u' && next() == 'e') ||
267 parseError("Invalid JSON value (true?)");
270 return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
271 parseError("Invalid JSON value (false?)");
274 if (parseString(S
)) {
282 Array
&A
= *Out
.getAsArray();
289 A
.emplace_back(nullptr);
290 if (!parseValue(A
.back()))
300 return parseError("Expected , or ] after array element");
306 Object
&O
= *Out
.getAsObject();
314 return parseError("Expected object key");
320 return parseError("Expected : after object key");
322 if (!parseValue(O
[std::move(K
)]))
332 return parseError("Expected , or } after object property");
338 return parseNumber(C
, Out
);
339 return parseError("Invalid JSON value");
343 bool Parser::parseNumber(char First
, Value
&Out
) {
344 // Read the number into a string. (Must be null-terminated for strto*).
347 while (isNumber(peek()))
350 // Try first to parse as integer, and if so preserve full 64 bits.
351 // strtoll returns long long >= 64 bits, so check it's in range too.
352 auto I
= std::strtoll(S
.c_str(), &End
, 10);
353 if (End
== S
.end() && I
>= std::numeric_limits
<int64_t>::min() &&
354 I
<= std::numeric_limits
<int64_t>::max()) {
358 // If it's not an integer
359 Out
= std::strtod(S
.c_str(), &End
);
360 return End
== S
.end() || parseError("Invalid JSON value (number?)");
363 bool Parser::parseString(std::string
&Out
) {
364 // leading quote was already consumed.
365 for (char C
= next(); C
!= '"'; C
= next()) {
366 if (LLVM_UNLIKELY(P
== End
))
367 return parseError("Unterminated string");
368 if (LLVM_UNLIKELY((C
& 0x1f) == C
))
369 return parseError("Control character in string");
370 if (LLVM_LIKELY(C
!= '\\')) {
374 // Handle escape sequence.
375 switch (C
= next()) {
397 if (!parseUnicode(Out
))
401 return parseError("Invalid escape sequence");
407 static void encodeUtf8(uint32_t Rune
, std::string
&Out
) {
409 Out
.push_back(Rune
& 0x7F);
410 } else if (Rune
< 0x800) {
411 uint8_t FirstByte
= 0xC0 | ((Rune
& 0x7C0) >> 6);
412 uint8_t SecondByte
= 0x80 | (Rune
& 0x3F);
413 Out
.push_back(FirstByte
);
414 Out
.push_back(SecondByte
);
415 } else if (Rune
< 0x10000) {
416 uint8_t FirstByte
= 0xE0 | ((Rune
& 0xF000) >> 12);
417 uint8_t SecondByte
= 0x80 | ((Rune
& 0xFC0) >> 6);
418 uint8_t ThirdByte
= 0x80 | (Rune
& 0x3F);
419 Out
.push_back(FirstByte
);
420 Out
.push_back(SecondByte
);
421 Out
.push_back(ThirdByte
);
422 } else if (Rune
< 0x110000) {
423 uint8_t FirstByte
= 0xF0 | ((Rune
& 0x1F0000) >> 18);
424 uint8_t SecondByte
= 0x80 | ((Rune
& 0x3F000) >> 12);
425 uint8_t ThirdByte
= 0x80 | ((Rune
& 0xFC0) >> 6);
426 uint8_t FourthByte
= 0x80 | (Rune
& 0x3F);
427 Out
.push_back(FirstByte
);
428 Out
.push_back(SecondByte
);
429 Out
.push_back(ThirdByte
);
430 Out
.push_back(FourthByte
);
432 llvm_unreachable("Invalid codepoint");
436 // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
437 // May parse several sequential escapes to ensure proper surrogate handling.
438 // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
439 // These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
440 bool Parser::parseUnicode(std::string
&Out
) {
441 // Invalid UTF is not a JSON error (RFC 8259 §8.2). It gets replaced by U+FFFD.
442 auto Invalid
= [&] { Out
.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
443 // Decodes 4 hex digits from the stream into Out, returns false on error.
444 auto Parse4Hex
= [this](uint16_t &Out
) -> bool {
446 char Bytes
[] = {next(), next(), next(), next()};
447 for (unsigned char C
: Bytes
) {
448 if (!std::isxdigit(C
))
449 return parseError("Invalid \\u escape sequence");
451 Out
|= (C
> '9') ? (C
& ~0x20) - 'A' + 10 : (C
- '0');
455 uint16_t First
; // UTF-16 code unit from the first \u escape.
456 if (!Parse4Hex(First
))
459 // We loop to allow proper surrogate-pair error handling.
461 // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
462 if (LLVM_LIKELY(First
< 0xD800 || First
>= 0xE000)) {
463 encodeUtf8(First
, Out
);
467 // Case 2: it's an (unpaired) trailing surrogate.
468 if (LLVM_UNLIKELY(First
>= 0xDC00)) {
473 // Case 3: it's a leading surrogate. We expect a trailing one next.
474 // Case 3a: there's no trailing \u escape. Don't advance in the stream.
475 if (LLVM_UNLIKELY(P
+ 2 > End
|| *P
!= '\\' || *(P
+ 1) != 'u')) {
476 Invalid(); // Leading surrogate was unpaired.
481 if (!Parse4Hex(Second
))
483 // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
484 if (LLVM_UNLIKELY(Second
< 0xDC00 || Second
>= 0xE000)) {
485 Invalid(); // Leading surrogate was unpaired.
486 First
= Second
; // Second escape still needs to be processed.
489 // Case 3c: a valid surrogate pair encoding an astral codepoint.
490 encodeUtf8(0x10000 | ((First
- 0xD800) << 10) | (Second
- 0xDC00), Out
);
495 bool Parser::parseError(const char *Msg
) {
497 const char *StartOfLine
= Start
;
498 for (const char *X
= Start
; X
< P
; ++X
) {
505 std::make_unique
<ParseError
>(Msg
, Line
, P
- StartOfLine
, P
- Start
));
510 Expected
<Value
> parse(StringRef JSON
) {
517 return P
.takeError();
519 char ParseError::ID
= 0;
521 static std::vector
<const Object::value_type
*> sortedElements(const Object
&O
) {
522 std::vector
<const Object::value_type
*> Elements
;
523 for (const auto &E
: O
)
524 Elements
.push_back(&E
);
526 [](const Object::value_type
*L
, const Object::value_type
*R
) {
527 return L
->first
< R
->first
;
532 bool isUTF8(llvm::StringRef S
, size_t *ErrOffset
) {
533 // Fast-path for ASCII, which is valid UTF-8.
534 if (LLVM_LIKELY(isASCII(S
)))
537 const UTF8
*Data
= reinterpret_cast<const UTF8
*>(S
.data()), *Rest
= Data
;
538 if (LLVM_LIKELY(isLegalUTF8String(&Rest
, Data
+ S
.size())))
542 *ErrOffset
= Rest
- Data
;
546 std::string
fixUTF8(llvm::StringRef S
) {
547 // This isn't particularly efficient, but is only for error-recovery.
548 std::vector
<UTF32
> Codepoints(S
.size()); // 1 codepoint per byte suffices.
549 const UTF8
*In8
= reinterpret_cast<const UTF8
*>(S
.data());
550 UTF32
*Out32
= Codepoints
.data();
551 ConvertUTF8toUTF32(&In8
, In8
+ S
.size(), &Out32
, Out32
+ Codepoints
.size(),
553 Codepoints
.resize(Out32
- Codepoints
.data());
554 std::string
Res(4 * Codepoints
.size(), 0); // 4 bytes per codepoint suffice
555 const UTF32
*In32
= Codepoints
.data();
556 UTF8
*Out8
= reinterpret_cast<UTF8
*>(&Res
[0]);
557 ConvertUTF32toUTF8(&In32
, In32
+ Codepoints
.size(), &Out8
, Out8
+ Res
.size(),
559 Res
.resize(reinterpret_cast<char *>(Out8
) - Res
.data());
563 static void quote(llvm::raw_ostream
&OS
, llvm::StringRef S
) {
565 for (unsigned char C
: S
) {
566 if (C
== 0x22 || C
== 0x5C)
574 // A few characters are common enough to make short escapes worthwhile.
586 llvm::write_hex(OS
, C
, llvm::HexPrintStyle::Lower
, 4);
593 void llvm::json::OStream::value(const Value
&V
) {
601 OS
<< (*V
.getAsBoolean() ? "true" : "false");
605 if (V
.Type
== Value::T_Integer
)
606 OS
<< *V
.getAsInteger();
608 OS
<< format("%.*g", std::numeric_limits
<double>::max_digits10
,
613 quote(OS
, *V
.getAsString());
617 for (const Value
&E
: *V
.getAsArray())
622 for (const Object::value_type
*E
: sortedElements(*V
.getAsObject()))
623 attribute(E
->first
, E
->second
);
628 void llvm::json::OStream::valueBegin() {
629 assert(Stack
.back().Ctx
!= Object
&& "Only attributes allowed here");
630 if (Stack
.back().HasValue
) {
631 assert(Stack
.back().Ctx
!= Singleton
&& "Only one value allowed here");
634 if (Stack
.back().Ctx
== Array
)
636 Stack
.back().HasValue
= true;
639 void llvm::json::OStream::newline() {
646 void llvm::json::OStream::arrayBegin() {
648 Stack
.emplace_back();
649 Stack
.back().Ctx
= Array
;
650 Indent
+= IndentSize
;
654 void llvm::json::OStream::arrayEnd() {
655 assert(Stack
.back().Ctx
== Array
);
656 Indent
-= IndentSize
;
657 if (Stack
.back().HasValue
)
661 assert(!Stack
.empty());
664 void llvm::json::OStream::objectBegin() {
666 Stack
.emplace_back();
667 Stack
.back().Ctx
= Object
;
668 Indent
+= IndentSize
;
672 void llvm::json::OStream::objectEnd() {
673 assert(Stack
.back().Ctx
== Object
);
674 Indent
-= IndentSize
;
675 if (Stack
.back().HasValue
)
679 assert(!Stack
.empty());
682 void llvm::json::OStream::attributeBegin(llvm::StringRef Key
) {
683 assert(Stack
.back().Ctx
== Object
);
684 if (Stack
.back().HasValue
)
687 Stack
.back().HasValue
= true;
688 Stack
.emplace_back();
689 Stack
.back().Ctx
= Singleton
;
690 if (LLVM_LIKELY(isUTF8(Key
))) {
693 assert(false && "Invalid UTF-8 in attribute key");
694 quote(OS
, fixUTF8(Key
));
701 void llvm::json::OStream::attributeEnd() {
702 assert(Stack
.back().Ctx
== Singleton
);
703 assert(Stack
.back().HasValue
&& "Attribute must have a value");
705 assert(Stack
.back().Ctx
== Object
);
711 void llvm::format_provider
<llvm::json::Value
>::format(
712 const llvm::json::Value
&E
, raw_ostream
&OS
, StringRef Options
) {
713 unsigned IndentAmount
= 0;
714 if (!Options
.empty() && Options
.getAsInteger(/*Radix=*/10, IndentAmount
))
715 llvm_unreachable("json::Value format options should be an integer");
716 json::OStream(OS
, IndentAmount
).value(E
);