1 //=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===---------------------------------------------------------------------===//
9 #include "llvm/Support/JSON.h"
10 #include "llvm/ADT/STLExtras.h"
11 #include "llvm/ADT/StringExtras.h"
12 #include "llvm/Support/ConvertUTF.h"
13 #include "llvm/Support/Error.h"
14 #include "llvm/Support/Format.h"
15 #include "llvm/Support/NativeFormatting.h"
16 #include "llvm/Support/raw_ostream.h"
24 Value
&Object::operator[](const ObjectKey
&K
) {
25 return try_emplace(K
, nullptr).first
->getSecond();
27 Value
&Object::operator[](ObjectKey
&&K
) {
28 return try_emplace(std::move(K
), nullptr).first
->getSecond();
30 Value
*Object::get(StringRef K
) {
36 const Value
*Object::get(StringRef K
) const {
42 std::optional
<std::nullptr_t
> Object::getNull(StringRef K
) const {
44 return V
->getAsNull();
47 std::optional
<bool> Object::getBoolean(StringRef K
) const {
49 return V
->getAsBoolean();
52 std::optional
<double> Object::getNumber(StringRef K
) const {
54 return V
->getAsNumber();
57 std::optional
<int64_t> Object::getInteger(StringRef K
) const {
59 return V
->getAsInteger();
62 std::optional
<llvm::StringRef
> Object::getString(StringRef K
) const {
64 return V
->getAsString();
67 const json::Object
*Object::getObject(StringRef K
) const {
69 return V
->getAsObject();
72 json::Object
*Object::getObject(StringRef K
) {
74 return V
->getAsObject();
77 const json::Array
*Object::getArray(StringRef K
) const {
79 return V
->getAsArray();
82 json::Array
*Object::getArray(StringRef K
) {
84 return V
->getAsArray();
87 bool operator==(const Object
&LHS
, const Object
&RHS
) {
88 if (LHS
.size() != RHS
.size())
90 for (const auto &L
: LHS
) {
91 auto R
= RHS
.find(L
.first
);
92 if (R
== RHS
.end() || L
.second
!= R
->second
)
98 Array::Array(std::initializer_list
<Value
> Elements
) {
99 V
.reserve(Elements
.size());
100 for (const Value
&V
: Elements
) {
101 emplace_back(nullptr);
102 back().moveFrom(std::move(V
));
106 Value::Value(std::initializer_list
<Value
> Elements
)
107 : Value(json::Array(Elements
)) {}
109 void Value::copyFrom(const Value
&M
) {
117 memcpy(&Union
, &M
.Union
, sizeof(Union
));
120 create
<StringRef
>(M
.as
<StringRef
>());
123 create
<std::string
>(M
.as
<std::string
>());
126 create
<json::Object
>(M
.as
<json::Object
>());
129 create
<json::Array
>(M
.as
<json::Array
>());
134 void Value::moveFrom(const Value
&&M
) {
142 memcpy(&Union
, &M
.Union
, sizeof(Union
));
145 create
<StringRef
>(M
.as
<StringRef
>());
148 create
<std::string
>(std::move(M
.as
<std::string
>()));
152 create
<json::Object
>(std::move(M
.as
<json::Object
>()));
156 create
<json::Array
>(std::move(M
.as
<json::Array
>()));
162 void Value::destroy() {
171 as
<StringRef
>().~StringRef();
174 as
<std::string
>().~basic_string();
177 as
<json::Object
>().~Object();
180 as
<json::Array
>().~Array();
185 bool operator==(const Value
&L
, const Value
&R
) {
186 if (L
.kind() != R
.kind())
190 return *L
.getAsNull() == *R
.getAsNull();
192 return *L
.getAsBoolean() == *R
.getAsBoolean();
194 // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323
195 // The same integer must convert to the same double, per the standard.
196 // However we see 64-vs-80-bit precision comparisons with gcc-7 -O3 -m32.
197 // So we avoid floating point promotion for exact comparisons.
198 if (L
.Type
== Value::T_Integer
|| R
.Type
== Value::T_Integer
)
199 return L
.getAsInteger() == R
.getAsInteger();
200 return *L
.getAsNumber() == *R
.getAsNumber();
202 return *L
.getAsString() == *R
.getAsString();
204 return *L
.getAsArray() == *R
.getAsArray();
206 return *L
.getAsObject() == *R
.getAsObject();
208 llvm_unreachable("Unknown value kind");
211 void Path::report(llvm::StringLiteral Msg
) {
212 // Walk up to the root context, and count the number of segments.
215 for (P
= this; P
->Parent
!= nullptr; P
= P
->Parent
)
217 Path::Root
*R
= P
->Seg
.root();
218 // Fill in the error message and copy the path (in reverse order).
219 R
->ErrorMessage
= Msg
;
220 R
->ErrorPath
.resize(Count
);
221 auto It
= R
->ErrorPath
.begin();
222 for (P
= this; P
->Parent
!= nullptr; P
= P
->Parent
)
226 Error
Path::Root::getError() const {
228 raw_string_ostream
OS(S
);
229 OS
<< (ErrorMessage
.empty() ? "invalid JSON contents" : ErrorMessage
);
230 if (ErrorPath
.empty()) {
232 OS
<< " when parsing " << Name
;
234 OS
<< " at " << (Name
.empty() ? "(root)" : Name
);
235 for (const Path::Segment
&S
: llvm::reverse(ErrorPath
)) {
237 OS
<< '.' << S
.field();
239 OS
<< '[' << S
.index() << ']';
242 return createStringError(llvm::inconvertibleErrorCode(), OS
.str());
247 std::vector
<const Object::value_type
*> sortedElements(const Object
&O
) {
248 std::vector
<const Object::value_type
*> Elements
;
249 for (const auto &E
: O
)
250 Elements
.push_back(&E
);
252 [](const Object::value_type
*L
, const Object::value_type
*R
) {
253 return L
->first
< R
->first
;
258 // Prints a one-line version of a value that isn't our main focus.
259 // We interleave writes to OS and JOS, exploiting the lack of extra buffering.
260 // This is OK as we own the implementation.
261 void abbreviate(const Value
&V
, OStream
&JOS
) {
264 JOS
.rawValue(V
.getAsArray()->empty() ? "[]" : "[ ... ]");
267 JOS
.rawValue(V
.getAsObject()->empty() ? "{}" : "{ ... }");
269 case Value::String
: {
270 llvm::StringRef S
= *V
.getAsString();
274 std::string Truncated
= fixUTF8(S
.take_front(37));
275 Truncated
.append("...");
276 JOS
.value(Truncated
);
285 // Prints a semi-expanded version of a value that is our main focus.
286 // Array/Object entries are printed, but not recursively as they may be huge.
287 void abbreviateChildren(const Value
&V
, OStream
&JOS
) {
291 for (const auto &I
: *V
.getAsArray())
297 for (const auto *KV
: sortedElements(*V
.getAsObject())) {
298 JOS
.attributeBegin(KV
->first
);
299 abbreviate(KV
->second
, JOS
);
311 void Path::Root::printErrorContext(const Value
&R
, raw_ostream
&OS
) const {
312 OStream
JOS(OS
, /*IndentSize=*/2);
313 // PrintValue recurses down the path, printing the ancestors of our target.
314 // Siblings of nodes along the path are printed with abbreviate(), and the
315 // target itself is printed with the somewhat richer abbreviateChildren().
316 // 'Recurse' is the lambda itself, to allow recursive calls.
317 auto PrintValue
= [&](const Value
&V
, ArrayRef
<Segment
> Path
, auto &Recurse
) {
318 // Print the target node itself, with the error as a comment.
319 // Also used if we can't follow our path, e.g. it names a field that
320 // *should* exist but doesn't.
321 auto HighlightCurrent
= [&] {
322 std::string Comment
= "error: ";
323 Comment
.append(ErrorMessage
.data(), ErrorMessage
.size());
324 JOS
.comment(Comment
);
325 abbreviateChildren(V
, JOS
);
327 if (Path
.empty()) // We reached our target.
328 return HighlightCurrent();
329 const Segment
&S
= Path
.back(); // Path is in reverse order.
331 // Current node is an object, path names a field.
332 llvm::StringRef FieldName
= S
.field();
333 const Object
*O
= V
.getAsObject();
334 if (!O
|| !O
->get(FieldName
))
335 return HighlightCurrent();
337 for (const auto *KV
: sortedElements(*O
)) {
338 JOS
.attributeBegin(KV
->first
);
339 if (FieldName
.equals(KV
->first
))
340 Recurse(KV
->second
, Path
.drop_back(), Recurse
);
342 abbreviate(KV
->second
, JOS
);
347 // Current node is an array, path names an element.
348 const Array
*A
= V
.getAsArray();
349 if (!A
|| S
.index() >= A
->size())
350 return HighlightCurrent();
352 unsigned Current
= 0;
353 for (const auto &V
: *A
) {
354 if (Current
++ == S
.index())
355 Recurse(V
, Path
.drop_back(), Recurse
);
362 PrintValue(R
, ErrorPath
, PrintValue
);
366 // Simple recursive-descent JSON parser.
369 Parser(StringRef JSON
)
370 : Start(JSON
.begin()), P(JSON
.begin()), End(JSON
.end()) {}
374 if (isUTF8(StringRef(Start
, End
- Start
), &ErrOffset
))
376 P
= Start
+ ErrOffset
; // For line/column calculation.
377 return parseError("Invalid UTF-8 sequence");
380 bool parseValue(Value
&Out
);
386 return parseError("Text after end of document");
391 return std::move(*Err
);
395 void eatWhitespace() {
396 while (P
!= End
&& (*P
== ' ' || *P
== '\r' || *P
== '\n' || *P
== '\t'))
400 // On invalid syntax, parseX() functions return false and set Err.
401 bool parseNumber(char First
, Value
&Out
);
402 bool parseString(std::string
&Out
);
403 bool parseUnicode(std::string
&Out
);
404 bool parseError(const char *Msg
); // always returns false
406 char next() { return P
== End
? 0 : *P
++; }
407 char peek() { return P
== End
? 0 : *P
; }
// Reports whether C is one of the characters accepted while scanning a number
// token: a decimal digit, an exponent marker ('e'/'E'), a sign, or '.'.
static bool isNumber(char C) {
  switch (C) {
  case '0':
  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
  case '8':
  case '9':
  case 'e':
  case 'E':
  case '+':
  case '-':
  case '.':
    return true;
  default:
    return false;
  }
}
414 std::optional
<Error
> Err
;
415 const char *Start
, *P
, *End
;
418 bool Parser::parseValue(Value
&Out
) {
421 return parseError("Unexpected EOF");
422 switch (char C
= next()) {
423 // Bare null/true/false are easy - first char identifies them.
426 return (next() == 'u' && next() == 'l' && next() == 'l') ||
427 parseError("Invalid JSON value (null?)");
430 return (next() == 'r' && next() == 'u' && next() == 'e') ||
431 parseError("Invalid JSON value (true?)");
434 return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
435 parseError("Invalid JSON value (false?)");
438 if (parseString(S
)) {
446 Array
&A
= *Out
.getAsArray();
453 A
.emplace_back(nullptr);
454 if (!parseValue(A
.back()))
464 return parseError("Expected , or ] after array element");
470 Object
&O
= *Out
.getAsObject();
478 return parseError("Expected object key");
484 return parseError("Expected : after object key");
486 if (!parseValue(O
[std::move(K
)]))
496 return parseError("Expected , or } after object property");
502 return parseNumber(C
, Out
);
503 return parseError("Invalid JSON value");
507 bool Parser::parseNumber(char First
, Value
&Out
) {
508 // Read the number into a string. (Must be null-terminated for strto*).
511 while (isNumber(peek()))
514 // Try first to parse as integer, and if so preserve full 64 bits.
515 // We check for errno for out of bounds errors and for End == S.end()
516 // to make sure that the numeric string is not malformed.
518 int64_t I
= std::strtoll(S
.c_str(), &End
, 10);
519 if (End
== S
.end() && errno
!= ERANGE
) {
523 // strtoull has special handling for negative numbers, but in this
524 // case we don't want that because negative numbers were already
525 // handled in the previous block.
528 uint64_t UI
= std::strtoull(S
.c_str(), &End
, 10);
529 if (End
== S
.end() && errno
!= ERANGE
) {
534 // If it's not an integer
535 Out
= std::strtod(S
.c_str(), &End
);
536 return End
== S
.end() || parseError("Invalid JSON value (number?)");
539 bool Parser::parseString(std::string
&Out
) {
540 // leading quote was already consumed.
541 for (char C
= next(); C
!= '"'; C
= next()) {
542 if (LLVM_UNLIKELY(P
== End
))
543 return parseError("Unterminated string");
544 if (LLVM_UNLIKELY((C
& 0x1f) == C
))
545 return parseError("Control character in string");
546 if (LLVM_LIKELY(C
!= '\\')) {
550 // Handle escape sequence.
551 switch (C
= next()) {
573 if (!parseUnicode(Out
))
577 return parseError("Invalid escape sequence");
583 static void encodeUtf8(uint32_t Rune
, std::string
&Out
) {
585 Out
.push_back(Rune
& 0x7F);
586 } else if (Rune
< 0x800) {
587 uint8_t FirstByte
= 0xC0 | ((Rune
& 0x7C0) >> 6);
588 uint8_t SecondByte
= 0x80 | (Rune
& 0x3F);
589 Out
.push_back(FirstByte
);
590 Out
.push_back(SecondByte
);
591 } else if (Rune
< 0x10000) {
592 uint8_t FirstByte
= 0xE0 | ((Rune
& 0xF000) >> 12);
593 uint8_t SecondByte
= 0x80 | ((Rune
& 0xFC0) >> 6);
594 uint8_t ThirdByte
= 0x80 | (Rune
& 0x3F);
595 Out
.push_back(FirstByte
);
596 Out
.push_back(SecondByte
);
597 Out
.push_back(ThirdByte
);
598 } else if (Rune
< 0x110000) {
599 uint8_t FirstByte
= 0xF0 | ((Rune
& 0x1F0000) >> 18);
600 uint8_t SecondByte
= 0x80 | ((Rune
& 0x3F000) >> 12);
601 uint8_t ThirdByte
= 0x80 | ((Rune
& 0xFC0) >> 6);
602 uint8_t FourthByte
= 0x80 | (Rune
& 0x3F);
603 Out
.push_back(FirstByte
);
604 Out
.push_back(SecondByte
);
605 Out
.push_back(ThirdByte
);
606 Out
.push_back(FourthByte
);
608 llvm_unreachable("Invalid codepoint");
612 // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
613 // May parse several sequential escapes to ensure proper surrogate handling.
614 // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
615 // These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
616 bool Parser::parseUnicode(std::string
&Out
) {
617 // Invalid UTF is not a JSON error (RFC 8529§8.2). It gets replaced by U+FFFD.
618 auto Invalid
= [&] { Out
.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
619 // Decodes 4 hex digits from the stream into Out, returns false on error.
620 auto Parse4Hex
= [this](uint16_t &Out
) -> bool {
622 char Bytes
[] = {next(), next(), next(), next()};
623 for (unsigned char C
: Bytes
) {
624 if (!std::isxdigit(C
))
625 return parseError("Invalid \\u escape sequence");
627 Out
|= (C
> '9') ? (C
& ~0x20) - 'A' + 10 : (C
- '0');
631 uint16_t First
; // UTF-16 code unit from the first \u escape.
632 if (!Parse4Hex(First
))
635 // We loop to allow proper surrogate-pair error handling.
637 // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
638 if (LLVM_LIKELY(First
< 0xD800 || First
>= 0xE000)) {
639 encodeUtf8(First
, Out
);
643 // Case 2: it's an (unpaired) trailing surrogate.
644 if (LLVM_UNLIKELY(First
>= 0xDC00)) {
649 // Case 3: it's a leading surrogate. We expect a trailing one next.
650 // Case 3a: there's no trailing \u escape. Don't advance in the stream.
651 if (LLVM_UNLIKELY(P
+ 2 > End
|| *P
!= '\\' || *(P
+ 1) != 'u')) {
652 Invalid(); // Leading surrogate was unpaired.
657 if (!Parse4Hex(Second
))
659 // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
660 if (LLVM_UNLIKELY(Second
< 0xDC00 || Second
>= 0xE000)) {
661 Invalid(); // Leading surrogate was unpaired.
662 First
= Second
; // Second escape still needs to be processed.
665 // Case 3c: a valid surrogate pair encoding an astral codepoint.
666 encodeUtf8(0x10000 | ((First
- 0xD800) << 10) | (Second
- 0xDC00), Out
);
671 bool Parser::parseError(const char *Msg
) {
673 const char *StartOfLine
= Start
;
674 for (const char *X
= Start
; X
< P
; ++X
) {
681 std::make_unique
<ParseError
>(Msg
, Line
, P
- StartOfLine
, P
- Start
));
686 Expected
<Value
> parse(StringRef JSON
) {
693 return P
.takeError();
695 char ParseError::ID
= 0;
697 bool isUTF8(llvm::StringRef S
, size_t *ErrOffset
) {
698 // Fast-path for ASCII, which is valid UTF-8.
699 if (LLVM_LIKELY(isASCII(S
)))
702 const UTF8
*Data
= reinterpret_cast<const UTF8
*>(S
.data()), *Rest
= Data
;
703 if (LLVM_LIKELY(isLegalUTF8String(&Rest
, Data
+ S
.size())))
707 *ErrOffset
= Rest
- Data
;
711 std::string
fixUTF8(llvm::StringRef S
) {
712 // This isn't particularly efficient, but is only for error-recovery.
713 std::vector
<UTF32
> Codepoints(S
.size()); // 1 codepoint per byte suffices.
714 const UTF8
*In8
= reinterpret_cast<const UTF8
*>(S
.data());
715 UTF32
*Out32
= Codepoints
.data();
716 ConvertUTF8toUTF32(&In8
, In8
+ S
.size(), &Out32
, Out32
+ Codepoints
.size(),
718 Codepoints
.resize(Out32
- Codepoints
.data());
719 std::string
Res(4 * Codepoints
.size(), 0); // 4 bytes per codepoint suffice
720 const UTF32
*In32
= Codepoints
.data();
721 UTF8
*Out8
= reinterpret_cast<UTF8
*>(&Res
[0]);
722 ConvertUTF32toUTF8(&In32
, In32
+ Codepoints
.size(), &Out8
, Out8
+ Res
.size(),
724 Res
.resize(reinterpret_cast<char *>(Out8
) - Res
.data());
728 static void quote(llvm::raw_ostream
&OS
, llvm::StringRef S
) {
730 for (unsigned char C
: S
) {
731 if (C
== 0x22 || C
== 0x5C)
739 // A few characters are common enough to make short escapes worthwhile.
751 llvm::write_hex(OS
, C
, llvm::HexPrintStyle::Lower
, 4);
758 void llvm::json::OStream::value(const Value
&V
) {
766 OS
<< (*V
.getAsBoolean() ? "true" : "false");
770 if (V
.Type
== Value::T_Integer
)
771 OS
<< *V
.getAsInteger();
772 else if (V
.Type
== Value::T_UINT64
)
773 OS
<< *V
.getAsUINT64();
775 OS
<< format("%.*g", std::numeric_limits
<double>::max_digits10
,
780 quote(OS
, *V
.getAsString());
784 for (const Value
&E
: *V
.getAsArray())
789 for (const Object::value_type
*E
: sortedElements(*V
.getAsObject()))
790 attribute(E
->first
, E
->second
);
795 void llvm::json::OStream::valueBegin() {
796 assert(Stack
.back().Ctx
!= Object
&& "Only attributes allowed here");
797 if (Stack
.back().HasValue
) {
798 assert(Stack
.back().Ctx
!= Singleton
&& "Only one value allowed here");
801 if (Stack
.back().Ctx
== Array
)
804 Stack
.back().HasValue
= true;
807 void OStream::comment(llvm::StringRef Comment
) {
808 assert(PendingComment
.empty() && "Only one comment per value!");
809 PendingComment
= Comment
;
812 void OStream::flushComment() {
813 if (PendingComment
.empty())
815 OS
<< (IndentSize
? "/* " : "/*");
816 // Be sure not to accidentally emit "*/". Transform to "* /".
817 while (!PendingComment
.empty()) {
818 auto Pos
= PendingComment
.find("*/");
819 if (Pos
== StringRef::npos
) {
820 OS
<< PendingComment
;
823 OS
<< PendingComment
.take_front(Pos
) << "* /";
824 PendingComment
= PendingComment
.drop_front(Pos
+ 2);
827 OS
<< (IndentSize
? " */" : "*/");
828 // Comments are on their own line unless attached to an attribute value.
829 if (Stack
.size() > 1 && Stack
.back().Ctx
== Singleton
) {
837 void llvm::json::OStream::newline() {
844 void llvm::json::OStream::arrayBegin() {
846 Stack
.emplace_back();
847 Stack
.back().Ctx
= Array
;
848 Indent
+= IndentSize
;
852 void llvm::json::OStream::arrayEnd() {
853 assert(Stack
.back().Ctx
== Array
);
854 Indent
-= IndentSize
;
855 if (Stack
.back().HasValue
)
858 assert(PendingComment
.empty());
860 assert(!Stack
.empty());
863 void llvm::json::OStream::objectBegin() {
865 Stack
.emplace_back();
866 Stack
.back().Ctx
= Object
;
867 Indent
+= IndentSize
;
871 void llvm::json::OStream::objectEnd() {
872 assert(Stack
.back().Ctx
== Object
);
873 Indent
-= IndentSize
;
874 if (Stack
.back().HasValue
)
877 assert(PendingComment
.empty());
879 assert(!Stack
.empty());
882 void llvm::json::OStream::attributeBegin(llvm::StringRef Key
) {
883 assert(Stack
.back().Ctx
== Object
);
884 if (Stack
.back().HasValue
)
888 Stack
.back().HasValue
= true;
889 Stack
.emplace_back();
890 Stack
.back().Ctx
= Singleton
;
891 if (LLVM_LIKELY(isUTF8(Key
))) {
894 assert(false && "Invalid UTF-8 in attribute key");
895 quote(OS
, fixUTF8(Key
));
902 void llvm::json::OStream::attributeEnd() {
903 assert(Stack
.back().Ctx
== Singleton
);
904 assert(Stack
.back().HasValue
&& "Attribute must have a value");
905 assert(PendingComment
.empty());
907 assert(Stack
.back().Ctx
== Object
);
910 raw_ostream
&llvm::json::OStream::rawValueBegin() {
912 Stack
.emplace_back();
913 Stack
.back().Ctx
= RawValue
;
917 void llvm::json::OStream::rawValueEnd() {
918 assert(Stack
.back().Ctx
== RawValue
);
925 void llvm::format_provider
<llvm::json::Value
>::format(
926 const llvm::json::Value
&E
, raw_ostream
&OS
, StringRef Options
) {
927 unsigned IndentAmount
= 0;
928 if (!Options
.empty() && Options
.getAsInteger(/*Radix=*/10, IndentAmount
))
929 llvm_unreachable("json::Value format options should be an integer");
930 json::OStream(OS
, IndentAmount
).value(E
);