1 //=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===---------------------------------------------------------------------===//
9 #include "llvm/Support/JSON.h"
10 #include "llvm/ADT/STLExtras.h"
11 #include "llvm/ADT/StringExtras.h"
12 #include "llvm/Support/ConvertUTF.h"
13 #include "llvm/Support/Error.h"
14 #include "llvm/Support/Format.h"
15 #include "llvm/Support/NativeFormatting.h"
16 #include "llvm/Support/raw_ostream.h"
24 Value
&Object::operator[](const ObjectKey
&K
) {
25 return try_emplace(K
, nullptr).first
->getSecond();
27 Value
&Object::operator[](ObjectKey
&&K
) {
28 return try_emplace(std::move(K
), nullptr).first
->getSecond();
30 Value
*Object::get(StringRef K
) {
36 const Value
*Object::get(StringRef K
) const {
42 std::optional
<std::nullptr_t
> Object::getNull(StringRef K
) const {
44 return V
->getAsNull();
47 std::optional
<bool> Object::getBoolean(StringRef K
) const {
49 return V
->getAsBoolean();
52 std::optional
<double> Object::getNumber(StringRef K
) const {
54 return V
->getAsNumber();
57 std::optional
<int64_t> Object::getInteger(StringRef K
) const {
59 return V
->getAsInteger();
62 std::optional
<llvm::StringRef
> Object::getString(StringRef K
) const {
64 return V
->getAsString();
67 const json::Object
*Object::getObject(StringRef K
) const {
69 return V
->getAsObject();
72 json::Object
*Object::getObject(StringRef K
) {
74 return V
->getAsObject();
77 const json::Array
*Object::getArray(StringRef K
) const {
79 return V
->getAsArray();
82 json::Array
*Object::getArray(StringRef K
) {
84 return V
->getAsArray();
87 bool operator==(const Object
&LHS
, const Object
&RHS
) {
88 if (LHS
.size() != RHS
.size())
90 for (const auto &L
: LHS
) {
91 auto R
= RHS
.find(L
.first
);
92 if (R
== RHS
.end() || L
.second
!= R
->second
)
98 Array::Array(std::initializer_list
<Value
> Elements
) {
99 V
.reserve(Elements
.size());
100 for (const Value
&V
: Elements
) {
101 emplace_back(nullptr);
102 back().moveFrom(std::move(V
));
106 Value::Value(std::initializer_list
<Value
> Elements
)
107 : Value(json::Array(Elements
)) {}
109 void Value::copyFrom(const Value
&M
) {
117 memcpy(&Union
, &M
.Union
, sizeof(Union
));
120 create
<StringRef
>(M
.as
<StringRef
>());
123 create
<std::string
>(M
.as
<std::string
>());
126 create
<json::Object
>(M
.as
<json::Object
>());
129 create
<json::Array
>(M
.as
<json::Array
>());
134 void Value::moveFrom(const Value
&&M
) {
142 memcpy(&Union
, &M
.Union
, sizeof(Union
));
145 create
<StringRef
>(M
.as
<StringRef
>());
148 create
<std::string
>(std::move(M
.as
<std::string
>()));
152 create
<json::Object
>(std::move(M
.as
<json::Object
>()));
156 create
<json::Array
>(std::move(M
.as
<json::Array
>()));
162 void Value::destroy() {
171 as
<StringRef
>().~StringRef();
174 as
<std::string
>().~basic_string();
177 as
<json::Object
>().~Object();
180 as
<json::Array
>().~Array();
185 bool operator==(const Value
&L
, const Value
&R
) {
186 if (L
.kind() != R
.kind())
190 return *L
.getAsNull() == *R
.getAsNull();
192 return *L
.getAsBoolean() == *R
.getAsBoolean();
194 // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323
195 // The same integer must convert to the same double, per the standard.
196 // However we see 64-vs-80-bit precision comparisons with gcc-7 -O3 -m32.
197 // So we avoid floating point promotion for exact comparisons.
198 if (L
.Type
== Value::T_Integer
|| R
.Type
== Value::T_Integer
)
199 return L
.getAsInteger() == R
.getAsInteger();
200 return *L
.getAsNumber() == *R
.getAsNumber();
202 return *L
.getAsString() == *R
.getAsString();
204 return *L
.getAsArray() == *R
.getAsArray();
206 return *L
.getAsObject() == *R
.getAsObject();
208 llvm_unreachable("Unknown value kind");
211 void Path::report(llvm::StringLiteral Msg
) {
212 // Walk up to the root context, and count the number of segments.
215 for (P
= this; P
->Parent
!= nullptr; P
= P
->Parent
)
217 Path::Root
*R
= P
->Seg
.root();
218 // Fill in the error message and copy the path (in reverse order).
219 R
->ErrorMessage
= Msg
;
220 R
->ErrorPath
.resize(Count
);
221 auto It
= R
->ErrorPath
.begin();
222 for (P
= this; P
->Parent
!= nullptr; P
= P
->Parent
)
226 Error
Path::Root::getError() const {
228 raw_string_ostream
OS(S
);
229 OS
<< (ErrorMessage
.empty() ? "invalid JSON contents" : ErrorMessage
);
230 if (ErrorPath
.empty()) {
232 OS
<< " when parsing " << Name
;
234 OS
<< " at " << (Name
.empty() ? "(root)" : Name
);
235 for (const Path::Segment
&S
: llvm::reverse(ErrorPath
)) {
237 OS
<< '.' << S
.field();
239 OS
<< '[' << S
.index() << ']';
242 return createStringError(llvm::inconvertibleErrorCode(), S
);
245 std::vector
<const Object::value_type
*> sortedElements(const Object
&O
) {
246 std::vector
<const Object::value_type
*> Elements
;
247 for (const auto &E
: O
)
248 Elements
.push_back(&E
);
250 [](const Object::value_type
*L
, const Object::value_type
*R
) {
251 return L
->first
< R
->first
;
256 // Prints a one-line version of a value that isn't our main focus.
257 // We interleave writes to OS and JOS, exploiting the lack of extra buffering.
258 // This is OK as we own the implementation.
259 static void abbreviate(const Value
&V
, OStream
&JOS
) {
262 JOS
.rawValue(V
.getAsArray()->empty() ? "[]" : "[ ... ]");
265 JOS
.rawValue(V
.getAsObject()->empty() ? "{}" : "{ ... }");
267 case Value::String
: {
268 llvm::StringRef S
= *V
.getAsString();
272 std::string Truncated
= fixUTF8(S
.take_front(37));
273 Truncated
.append("...");
274 JOS
.value(Truncated
);
283 // Prints a semi-expanded version of a value that is our main focus.
284 // Array/Object entries are printed, but not recursively as they may be huge.
285 static void abbreviateChildren(const Value
&V
, OStream
&JOS
) {
289 for (const auto &I
: *V
.getAsArray())
295 for (const auto *KV
: sortedElements(*V
.getAsObject())) {
296 JOS
.attributeBegin(KV
->first
);
297 abbreviate(KV
->second
, JOS
);
307 void Path::Root::printErrorContext(const Value
&R
, raw_ostream
&OS
) const {
308 OStream
JOS(OS
, /*IndentSize=*/2);
309 // PrintValue recurses down the path, printing the ancestors of our target.
310 // Siblings of nodes along the path are printed with abbreviate(), and the
311 // target itself is printed with the somewhat richer abbreviateChildren().
312 // 'Recurse' is the lambda itself, to allow recursive calls.
313 auto PrintValue
= [&](const Value
&V
, ArrayRef
<Segment
> Path
, auto &Recurse
) {
314 // Print the target node itself, with the error as a comment.
315 // Also used if we can't follow our path, e.g. it names a field that
316 // *should* exist but doesn't.
317 auto HighlightCurrent
= [&] {
318 std::string Comment
= "error: ";
319 Comment
.append(ErrorMessage
.data(), ErrorMessage
.size());
320 JOS
.comment(Comment
);
321 abbreviateChildren(V
, JOS
);
323 if (Path
.empty()) // We reached our target.
324 return HighlightCurrent();
325 const Segment
&S
= Path
.back(); // Path is in reverse order.
327 // Current node is an object, path names a field.
328 llvm::StringRef FieldName
= S
.field();
329 const Object
*O
= V
.getAsObject();
330 if (!O
|| !O
->get(FieldName
))
331 return HighlightCurrent();
333 for (const auto *KV
: sortedElements(*O
)) {
334 JOS
.attributeBegin(KV
->first
);
335 if (FieldName
== StringRef(KV
->first
))
336 Recurse(KV
->second
, Path
.drop_back(), Recurse
);
338 abbreviate(KV
->second
, JOS
);
343 // Current node is an array, path names an element.
344 const Array
*A
= V
.getAsArray();
345 if (!A
|| S
.index() >= A
->size())
346 return HighlightCurrent();
348 unsigned Current
= 0;
349 for (const auto &V
: *A
) {
350 if (Current
++ == S
.index())
351 Recurse(V
, Path
.drop_back(), Recurse
);
358 PrintValue(R
, ErrorPath
, PrintValue
);
362 // Simple recursive-descent JSON parser.
365 Parser(StringRef JSON
)
366 : Start(JSON
.begin()), P(JSON
.begin()), End(JSON
.end()) {}
370 if (isUTF8(StringRef(Start
, End
- Start
), &ErrOffset
))
372 P
= Start
+ ErrOffset
; // For line/column calculation.
373 return parseError("Invalid UTF-8 sequence");
376 bool parseValue(Value
&Out
);
382 return parseError("Text after end of document");
387 return std::move(*Err
);
391 void eatWhitespace() {
392 while (P
!= End
&& (*P
== ' ' || *P
== '\r' || *P
== '\n' || *P
== '\t'))
396 // On invalid syntax, parseX() functions return false and set Err.
397 bool parseNumber(char First
, Value
&Out
);
398 bool parseString(std::string
&Out
);
399 bool parseUnicode(std::string
&Out
);
400 bool parseError(const char *Msg
); // always returns false
402 char next() { return P
== End
? 0 : *P
++; }
403 char peek() { return P
== End
? 0 : *P
; }
404 static bool isNumber(char C
) {
405 return C
== '0' || C
== '1' || C
== '2' || C
== '3' || C
== '4' ||
406 C
== '5' || C
== '6' || C
== '7' || C
== '8' || C
== '9' ||
407 C
== 'e' || C
== 'E' || C
== '+' || C
== '-' || C
== '.';
410 std::optional
<Error
> Err
;
411 const char *Start
, *P
, *End
;
415 bool Parser::parseValue(Value
&Out
) {
418 return parseError("Unexpected EOF");
419 switch (char C
= next()) {
420 // Bare null/true/false are easy - first char identifies them.
423 return (next() == 'u' && next() == 'l' && next() == 'l') ||
424 parseError("Invalid JSON value (null?)");
427 return (next() == 'r' && next() == 'u' && next() == 'e') ||
428 parseError("Invalid JSON value (true?)");
431 return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
432 parseError("Invalid JSON value (false?)");
435 if (parseString(S
)) {
443 Array
&A
= *Out
.getAsArray();
450 A
.emplace_back(nullptr);
451 if (!parseValue(A
.back()))
461 return parseError("Expected , or ] after array element");
467 Object
&O
= *Out
.getAsObject();
475 return parseError("Expected object key");
481 return parseError("Expected : after object key");
483 if (!parseValue(O
[std::move(K
)]))
493 return parseError("Expected , or } after object property");
499 return parseNumber(C
, Out
);
500 return parseError("Invalid JSON value");
504 bool Parser::parseNumber(char First
, Value
&Out
) {
505 // Read the number into a string. (Must be null-terminated for strto*).
508 while (isNumber(peek()))
511 // Try first to parse as integer, and if so preserve full 64 bits.
512 // We check for errno for out of bounds errors and for End == S.end()
513 // to make sure that the numeric string is not malformed.
515 int64_t I
= std::strtoll(S
.c_str(), &End
, 10);
516 if (End
== S
.end() && errno
!= ERANGE
) {
520 // strtoull has special handling for negative numbers, but in this
521 // case we don't want to do that because negative numbers were already
522 // handled in the previous block.
525 uint64_t UI
= std::strtoull(S
.c_str(), &End
, 10);
526 if (End
== S
.end() && errno
!= ERANGE
) {
531 // If it's not an integer
532 Out
= std::strtod(S
.c_str(), &End
);
533 return End
== S
.end() || parseError("Invalid JSON value (number?)");
536 bool Parser::parseString(std::string
&Out
) {
537 // leading quote was already consumed.
538 for (char C
= next(); C
!= '"'; C
= next()) {
539 if (LLVM_UNLIKELY(P
== End
))
540 return parseError("Unterminated string");
541 if (LLVM_UNLIKELY((C
& 0x1f) == C
))
542 return parseError("Control character in string");
543 if (LLVM_LIKELY(C
!= '\\')) {
547 // Handle escape sequence.
548 switch (C
= next()) {
570 if (!parseUnicode(Out
))
574 return parseError("Invalid escape sequence");
580 static void encodeUtf8(uint32_t Rune
, std::string
&Out
) {
582 Out
.push_back(Rune
& 0x7F);
583 } else if (Rune
< 0x800) {
584 uint8_t FirstByte
= 0xC0 | ((Rune
& 0x7C0) >> 6);
585 uint8_t SecondByte
= 0x80 | (Rune
& 0x3F);
586 Out
.push_back(FirstByte
);
587 Out
.push_back(SecondByte
);
588 } else if (Rune
< 0x10000) {
589 uint8_t FirstByte
= 0xE0 | ((Rune
& 0xF000) >> 12);
590 uint8_t SecondByte
= 0x80 | ((Rune
& 0xFC0) >> 6);
591 uint8_t ThirdByte
= 0x80 | (Rune
& 0x3F);
592 Out
.push_back(FirstByte
);
593 Out
.push_back(SecondByte
);
594 Out
.push_back(ThirdByte
);
595 } else if (Rune
< 0x110000) {
596 uint8_t FirstByte
= 0xF0 | ((Rune
& 0x1F0000) >> 18);
597 uint8_t SecondByte
= 0x80 | ((Rune
& 0x3F000) >> 12);
598 uint8_t ThirdByte
= 0x80 | ((Rune
& 0xFC0) >> 6);
599 uint8_t FourthByte
= 0x80 | (Rune
& 0x3F);
600 Out
.push_back(FirstByte
);
601 Out
.push_back(SecondByte
);
602 Out
.push_back(ThirdByte
);
603 Out
.push_back(FourthByte
);
605 llvm_unreachable("Invalid codepoint");
609 // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
610 // May parse several sequential escapes to ensure proper surrogate handling.
611 // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
612 // These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
613 bool Parser::parseUnicode(std::string
&Out
) {
614 // Invalid UTF is not a JSON error (RFC 8259 §8.2). It gets replaced by U+FFFD.
615 auto Invalid
= [&] { Out
.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
616 // Decodes 4 hex digits from the stream into Out, returns false on error.
617 auto Parse4Hex
= [this](uint16_t &Out
) -> bool {
619 char Bytes
[] = {next(), next(), next(), next()};
620 for (unsigned char C
: Bytes
) {
621 if (!std::isxdigit(C
))
622 return parseError("Invalid \\u escape sequence");
624 Out
|= (C
> '9') ? (C
& ~0x20) - 'A' + 10 : (C
- '0');
628 uint16_t First
; // UTF-16 code unit from the first \u escape.
629 if (!Parse4Hex(First
))
632 // We loop to allow proper surrogate-pair error handling.
634 // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
635 if (LLVM_LIKELY(First
< 0xD800 || First
>= 0xE000)) {
636 encodeUtf8(First
, Out
);
640 // Case 2: it's an (unpaired) trailing surrogate.
641 if (LLVM_UNLIKELY(First
>= 0xDC00)) {
646 // Case 3: it's a leading surrogate. We expect a trailing one next.
647 // Case 3a: there's no trailing \u escape. Don't advance in the stream.
648 if (LLVM_UNLIKELY(P
+ 2 > End
|| *P
!= '\\' || *(P
+ 1) != 'u')) {
649 Invalid(); // Leading surrogate was unpaired.
654 if (!Parse4Hex(Second
))
656 // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
657 if (LLVM_UNLIKELY(Second
< 0xDC00 || Second
>= 0xE000)) {
658 Invalid(); // Leading surrogate was unpaired.
659 First
= Second
; // Second escape still needs to be processed.
662 // Case 3c: a valid surrogate pair encoding an astral codepoint.
663 encodeUtf8(0x10000 | ((First
- 0xD800) << 10) | (Second
- 0xDC00), Out
);
668 bool Parser::parseError(const char *Msg
) {
670 const char *StartOfLine
= Start
;
671 for (const char *X
= Start
; X
< P
; ++X
) {
678 std::make_unique
<ParseError
>(Msg
, Line
, P
- StartOfLine
, P
- Start
));
682 Expected
<Value
> parse(StringRef JSON
) {
689 return P
.takeError();
691 char ParseError::ID
= 0;
693 bool isUTF8(llvm::StringRef S
, size_t *ErrOffset
) {
694 // Fast-path for ASCII, which is valid UTF-8.
695 if (LLVM_LIKELY(isASCII(S
)))
698 const UTF8
*Data
= reinterpret_cast<const UTF8
*>(S
.data()), *Rest
= Data
;
699 if (LLVM_LIKELY(isLegalUTF8String(&Rest
, Data
+ S
.size())))
703 *ErrOffset
= Rest
- Data
;
707 std::string
fixUTF8(llvm::StringRef S
) {
708 // This isn't particularly efficient, but is only for error-recovery.
709 std::vector
<UTF32
> Codepoints(S
.size()); // 1 codepoint per byte suffices.
710 const UTF8
*In8
= reinterpret_cast<const UTF8
*>(S
.data());
711 UTF32
*Out32
= Codepoints
.data();
712 ConvertUTF8toUTF32(&In8
, In8
+ S
.size(), &Out32
, Out32
+ Codepoints
.size(),
714 Codepoints
.resize(Out32
- Codepoints
.data());
715 std::string
Res(4 * Codepoints
.size(), 0); // 4 bytes per codepoint suffice
716 const UTF32
*In32
= Codepoints
.data();
717 UTF8
*Out8
= reinterpret_cast<UTF8
*>(&Res
[0]);
718 ConvertUTF32toUTF8(&In32
, In32
+ Codepoints
.size(), &Out8
, Out8
+ Res
.size(),
720 Res
.resize(reinterpret_cast<char *>(Out8
) - Res
.data());
724 static void quote(llvm::raw_ostream
&OS
, llvm::StringRef S
) {
726 for (unsigned char C
: S
) {
727 if (C
== 0x22 || C
== 0x5C)
735 // A few characters are common enough to make short escapes worthwhile.
747 llvm::write_hex(OS
, C
, llvm::HexPrintStyle::Lower
, 4);
754 void llvm::json::OStream::value(const Value
&V
) {
762 OS
<< (*V
.getAsBoolean() ? "true" : "false");
766 if (V
.Type
== Value::T_Integer
)
767 OS
<< *V
.getAsInteger();
768 else if (V
.Type
== Value::T_UINT64
)
769 OS
<< *V
.getAsUINT64();
771 OS
<< format("%.*g", std::numeric_limits
<double>::max_digits10
,
776 quote(OS
, *V
.getAsString());
780 for (const Value
&E
: *V
.getAsArray())
785 for (const Object::value_type
*E
: sortedElements(*V
.getAsObject()))
786 attribute(E
->first
, E
->second
);
791 void llvm::json::OStream::valueBegin() {
792 assert(Stack
.back().Ctx
!= Object
&& "Only attributes allowed here");
793 if (Stack
.back().HasValue
) {
794 assert(Stack
.back().Ctx
!= Singleton
&& "Only one value allowed here");
797 if (Stack
.back().Ctx
== Array
)
800 Stack
.back().HasValue
= true;
803 void OStream::comment(llvm::StringRef Comment
) {
804 assert(PendingComment
.empty() && "Only one comment per value!");
805 PendingComment
= Comment
;
808 void OStream::flushComment() {
809 if (PendingComment
.empty())
811 OS
<< (IndentSize
? "/* " : "/*");
812 // Be sure not to accidentally emit "*/". Transform to "* /".
813 while (!PendingComment
.empty()) {
814 auto Pos
= PendingComment
.find("*/");
815 if (Pos
== StringRef::npos
) {
816 OS
<< PendingComment
;
819 OS
<< PendingComment
.take_front(Pos
) << "* /";
820 PendingComment
= PendingComment
.drop_front(Pos
+ 2);
823 OS
<< (IndentSize
? " */" : "*/");
824 // Comments are on their own line unless attached to an attribute value.
825 if (Stack
.size() > 1 && Stack
.back().Ctx
== Singleton
) {
833 void llvm::json::OStream::newline() {
840 void llvm::json::OStream::arrayBegin() {
842 Stack
.emplace_back();
843 Stack
.back().Ctx
= Array
;
844 Indent
+= IndentSize
;
848 void llvm::json::OStream::arrayEnd() {
849 assert(Stack
.back().Ctx
== Array
);
850 Indent
-= IndentSize
;
851 if (Stack
.back().HasValue
)
854 assert(PendingComment
.empty());
856 assert(!Stack
.empty());
859 void llvm::json::OStream::objectBegin() {
861 Stack
.emplace_back();
862 Stack
.back().Ctx
= Object
;
863 Indent
+= IndentSize
;
867 void llvm::json::OStream::objectEnd() {
868 assert(Stack
.back().Ctx
== Object
);
869 Indent
-= IndentSize
;
870 if (Stack
.back().HasValue
)
873 assert(PendingComment
.empty());
875 assert(!Stack
.empty());
878 void llvm::json::OStream::attributeBegin(llvm::StringRef Key
) {
879 assert(Stack
.back().Ctx
== Object
);
880 if (Stack
.back().HasValue
)
884 Stack
.back().HasValue
= true;
885 Stack
.emplace_back();
886 Stack
.back().Ctx
= Singleton
;
887 if (LLVM_LIKELY(isUTF8(Key
))) {
890 assert(false && "Invalid UTF-8 in attribute key");
891 quote(OS
, fixUTF8(Key
));
898 void llvm::json::OStream::attributeEnd() {
899 assert(Stack
.back().Ctx
== Singleton
);
900 assert(Stack
.back().HasValue
&& "Attribute must have a value");
901 assert(PendingComment
.empty());
903 assert(Stack
.back().Ctx
== Object
);
906 raw_ostream
&llvm::json::OStream::rawValueBegin() {
908 Stack
.emplace_back();
909 Stack
.back().Ctx
= RawValue
;
913 void llvm::json::OStream::rawValueEnd() {
914 assert(Stack
.back().Ctx
== RawValue
);
921 void llvm::format_provider
<llvm::json::Value
>::format(
922 const llvm::json::Value
&E
, raw_ostream
&OS
, StringRef Options
) {
923 unsigned IndentAmount
= 0;
924 if (!Options
.empty() && Options
.getAsInteger(/*Radix=*/10, IndentAmount
))
925 llvm_unreachable("json::Value format options should be an integer");
926 json::OStream(OS
, IndentAmount
).value(E
);