1 //=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===---------------------------------------------------------------------===//
9 #include "llvm/Support/JSON.h"
10 #include "llvm/ADT/STLExtras.h"
11 #include "llvm/Support/ConvertUTF.h"
12 #include "llvm/Support/Error.h"
13 #include "llvm/Support/Format.h"
14 #include "llvm/Support/raw_ostream.h"
20 Value
&Object::operator[](const ObjectKey
&K
) {
21 return try_emplace(K
, nullptr).first
->getSecond();
23 Value
&Object::operator[](ObjectKey
&&K
) {
24 return try_emplace(std::move(K
), nullptr).first
->getSecond();
26 Value
*Object::get(StringRef K
) {
32 const Value
*Object::get(StringRef K
) const {
38 llvm::Optional
<std::nullptr_t
> Object::getNull(StringRef K
) const {
40 return V
->getAsNull();
43 llvm::Optional
<bool> Object::getBoolean(StringRef K
) const {
45 return V
->getAsBoolean();
48 llvm::Optional
<double> Object::getNumber(StringRef K
) const {
50 return V
->getAsNumber();
53 llvm::Optional
<int64_t> Object::getInteger(StringRef K
) const {
55 return V
->getAsInteger();
58 llvm::Optional
<llvm::StringRef
> Object::getString(StringRef K
) const {
60 return V
->getAsString();
63 const json::Object
*Object::getObject(StringRef K
) const {
65 return V
->getAsObject();
68 json::Object
*Object::getObject(StringRef K
) {
70 return V
->getAsObject();
73 const json::Array
*Object::getArray(StringRef K
) const {
75 return V
->getAsArray();
78 json::Array
*Object::getArray(StringRef K
) {
80 return V
->getAsArray();
83 bool operator==(const Object
&LHS
, const Object
&RHS
) {
84 if (LHS
.size() != RHS
.size())
86 for (const auto &L
: LHS
) {
87 auto R
= RHS
.find(L
.first
);
88 if (R
== RHS
.end() || L
.second
!= R
->second
)
94 Array::Array(std::initializer_list
<Value
> Elements
) {
95 V
.reserve(Elements
.size());
96 for (const Value
&V
: Elements
) {
97 emplace_back(nullptr);
98 back().moveFrom(std::move(V
));
102 Value::Value(std::initializer_list
<Value
> Elements
)
103 : Value(json::Array(Elements
)) {}
105 void Value::copyFrom(const Value
&M
) {
112 memcpy(&Union
, &M
.Union
, sizeof(Union
));
115 create
<StringRef
>(M
.as
<StringRef
>());
118 create
<std::string
>(M
.as
<std::string
>());
121 create
<json::Object
>(M
.as
<json::Object
>());
124 create
<json::Array
>(M
.as
<json::Array
>());
129 void Value::moveFrom(const Value
&&M
) {
136 memcpy(&Union
, &M
.Union
, sizeof(Union
));
139 create
<StringRef
>(M
.as
<StringRef
>());
142 create
<std::string
>(std::move(M
.as
<std::string
>()));
146 create
<json::Object
>(std::move(M
.as
<json::Object
>()));
150 create
<json::Array
>(std::move(M
.as
<json::Array
>()));
156 void Value::destroy() {
164 as
<StringRef
>().~StringRef();
167 as
<std::string
>().~basic_string();
170 as
<json::Object
>().~Object();
173 as
<json::Array
>().~Array();
178 bool operator==(const Value
&L
, const Value
&R
) {
179 if (L
.kind() != R
.kind())
183 return *L
.getAsNull() == *R
.getAsNull();
185 return *L
.getAsBoolean() == *R
.getAsBoolean();
187 // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323
188 // The same integer must convert to the same double, per the standard.
189 // However we see 64-vs-80-bit precision comparisons with gcc-7 -O3 -m32.
190 // So we avoid floating point promotion for exact comparisons.
191 if (L
.Type
== Value::T_Integer
|| R
.Type
== Value::T_Integer
)
192 return L
.getAsInteger() == R
.getAsInteger();
193 return *L
.getAsNumber() == *R
.getAsNumber();
195 return *L
.getAsString() == *R
.getAsString();
197 return *L
.getAsArray() == *R
.getAsArray();
199 return *L
.getAsObject() == *R
.getAsObject();
201 llvm_unreachable("Unknown value kind");
204 void Path::report(llvm::StringLiteral Msg
) {
205 // Walk up to the root context, and count the number of segments.
208 for (P
= this; P
->Parent
!= nullptr; P
= P
->Parent
)
210 Path::Root
*R
= P
->Seg
.root();
211 // Fill in the error message and copy the path (in reverse order).
212 R
->ErrorMessage
= Msg
;
213 R
->ErrorPath
.resize(Count
);
214 auto It
= R
->ErrorPath
.begin();
215 for (P
= this; P
->Parent
!= nullptr; P
= P
->Parent
)
219 Error
Path::Root::getError() const {
221 raw_string_ostream
OS(S
);
222 OS
<< (ErrorMessage
.empty() ? "invalid JSON contents" : ErrorMessage
);
223 if (ErrorPath
.empty()) {
225 OS
<< " when parsing " << Name
;
227 OS
<< " at " << (Name
.empty() ? "(root)" : Name
);
228 for (const Path::Segment
&S
: llvm::reverse(ErrorPath
)) {
230 OS
<< '.' << S
.field();
232 OS
<< '[' << S
.index() << ']';
235 return createStringError(llvm::inconvertibleErrorCode(), OS
.str());
240 std::vector
<const Object::value_type
*> sortedElements(const Object
&O
) {
241 std::vector
<const Object::value_type
*> Elements
;
242 for (const auto &E
: O
)
243 Elements
.push_back(&E
);
245 [](const Object::value_type
*L
, const Object::value_type
*R
) {
246 return L
->first
< R
->first
;
251 // Prints a one-line version of a value that isn't our main focus.
252 // We interleave writes to OS and JOS, exploiting the lack of extra buffering.
253 // This is OK as we own the implementation.
254 void abbreviate(const Value
&V
, OStream
&JOS
) {
257 JOS
.rawValue(V
.getAsArray()->empty() ? "[]" : "[ ... ]");
260 JOS
.rawValue(V
.getAsObject()->empty() ? "{}" : "{ ... }");
262 case Value::String
: {
263 llvm::StringRef S
= *V
.getAsString();
267 std::string Truncated
= fixUTF8(S
.take_front(37));
268 Truncated
.append("...");
269 JOS
.value(Truncated
);
278 // Prints a semi-expanded version of a value that is our main focus.
279 // Array/Object entries are printed, but not recursively as they may be huge.
280 void abbreviateChildren(const Value
&V
, OStream
&JOS
) {
284 for (const auto &I
: *V
.getAsArray())
290 for (const auto *KV
: sortedElements(*V
.getAsObject())) {
291 JOS
.attributeBegin(KV
->first
);
292 abbreviate(KV
->second
, JOS
);
304 void Path::Root::printErrorContext(const Value
&R
, raw_ostream
&OS
) const {
305 OStream
JOS(OS
, /*IndentSize=*/2);
306 // PrintValue recurses down the path, printing the ancestors of our target.
307 // Siblings of nodes along the path are printed with abbreviate(), and the
308 // target itself is printed with the somewhat richer abbreviateChildren().
309 // 'Recurse' is the lambda itself, to allow recursive calls.
310 auto PrintValue
= [&](const Value
&V
, ArrayRef
<Segment
> Path
, auto &Recurse
) {
311 // Print the target node itself, with the error as a comment.
312 // Also used if we can't follow our path, e.g. it names a field that
313 // *should* exist but doesn't.
314 auto HighlightCurrent
= [&] {
315 std::string Comment
= "error: ";
316 Comment
.append(ErrorMessage
.data(), ErrorMessage
.size());
317 JOS
.comment(Comment
);
318 abbreviateChildren(V
, JOS
);
320 if (Path
.empty()) // We reached our target.
321 return HighlightCurrent();
322 const Segment
&S
= Path
.back(); // Path is in reverse order.
324 // Current node is an object, path names a field.
325 llvm::StringRef FieldName
= S
.field();
326 const Object
*O
= V
.getAsObject();
327 if (!O
|| !O
->get(FieldName
))
328 return HighlightCurrent();
330 for (const auto *KV
: sortedElements(*O
)) {
331 JOS
.attributeBegin(KV
->first
);
332 if (FieldName
.equals(KV
->first
))
333 Recurse(KV
->second
, Path
.drop_back(), Recurse
);
335 abbreviate(KV
->second
, JOS
);
340 // Current node is an array, path names an element.
341 const Array
*A
= V
.getAsArray();
342 if (!A
|| S
.index() >= A
->size())
343 return HighlightCurrent();
345 unsigned Current
= 0;
346 for (const auto &V
: *A
) {
347 if (Current
++ == S
.index())
348 Recurse(V
, Path
.drop_back(), Recurse
);
355 PrintValue(R
, ErrorPath
, PrintValue
);
359 // Simple recursive-descent JSON parser.
362 Parser(StringRef JSON
)
363 : Start(JSON
.begin()), P(JSON
.begin()), End(JSON
.end()) {}
367 if (isUTF8(StringRef(Start
, End
- Start
), &ErrOffset
))
369 P
= Start
+ ErrOffset
; // For line/column calculation.
370 return parseError("Invalid UTF-8 sequence");
373 bool parseValue(Value
&Out
);
379 return parseError("Text after end of document");
384 return std::move(*Err
);
388 void eatWhitespace() {
389 while (P
!= End
&& (*P
== ' ' || *P
== '\r' || *P
== '\n' || *P
== '\t'))
393 // On invalid syntax, parseX() functions return false and set Err.
394 bool parseNumber(char First
, Value
&Out
);
395 bool parseString(std::string
&Out
);
396 bool parseUnicode(std::string
&Out
);
397 bool parseError(const char *Msg
); // always returns false
399 char next() { return P
== End
? 0 : *P
++; }
400 char peek() { return P
== End
? 0 : *P
; }
// Character-class test used while scanning a number token: true for the
// decimal digits, the exponent markers 'e'/'E', the signs '+'/'-', and '.'.
// This only classifies a single character; it does not check that a
// sequence of such characters forms a well-formed number.
static bool isNumber(char C) {
  switch (C) {
  case '0':
  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
  case '8':
  case '9':
  case 'e':
  case 'E':
  case '+':
  case '-':
  case '.':
    return true;
  default:
    return false;
  }
}
408 const char *Start
, *P
, *End
;
411 bool Parser::parseValue(Value
&Out
) {
414 return parseError("Unexpected EOF");
415 switch (char C
= next()) {
416 // Bare null/true/false are easy - first char identifies them.
419 return (next() == 'u' && next() == 'l' && next() == 'l') ||
420 parseError("Invalid JSON value (null?)");
423 return (next() == 'r' && next() == 'u' && next() == 'e') ||
424 parseError("Invalid JSON value (true?)");
427 return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
428 parseError("Invalid JSON value (false?)");
431 if (parseString(S
)) {
439 Array
&A
= *Out
.getAsArray();
446 A
.emplace_back(nullptr);
447 if (!parseValue(A
.back()))
457 return parseError("Expected , or ] after array element");
463 Object
&O
= *Out
.getAsObject();
471 return parseError("Expected object key");
477 return parseError("Expected : after object key");
479 if (!parseValue(O
[std::move(K
)]))
489 return parseError("Expected , or } after object property");
495 return parseNumber(C
, Out
);
496 return parseError("Invalid JSON value");
500 bool Parser::parseNumber(char First
, Value
&Out
) {
501 // Read the number into a string. (Must be null-terminated for strto*).
504 while (isNumber(peek()))
507 // Try first to parse as integer, and if so preserve full 64 bits.
508 // strtoll returns long long >= 64 bits, so check it's in range too.
509 auto I
= std::strtoll(S
.c_str(), &End
, 10);
510 if (End
== S
.end() && I
>= std::numeric_limits
<int64_t>::min() &&
511 I
<= std::numeric_limits
<int64_t>::max()) {
515 // If it's not an integer
516 Out
= std::strtod(S
.c_str(), &End
);
517 return End
== S
.end() || parseError("Invalid JSON value (number?)");
520 bool Parser::parseString(std::string
&Out
) {
521 // leading quote was already consumed.
522 for (char C
= next(); C
!= '"'; C
= next()) {
523 if (LLVM_UNLIKELY(P
== End
))
524 return parseError("Unterminated string");
525 if (LLVM_UNLIKELY((C
& 0x1f) == C
))
526 return parseError("Control character in string");
527 if (LLVM_LIKELY(C
!= '\\')) {
531 // Handle escape sequence.
532 switch (C
= next()) {
554 if (!parseUnicode(Out
))
558 return parseError("Invalid escape sequence");
564 static void encodeUtf8(uint32_t Rune
, std::string
&Out
) {
566 Out
.push_back(Rune
& 0x7F);
567 } else if (Rune
< 0x800) {
568 uint8_t FirstByte
= 0xC0 | ((Rune
& 0x7C0) >> 6);
569 uint8_t SecondByte
= 0x80 | (Rune
& 0x3F);
570 Out
.push_back(FirstByte
);
571 Out
.push_back(SecondByte
);
572 } else if (Rune
< 0x10000) {
573 uint8_t FirstByte
= 0xE0 | ((Rune
& 0xF000) >> 12);
574 uint8_t SecondByte
= 0x80 | ((Rune
& 0xFC0) >> 6);
575 uint8_t ThirdByte
= 0x80 | (Rune
& 0x3F);
576 Out
.push_back(FirstByte
);
577 Out
.push_back(SecondByte
);
578 Out
.push_back(ThirdByte
);
579 } else if (Rune
< 0x110000) {
580 uint8_t FirstByte
= 0xF0 | ((Rune
& 0x1F0000) >> 18);
581 uint8_t SecondByte
= 0x80 | ((Rune
& 0x3F000) >> 12);
582 uint8_t ThirdByte
= 0x80 | ((Rune
& 0xFC0) >> 6);
583 uint8_t FourthByte
= 0x80 | (Rune
& 0x3F);
584 Out
.push_back(FirstByte
);
585 Out
.push_back(SecondByte
);
586 Out
.push_back(ThirdByte
);
587 Out
.push_back(FourthByte
);
589 llvm_unreachable("Invalid codepoint");
593 // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
594 // May parse several sequential escapes to ensure proper surrogate handling.
595 // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
596 // These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
597 bool Parser::parseUnicode(std::string
&Out
) {
598 // Invalid UTF is not a JSON error (RFC 8259§8.2). It gets replaced by U+FFFD.
599 auto Invalid
= [&] { Out
.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
600 // Decodes 4 hex digits from the stream into Out, returns false on error.
601 auto Parse4Hex
= [this](uint16_t &Out
) -> bool {
603 char Bytes
[] = {next(), next(), next(), next()};
604 for (unsigned char C
: Bytes
) {
605 if (!std::isxdigit(C
))
606 return parseError("Invalid \\u escape sequence");
608 Out
|= (C
> '9') ? (C
& ~0x20) - 'A' + 10 : (C
- '0');
612 uint16_t First
; // UTF-16 code unit from the first \u escape.
613 if (!Parse4Hex(First
))
616 // We loop to allow proper surrogate-pair error handling.
618 // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
619 if (LLVM_LIKELY(First
< 0xD800 || First
>= 0xE000)) {
620 encodeUtf8(First
, Out
);
624 // Case 2: it's an (unpaired) trailing surrogate.
625 if (LLVM_UNLIKELY(First
>= 0xDC00)) {
630 // Case 3: it's a leading surrogate. We expect a trailing one next.
631 // Case 3a: there's no trailing \u escape. Don't advance in the stream.
632 if (LLVM_UNLIKELY(P
+ 2 > End
|| *P
!= '\\' || *(P
+ 1) != 'u')) {
633 Invalid(); // Leading surrogate was unpaired.
638 if (!Parse4Hex(Second
))
640 // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
641 if (LLVM_UNLIKELY(Second
< 0xDC00 || Second
>= 0xE000)) {
642 Invalid(); // Leading surrogate was unpaired.
643 First
= Second
; // Second escape still needs to be processed.
646 // Case 3c: a valid surrogate pair encoding an astral codepoint.
647 encodeUtf8(0x10000 | ((First
- 0xD800) << 10) | (Second
- 0xDC00), Out
);
652 bool Parser::parseError(const char *Msg
) {
654 const char *StartOfLine
= Start
;
655 for (const char *X
= Start
; X
< P
; ++X
) {
662 std::make_unique
<ParseError
>(Msg
, Line
, P
- StartOfLine
, P
- Start
));
667 Expected
<Value
> parse(StringRef JSON
) {
674 return P
.takeError();
676 char ParseError::ID
= 0;
678 bool isUTF8(llvm::StringRef S
, size_t *ErrOffset
) {
679 // Fast-path for ASCII, which is valid UTF-8.
680 if (LLVM_LIKELY(isASCII(S
)))
683 const UTF8
*Data
= reinterpret_cast<const UTF8
*>(S
.data()), *Rest
= Data
;
684 if (LLVM_LIKELY(isLegalUTF8String(&Rest
, Data
+ S
.size())))
688 *ErrOffset
= Rest
- Data
;
692 std::string
fixUTF8(llvm::StringRef S
) {
693 // This isn't particularly efficient, but is only for error-recovery.
694 std::vector
<UTF32
> Codepoints(S
.size()); // 1 codepoint per byte suffices.
695 const UTF8
*In8
= reinterpret_cast<const UTF8
*>(S
.data());
696 UTF32
*Out32
= Codepoints
.data();
697 ConvertUTF8toUTF32(&In8
, In8
+ S
.size(), &Out32
, Out32
+ Codepoints
.size(),
699 Codepoints
.resize(Out32
- Codepoints
.data());
700 std::string
Res(4 * Codepoints
.size(), 0); // 4 bytes per codepoint suffice
701 const UTF32
*In32
= Codepoints
.data();
702 UTF8
*Out8
= reinterpret_cast<UTF8
*>(&Res
[0]);
703 ConvertUTF32toUTF8(&In32
, In32
+ Codepoints
.size(), &Out8
, Out8
+ Res
.size(),
705 Res
.resize(reinterpret_cast<char *>(Out8
) - Res
.data());
709 static void quote(llvm::raw_ostream
&OS
, llvm::StringRef S
) {
711 for (unsigned char C
: S
) {
712 if (C
== 0x22 || C
== 0x5C)
720 // A few characters are common enough to make short escapes worthwhile.
732 llvm::write_hex(OS
, C
, llvm::HexPrintStyle::Lower
, 4);
739 void llvm::json::OStream::value(const Value
&V
) {
747 OS
<< (*V
.getAsBoolean() ? "true" : "false");
751 if (V
.Type
== Value::T_Integer
)
752 OS
<< *V
.getAsInteger();
754 OS
<< format("%.*g", std::numeric_limits
<double>::max_digits10
,
759 quote(OS
, *V
.getAsString());
763 for (const Value
&E
: *V
.getAsArray())
768 for (const Object::value_type
*E
: sortedElements(*V
.getAsObject()))
769 attribute(E
->first
, E
->second
);
774 void llvm::json::OStream::valueBegin() {
775 assert(Stack
.back().Ctx
!= Object
&& "Only attributes allowed here");
776 if (Stack
.back().HasValue
) {
777 assert(Stack
.back().Ctx
!= Singleton
&& "Only one value allowed here");
780 if (Stack
.back().Ctx
== Array
)
783 Stack
.back().HasValue
= true;
786 void OStream::comment(llvm::StringRef Comment
) {
787 assert(PendingComment
.empty() && "Only one comment per value!");
788 PendingComment
= Comment
;
791 void OStream::flushComment() {
792 if (PendingComment
.empty())
794 OS
<< (IndentSize
? "/* " : "/*");
795 // Be sure not to accidentally emit "*/". Transform to "* /".
796 while (!PendingComment
.empty()) {
797 auto Pos
= PendingComment
.find("*/");
798 if (Pos
== StringRef::npos
) {
799 OS
<< PendingComment
;
802 OS
<< PendingComment
.take_front(Pos
) << "* /";
803 PendingComment
= PendingComment
.drop_front(Pos
+ 2);
806 OS
<< (IndentSize
? " */" : "*/");
807 // Comments are on their own line unless attached to an attribute value.
808 if (Stack
.size() > 1 && Stack
.back().Ctx
== Singleton
) {
816 void llvm::json::OStream::newline() {
823 void llvm::json::OStream::arrayBegin() {
825 Stack
.emplace_back();
826 Stack
.back().Ctx
= Array
;
827 Indent
+= IndentSize
;
831 void llvm::json::OStream::arrayEnd() {
832 assert(Stack
.back().Ctx
== Array
);
833 Indent
-= IndentSize
;
834 if (Stack
.back().HasValue
)
837 assert(PendingComment
.empty());
839 assert(!Stack
.empty());
842 void llvm::json::OStream::objectBegin() {
844 Stack
.emplace_back();
845 Stack
.back().Ctx
= Object
;
846 Indent
+= IndentSize
;
850 void llvm::json::OStream::objectEnd() {
851 assert(Stack
.back().Ctx
== Object
);
852 Indent
-= IndentSize
;
853 if (Stack
.back().HasValue
)
856 assert(PendingComment
.empty());
858 assert(!Stack
.empty());
861 void llvm::json::OStream::attributeBegin(llvm::StringRef Key
) {
862 assert(Stack
.back().Ctx
== Object
);
863 if (Stack
.back().HasValue
)
867 Stack
.back().HasValue
= true;
868 Stack
.emplace_back();
869 Stack
.back().Ctx
= Singleton
;
870 if (LLVM_LIKELY(isUTF8(Key
))) {
873 assert(false && "Invalid UTF-8 in attribute key");
874 quote(OS
, fixUTF8(Key
));
881 void llvm::json::OStream::attributeEnd() {
882 assert(Stack
.back().Ctx
== Singleton
);
883 assert(Stack
.back().HasValue
&& "Attribute must have a value");
884 assert(PendingComment
.empty());
886 assert(Stack
.back().Ctx
== Object
);
889 raw_ostream
&llvm::json::OStream::rawValueBegin() {
891 Stack
.emplace_back();
892 Stack
.back().Ctx
= RawValue
;
896 void llvm::json::OStream::rawValueEnd() {
897 assert(Stack
.back().Ctx
== RawValue
);
904 void llvm::format_provider
<llvm::json::Value
>::format(
905 const llvm::json::Value
&E
, raw_ostream
&OS
, StringRef Options
) {
906 unsigned IndentAmount
= 0;
907 if (!Options
.empty() && Options
.getAsInteger(/*Radix=*/10, IndentAmount
))
908 llvm_unreachable("json::Value format options should be an integer");
909 json::OStream(OS
, IndentAmount
).value(E
);