1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/json/json_parser.h"
9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h"
11 #include "base/strings/string_number_conversions.h"
12 #include "base/strings/string_piece.h"
13 #include "base/strings/string_util.h"
14 #include "base/strings/stringprintf.h"
15 #include "base/strings/utf_string_conversion_utils.h"
16 #include "base/strings/utf_string_conversions.h"
17 #include "base/third_party/icu/icu_utf.h"
18 #include "base/values.h"
// Nesting depth at which StackMarker::IsTooDeep() starts returning true.
25 const int kStackMaxDepth
= 100;
// First value outside 7-bit ASCII. Code points below it are appended to a
// string as a single char; the others are routed through DecodeUTF8().
27 const int32 kExtendedASCIIStart
= 0x80;
29 // This and the class below are used to own the JSON input string for when
30 // string tokens are stored as StringPiece instead of std::string. This
31 // optimization avoids about 2/3rds of string memory copies. The constructor
32 // takes ownership of the input string. The real root value is Swap()ed into
// the new instance.
34 class DictionaryHiddenRootValue
: public base::DictionaryValue
{
// Takes ownership of |json| (held in json_) and swaps the contents of |root|,
// which is DCHECKed to be a dictionary, into this object.
36 DictionaryHiddenRootValue(std::string
* json
, Value
* root
) : json_(json
) {
37 DCHECK(root
->IsType(Value::TYPE_DICTIONARY
));
38 DictionaryValue::Swap(static_cast<DictionaryValue
*>(root
));
// Overrides DictionaryValue::Swap. Logged at verbosity 1 because this path
// deep-copies the dictionary.
41 void Swap(DictionaryValue
* other
) override
{
42 DVLOG(1) << "Swap()ing a DictionaryValue inefficiently.";
44 // First deep copy to convert JSONStringValue to std::string and swap that
45 // copy with |other|, which contains the new contents of |this|.
46 scoped_ptr
<base::DictionaryValue
> copy(DeepCopy());
49 // Then erase the contents of the current dictionary and swap in the
50 // new contents, originally from |other|.
53 DictionaryValue::Swap(copy
.get());
56 // Not overriding DictionaryValue::Remove because it just calls through to
// the method below.
//
// Removes |key|. When the caller asks for the removed value, it is handed back
// through |out| as a deep copy.
59 bool RemoveWithoutPathExpansion(const std::string
& key
,
60 scoped_ptr
<Value
>* out
) override
{
61 // If the caller won't take ownership of the removed value, just call up.
63 return DictionaryValue::RemoveWithoutPathExpansion(key
, out
);
65 DVLOG(1) << "Remove()ing from a DictionaryValue inefficiently.";
67 // Otherwise, remove the value while it's still "owned" by this and copy it
68 // to convert any JSONStringValues to std::string.
69 scoped_ptr
<Value
> out_owned
;
70 if (!DictionaryValue::RemoveWithoutPathExpansion(key
, &out_owned
))
73 out
->reset(out_owned
->DeepCopy());
// The JSON input text whose ownership was passed to the constructor.
79 scoped_ptr
<std::string
> json_
;
81 DISALLOW_COPY_AND_ASSIGN(DictionaryHiddenRootValue
);
// List counterpart of the dictionary hidden root: a ListValue that also holds
// the JSON input string in json_.
84 class ListHiddenRootValue
: public base::ListValue
{
// Takes ownership of |json| (held in json_) and swaps the contents of |root|,
// which is DCHECKed to be a list, into this object.
86 ListHiddenRootValue(std::string
* json
, Value
* root
) : json_(json
) {
87 DCHECK(root
->IsType(Value::TYPE_LIST
));
88 ListValue::Swap(static_cast<ListValue
*>(root
));
// Overrides ListValue::Swap. Logged at verbosity 1 because this path
// deep-copies the list.
91 void Swap(ListValue
* other
) override
{
92 DVLOG(1) << "Swap()ing a ListValue inefficiently.";
94 // First deep copy to convert JSONStringValue to std::string and swap that
95 // copy with |other|, which contains the new contents of |this|.
96 scoped_ptr
<base::ListValue
> copy(DeepCopy());
99 // Then erase the contents of the current list and swap in the new contents,
100 // originally from |other|.
103 ListValue::Swap(copy
.get());
// Removes the element at |index|. When the caller asks for the removed value,
// it is handed back through |out| as a deep copy.
106 bool Remove(size_t index
, scoped_ptr
<Value
>* out
) override
{
107 // If the caller won't take ownership of the removed value, just call up.
109 return ListValue::Remove(index
, out
);
111 DVLOG(1) << "Remove()ing from a ListValue inefficiently.";
113 // Otherwise, remove the value while it's still "owned" by this and copy it
114 // to convert any JSONStringValues to std::string.
115 scoped_ptr
<Value
> out_owned
;
116 if (!ListValue::Remove(index
, &out_owned
))
119 out
->reset(out_owned
->DeepCopy());
// The JSON input text whose ownership was passed to the constructor.
125 scoped_ptr
<std::string
> json_
;
127 DISALLOW_COPY_AND_ASSIGN(ListHiddenRootValue
);
130 // A variant on StringValue that uses StringPiece instead of copying the string
131 // into the Value. This can only be stored in a child of hidden root (above),
132 // otherwise the referenced string will not be guaranteed to outlive it.
133 class JSONStringValue
: public base::Value
{
// Wraps |piece| without copying it; the value is constructed as TYPE_STRING.
135 explicit JSONStringValue(const base::StringPiece
& piece
)
136 : Value(TYPE_STRING
),
137 string_piece_(piece
) {
140 // Overridden from base::Value:
// Copies the referenced bytes into |out_value|.
141 bool GetAsString(std::string
* out_value
) const override
{
142 string_piece_
.CopyToString(out_value
);
// Writes the referenced text, converted from UTF-8 to UTF-16, to |out_value|.
145 bool GetAsString(string16
* out_value
) const override
{
146 *out_value
= UTF8ToUTF16(string_piece_
);
// Returns a newly allocated StringValue that holds its own std::string copy
// of the text instead of a view into the input.
149 Value
* DeepCopy() const override
{
150 return new StringValue(string_piece_
.as_string());
// True when |other| is a TYPE_STRING value whose text equals string_piece_.
152 bool Equals(const Value
* other
) const override
{
153 std::string other_string
;
154 return other
->IsType(TYPE_STRING
) && other
->GetAsString(&other_string
) &&
155 StringPiece(other_string
) == string_piece_
;
159 // The location in the original input stream.
160 base::StringPiece string_piece_
;
162 DISALLOW_COPY_AND_ASSIGN(JSONStringValue
);
165 // Simple class that checks for maximum recursion/"stack overflow."
// |depth| points at the caller's nesting counter, which is DCHECKed not to
// exceed kStackMaxDepth.
168 explicit StackMarker(int* depth
) : depth_(depth
) {
170 DCHECK_LE(*depth_
, kStackMaxDepth
);
// True once the tracked depth has reached kStackMaxDepth.
176 bool IsTooDeep() const {
177 return *depth_
>= kStackMaxDepth
;
183 DISALLOW_COPY_AND_ASSIGN(StackMarker
);
// Constructs a parser whose error_code_ starts out as JSON_NO_ERROR.
// NOTE(review): |options| presumably seeds the options_ flag word tested
// elsewhere in this file; that initializer is not visible here -- confirm.
188 JSONParser::JSONParser(int options
)
197 error_code_(JSONReader::JSON_NO_ERROR
),
202 JSONParser::~JSONParser() {
// Parses |input| and returns the root Value as a raw pointer that the caller
// owns. Unless JSON_DETACHABLE_CHILDREN is set, the input is copied first and
// a dictionary or list root is wrapped in a hidden root that takes ownership
// of that copy.
205 Value
* JSONParser::Parse(const StringPiece
& input
) {
206 scoped_ptr
<std::string
> input_copy
;
207 // If the children of a JSON root can be detached, then hidden roots cannot
208 // be used, so do not bother copying the input because StringPiece will not
// be used anywhere.
210 if (!(options_
& JSON_DETACHABLE_CHILDREN
)) {
211 input_copy
.reset(new std::string(input
.as_string()));
212 start_pos_
= input_copy
->data();
214 start_pos_
= input
.data();
217 end_pos_
= start_pos_
+ input
.length();
220 index_last_line_
= 0;
222 error_code_
= JSONReader::JSON_NO_ERROR
;
226 // When the input JSON string starts with a UTF-8 Byte-Order-Mark
227 // <0xEF 0xBB 0xBF>, advance the start position to avoid the
228 // ParseNextToken function mis-treating a Unicode BOM as an invalid
229 // character and returning NULL.
230 if (CanConsume(3) && static_cast<uint8
>(*pos_
) == 0xEF &&
231 static_cast<uint8
>(*(pos_
+ 1)) == 0xBB &&
232 static_cast<uint8
>(*(pos_
+ 2)) == 0xBF) {
236 // Parse the first and any nested tokens.
237 scoped_ptr
<Value
> root(ParseNextToken());
241 // Make sure the input stream is at an end.
242 if (GetNextToken() != T_END_OF_INPUT
) {
243 if (!CanConsume(1) || (NextChar() && GetNextToken() != T_END_OF_INPUT
)) {
244 ReportError(JSONReader::JSON_UNEXPECTED_DATA_AFTER_ROOT
, 1);
249 // Dictionaries and lists can contain JSONStringValues, so wrap them in a
// hidden root.
251 if (!(options_
& JSON_DETACHABLE_CHILDREN
)) {
252 if (root
->IsType(Value::TYPE_DICTIONARY
)) {
253 return new DictionaryHiddenRootValue(input_copy
.release(), root
.get());
254 } else if (root
->IsType(Value::TYPE_LIST
)) {
255 return new ListHiddenRootValue(input_copy
.release(), root
.get());
256 } else if (root
->IsType(Value::TYPE_STRING
)) {
257 // A string type could be a JSONStringValue, but because there's no
258 // corresponding HiddenRootValue, the memory will be lost. Deep copy to
// preserve it.
260 return root
->DeepCopy();
264 // All other values can be returned directly.
265 return root
.release();
268 JSONReader::JsonParseError
JSONParser::error_code() const {
// Returns the recorded error as text: error_line_, error_column_ and the
// description of error_code_, combined by FormatErrorMessage().
272 std::string
JSONParser::GetErrorMessage() const {
273 return FormatErrorMessage(error_line_
, error_column_
,
274 JSONReader::ErrorCodeToString(error_code_
));
277 // StringBuilder ///////////////////////////////////////////////////////////////
279 JSONParser::StringBuilder::StringBuilder()
285 JSONParser::StringBuilder::StringBuilder(const char* pos
)
// Exchanges the complete state of this builder (string_, pos_ and length_)
// with |other|.
291 void JSONParser::StringBuilder::Swap(StringBuilder
* other
) {
292 std::swap(other
->string_
, string_
);
293 std::swap(other
->pos_
, pos_
);
294 std::swap(other
->length_
, length_
);
297 JSONParser::StringBuilder::~StringBuilder() {
// Appends the single character |c|. The visible statement pushes it onto the
// std::string pointed to by string_.
301 void JSONParser::StringBuilder::Append(const char& c
) {
306 string_
->push_back(c
);
// Appends all of |str| to the std::string pointed to by string_.
311 void JSONParser::StringBuilder::AppendString(const std::string
& str
) {
313 string_
->append(str
);
// Switches the builder to an allocated std::string: string_ is set to a new
// string holding a copy of the length_ bytes starting at pos_.
316 void JSONParser::StringBuilder::Convert() {
319 string_
= new std::string(pos_
, length_
);
322 bool JSONParser::StringBuilder::CanBeStringPiece() const {
// Returns either an empty StringPiece or a view of the length_ bytes at pos_.
// NOTE(review): the condition selecting the empty result is not visible here;
// presumably it applies once the builder holds a std::string -- confirm.
326 StringPiece
JSONParser::StringBuilder::AsStringPiece() {
328 return StringPiece();
329 return StringPiece(pos_
, length_
);
332 const std::string
& JSONParser::StringBuilder::AsString() {
338 // JSONParser private //////////////////////////////////////////////////////////
// True when pos_ + |length| does not go past end_pos_.
340 inline bool JSONParser::CanConsume(int length
) {
341 return pos_
+ length
<= end_pos_
;
// Returns a pointer into the input. Callers must make sure CanConsume(1)
// holds first; this is only DCHECKed.
344 const char* JSONParser::NextChar() {
345 DCHECK(CanConsume(1));
// Multi-character counterpart of NextChar(). Callers must make sure
// CanConsume(|n|) holds first; this is only DCHECKed.
351 void JSONParser::NextNChars(int n
) {
352 DCHECK(CanConsume(n
));
// Skips whitespace and comments, then reports the kind of token found.
// T_END_OF_INPUT and T_INVALID_TOKEN are among the possible results.
357 JSONParser::Token
JSONParser::GetNextToken() {
358 EatWhitespaceAndComments();
360 return T_END_OF_INPUT
;
364 return T_OBJECT_BEGIN
;
368 return T_ARRAY_BEGIN
;
392 return T_LIST_SEPARATOR
;
394 return T_OBJECT_PAIR_SEPARATOR
;
396 return T_INVALID_TOKEN
;
// Loops while pos_ is before end_pos_. index_last_line_ is updated to the
// current index_ here, and ReportError() later measures error columns from it.
400 void JSONParser::EatWhitespaceAndComments() {
401 while (pos_
< end_pos_
) {
405 index_last_line_
= index_
;
406 // Don't increment line_number_ twice for "\r\n".
407 if (!(*pos_
== '\n' && pos_
> start_pos_
&& *(pos_
- 1) == '\r'))
// Consumes a "//" line comment or a "/* ... */" block comment that starts at
// pos_.
// NOTE(review): *pos_ is read before CanConsume(1) is evaluated in the first
// condition -- confirm every caller guarantees pos_ < end_pos_.
424 bool JSONParser::EatComment() {
425 if (*pos_
!= '/' || !CanConsume(1))
428 char next_char
= *NextChar();
429 if (next_char
== '/') {
430 // Single line comment, read to newline.
431 while (CanConsume(1)) {
432 next_char
= *NextChar();
433 if (next_char
== '\n' || next_char
== '\r')
436 } else if (next_char
== '*') {
437 char previous_char
= '\0';
438 // Block comment, read until end marker.
439 while (CanConsume(1)) {
440 next_char
= *NextChar();
441 if (previous_char
== '*' && next_char
== '/') {
442 // EatWhitespaceAndComments will inspect pos_, which will still be on
443 // the last / of the comment, so advance once more (which may also be
448 previous_char
= next_char
;
451 // If the comment is unterminated, GetNextToken will report T_END_OF_INPUT.
// Reads the next token with GetNextToken() and hands it to ParseToken().
457 Value
* JSONParser::ParseNextToken() {
458 return ParseToken(GetNextToken());
// Routes |token| to one of ConsumeDictionary(), ConsumeList(),
// ConsumeString(), ConsumeNumber() or ConsumeLiteral(). The remaining path
// reports JSON_UNEXPECTED_TOKEN.
461 Value
* JSONParser::ParseToken(Token token
) {
464 return ConsumeDictionary();
466 return ConsumeList();
468 return ConsumeString();
470 return ConsumeNumber();
474 return ConsumeLiteral();
476 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
// Parses a JSON object into a new DictionaryValue and returns it released from
// its scoped_ptr. Keys must be string tokens, each followed by
// T_OBJECT_PAIR_SEPARATOR and a value. A comma directly before the closing
// brace is an error unless JSON_ALLOW_TRAILING_COMMAS is set. Nesting is
// limited through StackMarker.
481 Value
* JSONParser::ConsumeDictionary() {
483 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
487 StackMarker
depth_check(&stack_depth_
);
488 if (depth_check
.IsTooDeep()) {
489 ReportError(JSONReader::JSON_TOO_MUCH_NESTING
, 1);
493 scoped_ptr
<DictionaryValue
> dict(new DictionaryValue
);
496 Token token
= GetNextToken();
497 while (token
!= T_OBJECT_END
) {
498 if (token
!= T_STRING
) {
499 ReportError(JSONReader::JSON_UNQUOTED_DICTIONARY_KEY
, 1);
503 // First consume the key.
505 if (!ConsumeStringRaw(&key
)) {
509 // Read the separator.
511 token
= GetNextToken();
512 if (token
!= T_OBJECT_PAIR_SEPARATOR
) {
513 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
517 // The next token is the value. Ownership transfers to |dict|.
519 Value
* value
= ParseNextToken();
521 // ReportError from deeper level.
525 dict
->SetWithoutPathExpansion(key
.AsString(), value
);
528 token
= GetNextToken();
529 if (token
== T_LIST_SEPARATOR
) {
531 token
= GetNextToken();
532 if (token
== T_OBJECT_END
&& !(options_
& JSON_ALLOW_TRAILING_COMMAS
)) {
533 ReportError(JSONReader::JSON_TRAILING_COMMA
, 1);
536 } else if (token
!= T_OBJECT_END
) {
// NOTE(review): this passes a column adjustment of 0, while the matching
// check in ConsumeList() passes 1 -- confirm the difference is intended.
537 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 0);
542 return dict
.release();
// Parses a JSON array into a new ListValue and returns it released from its
// scoped_ptr. Each item is parsed with ParseToken(). A comma directly before
// the closing bracket is an error unless JSON_ALLOW_TRAILING_COMMAS is set.
// Nesting is limited through StackMarker.
545 Value
* JSONParser::ConsumeList() {
547 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
551 StackMarker
depth_check(&stack_depth_
);
552 if (depth_check
.IsTooDeep()) {
553 ReportError(JSONReader::JSON_TOO_MUCH_NESTING
, 1);
557 scoped_ptr
<ListValue
> list(new ListValue
);
560 Token token
= GetNextToken();
561 while (token
!= T_ARRAY_END
) {
562 Value
* item
= ParseToken(token
);
564 // ReportError from deeper level.
571 token
= GetNextToken();
572 if (token
== T_LIST_SEPARATOR
) {
574 token
= GetNextToken();
575 if (token
== T_ARRAY_END
&& !(options_
& JSON_ALLOW_TRAILING_COMMAS
)) {
576 ReportError(JSONReader::JSON_TRAILING_COMMA
, 1);
579 } else if (token
!= T_ARRAY_END
) {
580 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
585 return list
.release();
// Parses a string token into a Value. The result is a JSONStringValue that
// views the input when the text can stay a StringPiece and
// JSON_DETACHABLE_CHILDREN is not set; otherwise it is a StringValue built
// from a std::string.
588 Value
* JSONParser::ConsumeString() {
589 StringBuilder string
;
590 if (!ConsumeStringRaw(&string
))
593 // Create the Value representation, using a hidden root, if configured
594 // to do so, and if the string can be represented by StringPiece.
595 if (string
.CanBeStringPiece() && !(options_
& JSON_DETACHABLE_CHILDREN
)) {
596 return new JSONStringValue(string
.AsStringPiece());
598 if (string
.CanBeStringPiece())
600 return new StringValue(string
.AsString());
// Reads a quoted string and decodes its escape sequences. Callers treat a
// false return as failure. Input is walked one code point at a time with
// CBU8_NEXT; invalid code points are reported as JSON_UNSUPPORTED_ENCODING and
// malformed escapes as JSON_INVALID_ESCAPE.
// NOTE(review): the statement that hands the result to |out| is not visible
// here.
604 bool JSONParser::ConsumeStringRaw(StringBuilder
* out
) {
606 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
610 // StringBuilder will internally build a StringPiece unless a UTF-16
611 // conversion occurs, at which point it will perform a copy into a
// std::string.
613 StringBuilder
string(NextChar());
615 int length
= end_pos_
- start_pos_
;
618 while (CanConsume(1)) {
619 pos_
= start_pos_
+ index_
; // CBU8_NEXT is postcrement.
620 CBU8_NEXT(start_pos_
, index_
, length
, next_char
);
621 if (next_char
< 0 || !IsValidCharacter(next_char
)) {
622 ReportError(JSONReader::JSON_UNSUPPORTED_ENCODING
, 1);
626 // If this character is an escape sequence...
627 if (next_char
== '\\') {
628 // The input string will be adjusted (either by combining the two
629 // characters of an encoded escape sequence, or with a UTF conversion),
630 // so using StringPiece isn't possible -- force a conversion.
633 if (!CanConsume(1)) {
634 ReportError(JSONReader::JSON_INVALID_ESCAPE
, 0);
638 switch (*NextChar()) {
639 // Allowed escape sequences:
640 case 'x': { // UTF-8 sequence.
641 // UTF-8 \x escape sequences are not allowed in the spec, but they
642 // are supported here for backwards-compatibility with the old parser.
643 if (!CanConsume(2)) {
644 ReportError(JSONReader::JSON_INVALID_ESCAPE
, 1);
649 if (!HexStringToInt(StringPiece(NextChar(), 2), &hex_digit
)) {
650 ReportError(JSONReader::JSON_INVALID_ESCAPE
, -1);
655 if (hex_digit
< kExtendedASCIIStart
)
656 string
.Append(static_cast<char>(hex_digit
));
658 DecodeUTF8(hex_digit
, &string
);
661 case 'u': { // UTF-16 sequence.
662 // UTF units are of the form \uXXXX.
663 if (!CanConsume(5)) { // 5 being 'u' and four HEX digits.
664 ReportError(JSONReader::JSON_INVALID_ESCAPE
, 0);
671 std::string utf8_units
;
672 if (!DecodeUTF16(&utf8_units
)) {
673 ReportError(JSONReader::JSON_INVALID_ESCAPE
, -1);
677 string
.AppendString(utf8_units
);
704 case 'v': // Not listed as valid escape sequence in the RFC.
707 // All other escape sequences are illegal.
709 ReportError(JSONReader::JSON_INVALID_ESCAPE
, 0);
712 } else if (next_char
== '"') {
713 --index_
; // Rewind by one because of CBU8_NEXT.
717 if (next_char
< kExtendedASCIIStart
)
718 string
.Append(static_cast<char>(next_char
));
720 DecodeUTF8(next_char
, &string
);
724 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 0);
// Decodes one \uXXXX escape, or a surrogate pair written as two consecutive
// \uXXXX escapes, and appends the resulting UTF-8 bytes to |dest_string|.
// The caller reports JSON_INVALID_ESCAPE when this returns false.
728 // Entry is at the first X in \uXXXX.
729 bool JSONParser::DecodeUTF16(std::string
* dest_string
) {
733 // This is a 32-bit field because the shift operations in the
734 // conversion process below cause MSVC to error about "data loss."
735 // This only stores UTF-16 code units, though.
736 // Consume the UTF-16 code unit, which may be a high surrogate.
737 int code_unit16_high
= 0;
738 if (!HexStringToInt(StringPiece(pos_
, 4), &code_unit16_high
))
741 // Only add 3, not 4, because at the end of this iteration, the parser has
742 // finished working with the last digit of the UTF sequence, meaning that
743 // the next iteration will advance to the next byte.
746 // Used to convert the UTF-16 code units to a code point and then to a UTF-8
747 // code unit sequence.
748 char code_unit8
[8] = { 0 };
751 // If this is a high surrogate, consume the next code unit to get the
// code point.
753 if (CBU16_IS_SURROGATE(code_unit16_high
)) {
754 // Make sure this is the high surrogate. If not, it's an encoding
// error.
756 if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high
))
759 // Make sure that the token has more characters to consume the
// lower surrogate.
761 if (!CanConsume(6)) // 6 being '\' 'u' and four HEX digits.
763 if (*NextChar() != '\\' || *NextChar() != 'u')
766 NextChar(); // Read past 'u'.
767 int code_unit16_low
= 0;
768 if (!HexStringToInt(StringPiece(pos_
, 4), &code_unit16_low
))
773 if (!CBU16_IS_TRAIL(code_unit16_low
)) {
777 uint32 code_point
= CBU16_GET_SUPPLEMENTARY(code_unit16_high
,
780 CBU8_APPEND_UNSAFE(code_unit8
, offset
, code_point
);
783 DCHECK(CBU16_IS_SINGLE(code_unit16_high
));
784 CBU8_APPEND_UNSAFE(code_unit8
, offset
, code_unit16_high
);
// code_unit8 was zero-initialized with room for 8 bytes, so it can be
// appended as a C string.
787 dest_string
->append(code_unit8
);
// Appends the code point |point| to |dest|. Values below kExtendedASCIIStart
// are appended as one char; larger values are encoded into a local buffer with
// CBU8_APPEND_UNSAFE and appended as a byte sequence.
791 void JSONParser::DecodeUTF8(const int32
& point
, StringBuilder
* dest
) {
792 // Anything outside of the basic ASCII plane will need to be decoded from
793 // int32 to a multi-byte sequence.
794 if (point
< kExtendedASCIIStart
) {
795 dest
->Append(static_cast<char>(point
));
797 char utf8_units
[4] = { 0 };
799 CBU8_APPEND_UNSAFE(utf8_units
, offset
, point
);
801 // CBU8_APPEND_UNSAFE can overwrite up to 4 bytes, so utf8_units may not be
802 // zero terminated at this point. |offset| contains the correct length.
803 dest
->AppendString(std::string(utf8_units
, offset
));
// Parses a numeric token. The integer part is read with ReadInt(false); the
// optional fraction and exponent are read with ReadInt(true). The text is then
// converted with StringToInt first, and otherwise with StringToDouble, which
// is accepted only when the result is finite. Malformed input is reported as
// JSON_SYNTAX_ERROR.
807 Value
* JSONParser::ConsumeNumber() {
808 const char* num_start
= pos_
;
809 const int start_index
= index_
;
810 int end_index
= start_index
;
815 if (!ReadInt(false)) {
816 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
821 // The optional fraction part.
823 if (!CanConsume(1)) {
824 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
828 if (!ReadInt(true)) {
829 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
835 // Optional exponent part.
836 if (*pos_
== 'e' || *pos_
== 'E') {
838 if (*pos_
== '-' || *pos_
== '+')
840 if (!ReadInt(true)) {
841 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
847 // ReadInt is greedy because numbers have no easily detectable sentinel,
848 // so save off where the parser should be on exit (see Consume invariant at
849 // the top of the header), then make sure the next token is one which is
// valid.
851 const char* exit_pos
= pos_
- 1;
852 int exit_index
= index_
- 1;
854 switch (GetNextToken()) {
857 case T_LIST_SEPARATOR
:
861 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
868 StringPiece
num_string(num_start
, end_index
- start_index
);
871 if (StringToInt(num_string
, &num_int
))
872 return new FundamentalValue(num_int
);
875 if (base::StringToDouble(num_string
.as_string(), &num_double
) &&
876 std::isfinite(num_double
)) {
877 return new FundamentalValue(num_double
);
// Reads a run of ASCII digits for as long as input remains.
// NOTE(review): when |allow_leading_zeros| is false, a multi-digit run whose
// first digit is '0' takes a separate path, presumably rejection; the
// statement itself is not visible here -- confirm.
883 bool JSONParser::ReadInt(bool allow_leading_zeros
) {
888 while (CanConsume(1) && IsAsciiDigit(c
)) {
896 if (!allow_leading_zeros
&& len
> 1 && first
== '0')
// Parses the literals "true", "false" and "null" into a FundamentalValue or a
// null Value. A mismatch against the expected spelling is reported as
// JSON_SYNTAX_ERROR; the remaining path reports JSON_UNEXPECTED_TOKEN.
// NOTE(review): each check guards with CanConsume(len - 1) but then compares
// len bytes starting at pos_. If pos_ is the literal's first byte, the final
// compared byte may sit at end_pos_ -- confirm that is always readable,
// notably when the input was not copied into a std::string.
902 Value
* JSONParser::ConsumeLiteral() {
905 const char kTrueLiteral
[] = "true";
906 const int kTrueLen
= static_cast<int>(strlen(kTrueLiteral
));
907 if (!CanConsume(kTrueLen
- 1) ||
908 !StringsAreEqual(pos_
, kTrueLiteral
, kTrueLen
)) {
909 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
912 NextNChars(kTrueLen
- 1);
913 return new FundamentalValue(true);
916 const char kFalseLiteral
[] = "false";
917 const int kFalseLen
= static_cast<int>(strlen(kFalseLiteral
));
918 if (!CanConsume(kFalseLen
- 1) ||
919 !StringsAreEqual(pos_
, kFalseLiteral
, kFalseLen
)) {
920 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
923 NextNChars(kFalseLen
- 1);
924 return new FundamentalValue(false);
927 const char kNullLiteral
[] = "null";
928 const int kNullLen
= static_cast<int>(strlen(kNullLiteral
));
929 if (!CanConsume(kNullLen
- 1) ||
930 !StringsAreEqual(pos_
, kNullLiteral
, kNullLen
)) {
931 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
934 NextNChars(kNullLen
- 1);
935 return Value::CreateNullValue();
938 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
// True when strncmp() finds no difference between |one| and |two| within the
// first |len| characters.
944 bool JSONParser::StringsAreEqual(const char* one
, const char* two
, size_t len
) {
945 return strncmp(one
, two
, len
) == 0;
// Records the error position: error_line_ takes the current line_number_, and
// error_column_ is the distance from the last line start (index_ minus
// index_last_line_) plus column_adjust.
// NOTE(review): the statement that stores |code| is not visible here.
948 void JSONParser::ReportError(JSONReader::JsonParseError code
,
951 error_line_
= line_number_
;
952 error_column_
= index_
- index_last_line_
+ column_adjust
;
// Builds a human-readable message. When |line| or |column| is non-zero the
// result is "Line: <line>, column: <column>, <description>".
// NOTE(review): the result for the case where both are zero is not visible
// here.
956 std::string
JSONParser::FormatErrorMessage(int line
, int column
,
957 const std::string
& description
) {
958 if (line
|| column
) {
959 return StringPrintf("Line: %i, column: %i, %s",
960 line
, column
, description
.c_str());
965 } // namespace internal