1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/json/json_parser.h"
7 #include "base/float_util.h"
8 #include "base/logging.h"
9 #include "base/memory/scoped_ptr.h"
10 #include "base/string_number_conversions.h"
11 #include "base/string_piece.h"
12 #include "base/string_util.h"
13 #include "base/stringprintf.h"
14 #include "base/third_party/icu/icu_utf.h"
15 #include "base/utf_string_conversion_utils.h"
16 #include "base/utf_string_conversions.h"
17 #include "base/values.h"
// Maximum nesting depth of dictionaries/lists. StackMarker::IsTooDeep()
// compares the current depth against this value, and the Consume*() callers
// report JSON_TOO_MUCH_NESTING when it is reached.
const int kStackMaxDepth = 100;
26 const int32 kExtendedASCIIStart
= 0x80;
28 // This and the class below are used to own the JSON input string for when
29 // string tokens are stored as StringPiece instead of std::string. This
30 // optimization avoids about 2/3rds of string memory copies. The constructor
31 // takes ownership of the input string. The real root value is Swap()ed into
33 class DictionaryHiddenRootValue
: public base::DictionaryValue
{
35 DictionaryHiddenRootValue(std::string
* json
, Value
* root
) : json_(json
) {
36 DCHECK(root
->IsType(Value::TYPE_DICTIONARY
));
37 DictionaryValue::Swap(static_cast<DictionaryValue
*>(root
));
40 virtual void Swap(DictionaryValue
* other
) OVERRIDE
{
41 DVLOG(1) << "Swap()ing a DictionaryValue inefficiently.";
43 // First deep copy to convert JSONStringValue to std::string and swap that
44 // copy with |other|, which contains the new contents of |this|.
45 scoped_ptr
<base::DictionaryValue
> copy(DeepCopy());
48 // Then erase the contents of the current dictionary and swap in the
49 // new contents, originally from |other|.
52 DictionaryValue::Swap(copy
.get());
55 // Not overriding DictionaryValue::Remove because it just calls through to
58 virtual bool RemoveWithoutPathExpansion(const std::string
& key
,
59 Value
** out
) OVERRIDE
{
60 // If the caller won't take ownership of the removed value, just call up.
62 return DictionaryValue::RemoveWithoutPathExpansion(key
, out
);
64 DVLOG(1) << "Remove()ing from a DictionaryValue inefficiently.";
66 // Otherwise, remove the value while its still "owned" by this and copy it
67 // to convert any JSONStringValues to std::string.
68 Value
* out_owned
= NULL
;
69 if (!DictionaryValue::RemoveWithoutPathExpansion(key
, &out_owned
))
72 *out
= out_owned
->DeepCopy();
79 scoped_ptr
<std::string
> json_
;
81 DISALLOW_COPY_AND_ASSIGN(DictionaryHiddenRootValue
);
84 class ListHiddenRootValue
: public base::ListValue
{
86 ListHiddenRootValue(std::string
* json
, Value
* root
) : json_(json
) {
87 DCHECK(root
->IsType(Value::TYPE_LIST
));
88 ListValue::Swap(static_cast<ListValue
*>(root
));
91 virtual void Swap(ListValue
* other
) OVERRIDE
{
92 DVLOG(1) << "Swap()ing a ListValue inefficiently.";
94 // First deep copy to convert JSONStringValue to std::string and swap that
95 // copy with |other|, which contains the new contents of |this|.
96 scoped_ptr
<base::ListValue
> copy(DeepCopy());
99 // Then erase the contents of the current list and swap in the new contents,
100 // originally from |other|.
103 ListValue::Swap(copy
.get());
106 virtual bool Remove(size_t index
, Value
** out
) OVERRIDE
{
107 // If the caller won't take ownership of the removed value, just call up.
109 return ListValue::Remove(index
, out
);
111 DVLOG(1) << "Remove()ing from a ListValue inefficiently.";
113 // Otherwise, remove the value while its still "owned" by this and copy it
114 // to convert any JSONStringValues to std::string.
115 Value
* out_owned
= NULL
;
116 if (!ListValue::Remove(index
, &out_owned
))
119 *out
= out_owned
->DeepCopy();
126 scoped_ptr
<std::string
> json_
;
128 DISALLOW_COPY_AND_ASSIGN(ListHiddenRootValue
);
131 // A variant on StringValue that uses StringPiece instead of copying the string
132 // into the Value. This can only be stored in a child of hidden root (above),
133 // otherwise the referenced string will not be guaranteed to outlive it.
134 class JSONStringValue
: public base::Value
{
136 explicit JSONStringValue(const base::StringPiece
& piece
)
137 : Value(TYPE_STRING
),
138 string_piece_(piece
) {
141 // Overridden from base::Value:
142 virtual bool GetAsString(std::string
* out_value
) const OVERRIDE
{
143 string_piece_
.CopyToString(out_value
);
146 virtual bool GetAsString(string16
* out_value
) const OVERRIDE
{
147 *out_value
= UTF8ToUTF16(string_piece_
);
150 virtual Value
* DeepCopy() const OVERRIDE
{
151 return new StringValue(string_piece_
.as_string());
153 virtual bool Equals(const Value
* other
) const OVERRIDE
{
154 std::string other_string
;
155 return other
->IsType(TYPE_STRING
) && other
->GetAsString(&other_string
) &&
156 StringPiece(other_string
) == string_piece_
;
160 // The location in the original input stream.
161 base::StringPiece string_piece_
;
163 DISALLOW_COPY_AND_ASSIGN(JSONStringValue
);
166 // Simple class that checks for maximum recursion/"stack overflow."
169 explicit StackMarker(int* depth
) : depth_(depth
) {
171 DCHECK_LE(*depth_
, kStackMaxDepth
);
177 bool IsTooDeep() const {
178 return *depth_
>= kStackMaxDepth
;
184 DISALLOW_COPY_AND_ASSIGN(StackMarker
);
189 JSONParser::JSONParser(int options
)
198 error_code_(JSONReader::JSON_NO_ERROR
),
203 JSONParser::~JSONParser() {
206 Value
* JSONParser::Parse(const StringPiece
& input
) {
207 scoped_ptr
<std::string
> input_copy
;
208 // If the children of a JSON root can be detached, then hidden roots cannot
209 // be used, so do not bother copying the input because StringPiece will not
211 if (!(options_
& JSON_DETACHABLE_CHILDREN
)) {
212 input_copy
.reset(new std::string(input
.as_string()));
213 start_pos_
= input_copy
->data();
215 start_pos_
= input
.data();
218 end_pos_
= start_pos_
+ input
.length();
221 index_last_line_
= 0;
223 error_code_
= JSONReader::JSON_NO_ERROR
;
227 // When the input JSON string starts with a UTF-8 Byte-Order-Mark
228 // <0xEF 0xBB 0xBF>, advance the start position to avoid the
229 // ParseNextToken function mis-treating a Unicode BOM as an invalid
230 // character and returning NULL.
231 if (CanConsume(3) && static_cast<uint8
>(*pos_
) == 0xEF &&
232 static_cast<uint8
>(*(pos_
+ 1)) == 0xBB &&
233 static_cast<uint8
>(*(pos_
+ 2)) == 0xBF) {
237 // Parse the first and any nested tokens.
238 scoped_ptr
<Value
> root(ParseNextToken());
242 // Make sure the input stream is at an end.
243 if (GetNextToken() != T_END_OF_INPUT
) {
244 if (!CanConsume(1) || (NextChar() && GetNextToken() != T_END_OF_INPUT
)) {
245 ReportError(JSONReader::JSON_UNEXPECTED_DATA_AFTER_ROOT
, 1);
250 // Dictionaries and lists can contain JSONStringValues, so wrap them in a
252 if (!(options_
& JSON_DETACHABLE_CHILDREN
)) {
253 if (root
->IsType(Value::TYPE_DICTIONARY
)) {
254 return new DictionaryHiddenRootValue(input_copy
.release(), root
.get());
255 } else if (root
->IsType(Value::TYPE_LIST
)) {
256 return new ListHiddenRootValue(input_copy
.release(), root
.get());
257 } else if (root
->IsType(Value::TYPE_STRING
)) {
258 // A string type could be a JSONStringValue, but because there's no
259 // corresponding HiddenRootValue, the memory will be lost. Deep copy to
261 return root
->DeepCopy();
265 // All other values can be returned directly.
266 return root
.release();
269 JSONReader::JsonParseError
JSONParser::error_code() const {
273 std::string
JSONParser::GetErrorMessage() const {
274 return FormatErrorMessage(error_line_
, error_column_
,
275 JSONReader::ErrorCodeToString(error_code_
));
// StringBuilder ///////////////////////////////////////////////////////////////
280 JSONParser::StringBuilder::StringBuilder()
286 JSONParser::StringBuilder::StringBuilder(const char* pos
)
292 void JSONParser::StringBuilder::Swap(StringBuilder
* other
) {
293 std::swap(other
->string_
, string_
);
294 std::swap(other
->pos_
, pos_
);
295 std::swap(other
->length_
, length_
);
298 JSONParser::StringBuilder::~StringBuilder() {
302 void JSONParser::StringBuilder::Append(const char& c
) {
307 string_
->push_back(c
);
312 void JSONParser::StringBuilder::AppendString(const std::string
& str
) {
314 string_
->append(str
);
317 void JSONParser::StringBuilder::Convert() {
320 string_
= new std::string(pos_
, length_
);
323 bool JSONParser::StringBuilder::CanBeStringPiece() const {
327 StringPiece
JSONParser::StringBuilder::AsStringPiece() {
329 return StringPiece();
330 return StringPiece(pos_
, length_
);
333 const std::string
& JSONParser::StringBuilder::AsString() {
// JSONParser private //////////////////////////////////////////////////////////
341 inline bool JSONParser::CanConsume(int length
) {
342 return pos_
+ length
<= end_pos_
;
345 const char* JSONParser::NextChar() {
346 DCHECK(CanConsume(1));
352 void JSONParser::NextNChars(int n
) {
353 DCHECK(CanConsume(n
));
358 JSONParser::Token
JSONParser::GetNextToken() {
359 EatWhitespaceAndComments();
361 return T_END_OF_INPUT
;
365 return T_OBJECT_BEGIN
;
369 return T_ARRAY_BEGIN
;
393 return T_LIST_SEPARATOR
;
395 return T_OBJECT_PAIR_SEPARATOR
;
397 return T_INVALID_TOKEN
;
401 void JSONParser::EatWhitespaceAndComments() {
402 while (pos_
< end_pos_
) {
406 index_last_line_
= index_
;
423 bool JSONParser::EatComment() {
424 if (*pos_
!= '/' || !CanConsume(1))
427 char next_char
= *NextChar();
428 if (next_char
== '/') {
429 // Single line comment, read to newline.
430 while (CanConsume(1)) {
431 char next_char
= *NextChar();
432 if (next_char
== '\n' || next_char
== '\r')
435 } else if (next_char
== '*') {
436 // Block comment, read until end marker.
437 while (CanConsume(2)) {
438 if (*NextChar() == '*' && *NextChar() == '/') {
439 // EatWhitespaceAndComments will inspect pos_, which will still be on
440 // the last / of the comment, so advance once more (which may also be
447 // If the comment is unterminated, GetNextToken will report T_END_OF_INPUT.
453 Value
* JSONParser::ParseNextToken() {
454 return ParseToken(GetNextToken());
457 Value
* JSONParser::ParseToken(Token token
) {
460 return ConsumeDictionary();
462 return ConsumeList();
464 return ConsumeString();
466 return ConsumeNumber();
470 return ConsumeLiteral();
472 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
477 Value
* JSONParser::ConsumeDictionary() {
479 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
483 StackMarker
depth_check(&stack_depth_
);
484 if (depth_check
.IsTooDeep()) {
485 ReportError(JSONReader::JSON_TOO_MUCH_NESTING
, 1);
489 scoped_ptr
<DictionaryValue
> dict(new DictionaryValue
);
492 Token token
= GetNextToken();
493 while (token
!= T_OBJECT_END
) {
494 if (token
!= T_STRING
) {
495 ReportError(JSONReader::JSON_UNQUOTED_DICTIONARY_KEY
, 1);
499 // First consume the key.
501 if (!ConsumeStringRaw(&key
)) {
505 // Read the separator.
507 token
= GetNextToken();
508 if (token
!= T_OBJECT_PAIR_SEPARATOR
) {
509 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
513 // The next token is the value. Ownership transfers to |dict|.
515 Value
* value
= ParseNextToken();
517 // ReportError from deeper level.
521 dict
->SetWithoutPathExpansion(key
.AsString(), value
);
524 token
= GetNextToken();
525 if (token
== T_LIST_SEPARATOR
) {
527 token
= GetNextToken();
528 if (token
== T_OBJECT_END
&& !(options_
& JSON_ALLOW_TRAILING_COMMAS
)) {
529 ReportError(JSONReader::JSON_TRAILING_COMMA
, 1);
532 } else if (token
!= T_OBJECT_END
) {
533 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 0);
538 return dict
.release();
541 Value
* JSONParser::ConsumeList() {
543 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
547 StackMarker
depth_check(&stack_depth_
);
548 if (depth_check
.IsTooDeep()) {
549 ReportError(JSONReader::JSON_TOO_MUCH_NESTING
, 1);
553 scoped_ptr
<ListValue
> list(new ListValue
);
556 Token token
= GetNextToken();
557 while (token
!= T_ARRAY_END
) {
558 Value
* item
= ParseToken(token
);
560 // ReportError from deeper level.
567 token
= GetNextToken();
568 if (token
== T_LIST_SEPARATOR
) {
570 token
= GetNextToken();
571 if (token
== T_ARRAY_END
&& !(options_
& JSON_ALLOW_TRAILING_COMMAS
)) {
572 ReportError(JSONReader::JSON_TRAILING_COMMA
, 1);
575 } else if (token
!= T_ARRAY_END
) {
576 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
581 return list
.release();
584 Value
* JSONParser::ConsumeString() {
585 StringBuilder string
;
586 if (!ConsumeStringRaw(&string
))
589 // Create the Value representation, using a hidden root, if configured
590 // to do so, and if the string can be represented by StringPiece.
591 if (string
.CanBeStringPiece() && !(options_
& JSON_DETACHABLE_CHILDREN
)) {
592 return new JSONStringValue(string
.AsStringPiece());
594 if (string
.CanBeStringPiece())
596 return new StringValue(string
.AsString());
600 bool JSONParser::ConsumeStringRaw(StringBuilder
* out
) {
602 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
606 // StringBuilder will internally build a StringPiece unless a UTF-16
607 // conversion occurs, at which point it will perform a copy into a
609 StringBuilder
string(NextChar());
611 int length
= end_pos_
- start_pos_
;
614 while (CanConsume(1)) {
615 pos_
= start_pos_
+ index_
; // CBU8_NEXT is postcrement.
616 CBU8_NEXT(start_pos_
, index_
, length
, next_char
);
617 if (next_char
< 0 || !IsValidCharacter(next_char
)) {
618 ReportError(JSONReader::JSON_UNSUPPORTED_ENCODING
, 1);
622 // If this character is an escape sequence...
623 if (next_char
== '\\') {
624 // The input string will be adjusted (either by combining the two
625 // characters of an encoded escape sequence, or with a UTF conversion),
626 // so using StringPiece isn't possible -- force a conversion.
629 if (!CanConsume(1)) {
630 ReportError(JSONReader::JSON_INVALID_ESCAPE
, 0);
634 switch (*NextChar()) {
635 // Allowed esape sequences:
636 case 'x': { // UTF-8 sequence.
637 // UTF-8 \x escape sequences are not allowed in the spec, but they
638 // are supported here for backwards-compatiblity with the old parser.
639 if (!CanConsume(2)) {
640 ReportError(JSONReader::JSON_INVALID_ESCAPE
, 1);
645 if (!HexStringToInt(StringPiece(NextChar(), 2), &hex_digit
)) {
646 ReportError(JSONReader::JSON_INVALID_ESCAPE
, -1);
651 if (hex_digit
< kExtendedASCIIStart
)
652 string
.Append(hex_digit
);
654 DecodeUTF8(hex_digit
, &string
);
657 case 'u': { // UTF-16 sequence.
658 // UTF units are of the form \uXXXX.
659 if (!CanConsume(5)) { // 5 being 'u' and four HEX digits.
660 ReportError(JSONReader::JSON_INVALID_ESCAPE
, 0);
667 std::string utf8_units
;
668 if (!DecodeUTF16(&utf8_units
)) {
669 ReportError(JSONReader::JSON_INVALID_ESCAPE
, -1);
673 string
.AppendString(utf8_units
);
700 case 'v': // Not listed as valid escape sequence in the RFC.
703 // All other escape squences are illegal.
705 ReportError(JSONReader::JSON_INVALID_ESCAPE
, 0);
708 } else if (next_char
== '"') {
709 --index_
; // Rewind by one because of CBU8_NEXT.
713 if (next_char
< kExtendedASCIIStart
)
714 string
.Append(next_char
);
716 DecodeUTF8(next_char
, &string
);
720 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 0);
724 // Entry is at the first X in \uXXXX.
725 bool JSONParser::DecodeUTF16(std::string
* dest_string
) {
729 // This is a 32-bit field because the shift operations in the
730 // conversion process below cause MSVC to error about "data loss."
731 // This only stores UTF-16 code units, though.
732 // Consume the UTF-16 code unit, which may be a high surrogate.
733 int code_unit16_high
= 0;
734 if (!HexStringToInt(StringPiece(pos_
, 4), &code_unit16_high
))
737 // Only add 3, not 4, because at the end of this iteration, the parser has
738 // finished working with the last digit of the UTF sequence, meaning that
739 // the next iteration will advance to the next byte.
742 // Used to convert the UTF-16 code units to a code point and then to a UTF-8
743 // code unit sequence.
744 char code_unit8
[8] = { 0 };
747 // If this is a high surrogate, consume the next code unit to get the
749 if (CBU16_IS_SURROGATE(code_unit16_high
)) {
750 // Make sure this is the high surrogate. If not, it's an encoding
752 if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high
))
755 // Make sure that the token has more characters to consume the
757 if (!CanConsume(6)) // 6 being '\' 'u' and four HEX digits.
759 if (*NextChar() != '\\' || *NextChar() != 'u')
762 NextChar(); // Read past 'u'.
763 int code_unit16_low
= 0;
764 if (!HexStringToInt(StringPiece(pos_
, 4), &code_unit16_low
))
769 if (!CBU16_IS_TRAIL(code_unit16_low
)) {
773 uint32 code_point
= CBU16_GET_SUPPLEMENTARY(code_unit16_high
,
776 CBU8_APPEND_UNSAFE(code_unit8
, offset
, code_point
);
779 DCHECK(CBU16_IS_SINGLE(code_unit16_high
));
780 CBU8_APPEND_UNSAFE(code_unit8
, offset
, code_unit16_high
);
783 dest_string
->append(code_unit8
);
787 void JSONParser::DecodeUTF8(const int32
& point
, StringBuilder
* dest
) {
788 // Anything outside of the basic ASCII plane will need to be decoded from
789 // int32 to a multi-byte sequence.
790 if (point
< kExtendedASCIIStart
) {
793 char utf8_units
[4] = { 0 };
795 CBU8_APPEND_UNSAFE(utf8_units
, offset
, point
);
797 // CBU8_APPEND_UNSAFE can overwrite up to 4 bytes, so utf8_units may not be
798 // zero terminated at this point. |offset| contains the correct length.
799 dest
->AppendString(std::string(utf8_units
, offset
));
803 Value
* JSONParser::ConsumeNumber() {
804 const char* num_start
= pos_
;
805 const int start_index
= index_
;
806 int end_index
= start_index
;
811 if (!ReadInt(false)) {
812 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
817 // The optional fraction part.
819 if (!CanConsume(1)) {
820 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
824 if (!ReadInt(true)) {
825 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
831 // Optional exponent part.
832 if (*pos_
== 'e' || *pos_
== 'E') {
834 if (*pos_
== '-' || *pos_
== '+')
836 if (!ReadInt(true)) {
837 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
843 // ReadInt is greedy because numbers have no easily detectable sentinel,
844 // so save off where the parser should be on exit (see Consume invariant at
845 // the top of the header), then make sure the next token is one which is
847 const char* exit_pos
= pos_
- 1;
848 int exit_index
= index_
- 1;
850 switch (GetNextToken()) {
853 case T_LIST_SEPARATOR
:
857 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
864 StringPiece
num_string(num_start
, end_index
- start_index
);
867 if (StringToInt(num_string
, &num_int
))
868 return new FundamentalValue(num_int
);
871 if (base::StringToDouble(num_string
.as_string(), &num_double
) &&
872 IsFinite(num_double
)) {
873 return new FundamentalValue(num_double
);
879 bool JSONParser::ReadInt(bool allow_leading_zeros
) {
884 while (CanConsume(1) && IsAsciiDigit(c
)) {
892 if (!allow_leading_zeros
&& len
> 1 && first
== '0')
898 Value
* JSONParser::ConsumeLiteral() {
901 const char* kTrueLiteral
= "true";
902 const int kTrueLen
= static_cast<int>(strlen(kTrueLiteral
));
903 if (!CanConsume(kTrueLen
- 1) ||
904 !StringsAreEqual(pos_
, kTrueLiteral
, kTrueLen
)) {
905 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
908 NextNChars(kTrueLen
- 1);
909 return new FundamentalValue(true);
912 const char* kFalseLiteral
= "false";
913 const int kFalseLen
= static_cast<int>(strlen(kFalseLiteral
));
914 if (!CanConsume(kFalseLen
- 1) ||
915 !StringsAreEqual(pos_
, kFalseLiteral
, kFalseLen
)) {
916 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
919 NextNChars(kFalseLen
- 1);
920 return new FundamentalValue(false);
923 const char* kNullLiteral
= "null";
924 const int kNullLen
= static_cast<int>(strlen(kNullLiteral
));
925 if (!CanConsume(kNullLen
- 1) ||
926 !StringsAreEqual(pos_
, kNullLiteral
, kNullLen
)) {
927 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
930 NextNChars(kNullLen
- 1);
931 return Value::CreateNullValue();
934 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
940 bool JSONParser::StringsAreEqual(const char* one
, const char* two
, size_t len
) {
941 return strncmp(one
, two
, len
) == 0;
944 void JSONParser::ReportError(JSONReader::JsonParseError code
,
947 error_line_
= line_number_
;
948 error_column_
= index_
- index_last_line_
+ column_adjust
;
952 std::string
JSONParser::FormatErrorMessage(int line
, int column
,
953 const std::string
& description
) {
954 if (line
|| column
) {
955 return StringPrintf("Line: %i, column: %i, %s",
956 line
, column
, description
.c_str());
961 } // namespace internal