1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "base/json/json_parser.h"
7 #include "base/float_util.h"
8 #include "base/logging.h"
9 #include "base/memory/scoped_ptr.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_piece.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/stringprintf.h"
14 #include "base/strings/utf_string_conversion_utils.h"
15 #include "base/strings/utf_string_conversions.h"
16 #include "base/third_party/icu/icu_utf.h"
17 #include "base/values.h"
// Upper bound on nesting depth: StackMarker DCHECKs the depth counter against
// this value and IsTooDeep() reports true once the counter reaches it.
24 const int kStackMaxDepth
= 100;
// First value outside 7-bit ASCII. ConsumeStringRaw() appends values below
// this threshold directly and passes the remaining values to DecodeUTF8().
26 const int32 kExtendedASCIIStart
= 0x80;
28 // This and the class below are used to own the JSON input string for when
29 // string tokens are stored as StringPiece instead of std::string. This
30 // optimization avoids about 2/3rds of string memory copies. The constructor
31 // takes ownership of the input string. The real root value is Swap()ed into
// DictionaryValue subclass that keeps the JSON input string (|json_|) next to
// the dictionary contents it was parsed into.
// NOTE(review): parts of this class (access specifiers, closing braces and a
// few statements) are not visible in this view of the file.
33 class DictionaryHiddenRootValue
: public base::DictionaryValue
{
// Stores |json| in |json_| and swaps the contents of |root| into this object.
// |root| must be a dictionary (DCHECKed).
35 DictionaryHiddenRootValue(std::string
* json
, Value
* root
) : json_(json
) {
36 DCHECK(root
->IsType(Value::TYPE_DICTIONARY
));
37 DictionaryValue::Swap(static_cast<DictionaryValue
*>(root
));
// Swap override that goes through a deep copy, so it is logged as inefficient.
40 virtual void Swap(DictionaryValue
* other
) OVERRIDE
{
41 DVLOG(1) << "Swap()ing a DictionaryValue inefficiently.";
43 // First deep copy to convert JSONStringValue to std::string and swap that
44 // copy with |other|, which contains the new contents of |this|.
45 scoped_ptr
<base::DictionaryValue
> copy(DeepCopy());
48 // Then erase the contents of the current dictionary and swap in the
49 // new contents, originally from |other|.
52 DictionaryValue::Swap(copy
.get());
55 // Not overriding DictionaryValue::Remove because it just calls through to
// Removes |key|. When the removed value is handed back through |out| it is
// deep-copied first.
58 virtual bool RemoveWithoutPathExpansion(const std::string
& key
,
59 scoped_ptr
<Value
>* out
) OVERRIDE
{
60 // If the caller won't take ownership of the removed value, just call up.
62 return DictionaryValue::RemoveWithoutPathExpansion(key
, out
);
64 DVLOG(1) << "Remove()ing from a DictionaryValue inefficiently.";
66 // Otherwise, remove the value while it's still "owned" by this and copy it
67 // to convert any JSONStringValues to std::string.
68 scoped_ptr
<Value
> out_owned
;
69 if (!DictionaryValue::RemoveWithoutPathExpansion(key
, &out_owned
))
72 out
->reset(out_owned
->DeepCopy());
// The JSON input string, held in a scoped_ptr.
78 scoped_ptr
<std::string
> json_
;
80 DISALLOW_COPY_AND_ASSIGN(DictionaryHiddenRootValue
);
// ListValue subclass that keeps the JSON input string (|json_|) next to the
// list contents it was parsed into.
// NOTE(review): parts of this class (access specifiers, closing braces and a
// few statements) are not visible in this view of the file.
83 class ListHiddenRootValue
: public base::ListValue
{
// Stores |json| in |json_| and swaps the contents of |root| into this object.
// |root| must be a list (DCHECKed).
85 ListHiddenRootValue(std::string
* json
, Value
* root
) : json_(json
) {
86 DCHECK(root
->IsType(Value::TYPE_LIST
));
87 ListValue::Swap(static_cast<ListValue
*>(root
));
// Swap override that goes through a deep copy, so it is logged as inefficient.
90 virtual void Swap(ListValue
* other
) OVERRIDE
{
91 DVLOG(1) << "Swap()ing a ListValue inefficiently.";
93 // First deep copy to convert JSONStringValue to std::string and swap that
94 // copy with |other|, which contains the new contents of |this|.
95 scoped_ptr
<base::ListValue
> copy(DeepCopy());
98 // Then erase the contents of the current list and swap in the new contents,
99 // originally from |other|.
102 ListValue::Swap(copy
.get());
// Removes the element at |index|. When the removed value is handed back
// through |out| it is deep-copied first.
105 virtual bool Remove(size_t index
, scoped_ptr
<Value
>* out
) OVERRIDE
{
106 // If the caller won't take ownership of the removed value, just call up.
108 return ListValue::Remove(index
, out
);
110 DVLOG(1) << "Remove()ing from a ListValue inefficiently.";
112 // Otherwise, remove the value while it's still "owned" by this and copy it
113 // to convert any JSONStringValues to std::string.
114 scoped_ptr
<Value
> out_owned
;
115 if (!ListValue::Remove(index
, &out_owned
))
118 out
->reset(out_owned
->DeepCopy());
// The JSON input string, held in a scoped_ptr.
124 scoped_ptr
<std::string
> json_
;
126 DISALLOW_COPY_AND_ASSIGN(ListHiddenRootValue
);
129 // A variant on StringValue that uses StringPiece instead of copying the string
130 // into the Value. This can only be stored in a child of hidden root (above),
131 // otherwise the referenced string will not be guaranteed to outlive it.
// NOTE(review): access specifiers, closing braces and the return statements
// of the GetAsString overloads are not visible in this view of the file.
132 class JSONStringValue
: public base::Value
{
// Wraps |piece| without copying the bytes it refers to.
134 explicit JSONStringValue(const base::StringPiece
& piece
)
135 : Value(TYPE_STRING
),
136 string_piece_(piece
) {
139 // Overridden from base::Value:
// Copies the referenced bytes into |out_value|.
140 virtual bool GetAsString(std::string
* out_value
) const OVERRIDE
{
141 string_piece_
.CopyToString(out_value
);
// Converts the referenced UTF-8 bytes to UTF-16 and stores them in |out_value|.
144 virtual bool GetAsString(string16
* out_value
) const OVERRIDE
{
145 *out_value
= UTF8ToUTF16(string_piece_
);
// Returns a new StringValue that holds its own copy of the string.
148 virtual Value
* DeepCopy() const OVERRIDE
{
149 return new StringValue(string_piece_
.as_string());
// Equal when |other| is a TYPE_STRING value whose string matches the bytes
// referenced by |string_piece_|.
151 virtual bool Equals(const Value
* other
) const OVERRIDE
{
152 std::string other_string
;
153 return other
->IsType(TYPE_STRING
) && other
->GetAsString(&other_string
) &&
154 StringPiece(other_string
) == string_piece_
;
158 // The location in the original input stream.
159 base::StringPiece string_piece_
;
161 DISALLOW_COPY_AND_ASSIGN(JSONStringValue
);
164 // Simple class that checks for maximum recursion/"stack overflow."
// NOTE(review): the class header, the destructor and the member declarations
// are not visible in this view of the file.
// Keeps a pointer to the caller's depth counter and DCHECKs that the counter
// does not exceed kStackMaxDepth.
167 explicit StackMarker(int* depth
) : depth_(depth
) {
169 DCHECK_LE(*depth_
, kStackMaxDepth
);
// Returns true once the counter has reached kStackMaxDepth.
175 bool IsTooDeep() const {
176 return *depth_
>= kStackMaxDepth
;
182 DISALLOW_COPY_AND_ASSIGN(StackMarker
);
// Constructs a parser configured by |options|. Of the initializer list only
// the error_code_ entry (set to JSON_NO_ERROR) is visible in this view.
187 JSONParser::JSONParser(int options
)
196 error_code_(JSONReader::JSON_NO_ERROR
),
// Destructor. NOTE(review): the body is not visible in this view of the file.
201 JSONParser::~JSONParser() {
// Parses |input| and returns the root Value.
//
// Unless JSON_DETACHABLE_CHILDREN is set, the input is first copied into
// |input_copy|. A dictionary or list root is then wrapped in the matching
// hidden-root class, which receives |input_copy|; a string root is returned
// as a deep copy. Every other root is released to the caller as is.
// NOTE(review): several statements (the else branch, counter resets, the BOM
// skip and the early returns on failure) are not visible in this view.
204 Value
* JSONParser::Parse(const StringPiece
& input
) {
205 scoped_ptr
<std::string
> input_copy
;
206 // If the children of a JSON root can be detached, then hidden roots cannot
207 // be used, so do not bother copying the input because StringPiece will not
209 if (!(options_
& JSON_DETACHABLE_CHILDREN
)) {
210 input_copy
.reset(new std::string(input
.as_string()));
211 start_pos_
= input_copy
->data();
213 start_pos_
= input
.data();
216 end_pos_
= start_pos_
+ input
.length();
219 index_last_line_
= 0;
221 error_code_
= JSONReader::JSON_NO_ERROR
;
225 // When the input JSON string starts with a UTF-8 Byte-Order-Mark
226 // <0xEF 0xBB 0xBF>, advance the start position to avoid the
227 // ParseNextToken function mis-treating a Unicode BOM as an invalid
228 // character and returning NULL.
229 if (CanConsume(3) && static_cast<uint8
>(*pos_
) == 0xEF &&
230 static_cast<uint8
>(*(pos_
+ 1)) == 0xBB &&
231 static_cast<uint8
>(*(pos_
+ 2)) == 0xBF) {
235 // Parse the first and any nested tokens.
236 scoped_ptr
<Value
> root(ParseNextToken());
240 // Make sure the input stream is at an end.
241 if (GetNextToken() != T_END_OF_INPUT
) {
242 if (!CanConsume(1) || (NextChar() && GetNextToken() != T_END_OF_INPUT
)) {
243 ReportError(JSONReader::JSON_UNEXPECTED_DATA_AFTER_ROOT
, 1);
248 // Dictionaries and lists can contain JSONStringValues, so wrap them in a
250 if (!(options_
& JSON_DETACHABLE_CHILDREN
)) {
251 if (root
->IsType(Value::TYPE_DICTIONARY
)) {
252 return new DictionaryHiddenRootValue(input_copy
.release(), root
.get());
253 } else if (root
->IsType(Value::TYPE_LIST
)) {
254 return new ListHiddenRootValue(input_copy
.release(), root
.get());
255 } else if (root
->IsType(Value::TYPE_STRING
)) {
256 // A string type could be a JSONStringValue, but because there's no
257 // corresponding HiddenRootValue, the memory will be lost. Deep copy to
259 return root
->DeepCopy();
263 // All other values can be returned directly.
264 return root
.release();
// Accessor for the parser's error code.
// NOTE(review): the body is not visible in this view of the file.
267 JSONReader::JsonParseError
JSONParser::error_code() const {
// Builds a human-readable message from the stored error line, error column
// and the string form of error_code_, using FormatErrorMessage().
271 std::string
JSONParser::GetErrorMessage() const {
272 return FormatErrorMessage(error_line_
, error_column_
,
273 JSONReader::ErrorCodeToString(error_code_
));
276 // StringBuilder ///////////////////////////////////////////////////////////////
// Default constructor; ConsumeString() uses it for the builder it passes to
// ConsumeStringRaw(). NOTE(review): the initializer list is not visible here.
278 JSONParser::StringBuilder::StringBuilder()
// Constructs a builder from the input position |pos|; ConsumeStringRaw()
// passes the result of NextChar().
// NOTE(review): the initializer list is not visible in this view.
284 JSONParser::StringBuilder::StringBuilder(const char* pos
)
// Exchanges the string_, pos_ and length_ members of this builder with those
// of |other|.
290 void JSONParser::StringBuilder::Swap(StringBuilder
* other
) {
291 std::swap(other
->string_
, string_
);
292 std::swap(other
->pos_
, pos_
);
293 std::swap(other
->length_
, length_
);
// Destructor. NOTE(review): the body is not visible here; Convert() allocates
// string_ with a raw new, so confirm that it is released here.
296 JSONParser::StringBuilder::~StringBuilder() {
// Appends the character |c| to the owned string_.
// NOTE(review): the statements between the signature and the push_back are
// not visible in this view.
300 void JSONParser::StringBuilder::Append(const char& c
) {
305 string_
->push_back(c
);
// Appends |str| to the owned string_.
// NOTE(review): any precondition check before the append is not visible here.
310 void JSONParser::StringBuilder::AppendString(const std::string
& str
) {
312 string_
->append(str
);
// Allocates string_ as a std::string copy of the |length_| bytes at |pos_|.
// NOTE(review): any guard around the allocation is not visible in this view.
315 void JSONParser::StringBuilder::Convert() {
318 string_
= new std::string(pos_
, length_
);
// Tells callers whether AsStringPiece() may be used; ConsumeString() checks
// this before creating a JSONStringValue.
// NOTE(review): the body is not visible in this view.
321 bool JSONParser::StringBuilder::CanBeStringPiece() const {
// Returns a StringPiece over the |length_| bytes at |pos_|, or an empty
// StringPiece on an early-exit path whose condition is not visible here.
325 StringPiece
JSONParser::StringBuilder::AsStringPiece() {
327 return StringPiece();
328 return StringPiece(pos_
, length_
);
// Returns the built string by const reference.
// NOTE(review): the body is not visible in this view.
331 const std::string
& JSONParser::StringBuilder::AsString() {
337 // JSONParser private //////////////////////////////////////////////////////////
// Returns true when pos_ + |length| does not go past end_pos_.
339 inline bool JSONParser::CanConsume(int length
) {
340 return pos_
+ length
<= end_pos_
;
// Advances through the input and returns a pointer into it; callers must
// ensure CanConsume(1) (DCHECKed).
// NOTE(review): the advancing statements are not visible in this view.
343 const char* JSONParser::NextChar() {
344 DCHECK(CanConsume(1));
// Multi-character counterpart of NextChar(); callers must ensure
// CanConsume(|n|) (DCHECKed).
// NOTE(review): the advancing statements are not visible in this view.
350 void JSONParser::NextNChars(int n
) {
351 DCHECK(CanConsume(n
));
// Skips whitespace and comments, then classifies what follows as a Token.
// NOTE(review): the conditions selecting each return value, and the returns
// for the remaining token kinds, are not visible in this view.
356 JSONParser::Token
JSONParser::GetNextToken() {
357 EatWhitespaceAndComments();
359 return T_END_OF_INPUT
;
363 return T_OBJECT_BEGIN
;
367 return T_ARRAY_BEGIN
;
391 return T_LIST_SEPARATOR
;
393 return T_OBJECT_PAIR_SEPARATOR
;
395 return T_INVALID_TOKEN
;
// Walks forward while pos_ is before end_pos_, tracking line starts in
// index_last_line_ as newlines are seen.
// NOTE(review): the switch over the current character, the line-number
// increment and the exit conditions are not visible in this view.
399 void JSONParser::EatWhitespaceAndComments() {
400 while (pos_
< end_pos_
) {
404 index_last_line_
= index_
;
405 // Don't increment line_number_ twice for "\r\n".
406 if (!(*pos_
== '\n' && pos_
> start_pos_
&& *(pos_
- 1) == '\r'))
// Consumes a "//" line comment or a "/* ... */" block comment starting at
// pos_. NOTE(review): the return statements are not visible in this view.
423 bool JSONParser::EatComment() {
// NOTE(review): *pos_ is read before CanConsume(1) is evaluated; confirm that
// every caller guarantees pos_ < end_pos_.
424 if (*pos_
!= '/' || !CanConsume(1))
427 char next_char
= *NextChar();
428 if (next_char
== '/') {
429 // Single line comment, read to newline.
430 while (CanConsume(1)) {
431 char next_char
= *NextChar();
432 if (next_char
== '\n' || next_char
== '\r')
435 } else if (next_char
== '*') {
436 char previous_char
= '\0';
437 // Block comment, read until end marker.
438 while (CanConsume(1)) {
439 next_char
= *NextChar();
440 if (previous_char
== '*' && next_char
== '/') {
441 // EatWhitespaceAndComments will inspect pos_, which will still be on
442 // the last / of the comment, so advance once more (which may also be
447 previous_char
= next_char
;
450 // If the comment is unterminated, GetNextToken will report T_END_OF_INPUT.
// Fetches the next token with GetNextToken() and parses it via ParseToken().
456 Value
* JSONParser::ParseNextToken() {
457 return ParseToken(GetNextToken());
// Dispatches |token| to the matching Consume* routine and returns its result;
// a token with no handler is reported as JSON_UNEXPECTED_TOKEN.
// NOTE(review): the switch statement and its case labels are not visible in
// this view.
460 Value
* JSONParser::ParseToken(Token token
) {
463 return ConsumeDictionary();
465 return ConsumeList();
467 return ConsumeString();
469 return ConsumeNumber();
473 return ConsumeLiteral();
475 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
// Parses a JSON object into a new DictionaryValue and releases it to the
// caller. Reports JSON_TOO_MUCH_NESTING when the StackMarker says the nesting
// is too deep, JSON_UNQUOTED_DICTIONARY_KEY for a non-string key,
// JSON_TRAILING_COMMA when a comma precedes the closing token without
// JSON_ALLOW_TRAILING_COMMAS, and JSON_SYNTAX_ERROR for a missing separator.
// NOTE(review): the guards, the early returns after each ReportError and the
// declaration of |key| are not visible in this view.
480 Value
* JSONParser::ConsumeDictionary() {
482 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
486 StackMarker
depth_check(&stack_depth_
);
487 if (depth_check
.IsTooDeep()) {
488 ReportError(JSONReader::JSON_TOO_MUCH_NESTING
, 1);
492 scoped_ptr
<DictionaryValue
> dict(new DictionaryValue
);
495 Token token
= GetNextToken();
496 while (token
!= T_OBJECT_END
) {
497 if (token
!= T_STRING
) {
498 ReportError(JSONReader::JSON_UNQUOTED_DICTIONARY_KEY
, 1);
502 // First consume the key.
504 if (!ConsumeStringRaw(&key
)) {
508 // Read the separator.
510 token
= GetNextToken();
511 if (token
!= T_OBJECT_PAIR_SEPARATOR
) {
512 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
516 // The next token is the value. Ownership transfers to |dict|.
518 Value
* value
= ParseNextToken();
520 // ReportError from deeper level.
524 dict
->SetWithoutPathExpansion(key
.AsString(), value
);
527 token
= GetNextToken();
528 if (token
== T_LIST_SEPARATOR
) {
530 token
= GetNextToken();
531 if (token
== T_OBJECT_END
&& !(options_
& JSON_ALLOW_TRAILING_COMMAS
)) {
532 ReportError(JSONReader::JSON_TRAILING_COMMA
, 1);
535 } else if (token
!= T_OBJECT_END
) {
// NOTE(review): this uses a column adjustment of 0 while the equivalent
// branch in ConsumeList() uses 1; confirm the difference is intended.
536 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 0);
541 return dict
.release();
// Parses a JSON array into a new ListValue and releases it to the caller.
// Reports JSON_TOO_MUCH_NESTING when the StackMarker says the nesting is too
// deep, JSON_TRAILING_COMMA when a comma precedes the closing token without
// JSON_ALLOW_TRAILING_COMMAS, and JSON_SYNTAX_ERROR when an item is followed
// by neither a separator nor the closing token.
// NOTE(review): the guards, the early returns after each ReportError and the
// statement that appends |item| to the list are not visible in this view.
544 Value
* JSONParser::ConsumeList() {
546 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
550 StackMarker
depth_check(&stack_depth_
);
551 if (depth_check
.IsTooDeep()) {
552 ReportError(JSONReader::JSON_TOO_MUCH_NESTING
, 1);
556 scoped_ptr
<ListValue
> list(new ListValue
);
559 Token token
= GetNextToken();
560 while (token
!= T_ARRAY_END
) {
561 Value
* item
= ParseToken(token
);
563 // ReportError from deeper level.
570 token
= GetNextToken();
571 if (token
== T_LIST_SEPARATOR
) {
573 token
= GetNextToken();
574 if (token
== T_ARRAY_END
&& !(options_
& JSON_ALLOW_TRAILING_COMMAS
)) {
575 ReportError(JSONReader::JSON_TRAILING_COMMA
, 1);
578 } else if (token
!= T_ARRAY_END
) {
579 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
584 return list
.release();
// Parses a string token via ConsumeStringRaw() and wraps the result in a
// Value: a JSONStringValue when the builder can be a StringPiece and
// JSON_DETACHABLE_CHILDREN is not set, otherwise a StringValue.
// NOTE(review): the failure return and the statement guarded by the second
// CanBeStringPiece() check are not visible in this view.
587 Value
* JSONParser::ConsumeString() {
588 StringBuilder string
;
589 if (!ConsumeStringRaw(&string
))
592 // Create the Value representation, using a hidden root, if configured
593 // to do so, and if the string can be represented by StringPiece.
594 if (string
.CanBeStringPiece() && !(options_
& JSON_DETACHABLE_CHILDREN
)) {
595 return new JSONStringValue(string
.AsStringPiece());
597 if (string
.CanBeStringPiece())
599 return new StringValue(string
.AsString());
// Reads a quoted string from the input into |out|, decoding escape sequences.
// Reports JSON_UNSUPPORTED_ENCODING for an invalid character,
// JSON_INVALID_ESCAPE for a bad or truncated escape, and JSON_SYNTAX_ERROR
// after the loop.
// NOTE(review): the opening guard, most escape cases, the success path that
// hands the result to |out| and all return statements are not visible here.
603 bool JSONParser::ConsumeStringRaw(StringBuilder
* out
) {
605 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
609 // StringBuilder will internally build a StringPiece unless a UTF-16
610 // conversion occurs, at which point it will perform a copy into a
612 StringBuilder
string(NextChar());
614 int length
= end_pos_
- start_pos_
;
617 while (CanConsume(1)) {
618 pos_
= start_pos_
+ index_
; // CBU8_NEXT is post-increment.
619 CBU8_NEXT(start_pos_
, index_
, length
, next_char
);
620 if (next_char
< 0 || !IsValidCharacter(next_char
)) {
621 ReportError(JSONReader::JSON_UNSUPPORTED_ENCODING
, 1);
625 // If this character is an escape sequence...
626 if (next_char
== '\\') {
627 // The input string will be adjusted (either by combining the two
628 // characters of an encoded escape sequence, or with a UTF conversion),
629 // so using StringPiece isn't possible -- force a conversion.
632 if (!CanConsume(1)) {
633 ReportError(JSONReader::JSON_INVALID_ESCAPE
, 0);
637 switch (*NextChar()) {
638 // Allowed escape sequences:
639 case 'x': { // UTF-8 sequence.
640 // UTF-8 \x escape sequences are not allowed in the spec, but they
641 // are supported here for backwards-compatibility with the old parser.
642 if (!CanConsume(2)) {
643 ReportError(JSONReader::JSON_INVALID_ESCAPE
, 1);
648 if (!HexStringToInt(StringPiece(NextChar(), 2), &hex_digit
)) {
649 ReportError(JSONReader::JSON_INVALID_ESCAPE
, -1);
654 if (hex_digit
< kExtendedASCIIStart
)
655 string
.Append(hex_digit
);
657 DecodeUTF8(hex_digit
, &string
);
660 case 'u': { // UTF-16 sequence.
661 // UTF units are of the form \uXXXX.
662 if (!CanConsume(5)) { // 5 being 'u' and four HEX digits.
663 ReportError(JSONReader::JSON_INVALID_ESCAPE
, 0);
670 std::string utf8_units
;
671 if (!DecodeUTF16(&utf8_units
)) {
672 ReportError(JSONReader::JSON_INVALID_ESCAPE
, -1);
676 string
.AppendString(utf8_units
);
703 case 'v': // Not listed as valid escape sequence in the RFC.
706 // All other escape sequences are illegal.
708 ReportError(JSONReader::JSON_INVALID_ESCAPE
, 0);
711 } else if (next_char
== '"') {
712 --index_
; // Rewind by one because of CBU8_NEXT.
716 if (next_char
< kExtendedASCIIStart
)
717 string
.Append(next_char
);
719 DecodeUTF8(next_char
, &string
);
723 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 0);
727 // Entry is at the first X in \uXXXX.
// Decodes one \uXXXX escape, or a surrogate pair of two such escapes, into
// UTF-8 and appends the bytes to |dest_string|.
// NOTE(review): the failure returns, the position advances and the
// declaration of |offset| are not visible in this view.
728 bool JSONParser::DecodeUTF16(std::string
* dest_string
) {
732 // This is a 32-bit field because the shift operations in the
733 // conversion process below cause MSVC to error about "data loss."
734 // This only stores UTF-16 code units, though.
735 // Consume the UTF-16 code unit, which may be a high surrogate.
736 int code_unit16_high
= 0;
737 if (!HexStringToInt(StringPiece(pos_
, 4), &code_unit16_high
))
740 // Only add 3, not 4, because at the end of this iteration, the parser has
741 // finished working with the last digit of the UTF sequence, meaning that
742 // the next iteration will advance to the next byte.
745 // Used to convert the UTF-16 code units to a code point and then to a UTF-8
746 // code unit sequence.
747 char code_unit8
[8] = { 0 };
750 // If this is a high surrogate, consume the next code unit to get the
752 if (CBU16_IS_SURROGATE(code_unit16_high
)) {
753 // Make sure this is the high surrogate. If not, it's an encoding
755 if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high
))
758 // Make sure that the token has more characters to consume the
760 if (!CanConsume(6)) // 6 being '\' 'u' and four HEX digits.
762 if (*NextChar() != '\\' || *NextChar() != 'u')
765 NextChar(); // Read past 'u'.
766 int code_unit16_low
= 0;
767 if (!HexStringToInt(StringPiece(pos_
, 4), &code_unit16_low
))
772 if (!CBU16_IS_TRAIL(code_unit16_low
)) {
776 uint32 code_point
= CBU16_GET_SUPPLEMENTARY(code_unit16_high
,
779 CBU8_APPEND_UNSAFE(code_unit8
, offset
, code_point
);
782 DCHECK(CBU16_IS_SINGLE(code_unit16_high
));
783 CBU8_APPEND_UNSAFE(code_unit8
, offset
, code_unit16_high
);
// NOTE(review): the buffer is appended as a NUL-terminated C string, so an
// escape that encodes U+0000 would append nothing; confirm this is intended.
786 dest_string
->append(code_unit8
);
// Encodes the code point |point| as UTF-8 and appends the bytes to |dest|.
// Values below kExtendedASCIIStart take a separate branch.
// NOTE(review): the body of that branch and the declaration of |offset| are
// not visible in this view.
790 void JSONParser::DecodeUTF8(const int32
& point
, StringBuilder
* dest
) {
791 // Anything outside of the basic ASCII plane will need to be decoded from
792 // int32 to a multi-byte sequence.
793 if (point
< kExtendedASCIIStart
) {
796 char utf8_units
[4] = { 0 };
798 CBU8_APPEND_UNSAFE(utf8_units
, offset
, point
);
800 // CBU8_APPEND_UNSAFE can overwrite up to 4 bytes, so utf8_units may not be
801 // zero terminated at this point. |offset| contains the correct length.
802 dest
->AppendString(std::string(utf8_units
, offset
));
// Parses a JSON number starting at pos_. The text is first tried as an int
// (StringToInt) and then as a finite double (StringToDouble + IsFinite); a
// FundamentalValue is returned for whichever succeeds. Malformed integer,
// fraction or exponent parts are reported as JSON_SYNTAX_ERROR.
// NOTE(review): sign handling, the updates to |end_index|, the accepted
// follow-up tokens, the restore from |exit_pos| / |exit_index| and the failure
// returns are not visible in this view.
806 Value
* JSONParser::ConsumeNumber() {
807 const char* num_start
= pos_
;
808 const int start_index
= index_
;
809 int end_index
= start_index
;
814 if (!ReadInt(false)) {
815 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
820 // The optional fraction part.
822 if (!CanConsume(1)) {
823 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
827 if (!ReadInt(true)) {
828 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
834 // Optional exponent part.
835 if (*pos_
== 'e' || *pos_
== 'E') {
837 if (*pos_
== '-' || *pos_
== '+')
839 if (!ReadInt(true)) {
840 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
846 // ReadInt is greedy because numbers have no easily detectable sentinel,
847 // so save off where the parser should be on exit (see Consume invariant at
848 // the top of the header), then make sure the next token is one which is
850 const char* exit_pos
= pos_
- 1;
851 int exit_index
= index_
- 1;
853 switch (GetNextToken()) {
856 case T_LIST_SEPARATOR
:
860 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
867 StringPiece
num_string(num_start
, end_index
- start_index
);
870 if (StringToInt(num_string
, &num_int
))
871 return new FundamentalValue(num_int
);
874 if (base::StringToDouble(num_string
.as_string(), &num_double
) &&
875 IsFinite(num_double
)) {
876 return new FundamentalValue(num_double
);
// Consumes a run of ASCII digits. When |allow_leading_zeros| is false, a run
// longer than one digit that starts with '0' takes a separate branch whose
// body is not visible here.
// NOTE(review): the declarations of |c|, |len| and |first|, the loop body and
// the return statements are not visible in this view.
882 bool JSONParser::ReadInt(bool allow_leading_zeros
) {
887 while (CanConsume(1) && IsAsciiDigit(c
)) {
895 if (!allow_leading_zeros
&& len
> 1 && first
== '0')
// Parses one of the literals "true", "false" or "null" at pos_ and returns
// the matching Value; a mismatch is reported as JSON_SYNTAX_ERROR and any
// other input as JSON_UNEXPECTED_TOKEN.
// NOTE(review): the switch on the first character and the failure returns are
// not visible in this view.
// NOTE(review): each branch checks CanConsume(len - 1) but then compares len
// bytes in StringsAreEqual(); confirm a byte is always readable at end_pos_
// (e.g. a NUL terminator) for inputs that end in a truncated literal.
901 Value
* JSONParser::ConsumeLiteral() {
904 const char* kTrueLiteral
= "true";
905 const int kTrueLen
= static_cast<int>(strlen(kTrueLiteral
));
906 if (!CanConsume(kTrueLen
- 1) ||
907 !StringsAreEqual(pos_
, kTrueLiteral
, kTrueLen
)) {
908 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
911 NextNChars(kTrueLen
- 1);
912 return new FundamentalValue(true);
915 const char* kFalseLiteral
= "false";
916 const int kFalseLen
= static_cast<int>(strlen(kFalseLiteral
));
917 if (!CanConsume(kFalseLen
- 1) ||
918 !StringsAreEqual(pos_
, kFalseLiteral
, kFalseLen
)) {
919 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
922 NextNChars(kFalseLen
- 1);
923 return new FundamentalValue(false);
926 const char* kNullLiteral
= "null";
927 const int kNullLen
= static_cast<int>(strlen(kNullLiteral
));
928 if (!CanConsume(kNullLen
- 1) ||
929 !StringsAreEqual(pos_
, kNullLiteral
, kNullLen
)) {
930 ReportError(JSONReader::JSON_SYNTAX_ERROR
, 1);
933 NextNChars(kNullLen
- 1);
934 return Value::CreateNullValue();
937 ReportError(JSONReader::JSON_UNEXPECTED_TOKEN
, 1);
// Returns true when the first |len| bytes of |one| and |two| compare equal
// under strncmp (which stops early at a NUL byte).
943 bool JSONParser::StringsAreEqual(const char* one
, const char* two
, size_t len
) {
944 return strncmp(one
, two
, len
) == 0;
// Records where an error occurred: error_line_ is taken from line_number_ and
// error_column_ is index_ relative to index_last_line_, plus |column_adjust|.
// NOTE(review): the rest of the parameter list and the statement that stores
// |code| are not visible in this view.
947 void JSONParser::ReportError(JSONReader::JsonParseError code
,
950 error_line_
= line_number_
;
951 error_column_
= index_
- index_last_line_
+ column_adjust
;
// Formats an error as "Line: L, column: C, description" when either |line| or
// |column| is non-zero.
// NOTE(review): the return for the case where both are zero is not visible in
// this view.
955 std::string
JSONParser::FormatErrorMessage(int line
, int column
,
956 const std::string
& description
) {
957 if (line
|| column
) {
958 return StringPrintf("Line: %i, column: %i, %s",
959 line
, column
, description
.c_str());
964 } // namespace internal