2 Copyright (C) 2017-2025 Free Software Foundation, Inc.
3 Contributed by David Malcolm <dmalcolm@redhat.com>.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 3, or (at your option) any later
12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
24 #include "json-parsing.h"
25 #include "pretty-print.h"
27 #include "make-unique.h"
32 /* Declarations relating to parsing JSON, all within an
33 anonymous namespace. */
37 /* A typedef representing a single unicode character. */
39 typedef unsigned unichar
;
41 /* An enum for discriminating different kinds of JSON token. */
67 /* Human-readable descriptions of enum token_id. */
69 static const char *token_id_name
[] = {
86 /* Tokens within the JSON lexer. */
90 /* The kind of token. */
93 /* The location of this token within the unicode
95 location_map::range range
;
99 /* Value for TOK_ERROR and TOK_STRING. */
102 /* Value for TOK_FLOAT_NUMBER. */
105 /* Value for TOK_INTEGER_NUMBER. */
110 /* A class for lexing JSON. */
115 lexer (bool support_comments
);
118 std::unique_ptr
<error
> add_utf8 (size_t length
, const char *utf8_buf
);
120 const token
*peek ();
125 bool get_char (unichar
&out_char
, location_map::point
*out_point
);
127 location_map::point
get_next_point () const;
128 static void dump_token (FILE *outf
, const token
*tok
);
129 void lex_token (token
*out
);
130 void lex_string (token
*out
);
131 void lex_number (token
*out
, unichar first_char
);
132 bool rest_of_literal (token
*out
, const char *suffix
);
133 std::unique_ptr
<error
> make_error (const char *msg
);
134 bool consume_single_line_comment (token
*out
);
135 bool consume_multiline_comment (token
*out
);
138 auto_vec
<unichar
> m_buffer
;
140 int m_next_char_line
;
141 int m_next_char_column
;
142 int m_prev_line_final_column
; /* for handling unget_char after a '\n'. */
144 static const int MAX_TOKENS
= 1;
145 token m_next_tokens
[MAX_TOKENS
];
146 int m_num_next_tokens
;
148 bool m_support_comments
;
151 /* A class for parsing JSON. */
156 parser (location_map
*out_loc_map
,
157 bool support_comments
);
160 std::unique_ptr
<error
>
161 add_utf8 (size_t length
, const char *utf8_buf
);
163 parser_result_t
parse_value (int depth
);
164 parser_result_t
parse_object (int depth
);
165 parser_result_t
parse_array (int depth
);
167 std::unique_ptr
<error
>
171 location_map::point
get_next_token_start ();
172 location_map::point
get_next_token_end ();
174 std::unique_ptr
<error
>
175 require (enum token_id tok_id
);
177 result
<enum token_id
, std::unique_ptr
<error
>>
178 require_one_of (enum token_id tok_id_a
, enum token_id tok_id_b
);
180 std::unique_ptr
<error
>
181 error_at (const location_map::range
&r
,
182 const char *fmt
, ...) ATTRIBUTE_PRINTF_3
;
184 void maybe_record_range (json::value
*jv
, const location_map::range
&r
);
185 void maybe_record_range (json::value
*jv
,
186 const location_map::point
&start
,
187 const location_map::point
&end
);
191 location_map
*m_loc_map
;
194 } // anonymous namespace for parsing implementation
196 /* Parser implementation. */
200 lexer::lexer (bool support_comments
)
201 : m_buffer (), m_next_char_idx (0),
202 m_next_char_line (1), m_next_char_column (0),
203 m_prev_line_final_column (-1),
204 m_num_next_tokens (0),
205 m_support_comments (support_comments
)
213 while (m_num_next_tokens
> 0)
217 /* Peek the next token. */
222 if (m_num_next_tokens
== 0)
224 lex_token (&m_next_tokens
[0]);
227 return &m_next_tokens
[0];
230 /* Consume the next token. */
235 if (m_num_next_tokens
== 0)
238 gcc_assert (m_num_next_tokens
> 0);
239 gcc_assert (m_num_next_tokens
<= MAX_TOKENS
);
243 fprintf (stderr
, "consuming token: ");
244 dump_token (stderr
, &m_next_tokens
[0]);
245 fprintf (stderr
, "\n");
248 if (m_next_tokens
[0].id
== TOK_ERROR
249 || m_next_tokens
[0].id
== TOK_STRING
)
250 free (m_next_tokens
[0].u
.string
);
253 memmove (&m_next_tokens
[0], &m_next_tokens
[1],
254 sizeof (token
) * m_num_next_tokens
);
257 /* Add LENGTH bytes of UTF-8 encoded text from UTF8_BUF to this lexer's
259 Return null if successful, or the error if there was a problem. */
261 std::unique_ptr
<error
>
262 lexer::add_utf8 (size_t length
, const char *utf8_buf
)
264 /* Adapted from charset.c:one_utf8_to_cppchar. */
265 static const uchar masks
[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
266 static const uchar patns
[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
268 const uchar
*inbuf
= (const unsigned char *) (utf8_buf
);
269 const uchar
**inbufp
= &inbuf
;
270 size_t *inbytesleftp
= &length
;
275 const uchar
*inbuf
= *inbufp
;
281 m_buffer
.safe_push (c
);
287 /* The number of leading 1-bits in the first byte indicates how many
289 for (nbytes
= 2; nbytes
< 7; nbytes
++)
290 if ((c
& ~masks
[nbytes
-1]) == patns
[nbytes
-1])
292 return make_error ("ill-formed UTF-8 sequence");
295 if (*inbytesleftp
< nbytes
)
296 return make_error ("ill-formed UTF-8 sequence");
298 c
= (c
& masks
[nbytes
-1]);
300 for (i
= 1; i
< nbytes
; i
++)
302 unichar n
= *inbuf
++;
303 if ((n
& 0xC0) != 0x80)
304 return make_error ("ill-formed UTF-8 sequence");
305 c
= ((c
<< 6) + (n
& 0x3F));
308 /* Make sure the shortest possible encoding was used. */
309 if (( c
<= 0x7F && nbytes
> 1)
310 || (c
<= 0x7FF && nbytes
> 2)
311 || (c
<= 0xFFFF && nbytes
> 3)
312 || (c
<= 0x1FFFFF && nbytes
> 4)
313 || (c
<= 0x3FFFFFF && nbytes
> 5))
314 return make_error ("ill-formed UTF-8:"
315 " shortest possible encoding not used");
317 /* Make sure the character is valid. */
318 if (c
> 0x7FFFFFFF || (c
>= 0xD800 && c
<= 0xDFFF))
319 return make_error ("ill-formed UTF-8: invalid character");
321 m_buffer
.safe_push (c
);
323 *inbytesleftp
-= nbytes
;
328 /* Attempt to get the next unicode character from this lexer's buffer.
329 If successful, write it to OUT_CHAR, and its location to *OUT_POINT,
331 Otherwise, return false. */
334 lexer::get_char (unichar
&out_char
, location_map::point
*out_point
)
336 if (m_next_char_idx
>= (int)m_buffer
.length ())
340 *out_point
= get_next_point ();
341 out_char
= m_buffer
[m_next_char_idx
++];
343 if (out_char
== '\n')
346 m_prev_line_final_column
= m_next_char_column
;
347 m_next_char_column
= 0;
350 m_next_char_column
++;
355 /* Undo the last successful get_char. */
361 if (m_next_char_column
> 0)
362 --m_next_char_column
;
366 m_next_char_column
= m_prev_line_final_column
;
367 /* We don't support more than one unget_char in a row. */
368 gcc_assert (m_prev_line_final_column
!= -1);
369 m_prev_line_final_column
= -1;
373 /* Get the location of the next char. */
376 lexer::get_next_point () const
378 location_map::point result
;
379 result
.m_unichar_idx
= m_next_char_idx
;
380 result
.m_line
= m_next_char_line
;
381 result
.m_column
= m_next_char_column
;
385 /* Print a textual representation of TOK to OUTF.
386 This is intended for debugging the lexer and parser,
387 rather than for user-facing output. */
390 lexer::dump_token (FILE *outf
, const token
*tok
)
395 fprintf (outf
, "TOK_ERROR (\"%s\")", tok
->u
.string
);
399 fprintf (outf
, "TOK_EOF");
402 case TOK_OPEN_SQUARE
:
403 fprintf (outf
, "TOK_OPEN_SQUARE");
407 fprintf (outf
, "TOK_OPEN_CURLY");
410 case TOK_CLOSE_SQUARE
:
411 fprintf (outf
, "TOK_CLOSE_SQUARE");
414 case TOK_CLOSE_CURLY
:
415 fprintf (outf
, "TOK_CLOSE_CURLY");
419 fprintf (outf
, "TOK_COLON");
423 fprintf (outf
, "TOK_COMMA");
427 fprintf (outf
, "TOK_TRUE");
431 fprintf (outf
, "TOK_FALSE");
435 fprintf (outf
, "TOK_NULL");
439 fprintf (outf
, "TOK_STRING (\"%s\")", tok
->u
.string
);
442 case TOK_FLOAT_NUMBER
:
443 fprintf (outf
, "TOK_FLOAT_NUMBER (%f)", tok
->u
.float_number
);
446 case TOK_INTEGER_NUMBER
:
447 fprintf (outf
, "TOK_INTEGER_NUMBER (%ld)", tok
->u
.integer_number
);
456 /* Treat "//" as a comment to the end of the line.
458 This isn't compliant with the JSON spec,
459 but is very handy for writing DejaGnu tests.
461 Return true if EOF and populate *OUT, false otherwise. */
464 lexer::consume_single_line_comment (token
*out
)
469 if (!get_char (next_char
, nullptr))
472 location_map::point p
= get_next_point ();
473 out
->range
.m_start
= p
;
474 out
->range
.m_end
= p
;
477 if (next_char
== '\n')
482 /* Treat '/' '*' as a multiline comment until the next closing '*' '/'.
484 This isn't compliant with the JSON spec,
485 but is very handy for writing DejaGnu tests.
487 Return true if EOF and populate *OUT, false otherwise. */
490 lexer::consume_multiline_comment (token
*out
)
495 if (!get_char (next_char
, nullptr))
498 gcc_unreachable (); // TODO
499 location_map::point p
= get_next_point ();
500 out
->range
.m_start
= p
;
501 out
->range
.m_end
= p
;
504 if (next_char
!= '*')
506 if (!get_char (next_char
, nullptr))
509 gcc_unreachable (); // TODO
510 location_map::point p
= get_next_point ();
511 out
->range
.m_start
= p
;
512 out
->range
.m_end
= p
;
515 if (next_char
== '/')
520 /* Attempt to lex the input buffer, writing the next token to OUT.
521 On errors, TOK_ERROR (or TOK_EOF) is written to OUT. */
524 lexer::lex_token (token
*out
)
526 /* Skip to next non-whitespace char. */
528 location_map::point start_point
;
531 if (!get_char (next_char
, &start_point
))
534 location_map::point p
= get_next_point ();
535 out
->range
.m_start
= p
;
536 out
->range
.m_end
= p
;
539 if (m_support_comments
)
540 if (next_char
== '/')
542 location_map::point point
;
543 unichar next_next_char
;
544 if (get_char (next_next_char
, &point
))
546 switch (next_next_char
)
549 if (consume_single_line_comment (out
))
553 if (consume_multiline_comment (out
))
557 /* A stray single '/'. Break out of loop, so that we
558 handle it below as an unexpected character. */
566 && next_char
!= '\r')
572 out
->range
.m_start
= start_point
;
573 out
->range
.m_end
= start_point
;
578 out
->id
= TOK_OPEN_SQUARE
;
582 out
->id
= TOK_OPEN_CURLY
;
586 out
->id
= TOK_CLOSE_SQUARE
;
590 out
->id
= TOK_CLOSE_CURLY
;
616 lex_number (out
, next_char
);
620 /* Handle literal "true". */
621 if (rest_of_literal (out
, "rue"))
630 /* Handle literal "false". */
631 if (rest_of_literal (out
, "alse"))
640 /* Handle literal "null". */
641 if (rest_of_literal (out
, "ull"))
652 out
->u
.string
= xasprintf ("unexpected character: '%c'", next_char
);
657 /* Having consumed an open-quote character from the lexer's buffer, attempt
658 to lex the rest of a JSON string, writing the result to OUT (or TOK_ERROR
659 if an error occurred).
660 (ECMA-404 section 9; RFC 7159 section 7). */
663 lexer::lex_string (token
*out
)
665 auto_vec
<unichar
> content
;
666 bool still_going
= true;
670 if (!get_char (uc
, &out
->range
.m_end
))
673 out
->range
.m_end
= get_next_point ();
674 out
->u
.string
= xstrdup ("EOF within string");
685 if (!get_char (next_char
, &out
->range
.m_end
))
688 out
->range
.m_end
= get_next_point ();
689 out
->u
.string
= xstrdup ("EOF within string");;
697 content
.safe_push (next_char
);
701 content
.safe_push ('\b');
705 content
.safe_push ('\f');
709 content
.safe_push ('\n');
713 content
.safe_push ('\r');
717 content
.safe_push ('\t');
723 for (int i
= 0; i
< 4; i
++)
726 if (!get_char (hexdigit
, &out
->range
.m_end
))
729 out
->range
.m_end
= get_next_point ();
730 out
->u
.string
= xstrdup ("EOF within string");
734 if (hexdigit
>= '0' && hexdigit
<= '9')
735 result
+= hexdigit
- '0';
736 else if (hexdigit
>= 'a' && hexdigit
<= 'f')
737 result
+= (hexdigit
- 'a') + 10;
738 else if (hexdigit
>= 'A' && hexdigit
<= 'F')
739 result
+= (hexdigit
- 'A') + 10;
743 out
->range
.m_start
= out
->range
.m_end
;
744 out
->u
.string
= xstrdup ("bogus hex char");
748 content
.safe_push (result
);
754 out
->u
.string
= xstrdup ("unrecognized escape char");
761 /* Reject unescaped control characters U+0000 through U+001F
762 (ECMA-404 section 9 para 1; RFC 7159 section 7 para 1). */
766 out
->range
.m_start
= out
->range
.m_end
;
767 out
->u
.string
= xstrdup ("unescaped control char");
771 /* Otherwise, add regular unicode code point. */
772 content
.safe_push (uc
);
777 out
->id
= TOK_STRING
;
779 auto_vec
<char> utf8_buf
;
780 // Adapted from libcpp/charset.c:one_cppchar_to_utf8
781 for (unsigned i
= 0; i
< content
.length (); i
++)
783 static const uchar masks
[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
784 static const uchar limits
[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
786 uchar buf
[6], *p
= &buf
[6];
787 unichar c
= content
[i
];
796 *--p
= ((c
& 0x3F) | 0x80);
800 while (c
>= 0x3F || (c
& limits
[nbytes
-1]));
801 *--p
= (c
| masks
[nbytes
-1]);
805 utf8_buf
.safe_push (*p
++);
808 out
->u
.string
= XNEWVEC (char, utf8_buf
.length () + 1);
809 for (unsigned i
= 0; i
< utf8_buf
.length (); i
++)
810 out
->u
.string
[i
] = utf8_buf
[i
];
811 out
->u
.string
[utf8_buf
.length ()] = '\0';
814 /* Having consumed FIRST_CHAR, an initial digit or '-' character from
815 the lexer's buffer, attempt to lex the rest of a JSON number, writing
816 the result to OUT (or TOK_ERROR if an error occurred).
817 (ECMA-404 section 8; RFC 7159 section 6). */
820 lexer::lex_number (token
*out
, unichar first_char
)
824 if (first_char
== '-')
827 if (!get_char (first_char
, &out
->range
.m_end
))
830 out
->range
.m_start
= out
->range
.m_end
;
831 out
->u
.string
= xstrdup ("expected digit");
836 if (first_char
== '0')
838 else if (!ISDIGIT (first_char
))
841 out
->range
.m_start
= out
->range
.m_end
;
842 out
->u
.string
= xstrdup ("expected digit");
847 /* Got a nonzero digit; expect zero or more digits. */
848 value
= first_char
- '0';
852 location_map::point point
;
853 if (!get_char (uc
, &point
))
859 out
->range
.m_end
= point
;
870 /* Optional '.', followed by one or more decimals. */
872 location_map::point point
;
873 if (get_char (next_char
, &point
))
875 if (next_char
== '.')
877 /* Parse decimal digits. */
878 bool had_digit
= false;
879 double digit_factor
= 0.1;
880 while (get_char (next_char
, &point
))
882 if (!ISDIGIT (next_char
))
887 value
+= (next_char
- '0') * digit_factor
;
890 out
->range
.m_end
= point
;
895 out
->range
.m_start
= point
;
896 out
->range
.m_start
= point
;
897 out
->u
.string
= xstrdup ("expected digit");
905 /* Parse 'e' and 'E'. */
906 unichar exponent_char
;
907 if (get_char (exponent_char
, &point
))
909 if (exponent_char
== 'e' || exponent_char
== 'E')
914 bool negate_exponent
= false;
915 bool had_exponent_digit
= false;
916 if (!get_char (sign_char
, &point
))
919 out
->range
.m_start
= point
;
920 out
->range
.m_start
= point
;
921 out
->u
.string
= xstrdup ("EOF within exponent");
924 if (sign_char
== '-')
925 negate_exponent
= true;
926 else if (sign_char
== '+')
928 else if (ISDIGIT (sign_char
))
930 exponent
= sign_char
- '0';
931 had_exponent_digit
= true;
936 out
->range
.m_start
= point
;
937 out
->range
.m_start
= point
;
939 = xstrdup ("expected '-','+' or digit within exponent");
942 out
->range
.m_end
= point
;
944 /* One or more digits (we might have seen the digit above,
949 location_map::point point
;
950 if (!get_char (uc
, &point
))
956 had_exponent_digit
= true;
957 out
->range
.m_end
= point
;
966 if (!had_exponent_digit
)
969 out
->range
.m_start
= point
;
970 out
->range
.m_start
= point
;
971 out
->u
.string
= xstrdup ("expected digit within exponent");
975 exponent
= -exponent
;
976 value
= value
* pow (10, exponent
);
985 if (value
== (long)value
)
987 out
->id
= TOK_INTEGER_NUMBER
;
988 out
->u
.integer_number
= value
;
992 out
->id
= TOK_FLOAT_NUMBER
;
993 out
->u
.float_number
= value
;
997 /* Determine if the next characters to be lexed match SUFFIX.
998 SUFFIX must be pure ASCII and not contain newlines.
999 If so, consume the characters and return true.
1000 Otherwise, return false. */
1003 lexer::rest_of_literal (token
*out
, const char *suffix
)
1006 int buf_idx
= m_next_char_idx
;
1009 if (suffix
[suffix_idx
] == '\0')
1011 m_next_char_idx
+= suffix_idx
;
1012 m_next_char_column
+= suffix_idx
;
1013 out
->range
.m_end
.m_unichar_idx
+= suffix_idx
;
1014 out
->range
.m_end
.m_column
+= suffix_idx
;
1017 if (buf_idx
>= (int)m_buffer
.length ())
1019 /* This assumes that suffix is ASCII. */
1020 if (m_buffer
[buf_idx
] != (unichar
)suffix
[suffix_idx
])
1027 /* Create a new error instance for MSG, using the location of the next
1028 character for the location of the error. */
1030 std::unique_ptr
<error
>
1031 lexer::make_error (const char *msg
)
1033 location_map::point p
;
1034 p
.m_unichar_idx
= m_next_char_idx
;
1035 p
.m_line
= m_next_char_line
;
1036 p
.m_column
= m_next_char_column
;
1037 location_map::range r
;
1040 return ::make_unique
<error
> (r
, xstrdup (msg
));
1043 /* parser's ctor. */
1045 parser::parser (location_map
*out_loc_map
,
1046 bool support_comments
)
1047 : m_lexer (support_comments
), m_loc_map (out_loc_map
)
1051 /* parser's dtor. */
1056 m_loc_map
->on_finished_parsing ();
1059 /* Add LENGTH bytes of UTF-8 encoded text from UTF8_BUF to this parser's
1062 std::unique_ptr
<error
>
1063 parser::add_utf8 (size_t length
, const char *utf8_buf
)
1065 return m_lexer
.add_utf8 (length
, utf8_buf
);
1068 /* Parse a JSON value (object, array, number, string, or literal).
1069 (ECMA-404 section 5; RFC 7159 section 3). */
1072 parser::parse_value (int depth
)
1074 const token
*tok
= m_lexer
.peek ();
1076 /* Avoid stack overflow with deeply-nested inputs; RFC 7159 section 9
1077 states: "An implementation may set limits on the maximum depth
1080 Ideally we'd avoid this limit (e.g. by rewriting parse_value,
1081 parse_object, and parse_array into a single function with a vec of
1083 const int MAX_DEPTH
= 100;
1084 if (depth
>= MAX_DEPTH
)
1085 return error_at (tok
->range
, "maximum nesting depth exceeded: %i",
1090 case TOK_OPEN_CURLY
:
1091 return parse_object (depth
);
1095 auto val
= ::make_unique
<string
> (tok
->u
.string
);
1097 maybe_record_range (val
.get (), tok
->range
);
1098 return parser_result_t (std::move (val
));
1101 case TOK_OPEN_SQUARE
:
1102 return parse_array (depth
);
1104 case TOK_FLOAT_NUMBER
:
1106 auto val
= ::make_unique
<float_number
> (tok
->u
.float_number
);
1108 maybe_record_range (val
.get (), tok
->range
);
1109 return parser_result_t (std::move (val
));
1112 case TOK_INTEGER_NUMBER
:
1114 auto val
= ::make_unique
<integer_number
> (tok
->u
.integer_number
);
1116 maybe_record_range (val
.get (), tok
->range
);
1117 return parser_result_t (std::move (val
));
1122 auto val
= ::make_unique
<literal
> (JSON_TRUE
);
1124 maybe_record_range (val
.get (), tok
->range
);
1125 return parser_result_t (std::move (val
));
1130 auto val
= ::make_unique
<literal
> (JSON_FALSE
);
1132 maybe_record_range (val
.get (), tok
->range
);
1133 return parser_result_t (std::move (val
));
1138 auto val
= ::make_unique
<literal
> (JSON_NULL
);
1140 maybe_record_range (val
.get (), tok
->range
);
1141 return parser_result_t (std::move (val
));
1145 return error_at (tok
->range
, "invalid JSON token: %s", tok
->u
.string
);
1148 return error_at (tok
->range
, "expected a JSON value but got %s",
1149 token_id_name
[tok
->id
]);
1153 /* Parse a JSON object.
1154 (ECMA-404 section 6; RFC 7159 section 4). */
1157 parser::parse_object (int depth
)
1159 location_map::point start
= get_next_token_start ();
1161 require (TOK_OPEN_CURLY
);
1163 auto obj
= ::make_unique
<object
> ();
1165 const token
*tok
= m_lexer
.peek ();
1166 if (tok
->id
== TOK_CLOSE_CURLY
)
1168 location_map::point end
= get_next_token_end ();
1169 maybe_record_range (obj
.get (), start
, end
);
1170 if (auto err
= require (TOK_CLOSE_CURLY
))
1171 return parser_result_t (std::move (err
));
1172 return parser_result_t (std::move (obj
));
1174 if (tok
->id
!= TOK_STRING
)
1175 return error_at (tok
->range
,
1176 "expected string for object key after '{'; got %s",
1177 token_id_name
[tok
->id
]);
1180 tok
= m_lexer
.peek ();
1181 if (tok
->id
!= TOK_STRING
)
1182 return error_at (tok
->range
,
1183 "expected string for object key after ','; got %s",
1184 token_id_name
[tok
->id
]);
1185 label_text key
= label_text::take (xstrdup (tok
->u
.string
));
1188 if (auto err
= require (TOK_COLON
))
1189 return parser_result_t (std::move (err
));
1191 parser_result_t r
= parse_value (depth
+ 1);
1195 return parser_result_t (std::move (obj
));
1197 /* We don't enforce uniqueness for keys. */
1198 obj
->set (key
.get (), std::move (r
.m_val
));
1200 location_map::point end
= get_next_token_end ();
1201 result
<enum token_id
, std::unique_ptr
<error
>> result
1202 (require_one_of (TOK_COMMA
, TOK_CLOSE_CURLY
));
1204 return parser_result_t (std::move (result
.m_err
));
1205 if (result
.m_val
== TOK_COMMA
)
1209 /* TOK_CLOSE_CURLY. */
1210 maybe_record_range (obj
.get (), start
, end
);
1211 return parser_result_t (std::move (obj
));
1216 /* Parse a JSON array.
1217 (ECMA-404 section 7; RFC 7159 section 5). */
1220 parser::parse_array (int depth
)
1222 location_map::point start
= get_next_token_start ();
1223 if (auto err
= require (TOK_OPEN_SQUARE
))
1224 return parser_result_t (std::move (err
));
1226 auto arr
= ::make_unique
<array
> ();
1228 const token
*tok
= m_lexer
.peek ();
1229 if (tok
->id
== TOK_CLOSE_SQUARE
)
1231 location_map::point end
= get_next_token_end ();
1232 maybe_record_range (arr
.get (), start
, end
);
1234 return parser_result_t (std::move (arr
));
1239 parser_result_t r
= parse_value (depth
+ 1);
1243 arr
->append (std::move (r
.m_val
));
1245 location_map::point end
= get_next_token_end ();
1246 result
<enum token_id
, std::unique_ptr
<error
>> result
1247 (require_one_of (TOK_COMMA
, TOK_CLOSE_SQUARE
));
1249 return parser_result_t (std::move (result
.m_err
));
1250 if (result
.m_val
== TOK_COMMA
)
1254 /* TOK_CLOSE_SQUARE. */
1255 maybe_record_range (arr
.get (), start
, end
);
1256 return parser_result_t (std::move (arr
));
1261 /* Get the start point of the next token. */
1264 parser::get_next_token_start ()
1266 const token
*tok
= m_lexer
.peek ();
1267 return tok
->range
.m_start
;
1270 /* Get the end point of the next token. */
1273 parser::get_next_token_end ()
1275 const token
*tok
= m_lexer
.peek ();
1276 return tok
->range
.m_end
;
1279 /* Require an EOF, or fail if there is surplus input. */
1281 std::unique_ptr
<error
>
1282 parser::require_eof ()
1284 return require (TOK_EOF
);
1287 /* Consume the next token, issuing an error if it is not of kind TOK_ID. */
1289 std::unique_ptr
<error
>
1290 parser::require (enum token_id tok_id
)
1292 const token
*tok
= m_lexer
.peek ();
1293 if (tok
->id
!= tok_id
)
1295 if (tok
->id
== TOK_ERROR
)
1296 return error_at (tok
->range
,
1297 "expected %s; got bad token: %s",
1298 token_id_name
[tok_id
], tok
->u
.string
);
1300 return error_at (tok
->range
,
1301 "expected %s; got %s", token_id_name
[tok_id
],
1302 token_id_name
[tok
->id
]);
1308 /* Consume the next token, issuing an error if it is not of
1309 kind TOK_ID_A or TOK_ID_B.
1310 Return which kind it was. */
1312 result
<enum token_id
, std::unique_ptr
<error
>>
1313 parser::require_one_of (enum token_id tok_id_a
, enum token_id tok_id_b
)
1315 const token
*tok
= m_lexer
.peek ();
1316 if ((tok
->id
!= tok_id_a
)
1317 && (tok
->id
!= tok_id_b
))
1319 if (tok
->id
== TOK_ERROR
)
1320 return error_at (tok
->range
, "expected %s or %s; got bad token: %s",
1321 token_id_name
[tok_id_a
], token_id_name
[tok_id_b
],
1324 return error_at (tok
->range
, "expected %s or %s; got %s",
1325 token_id_name
[tok_id_a
], token_id_name
[tok_id_b
],
1326 token_id_name
[tok
->id
]);
1328 enum token_id id
= tok
->id
;
1330 return result
<enum token_id
, std::unique_ptr
<error
>> (id
);
1333 /* Generate a parsing error. */
1335 std::unique_ptr
<error
>
1336 parser::error_at (const location_map::range
&r
, const char *fmt
, ...)
1340 char *formatted_msg
= xvasprintf (fmt
, ap
);
1343 return ::make_unique
<error
> (r
, formatted_msg
);
1346 /* Record that JV has range R within the input file. */
1349 parser::maybe_record_range (json::value
*jv
, const location_map::range
&r
)
1352 m_loc_map
->record_range_for_value (jv
, r
);
1355 /* Record that JV has range START to END within the input file. */
1358 parser::maybe_record_range (json::value
*jv
,
1359 const location_map::point
&start
,
1360 const location_map::point
&end
)
1364 location_map::range r
;
1367 m_loc_map
->record_range_for_value (jv
, r
);
1371 /* Attempt to parse the UTF-8 encoded buffer at UTF8_BUF
1372 of the given LENGTH.
1373 If ALLOW_COMMENTS is true, then allow C and C++ style-comments in the
1374 buffer, as an extension to JSON, otherwise forbid them.
1375 If successful, return a json::value in the result.
1376 If there was a problem, return a json::error in the result.
1377 If OUT_LOC_MAP is non-NULL, notify *OUT_LOC_MAP about
1378 source locations of nodes seen during parsing. */
1381 json::parse_utf8_string (size_t length
,
1382 const char *utf8_buf
,
1383 bool allow_comments
,
1384 location_map
*out_loc_map
)
1386 parser
p (out_loc_map
, allow_comments
);
1387 if (auto err
= p
.add_utf8 (length
, utf8_buf
))
1388 return parser_result_t (std::move (err
));
1389 parser_result_t r
= p
.parse_value (0);
1392 if (auto err
= p
.require_eof ())
1393 return parser_result_t (std::move (err
));
1397 /* Attempt to parse the nil-terminated UTF-8 encoded buffer at
1399 If ALLOW_COMMENTS is true, then allow C and C++ style-comments in the
1400 buffer, as an extension to JSON, otherwise forbid them.
1401 If successful, return a json::value in the result.
1402 If there was a problem, return a json::error in the result
1403 (this overload forwards to the length-taking parse_utf8_string).
1404 If OUT_LOC_MAP is non-NULL, notify *OUT_LOC_MAP about
1405 source locations of nodes seen during parsing. */
1407 json::parser_result_t
1408 json::parse_utf8_string (const char *utf8
,
1409 bool allow_comments
,
1410 location_map
*out_loc_map
)
1412 return parse_utf8_string (strlen (utf8
), utf8
, allow_comments
,
1419 namespace selftest
{
1423 #define ASSERT_PRINT_EQ(JV, FORMATTED, EXPECTED_JSON) \
1424 assert_print_eq (SELFTEST_LOCATION, JV, FORMATTED, EXPECTED_JSON)
1426 /* Implementation detail of ASSERT_RANGE_EQ. */
1429 assert_point_eq (const location
&loc
,
1430 const location_map::point
&actual_point
,
1431 size_t exp_unichar_idx
, int exp_line
, int exp_column
)
1433 ASSERT_EQ_AT (loc
, actual_point
.m_unichar_idx
, exp_unichar_idx
);
1434 ASSERT_EQ_AT (loc
, actual_point
.m_line
, exp_line
);
1435 ASSERT_EQ_AT (loc
, actual_point
.m_column
, exp_column
);
1438 /* Implementation detail of ASSERT_RANGE_EQ. */
1441 assert_range_eq (const location
&loc
,
1442 const location_map::range
&actual_range
,
1443 /* Expected location. */
1444 size_t start_unichar_idx
, int start_line
, int start_column
,
1445 size_t end_unichar_idx
, int end_line
, int end_column
)
1447 assert_point_eq (loc
, actual_range
.m_start
,
1448 start_unichar_idx
, start_line
, start_column
);
1449 assert_point_eq (loc
, actual_range
.m_end
,
1450 end_unichar_idx
, end_line
, end_column
);
1453 /* Assert that ACTUAL_RANGE starts at
1454 (START_UNICHAR_IDX, START_LINE, START_COLUMN)
1455 and ends at (END_UNICHAR_IDX, END_LINE, END_COLUMN). */
1457 #define ASSERT_RANGE_EQ(ACTUAL_RANGE, \
1458 START_UNICHAR_IDX, START_LINE, START_COLUMN, \
1459 END_UNICHAR_IDX, END_LINE, END_COLUMN) \
1460 assert_range_eq ((SELFTEST_LOCATION), (ACTUAL_RANGE), \
1461 (START_UNICHAR_IDX), (START_LINE), (START_COLUMN), \
1462 (END_UNICHAR_IDX), (END_LINE), (END_COLUMN))
1464 /* Implementation detail of ASSERT_ERR_EQ. */
1467 assert_err_eq (const location
&loc
,
1468 const json::error
*actual_err
,
1469 /* Expected location. */
1470 size_t start_unichar_idx
, int start_line
, int start_column
,
1471 size_t end_unichar_idx
, int end_line
, int end_column
,
1472 const char *expected_msg
)
1474 ASSERT_TRUE_AT (loc
, actual_err
);
1475 const location_map::range
&actual_range
= actual_err
->get_range ();
1476 ASSERT_EQ_AT (loc
, actual_range
.m_start
.m_unichar_idx
, start_unichar_idx
);
1477 ASSERT_EQ_AT (loc
, actual_range
.m_start
.m_line
, start_line
);
1478 ASSERT_EQ_AT (loc
, actual_range
.m_start
.m_column
, start_column
);
1479 ASSERT_EQ_AT (loc
, actual_range
.m_end
.m_unichar_idx
, end_unichar_idx
);
1480 ASSERT_EQ_AT (loc
, actual_range
.m_end
.m_line
, end_line
);
1481 ASSERT_EQ_AT (loc
, actual_range
.m_end
.m_column
, end_column
);
1482 ASSERT_STREQ_AT (loc
, actual_err
->get_msg (), expected_msg
);
1485 /* Assert that ACTUAL_ERR is a non-NULL json::error *,
1486 with message EXPECTED_MSG, and that its location starts
1487 at (START_UNICHAR_IDX, START_LINE, START_COLUMN)
1488 and ends at (END_UNICHAR_IDX, END_LINE, END_COLUMN). */
1490 #define ASSERT_ERR_EQ(ACTUAL_ERR, \
1491 START_UNICHAR_IDX, START_LINE, START_COLUMN, \
1492 END_UNICHAR_IDX, END_LINE, END_COLUMN, \
1494 assert_err_eq ((SELFTEST_LOCATION), (ACTUAL_ERR), \
1495 (START_UNICHAR_IDX), (START_LINE), (START_COLUMN), \
1496 (END_UNICHAR_IDX), (END_LINE), (END_COLUMN), \
1499 /* Verify that the JSON lexer works as expected. */
1507 /* 01234567890123456789012345678901234567890123456789. */
1510 " [ ] null true false { } \"foo\" \n");
1511 auto err
= l
.add_utf8 (strlen (str
), str
);
1512 ASSERT_EQ (err
, nullptr);
1516 const size_t line_offset
= 0;
1518 /* Expect token: "1066" in columns 4-7. */
1520 const token
*tok
= l
.peek ();
1521 ASSERT_EQ (tok
->id
, TOK_INTEGER_NUMBER
);
1522 ASSERT_EQ (tok
->u
.integer_number
, 1066);
1523 ASSERT_RANGE_EQ (tok
->range
,
1524 line_offset
+ 4, 1, 4,
1525 line_offset
+ 7, 1, 7);
1528 /* Expect token: "-1" in columns 11-12. */
1530 const token
*tok
= l
.peek ();
1531 ASSERT_EQ (tok
->id
, TOK_INTEGER_NUMBER
);
1532 ASSERT_EQ (tok
->u
.integer_number
, -1);
1533 ASSERT_RANGE_EQ (tok
->range
,
1534 line_offset
+ 11, 1, 11,
1535 line_offset
+ 12, 1, 12);
1542 const size_t line_offset
= 16;
1544 /* Expect token: "-273.15" in columns 4-10. */
1546 const token
*tok
= l
.peek ();
1547 ASSERT_EQ (tok
->id
, TOK_FLOAT_NUMBER
);
1548 ASSERT_EQ (int(tok
->u
.float_number
), int(-273.15));
1549 ASSERT_RANGE_EQ (tok
->range
,
1550 line_offset
+ 4, 2, 4,
1551 line_offset
+ 10, 2, 10);
1554 /* Expect token: "1e6" in columns 12-14. */
1556 const token
*tok
= l
.peek ();
1557 ASSERT_EQ (tok
->id
, TOK_INTEGER_NUMBER
);
1558 ASSERT_EQ (tok
->u
.integer_number
, 1000000);
1559 ASSERT_RANGE_EQ (tok
->range
,
1560 line_offset
+ 12, 2, 12,
1561 line_offset
+ 14, 2, 14);
1568 const size_t line_offset
= 32;
1570 /* Expect token: "[". */
1572 const token
*tok
= l
.peek ();
1573 ASSERT_EQ (tok
->id
, TOK_OPEN_SQUARE
);
1574 ASSERT_RANGE_EQ (tok
->range
,
1575 line_offset
+ 2, 3, 2,
1576 line_offset
+ 2, 3, 2);
1579 /* Expect token: "]". */
1581 const token
*tok
= l
.peek ();
1582 ASSERT_EQ (tok
->id
, TOK_CLOSE_SQUARE
);
1583 ASSERT_RANGE_EQ (tok
->range
,
1584 line_offset
+ 6, 3, 6,
1585 line_offset
+ 6, 3, 6);
1588 /* Expect token: "null". */
1590 const token
*tok
= l
.peek ();
1591 ASSERT_EQ (tok
->id
, TOK_NULL
);
1592 ASSERT_RANGE_EQ (tok
->range
,
1593 line_offset
+ 8, 3, 8,
1594 line_offset
+ 11, 3, 11);
1597 /* Expect token: "true". */
1599 const token
*tok
= l
.peek ();
1600 ASSERT_EQ (tok
->id
, TOK_TRUE
);
1601 ASSERT_RANGE_EQ (tok
->range
,
1602 line_offset
+ 15, 3, 15,
1603 line_offset
+ 18, 3, 18);
1606 /* Expect token: "false". */
1608 const token
*tok
= l
.peek ();
1609 ASSERT_EQ (tok
->id
, TOK_FALSE
);
1610 ASSERT_RANGE_EQ (tok
->range
,
1611 line_offset
+ 21, 3, 21,
1612 line_offset
+ 25, 3, 25);
1615 /* Expect token: "{". */
1617 const token
*tok
= l
.peek ();
1618 ASSERT_EQ (tok
->id
, TOK_OPEN_CURLY
);
1619 ASSERT_RANGE_EQ (tok
->range
,
1620 line_offset
+ 28, 3, 28,
1621 line_offset
+ 28, 3, 28);
1624 /* Expect token: "}". */
1626 const token
*tok
= l
.peek ();
1627 ASSERT_EQ (tok
->id
, TOK_CLOSE_CURLY
);
1628 ASSERT_RANGE_EQ (tok
->range
,
1629 line_offset
+ 31, 3, 31,
1630 line_offset
+ 31, 3, 31);
1633 /* Expect token: "\"foo\"". */
1635 const token
*tok
= l
.peek ();
1636 ASSERT_EQ (tok
->id
, TOK_STRING
);
1637 ASSERT_RANGE_EQ (tok
->range
,
1638 line_offset
+ 34, 3, 34,
1639 line_offset
+ 38, 3, 38);
1645 /* Verify that the JSON lexer complains about single-line comments
1646 when comments are disabled: the number ahead of the comment is lexed
as usual, and the '/' after it is reported through an error token. */
1649 test_lexing_unsupported_single_line_comment ()
1654 /* 01234567890123456789012345678901234567890123456789. */
1655 = (" 1066 // Hello world\n");
/* Adding the buffer is expected to succeed; the complaint about the
comment only surfaces later, as a token. */
1656 auto err
= l
.add_utf8 (strlen (str
), str
);
1657 ASSERT_EQ (err
, nullptr);
/* Every expectation below is on line 1, which starts at offset 0. */
1661 const size_t line_offset
= 0;
1662 const int line_1
= 1;
1664 /* Expect token: "1066" in columns 4-7. */
1666 const token
*tok
= l
.peek ();
1667 ASSERT_EQ (tok
->id
, TOK_INTEGER_NUMBER
);
1668 ASSERT_EQ (tok
->u
.integer_number
, 1066);
1669 ASSERT_RANGE_EQ (tok
->range
,
1670 line_offset
+ 4, line_1
, 4,
1671 line_offset
+ 7, line_1
, 7);
/* Expect an error token for the unexpected '/' in column 11. */
1677 const token
*tok
= l
.peek ();
1678 ASSERT_EQ (tok
->id
, TOK_ERROR
);
1679 ASSERT_STREQ (tok
->u
.string
, "unexpected character: '/'");
1680 ASSERT_RANGE_EQ (tok
->range
,
1681 line_offset
+ 11, line_1
, 11,
1682 line_offset
+ 11, line_1
, 11);
1688 /* Verify that the JSON lexer complains about multiline comments
1689 when comments are disabled: the number ahead of the comment is lexed
as usual, and the '/' opening the comment is reported through an error
token. */
1692 test_lexing_unsupported_multiline_comment ()
1697 /* 01234567890123456789012345678901234567890123456789. */
1698 = (" 1066 /* Hello world\n"
1699 " continuation of comment\n"
1700 " end of comment */ 42\n");
/* Adding the buffer is expected to succeed; the complaint about the
comment only surfaces later, as a token. */
1701 auto err
= l
.add_utf8 (strlen (str
), str
);
1702 ASSERT_EQ (err
, nullptr);
/* Every expectation below is on line 1, which starts at offset 0. */
1706 const size_t line_offset
= 0;
1707 const int line_1
= 1;
1709 /* Expect token: "1066" in line 1, columns 4-7. */
1711 const token
*tok
= l
.peek ();
1712 ASSERT_EQ (tok
->id
, TOK_INTEGER_NUMBER
);
1713 ASSERT_EQ (tok
->u
.integer_number
, 1066);
1714 ASSERT_RANGE_EQ (tok
->range
,
1715 line_offset
+ 4, line_1
, 4,
1716 line_offset
+ 7, line_1
, 7);
/* Expect an error token for the unexpected '/' in column 11. No
expectation visible here covers the rest of the input (the comment
body and the trailing 42). */
1722 const token
*tok
= l
.peek ();
1723 ASSERT_EQ (tok
->id
, TOK_ERROR
);
1724 ASSERT_STREQ (tok
->u
.string
, "unexpected character: '/'");
1725 ASSERT_RANGE_EQ (tok
->range
,
1726 line_offset
+ 11, line_1
, 11,
1727 line_offset
+ 11, line_1
, 11);
1733 /* Verify that the JSON lexer handles single-line comments
1734 when comments are enabled: the tokens checked are the two numbers
and the final EOF, with no token expected for the comment text. */
1737 test_lexing_supported_single_line_comment ()
1742 /* 01234567890123456789012345678901234567890123456789. */
1743 = (" 1066 // Hello world\n"
1745 auto err
= l
.add_utf8 (strlen (str
), str
);
1746 ASSERT_EQ (err
, nullptr);
/* Offsets at which each line starts within the input; they are added
to the column numbers to form the expected ranges below. */
1748 const size_t line_1_offset
= 0;
1749 const size_t line_2_offset
= 26;
1750 const size_t line_3_offset
= line_2_offset
+ 17;
1752 /* Expect token: "1066" in line 1, columns 4-7. */
1754 const int line_1
= 1;
1755 const token
*tok
= l
.peek ();
1756 ASSERT_EQ (tok
->id
, TOK_INTEGER_NUMBER
);
1757 ASSERT_EQ (tok
->u
.integer_number
, 1066);
1758 ASSERT_RANGE_EQ (tok
->range
,
1759 line_1_offset
+ 4, line_1
, 4,
1760 line_1_offset
+ 7, line_1
, 7);
1764 /* Expect token: "42" in line 2, columns 5-6. */
1766 const int line_2
= 2;
1767 const token
*tok
= l
.peek ();
1768 ASSERT_EQ (tok
->id
, TOK_INTEGER_NUMBER
);
1769 ASSERT_EQ (tok
->u
.integer_number
, 42);
1770 ASSERT_RANGE_EQ (tok
->range
,
1771 line_2_offset
+ 5, line_2
, 5,
1772 line_2_offset
+ 6, line_2
, 6);
/* Expect EOF at line 3, column 0. */
1778 const int line_3
= 3;
1779 const token
*tok
= l
.peek ();
1780 ASSERT_EQ (tok
->id
, TOK_EOF
);
1781 ASSERT_RANGE_EQ (tok
->range
,
1782 line_3_offset
+ 0, line_3
, 0,
1783 line_3_offset
+ 0, line_3
, 0);
1788 /* Verify that the JSON lexer handles multiline comments
1789 when comments are enabled: the tokens checked are the number before
the comment, the number after it (on line 3), and the final EOF. */
1792 test_lexing_supported_multiline_comment ()
1797 /* 01234567890123456789012345678901234567890123456789. */
1798 = (" 1066 /* Hello world\n"
1799 " continuation of comment\n"
1800 " end of comment */ 42\n");
1801 auto err
= l
.add_utf8 (strlen (str
), str
);
1802 ASSERT_EQ (err
, nullptr);
/* Offsets at which each line starts within the input; they are added
to the column numbers to form the expected ranges below. */
1804 const size_t line_1_offset
= 0;
1805 const size_t line_2_offset
= 26;
1806 const size_t line_3_offset
= line_2_offset
+ 25;
1807 const size_t line_4_offset
= line_3_offset
+ 23;
1809 /* Expect token: "1066" in line 1, columns 4-7. */
1811 const int line_1
= 1;
1812 const token
*tok
= l
.peek ();
1813 ASSERT_EQ (tok
->id
, TOK_INTEGER_NUMBER
);
1814 ASSERT_EQ (tok
->u
.integer_number
, 1066);
1815 ASSERT_RANGE_EQ (tok
->range
,
1816 line_1_offset
+ 4, line_1
, 4,
1817 line_1_offset
+ 7, line_1
, 7);
1821 /* Expect token: "42" in line 3, columns 20-21. */
1823 const int line_3
= 3;
1824 const token
*tok
= l
.peek ();
1825 ASSERT_EQ (tok
->id
, TOK_INTEGER_NUMBER
);
1826 ASSERT_EQ (tok
->u
.integer_number
, 42);
1827 ASSERT_RANGE_EQ (tok
->range
,
1828 line_3_offset
+ 20, line_3
, 20,
1829 line_3_offset
+ 21, line_3
, 21);
/* Expect EOF at line 4, column 0. */
1835 const int line_4
= 4;
1836 const token
*tok
= l
.peek ();
1837 ASSERT_EQ (tok
->id
, TOK_EOF
);
1838 ASSERT_RANGE_EQ (tok
->range
,
1839 line_4_offset
+ 0, line_4
, 0,
1840 line_4_offset
+ 0, line_4
, 0);
1845 /* Helper class for writing JSON parsing testcases.
1846 Attempts to parse a string in ctor, and captures the result (either
1847 a json::value or a json::error), and a location map. */
1849 struct parser_testcase
/* Parse UTF8_STRING via parse_utf8_string, passing ALLOW_COMMENTS
through and handing it m_loc_map as the location map. Comments are
off unless the caller asks for them. */
1852 parser_testcase (const char *utf8_string
, bool allow_comments
= false)
1854 m_result (parse_utf8_string (utf8_string
, allow_comments
, &m_loc_map
))
/* Accessors for the two halves of the captured result, as non-owning
pointers. */
1858 const json::value
*get_value () const { return m_result
.m_val
.get (); }
1859 const json::error
*get_error () const { return m_result
.m_err
.get (); }
/* Forward the lookup for JV to m_loc_map. */
1861 const location_map::range
*
1862 get_range_for_value (const json::value
*jv
) const
1864 return m_loc_map
.get_range_for_value (jv
);
1868 /* Concrete implementation of location_map for use in
1869 JSON parsing selftests. */
1870 class test_location_map
: public location_map
1873 void record_range_for_value (json::value
*jv
, const range
&r
) final override
/* The lookup is a const member, so constness is cast away from m_map
before it is used. */
1878 range
*get_range_for_value (const json::value
*jv
) const
1880 return const_cast<hash_map
<const json::value
*, range
> &> (m_map
)
/* Ranges keyed by the json::value they belong to. */
1885 hash_map
<const json::value
*, range
> m_map
;
/* m_loc_map is declared ahead of m_result on purpose: members are
initialized in declaration order, and the initializer of m_result
is given &m_loc_map. */
1888 test_location_map m_loc_map
;
1889 json::parser_result_t m_result
;
1892 /* Verify that parse_utf8_string works as expected on JSON strings,
going through the parser_testcase helper: a plain ASCII string, a string
with escaped quotes, raw non-ASCII UTF-8, and \u escapes. */
1895 test_parse_string ()
1897 const int line_1
= 1;
/* Plain ASCII string. */
1900 parser_testcase
tc ("\"foo\"");
1901 ASSERT_EQ (tc
.get_error (), nullptr);
1902 const json::value
*jv
= tc
.get_value ();
1903 ASSERT_EQ (jv
->get_kind (), JSON_STRING
);
1904 ASSERT_STREQ (as_a
<const json::string
*> (jv
)->get_string (), "foo");
1905 ASSERT_PRINT_EQ (*jv
, true, "\"foo\"");
1906 auto range
= tc
.get_range_for_value (jv
);
1907 ASSERT_TRUE (range
);
1908 ASSERT_RANGE_EQ (*range
,
/* String containing backslash-escaped double quotes: get_string is
expected to return them unescaped, and printing to escape them again. */
1914 const char *contains_quotes
= "\"before \\\"quoted\\\" after\"";
1915 parser_testcase
tc (contains_quotes
);
1916 ASSERT_EQ (tc
.get_error (), nullptr);
1917 const json::value
*jv
= tc
.get_value ();
1918 ASSERT_EQ (jv
->get_kind (), JSON_STRING
);
1919 ASSERT_STREQ (as_a
<const json::string
*> (jv
)->get_string (),
1920 "before \"quoted\" after");
1921 ASSERT_PRINT_EQ (*jv
, true, contains_quotes
);
1922 auto range
= tc
.get_range_for_value (jv
);
1923 ASSERT_TRUE (range
);
1924 ASSERT_RANGE_EQ (*range
,
1929 /* Test of non-ASCII input. This string is the Japanese word "mojibake",
1930 written as C octal-escaped UTF-8. */
1931 const char *mojibake
= (/* Opening quote. */
1933 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
1934 UTF-8: 0xE6 0x96 0x87
1935 C octal escaped UTF-8: \346\226\207. */
1937 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
1938 UTF-8: 0xE5 0xAD 0x97
1939 C octal escaped UTF-8: \345\255\227. */
1941 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
1942 UTF-8: 0xE5 0x8C 0x96
1943 C octal escaped UTF-8: \345\214\226. */
1945 /* U+3051 HIRAGANA LETTER KE
1946 UTF-8: 0xE3 0x81 0x91
1947 C octal escaped UTF-8: \343\201\221. */
1949 /* Closing quote. */
1952 parser_testcase
tc (mojibake
);
1953 ASSERT_EQ (tc
.get_error (), nullptr);
1954 const json::value
*jv
= tc
.get_value ();
1955 ASSERT_EQ (jv
->get_kind (), JSON_STRING
);
1956 /* Result of get_string should be UTF-8 encoded, without quotes. */
1957 ASSERT_STREQ (as_a
<const json::string
*> (jv
)->get_string (),
1958 "\346\226\207" "\345\255\227" "\345\214\226" "\343\201\221");
1959 /* Result of dump should be UTF-8 encoded, with quotes. */
1960 ASSERT_PRINT_EQ (*jv
, false, mojibake
);
1961 auto range
= tc
.get_range_for_value (jv
);
1962 ASSERT_TRUE (range
);
1963 ASSERT_RANGE_EQ (*range
,
1968 /* Test of \u-escaped unicode. This is "mojibake" again, as above. */
1970 const char *escaped_unicode
= "\"\\u6587\\u5b57\\u5316\\u3051\"";
1971 parser_testcase
tc (escaped_unicode
);
1972 ASSERT_EQ (tc
.get_error (), nullptr);
1973 const json::value
*jv
= tc
.get_value ();
1974 ASSERT_EQ (jv
->get_kind (), JSON_STRING
);
1975 /* Result of get_string should be UTF-8 encoded, without quotes. */
1976 ASSERT_STREQ (as_a
<const json::string
*> (jv
)->get_string (),
1977 "\346\226\207" "\345\255\227" "\345\214\226" "\343\201\221");
1978 /* Result of dump should be UTF-8 encoded, with quotes. The expected
text is the raw UTF-8 spelling MOJIBAKE, not the \u-escaped input. */
1979 ASSERT_PRINT_EQ (*jv
, false, mojibake
);
1980 auto range
= tc
.get_range_for_value (jv
);
1981 ASSERT_TRUE (range
);
1982 ASSERT_RANGE_EQ (*range
,
1988 /* Verify that we can parse various kinds of JSON numbers: integers,
decimal fractions, and mantissas with an exponent. Each case checks the
kind of the resulting value and its numeric value. */
1991 test_parse_number ()
1993 const int line_1
= 1;
/* Non-negative integer.
NOTE(review): the expected value is written as the floating-point
literal 42.0 although the kind checked is JSON_INTEGER. */
1996 parser_testcase
tc ("42");
1997 ASSERT_EQ (tc
.get_error (), nullptr);
1998 const json::value
*jv
= tc
.get_value ();
1999 ASSERT_EQ (jv
->get_kind (), JSON_INTEGER
);
2000 ASSERT_EQ (as_a
<const json::integer_number
*> (jv
)->get (), 42.0);
2001 ASSERT_PRINT_EQ (*jv
, true, "42");
2002 auto range
= tc
.get_range_for_value (jv
);
2003 ASSERT_TRUE (range
);
2004 ASSERT_RANGE_EQ (*range
,
2009 /* Negative number. */
2011 parser_testcase
tc ("-17");
2012 ASSERT_EQ (tc
.get_error (), nullptr);
2013 const json::value
*jv
= tc
.get_value ();
2014 ASSERT_EQ (jv
->get_kind (), JSON_INTEGER
);
2015 ASSERT_EQ (as_a
<const json::integer_number
*> (jv
)->get (), -17.0);
2016 ASSERT_PRINT_EQ (*jv
, true, "-17");
2017 auto range
= tc
.get_range_for_value (jv
);
2018 ASSERT_TRUE (range
);
2019 ASSERT_RANGE_EQ (*range
,
/* Decimal fraction, compared with a tolerance of 0.001.
NOTE(review): this case passes the expected kind first and downcasts
with a C-style cast, unlike the as_a <> form of the other cases. */
2026 parser_testcase
tc ("3.141");
2027 ASSERT_EQ (tc
.get_error (), nullptr);
2028 const json::value
*jv
= tc
.get_value ();
2029 ASSERT_EQ (JSON_FLOAT
, jv
->get_kind ());
2030 ASSERT_NEAR (3.141, ((const json::float_number
*)jv
)->get (), 0.001);
2031 auto range
= tc
.get_range_for_value (jv
);
2032 ASSERT_TRUE (range
);
2033 ASSERT_RANGE_EQ (*range
,
/* Decimal fraction with an explicit exponent of +0. */
2041 parser_testcase
tc ("3.141e+0");
2042 ASSERT_EQ (tc
.get_error (), nullptr);
2043 const json::value
*jv
= tc
.get_value ();
2044 ASSERT_EQ (jv
->get_kind (), JSON_FLOAT
);
2045 ASSERT_NEAR (as_a
<const json::float_number
*> (jv
)->get (), 3.141, 0.1);
2046 auto range
= tc
.get_range_for_value (jv
);
2047 ASSERT_TRUE (range
);
2048 ASSERT_RANGE_EQ (*range
,
/* Integer mantissa with a positive exponent: expected to come back as
the JSON_INTEGER 4200. */
2053 parser_testcase
tc ("42e2");
2054 ASSERT_EQ (tc
.get_error (), nullptr);
2055 const json::value
*jv
= tc
.get_value ();
2056 ASSERT_EQ (jv
->get_kind (), JSON_INTEGER
);
2057 ASSERT_EQ (as_a
<const json::integer_number
*> (jv
)->get (), 4200);
2058 ASSERT_PRINT_EQ (*jv
, true, "4200");
2059 auto range
= tc
.get_range_for_value (jv
);
2060 ASSERT_TRUE (range
);
2061 ASSERT_RANGE_EQ (*range
,
/* Integer mantissa with a negative exponent: expected to come back as
a JSON_FLOAT close to 4.2. */
2066 parser_testcase
tc ("42e-1");
2067 ASSERT_EQ (tc
.get_error (), nullptr);
2068 const json::value
*jv
= tc
.get_value ();
2069 ASSERT_EQ (jv
->get_kind (), JSON_FLOAT
);
2070 ASSERT_NEAR (as_a
<const json::float_number
*> (jv
)->get (), 4.2, 0.1);
2071 auto range
= tc
.get_range_for_value (jv
);
2072 ASSERT_TRUE (range
);
2073 ASSERT_RANGE_EQ (*range
,
2080 /* Verify that JSON array parsing works, using an array of the ten
single-digit integers: the array itself, every element, and the printed
form are checked. */
2085 const int line_1
= 1;
2087 parser_testcase
tc ("[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]");
2088 ASSERT_EQ (tc
.get_error (), nullptr);
2089 const json::value
*jv
= tc
.get_value ();
2090 ASSERT_EQ (jv
->get_kind (), JSON_ARRAY
);
2091 const json::array
*arr
= as_a
<const json::array
*> (jv
);
2092 ASSERT_EQ (arr
->length (), 10);
2093 auto range
= tc
.get_range_for_value (jv
);
2094 ASSERT_TRUE (range
);
2095 ASSERT_RANGE_EQ (*range
,
/* Element I is expected to be the integer I. */
2098 for (int i
= 0; i
< 10; i
++)
2100 json::value
*element
= arr
->get (i
);
2101 ASSERT_EQ (element
->get_kind (), JSON_INTEGER
);
2102 ASSERT_EQ (as_a
<json::integer_number
*> (element
)->get (), i
);
2103 range
= tc
.get_range_for_value (element
);
2104 ASSERT_TRUE (range
);
/* Each element is one digit, preceded by "[" (for the first) or by
", " (for the rest), so element I sits at offset 1 + 3 * I and its
range starts and ends there. */
2105 const int offset
= 1 + (i
* 3);
2106 ASSERT_RANGE_EQ (*range
,
2107 offset
, line_1
, offset
,
2108 offset
, line_1
, offset
);
/* Printing the parsed array is expected to give back the input text. */
2110 ASSERT_PRINT_EQ (*jv
, false, "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]");
2113 /* Verify that JSON object parsing works, using an object with a
string member and an array member: the object, both members, and both
array elements are checked for kind and for a recorded range. */
2116 test_parse_object ()
2118 const int line_1
= 1;
/* NOTE(review): no line visible in this function refers to ERR; it
looks like a leftover - confirm before removing. */
2119 std::unique_ptr
<error
> err
;
2121 /* 01 2345 678 9012 345 6789 0123456789012. */
2122 parser_testcase
tc ("{\"foo\": \"bar\", \"baz\": [42, null]}");
2124 ASSERT_EQ (tc
.get_error (), nullptr);
2125 const json::value
*jv
= tc
.get_value ();
2126 ASSERT_NE (jv
, nullptr);
2127 ASSERT_EQ (jv
->get_kind (), JSON_OBJECT
);
2128 auto range
= tc
.get_range_for_value (jv
);
2129 ASSERT_TRUE (range
);
2130 ASSERT_RANGE_EQ (*range
,
2133 const json::object
*jo
= static_cast <const json::object
*> (jv
);
/* The "foo" member: the string "bar". */
2135 json::value
*foo_value
= jo
->get ("foo");
2136 ASSERT_NE (foo_value
, nullptr);
2137 ASSERT_EQ (foo_value
->get_kind (), JSON_STRING
);
2138 ASSERT_STREQ (as_a
<json::string
*> (foo_value
)->get_string (), "bar");
2139 range
= tc
.get_range_for_value (foo_value
);
2140 ASSERT_TRUE (range
);
2141 ASSERT_RANGE_EQ (*range
,
/* The "baz" member: an array of two elements. */
2145 json::value
*baz_value
= jo
->get ("baz");
2146 ASSERT_NE (baz_value
, nullptr);
2147 ASSERT_EQ (baz_value
->get_kind (), JSON_ARRAY
);
2148 range
= tc
.get_range_for_value (baz_value
);
2149 ASSERT_TRUE (range
);
2150 ASSERT_RANGE_EQ (*range
,
2154 json::array
*baz_array
= as_a
<json::array
*> (baz_value
);
2155 ASSERT_EQ (baz_array
->length (), 2);
/* First element of "baz": the integer 42. */
2157 json::value
*element0
= baz_array
->get (0);
2158 ASSERT_EQ (as_a
<json::integer_number
*> (element0
)->get (), 42);
2159 range
= tc
.get_range_for_value (element0
);
2160 ASSERT_TRUE (range
);
2161 ASSERT_RANGE_EQ (*range
,
/* Second element of "baz": null. */
2165 json::value
*element1
= baz_array
->get (1);
2166 ASSERT_EQ (element1
->get_kind (), JSON_NULL
);
2167 range
= tc
.get_range_for_value (element1
);
2168 ASSERT_TRUE (range
);
2169 ASSERT_RANGE_EQ (*range
,
2174 /* Verify that the JSON literals "true", "false" and "null" are parsed correctly. */
2178 test_parse_literals ()
2180 const int line_1
= 1;
/* Each literal is parsed on its own, then checked for the kind of the
resulting value, its printed form, and the presence of a range. */
/* The literal "true". */
2182 parser_testcase
tc ("true");
2183 ASSERT_EQ (tc
.get_error (), nullptr);
2184 const json::value
*jv
= tc
.get_value ();
2185 ASSERT_NE (jv
, nullptr);
2186 ASSERT_EQ (jv
->get_kind (), JSON_TRUE
);
2187 ASSERT_PRINT_EQ (*jv
, false, "true");
2188 auto range
= tc
.get_range_for_value (jv
);
2189 ASSERT_TRUE (range
);
2190 ASSERT_RANGE_EQ (*range
,
/* The literal "false". */
2196 parser_testcase
tc ("false");
2197 ASSERT_EQ (tc
.get_error (), nullptr);
2198 const json::value
*jv
= tc
.get_value ();
2199 ASSERT_NE (jv
, nullptr);
2200 ASSERT_EQ (jv
->get_kind (), JSON_FALSE
);
2201 ASSERT_PRINT_EQ (*jv
, false, "false");
2202 auto range
= tc
.get_range_for_value (jv
);
2203 ASSERT_TRUE (range
);
2204 ASSERT_RANGE_EQ (*range
,
/* The literal "null". */
2210 parser_testcase
tc ("null");
2211 ASSERT_EQ (tc
.get_error (), nullptr);
2212 const json::value
*jv
= tc
.get_value ();
2213 ASSERT_NE (jv
, nullptr);
2214 ASSERT_EQ (jv
->get_kind (), JSON_NULL
);
2215 ASSERT_PRINT_EQ (*jv
, false, "null");
2216 auto range
= tc
.get_range_for_value (jv
);
2217 ASSERT_TRUE (range
);
2218 ASSERT_RANGE_EQ (*range
,
2224 /* Verify that we can parse a simple JSON-RPC request. The request
is split over two lines, so the expected range of the top-level value
ends on line 2. */
2227 test_parse_jsonrpc ()
/* NOTE(review): no line visible in this function refers to ERR; it
looks like a leftover - confirm before removing. */
2229 std::unique_ptr
<error
> err
;
2232 /* 01 23456789 012 3456 789 0123456 789 012345678 90. */
2233 = ("{\"jsonrpc\": \"2.0\", \"method\": \"subtract\",\n"
2235 /* 0 1234567 8901234567890 1234 56789012345678 90. */
2236 " \"params\": [42, 23], \"id\": 1}");
2237 const int line_1
= 1;
2238 const int line_2
= 2;
/* The first line of the request is 41 characters long including its
newline, so line 2 starts at offset 41. */
2239 const size_t line_2_offset
= 41;
2240 parser_testcase
tc (request
);
2241 ASSERT_EQ (tc
.get_error (), nullptr);
2242 const json::value
*jv
= tc
.get_value ();
2243 ASSERT_NE (jv
, nullptr);
/* The range of the top-level value is expected to end at line 2,
column 28. */
2244 auto range
= tc
.get_range_for_value (jv
);
2245 ASSERT_TRUE (range
);
2246 ASSERT_RANGE_EQ (*range
,
2248 line_2_offset
+ 28, line_2
, 28);
2251 /* Verify that we can parse an empty JSON object: "{}" must yield a
JSON_OBJECT that prints back as "{}" and has a recorded range. */
2254 test_parse_empty_object ()
2256 const int line_1
= 1;
/* NOTE(review): no line visible in this function refers to ERR; it
looks like a leftover - confirm before removing. */
2257 std::unique_ptr
<error
> err
;
2258 parser_testcase
tc ("{}");
2259 ASSERT_EQ (tc
.get_error (), nullptr);
2260 const json::value
*jv
= tc
.get_value ();
2261 ASSERT_NE (jv
, nullptr);
2262 ASSERT_EQ (jv
->get_kind (), JSON_OBJECT
);
2263 ASSERT_PRINT_EQ (*jv
, true, "{}");
2264 auto range
= tc
.get_range_for_value (jv
);
2265 ASSERT_TRUE (range
);
2266 ASSERT_RANGE_EQ (*range
,
2271 /* Verify that comment-parsing can be enabled or disabled. */
2274 test_parsing_comments ()
2276 const char *str
= ("// foo\n"
2281 /* Parsing with comment support disabled. */
2283 parser_testcase
tc (str
);
2284 ASSERT_NE (tc
.get_error (), nullptr);
2285 ASSERT_STREQ (tc
.get_error ()->get_msg (),
2286 "invalid JSON token: unexpected character: '/'");
2287 ASSERT_EQ (tc
.get_value (), nullptr);
2290 /* Parsing with comment support enabled. */
2292 parser_testcase
tc (str
, true);
2293 ASSERT_EQ (tc
.get_error (), nullptr);
2294 const json::value
*jv
= tc
.get_value ();
2295 ASSERT_NE (jv
, nullptr);
2296 ASSERT_EQ (jv
->get_kind (), JSON_INTEGER
);
/* NOTE(review): C-style downcast; other tests in this file use
as_a <> for the same purpose. */
2297 ASSERT_EQ (((const json::integer_number
*)jv
)->get (), 42);
2301 /* Verify that JSON parsing gracefully handles an empty input string:
an error is reported and no value is produced. */
2304 test_error_empty_string ()
2306 const int line_1
= 1;
2307 parser_testcase
tc ("");
2308 ASSERT_ERR_EQ (tc
.get_error (),
2311 "expected a JSON value but got EOF");
/* A failed parse must not yield a value. */
2312 ASSERT_EQ (tc
.get_value (), nullptr);
2315 /* Verify that JSON parsing gracefully handles an invalid token: the
error names the first unexpected character and no value is produced. */
2318 test_error_bad_token ()
2320 const int line_1
= 1;
2321 parser_testcase
tc (" not valid ");
2322 ASSERT_ERR_EQ (tc
.get_error (),
2325 "invalid JSON token: unexpected character: 'n'");
/* A failed parse must not yield a value. */
2326 ASSERT_EQ (tc
.get_value (), nullptr);
2329 /* Verify that JSON parsing gracefully handles a missing comma
2330 within an object. */
2333 test_error_object_with_missing_comma ()
2335 const int line_1
= 1;
2337 /* 01 2345 6789012 3456 7890. */
/* No comma separates the value 42 from the following key "bar". */
2338 const char *json
= "{\"foo\" : 42 \"bar\"";
2339 parser_testcase
tc (json
);
2340 ASSERT_ERR_EQ (tc
.get_error (),
2343 "expected ',' or '}'; got string");
/* A failed parse must not yield a value. */
2344 ASSERT_EQ (tc
.get_value (), nullptr);
2347 /* Verify that JSON parsing gracefully handles a missing comma within an array. */
2351 test_error_array_with_missing_comma ()
2353 const int line_1
= 1;
/* No comma separates the elements 1 and 42. */
2355 const char *json
= "[0, 1 42]";
2356 parser_testcase
tc (json
);
2357 ASSERT_ERR_EQ (tc
.get_error (),
2360 "expected ',' or ']'; got number");
/* A failed parse must not yield a value. */
2361 ASSERT_EQ (tc
.get_value (), nullptr);
2364 /* Run all of the selftests within this file. */
2367 json_parser_cc_tests ()
/* Lexer-level handling of comments, with support off and on. */
2370 test_lexing_unsupported_single_line_comment ();
2371 test_lexing_unsupported_multiline_comment ();
2372 test_lexing_supported_single_line_comment ();
2373 test_lexing_supported_multiline_comment ();
/* Parsing of well-formed input, one test per kind of value. */
2374 test_parse_string ();
2375 test_parse_number ();
2376 test_parse_array ();
2377 test_parse_object ();
2378 test_parse_literals ();
2379 test_parse_jsonrpc ();
2380 test_parse_empty_object ();
/* Parser-level switch for comment support. */
2381 test_parsing_comments ();
/* Error reporting on malformed input. */
2382 test_error_empty_string ();
2383 test_error_bad_token ();
2384 test_error_object_with_missing_comma ();
2385 test_error_array_with_missing_comma ();
2388 } // namespace selftest
2390 #endif /* #if CHECKING_P */