gcc/json-parsing.cc

   1 /* JSON parsing
   2    Copyright (C) 2017-2025 Free Software Foundation, Inc.
   3    Contributed by David Malcolm <dmalcolm@redhat.com>.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "json-parsing.h"
  25 #include "pretty-print.h"
  26 #include "math.h"
  27 #include "make-unique.h"
  28 #include "selftest.h"
  29
  30 using namespace json;
  31
  32 /* Declarations relating to parsing JSON, all within an
  33    anonymous namespace.  */
  34
  35 namespace {
  36
  37 /* A typedef representing a single unicode character.  */
  38
  39 typedef unsigned unichar;
  40
  41 /* An enum for discriminating different kinds of JSON token.  */
  42
  43 enum token_id
  44 {
  45   TOK_ERROR,
  46
  47   TOK_EOF,
  48
  49   /* Punctuation.  */
  50   TOK_OPEN_SQUARE,
  51   TOK_OPEN_CURLY,
  52   TOK_CLOSE_SQUARE,
  53   TOK_CLOSE_CURLY,
  54   TOK_COLON,
  55   TOK_COMMA,
  56
  57   /* Literal names.  */
  58   TOK_TRUE,
  59   TOK_FALSE,
  60   TOK_NULL,
  61
  62   TOK_STRING,
  63   TOK_FLOAT_NUMBER,
  64   TOK_INTEGER_NUMBER
  65 };
  66
  67 /* Human-readable descriptions of enum token_id.  */
  68
  69 static const char *token_id_name[] = {
  70   "error",
  71   "EOF",
  72   "'['",
  73   "'{'",
  74   "']'",
  75   "'}'",
  76   "':'",
  77   "','",
  78   "'true'",
  79   "'false'",
  80   "'null'",
  81   "string",
  82   "number",
  83   "number"
  84 };
  85
/* Tokens within the JSON lexer.
   ID selects which member of the union U (if any) is meaningful; the
   punctuation, literal-name and EOF tokens carry no value.  */

struct token
{
  /* The kind of token.  */
  enum token_id id;

  /* The location of this token within the unicode
     character stream.  */
  location_map::range range;

  union
  {
    /* Value for TOK_ERROR and TOK_STRING.
       For TOK_ERROR this is the error message.
       lexer::consume calls free on this pointer for both of these
       token kinds.  */
    char *string;

    /* Value for TOK_FLOAT_NUMBER.  */
    double float_number;

    /* Value for TOK_INTEGER_NUMBER.  */
    long integer_number;
  } u;
};
 109
/* A class for lexing JSON.
   UTF-8 input is added via add_utf8, which decodes it into a buffer of
   unicode characters; tokens are then read from that buffer one at a
   time via peek and consume.  */

class lexer
{
 public:
  /* SUPPORT_COMMENTS controls whether lex_token looks for comments
     introduced by '/'.  */
  lexer (bool support_comments);
  ~lexer ();

  /* Decode LENGTH bytes of UTF-8 from UTF8_BUF, appending the result to
     the buffer.  Return null if successful, or the error.  */
  std::unique_ptr<error> add_utf8 (size_t length, const char *utf8_buf);

  /* Get the next token without consuming it, lexing it if it hasn't
     been lexed yet.  */
  const token *peek ();

  /* Discard the next token, freeing the string of a TOK_ERROR or
     TOK_STRING.  */
  void consume ();

 private:
  /* Character-level access to the buffer, tracking line and column.  */
  bool get_char (unichar &out_char, location_map::point *out_point);
  void unget_char ();
  location_map::point get_next_point () const;

  /* Debugging aid.  */
  static void dump_token (FILE *outf, const token *tok);

  /* Token-level lexing; each writes its result (or TOK_ERROR) to OUT.  */
  void lex_token (token *out);
  void lex_string (token *out);
  void lex_number (token *out, unichar first_char);
  bool rest_of_literal (token *out, const char *suffix);

  /* Make an error for MSG located at the next character.  */
  std::unique_ptr<error> make_error (const char *msg);

  /* Comment skipping; each returns true if it populated *OUT because
     the input ran out.  */
  bool consume_single_line_comment (token *out);
  bool consume_multiline_comment (token *out);

 private:
  /* The decoded input, one unicode character per element.  */
  auto_vec<unichar> m_buffer;

  /* Index within m_buffer, line and column of the next character that
     get_char will return.  The ctor starts these at 0, 1 and 0
     respectively.  */
  int m_next_char_idx;
  int m_next_char_line;
  int m_next_char_column;
  int m_prev_line_final_column; /* for handling unget_char after a '\n'.  */

  /* Lookahead: the tokens that have been lexed but not yet consumed.
     m_num_next_tokens is how many elements of m_next_tokens are in
     use.  */
  static const int MAX_TOKENS = 1;
  token m_next_tokens[MAX_TOKENS];
  int m_num_next_tokens;

  /* The SUPPORT_COMMENTS value passed to the ctor.  */
  bool m_support_comments;
};
 150
/* A class for parsing JSON.
   Owns a lexer, and optionally reports to a location_map that is owned
   by the caller.  */

class parser
{
 public:
  /* OUT_LOC_MAP can be null; if non-null its on_finished_parsing hook
     is called by the dtor.  SUPPORT_COMMENTS is passed on to the
     lexer.  */
  parser (location_map *out_loc_map,
          bool support_comments);
  ~parser ();

  /* Add LENGTH bytes of UTF-8 from UTF8_BUF to the lexer's buffer.
     Return null if successful, or the error.  */
  std::unique_ptr<error>
  add_utf8 (size_t length, const char *utf8_buf);

  /* Recursive-descent entry points.  DEPTH is the current nesting
     depth; parse_value fails with an error once it reaches its
     MAX_DEPTH limit.  */
  parser_result_t parse_value (int depth);
  parser_result_t parse_object (int depth);
  parser_result_t parse_array (int depth);

  /* NOTE(review): presumably checks that no tokens remain after the
     top-level value; the definition is outside this part of the
     file.  */
  std::unique_ptr<error>
  require_eof ();

 private:
  location_map::point get_next_token_start ();
  location_map::point get_next_token_end ();

  /* NOTE(review): presumably these consume the next token if it has
     the given kind(s) and otherwise return an error; confirm against
     the definitions.  */
  std::unique_ptr<error>
  require (enum token_id tok_id);

  result<enum token_id, std::unique_ptr<error>>
  require_one_of (enum token_id tok_id_a, enum token_id tok_id_b);

  /* Make an error located at R, with a printf-style message.  */
  std::unique_ptr<error>
  error_at (const location_map::range &r,
            const char *fmt, ...) ATTRIBUTE_PRINTF_3;

  /* Called by parse_value with each value it creates and the range of
     the token it came from.  NOTE(review): presumably a no-op when
     m_loc_map is null; confirm against the definitions.  */
  void maybe_record_range (json::value *jv, const location_map::range &r);
  void maybe_record_range (json::value *jv,
                           const location_map::point &start,
                           const location_map::point &end);

 private:
  /* The source of tokens.  */
  lexer m_lexer;

  /* Borrowed pointer passed to the ctor; can be null.  */
  location_map *m_loc_map;
};
 193
 194 } // anonymous namespace for parsing implementation
 195
 196 /* Parser implementation.  */
 197
 198 /* lexer's ctor.  */
 199
 200 lexer::lexer (bool support_comments)
 201 : m_buffer (), m_next_char_idx (0),
 202   m_next_char_line (1), m_next_char_column (0),
 203   m_prev_line_final_column (-1),
 204   m_num_next_tokens (0),
 205   m_support_comments (support_comments)
 206 {
 207 }
 208
 209 /* lexer's dtor.  */
 210
 211 lexer::~lexer ()
 212 {
 213   while (m_num_next_tokens > 0)
 214     consume ();
 215 }
 216
 217 /* Peek the next token.  */
 218
 219 const token *
 220 lexer::peek ()
 221 {
 222   if (m_num_next_tokens == 0)
 223     {
 224       lex_token (&m_next_tokens[0]);
 225       m_num_next_tokens++;
 226     }
 227   return &m_next_tokens[0];
 228 }
 229
 230 /* Consume the next token.  */
 231
 232 void
 233 lexer::consume ()
 234 {
 235   if (m_num_next_tokens == 0)
 236     peek ();
 237
 238   gcc_assert (m_num_next_tokens > 0);
 239   gcc_assert (m_num_next_tokens <= MAX_TOKENS);
 240
 241   if (0)
 242     {
 243       fprintf (stderr, "consuming token: ");
 244       dump_token (stderr, &m_next_tokens[0]);
 245       fprintf (stderr, "\n");
 246     }
 247
 248   if (m_next_tokens[0].id == TOK_ERROR
 249       || m_next_tokens[0].id == TOK_STRING)
 250     free (m_next_tokens[0].u.string);
 251
 252   m_num_next_tokens--;
 253   memmove (&m_next_tokens[0], &m_next_tokens[1],
 254            sizeof (token) * m_num_next_tokens);
 255 }
 256
 257 /* Add LENGTH bytes of UTF-8 encoded text from UTF8_BUF to this lexer's
 258    buffer.
 259    Return null if successful, or the error if there was a problem.  */
 260
 261 std::unique_ptr<error>
 262 lexer::add_utf8 (size_t length, const char *utf8_buf)
 263 {
 264   /* Adapted from charset.c:one_utf8_to_cppchar.  */
 265   static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
 266   static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 267
 268   const uchar *inbuf = (const unsigned char *) (utf8_buf);
 269   const uchar **inbufp = &inbuf;
 270   size_t *inbytesleftp = &length;
 271
 272   while (length > 0)
 273     {
 274       unichar c;
 275       const uchar *inbuf = *inbufp;
 276       size_t nbytes, i;
 277
 278       c = *inbuf;
 279       if (c < 0x80)
 280         {
 281           m_buffer.safe_push (c);
 282           *inbytesleftp -= 1;
 283           *inbufp += 1;
 284           continue;
 285         }
 286
 287       /* The number of leading 1-bits in the first byte indicates how many
 288          bytes follow.  */
 289       for (nbytes = 2; nbytes < 7; nbytes++)
 290         if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
 291           goto found;
 292       return make_error ("ill-formed UTF-8 sequence");
 293     found:
 294
 295       if (*inbytesleftp < nbytes)
 296         return make_error ("ill-formed UTF-8 sequence");
 297
 298       c = (c & masks[nbytes-1]);
 299       inbuf++;
 300       for (i = 1; i < nbytes; i++)
 301         {
 302           unichar n = *inbuf++;
 303           if ((n & 0xC0) != 0x80)
 304             return make_error ("ill-formed UTF-8 sequence");
 305           c = ((c << 6) + (n & 0x3F));
 306         }
 307
 308       /* Make sure the shortest possible encoding was used.  */
 309       if ((   c <=      0x7F && nbytes > 1)
 310           || (c <=     0x7FF && nbytes > 2)
 311           || (c <=    0xFFFF && nbytes > 3)
 312           || (c <=  0x1FFFFF && nbytes > 4)
 313           || (c <= 0x3FFFFFF && nbytes > 5))
 314         return make_error ("ill-formed UTF-8:"
 315                            " shortest possible encoding not used");
 316
 317       /* Make sure the character is valid.  */
 318       if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF))
 319         return make_error ("ill-formed UTF-8: invalid character");
 320
 321       m_buffer.safe_push (c);
 322       *inbufp = inbuf;
 323       *inbytesleftp -= nbytes;
 324     }
 325   return nullptr;
 326 }
 327
 328 /* Attempt to get the next unicode character from this lexer's buffer.
 329    If successful, write it to OUT_CHAR, and its location to *OUT_POINT,
 330    and return true.
 331    Otherwise, return false.  */
 332
 333 bool
 334 lexer::get_char (unichar &out_char, location_map::point *out_point)
 335 {
 336   if (m_next_char_idx >= (int)m_buffer.length ())
 337     return false;
 338
 339   if (out_point)
 340     *out_point = get_next_point ();
 341   out_char = m_buffer[m_next_char_idx++];
 342
 343   if (out_char == '\n')
 344     {
 345       m_next_char_line++;
 346       m_prev_line_final_column = m_next_char_column;
 347       m_next_char_column = 0;
 348     }
 349   else
 350     m_next_char_column++;
 351
 352   return true;
 353 }
 354
 355 /* Undo the last successful get_char.  */
 356
 357 void
 358 lexer::unget_char ()
 359 {
 360   --m_next_char_idx;
 361   if (m_next_char_column > 0)
 362     --m_next_char_column;
 363   else
 364     {
 365       m_next_char_line--;
 366       m_next_char_column = m_prev_line_final_column;
 367       /* We don't support more than one unget_char in a row.  */
 368       gcc_assert (m_prev_line_final_column != -1);
 369       m_prev_line_final_column = -1;
 370     }
 371 }
 372
 373 /* Get the location of the next char.  */
 374
 375 location_map::point
 376 lexer::get_next_point () const
 377 {
 378   location_map::point result;
 379   result.m_unichar_idx = m_next_char_idx;
 380   result.m_line = m_next_char_line;
 381   result.m_column = m_next_char_column;
 382   return result;
 383 }
 384
 385 /* Print a textual representation of TOK to OUTF.
 386    This is intended for debugging the lexer and parser,
 387    rather than for user-facing output.  */
 388
 389 void
 390 lexer::dump_token (FILE *outf, const token *tok)
 391 {
 392   switch (tok->id)
 393     {
 394     case TOK_ERROR:
 395       fprintf (outf, "TOK_ERROR (\"%s\")", tok->u.string);
 396       break;
 397
 398     case TOK_EOF:
 399       fprintf (outf, "TOK_EOF");
 400       break;
 401
 402     case TOK_OPEN_SQUARE:
 403       fprintf (outf, "TOK_OPEN_SQUARE");
 404       break;
 405
 406     case TOK_OPEN_CURLY:
 407       fprintf (outf, "TOK_OPEN_CURLY");
 408       break;
 409
 410     case TOK_CLOSE_SQUARE:
 411       fprintf (outf, "TOK_CLOSE_SQUARE");
 412       break;
 413
 414     case TOK_CLOSE_CURLY:
 415       fprintf (outf, "TOK_CLOSE_CURLY");
 416       break;
 417
 418     case TOK_COLON:
 419       fprintf (outf, "TOK_COLON");
 420       break;
 421
 422     case TOK_COMMA:
 423       fprintf (outf, "TOK_COMMA");
 424       break;
 425
 426     case TOK_TRUE:
 427       fprintf (outf, "TOK_TRUE");
 428       break;
 429
 430     case TOK_FALSE:
 431       fprintf (outf, "TOK_FALSE");
 432       break;
 433
 434     case TOK_NULL:
 435       fprintf (outf, "TOK_NULL");
 436       break;
 437
 438     case TOK_STRING:
 439       fprintf (outf, "TOK_STRING (\"%s\")", tok->u.string);
 440       break;
 441
 442     case TOK_FLOAT_NUMBER:
 443       fprintf (outf, "TOK_FLOAT_NUMBER (%f)", tok->u.float_number);
 444       break;
 445
 446     case TOK_INTEGER_NUMBER:
 447       fprintf (outf, "TOK_INTEGER_NUMBER (%ld)", tok->u.integer_number);
 448       break;
 449
 450     default:
 451       gcc_unreachable ();
 452       break;
 453     }
 454 }
 455
 456 /* Treat "//" as a comment to the end of the line.
 457
 458    This isn't compliant with the JSON spec,
 459    but is very handy for writing DejaGnu tests.
 460
 461    Return true if EOF and populate *OUT, false otherwise.  */
 462
 463 bool
 464 lexer::consume_single_line_comment (token *out)
 465 {
 466   while (1)
 467     {
 468       unichar next_char;
 469       if (!get_char (next_char, nullptr))
 470         {
 471           out->id = TOK_EOF;
 472           location_map::point p = get_next_point ();
 473           out->range.m_start = p;
 474           out->range.m_end = p;
 475           return true;
 476         }
 477       if (next_char == '\n')
 478         return false;
 479     }
 480 }
 481
 482 /* Treat '/' '*' as a multiline comment until the next closing '*' '/'.
 483
 484    This isn't compliant with the JSON spec,
 485    but is very handy for writing DejaGnu tests.
 486
 487    Return true if EOF and populate *OUT, false otherwise.  */
 488
 489 bool
 490 lexer::consume_multiline_comment (token *out)
 491 {
 492   while (1)
 493     {
 494       unichar next_char;
 495       if (!get_char (next_char, nullptr))
 496         {
 497           out->id = TOK_ERROR;
 498           gcc_unreachable (); // TODO
 499           location_map::point p = get_next_point ();
 500           out->range.m_start = p;
 501           out->range.m_end = p;
 502           return true;
 503         }
 504       if (next_char != '*')
 505         continue;
 506       if (!get_char (next_char, nullptr))
 507         {
 508           out->id = TOK_ERROR;
 509           gcc_unreachable (); // TODO
 510           location_map::point p = get_next_point ();
 511           out->range.m_start = p;
 512           out->range.m_end = p;
 513           return true;
 514         }
 515       if (next_char == '/')
 516         return false;
 517     }
 518 }
 519
 520 /* Attempt to lex the input buffer, writing the next token to OUT.
 521    On errors, TOK_ERROR (or TOK_EOF) is written to OUT.  */
 522
 523 void
 524 lexer::lex_token (token *out)
 525 {
 526   /* Skip to next non-whitespace char.  */
 527   unichar next_char;
 528   location_map::point start_point;
 529   while (1)
 530     {
 531       if (!get_char (next_char, &start_point))
 532         {
 533           out->id = TOK_EOF;
 534           location_map::point p = get_next_point ();
 535           out->range.m_start = p;
 536           out->range.m_end = p;
 537           return;
 538         }
 539       if (m_support_comments)
 540         if (next_char == '/')
 541           {
 542             location_map::point point;
 543             unichar next_next_char;
 544             if (get_char (next_next_char, &point))
 545               {
 546                 switch (next_next_char)
 547                   {
 548                   case '/':
 549                     if (consume_single_line_comment (out))
 550                       return;
 551                     continue;
 552                   case '*':
 553                     if (consume_multiline_comment (out))
 554                       return;
 555                     continue;
 556                   default:
 557                     /* A stray single '/'.  Break out of loop, so that we
 558                        handle it below as an unexpected character.  */
 559                     goto non_whitespace;
 560                   }
 561               }
 562           }
 563       if (next_char != ' '
 564           && next_char != '\t'
 565           && next_char != '\n'
 566           && next_char != '\r')
 567         break;
 568     }
 569
 570  non_whitespace:
 571
 572   out->range.m_start = start_point;
 573   out->range.m_end = start_point;
 574
 575   switch (next_char)
 576     {
 577     case '[':
 578       out->id = TOK_OPEN_SQUARE;
 579       break;
 580
 581     case '{':
 582       out->id = TOK_OPEN_CURLY;
 583       break;
 584
 585     case ']':
 586       out->id = TOK_CLOSE_SQUARE;
 587       break;
 588
 589     case '}':
 590       out->id = TOK_CLOSE_CURLY;
 591       break;
 592
 593     case ':':
 594       out->id = TOK_COLON;
 595       break;
 596
 597     case ',':
 598       out->id = TOK_COMMA;
 599       break;
 600
 601     case '"':
 602       lex_string (out);
 603       break;
 604
 605     case '-':
 606     case '0':
 607     case '1':
 608     case '2':
 609     case '3':
 610     case '4':
 611     case '5':
 612     case '6':
 613     case '7':
 614     case '8':
 615     case '9':
 616       lex_number (out, next_char);
 617       break;
 618
 619     case 't':
 620       /* Handle literal "true".  */
 621       if (rest_of_literal (out, "rue"))
 622         {
 623           out->id = TOK_TRUE;
 624           break;
 625         }
 626       else
 627         goto err;
 628
 629     case 'f':
 630       /* Handle literal "false".  */
 631       if (rest_of_literal (out, "alse"))
 632         {
 633           out->id = TOK_FALSE;
 634           break;
 635         }
 636       else
 637         goto err;
 638
 639     case 'n':
 640       /* Handle literal "null".  */
 641       if (rest_of_literal (out, "ull"))
 642         {
 643           out->id = TOK_NULL;
 644           break;
 645         }
 646       else
 647         goto err;
 648
 649     err:
 650     default:
 651       out->id = TOK_ERROR;
 652       out->u.string = xasprintf ("unexpected character: '%c'", next_char);
 653       break;
 654     }
 655 }
 656
 657 /* Having consumed an open-quote character from the lexer's buffer, attempt
 658    to lex the rest of a JSON string, writing the result to OUT (or TOK_ERROR)
 659    if an error occurred.
 660    (ECMA-404 section 9; RFC 7159 section 7).  */
 661
 662 void
 663 lexer::lex_string (token *out)
 664 {
 665   auto_vec<unichar> content;
 666   bool still_going = true;
 667   while (still_going)
 668     {
 669       unichar uc;
 670       if (!get_char (uc, &out->range.m_end))
 671         {
 672           out->id = TOK_ERROR;
 673           out->range.m_end = get_next_point ();
 674           out->u.string = xstrdup ("EOF within string");
 675           return;
 676         }
 677       switch (uc)
 678         {
 679         case '"':
 680           still_going = false;
 681           break;
 682         case '\\':
 683           {
 684             unichar next_char;
 685             if (!get_char (next_char, &out->range.m_end))
 686               {
 687                 out->id = TOK_ERROR;
 688                 out->range.m_end = get_next_point ();
 689                 out->u.string = xstrdup ("EOF within string");;
 690                 return;
 691               }
 692             switch (next_char)
 693               {
 694               case '"':
 695               case '\\':
 696               case '/':
 697                 content.safe_push (next_char);
 698                 break;
 699
 700               case 'b':
 701                 content.safe_push ('\b');
 702                 break;
 703
 704               case 'f':
 705                 content.safe_push ('\f');
 706                 break;
 707
 708               case 'n':
 709                 content.safe_push ('\n');
 710                 break;
 711
 712               case 'r':
 713                 content.safe_push ('\r');
 714                 break;
 715
 716               case 't':
 717                 content.safe_push ('\t');
 718                 break;
 719
 720               case 'u':
 721                 {
 722                   unichar result = 0;
 723                   for (int i = 0; i < 4; i++)
 724                     {
 725                       unichar hexdigit;
 726                       if (!get_char (hexdigit, &out->range.m_end))
 727                         {
 728                           out->id = TOK_ERROR;
 729                           out->range.m_end = get_next_point ();
 730                           out->u.string = xstrdup ("EOF within string");
 731                           return;
 732                         }
 733                       result <<= 4;
 734                       if (hexdigit >= '0' && hexdigit <= '9')
 735                         result += hexdigit - '0';
 736                       else if (hexdigit >= 'a' && hexdigit <= 'f')
 737                         result += (hexdigit - 'a') + 10;
 738                       else if (hexdigit >= 'A' && hexdigit <= 'F')
 739                         result += (hexdigit - 'A') + 10;
 740                       else
 741                         {
 742                           out->id = TOK_ERROR;
 743                           out->range.m_start = out->range.m_end;
 744                           out->u.string = xstrdup ("bogus hex char");
 745                           return;
 746                         }
 747                     }
 748                   content.safe_push (result);
 749                 }
 750                 break;
 751
 752               default:
 753                 out->id = TOK_ERROR;
 754                 out->u.string = xstrdup ("unrecognized escape char");
 755                 return;
 756               }
 757           }
 758           break;
 759
 760         default:
 761           /* Reject unescaped control characters U+0000 through U+001F
 762              (ECMA-404 section 9 para 1; RFC 7159 section 7 para 1).  */
 763           if (uc <= 0x1f)
 764             {
 765                 out->id = TOK_ERROR;
 766                 out->range.m_start = out->range.m_end;
 767                 out->u.string = xstrdup ("unescaped control char");
 768                 return;
 769             }
 770
 771           /* Otherwise, add regular unicode code point.  */
 772           content.safe_push (uc);
 773           break;
 774         }
 775     }
 776
 777   out->id = TOK_STRING;
 778
 779   auto_vec<char> utf8_buf;
 780   // Adapted from libcpp/charset.c:one_cppchar_to_utf8
 781   for (unsigned i = 0; i < content.length (); i++)
 782     {
 783       static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 784       static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
 785       size_t nbytes;
 786       uchar buf[6], *p = &buf[6];
 787       unichar c = content[i];
 788
 789       nbytes = 1;
 790       if (c < 0x80)
 791         *--p = c;
 792       else
 793         {
 794           do
 795             {
 796               *--p = ((c & 0x3F) | 0x80);
 797               c >>= 6;
 798               nbytes++;
 799             }
 800           while (c >= 0x3F || (c & limits[nbytes-1]));
 801           *--p = (c | masks[nbytes-1]);
 802         }
 803
 804       while (p < &buf[6])
 805         utf8_buf.safe_push (*p++);
 806     }
 807
 808   out->u.string = XNEWVEC (char, utf8_buf.length () + 1);
 809   for (unsigned i = 0; i < utf8_buf.length (); i++)
 810     out->u.string[i] = utf8_buf[i];
 811   out->u.string[utf8_buf.length ()] = '\0';
 812 }
 813
 814 /* Having consumed FIRST_CHAR, an initial digit or '-' character from
 815    the lexer's buffer attempt to lex the rest of a JSON number, writing
 816    the result to OUT (or TOK_ERROR) if an error occurred.
 817    (ECMA-404 section 8; RFC 7159 section 6).  */
 818
 819 void
 820 lexer::lex_number (token *out, unichar first_char)
 821 {
 822   bool negate = false;
 823   double value = 0.0;
 824   if (first_char == '-')
 825     {
 826       negate = true;
 827       if (!get_char (first_char, &out->range.m_end))
 828         {
 829           out->id = TOK_ERROR;
 830           out->range.m_start = out->range.m_end;
 831           out->u.string = xstrdup ("expected digit");
 832           return;
 833         }
 834     }
 835
 836   if (first_char == '0')
 837     value = 0.0;
 838   else if (!ISDIGIT (first_char))
 839     {
 840       out->id = TOK_ERROR;
 841       out->range.m_start = out->range.m_end;
 842       out->u.string = xstrdup ("expected digit");
 843       return;
 844     }
 845   else
 846     {
 847       /* Got a nonzero digit; expect zero or more digits.  */
 848       value = first_char - '0';
 849       while (1)
 850         {
 851           unichar uc;
 852           location_map::point point;
 853           if (!get_char (uc, &point))
 854             break;
 855           if (ISDIGIT (uc))
 856             {
 857               value *= 10;
 858               value += uc -'0';
 859               out->range.m_end = point;
 860               continue;
 861             }
 862           else
 863             {
 864               unget_char ();
 865               break;
 866             }
 867         }
 868     }
 869
 870   /* Optional '.', followed by one or more decimals.  */
 871   unichar next_char;
 872   location_map::point point;
 873   if (get_char (next_char, &point))
 874     {
 875       if (next_char == '.')
 876         {
 877           /* Parse decimal digits.  */
 878           bool had_digit = false;
 879           double digit_factor = 0.1;
 880           while (get_char (next_char, &point))
 881             {
 882               if (!ISDIGIT (next_char))
 883                 {
 884                   unget_char ();
 885                   break;
 886                 }
 887               value += (next_char - '0') * digit_factor;
 888               digit_factor *= 0.1;
 889               had_digit = true;
 890               out->range.m_end = point;
 891             }
 892           if (!had_digit)
 893             {
 894               out->id = TOK_ERROR;
 895               out->range.m_start = point;
 896               out->range.m_start = point;
 897               out->u.string = xstrdup ("expected digit");
 898               return;
 899             }
 900         }
 901       else
 902         unget_char ();
 903     }
 904
 905   /* Parse 'e' and 'E'.  */
 906   unichar exponent_char;
 907   if (get_char (exponent_char, &point))
 908     {
 909       if (exponent_char == 'e' || exponent_char == 'E')
 910         {
 911           /* Optional +/-.  */
 912           unichar sign_char;
 913           int exponent = 0;
 914           bool negate_exponent = false;
 915           bool had_exponent_digit = false;
 916           if (!get_char (sign_char, &point))
 917             {
 918               out->id = TOK_ERROR;
 919               out->range.m_start = point;
 920               out->range.m_start = point;
 921               out->u.string = xstrdup ("EOF within exponent");
 922               return;
 923             }
 924           if (sign_char == '-')
 925             negate_exponent = true;
 926           else if (sign_char == '+')
 927             ;
 928           else if (ISDIGIT (sign_char))
 929             {
 930               exponent = sign_char - '0';
 931               had_exponent_digit = true;
 932             }
 933           else
 934             {
 935               out->id = TOK_ERROR;
 936               out->range.m_start = point;
 937               out->range.m_start = point;
 938               out->u.string
 939                 = xstrdup ("expected '-','+' or digit within exponent");
 940               return;
 941             }
 942           out->range.m_end = point;
 943
 944           /* One or more digits (we might have seen the digit above,
 945              though).  */
 946           while (1)
 947             {
 948               unichar uc;
 949               location_map::point point;
 950               if (!get_char (uc, &point))
 951                 break;
 952               if (ISDIGIT (uc))
 953                 {
 954                   exponent *= 10;
 955                   exponent += uc -'0';
 956                   had_exponent_digit = true;
 957                   out->range.m_end = point;
 958                   continue;
 959                 }
 960               else
 961                 {
 962                   unget_char ();
 963                   break;
 964                 }
 965             }
 966           if (!had_exponent_digit)
 967             {
 968               out->id = TOK_ERROR;
 969               out->range.m_start = point;
 970               out->range.m_start = point;
 971               out->u.string = xstrdup ("expected digit within exponent");
 972               return;
 973             }
 974           if (negate_exponent)
 975             exponent = -exponent;
 976           value = value * pow (10, exponent);
 977         }
 978       else
 979         unget_char ();
 980     }
 981
 982   if (negate)
 983     value = -value;
 984
 985   if (value == (long)value)
 986     {
 987       out->id = TOK_INTEGER_NUMBER;
 988       out->u.integer_number = value;
 989     }
 990   else
 991     {
 992       out->id = TOK_FLOAT_NUMBER;
 993       out->u.float_number = value;
 994     }
 995 }
 996
 997 /* Determine if the next characters to be lexed match SUFFIX.
 998    SUFFIX must be pure ASCII and not contain newlines.
 999    If so, consume the characters and return true.
1000    Otherwise, return false.  */
1001
1002 bool
1003 lexer::rest_of_literal (token *out, const char *suffix)
1004 {
1005   int suffix_idx = 0;
1006   int buf_idx = m_next_char_idx;
1007   while (1)
1008     {
1009       if (suffix[suffix_idx] == '\0')
1010         {
1011           m_next_char_idx += suffix_idx;
1012           m_next_char_column += suffix_idx;
1013           out->range.m_end.m_unichar_idx += suffix_idx;
1014           out->range.m_end.m_column += suffix_idx;
1015           return true;
1016         }
1017       if (buf_idx >= (int)m_buffer.length ())
1018         return false;
1019       /* This assumes that suffix is ASCII.  */
1020       if (m_buffer[buf_idx] != (unichar)suffix[suffix_idx])
1021         return false;
1022       buf_idx++;
1023       suffix_idx++;
1024     }
1025 }
1026
1027 /* Create a new error instance for MSG, using the location of the next
1028    character for the location of the error.  */
1029
1030 std::unique_ptr<error>
1031 lexer::make_error (const char *msg)
1032 {
1033   location_map::point p;
1034   p.m_unichar_idx = m_next_char_idx;
1035   p.m_line = m_next_char_line;
1036   p.m_column = m_next_char_column;
1037   location_map::range r;
1038   r.m_start = p;
1039   r.m_end = p;
1040   return ::make_unique<error> (r, xstrdup (msg));
1041 }
1042
1043 /* parser's ctor.  */
1044
1045 parser::parser (location_map *out_loc_map,
1046                 bool support_comments)
1047 : m_lexer (support_comments), m_loc_map (out_loc_map)
1048 {
1049 }
1050
1051 /* parser's dtor.  */
1052
1053 parser::~parser ()
1054 {
1055   if (m_loc_map)
1056     m_loc_map->on_finished_parsing ();
1057 }
1058
1059 /* Add LENGTH bytes of UTF-8 encoded text from UTF8_BUF to this parser's
1060    lexer's buffer.  */
1061
1062 std::unique_ptr<error>
1063 parser::add_utf8 (size_t length, const char *utf8_buf)
1064 {
1065   return m_lexer.add_utf8 (length, utf8_buf);
1066 }
1067
1068 /* Parse a JSON value (object, array, number, string, or literal).
1069    (ECMA-404 section 5; RFC 7159 section 3).  */
1070
1071 parser_result_t
1072 parser::parse_value (int depth)
1073 {
1074   const token *tok = m_lexer.peek ();
1075
1076   /* Avoid stack overflow with deeply-nested inputs; RFC 7159 section 9
1077      states: "An implementation may set limits on the maximum depth
1078      of nesting.".
1079
1080      Ideally we'd avoid this limit (e.g. by rewriting parse_value,
1081      parse_object, and parse_array into a single function with a vec of
1082      state).  */
1083   const int MAX_DEPTH = 100;
1084   if (depth >= MAX_DEPTH)
1085     return error_at (tok->range, "maximum nesting depth exceeded: %i",
1086                      MAX_DEPTH);
1087
1088   switch (tok->id)
1089     {
1090     case TOK_OPEN_CURLY:
1091       return parse_object (depth);
1092
1093     case TOK_STRING:
1094       {
1095         auto val = ::make_unique<string> (tok->u.string);
1096         m_lexer.consume ();
1097         maybe_record_range (val.get (), tok->range);
1098         return parser_result_t (std::move (val));
1099       }
1100
1101     case TOK_OPEN_SQUARE:
1102       return parse_array (depth);
1103
1104     case TOK_FLOAT_NUMBER:
1105       {
1106         auto val = ::make_unique<float_number> (tok->u.float_number);
1107         m_lexer.consume ();
1108         maybe_record_range (val.get (), tok->range);
1109         return parser_result_t (std::move (val));
1110       }
1111
1112     case TOK_INTEGER_NUMBER:
1113       {
1114         auto val = ::make_unique<integer_number> (tok->u.integer_number);
1115         m_lexer.consume ();
1116         maybe_record_range (val.get (), tok->range);
1117         return parser_result_t (std::move (val));
1118       }
1119
1120     case TOK_TRUE:
1121       {
1122         auto val = ::make_unique<literal> (JSON_TRUE);
1123         m_lexer.consume ();
1124         maybe_record_range (val.get (), tok->range);
1125         return parser_result_t (std::move (val));
1126       }
1127
1128     case TOK_FALSE:
1129       {
1130         auto val = ::make_unique<literal> (JSON_FALSE);
1131         m_lexer.consume ();
1132         maybe_record_range (val.get (), tok->range);
1133         return parser_result_t (std::move (val));
1134       }
1135
1136     case TOK_NULL:
1137       {
1138         auto val = ::make_unique<literal> (JSON_NULL);
1139         m_lexer.consume ();
1140         maybe_record_range (val.get (), tok->range);
1141         return parser_result_t (std::move (val));
1142       }
1143
1144     case TOK_ERROR:
1145       return error_at (tok->range, "invalid JSON token: %s", tok->u.string);
1146
1147     default:
1148       return error_at (tok->range, "expected a JSON value but got %s",
1149                        token_id_name[tok->id]);
1150     }
1151 }
1152
1153 /* Parse a JSON object.
1154    (ECMA-404 section 6; RFC 7159 section 4).  */
1155
1156 parser_result_t
1157 parser::parse_object (int depth)
1158 {
1159   location_map::point start = get_next_token_start ();
1160
1161   require (TOK_OPEN_CURLY);
1162
1163   auto obj = ::make_unique<object> ();
1164
1165   const token *tok = m_lexer.peek ();
1166   if (tok->id == TOK_CLOSE_CURLY)
1167     {
1168       location_map::point end = get_next_token_end ();
1169       maybe_record_range (obj.get (), start, end);
1170       if (auto err = require (TOK_CLOSE_CURLY))
1171         return parser_result_t (std::move (err));
1172       return parser_result_t (std::move (obj));
1173     }
1174   if (tok->id != TOK_STRING)
1175     return error_at (tok->range,
1176                      "expected string for object key after '{'; got %s",
1177                      token_id_name[tok->id]);
1178   while (true)
1179     {
1180       tok = m_lexer.peek ();
1181       if (tok->id != TOK_STRING)
1182         return error_at (tok->range,
1183                          "expected string for object key after ','; got %s",
1184                          token_id_name[tok->id]);
1185       label_text key = label_text::take (xstrdup (tok->u.string));
1186       m_lexer.consume ();
1187
1188       if (auto err = require (TOK_COLON))
1189         return parser_result_t (std::move (err));
1190
1191       parser_result_t r = parse_value (depth + 1);
1192       if (r.m_err)
1193         return r;
1194       if (!r.m_val)
1195         return parser_result_t (std::move (obj));
1196
1197       /* We don't enforce uniqueness for keys.  */
1198       obj->set (key.get (), std::move (r.m_val));
1199
1200       location_map::point end = get_next_token_end ();
1201       result<enum token_id, std::unique_ptr<error>> result
1202         (require_one_of (TOK_COMMA, TOK_CLOSE_CURLY));
1203       if (result.m_err)
1204         return parser_result_t (std::move (result.m_err));
1205       if (result.m_val == TOK_COMMA)
1206         continue;
1207       else
1208         {
1209           /* TOK_CLOSE_CURLY.  */
1210           maybe_record_range (obj.get (), start, end);
1211           return parser_result_t (std::move (obj));
1212         }
1213     }
1214 }
1215
1216 /* Parse a JSON array.
1217    (ECMA-404 section 7; RFC 7159 section 5).  */
1218
1219 parser_result_t
1220 parser::parse_array (int depth)
1221 {
1222   location_map::point start = get_next_token_start ();
1223   if (auto err = require (TOK_OPEN_SQUARE))
1224     return parser_result_t (std::move (err));
1225
1226   auto arr = ::make_unique<array> ();
1227
1228   const token *tok = m_lexer.peek ();
1229   if (tok->id == TOK_CLOSE_SQUARE)
1230     {
1231       location_map::point end = get_next_token_end ();
1232       maybe_record_range (arr.get (), start, end);
1233       m_lexer.consume ();
1234       return parser_result_t (std::move (arr));
1235     }
1236
1237   while (true)
1238     {
1239       parser_result_t r = parse_value (depth + 1);
1240       if (r.m_err)
1241         return r;
1242
1243       arr->append (std::move (r.m_val));
1244
1245       location_map::point end = get_next_token_end ();
1246       result<enum token_id, std::unique_ptr<error>> result
1247         (require_one_of (TOK_COMMA, TOK_CLOSE_SQUARE));
1248       if (result.m_err)
1249         return parser_result_t (std::move (result.m_err));
1250       if (result.m_val == TOK_COMMA)
1251         continue;
1252       else
1253         {
1254           /* TOK_CLOSE_SQUARE.  */
1255           maybe_record_range (arr.get (), start, end);
1256           return parser_result_t (std::move (arr));
1257         }
1258     }
1259 }
1260
1261 /* Get the start point of the next token.  */
1262
1263 location_map::point
1264 parser::get_next_token_start ()
1265 {
1266   const token *tok = m_lexer.peek ();
1267   return tok->range.m_start;
1268 }
1269
1270 /* Get the end point of the next token.  */
1271
1272 location_map::point
1273 parser::get_next_token_end ()
1274 {
1275   const token *tok = m_lexer.peek ();
1276   return tok->range.m_end;
1277 }
1278
1279 /* Require an EOF, or fail if there is surplus input.  */
1280
1281 std::unique_ptr<error>
1282 parser::require_eof ()
1283 {
1284   return require (TOK_EOF);
1285 }
1286
1287 /* Consume the next token, issuing an error if it is not of kind TOK_ID.  */
1288
1289 std::unique_ptr<error>
1290 parser::require (enum token_id tok_id)
1291 {
1292   const token *tok = m_lexer.peek ();
1293   if (tok->id != tok_id)
1294     {
1295       if (tok->id == TOK_ERROR)
1296         return error_at (tok->range,
1297                          "expected %s; got bad token: %s",
1298                          token_id_name[tok_id], tok->u.string);
1299       else
1300         return error_at (tok->range,
1301                          "expected %s; got %s", token_id_name[tok_id],
1302                          token_id_name[tok->id]);
1303     }
1304   m_lexer.consume ();
1305   return nullptr;
1306 }
1307
1308 /* Consume the next token, issuing an error if it is not of
1309    kind TOK_ID_A or TOK_ID_B.
1310    Return which kind it was.  */
1311
1312 result<enum token_id, std::unique_ptr<error>>
1313 parser::require_one_of (enum token_id tok_id_a, enum token_id tok_id_b)
1314 {
1315   const token *tok = m_lexer.peek ();
1316   if ((tok->id != tok_id_a)
1317       && (tok->id != tok_id_b))
1318     {
1319       if (tok->id == TOK_ERROR)
1320         return error_at (tok->range, "expected %s or %s; got bad token: %s",
1321                          token_id_name[tok_id_a], token_id_name[tok_id_b],
1322                          tok->u.string);
1323       else
1324         return error_at (tok->range, "expected %s or %s; got %s",
1325                          token_id_name[tok_id_a], token_id_name[tok_id_b],
1326                          token_id_name[tok->id]);
1327     }
1328   enum token_id id = tok->id;
1329   m_lexer.consume ();
1330   return result<enum token_id, std::unique_ptr<error>> (id);
1331 }
1332
1333 /* Genarate a parsing error.  */
1334
1335 std::unique_ptr<error>
1336 parser::error_at (const location_map::range &r, const char *fmt, ...)
1337 {
1338   va_list ap;
1339   va_start (ap, fmt);
1340   char *formatted_msg = xvasprintf (fmt, ap);
1341   va_end (ap);
1342
1343   return ::make_unique<error> (r, formatted_msg);
1344 }
1345
1346 /* Record that JV has range R within the input file.  */
1347
1348 void
1349 parser::maybe_record_range (json::value *jv, const location_map::range &r)
1350 {
1351   if (m_loc_map)
1352     m_loc_map->record_range_for_value (jv, r);
1353 }
1354
1355 /* Record that JV has range START to END within the input file.  */
1356
1357 void
1358 parser::maybe_record_range (json::value *jv,
1359                             const location_map::point &start,
1360                             const location_map::point &end)
1361 {
1362   if (m_loc_map)
1363     {
1364       location_map::range r;
1365       r.m_start = start;
1366       r.m_end = end;
1367       m_loc_map->record_range_for_value (jv, r);
1368     }
1369 }
1370
1371 /* Attempt to parse the UTF-8 encoded buffer at UTF8_BUF
1372    of the given LENGTH.
1373    If ALLOW_COMMENTS is true, then allow C and C++ style-comments in the
1374    buffer, as an extension to JSON, otherwise forbid them.
1375    If successful, return an json::value in the result.
1376    if there was a problem, return a json::error in the result.
1377    If OUT_LOC_MAP is non-NULL, notify *OUT_LOC_MAP about
1378    source locations of nodes seen during parsing.  */
1379
1380 parser_result_t
1381 json::parse_utf8_string (size_t length,
1382                          const char *utf8_buf,
1383                          bool allow_comments,
1384                          location_map *out_loc_map)
1385 {
1386   parser p (out_loc_map, allow_comments);
1387   if (auto err = p.add_utf8 (length, utf8_buf))
1388     return parser_result_t (std::move (err));
1389   parser_result_t r = p.parse_value (0);
1390   if (r.m_err)
1391     return r;
1392   if (auto err = p.require_eof ())
1393     return parser_result_t (std::move (err));
1394   return r;
1395 }
1396
1397 /* Attempt to parse the nil-terminated UTF-8 encoded buffer at
1398    UTF8_BUF.
1399    If ALLOW_COMMENTS is true, then allow C and C++ style-comments in the
1400    buffer, as an extension to JSON, otherwise forbid them.
1401    If successful, return a non-NULL json::value *.
1402    if there was a problem, return NULL and write an error
1403    message to err_out, which must be deleted by the caller.
1404    If OUT_LOC_MAP is non-NULL, notify *OUT_LOC_MAP about
1405    source locations of nodes seen during parsing.  */
1406
1407 json::parser_result_t
1408 json::parse_utf8_string (const char *utf8,
1409                          bool allow_comments,
1410                          location_map *out_loc_map)
1411 {
1412   return parse_utf8_string (strlen (utf8), utf8, allow_comments,
1413                             out_loc_map);
1414 }
1415
1416 \f
1417 #if CHECKING_P
1418
1419 namespace selftest {
1420
1421 /* Selftests.  */
1422
/* Call assert_print_eq on JV, FORMATTED and EXPECTED_JSON, supplying
   the location of the macro's use as the selftest location.  */

#define ASSERT_PRINT_EQ(JV, FORMATTED, EXPECTED_JSON)           \
  assert_print_eq ((SELFTEST_LOCATION), (JV), (FORMATTED),      \
                   (EXPECTED_JSON))
1425
1426 /* Implementation detail of ASSERT_RANGE_EQ.  */
1427
1428 static void
1429 assert_point_eq (const location &loc,
1430                  const location_map::point &actual_point,
1431                  size_t exp_unichar_idx, int exp_line, int exp_column)
1432 {
1433   ASSERT_EQ_AT (loc, actual_point.m_unichar_idx, exp_unichar_idx);
1434   ASSERT_EQ_AT (loc, actual_point.m_line, exp_line);
1435   ASSERT_EQ_AT (loc, actual_point.m_column, exp_column);
1436 }
1437
1438 /* Implementation detail of ASSERT_RANGE_EQ.  */
1439
1440 static void
1441 assert_range_eq (const location &loc,
1442                  const location_map::range &actual_range,
1443                  /* Expected location.  */
1444                  size_t start_unichar_idx, int start_line, int start_column,
1445                  size_t end_unichar_idx, int end_line, int end_column)
1446 {
1447   assert_point_eq (loc, actual_range.m_start,
1448                    start_unichar_idx, start_line, start_column);
1449   assert_point_eq (loc, actual_range.m_end,
1450                    end_unichar_idx, end_line, end_column);
1451 }
1452
/* Assert that ACTUAL_RANGE has its start at
   (START_UNICHAR_IDX, START_LINE, START_COLUMN)
   and its end at (END_UNICHAR_IDX, END_LINE, END_COLUMN).
   Failures are reported at the location of the macro's use.  */

#define ASSERT_RANGE_EQ(ACTUAL_RANGE,                                   \
                        START_UNICHAR_IDX, START_LINE, START_COLUMN,    \
                        END_UNICHAR_IDX, END_LINE, END_COLUMN)          \
  assert_range_eq ((SELFTEST_LOCATION),                                 \
                   (ACTUAL_RANGE),                                      \
                   (START_UNICHAR_IDX), (START_LINE), (START_COLUMN),   \
                   (END_UNICHAR_IDX), (END_LINE), (END_COLUMN))
1463
1464 /* Implementation detail of ASSERT_ERR_EQ.  */
1465
1466 static void
1467 assert_err_eq (const location &loc,
1468                const json::error *actual_err,
1469                /* Expected location.  */
1470                size_t start_unichar_idx, int start_line, int start_column,
1471                size_t end_unichar_idx, int end_line, int end_column,
1472                const char *expected_msg)
1473 {
1474   ASSERT_TRUE_AT (loc, actual_err);
1475   const location_map::range &actual_range = actual_err->get_range ();
1476   ASSERT_EQ_AT (loc, actual_range.m_start.m_unichar_idx, start_unichar_idx);
1477   ASSERT_EQ_AT (loc, actual_range.m_start.m_line, start_line);
1478   ASSERT_EQ_AT (loc, actual_range.m_start.m_column, start_column);
1479   ASSERT_EQ_AT (loc, actual_range.m_end.m_unichar_idx, end_unichar_idx);
1480   ASSERT_EQ_AT (loc, actual_range.m_end.m_line, end_line);
1481   ASSERT_EQ_AT (loc, actual_range.m_end.m_column, end_column);
1482   ASSERT_STREQ_AT (loc, actual_err->get_msg (), expected_msg);
1483 }
1484
/* Assert that ACTUAL_ERR is a non-NULL json::error * whose message is
   EXPECTED_MSG, and whose location starts at
   (START_UNICHAR_IDX, START_LINE, START_COLUMN)
   and ends at (END_UNICHAR_IDX, END_LINE, END_COLUMN).
   Failures are reported at the location of the macro's use.  */

#define ASSERT_ERR_EQ(ACTUAL_ERR,                                       \
                      START_UNICHAR_IDX, START_LINE, START_COLUMN,      \
                      END_UNICHAR_IDX, END_LINE, END_COLUMN,            \
                      EXPECTED_MSG)                                     \
  assert_err_eq ((SELFTEST_LOCATION),                                   \
                 (ACTUAL_ERR),                                          \
                 (START_UNICHAR_IDX), (START_LINE), (START_COLUMN),     \
                 (END_UNICHAR_IDX), (END_LINE), (END_COLUMN),           \
                 (EXPECTED_MSG))
1498
1499 /* Verify that the JSON lexer works as expected.  */
1500
1501 static void
1502 test_lexer ()
1503 {
1504   lexer l (false);
1505   const char *str
1506     /*  0         1         2         3         4         .  */
1507     /*  01234567890123456789012345678901234567890123456789.  */
1508     = ("    1066   -1  \n"
1509        "    -273.15 1e6\n"
1510        "  [   ] null   true  false  {  }  \"foo\" \n");
1511   auto err = l.add_utf8 (strlen (str), str);
1512   ASSERT_EQ (err, nullptr);
1513
1514   /* Line 1.  */
1515   {
1516     const size_t line_offset = 0;
1517
1518     /* Expect token: "1066" in columns 4-7.  */
1519     {
1520       const token *tok = l.peek ();
1521       ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
1522       ASSERT_EQ (tok->u.integer_number, 1066);
1523       ASSERT_RANGE_EQ (tok->range,
1524                        line_offset + 4, 1, 4,
1525                        line_offset + 7, 1, 7);
1526       l.consume ();
1527     }
1528     /* Expect token: "-1" in columns 11-12.  */
1529     {
1530       const token *tok = l.peek ();
1531       ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
1532       ASSERT_EQ (tok->u.integer_number, -1);
1533       ASSERT_RANGE_EQ (tok->range,
1534                        line_offset + 11, 1, 11,
1535                        line_offset + 12, 1, 12);
1536       l.consume ();
1537     }
1538   }
1539
1540   /* Line 2.  */
1541   {
1542     const size_t line_offset = 16;
1543
1544     /* Expect token: "-273.15" in columns 4-10.  */
1545     {
1546       const token *tok = l.peek ();
1547       ASSERT_EQ (tok->id, TOK_FLOAT_NUMBER);
1548       ASSERT_EQ (int(tok->u.float_number), int(-273.15));
1549       ASSERT_RANGE_EQ (tok->range,
1550                        line_offset + 4, 2, 4,
1551                        line_offset + 10, 2, 10);
1552       l.consume ();
1553     }
1554     /* Expect token: "1e6" in columns 12-14.  */
1555     {
1556       const token *tok = l.peek ();
1557       ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
1558       ASSERT_EQ (tok->u.integer_number, 1000000);
1559       ASSERT_RANGE_EQ (tok->range,
1560                        line_offset + 12, 2, 12,
1561                        line_offset + 14, 2, 14);
1562       l.consume ();
1563     }
1564   }
1565
1566   /* Line 3.  */
1567   {
1568     const size_t line_offset = 32;
1569
1570     /* Expect token: "[".  */
1571     {
1572       const token *tok = l.peek ();
1573       ASSERT_EQ (tok->id, TOK_OPEN_SQUARE);
1574       ASSERT_RANGE_EQ (tok->range,
1575                        line_offset + 2, 3, 2,
1576                        line_offset + 2, 3, 2);
1577       l.consume ();
1578     }
1579     /* Expect token: "]".  */
1580     {
1581       const token *tok = l.peek ();
1582       ASSERT_EQ (tok->id, TOK_CLOSE_SQUARE);
1583       ASSERT_RANGE_EQ (tok->range,
1584                        line_offset + 6, 3, 6,
1585                        line_offset + 6, 3, 6);
1586       l.consume ();
1587     }
1588     /* Expect token: "null".  */
1589     {
1590       const token *tok = l.peek ();
1591       ASSERT_EQ (tok->id, TOK_NULL);
1592       ASSERT_RANGE_EQ (tok->range,
1593                        line_offset + 8, 3, 8,
1594                        line_offset + 11, 3, 11);
1595       l.consume ();
1596     }
1597     /* Expect token: "true".  */
1598     {
1599       const token *tok = l.peek ();
1600       ASSERT_EQ (tok->id, TOK_TRUE);
1601       ASSERT_RANGE_EQ (tok->range,
1602                        line_offset + 15, 3, 15,
1603                        line_offset + 18, 3, 18);
1604       l.consume ();
1605     }
1606     /* Expect token: "false".  */
1607     {
1608       const token *tok = l.peek ();
1609       ASSERT_EQ (tok->id, TOK_FALSE);
1610       ASSERT_RANGE_EQ (tok->range,
1611                        line_offset + 21, 3, 21,
1612                        line_offset + 25, 3, 25);
1613       l.consume ();
1614     }
1615     /* Expect token: "{".  */
1616     {
1617       const token *tok = l.peek ();
1618       ASSERT_EQ (tok->id, TOK_OPEN_CURLY);
1619       ASSERT_RANGE_EQ (tok->range,
1620                        line_offset + 28, 3, 28,
1621                        line_offset + 28, 3, 28);
1622       l.consume ();
1623     }
1624     /* Expect token: "}".  */
1625     {
1626       const token *tok = l.peek ();
1627       ASSERT_EQ (tok->id, TOK_CLOSE_CURLY);
1628       ASSERT_RANGE_EQ (tok->range,
1629                        line_offset + 31, 3, 31,
1630                        line_offset + 31, 3, 31);
1631       l.consume ();
1632     }
1633     /* Expect token: "\"foo\"".  */
1634     {
1635       const token *tok = l.peek ();
1636       ASSERT_EQ (tok->id, TOK_STRING);
1637       ASSERT_RANGE_EQ (tok->range,
1638                        line_offset + 34, 3, 34,
1639                        line_offset + 38, 3, 38);
1640       l.consume ();
1641     }
1642   }
1643 }
1644
1645 /* Verify that the JSON lexer complains about single-line comments
1646    when comments are disabled.  */
1647
1648 static void
1649 test_lexing_unsupported_single_line_comment ()
1650 {
1651   lexer l (false);
1652   const char *str
1653     /*  0         1         2         3         4         .  */
1654     /*  01234567890123456789012345678901234567890123456789.  */
1655     = ("    1066   // Hello world\n");
1656   auto err = l.add_utf8 (strlen (str), str);
1657   ASSERT_EQ (err, nullptr);
1658
1659   /* Line 1.  */
1660   {
1661     const size_t line_offset = 0;
1662     const int line_1 = 1;
1663
1664     /* Expect token: "1066" in columns 4-7.  */
1665     {
1666       const token *tok = l.peek ();
1667       ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
1668       ASSERT_EQ (tok->u.integer_number, 1066);
1669       ASSERT_RANGE_EQ (tok->range,
1670                        line_offset + 4, line_1, 4,
1671                        line_offset + 7, line_1, 7);
1672       l.consume ();
1673     }
1674
1675     /* Expect error.  */
1676     {
1677       const token *tok = l.peek ();
1678       ASSERT_EQ (tok->id, TOK_ERROR);
1679       ASSERT_STREQ (tok->u.string, "unexpected character: '/'");
1680       ASSERT_RANGE_EQ (tok->range,
1681                        line_offset + 11, line_1, 11,
1682                        line_offset + 11, line_1, 11);
1683       l.consume ();
1684     }
1685   }
1686 }
1687
1688 /* Verify that the JSON lexer complains about multiline comments
1689    when comments are disabled.  */
1690
1691 static void
1692 test_lexing_unsupported_multiline_comment ()
1693 {
1694   lexer l (false);
1695   const char *str
1696     /*  0         1         2         3         4         .  */
1697     /*  01234567890123456789012345678901234567890123456789.  */
1698     = ("    1066   /* Hello world\n"
1699        " continuation of comment\n"
1700        " end of comment */  42\n");
1701   auto err = l.add_utf8 (strlen (str), str);
1702   ASSERT_EQ (err, nullptr);
1703
1704   /* Line 1.  */
1705   {
1706     const size_t line_offset = 0;
1707     const int line_1 = 1;
1708
1709     /* Expect token: "1066" in line 1, columns 4-7.  */
1710     {
1711       const token *tok = l.peek ();
1712       ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
1713       ASSERT_EQ (tok->u.integer_number, 1066);
1714       ASSERT_RANGE_EQ (tok->range,
1715                        line_offset + 4, line_1, 4,
1716                        line_offset + 7, line_1, 7);
1717       l.consume ();
1718     }
1719
1720     /* Expect error.  */
1721     {
1722       const token *tok = l.peek ();
1723       ASSERT_EQ (tok->id, TOK_ERROR);
1724       ASSERT_STREQ (tok->u.string, "unexpected character: '/'");
1725       ASSERT_RANGE_EQ (tok->range,
1726                        line_offset + 11, line_1, 11,
1727                        line_offset + 11, line_1, 11);
1728       l.consume ();
1729     }
1730   }
1731 }
1732
1733 /* Verify that the JSON lexer handles single-line comments
1734    when comments are enabled.  */
1735
1736 static void
1737 test_lexing_supported_single_line_comment ()
1738 {
1739   lexer l (true);
1740   const char *str
1741     /*  0         1         2         3         4         .  */
1742     /*  01234567890123456789012345678901234567890123456789.  */
1743     = ("    1066   // Hello world\n"
1744        "     42   // etc\n");
1745   auto err = l.add_utf8 (strlen (str), str);
1746   ASSERT_EQ (err, nullptr);
1747
1748   const size_t line_1_offset = 0;
1749   const size_t line_2_offset = 26;
1750   const size_t line_3_offset = line_2_offset + 17;
1751
1752   /* Expect token: "1066" in line 1, columns 4-7.  */
1753   {
1754     const int line_1 = 1;
1755     const token *tok = l.peek ();
1756     ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
1757     ASSERT_EQ (tok->u.integer_number, 1066);
1758     ASSERT_RANGE_EQ (tok->range,
1759                      line_1_offset + 4, line_1, 4,
1760                      line_1_offset + 7, line_1, 7);
1761     l.consume ();
1762   }
1763
1764   /* Expect token: "42" in line 2, columns 5-6.  */
1765   {
1766     const int line_2 = 2;
1767     const token *tok = l.peek ();
1768     ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
1769     ASSERT_EQ (tok->u.integer_number, 42);
1770     ASSERT_RANGE_EQ (tok->range,
1771                      line_2_offset + 5, line_2, 5,
1772                      line_2_offset + 6, line_2, 6);
1773     l.consume ();
1774   }
1775
1776   /* Expect EOF.  */
1777   {
1778     const int line_3 = 3;
1779     const token *tok = l.peek ();
1780     ASSERT_EQ (tok->id, TOK_EOF);
1781     ASSERT_RANGE_EQ (tok->range,
1782                      line_3_offset + 0, line_3, 0,
1783                      line_3_offset + 0, line_3, 0);
1784     l.consume ();
1785   }
1786 }
1787
1788 /* Verify that the JSON lexer handles multiline comments
1789    when comments are enabled.  */
1790
1791 static void
1792 test_lexing_supported_multiline_comment ()
1793 {
1794   lexer l (true);
1795   const char *str
1796     /*  0         1         2         3         4         .  */
1797     /*  01234567890123456789012345678901234567890123456789.  */
1798     = ("    1066   /* Hello world\n"
1799        " continuation of comment\n"
1800        " end of comment */  42\n");
1801   auto err = l.add_utf8 (strlen (str), str);
1802   ASSERT_EQ (err, nullptr);
1803
1804   const size_t line_1_offset = 0;
1805   const size_t line_2_offset = 26;
1806   const size_t line_3_offset = line_2_offset + 25;
1807   const size_t line_4_offset = line_3_offset + 23;
1808
1809   /* Expect token: "1066" in line 1, columns 4-7.  */
1810   {
1811     const int line_1 = 1;
1812     const token *tok = l.peek ();
1813     ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
1814     ASSERT_EQ (tok->u.integer_number, 1066);
1815     ASSERT_RANGE_EQ (tok->range,
1816                      line_1_offset + 4, line_1, 4,
1817                      line_1_offset + 7, line_1, 7);
1818     l.consume ();
1819   }
1820
1821   /* Expect token: "42" in line 3, columns 20-21.  */
1822   {
1823     const int line_3 = 3;
1824     const token *tok = l.peek ();
1825     ASSERT_EQ (tok->id, TOK_INTEGER_NUMBER);
1826     ASSERT_EQ (tok->u.integer_number, 42);
1827     ASSERT_RANGE_EQ (tok->range,
1828                      line_3_offset + 20, line_3, 20,
1829                      line_3_offset + 21, line_3, 21);
1830     l.consume ();
1831   }
1832
1833   /* Expect EOF.  */
1834   {
1835     const int line_4 = 4;
1836     const token *tok = l.peek ();
1837     ASSERT_EQ (tok->id, TOK_EOF);
1838     ASSERT_RANGE_EQ (tok->range,
1839                      line_4_offset + 0, line_4, 0,
1840                      line_4_offset + 0, line_4, 0);
1841     l.consume ();
1842   }
1843 }
1844
1845 /* Helper class for writing JSON parsing testcases.
1846    Attempts to parse a string in ctor, and captures the result (either
1847    a json::value or a json::error), and a location map.  */
1848
1849 struct parser_testcase
1850 {
1851 public:
1852   parser_testcase (const char *utf8_string, bool allow_comments = false)
1853   : m_loc_map (),
1854     m_result (parse_utf8_string (utf8_string, allow_comments, &m_loc_map))
1855   {
1856   }
1857
1858   const json::value *get_value () const { return m_result.m_val.get (); }
1859   const json::error *get_error () const { return m_result.m_err.get (); }
1860
1861   const location_map::range *
1862   get_range_for_value (const json::value *jv) const
1863   {
1864     return m_loc_map.get_range_for_value (jv);
1865   }
1866
1867 private:
1868   /* Concrete implementation of location_map for use in
1869      JSON parsing selftests.  */
1870   class test_location_map : public location_map
1871   {
1872   public:
1873     void record_range_for_value (json::value *jv, const range &r) final override
1874     {
1875       m_map.put (jv, r);
1876     }
1877
1878     range *get_range_for_value (const json::value *jv) const
1879     {
1880       return const_cast<hash_map<const json::value *, range> &> (m_map)
1881         .get (jv);
1882     }
1883
1884   private:
1885     hash_map<const json::value *, range> m_map;
1886   };
1887
1888   test_location_map m_loc_map;
1889   json::parser_result_t m_result;
1890 };
1891
1892 /* Verify that parse_utf8_string works as expected.  */
1893
1894 static void
1895 test_parse_string ()
1896 {
1897   const int line_1 = 1;
1898
1899   {
1900     parser_testcase tc ("\"foo\"");
1901     ASSERT_EQ (tc.get_error (), nullptr);
1902     const json::value *jv = tc.get_value ();
1903     ASSERT_EQ (jv->get_kind (), JSON_STRING);
1904     ASSERT_STREQ (as_a <const json::string *> (jv)->get_string (), "foo");
1905     ASSERT_PRINT_EQ (*jv, true, "\"foo\"");
1906     auto range = tc.get_range_for_value (jv);
1907     ASSERT_TRUE (range);
1908     ASSERT_RANGE_EQ (*range,
1909                      0, line_1, 0,
1910                      4, line_1, 4);
1911   }
1912
1913   {
1914     const char *contains_quotes = "\"before \\\"quoted\\\" after\"";
1915     parser_testcase tc (contains_quotes);
1916     ASSERT_EQ (tc.get_error (), nullptr);
1917     const json::value *jv = tc.get_value ();
1918     ASSERT_EQ (jv->get_kind (), JSON_STRING);
1919     ASSERT_STREQ (as_a <const json::string *> (jv)->get_string (),
1920                   "before \"quoted\" after");
1921     ASSERT_PRINT_EQ (*jv, true, contains_quotes);
1922     auto range = tc.get_range_for_value (jv);
1923     ASSERT_TRUE (range);
1924     ASSERT_RANGE_EQ (*range,
1925                      0, line_1, 0,
1926                      24, line_1, 24);
1927   }
1928
1929   /* Test of non-ASCII input.  This string is the Japanese word "mojibake",
1930      written as C octal-escaped UTF-8.  */
1931   const char *mojibake = (/* Opening quote.  */
1932                           "\""
1933                           /* U+6587 CJK UNIFIED IDEOGRAPH-6587
1934                              UTF-8: 0xE6 0x96 0x87
1935                              C octal escaped UTF-8: \346\226\207.  */
1936                           "\346\226\207"
1937                           /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
1938                              UTF-8: 0xE5 0xAD 0x97
1939                              C octal escaped UTF-8: \345\255\227.  */
1940                           "\345\255\227"
1941                           /* U+5316 CJK UNIFIED IDEOGRAPH-5316
1942                              UTF-8: 0xE5 0x8C 0x96
1943                              C octal escaped UTF-8: \345\214\226.  */
1944                           "\345\214\226"
1945                           /* U+3051 HIRAGANA LETTER KE
1946                              UTF-8: 0xE3 0x81 0x91
1947                              C octal escaped UTF-8: \343\201\221.  */
1948                           "\343\201\221"
1949                           /* Closing quote.  */
1950                           "\"");
1951   {
1952     parser_testcase tc (mojibake);
1953     ASSERT_EQ (tc.get_error (), nullptr);
1954     const json::value *jv = tc.get_value ();
1955     ASSERT_EQ (jv->get_kind (), JSON_STRING);
1956     /* Result of get_string should be UTF-8 encoded, without quotes.  */
1957     ASSERT_STREQ (as_a <const json::string *> (jv)->get_string (),
1958                   "\346\226\207" "\345\255\227" "\345\214\226" "\343\201\221");
1959     /* Result of dump should be UTF-8 encoded, with quotes.  */
1960     ASSERT_PRINT_EQ (*jv, false, mojibake);
1961     auto range = tc.get_range_for_value (jv);
1962     ASSERT_TRUE (range);
1963     ASSERT_RANGE_EQ (*range,
1964                      0, line_1, 0,
1965                      5, line_1, 5);
1966   }
1967
1968   /* Test of \u-escaped unicode.  This is "mojibake" again, as above.  */
1969   {
1970     const char *escaped_unicode = "\"\\u6587\\u5b57\\u5316\\u3051\"";
1971     parser_testcase tc (escaped_unicode);
1972     ASSERT_EQ (tc.get_error (), nullptr);
1973     const json::value *jv = tc.get_value ();
1974     ASSERT_EQ (jv->get_kind (), JSON_STRING);
1975     /* Result of get_string should be UTF-8 encoded, without quotes.  */
1976     ASSERT_STREQ (as_a <const json::string *> (jv)->get_string (),
1977                   "\346\226\207" "\345\255\227" "\345\214\226" "\343\201\221");
1978     /* Result of dump should be UTF-8 encoded, with quotes.  */
1979     ASSERT_PRINT_EQ (*jv, false, mojibake);
1980     auto range = tc.get_range_for_value (jv);
1981     ASSERT_TRUE (range);
1982     ASSERT_RANGE_EQ (*range,
1983                      0, line_1, 0,
1984                      25, line_1, 25);
1985   }
1986 }
1987
1988 /* Verify that we can parse various kinds of JSON numbers.  */
1989
1990 static void
1991 test_parse_number ()
1992 {
1993   const int line_1 = 1;
1994
1995   {
1996     parser_testcase tc ("42");
1997     ASSERT_EQ (tc.get_error (), nullptr);
1998     const json::value *jv = tc.get_value ();
1999     ASSERT_EQ (jv->get_kind (), JSON_INTEGER);
2000     ASSERT_EQ (as_a <const json::integer_number *> (jv)->get (), 42.0);
2001     ASSERT_PRINT_EQ (*jv, true, "42");
2002     auto range = tc.get_range_for_value (jv);
2003     ASSERT_TRUE (range);
2004     ASSERT_RANGE_EQ (*range,
2005                      0, line_1, 0,
2006                      1, line_1, 1);
2007   }
2008
2009   /* Negative number.  */
2010   {
2011     parser_testcase tc ("-17");
2012     ASSERT_EQ (tc.get_error (), nullptr);
2013     const json::value *jv = tc.get_value ();
2014     ASSERT_EQ (jv->get_kind (), JSON_INTEGER);
2015     ASSERT_EQ (as_a<const json::integer_number *> (jv)->get (), -17.0);
2016     ASSERT_PRINT_EQ (*jv, true, "-17");
2017     auto range = tc.get_range_for_value (jv);
2018     ASSERT_TRUE (range);
2019     ASSERT_RANGE_EQ (*range,
2020                      0, line_1, 0,
2021                      2, line_1, 2);
2022   }
2023
2024   /* Decimal.  */
2025   {
2026     parser_testcase tc ("3.141");
2027     ASSERT_EQ (tc.get_error (), nullptr);
2028     const json::value *jv = tc.get_value ();
2029     ASSERT_EQ (JSON_FLOAT, jv->get_kind ());
2030     ASSERT_NEAR (3.141, ((const json::float_number *)jv)->get (), 0.001);
2031     auto range = tc.get_range_for_value (jv);
2032     ASSERT_TRUE (range);
2033     ASSERT_RANGE_EQ (*range,
2034                      0, line_1, 0,
2035                      4, line_1, 4);
2036   }
2037
2038   /* Exponents.  */
2039   {
2040     {
2041       parser_testcase tc ("3.141e+0");
2042       ASSERT_EQ (tc.get_error (), nullptr);
2043       const json::value *jv = tc.get_value ();
2044       ASSERT_EQ (jv->get_kind (), JSON_FLOAT);
2045       ASSERT_NEAR (as_a <const json::float_number *> (jv)->get (), 3.141, 0.1);
2046       auto range = tc.get_range_for_value (jv);
2047       ASSERT_TRUE (range);
2048       ASSERT_RANGE_EQ (*range,
2049                        0, line_1, 0,
2050                        7, line_1, 7);
2051     }
2052     {
2053       parser_testcase tc ("42e2");
2054       ASSERT_EQ (tc.get_error (), nullptr);
2055       const json::value *jv = tc.get_value ();
2056       ASSERT_EQ (jv->get_kind (), JSON_INTEGER);
2057       ASSERT_EQ (as_a <const json::integer_number *> (jv)->get (), 4200);
2058       ASSERT_PRINT_EQ (*jv, true, "4200");
2059       auto range = tc.get_range_for_value (jv);
2060       ASSERT_TRUE (range);
2061       ASSERT_RANGE_EQ (*range,
2062                        0, line_1, 0,
2063                        3, line_1, 3);
2064     }
2065     {
2066       parser_testcase tc ("42e-1");
2067       ASSERT_EQ (tc.get_error (), nullptr);
2068       const json::value *jv = tc.get_value ();
2069       ASSERT_EQ (jv->get_kind (), JSON_FLOAT);
2070       ASSERT_NEAR (as_a <const json::float_number *> (jv)->get (), 4.2, 0.1);
2071       auto range = tc.get_range_for_value (jv);
2072       ASSERT_TRUE (range);
2073       ASSERT_RANGE_EQ (*range,
2074                        0, line_1, 0,
2075                        4, line_1, 4);
2076     }
2077   }
2078 }
2079
2080 /* Verify that JSON array parsing works.  */
2081
2082 static void
2083 test_parse_array ()
2084 {
2085   const int line_1 = 1;
2086
2087   parser_testcase tc ("[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]");
2088   ASSERT_EQ (tc.get_error (), nullptr);
2089   const json::value *jv = tc.get_value ();
2090   ASSERT_EQ (jv->get_kind (), JSON_ARRAY);
2091   const json::array *arr = as_a <const json::array *> (jv);
2092   ASSERT_EQ (arr->length (), 10);
2093   auto range = tc.get_range_for_value (jv);
2094   ASSERT_TRUE (range);
2095   ASSERT_RANGE_EQ (*range,
2096                    0, line_1, 0,
2097                    29, line_1, 29);
2098   for (int i = 0; i < 10; i++)
2099     {
2100       json::value *element = arr->get (i);
2101       ASSERT_EQ (element->get_kind (), JSON_INTEGER);
2102       ASSERT_EQ (as_a <json::integer_number *> (element)->get (), i);
2103       range = tc.get_range_for_value (element);
2104       ASSERT_TRUE (range);
2105       const int offset = 1 + (i * 3);
2106       ASSERT_RANGE_EQ (*range,
2107                        offset, line_1, offset,
2108                        offset, line_1, offset);
2109     }
2110   ASSERT_PRINT_EQ (*jv, false, "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]");
2111 }
2112
2113 /* Verify that JSON object parsing works.  */
2114
2115 static void
2116 test_parse_object ()
2117 {
2118   const int line_1 = 1;
2119   std::unique_ptr<error> err;
2120   /*                   0            1            2         3  .  */
2121   /*                   01 2345 678 9012 345 6789 0123456789012.  */
2122   parser_testcase tc ("{\"foo\": \"bar\", \"baz\": [42, null]}");
2123
2124   ASSERT_EQ (tc.get_error (), nullptr);
2125   const json::value *jv = tc.get_value ();
2126   ASSERT_NE (jv, nullptr);
2127   ASSERT_EQ (jv->get_kind (), JSON_OBJECT);
2128   auto range = tc.get_range_for_value (jv);
2129   ASSERT_TRUE (range);
2130   ASSERT_RANGE_EQ (*range,
2131                    0, line_1, 0,
2132                    32, line_1, 32);
2133   const json::object *jo = static_cast <const json::object *> (jv);
2134
2135   json::value *foo_value = jo->get ("foo");
2136   ASSERT_NE (foo_value, nullptr);
2137   ASSERT_EQ (foo_value->get_kind (), JSON_STRING);
2138   ASSERT_STREQ (as_a <json::string *> (foo_value)->get_string (), "bar");
2139   range = tc.get_range_for_value (foo_value);
2140   ASSERT_TRUE (range);
2141   ASSERT_RANGE_EQ (*range,
2142                    8, line_1, 8,
2143                    12, line_1, 12);
2144
2145   json::value *baz_value = jo->get ("baz");
2146   ASSERT_NE (baz_value, nullptr);
2147   ASSERT_EQ (baz_value->get_kind (), JSON_ARRAY);
2148   range = tc.get_range_for_value (baz_value);
2149   ASSERT_TRUE (range);
2150   ASSERT_RANGE_EQ (*range,
2151                    22, line_1, 22,
2152                    31, line_1, 31);
2153
2154   json::array *baz_array = as_a <json::array *> (baz_value);
2155   ASSERT_EQ (baz_array->length (), 2);
2156
2157   json::value *element0 = baz_array->get (0);
2158   ASSERT_EQ (as_a <json::integer_number *> (element0)->get (), 42);
2159   range = tc.get_range_for_value (element0);
2160   ASSERT_TRUE (range);
2161   ASSERT_RANGE_EQ (*range,
2162                    23, line_1, 23,
2163                    24, line_1, 24);
2164
2165   json::value *element1 = baz_array->get (1);
2166   ASSERT_EQ (element1->get_kind (), JSON_NULL);
2167   range = tc.get_range_for_value (element1);
2168   ASSERT_TRUE (range);
2169   ASSERT_RANGE_EQ (*range,
2170                    27, line_1, 27,
2171                    30, line_1, 30);
2172 }
2173
2174 /* Verify that the JSON literals "true", "false" and "null" are parsed
2175    correctly.  */
2176
2177 static void
2178 test_parse_literals ()
2179 {
2180   const int line_1 = 1;
2181   {
2182     parser_testcase tc ("true");
2183     ASSERT_EQ (tc.get_error (), nullptr);
2184     const json::value *jv = tc.get_value ();
2185     ASSERT_NE (jv, nullptr);
2186     ASSERT_EQ (jv->get_kind (), JSON_TRUE);
2187     ASSERT_PRINT_EQ (*jv, false, "true");
2188     auto range = tc.get_range_for_value (jv);
2189     ASSERT_TRUE (range);
2190     ASSERT_RANGE_EQ (*range,
2191                      0, line_1, 0,
2192                      3, line_1, 3);
2193   }
2194
2195   {
2196     parser_testcase tc ("false");
2197     ASSERT_EQ (tc.get_error (), nullptr);
2198     const json::value *jv = tc.get_value ();
2199     ASSERT_NE (jv, nullptr);
2200     ASSERT_EQ (jv->get_kind (), JSON_FALSE);
2201     ASSERT_PRINT_EQ (*jv, false, "false");
2202     auto range = tc.get_range_for_value (jv);
2203     ASSERT_TRUE (range);
2204     ASSERT_RANGE_EQ (*range,
2205                      0, line_1, 0,
2206                      4, line_1, 4);
2207   }
2208
2209   {
2210     parser_testcase tc ("null");
2211     ASSERT_EQ (tc.get_error (), nullptr);
2212     const json::value *jv = tc.get_value ();
2213     ASSERT_NE (jv, nullptr);
2214     ASSERT_EQ (jv->get_kind (), JSON_NULL);
2215     ASSERT_PRINT_EQ (*jv, false, "null");
2216     auto range = tc.get_range_for_value (jv);
2217     ASSERT_TRUE (range);
2218     ASSERT_RANGE_EQ (*range,
2219                      0, line_1, 0,
2220                      3, line_1, 3);
2221   }
2222 }
2223
2224 /* Verify that we can parse a simple JSON-RPC request.  */
2225
2226 static void
2227 test_parse_jsonrpc ()
2228 {
2229   std::unique_ptr<error> err;
2230   const char *request
2231     /*  0           1            2           3          4.  */
2232     /*  01 23456789 012 3456 789 0123456 789 012345678 90.  */
2233     = ("{\"jsonrpc\": \"2.0\", \"method\": \"subtract\",\n"
2234     /*  0           1         2           3          4.  */
2235     /*  0 1234567 8901234567890 1234 56789012345678 90.  */
2236        " \"params\": [42, 23], \"id\": 1}");
2237   const int line_1 = 1;
2238   const int line_2 = 2;
2239   const size_t line_2_offset = 41;
2240   parser_testcase tc (request);
2241   ASSERT_EQ (tc.get_error (), nullptr);
2242   const json::value *jv = tc.get_value ();
2243   ASSERT_NE (jv, nullptr);
2244   auto range = tc.get_range_for_value (jv);
2245   ASSERT_TRUE (range);
2246   ASSERT_RANGE_EQ (*range,
2247                    0, line_1, 0,
2248                    line_2_offset + 28, line_2, 28);
2249 }
2250
2251 /* Verify that we can parse an empty JSON object.  */
2252
2253 static void
2254 test_parse_empty_object ()
2255 {
2256   const int line_1 = 1;
2257   std::unique_ptr<error> err;
2258   parser_testcase tc ("{}");
2259   ASSERT_EQ (tc.get_error (), nullptr);
2260   const json::value *jv = tc.get_value ();
2261   ASSERT_NE (jv, nullptr);
2262   ASSERT_EQ (jv->get_kind (), JSON_OBJECT);
2263   ASSERT_PRINT_EQ (*jv, true, "{}");
2264   auto range = tc.get_range_for_value (jv);
2265   ASSERT_TRUE (range);
2266   ASSERT_RANGE_EQ (*range,
2267                    0, line_1, 0,
2268                    1, line_1, 1);
2269 }
2270
2271 /* Verify that comment-parsing can be enabled or disabled.  */
2272
2273 static void
2274 test_parsing_comments ()
2275 {
2276   const char *str = ("// foo\n"
2277                      "/*...\n"
2278                      "...*/ 42 // bar\n"
2279                      "/* etc */\n");
2280
2281   /* Parsing with comment support disabled.  */
2282   {
2283     parser_testcase tc (str);
2284     ASSERT_NE (tc.get_error (), nullptr);
2285     ASSERT_STREQ (tc.get_error ()->get_msg (),
2286                   "invalid JSON token: unexpected character: '/'");
2287     ASSERT_EQ (tc.get_value (), nullptr);
2288   }
2289
2290   /* Parsing with comment support enabled.  */
2291   {
2292     parser_testcase tc (str, true);
2293     ASSERT_EQ (tc.get_error (), nullptr);
2294     const json::value *jv = tc.get_value ();
2295     ASSERT_NE (jv, nullptr);
2296     ASSERT_EQ (jv->get_kind (), JSON_INTEGER);
2297     ASSERT_EQ (((const json::integer_number *)jv)->get (), 42);
2298   }
2299 }
2300
2301 /* Verify that we can parse an empty JSON string.  */
2302
2303 static void
2304 test_error_empty_string ()
2305 {
2306   const int line_1 = 1;
2307   parser_testcase tc ("");
2308   ASSERT_ERR_EQ (tc.get_error (),
2309                  0, line_1, 0,
2310                  0, line_1, 0,
2311                  "expected a JSON value but got EOF");
2312   ASSERT_EQ (tc.get_value (), nullptr);
2313 }
2314
2315 /* Verify that JSON parsing gracefully handles an invalid token.  */
2316
2317 static void
2318 test_error_bad_token ()
2319 {
2320   const int line_1 = 1;
2321   parser_testcase tc ("  not valid ");
2322   ASSERT_ERR_EQ (tc.get_error (),
2323                  2, line_1, 2,
2324                  2, line_1, 2,
2325                  "invalid JSON token: unexpected character: 'n'");
2326   ASSERT_EQ (tc.get_value (), nullptr);
2327 }
2328
2329 /* Verify that JSON parsing gracefully handles a missing comma
2330    within an object.  */
2331
2332 static void
2333 test_error_object_with_missing_comma ()
2334 {
2335   const int line_1 = 1;
2336   /*                  0           1           2.  */
2337   /*                  01 2345 6789012 3456 7890.  */
2338   const char *json = "{\"foo\" : 42 \"bar\"";
2339   parser_testcase tc (json);
2340   ASSERT_ERR_EQ (tc.get_error (),
2341                  12, line_1, 12,
2342                  16, line_1, 16,
2343                  "expected ',' or '}'; got string");
2344   ASSERT_EQ (tc.get_value (), nullptr);
2345 }
2346
2347 /* Verify that JSON parsing gracefully handles a missing comma
2348    within an array.  */
2349
2350 static void
2351 test_error_array_with_missing_comma ()
2352 {
2353   const int line_1 = 1;
2354   /*                  01234567.  */
2355   const char *json = "[0, 1 42]";
2356   parser_testcase tc (json);
2357   ASSERT_ERR_EQ (tc.get_error (),
2358                  6, line_1, 6,
2359                  7, line_1, 7,
2360                  "expected ',' or ']'; got number");
2361   ASSERT_EQ (tc.get_value (), nullptr);
2362 }
2363
2364 /* Run all of the selftests within this file.  */
2365
2366 void
2367 json_parser_cc_tests ()
2368 {
2369   test_lexer ();
2370   test_lexing_unsupported_single_line_comment ();
2371   test_lexing_unsupported_multiline_comment ();
2372   test_lexing_supported_single_line_comment ();
2373   test_lexing_supported_multiline_comment ();
2374   test_parse_string ();
2375   test_parse_number ();
2376   test_parse_array ();
2377   test_parse_object ();
2378   test_parse_literals ();
2379   test_parse_jsonrpc ();
2380   test_parse_empty_object ();
2381   test_parsing_comments ();
2382   test_error_empty_string ();
2383   test_error_bad_token ();
2384   test_error_object_with_missing_comma ();
2385   test_error_array_with_missing_comma ();
2386 }
2387
2388 } // namespace selftest
2389
2390 #endif /* #if CHECKING_P */