jsonpath scanner: reentrant scanner
[pgsql.git] / src / common / jsonapi.c
blob0e2a82ad7a1fa98945ceba8d64526c2449b1fdc9
1 /*-------------------------------------------------------------------------
3 * jsonapi.c
4 * JSON parser and lexer interfaces
6 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
9 * IDENTIFICATION
10 * src/common/jsonapi.c
12 *-------------------------------------------------------------------------
14 #ifndef FRONTEND
15 #include "postgres.h"
16 #else
17 #include "postgres_fe.h"
18 #endif
20 #include "common/jsonapi.h"
21 #include "mb/pg_wchar.h"
22 #include "port/pg_lfind.h"
24 #ifdef JSONAPI_USE_PQEXPBUFFER
25 #include "pqexpbuffer.h"
26 #else
27 #include "lib/stringinfo.h"
28 #include "miscadmin.h"
29 #endif
/*
 * By default, we will use palloc/pfree along with StringInfo.  In libpq,
 * use malloc and PQExpBuffer, and return JSON_OUT_OF_MEMORY on out-of-memory.
 */
#ifdef JSONAPI_USE_PQEXPBUFFER

#define STRDUP(s) strdup(s)
#define ALLOC(size) malloc(size)
#define ALLOC0(size) calloc(1, size)
#define REALLOC realloc
#define FREE(s) free(s)

#define jsonapi_appendStringInfo			appendPQExpBuffer
#define jsonapi_appendBinaryStringInfo		appendBinaryPQExpBuffer
#define jsonapi_appendStringInfoChar		appendPQExpBufferChar
/* XXX should we add a macro version to PQExpBuffer? */
#define jsonapi_appendStringInfoCharMacro	appendPQExpBufferChar
#define jsonapi_makeStringInfo				createPQExpBuffer
#define jsonapi_initStringInfo				initPQExpBuffer
#define jsonapi_resetStringInfo				resetPQExpBuffer
#define jsonapi_termStringInfo				termPQExpBuffer
#define jsonapi_destroyStringInfo			destroyPQExpBuffer

#else							/* !JSONAPI_USE_PQEXPBUFFER */

#define STRDUP(s) pstrdup(s)
#define ALLOC(size) palloc(size)
#define ALLOC0(size) palloc0(size)
#define REALLOC repalloc

#ifdef FRONTEND
#define FREE pfree
#else
/*
 * Backend pfree() doesn't handle NULL pointers like the frontend's does; smooth
 * that over to reduce mental gymnastics. Avoid multiple evaluation of the macro
 * argument to avoid future hair-pulling.
 */
#define FREE(s) do { \
	void	   *__v = (s); \
	if (__v) \
		pfree(__v); \
} while (0)
#endif

#define jsonapi_appendStringInfo			appendStringInfo
#define jsonapi_appendBinaryStringInfo		appendBinaryStringInfo
#define jsonapi_appendStringInfoChar		appendStringInfoChar
#define jsonapi_appendStringInfoCharMacro	appendStringInfoCharMacro
#define jsonapi_makeStringInfo				makeStringInfo
#define jsonapi_initStringInfo				initStringInfo
#define jsonapi_resetStringInfo				resetStringInfo
/* StringInfo has no term counterpart; freeing ->data is equivalent */
#define jsonapi_termStringInfo(s)			pfree((s)->data)
#define jsonapi_destroyStringInfo			destroyStringInfo

#endif							/* JSONAPI_USE_PQEXPBUFFER */
/*
 * The context of the parser is maintained by the recursive descent
 * mechanism, but is passed explicitly to the error reporting routine
 * for better diagnostics.
 */
typedef enum					/* contexts of JSON parser */
{
	JSON_PARSE_VALUE,			/* expecting a value */
	JSON_PARSE_STRING,			/* expecting a string (for a field name) */
	JSON_PARSE_ARRAY_START,		/* saw '[', expecting value or ']' */
	JSON_PARSE_ARRAY_NEXT,		/* saw array element, expecting ',' or ']' */
	JSON_PARSE_OBJECT_START,	/* saw '{', expecting label or '}' */
	JSON_PARSE_OBJECT_LABEL,	/* saw object label, expecting ':' */
	JSON_PARSE_OBJECT_NEXT,		/* saw object value, expecting ',' or '}' */
	JSON_PARSE_OBJECT_COMMA,	/* saw object ',', expecting next label */
	JSON_PARSE_END,				/* saw the end of a document, expect nothing */
} JsonParseContext;
/*
 * Setup for table-driven parser.
 * These enums need to be separate from the JsonTokenType and from each other
 * so we can have all of them on the prediction stack, which consists of
 * tokens, non-terminals, and semantic action markers.
 */

/* Non-terminals start at 32 (0x20) so IS_NT() can test that bit. */
enum JsonNonTerminal
{
	JSON_NT_JSON = 32,
	JSON_NT_ARRAY_ELEMENTS,
	JSON_NT_MORE_ARRAY_ELEMENTS,
	JSON_NT_KEY_PAIRS,
	JSON_NT_MORE_KEY_PAIRS,
};

/* Semantic action markers start at 64 (0x40) so IS_SEM() can test that bit. */
enum JsonParserSem
{
	JSON_SEM_OSTART = 64,
	JSON_SEM_OEND,
	JSON_SEM_ASTART,
	JSON_SEM_AEND,
	JSON_SEM_OFIELD_INIT,
	JSON_SEM_OFIELD_START,
	JSON_SEM_OFIELD_END,
	JSON_SEM_AELEM_START,
	JSON_SEM_AELEM_END,
	JSON_SEM_SCALAR_INIT,
	JSON_SEM_SCALAR_CALL,
};
/*
 * struct containing the 3 stacks used in non-recursive parsing,
 * and the token and value for scalars that need to be preserved
 * across calls.
 *
 * typedef appears in jsonapi.h
 */
struct JsonParserStack
{
	int			stack_size;		/* allocated entries in the stacks below */
	char	   *prediction;		/* LL(1) prediction stack (tokens/NTs/sems) */
	size_t		pred_index;		/* next free slot in prediction */
	/* these two are indexed by lex_level */
	char	  **fnames;			/* field name saved at each nesting level */
	bool	   *fnull;			/* is-null flag saved at each nesting level */
	JsonTokenType scalar_tok;	/* token type of pending scalar */
	char	   *scalar_val;		/* saved scalar value, owned by this struct */
};
/*
 * struct containing state used when there is a possible partial token at the
 * end of a json chunk when we are doing incremental parsing.
 *
 * typedef appears in jsonapi.h
 */
struct JsonIncrementalState
{
	bool		started;		/* has parsing begun? guards flag changes */
	bool		is_last_chunk;	/* true once the final chunk is supplied */
	bool		partial_completed;	/* partial_token was completed by new chunk */
	jsonapi_StrValType partial_token;	/* accumulator for a split token */
};
/*
 * constants and macros used in the nonrecursive parser
 */
#define JSON_NUM_TERMINALS 13
#define JSON_NUM_NONTERMINALS 5
#define JSON_NT_OFFSET JSON_NT_JSON
/*
 * for indexing the table; fully parenthesized so the expansion is safe in
 * any expression context, not just array subscripts
 */
#define OFS(NT) ((NT) - JSON_NT_OFFSET)
/* classify items we get off the stack */
#define IS_SEM(x) ((x) & 0x40)
#define IS_NT(x) ((x) & 0x20)
/*
 * These productions are stored in reverse order right to left so that when
 * they are pushed on the stack what we expect next is at the top of the stack.
 */
static char JSON_PROD_EPSILON[] = {0};	/* epsilon - an empty production */

/* JSON -> string */
static char JSON_PROD_SCALAR_STRING[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_STRING, JSON_SEM_SCALAR_INIT, 0};

/* JSON -> number */
static char JSON_PROD_SCALAR_NUMBER[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_NUMBER, JSON_SEM_SCALAR_INIT, 0};

/* JSON -> 'true' */
static char JSON_PROD_SCALAR_TRUE[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_TRUE, JSON_SEM_SCALAR_INIT, 0};

/* JSON -> 'false' */
static char JSON_PROD_SCALAR_FALSE[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_FALSE, JSON_SEM_SCALAR_INIT, 0};

/* JSON -> 'null' */
static char JSON_PROD_SCALAR_NULL[] = {JSON_SEM_SCALAR_CALL, JSON_TOKEN_NULL, JSON_SEM_SCALAR_INIT, 0};

/* JSON -> '{' KEY_PAIRS '}' */
static char JSON_PROD_OBJECT[] = {JSON_SEM_OEND, JSON_TOKEN_OBJECT_END, JSON_NT_KEY_PAIRS, JSON_TOKEN_OBJECT_START, JSON_SEM_OSTART, 0};

/* JSON -> '[' ARRAY_ELEMENTS ']' */
static char JSON_PROD_ARRAY[] = {JSON_SEM_AEND, JSON_TOKEN_ARRAY_END, JSON_NT_ARRAY_ELEMENTS, JSON_TOKEN_ARRAY_START, JSON_SEM_ASTART, 0};

/* ARRAY_ELEMENTS -> JSON MORE_ARRAY_ELEMENTS */
static char JSON_PROD_ARRAY_ELEMENTS[] = {JSON_NT_MORE_ARRAY_ELEMENTS, JSON_SEM_AELEM_END, JSON_NT_JSON, JSON_SEM_AELEM_START, 0};

/* MORE_ARRAY_ELEMENTS -> ',' JSON MORE_ARRAY_ELEMENTS */
static char JSON_PROD_MORE_ARRAY_ELEMENTS[] = {JSON_NT_MORE_ARRAY_ELEMENTS, JSON_SEM_AELEM_END, JSON_NT_JSON, JSON_SEM_AELEM_START, JSON_TOKEN_COMMA, 0};

/* KEY_PAIRS -> string ':' JSON MORE_KEY_PAIRS */
static char JSON_PROD_KEY_PAIRS[] = {JSON_NT_MORE_KEY_PAIRS, JSON_SEM_OFIELD_END, JSON_NT_JSON, JSON_SEM_OFIELD_START, JSON_TOKEN_COLON, JSON_TOKEN_STRING, JSON_SEM_OFIELD_INIT, 0};

/* MORE_KEY_PAIRS -> ',' string ':' JSON MORE_KEY_PAIRS */
static char JSON_PROD_MORE_KEY_PAIRS[] = {JSON_NT_MORE_KEY_PAIRS, JSON_SEM_OFIELD_END, JSON_NT_JSON, JSON_SEM_OFIELD_START, JSON_TOKEN_COLON, JSON_TOKEN_STRING, JSON_SEM_OFIELD_INIT, JSON_TOKEN_COMMA, 0};
/*
 * Note: there are also epsilon productions for ARRAY_ELEMENTS,
 * MORE_ARRAY_ELEMENTS, KEY_PAIRS and MORE_KEY_PAIRS
 * They are all the same as none require any semantic actions.
 */

/*
 * Table connecting the productions with their director sets of
 * terminal symbols.
 * Any combination not specified here represents an error.
 */

typedef struct
{
	size_t		len;			/* production length, excluding the 0 sentinel */
	char	   *prod;			/* pointer to the (reversed) production */
} td_entry;

#define TD_ENTRY(PROD) { sizeof(PROD) - 1, (PROD) }

/* indexed by [non-terminal offset][lookahead terminal]; NULL prod = error */
static td_entry td_parser_table[JSON_NUM_NONTERMINALS][JSON_NUM_TERMINALS] =
{
	/* JSON */
	[OFS(JSON_NT_JSON)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_SCALAR_STRING),
	[OFS(JSON_NT_JSON)][JSON_TOKEN_NUMBER] = TD_ENTRY(JSON_PROD_SCALAR_NUMBER),
	[OFS(JSON_NT_JSON)][JSON_TOKEN_TRUE] = TD_ENTRY(JSON_PROD_SCALAR_TRUE),
	[OFS(JSON_NT_JSON)][JSON_TOKEN_FALSE] = TD_ENTRY(JSON_PROD_SCALAR_FALSE),
	[OFS(JSON_NT_JSON)][JSON_TOKEN_NULL] = TD_ENTRY(JSON_PROD_SCALAR_NULL),
	[OFS(JSON_NT_JSON)][JSON_TOKEN_ARRAY_START] = TD_ENTRY(JSON_PROD_ARRAY),
	[OFS(JSON_NT_JSON)][JSON_TOKEN_OBJECT_START] = TD_ENTRY(JSON_PROD_OBJECT),
	/* ARRAY_ELEMENTS */
	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_START] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_OBJECT_START] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NUMBER] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_TRUE] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_FALSE] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_NULL] = TD_ENTRY(JSON_PROD_ARRAY_ELEMENTS),
	[OFS(JSON_NT_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = TD_ENTRY(JSON_PROD_EPSILON),
	/* MORE_ARRAY_ELEMENTS */
	[OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_COMMA] = TD_ENTRY(JSON_PROD_MORE_ARRAY_ELEMENTS),
	[OFS(JSON_NT_MORE_ARRAY_ELEMENTS)][JSON_TOKEN_ARRAY_END] = TD_ENTRY(JSON_PROD_EPSILON),
	/* KEY_PAIRS */
	[OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_STRING] = TD_ENTRY(JSON_PROD_KEY_PAIRS),
	[OFS(JSON_NT_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = TD_ENTRY(JSON_PROD_EPSILON),
	/* MORE_KEY_PAIRS */
	[OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_COMMA] = TD_ENTRY(JSON_PROD_MORE_KEY_PAIRS),
	[OFS(JSON_NT_MORE_KEY_PAIRS)][JSON_TOKEN_OBJECT_END] = TD_ENTRY(JSON_PROD_EPSILON),
};

/* the GOAL production. Not stored in the table, but will be the initial contents of the prediction stack */
static char JSON_PROD_GOAL[] = {JSON_TOKEN_END, JSON_NT_JSON, 0};
/* forward declarations for lexer/parser internals defined later in the file */
static inline JsonParseErrorType json_lex_string(JsonLexContext *lex);
static inline JsonParseErrorType json_lex_number(JsonLexContext *lex, const char *s,
												 bool *num_err, size_t *total_len);
static inline JsonParseErrorType parse_scalar(JsonLexContext *lex, const JsonSemAction *sem);
static JsonParseErrorType parse_object_field(JsonLexContext *lex, const JsonSemAction *sem);
static JsonParseErrorType parse_object(JsonLexContext *lex, const JsonSemAction *sem);
static JsonParseErrorType parse_array_element(JsonLexContext *lex, const JsonSemAction *sem);
static JsonParseErrorType parse_array(JsonLexContext *lex, const JsonSemAction *sem);
static JsonParseErrorType report_parse_error(JsonParseContext ctx, JsonLexContext *lex);
static bool allocate_incremental_state(JsonLexContext *lex);
static inline void set_fname(JsonLexContext *lex, char *fname);

/* the null action object used for pure validation */
const JsonSemAction nullSemAction =
{
	NULL, NULL, NULL, NULL, NULL,
	NULL, NULL, NULL, NULL, NULL
};

/*
 * sentinels used for out-of-memory conditions; returning these lets shlib
 * callers defer OOM detection to time of use instead of failing immediately
 */
static JsonLexContext failed_oom;
static JsonIncrementalState failed_inc_oom;
/* Parser support routines */

/*
 * lex_peek
 *
 * what is the current look_ahead token?
 */
static inline JsonTokenType
lex_peek(JsonLexContext *lex)
{
	return lex->token_type;
}
311 * lex_expect
313 * move the lexer to the next token if the current look_ahead token matches
314 * the parameter token. Otherwise, report an error.
316 static inline JsonParseErrorType
317 lex_expect(JsonParseContext ctx, JsonLexContext *lex, JsonTokenType token)
319 if (lex_peek(lex) == token)
320 return json_lex(lex);
321 else
322 return report_parse_error(ctx, lex);
/* chars to consider as part of an alphanumeric token */
/* (IS_HIGHBIT_SET admits any multibyte-encoded character as well) */
#define JSON_ALPHANUMERIC_CHAR(c) \
	(((c) >= 'a' && (c) <= 'z') || \
	 ((c) >= 'A' && (c) <= 'Z') || \
	 ((c) >= '0' && (c) <= '9') || \
	 (c) == '_' || \
	 IS_HIGHBIT_SET(c))
334 * Utility function to check if a string is a valid JSON number.
336 * str is of length len, and need not be null-terminated.
338 bool
339 IsValidJsonNumber(const char *str, size_t len)
341 bool numeric_error;
342 size_t total_len;
343 JsonLexContext dummy_lex = {0};
345 if (len <= 0)
346 return false;
349 * json_lex_number expects a leading '-' to have been eaten already.
351 * having to cast away the constness of str is ugly, but there's not much
352 * easy alternative.
354 if (*str == '-')
356 dummy_lex.input = str + 1;
357 dummy_lex.input_length = len - 1;
359 else
361 dummy_lex.input = str;
362 dummy_lex.input_length = len;
365 dummy_lex.token_start = dummy_lex.input;
367 json_lex_number(&dummy_lex, dummy_lex.input, &numeric_error, &total_len);
369 return (!numeric_error) && (total_len == dummy_lex.input_length);
/*
 * makeJsonLexContextCstringLen
 *		Initialize the given JsonLexContext object, or create one
 *
 * If a valid 'lex' pointer is given, it is initialized.  This can
 * be used for stack-allocated structs, saving overhead.  If NULL is
 * given, a new struct is allocated.
 *
 * If need_escapes is true, ->strval stores the unescaped lexemes.
 * Unescaping is expensive, so only request it when necessary.
 *
 * If need_escapes is true or lex was given as NULL, then caller is
 * responsible for freeing the returned struct, either by calling
 * freeJsonLexContext() or (in backend environment) via memory context
 * cleanup.
 *
 * In shlib code, any out-of-memory failures will be deferred to time
 * of use; this function is guaranteed to return a valid JsonLexContext.
 */
JsonLexContext *
makeJsonLexContextCstringLen(JsonLexContext *lex, const char *json,
							 size_t len, int encoding, bool need_escapes)
{
	if (lex == NULL)
	{
		lex = ALLOC0(sizeof(JsonLexContext));
		if (!lex)
			return &failed_oom;	/* sentinel; caller sees OOM at time of use */
		lex->flags |= JSONLEX_FREE_STRUCT;
	}
	else
		memset(lex, 0, sizeof(JsonLexContext));

	lex->errormsg = NULL;
	lex->input = lex->token_terminator = lex->line_start = json;
	lex->line_number = 1;
	lex->input_length = len;
	lex->input_encoding = encoding;
	lex->need_escapes = need_escapes;
	if (need_escapes)
	{
		/*
		 * This call can fail in shlib code. We defer error handling to time
		 * of use (json_lex_string()) since we might not need to parse any
		 * strings anyway.
		 */
		lex->strval = jsonapi_makeStringInfo();
		lex->flags |= JSONLEX_FREE_STRVAL;
	}

	return lex;
}
/*
 * Allocates the internal bookkeeping structures for incremental parsing. This
 * can only fail in-band with shlib code.
 */
#define JS_STACK_CHUNK_SIZE 64
#define JS_MAX_PROD_LEN 10		/* more than we need */
#define JSON_TD_MAX_STACK 6400	/* hard coded for now - this is a REALLY high
								 * number */
static bool
allocate_incremental_state(JsonLexContext *lex)
{
	void	   *pstack,
			   *prediction,
			   *fnames,
			   *fnull;

	/* in backend code these allocations throw on OOM; in shlib they can fail */
	lex->inc_state = ALLOC0(sizeof(JsonIncrementalState));
	pstack = ALLOC0(sizeof(JsonParserStack));
	prediction = ALLOC(JS_STACK_CHUNK_SIZE * JS_MAX_PROD_LEN);
	fnames = ALLOC(JS_STACK_CHUNK_SIZE * sizeof(char *));
	fnull = ALLOC(JS_STACK_CHUNK_SIZE * sizeof(bool));

#ifdef JSONAPI_USE_PQEXPBUFFER
	if (!lex->inc_state
		|| !pstack
		|| !prediction
		|| !fnames
		|| !fnull)
	{
		/* FREE() is NULL-safe, so release whatever subset succeeded */
		FREE(lex->inc_state);
		FREE(pstack);
		FREE(prediction);
		FREE(fnames);
		FREE(fnull);

		lex->inc_state = &failed_inc_oom;
		return false;
	}
#endif

	jsonapi_initStringInfo(&(lex->inc_state->partial_token));
	lex->pstack = pstack;
	lex->pstack->stack_size = JS_STACK_CHUNK_SIZE;
	lex->pstack->prediction = prediction;
	lex->pstack->fnames = fnames;
	lex->pstack->fnull = fnull;

	/*
	 * fnames between 0 and lex_level must always be defined so that
	 * freeJsonLexContext() can handle them safely. inc/dec_lex_level() handle
	 * the rest.
	 */
	Assert(lex->lex_level == 0);
	lex->pstack->fnames[0] = NULL;

	lex->incremental = true;
	return true;
}
/*
 * makeJsonLexContextIncremental
 *
 * Similar to above but set up for use in incremental parsing. That means we
 * need explicit stacks for predictions, field names and null indicators, but
 * we don't need the input, that will be handed in bit by bit to the
 * parse routine. We also need an accumulator for partial tokens in case
 * the boundary between chunks happens to fall in the middle of a token.
 *
 * In shlib code, any out-of-memory failures will be deferred to time of use;
 * this function is guaranteed to return a valid JsonLexContext.
 */
JsonLexContext *
makeJsonLexContextIncremental(JsonLexContext *lex, int encoding,
							  bool need_escapes)
{
	if (lex == NULL)
	{
		lex = ALLOC0(sizeof(JsonLexContext));
		if (!lex)
			return &failed_oom;

		lex->flags |= JSONLEX_FREE_STRUCT;
	}
	else
		memset(lex, 0, sizeof(JsonLexContext));

	lex->line_number = 1;
	lex->input_encoding = encoding;

	if (!allocate_incremental_state(lex))
	{
		if (lex->flags & JSONLEX_FREE_STRUCT)
		{
			FREE(lex);
			return &failed_oom;
		}

		/* lex->inc_state tracks the OOM failure; we can return here. */
		return lex;
	}

	lex->need_escapes = need_escapes;
	if (need_escapes)
	{
		/*
		 * This call can fail in shlib code. We defer error handling to time
		 * of use (json_lex_string()) since we might not need to parse any
		 * strings anyway.
		 */
		lex->strval = jsonapi_makeStringInfo();
		lex->flags |= JSONLEX_FREE_STRVAL;
	}

	return lex;
}
542 void
543 setJsonLexContextOwnsTokens(JsonLexContext *lex, bool owned_by_context)
545 if (lex->incremental && lex->inc_state->started)
548 * Switching this flag after parsing has already started is a
549 * programming error.
551 Assert(false);
552 return;
555 if (owned_by_context)
556 lex->flags |= JSONLEX_CTX_OWNS_TOKENS;
557 else
558 lex->flags &= ~JSONLEX_CTX_OWNS_TOKENS;
/*
 * Increase the parser's nesting depth by one, growing the incremental
 * parser's stacks if needed.  Returns false only on OOM in shlib code
 * (backend REALLOC throws instead).  Note that on a partial-growth failure
 * the already-reallocated stacks are kept, so the context stays consistent.
 */
static inline bool
inc_lex_level(JsonLexContext *lex)
{
	if (lex->incremental && (lex->lex_level + 1) >= lex->pstack->stack_size)
	{
		size_t		new_stack_size;
		char	   *new_prediction;
		char	  **new_fnames;
		bool	   *new_fnull;

		new_stack_size = lex->pstack->stack_size + JS_STACK_CHUNK_SIZE;

		new_prediction = REALLOC(lex->pstack->prediction,
								 new_stack_size * JS_MAX_PROD_LEN);
#ifdef JSONAPI_USE_PQEXPBUFFER
		if (!new_prediction)
			return false;
#endif
		lex->pstack->prediction = new_prediction;

		new_fnames = REALLOC(lex->pstack->fnames,
							 new_stack_size * sizeof(char *));
#ifdef JSONAPI_USE_PQEXPBUFFER
		if (!new_fnames)
			return false;
#endif
		lex->pstack->fnames = new_fnames;

		new_fnull = REALLOC(lex->pstack->fnull, new_stack_size * sizeof(bool));
#ifdef JSONAPI_USE_PQEXPBUFFER
		if (!new_fnull)
			return false;
#endif
		lex->pstack->fnull = new_fnull;

		/* only bump the recorded size once every stack has grown */
		lex->pstack->stack_size = new_stack_size;
	}

	lex->lex_level += 1;

	if (lex->incremental)
	{
		/*
		 * Ensure freeJsonLexContext() remains safe even if no fname is
		 * assigned at this level.
		 */
		lex->pstack->fnames[lex->lex_level] = NULL;
	}

	return true;
}
/*
 * Decrease the parser's nesting depth by one, releasing the current level's
 * saved field name first (when the context owns tokens).
 */
static inline void
dec_lex_level(JsonLexContext *lex)
{
	set_fname(lex, NULL);		/* free the current level's fname, if needed */
	lex->lex_level -= 1;
}
620 static inline void
621 push_prediction(JsonParserStack *pstack, td_entry entry)
623 memcpy(pstack->prediction + pstack->pred_index, entry.prod, entry.len);
624 pstack->pred_index += entry.len;
627 static inline char
628 pop_prediction(JsonParserStack *pstack)
630 Assert(pstack->pred_index > 0);
631 return pstack->prediction[--pstack->pred_index];
634 static inline char
635 next_prediction(JsonParserStack *pstack)
637 Assert(pstack->pred_index > 0);
638 return pstack->prediction[pstack->pred_index - 1];
641 static inline bool
642 have_prediction(JsonParserStack *pstack)
644 return pstack->pred_index > 0;
647 static inline void
648 set_fname(JsonLexContext *lex, char *fname)
650 if (lex->flags & JSONLEX_CTX_OWNS_TOKENS)
653 * Don't leak prior fnames. If one hasn't been assigned yet,
654 * inc_lex_level ensured that it's NULL (and therefore safe to free).
656 FREE(lex->pstack->fnames[lex->lex_level]);
659 lex->pstack->fnames[lex->lex_level] = fname;
/* Fetch the field name saved at the current nesting level (may be NULL). */
static inline char *
get_fname(JsonLexContext *lex)
{
	return lex->pstack->fnames[lex->lex_level];
}
/* Record whether the current level's value is a JSON null. */
static inline void
set_fnull(JsonLexContext *lex, bool fnull)
{
	lex->pstack->fnull[lex->lex_level] = fnull;
}
/* Fetch the is-null flag saved at the current nesting level. */
static inline bool
get_fnull(JsonLexContext *lex)
{
	return lex->pstack->fnull[lex->lex_level];
}
/*
 * Free memory in a JsonLexContext.
 *
 * There's no need for this if a *lex pointer was given when the object was
 * made, need_escapes was false, and json_errdetail() was not called; or if (in
 * backend environment) a memory context delete/reset is imminent.
 */
void
freeJsonLexContext(JsonLexContext *lex)
{
	static const JsonLexContext empty = {0};

	/* the OOM sentinel is static and must never be freed */
	if (!lex || lex == &failed_oom)
		return;

	if (lex->flags & JSONLEX_FREE_STRVAL)
		jsonapi_destroyStringInfo(lex->strval);

	if (lex->errormsg)
		jsonapi_destroyStringInfo(lex->errormsg);

	if (lex->incremental)
	{
		jsonapi_termStringInfo(&lex->inc_state->partial_token);
		FREE(lex->inc_state);
		FREE(lex->pstack->prediction);

		if (lex->flags & JSONLEX_CTX_OWNS_TOKENS)
		{
			int			i;

			/* Clean up any tokens that were left behind. */
			for (i = 0; i <= lex->lex_level; i++)
				FREE(lex->pstack->fnames[i]);
		}

		FREE(lex->pstack->fnames);
		FREE(lex->pstack->fnull);
		FREE(lex->pstack->scalar_val);
		FREE(lex->pstack);
	}

	if (lex->flags & JSONLEX_FREE_STRUCT)
		FREE(lex);
	else
		*lex = empty;			/* reset caller-owned struct to a clean state */
}
/*
 * pg_parse_json
 *
 * Publicly visible entry point for the JSON parser.
 *
 * lex is a lexing context, set up for the json to be processed by calling
 * makeJsonLexContext(). sem is a structure of function pointers to semantic
 * action routines to be called at appropriate spots during parsing, and a
 * pointer to a state object to be passed to those routines.
 *
 * If FORCE_JSON_PSTACK is defined then the routine will call the non-recursive
 * JSON parser. This is a useful way to validate that it's doing the right
 * thing at least for non-incremental cases. If this is on we expect to see
 * regression diffs relating to error messages about stack depth, but no
 * other differences.
 */
JsonParseErrorType
pg_parse_json(JsonLexContext *lex, const JsonSemAction *sem)
{
#ifdef FORCE_JSON_PSTACK
	/*
	 * We don't need partial token processing, there is only one chunk. But we
	 * still need to init the partial token string so that freeJsonLexContext
	 * works, so perform the full incremental initialization.
	 */
	if (!allocate_incremental_state(lex))
		return JSON_OUT_OF_MEMORY;

	return pg_parse_json_incremental(lex, sem, lex->input, lex->input_length, true);

#else

	JsonTokenType tok;
	JsonParseErrorType result;

	/* reject the OOM sentinel and lexers set up for incremental parsing */
	if (lex == &failed_oom)
		return JSON_OUT_OF_MEMORY;
	if (lex->incremental)
		return JSON_INVALID_LEXER_TYPE;

	/* get the initial token */
	result = json_lex(lex);
	if (result != JSON_SUCCESS)
		return result;

	tok = lex_peek(lex);

	/* parse by recursive descent */
	switch (tok)
	{
		case JSON_TOKEN_OBJECT_START:
			result = parse_object(lex, sem);
			break;
		case JSON_TOKEN_ARRAY_START:
			result = parse_array(lex, sem);
			break;
		default:
			result = parse_scalar(lex, sem);	/* json can be a bare scalar */
	}

	/* trailing garbage after the document is an error */
	if (result == JSON_SUCCESS)
		result = lex_expect(JSON_PARSE_END, lex, JSON_TOKEN_END);

	return result;
#endif
}
/*
 * json_count_array_elements
 *
 * Returns number of array elements in lex context at start of array token
 * until end of array token at same nesting level.
 *
 * Designed to be called from array_start routines.
 */
JsonParseErrorType
json_count_array_elements(JsonLexContext *lex, int *elements)
{
	JsonLexContext copylex;
	int			count;
	JsonParseErrorType result;

	if (lex == &failed_oom)
		return JSON_OUT_OF_MEMORY;

	/*
	 * It's safe to do this with a shallow copy because the lexical routines
	 * don't scribble on the input. They do scribble on the other pointers
	 * etc, so doing this with a copy makes that safe.
	 */
	memcpy(&copylex, lex, sizeof(JsonLexContext));
	copylex.need_escapes = false;	/* not interested in values here */
	copylex.lex_level++;

	count = 0;
	result = lex_expect(JSON_PARSE_ARRAY_START, &copylex,
						JSON_TOKEN_ARRAY_START);
	if (result != JSON_SUCCESS)
		return result;
	if (lex_peek(&copylex) != JSON_TOKEN_ARRAY_END)
	{
		while (1)
		{
			count++;
			result = parse_array_element(&copylex, &nullSemAction);
			if (result != JSON_SUCCESS)
				return result;
			if (copylex.token_type != JSON_TOKEN_COMMA)
				break;
			result = json_lex(&copylex);
			if (result != JSON_SUCCESS)
				return result;
		}
	}
	result = lex_expect(JSON_PARSE_ARRAY_NEXT, &copylex,
						JSON_TOKEN_ARRAY_END);
	if (result != JSON_SUCCESS)
		return result;

	/* only set the output once the whole array scanned cleanly */
	*elements = count;
	return JSON_SUCCESS;
}
852 * pg_parse_json_incremental
854 * Routine for incremental parsing of json. This uses the non-recursive top
855 * down method of the Dragon Book Algorithm 4.3. It's somewhat slower than
856 * the Recursive Descent pattern used above, so we only use it for incremental
857 * parsing of JSON.
859 * The lexing context needs to be set up by a call to
860 * makeJsonLexContextIncremental(). sem is a structure of function pointers
861 * to semantic action routines, which should function exactly as those used
862 * in the recursive descent parser.
864 * This routine can be called repeatedly with chunks of JSON. On the final
865 * chunk is_last must be set to true. len is the length of the json chunk,
866 * which does not need to be null terminated.
868 JsonParseErrorType
869 pg_parse_json_incremental(JsonLexContext *lex,
870 const JsonSemAction *sem,
871 const char *json,
872 size_t len,
873 bool is_last)
875 JsonTokenType tok;
876 JsonParseErrorType result;
877 JsonParseContext ctx = JSON_PARSE_VALUE;
878 JsonParserStack *pstack = lex->pstack;
880 if (lex == &failed_oom || lex->inc_state == &failed_inc_oom)
881 return JSON_OUT_OF_MEMORY;
882 if (!lex->incremental)
883 return JSON_INVALID_LEXER_TYPE;
885 lex->input = lex->token_terminator = lex->line_start = json;
886 lex->input_length = len;
887 lex->inc_state->is_last_chunk = is_last;
888 lex->inc_state->started = true;
890 /* get the initial token */
891 result = json_lex(lex);
892 if (result != JSON_SUCCESS)
893 return result;
895 tok = lex_peek(lex);
897 /* use prediction stack for incremental parsing */
899 if (!have_prediction(pstack))
901 td_entry goal = TD_ENTRY(JSON_PROD_GOAL);
903 push_prediction(pstack, goal);
906 while (have_prediction(pstack))
908 char top = pop_prediction(pstack);
909 td_entry entry;
912 * these first two branches are the guts of the Table Driven method
914 if (top == tok)
917 * tok can only be a terminal symbol, so top must be too. the
918 * token matches the top of the stack, so get the next token.
920 if (tok < JSON_TOKEN_END)
922 result = json_lex(lex);
923 if (result != JSON_SUCCESS)
924 return result;
925 tok = lex_peek(lex);
928 else if (IS_NT(top) && (entry = td_parser_table[OFS(top)][tok]).prod != NULL)
931 * the token is in the director set for a production of the
932 * non-terminal at the top of the stack, so push the reversed RHS
933 * of the production onto the stack.
935 push_prediction(pstack, entry);
937 else if (IS_SEM(top))
940 * top is a semantic action marker, so take action accordingly.
941 * It's important to have these markers in the prediction stack
942 * before any token they might need so we don't advance the token
943 * prematurely. Note in a couple of cases we need to do something
944 * both before and after the token.
946 switch (top)
948 case JSON_SEM_OSTART:
950 json_struct_action ostart = sem->object_start;
952 if (lex->lex_level >= JSON_TD_MAX_STACK)
953 return JSON_NESTING_TOO_DEEP;
955 if (ostart != NULL)
957 result = (*ostart) (sem->semstate);
958 if (result != JSON_SUCCESS)
959 return result;
962 if (!inc_lex_level(lex))
963 return JSON_OUT_OF_MEMORY;
965 break;
966 case JSON_SEM_OEND:
968 json_struct_action oend = sem->object_end;
970 dec_lex_level(lex);
971 if (oend != NULL)
973 result = (*oend) (sem->semstate);
974 if (result != JSON_SUCCESS)
975 return result;
978 break;
979 case JSON_SEM_ASTART:
981 json_struct_action astart = sem->array_start;
983 if (lex->lex_level >= JSON_TD_MAX_STACK)
984 return JSON_NESTING_TOO_DEEP;
986 if (astart != NULL)
988 result = (*astart) (sem->semstate);
989 if (result != JSON_SUCCESS)
990 return result;
993 if (!inc_lex_level(lex))
994 return JSON_OUT_OF_MEMORY;
996 break;
997 case JSON_SEM_AEND:
999 json_struct_action aend = sem->array_end;
1001 dec_lex_level(lex);
1002 if (aend != NULL)
1004 result = (*aend) (sem->semstate);
1005 if (result != JSON_SUCCESS)
1006 return result;
1009 break;
1010 case JSON_SEM_OFIELD_INIT:
1013 * all we do here is save out the field name. We have
1014 * to wait to get past the ':' to see if the next
1015 * value is null so we can call the semantic routine
1017 char *fname = NULL;
1018 json_ofield_action ostart = sem->object_field_start;
1019 json_ofield_action oend = sem->object_field_end;
1021 if ((ostart != NULL || oend != NULL) && lex->need_escapes)
1023 fname = STRDUP(lex->strval->data);
1024 if (fname == NULL)
1025 return JSON_OUT_OF_MEMORY;
1027 set_fname(lex, fname);
1029 break;
1030 case JSON_SEM_OFIELD_START:
1033 * the current token should be the first token of the
1034 * value
1036 bool isnull = tok == JSON_TOKEN_NULL;
1037 json_ofield_action ostart = sem->object_field_start;
1039 set_fnull(lex, isnull);
1041 if (ostart != NULL)
1043 char *fname = get_fname(lex);
1045 result = (*ostart) (sem->semstate, fname, isnull);
1046 if (result != JSON_SUCCESS)
1047 return result;
1050 break;
1051 case JSON_SEM_OFIELD_END:
1053 json_ofield_action oend = sem->object_field_end;
1055 if (oend != NULL)
1057 char *fname = get_fname(lex);
1058 bool isnull = get_fnull(lex);
1060 result = (*oend) (sem->semstate, fname, isnull);
1061 if (result != JSON_SUCCESS)
1062 return result;
1065 break;
1066 case JSON_SEM_AELEM_START:
1068 json_aelem_action astart = sem->array_element_start;
1069 bool isnull = tok == JSON_TOKEN_NULL;
1071 set_fnull(lex, isnull);
1073 if (astart != NULL)
1075 result = (*astart) (sem->semstate, isnull);
1076 if (result != JSON_SUCCESS)
1077 return result;
1080 break;
1081 case JSON_SEM_AELEM_END:
1083 json_aelem_action aend = sem->array_element_end;
1085 if (aend != NULL)
1087 bool isnull = get_fnull(lex);
1089 result = (*aend) (sem->semstate, isnull);
1090 if (result != JSON_SUCCESS)
1091 return result;
1094 break;
1095 case JSON_SEM_SCALAR_INIT:
1097 json_scalar_action sfunc = sem->scalar;
1099 pstack->scalar_val = NULL;
1101 if (sfunc != NULL)
1104 * extract the de-escaped string value, or the raw
1105 * lexeme
1108 * XXX copied from RD parser but looks like a
1109 * buglet
1111 if (tok == JSON_TOKEN_STRING)
1113 if (lex->need_escapes)
1115 pstack->scalar_val = STRDUP(lex->strval->data);
1116 if (pstack->scalar_val == NULL)
1117 return JSON_OUT_OF_MEMORY;
1120 else
1122 ptrdiff_t tlen = (lex->token_terminator - lex->token_start);
1124 pstack->scalar_val = ALLOC(tlen + 1);
1125 if (pstack->scalar_val == NULL)
1126 return JSON_OUT_OF_MEMORY;
1128 memcpy(pstack->scalar_val, lex->token_start, tlen);
1129 pstack->scalar_val[tlen] = '\0';
1131 pstack->scalar_tok = tok;
1134 break;
1135 case JSON_SEM_SCALAR_CALL:
1138 * We'd like to be able to get rid of this business of
1139 * two bits of scalar action, but we can't. It breaks
1140 * certain semantic actions which expect that when
1141 * called the lexer has consumed the item. See for
1142 * example get_scalar() in jsonfuncs.c.
1144 json_scalar_action sfunc = sem->scalar;
1146 if (sfunc != NULL)
1148 result = (*sfunc) (sem->semstate, pstack->scalar_val, pstack->scalar_tok);
1151 * Either ownership of the token passed to the
1152 * callback, or we need to free it now. Either
1153 * way, clear our pointer to it so it doesn't get
1154 * freed in the future.
1156 if (lex->flags & JSONLEX_CTX_OWNS_TOKENS)
1157 FREE(pstack->scalar_val);
1158 pstack->scalar_val = NULL;
1160 if (result != JSON_SUCCESS)
1161 return result;
1164 break;
1165 default:
1166 /* should not happen */
1167 break;
1170 else
1173 * The token didn't match the stack top if it's a terminal nor a
1174 * production for the stack top if it's a non-terminal.
1176 * Various cases here are Asserted to be not possible, as the
1177 * token would not appear at the top of the prediction stack
1178 * unless the lookahead matched.
1180 switch (top)
1182 case JSON_TOKEN_STRING:
1183 if (next_prediction(pstack) == JSON_TOKEN_COLON)
1184 ctx = JSON_PARSE_STRING;
1185 else
1187 Assert(false);
1188 ctx = JSON_PARSE_VALUE;
1190 break;
1191 case JSON_TOKEN_NUMBER:
1192 case JSON_TOKEN_TRUE:
1193 case JSON_TOKEN_FALSE:
1194 case JSON_TOKEN_NULL:
1195 case JSON_TOKEN_ARRAY_START:
1196 case JSON_TOKEN_OBJECT_START:
1197 Assert(false);
1198 ctx = JSON_PARSE_VALUE;
1199 break;
1200 case JSON_TOKEN_ARRAY_END:
1201 Assert(false);
1202 ctx = JSON_PARSE_ARRAY_NEXT;
1203 break;
1204 case JSON_TOKEN_OBJECT_END:
1205 Assert(false);
1206 ctx = JSON_PARSE_OBJECT_NEXT;
1207 break;
1208 case JSON_TOKEN_COMMA:
1209 Assert(false);
1210 if (next_prediction(pstack) == JSON_TOKEN_STRING)
1211 ctx = JSON_PARSE_OBJECT_NEXT;
1212 else
1213 ctx = JSON_PARSE_ARRAY_NEXT;
1214 break;
1215 case JSON_TOKEN_COLON:
1216 ctx = JSON_PARSE_OBJECT_LABEL;
1217 break;
1218 case JSON_TOKEN_END:
1219 ctx = JSON_PARSE_END;
1220 break;
1221 case JSON_NT_MORE_ARRAY_ELEMENTS:
1222 ctx = JSON_PARSE_ARRAY_NEXT;
1223 break;
1224 case JSON_NT_ARRAY_ELEMENTS:
1225 ctx = JSON_PARSE_ARRAY_START;
1226 break;
1227 case JSON_NT_MORE_KEY_PAIRS:
1228 ctx = JSON_PARSE_OBJECT_NEXT;
1229 break;
1230 case JSON_NT_KEY_PAIRS:
1231 ctx = JSON_PARSE_OBJECT_START;
1232 break;
1233 default:
1234 ctx = JSON_PARSE_VALUE;
1236 return report_parse_error(ctx, lex);
1240 return JSON_SUCCESS;
1244 * Recursive Descent parse routines. There is one for each structural
1245 * element in a json document:
1246 * - scalar (string, number, true, false, null)
1247 * - array ( [ ] )
1248 * - array element
1249 * - object ( { } )
1250 * - object field
1252 static inline JsonParseErrorType
1253 parse_scalar(JsonLexContext *lex, const JsonSemAction *sem)
1255 char *val = NULL;
1256 json_scalar_action sfunc = sem->scalar;
1257 JsonTokenType tok = lex_peek(lex);
1258 JsonParseErrorType result;
1260 /* a scalar must be a string, a number, true, false, or null */
1261 if (tok != JSON_TOKEN_STRING && tok != JSON_TOKEN_NUMBER &&
1262 tok != JSON_TOKEN_TRUE && tok != JSON_TOKEN_FALSE &&
1263 tok != JSON_TOKEN_NULL)
1264 return report_parse_error(JSON_PARSE_VALUE, lex);
1266 /* if no semantic function, just consume the token */
1267 if (sfunc == NULL)
1268 return json_lex(lex);
1270 /* extract the de-escaped string value, or the raw lexeme */
1271 if (lex_peek(lex) == JSON_TOKEN_STRING)
1273 if (lex->need_escapes)
1275 val = STRDUP(lex->strval->data);
1276 if (val == NULL)
1277 return JSON_OUT_OF_MEMORY;
1280 else
1282 int len = (lex->token_terminator - lex->token_start);
1284 val = ALLOC(len + 1);
1285 if (val == NULL)
1286 return JSON_OUT_OF_MEMORY;
1288 memcpy(val, lex->token_start, len);
1289 val[len] = '\0';
1292 /* consume the token */
1293 result = json_lex(lex);
1294 if (result != JSON_SUCCESS)
1296 FREE(val);
1297 return result;
1300 /* invoke the callback, which may take ownership of val */
1301 result = (*sfunc) (sem->semstate, val, tok);
1303 if (lex->flags & JSONLEX_CTX_OWNS_TOKENS)
1304 FREE(val);
1306 return result;
1309 static JsonParseErrorType
1310 parse_object_field(JsonLexContext *lex, const JsonSemAction *sem)
1313 * An object field is "fieldname" : value where value can be a scalar,
1314 * object or array. Note: in user-facing docs and error messages, we
1315 * generally call a field name a "key".
1318 char *fname = NULL;
1319 json_ofield_action ostart = sem->object_field_start;
1320 json_ofield_action oend = sem->object_field_end;
1321 bool isnull;
1322 JsonTokenType tok;
1323 JsonParseErrorType result;
1325 if (lex_peek(lex) != JSON_TOKEN_STRING)
1326 return report_parse_error(JSON_PARSE_STRING, lex);
1327 if ((ostart != NULL || oend != NULL) && lex->need_escapes)
1329 fname = STRDUP(lex->strval->data);
1330 if (fname == NULL)
1331 return JSON_OUT_OF_MEMORY;
1333 result = json_lex(lex);
1334 if (result != JSON_SUCCESS)
1336 FREE(fname);
1337 return result;
1340 result = lex_expect(JSON_PARSE_OBJECT_LABEL, lex, JSON_TOKEN_COLON);
1341 if (result != JSON_SUCCESS)
1343 FREE(fname);
1344 return result;
1347 tok = lex_peek(lex);
1348 isnull = tok == JSON_TOKEN_NULL;
1350 if (ostart != NULL)
1352 result = (*ostart) (sem->semstate, fname, isnull);
1353 if (result != JSON_SUCCESS)
1354 goto ofield_cleanup;
1357 switch (tok)
1359 case JSON_TOKEN_OBJECT_START:
1360 result = parse_object(lex, sem);
1361 break;
1362 case JSON_TOKEN_ARRAY_START:
1363 result = parse_array(lex, sem);
1364 break;
1365 default:
1366 result = parse_scalar(lex, sem);
1368 if (result != JSON_SUCCESS)
1369 goto ofield_cleanup;
1371 if (oend != NULL)
1373 result = (*oend) (sem->semstate, fname, isnull);
1374 if (result != JSON_SUCCESS)
1375 goto ofield_cleanup;
1378 ofield_cleanup:
1379 if (lex->flags & JSONLEX_CTX_OWNS_TOKENS)
1380 FREE(fname);
1381 return result;
1384 static JsonParseErrorType
1385 parse_object(JsonLexContext *lex, const JsonSemAction *sem)
1388 * an object is a possibly empty sequence of object fields, separated by
1389 * commas and surrounded by curly braces.
1391 json_struct_action ostart = sem->object_start;
1392 json_struct_action oend = sem->object_end;
1393 JsonTokenType tok;
1394 JsonParseErrorType result;
1396 #ifndef FRONTEND
1399 * TODO: clients need some way to put a bound on stack growth. Parse level
1400 * limits maybe?
1402 check_stack_depth();
1403 #endif
1405 if (ostart != NULL)
1407 result = (*ostart) (sem->semstate);
1408 if (result != JSON_SUCCESS)
1409 return result;
1413 * Data inside an object is at a higher nesting level than the object
1414 * itself. Note that we increment this after we call the semantic routine
1415 * for the object start and restore it before we call the routine for the
1416 * object end.
1418 lex->lex_level++;
1420 Assert(lex_peek(lex) == JSON_TOKEN_OBJECT_START);
1421 result = json_lex(lex);
1422 if (result != JSON_SUCCESS)
1423 return result;
1425 tok = lex_peek(lex);
1426 switch (tok)
1428 case JSON_TOKEN_STRING:
1429 result = parse_object_field(lex, sem);
1430 while (result == JSON_SUCCESS && lex_peek(lex) == JSON_TOKEN_COMMA)
1432 result = json_lex(lex);
1433 if (result != JSON_SUCCESS)
1434 break;
1435 result = parse_object_field(lex, sem);
1437 break;
1438 case JSON_TOKEN_OBJECT_END:
1439 break;
1440 default:
1441 /* case of an invalid initial token inside the object */
1442 result = report_parse_error(JSON_PARSE_OBJECT_START, lex);
1444 if (result != JSON_SUCCESS)
1445 return result;
1447 result = lex_expect(JSON_PARSE_OBJECT_NEXT, lex, JSON_TOKEN_OBJECT_END);
1448 if (result != JSON_SUCCESS)
1449 return result;
1451 lex->lex_level--;
1453 if (oend != NULL)
1455 result = (*oend) (sem->semstate);
1456 if (result != JSON_SUCCESS)
1457 return result;
1460 return JSON_SUCCESS;
1463 static JsonParseErrorType
1464 parse_array_element(JsonLexContext *lex, const JsonSemAction *sem)
1466 json_aelem_action astart = sem->array_element_start;
1467 json_aelem_action aend = sem->array_element_end;
1468 JsonTokenType tok = lex_peek(lex);
1469 JsonParseErrorType result;
1470 bool isnull;
1472 isnull = tok == JSON_TOKEN_NULL;
1474 if (astart != NULL)
1476 result = (*astart) (sem->semstate, isnull);
1477 if (result != JSON_SUCCESS)
1478 return result;
1481 /* an array element is any object, array or scalar */
1482 switch (tok)
1484 case JSON_TOKEN_OBJECT_START:
1485 result = parse_object(lex, sem);
1486 break;
1487 case JSON_TOKEN_ARRAY_START:
1488 result = parse_array(lex, sem);
1489 break;
1490 default:
1491 result = parse_scalar(lex, sem);
1494 if (result != JSON_SUCCESS)
1495 return result;
1497 if (aend != NULL)
1499 result = (*aend) (sem->semstate, isnull);
1500 if (result != JSON_SUCCESS)
1501 return result;
1504 return JSON_SUCCESS;
1507 static JsonParseErrorType
1508 parse_array(JsonLexContext *lex, const JsonSemAction *sem)
1511 * an array is a possibly empty sequence of array elements, separated by
1512 * commas and surrounded by square brackets.
1514 json_struct_action astart = sem->array_start;
1515 json_struct_action aend = sem->array_end;
1516 JsonParseErrorType result;
1518 #ifndef FRONTEND
1519 check_stack_depth();
1520 #endif
1522 if (astart != NULL)
1524 result = (*astart) (sem->semstate);
1525 if (result != JSON_SUCCESS)
1526 return result;
1530 * Data inside an array is at a higher nesting level than the array
1531 * itself. Note that we increment this after we call the semantic routine
1532 * for the array start and restore it before we call the routine for the
1533 * array end.
1535 lex->lex_level++;
1537 result = lex_expect(JSON_PARSE_ARRAY_START, lex, JSON_TOKEN_ARRAY_START);
1538 if (result == JSON_SUCCESS && lex_peek(lex) != JSON_TOKEN_ARRAY_END)
1540 result = parse_array_element(lex, sem);
1542 while (result == JSON_SUCCESS && lex_peek(lex) == JSON_TOKEN_COMMA)
1544 result = json_lex(lex);
1545 if (result != JSON_SUCCESS)
1546 break;
1547 result = parse_array_element(lex, sem);
1550 if (result != JSON_SUCCESS)
1551 return result;
1553 result = lex_expect(JSON_PARSE_ARRAY_NEXT, lex, JSON_TOKEN_ARRAY_END);
1554 if (result != JSON_SUCCESS)
1555 return result;
1557 lex->lex_level--;
1559 if (aend != NULL)
1561 result = (*aend) (sem->semstate);
1562 if (result != JSON_SUCCESS)
1563 return result;
1566 return JSON_SUCCESS;
/*
 * Lex one token from the input stream.
 *
 * When doing incremental parsing, we can reach the end of the input string
 * without having (or knowing we have) a complete token.  If it's not the
 * final chunk of input, the partial token is then saved to the lex
 * structure's ptok StringInfo.  On subsequent calls input is appended to this
 * buffer until we have something that we think is a complete token,
 * which is then lexed using a recursive call to json_lex.  Processing then
 * continues as normal on subsequent calls.
 *
 * Note that when doing incremental processing, the lex.prev_token_terminator
 * should not be relied on.  It could point into a previous input chunk or
 * worse.
 */
JsonParseErrorType
json_lex(JsonLexContext *lex)
{
	const char *s;
	const char *const end = lex->input + lex->input_length;
	JsonParseErrorType result;

	/* Bail out if this context (or its inc_state) is an OOM placeholder. */
	if (lex == &failed_oom || lex->inc_state == &failed_inc_oom)
		return JSON_OUT_OF_MEMORY;

	if (lex->incremental)
	{
		if (lex->inc_state->partial_completed)
		{
			/*
			 * We just lexed a completed partial token on the last call, so
			 * reset everything
			 */
			jsonapi_resetStringInfo(&(lex->inc_state->partial_token));
			lex->token_terminator = lex->input;
			lex->inc_state->partial_completed = false;
		}

#ifdef JSONAPI_USE_PQEXPBUFFER
		/* Make sure our partial token buffer is valid before using it below. */
		if (PQExpBufferDataBroken(lex->inc_state->partial_token))
			return JSON_OUT_OF_MEMORY;
#endif
	}

	s = lex->token_terminator;

	if (lex->incremental && lex->inc_state->partial_token.len)
	{
		/*
		 * We have a partial token. Extend it and if completed lex it by a
		 * recursive call
		 */
		jsonapi_StrValType *ptok = &(lex->inc_state->partial_token);
		size_t		added = 0;		/* bytes consumed from this chunk */
		bool		tok_done = false;	/* true once the token looks complete */
		JsonLexContext dummy_lex = {0};
		JsonParseErrorType partial_result;

		if (ptok->data[0] == '"')
		{
			/*
			 * It's a string. Accumulate characters until we reach an
			 * unescaped '"'.
			 */
			int			escapes = 0;

			for (int i = ptok->len - 1; i > 0; i--)
			{
				/* count the trailing backslashes on the partial token */
				if (ptok->data[i] == '\\')
					escapes++;
				else
					break;
			}

			for (size_t i = 0; i < lex->input_length; i++)
			{
				char		c = lex->input[i];

				jsonapi_appendStringInfoCharMacro(ptok, c);
				added++;
				/* a '"' preceded by an even number of backslashes ends it */
				if (c == '"' && escapes % 2 == 0)
				{
					tok_done = true;
					break;
				}
				if (c == '\\')
					escapes++;
				else
					escapes = 0;
			}
		}
		else
		{
			/* not a string */
			char		c = ptok->data[0];

			if (c == '-' || (c >= '0' && c <= '9'))
			{
				/* for numbers look for possible numeric continuations */

				bool		numend = false;

				for (size_t i = 0; i < lex->input_length && !numend; i++)
				{
					char		cc = lex->input[i];

					switch (cc)
					{
						case '+':
						case '-':
						case 'e':
						case 'E':
						case '0':
						case '1':
						case '2':
						case '3':
						case '4':
						case '5':
						case '6':
						case '7':
						case '8':
						case '9':
							jsonapi_appendStringInfoCharMacro(ptok, cc);
							added++;
							break;
						default:
							numend = true;
					}
				}
			}

			/*
			 * Add any remaining alphanumeric chars. This takes care of the
			 * {null, false, true} literals as well as any trailing
			 * alphanumeric junk on non-string tokens.
			 */
			for (size_t i = added; i < lex->input_length; i++)
			{
				char		cc = lex->input[i];

				if (JSON_ALPHANUMERIC_CHAR(cc))
				{
					jsonapi_appendStringInfoCharMacro(ptok, cc);
					added++;
				}
				else
				{
					tok_done = true;
					break;
				}
			}
			if (added == lex->input_length &&
				lex->inc_state->is_last_chunk)
			{
				tok_done = true;
			}
		}

		if (!tok_done)
		{
			/* We should have consumed the whole chunk in this case. */
			Assert(added == lex->input_length);

			if (!lex->inc_state->is_last_chunk)
				return JSON_INCOMPLETE;

			/* json_errdetail() needs access to the accumulated token. */
			lex->token_start = ptok->data;
			lex->token_terminator = ptok->data + ptok->len;
			return JSON_INVALID_TOKEN;
		}

		/*
		 * Everything up to lex->input[added] has been added to the partial
		 * token, so move the input past it.
		 */
		lex->input += added;
		lex->input_length -= added;

		/*
		 * Lex the assembled token with a throwaway, non-incremental context;
		 * that guarantees the recursive call below cannot recurse again.
		 */
		dummy_lex.input = dummy_lex.token_terminator =
			dummy_lex.line_start = ptok->data;
		dummy_lex.line_number = lex->line_number;
		dummy_lex.input_length = ptok->len;
		dummy_lex.input_encoding = lex->input_encoding;
		dummy_lex.incremental = false;
		dummy_lex.need_escapes = lex->need_escapes;
		dummy_lex.strval = lex->strval;

		partial_result = json_lex(&dummy_lex);

		/*
		 * We either have a complete token or an error. In either case we need
		 * to point to the partial token data for the semantic or error
		 * routines. If it's not an error we'll readjust on the next call to
		 * json_lex.
		 */
		lex->token_type = dummy_lex.token_type;
		lex->line_number = dummy_lex.line_number;

		/*
		 * We know the prev_token_terminator must be back in some previous
		 * piece of input, so we just make it NULL.
		 */
		lex->prev_token_terminator = NULL;

		/*
		 * Normally token_start would be ptok->data, but it could be later,
		 * see json_lex_string's handling of invalid escapes.
		 */
		lex->token_start = dummy_lex.token_start;
		lex->token_terminator = dummy_lex.token_terminator;
		if (partial_result == JSON_SUCCESS)
		{
			/* make sure we've used all the input */
			if (lex->token_terminator - lex->token_start != ptok->len)
			{
				Assert(false);
				return JSON_INVALID_TOKEN;
			}

			lex->inc_state->partial_completed = true;
		}
		return partial_result;
		/* end of partial token processing */
	}

	/* Skip leading whitespace. */
	while (s < end && (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r'))
	{
		if (*s++ == '\n')
		{
			++lex->line_number;
			lex->line_start = s;
		}
	}
	lex->token_start = s;

	/* Determine token type. */
	if (s >= end)
	{
		lex->token_start = NULL;
		lex->prev_token_terminator = lex->token_terminator;
		lex->token_terminator = s;
		lex->token_type = JSON_TOKEN_END;
	}
	else
	{
		switch (*s)
		{
				/* Single-character token, some kind of punctuation mark. */
			case '{':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_OBJECT_START;
				break;
			case '}':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_OBJECT_END;
				break;
			case '[':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_ARRAY_START;
				break;
			case ']':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_ARRAY_END;
				break;
			case ',':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_COMMA;
				break;
			case ':':
				lex->prev_token_terminator = lex->token_terminator;
				lex->token_terminator = s + 1;
				lex->token_type = JSON_TOKEN_COLON;
				break;
			case '"':
				/* string */
				result = json_lex_string(lex);
				if (result != JSON_SUCCESS)
					return result;
				lex->token_type = JSON_TOKEN_STRING;
				break;
			case '-':
				/* Negative number. */
				result = json_lex_number(lex, s + 1, NULL, NULL);
				if (result != JSON_SUCCESS)
					return result;
				lex->token_type = JSON_TOKEN_NUMBER;
				break;
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
				/* Positive number. */
				result = json_lex_number(lex, s, NULL, NULL);
				if (result != JSON_SUCCESS)
					return result;
				lex->token_type = JSON_TOKEN_NUMBER;
				break;
			default:
				{
					const char *p;

					/*
					 * We're not dealing with a string, number, legal
					 * punctuation mark, or end of string. The only legal
					 * tokens we might find here are true, false, and null,
					 * but for error reporting purposes we scan until we see a
					 * non-alphanumeric character. That way, we can report
					 * the whole word as an unexpected token, rather than just
					 * some unintuitive prefix thereof.
					 */
					for (p = s; p < end && JSON_ALPHANUMERIC_CHAR(*p); p++)
						 /* skip */ ;

					/*
					 * We got some sort of unexpected punctuation or an
					 * otherwise unexpected character, so just complain about
					 * that one character.
					 */
					if (p == s)
					{
						lex->prev_token_terminator = lex->token_terminator;
						lex->token_terminator = s + 1;
						return JSON_INVALID_TOKEN;
					}

					/*
					 * The word runs to the end of a non-final chunk; stash it
					 * as a partial token and ask for more input.
					 */
					if (lex->incremental && !lex->inc_state->is_last_chunk &&
						p == lex->input + lex->input_length)
					{
						jsonapi_appendBinaryStringInfo(&(lex->inc_state->partial_token), s, end - s);
						return JSON_INCOMPLETE;
					}

					/*
					 * We've got a real alphanumeric token here. If it
					 * happens to be true, false, or null, all is well. If
					 * not, error out.
					 */
					lex->prev_token_terminator = lex->token_terminator;
					lex->token_terminator = p;
					if (p - s == 4)
					{
						if (memcmp(s, "true", 4) == 0)
							lex->token_type = JSON_TOKEN_TRUE;
						else if (memcmp(s, "null", 4) == 0)
							lex->token_type = JSON_TOKEN_NULL;
						else
							return JSON_INVALID_TOKEN;
					}
					else if (p - s == 5 && memcmp(s, "false", 5) == 0)
						lex->token_type = JSON_TOKEN_FALSE;
					else
						return JSON_INVALID_TOKEN;
				}
		}						/* end of switch */
	}

	if (lex->incremental && lex->token_type == JSON_TOKEN_END && !lex->inc_state->is_last_chunk)
		return JSON_INCOMPLETE;
	else
		return JSON_SUCCESS;
}
/*
 * The next token in the input stream is known to be a string; lex it.
 *
 * If lex->strval isn't NULL, fill it with the decoded string.
 * Set lex->token_terminator to the end of the decoded input, and in
 * success cases, transfer its previous value to lex->prev_token_terminator.
 * Return JSON_SUCCESS or an error code.
 *
 * Note: be careful that all error exits advance lex->token_terminator
 * to the point after the character we detected the error on.
 */
static inline JsonParseErrorType
json_lex_string(JsonLexContext *lex)
{
	const char *s;
	const char *const end = lex->input + lex->input_length;
	int			hi_surrogate = -1;	/* pending first half of a UTF-16 pair */

	/* Convenience macros for error exits */
#define FAIL_OR_INCOMPLETE_AT_CHAR_START(code) \
	do { \
		if (lex->incremental && !lex->inc_state->is_last_chunk) \
		{ \
			jsonapi_appendBinaryStringInfo(&lex->inc_state->partial_token, \
										   lex->token_start, \
										   end - lex->token_start); \
			return JSON_INCOMPLETE; \
		} \
		lex->token_terminator = s; \
		return code; \
	} while (0)
#define FAIL_AT_CHAR_END(code) \
	do { \
		const char *term = s + pg_encoding_mblen(lex->input_encoding, s); \
		lex->token_terminator = (term <= end) ? term : end; \
		return code; \
	} while (0)

	if (lex->need_escapes)
	{
#ifdef JSONAPI_USE_PQEXPBUFFER
		/* make sure initialization succeeded */
		if (lex->strval == NULL)
			return JSON_OUT_OF_MEMORY;
#endif
		jsonapi_resetStringInfo(lex->strval);
	}

	Assert(lex->input_length > 0);
	s = lex->token_start;
	for (;;)
	{
		s++;
		/* Premature end of the string. */
		if (s >= end)
			FAIL_OR_INCOMPLETE_AT_CHAR_START(JSON_INVALID_TOKEN);
		else if (*s == '"')
			break;
		else if (*s == '\\')
		{
			/* OK, we have an escape character. */
			s++;
			if (s >= end)
				FAIL_OR_INCOMPLETE_AT_CHAR_START(JSON_INVALID_TOKEN);
			else if (*s == 'u')
			{
				int			i;
				int			ch = 0;

				/* Accumulate exactly four hex digits into ch. */
				for (i = 1; i <= 4; i++)
				{
					s++;
					if (s >= end)
						FAIL_OR_INCOMPLETE_AT_CHAR_START(JSON_INVALID_TOKEN);
					else if (*s >= '0' && *s <= '9')
						ch = (ch * 16) + (*s - '0');
					else if (*s >= 'a' && *s <= 'f')
						ch = (ch * 16) + (*s - 'a') + 10;
					else if (*s >= 'A' && *s <= 'F')
						ch = (ch * 16) + (*s - 'A') + 10;
					else
						FAIL_AT_CHAR_END(JSON_UNICODE_ESCAPE_FORMAT);
				}
				if (lex->need_escapes)
				{
					/*
					 * Combine surrogate pairs.
					 */
					if (is_utf16_surrogate_first(ch))
					{
						if (hi_surrogate != -1)
							FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_SURROGATE);
						hi_surrogate = ch;
						continue;
					}
					else if (is_utf16_surrogate_second(ch))
					{
						if (hi_surrogate == -1)
							FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
						ch = surrogate_pair_to_codepoint(hi_surrogate, ch);
						hi_surrogate = -1;
					}

					/* a lone high surrogate must be followed by a low one */
					if (hi_surrogate != -1)
						FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);

					/*
					 * Reject invalid cases. We can't have a value above
					 * 0xFFFF here (since we only accepted 4 hex digits
					 * above), so no need to test for out-of-range chars.
					 */
					if (ch == 0)
					{
						/* We can't allow this, since our TEXT type doesn't */
						FAIL_AT_CHAR_END(JSON_UNICODE_CODE_POINT_ZERO);
					}

					/*
					 * Add the represented character to lex->strval. In the
					 * backend, we can let pg_unicode_to_server_noerror()
					 * handle any required character set conversion; in
					 * frontend, we can only deal with trivial conversions.
					 */
#ifndef FRONTEND
					{
						char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];

						if (!pg_unicode_to_server_noerror(ch, (unsigned char *) cbuf))
							FAIL_AT_CHAR_END(JSON_UNICODE_UNTRANSLATABLE);
						appendStringInfoString(lex->strval, cbuf);
					}
#else
					if (lex->input_encoding == PG_UTF8)
					{
						/* OK, we can map the code point to UTF8 easily */
						char		utf8str[5];
						int			utf8len;

						unicode_to_utf8(ch, (unsigned char *) utf8str);
						utf8len = pg_utf_mblen((unsigned char *) utf8str);
						jsonapi_appendBinaryStringInfo(lex->strval, utf8str, utf8len);
					}
					else if (ch <= 0x007f)
					{
						/* The ASCII range is the same in all encodings */
						jsonapi_appendStringInfoChar(lex->strval, (char) ch);
					}
					else
						FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_ESCAPE);
#endif							/* FRONTEND */
				}
			}
			else if (lex->need_escapes)
			{
				/* a \u escape must complete any pending surrogate pair */
				if (hi_surrogate != -1)
					FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);

				switch (*s)
				{
					case '"':
					case '\\':
					case '/':
						jsonapi_appendStringInfoChar(lex->strval, *s);
						break;
					case 'b':
						jsonapi_appendStringInfoChar(lex->strval, '\b');
						break;
					case 'f':
						jsonapi_appendStringInfoChar(lex->strval, '\f');
						break;
					case 'n':
						jsonapi_appendStringInfoChar(lex->strval, '\n');
						break;
					case 'r':
						jsonapi_appendStringInfoChar(lex->strval, '\r');
						break;
					case 't':
						jsonapi_appendStringInfoChar(lex->strval, '\t');
						break;
					default:

						/*
						 * Not a valid string escape, so signal error. We
						 * adjust token_start so that just the escape sequence
						 * is reported, not the whole string.
						 */
						lex->token_start = s;
						FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID);
				}
			}
			else if (strchr("\"\\/bfnrt", *s) == NULL)
			{
				/*
				 * Simpler processing if we're not bothered about de-escaping
				 *
				 * It's very tempting to remove the strchr() call here and
				 * replace it with a switch statement, but testing so far has
				 * shown it's not a performance win.
				 */
				lex->token_start = s;
				FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID);
			}
		}
		else
		{
			const char *p = s;

			if (hi_surrogate != -1)
				FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);

			/*
			 * Skip to the first byte that requires special handling, so we
			 * can batch calls to jsonapi_appendBinaryStringInfo.
			 */
			while (p < end - sizeof(Vector8) &&
				   !pg_lfind8('\\', (uint8 *) p, sizeof(Vector8)) &&
				   !pg_lfind8('"', (uint8 *) p, sizeof(Vector8)) &&
				   !pg_lfind8_le(31, (uint8 *) p, sizeof(Vector8)))
				p += sizeof(Vector8);

			for (; p < end; p++)
			{
				if (*p == '\\' || *p == '"')
					break;
				else if ((unsigned char) *p <= 31)
				{
					/* Per RFC4627, these characters MUST be escaped. */

					/*
					 * Since *p isn't printable, exclude it from the context
					 * string
					 */
					lex->token_terminator = p;
					return JSON_ESCAPING_REQUIRED;
				}
			}

			if (lex->need_escapes)
				jsonapi_appendBinaryStringInfo(lex->strval, s, p - s);

			/*
			 * s will be incremented at the top of the loop, so set it to just
			 * behind our lookahead position
			 */
			s = p - 1;
		}
	}

	/* a trailing unmatched high surrogate is an error */
	if (hi_surrogate != -1)
	{
		lex->token_terminator = s + 1;
		return JSON_UNICODE_LOW_SURROGATE;
	}

#ifdef JSONAPI_USE_PQEXPBUFFER
	if (lex->need_escapes && PQExpBufferBroken(lex->strval))
		return JSON_OUT_OF_MEMORY;
#endif

	/* Hooray, we found the end of the string! */
	lex->prev_token_terminator = lex->token_terminator;
	lex->token_terminator = s + 1;
	return JSON_SUCCESS;
}

#undef FAIL_OR_INCOMPLETE_AT_CHAR_START
#undef FAIL_AT_CHAR_END
2216 * The next token in the input stream is known to be a number; lex it.
2218 * In JSON, a number consists of four parts:
2220 * (1) An optional minus sign ('-').
2222 * (2) Either a single '0', or a string of one or more digits that does not
2223 * begin with a '0'.
2225 * (3) An optional decimal part, consisting of a period ('.') followed by
2226 * one or more digits. (Note: While this part can be omitted
2227 * completely, it's not OK to have only the decimal point without
2228 * any digits afterwards.)
2230 * (4) An optional exponent part, consisting of 'e' or 'E', optionally
2231 * followed by '+' or '-', followed by one or more digits. (Note:
2232 * As with the decimal part, if 'e' or 'E' is present, it must be
2233 * followed by at least one digit.)
2235 * The 's' argument to this function points to the ostensible beginning
2236 * of part 2 - i.e. the character after any optional minus sign, or the
2237 * first character of the string if there is none.
2239 * If num_err is not NULL, we return an error flag to *num_err rather than
2240 * raising an error for a badly-formed number. Also, if total_len is not NULL
2241 * the distance from lex->input to the token end+1 is returned to *total_len.
2243 static inline JsonParseErrorType
2244 json_lex_number(JsonLexContext *lex, const char *s,
2245 bool *num_err, size_t *total_len)
2247 bool error = false;
2248 int len = s - lex->input;
2250 /* Part (1): leading sign indicator. */
2251 /* Caller already did this for us; so do nothing. */
2253 /* Part (2): parse main digit string. */
2254 if (len < lex->input_length && *s == '0')
2256 s++;
2257 len++;
2259 else if (len < lex->input_length && *s >= '1' && *s <= '9')
2263 s++;
2264 len++;
2265 } while (len < lex->input_length && *s >= '0' && *s <= '9');
2267 else
2268 error = true;
2270 /* Part (3): parse optional decimal portion. */
2271 if (len < lex->input_length && *s == '.')
2273 s++;
2274 len++;
2275 if (len == lex->input_length || *s < '0' || *s > '9')
2276 error = true;
2277 else
2281 s++;
2282 len++;
2283 } while (len < lex->input_length && *s >= '0' && *s <= '9');
2287 /* Part (4): parse optional exponent. */
2288 if (len < lex->input_length && (*s == 'e' || *s == 'E'))
2290 s++;
2291 len++;
2292 if (len < lex->input_length && (*s == '+' || *s == '-'))
2294 s++;
2295 len++;
2297 if (len == lex->input_length || *s < '0' || *s > '9')
2298 error = true;
2299 else
2303 s++;
2304 len++;
2305 } while (len < lex->input_length && *s >= '0' && *s <= '9');
2310 * Check for trailing garbage. As in json_lex(), any alphanumeric stuff
2311 * here should be considered part of the token for error-reporting
2312 * purposes.
2314 for (; len < lex->input_length && JSON_ALPHANUMERIC_CHAR(*s); s++, len++)
2315 error = true;
2317 if (total_len != NULL)
2318 *total_len = len;
2320 if (lex->incremental && !lex->inc_state->is_last_chunk &&
2321 len >= lex->input_length)
2323 jsonapi_appendBinaryStringInfo(&lex->inc_state->partial_token,
2324 lex->token_start, s - lex->token_start);
2325 if (num_err != NULL)
2326 *num_err = error;
2328 return JSON_INCOMPLETE;
2330 else if (num_err != NULL)
2332 /* let the caller handle any error */
2333 *num_err = error;
2335 else
2337 /* return token endpoint */
2338 lex->prev_token_terminator = lex->token_terminator;
2339 lex->token_terminator = s;
2340 /* handle error if any */
2341 if (error)
2342 return JSON_INVALID_TOKEN;
2345 return JSON_SUCCESS;
2349 * Report a parse error.
2351 * lex->token_start and lex->token_terminator must identify the current token.
2353 static JsonParseErrorType
2354 report_parse_error(JsonParseContext ctx, JsonLexContext *lex)
2356 /* Handle case where the input ended prematurely. */
2357 if (lex->token_start == NULL || lex->token_type == JSON_TOKEN_END)
2358 return JSON_EXPECTED_MORE;
2360 /* Otherwise choose the error type based on the parsing context. */
2361 switch (ctx)
2363 case JSON_PARSE_END:
2364 return JSON_EXPECTED_END;
2365 case JSON_PARSE_VALUE:
2366 return JSON_EXPECTED_JSON;
2367 case JSON_PARSE_STRING:
2368 return JSON_EXPECTED_STRING;
2369 case JSON_PARSE_ARRAY_START:
2370 return JSON_EXPECTED_ARRAY_FIRST;
2371 case JSON_PARSE_ARRAY_NEXT:
2372 return JSON_EXPECTED_ARRAY_NEXT;
2373 case JSON_PARSE_OBJECT_START:
2374 return JSON_EXPECTED_OBJECT_FIRST;
2375 case JSON_PARSE_OBJECT_LABEL:
2376 return JSON_EXPECTED_COLON;
2377 case JSON_PARSE_OBJECT_NEXT:
2378 return JSON_EXPECTED_OBJECT_NEXT;
2379 case JSON_PARSE_OBJECT_COMMA:
2380 return JSON_EXPECTED_STRING;
2384 * We don't use a default: case, so that the compiler will warn about
2385 * unhandled enum values.
2387 Assert(false);
2388 return JSON_SUCCESS; /* silence stupider compilers */
/*
 * Construct an (already translated) detail message for a JSON error.
 *
 * The returned pointer should not be freed, the allocation is either static
 * or owned by the JsonLexContext.
 */
char *
json_errdetail(JsonParseErrorType error, JsonLexContext *lex)
	/*
	 * OOM errors (and the static failed_oom sentinel context) are answered
	 * with a static string, since building a message could itself fail.
	 */
	if (error == JSON_OUT_OF_MEMORY || lex == &failed_oom)
		/* Short circuit. Allocating anything for this case is unhelpful. */
		return _("out of memory");
	/* Reuse the context's message buffer if present, else create one. */
	if (lex->errormsg)
		jsonapi_resetStringInfo(lex->errormsg);
	else
		lex->errormsg = jsonapi_makeStringInfo();
	/*
	 * A helper for error messages that should print the current token. The
	 * format must contain exactly one %.*s specifier.
	 */
#define json_token_error(lex, format) \
	jsonapi_appendStringInfo((lex)->errormsg, _(format), \
							 (int) ((lex)->token_terminator - (lex)->token_start), \
							 (lex)->token_start);
	/*
	 * Cases either return a static translated string directly, or append
	 * into lex->errormsg and break to the common return path below.
	 */
	switch (error)
		case JSON_INCOMPLETE:
		case JSON_SUCCESS:
			/* fall through to the error code after switch */
			break;
		case JSON_INVALID_LEXER_TYPE:
			if (lex->incremental)
				return _("Recursive descent parser cannot use incremental lexer.");
			else
				return _("Incremental parser requires incremental lexer.");
		case JSON_NESTING_TOO_DEEP:
			return (_("JSON nested too deep, maximum permitted depth is 6400."));
		case JSON_ESCAPING_INVALID:
			json_token_error(lex, "Escape sequence \"\\%.*s\" is invalid.");
			break;
		case JSON_ESCAPING_REQUIRED:
			/* token_terminator points at the offending character here */
			jsonapi_appendStringInfo(lex->errormsg,
									 _("Character with value 0x%02x must be escaped."),
									 (unsigned char) *(lex->token_terminator));
			break;
		case JSON_EXPECTED_END:
			json_token_error(lex, "Expected end of input, but found \"%.*s\".");
			break;
		case JSON_EXPECTED_ARRAY_FIRST:
			json_token_error(lex, "Expected array element or \"]\", but found \"%.*s\".");
			break;
		case JSON_EXPECTED_ARRAY_NEXT:
			json_token_error(lex, "Expected \",\" or \"]\", but found \"%.*s\".");
			break;
		case JSON_EXPECTED_COLON:
			json_token_error(lex, "Expected \":\", but found \"%.*s\".");
			break;
		case JSON_EXPECTED_JSON:
			json_token_error(lex, "Expected JSON value, but found \"%.*s\".");
			break;
		case JSON_EXPECTED_MORE:
			return _("The input string ended unexpectedly.");
		case JSON_EXPECTED_OBJECT_FIRST:
			json_token_error(lex, "Expected string or \"}\", but found \"%.*s\".");
			break;
		case JSON_EXPECTED_OBJECT_NEXT:
			json_token_error(lex, "Expected \",\" or \"}\", but found \"%.*s\".");
			break;
		case JSON_EXPECTED_STRING:
			json_token_error(lex, "Expected string, but found \"%.*s\".");
			break;
		case JSON_INVALID_TOKEN:
			json_token_error(lex, "Token \"%.*s\" is invalid.");
			break;
		case JSON_OUT_OF_MEMORY:
			/* should have been handled above; use the error path */
			break;
		case JSON_UNICODE_CODE_POINT_ZERO:
			return _("\\u0000 cannot be converted to text.");
		case JSON_UNICODE_ESCAPE_FORMAT:
			return _("\"\\u\" must be followed by four hexadecimal digits.");
		case JSON_UNICODE_HIGH_ESCAPE:
			/* note: this case is only reachable in frontend not backend */
			return _("Unicode escape values cannot be used for code point values above 007F when the encoding is not UTF8.");
		case JSON_UNICODE_UNTRANSLATABLE:
			/*
			 * Note: this case is only reachable in backend and not frontend.
			 * #ifdef it away so the frontend doesn't try to link against
			 * backend functionality.
			 */
#ifndef FRONTEND
			return psprintf(_("Unicode escape value could not be translated to the server's encoding %s."),
							GetDatabaseEncodingName());
#else
			Assert(false);
			break;
#endif
		case JSON_UNICODE_HIGH_SURROGATE:
			return _("Unicode high surrogate must not follow a high surrogate.");
		case JSON_UNICODE_LOW_SURROGATE:
			return _("Unicode low surrogate must follow a high surrogate.");
		case JSON_SEM_ACTION_FAILED:
			/* fall through to the error code after switch */
			break;
#undef json_token_error
	/* Note that lex->errormsg can be NULL in shlib code. */
	if (lex->errormsg && lex->errormsg->len == 0)
		/*
		 * We don't use a default: case, so that the compiler will warn about
		 * unhandled enum values. But this needs to be here anyway to cover
		 * the possibility of an incorrect input.
		 */
		jsonapi_appendStringInfo(lex->errormsg,
								 "unexpected json parse error type: %d",
								 (int) error);
#ifdef JSONAPI_USE_PQEXPBUFFER
	/* PQExpBuffer allocation can fail; report that rather than garbage. */
	if (PQExpBufferBroken(lex->errormsg))
		return _("out of memory while constructing error description");
#endif
	return lex->errormsg->data;