src/backend/utils/adt/varlena.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * varlena.c
   4  *        Functions for the variable-length built-in types.
   5  *
   6  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        src/backend/utils/adt/varlena.c
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <limits.h>
  19
  20 #include "access/detoast.h"
  21 #include "access/toast_compression.h"
  22 #include "catalog/pg_collation.h"
  23 #include "catalog/pg_type.h"
  24 #include "common/hashfn.h"
  25 #include "common/int.h"
  26 #include "common/unicode_category.h"
  27 #include "common/unicode_norm.h"
  28 #include "common/unicode_version.h"
  29 #include "funcapi.h"
  30 #include "lib/hyperloglog.h"
  31 #include "libpq/pqformat.h"
  32 #include "miscadmin.h"
  33 #include "nodes/execnodes.h"
  34 #include "parser/scansup.h"
  35 #include "port/pg_bswap.h"
  36 #include "regex/regex.h"
  37 #include "utils/builtins.h"
  38 #include "utils/bytea.h"
  39 #include "utils/guc.h"
  40 #include "utils/lsyscache.h"
  41 #include "utils/memutils.h"
  42 #include "utils/pg_locale.h"
  43 #include "utils/sortsupport.h"
  44 #include "utils/varlena.h"
  45
  46
  47 /* GUC variable: selects the output format produced by byteaout() */
  48 int                     bytea_output = BYTEA_OUTPUT_HEX;
  49
     /* Alias for struct varlena; the DatumGetVarStringP[P] macros cast to it */
  50 typedef struct varlena VarString;
  51
  52 /*
  53  * State for text_position_* functions.
     *
     * One instance is handed to text_position_setup() and then passed to the
     * other text_position_* routines (see their prototypes below).
     * NOTE(review): exact field usage lives in those routines — confirm there.
  54  */
  55 typedef struct
  56 {
  57         bool            is_multibyte_char_in_char;      /* need to check char boundaries? */
  58
  59         char       *str1;                       /* haystack string */
  60         char       *str2;                       /* needle string */
  61         int                     len1;                   /* string lengths in bytes */
  62         int                     len2;
  63
  64         /* Skip table for Boyer-Moore-Horspool search algorithm: */
  65         int                     skiptablemask;  /* mask for ANDing with skiptable subscripts */
  66         int                     skiptable[256]; /* skip distance for given mismatched char */
  67
  68         char       *last_match;         /* pointer to last match in 'str1' */
  69
  70         /*
  71          * Sometimes we need to convert the byte position of a match to a
  72          * character position.  These store the last position that was converted,
  73          * so that on the next call, we can continue from that point, rather than
  74          * count characters from the very beginning.
  75          */
  76         char       *refpoint;           /* pointer within original haystack string */
  77         int                     refpos;                 /* 0-based character offset of the same point */
  78 } TextPositionState;
  79
  80 typedef struct
  81 {
             /*
              * Working state for the varstr sort-support comparators and the
              * abbreviated-key routines declared below (varstrfastcmp_*,
              * varstr_abbrev_convert, varstr_abbrev_abort).
              * NOTE(review): field usage is in those routines — confirm there.
              */
  82         char       *buf1;                       /* 1st string, or abbreviation original string
  83                                                                  * buf */
  84         char       *buf2;                       /* 2nd string, or abbreviation strxfrm() buf */
  85         int                     buflen1;                /* Allocated length of buf1 */
  86         int                     buflen2;                /* Allocated length of buf2 */
  87         int                     last_len1;              /* Length of last buf1 string/strxfrm() input */
  88         int                     last_len2;              /* Length of last buf2 string/strxfrm() blob */
  89         int                     last_returned;  /* Last comparison result (cache) */
  90         bool            cache_blob;             /* Does buf2 contain strxfrm() blob, etc? */
             /* presumably true when the C collation is in effect — TODO confirm */
  91         bool            collate_c;
  92         Oid                     typid;                  /* Actual datatype (text/bpchar/bytea/name) */
  93         hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
  94         hyperLogLogState full_card; /* Full key cardinality state */
  95         double          prop_card;              /* Required cardinality proportion */
             /* presumably the locale used for non-C comparisons — TODO confirm */
  96         pg_locale_t locale;
  97 } VarStringSortSupport;
  98
  99 /*
 100  * Output data for split_text(): we output either to an array or a table.
 101  * tupstore and tupdesc must be set up in advance to output to a table.
 102  */
 103 typedef struct
 104 {
             /* presumably the accumulator for array output — TODO confirm */
 105         ArrayBuildState *astate;
             /* table output: must be set up by the caller in advance */
 106         Tuplestorestate *tupstore;
             /* table output: must be set up by the caller in advance */
 107         TupleDesc       tupdesc;
 108 } SplitTextOutputData;
 109
 110 /*
 111  * This should be large enough that most strings will fit, but small enough
 112  * that we feel comfortable putting it on the stack
 113  */
 114 #define TEXTBUFLEN              1024
 115
     /*
      * Fetch a Datum as a VarString pointer, going through PG_DETOAST_DATUM or
      * PG_DETOAST_DATUM_PACKED respectively.
      */
 116 #define DatumGetVarStringP(X)           ((VarString *) PG_DETOAST_DATUM(X))
 117 #define DatumGetVarStringPP(X)          ((VarString *) PG_DETOAST_DATUM_PACKED(X))
 118
 119 static int      varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
 120 static int      bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
 121 static int      namefastcmp_c(Datum x, Datum y, SortSupport ssup);
 122 static int      varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup);
 123 static int      namefastcmp_locale(Datum x, Datum y, SortSupport ssup);
 124 static int      varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup);
 125 static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
 126 static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
 127 static int32 text_length(Datum str);
 128 static text *text_catenate(text *t1, text *t2);
 129 static text *text_substring(Datum str,
 130                                                         int32 start,
 131                                                         int32 length,
 132                                                         bool length_not_specified);
 133 static text *text_overlay(text *t1, text *t2, int sp, int sl);
 134 static int      text_position(text *t1, text *t2, Oid collid);
 135 static void text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state);
 136 static bool text_position_next(TextPositionState *state);
 137 static char *text_position_next_internal(char *start_ptr, TextPositionState *state);
 138 static char *text_position_get_match_ptr(TextPositionState *state);
 139 static int      text_position_get_match_pos(TextPositionState *state);
 140 static void text_position_cleanup(TextPositionState *state);
 141 static void check_collation_set(Oid collid);
 142 static int      text_cmp(text *arg1, text *arg2, Oid collid);
 143 static bytea *bytea_catenate(bytea *t1, bytea *t2);
 144 static bytea *bytea_substring(Datum str,
 145                                                           int S,
 146                                                           int L,
 147                                                           bool length_not_specified);
 148 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
 149 static void appendStringInfoText(StringInfo str, const text *t);
 150 static bool split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate);
 151 static void split_text_accum_result(SplitTextOutputData *tstate,
 152                                                                         text *field_value,
 153                                                                         text *null_string,
 154                                                                         Oid collation);
 155 static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
 156                                                                         const char *fldsep, const char *null_string);
 157 static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
 158 static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
 159                                                                          int *value);
 160 static const char *text_format_parse_format(const char *start_ptr,
 161                                                                                         const char *end_ptr,
 162                                                                                         int *argpos, int *widthpos,
 163                                                                                         int *flags, int *width);
 164 static void text_format_string_conversion(StringInfo buf, char conversion,
 165                                                                                   FmgrInfo *typOutputInfo,
 166                                                                                   Datum value, bool isNull,
 167                                                                                   int flags, int width);
 168 static void text_format_append_string(StringInfo buf, const char *str,
 169                                                                           int flags, int width);
 170
 171
 172 /*****************************************************************************
 173  *       CONVERSION ROUTINES EXPORTED FOR USE BY C CODE                                                  *
 174  *****************************************************************************/
 175
 176 /*
 177  * cstring_to_text
 178  *
 179  * Create a text value from a null-terminated C string.
 180  *
 181  * The new text value is freshly palloc'd with a full-size VARHDR.
 182  */
 183 text *
 184 cstring_to_text(const char *s)
 185 {
 186         return cstring_to_text_with_len(s, strlen(s));
 187 }
 188
 189 /*
 190  * cstring_to_text_with_len
 191  *
 192  * Same as cstring_to_text except the caller specifies the string length;
 193  * the string need not be null_terminated.
 194  */
 195 text *
 196 cstring_to_text_with_len(const char *s, int len)
 197 {
 198         text       *result = (text *) palloc(len + VARHDRSZ);
 199
 200         SET_VARSIZE(result, len + VARHDRSZ);
 201         memcpy(VARDATA(result), s, len);
 202
 203         return result;
 204 }
 205
 206 /*
 207  * text_to_cstring
 208  *
 209  * Create a palloc'd, null-terminated C string from a text value.
 210  *
 211  * We support being passed a compressed or toasted text value.
 212  * This is a bit bogus since such values shouldn't really be referred to as
 213  * "text *", but it seems useful for robustness.  If we didn't handle that
 214  * case here, we'd need another routine that did, anyway.
 215  */
 216 char *
 217 text_to_cstring(const text *t)
 218 {
 219         /* must cast away the const, unfortunately */
 220         text       *tunpacked = pg_detoast_datum_packed(unconstify(text *, t));
 221         int                     len = VARSIZE_ANY_EXHDR(tunpacked);
 222         char       *result;
 223
 224         result = (char *) palloc(len + 1);
 225         memcpy(result, VARDATA_ANY(tunpacked), len);
 226         result[len] = '\0';
 227
 228         if (tunpacked != t)
 229                 pfree(tunpacked);
 230
 231         return result;
 232 }
 233
 234 /*
 235  * text_to_cstring_buffer
 236  *
 237  * Copy a text value into a caller-supplied buffer of size dst_len.
 238  *
 239  * The text string is truncated if necessary to fit.  The result is
 240  * guaranteed null-terminated (unless dst_len == 0).
 241  *
 242  * We support being passed a compressed or toasted text value.
 243  * This is a bit bogus since such values shouldn't really be referred to as
 244  * "text *", but it seems useful for robustness.  If we didn't handle that
 245  * case here, we'd need another routine that did, anyway.
 246  */
 247 void
 248 text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
 249 {
 250         /* must cast away the const, unfortunately */
 251         text       *srcunpacked = pg_detoast_datum_packed(unconstify(text *, src));
 252         size_t          src_len = VARSIZE_ANY_EXHDR(srcunpacked);
 253
 254         if (dst_len > 0)
 255         {
 256                 dst_len--;
 257                 if (dst_len >= src_len)
 258                         dst_len = src_len;
 259                 else                                    /* ensure truncation is encoding-safe */
 260                         dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
 261                 memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
 262                 dst[dst_len] = '\0';
 263         }
 264
 265         if (srcunpacked != src)
 266                 pfree(srcunpacked);
 267 }
 268
 269
 270 /*****************************************************************************
 271  *       USER I/O ROUTINES                                                                                                               *
 272  *****************************************************************************/
 273
 274
 275 #define VAL(CH)                 ((CH) - '0')
 276 #define DIG(VAL)                ((VAL) + '0')
 277
 278 /*
 279  *              byteain                 - converts from printable representation of byte array
 280  *
 281  *              Non-printable characters must be passed as '\nnn' (octal) and are
 282  *              converted to internal form.  '\' must be passed as '\\'.
 283  *              ereport(ERROR, ...) if bad form.
 284  *
 285  *              BUGS:
 286  *                              The input is scanned twice.
 287  *                              The error checking of input is minimal.
 288  */
 289 Datum
 290 byteain(PG_FUNCTION_ARGS)
 291 {
 292         char       *inputText = PG_GETARG_CSTRING(0);
 293         Node       *escontext = fcinfo->context;
 294         char       *tp;
 295         char       *rp;
 296         int                     bc;
 297         bytea      *result;
 298
 299         /* Recognize hex input */
 300         if (inputText[0] == '\\' && inputText[1] == 'x')
 301         {
 302                 size_t          len = strlen(inputText);
 303
 304                 bc = (len - 2) / 2 + VARHDRSZ;  /* maximum possible length */
 305                 result = palloc(bc);
 306                 bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
 307                                                          escontext);
 308                 SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
 309
 310                 PG_RETURN_BYTEA_P(result);
 311         }
 312
 313         /* Else, it's the traditional escaped style */
 314         for (bc = 0, tp = inputText; *tp != '\0'; bc++)
 315         {
 316                 if (tp[0] != '\\')
 317                         tp++;
 318                 else if ((tp[0] == '\\') &&
 319                                  (tp[1] >= '0' && tp[1] <= '3') &&
 320                                  (tp[2] >= '0' && tp[2] <= '7') &&
 321                                  (tp[3] >= '0' && tp[3] <= '7'))
 322                         tp += 4;
 323                 else if ((tp[0] == '\\') &&
 324                                  (tp[1] == '\\'))
 325                         tp += 2;
 326                 else
 327                 {
 328                         /*
 329                          * one backslash, not followed by another or ### valid octal
 330                          */
 331                         ereturn(escontext, (Datum) 0,
 332                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 333                                          errmsg("invalid input syntax for type %s", "bytea")));
 334                 }
 335         }
 336
 337         bc += VARHDRSZ;
 338
 339         result = (bytea *) palloc(bc);
 340         SET_VARSIZE(result, bc);
 341
 342         tp = inputText;
 343         rp = VARDATA(result);
 344         while (*tp != '\0')
 345         {
 346                 if (tp[0] != '\\')
 347                         *rp++ = *tp++;
 348                 else if ((tp[0] == '\\') &&
 349                                  (tp[1] >= '0' && tp[1] <= '3') &&
 350                                  (tp[2] >= '0' && tp[2] <= '7') &&
 351                                  (tp[3] >= '0' && tp[3] <= '7'))
 352                 {
 353                         bc = VAL(tp[1]);
 354                         bc <<= 3;
 355                         bc += VAL(tp[2]);
 356                         bc <<= 3;
 357                         *rp++ = bc + VAL(tp[3]);
 358
 359                         tp += 4;
 360                 }
 361                 else if ((tp[0] == '\\') &&
 362                                  (tp[1] == '\\'))
 363                 {
 364                         *rp++ = '\\';
 365                         tp += 2;
 366                 }
 367                 else
 368                 {
 369                         /*
 370                          * We should never get here. The first pass should not allow it.
 371                          */
 372                         ereturn(escontext, (Datum) 0,
 373                                         (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 374                                          errmsg("invalid input syntax for type %s", "bytea")));
 375                 }
 376         }
 377
 378         PG_RETURN_BYTEA_P(result);
 379 }
 380
 381 /*
 382  *              byteaout                - converts to printable representation of byte array
 383  *
 384  *              In the traditional escaped format, non-printable characters are
 385  *              printed as '\nnn' (octal) and '\' as '\\'.
 386  */
 387 Datum
 388 byteaout(PG_FUNCTION_ARGS)
 389 {
 390         bytea      *vlena = PG_GETARG_BYTEA_PP(0);
 391         char       *result;
 392         char       *rp;
 393
 394         if (bytea_output == BYTEA_OUTPUT_HEX)
 395         {
 396                 /* Print hex format */
 397                 rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
 398                 *rp++ = '\\';
 399                 *rp++ = 'x';
 400                 rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
 401         }
 402         else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
 403         {
 404                 /* Print traditional escaped format */
 405                 char       *vp;
 406                 uint64          len;
 407                 int                     i;
 408
 409                 len = 1;                                /* empty string has 1 char */
 410                 vp = VARDATA_ANY(vlena);
 411                 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
 412                 {
 413                         if (*vp == '\\')
 414                                 len += 2;
 415                         else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
 416                                 len += 4;
 417                         else
 418                                 len++;
 419                 }
 420
 421                 /*
 422                  * In principle len can't overflow uint32 if the input fit in 1GB, but
 423                  * for safety let's check rather than relying on palloc's internal
 424                  * check.
 425                  */
 426                 if (len > MaxAllocSize)
 427                         ereport(ERROR,
 428                                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 429                                          errmsg_internal("result of bytea output conversion is too large")));
 430                 rp = result = (char *) palloc(len);
 431
 432                 vp = VARDATA_ANY(vlena);
 433                 for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
 434                 {
 435                         if (*vp == '\\')
 436                         {
 437                                 *rp++ = '\\';
 438                                 *rp++ = '\\';
 439                         }
 440                         else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
 441                         {
 442                                 int                     val;    /* holds unprintable chars */
 443
 444                                 val = *vp;
 445                                 rp[0] = '\\';
 446                                 rp[3] = DIG(val & 07);
 447                                 val >>= 3;
 448                                 rp[2] = DIG(val & 07);
 449                                 val >>= 3;
 450                                 rp[1] = DIG(val & 03);
 451                                 rp += 4;
 452                         }
 453                         else
 454                                 *rp++ = *vp;
 455                 }
 456         }
 457         else
 458         {
 459                 elog(ERROR, "unrecognized \"bytea_output\" setting: %d",
 460                          bytea_output);
 461                 rp = result = NULL;             /* keep compiler quiet */
 462         }
 463         *rp = '\0';
 464         PG_RETURN_CSTRING(result);
 465 }
 466
 467 /*
 468  *              bytearecv                       - converts external binary format to bytea
 469  */
 470 Datum
 471 bytearecv(PG_FUNCTION_ARGS)
 472 {
 473         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
 474         bytea      *result;
 475         int                     nbytes;
 476
 477         nbytes = buf->len - buf->cursor;
 478         result = (bytea *) palloc(nbytes + VARHDRSZ);
 479         SET_VARSIZE(result, nbytes + VARHDRSZ);
 480         pq_copymsgbytes(buf, VARDATA(result), nbytes);
 481         PG_RETURN_BYTEA_P(result);
 482 }
 483
 484 /*
 485  *              byteasend                       - converts bytea to binary format
 486  *
 487  * This is a special case: just copy the input...
 488  */
 489 Datum
 490 byteasend(PG_FUNCTION_ARGS)
 491 {
 492         bytea      *vlena = PG_GETARG_BYTEA_P_COPY(0);
 493
 494         PG_RETURN_BYTEA_P(vlena);
 495 }
 496
 497 Datum
 498 bytea_string_agg_transfn(PG_FUNCTION_ARGS)
 499 {
 500         StringInfo      state;
 501
 502         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
 503
 504         /* Append the value unless null, preceding it with the delimiter. */
 505         if (!PG_ARGISNULL(1))
 506         {
 507                 bytea      *value = PG_GETARG_BYTEA_PP(1);
 508                 bool            isfirst = false;
 509
 510                 /*
 511                  * You might think we can just throw away the first delimiter, however
 512                  * we must keep it as we may be a parallel worker doing partial
 513                  * aggregation building a state to send to the main process.  We need
 514                  * to keep the delimiter of every aggregation so that the combine
 515                  * function can properly join up the strings of two separately
 516                  * partially aggregated results.  The first delimiter is only stripped
 517                  * off in the final function.  To know how much to strip off the front
 518                  * of the string, we store the length of the first delimiter in the
 519                  * StringInfo's cursor field, which we don't otherwise need here.
 520                  */
 521                 if (state == NULL)
 522                 {
 523                         state = makeStringAggState(fcinfo);
 524                         isfirst = true;
 525                 }
 526
 527                 if (!PG_ARGISNULL(2))
 528                 {
 529                         bytea      *delim = PG_GETARG_BYTEA_PP(2);
 530
 531                         appendBinaryStringInfo(state, VARDATA_ANY(delim),
 532                                                                    VARSIZE_ANY_EXHDR(delim));
 533                         if (isfirst)
 534                                 state->cursor = VARSIZE_ANY_EXHDR(delim);
 535                 }
 536
 537                 appendBinaryStringInfo(state, VARDATA_ANY(value),
 538                                                            VARSIZE_ANY_EXHDR(value));
 539         }
 540
 541         /*
 542          * The transition type for string_agg() is declared to be "internal",
 543          * which is a pass-by-value type the same size as a pointer.
 544          */
 545         if (state)
 546                 PG_RETURN_POINTER(state);
 547         PG_RETURN_NULL();
 548 }
 549
 550 Datum
 551 bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
 552 {
 553         StringInfo      state;
 554
 555         /* cannot be called directly because of internal-type argument */
 556         Assert(AggCheckCallContext(fcinfo, NULL));
 557
 558         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
 559
 560         if (state != NULL)
 561         {
 562                 /* As per comment in transfn, strip data before the cursor position */
 563                 bytea      *result;
 564                 int                     strippedlen = state->len - state->cursor;
 565
 566                 result = (bytea *) palloc(strippedlen + VARHDRSZ);
 567                 SET_VARSIZE(result, strippedlen + VARHDRSZ);
 568                 memcpy(VARDATA(result), &state->data[state->cursor], strippedlen);
 569                 PG_RETURN_BYTEA_P(result);
 570         }
 571         else
 572                 PG_RETURN_NULL();
 573 }
 574
 575 /*
 576  *              textin                  - converts cstring to internal representation
 577  */
 578 Datum
 579 textin(PG_FUNCTION_ARGS)
 580 {
 581         char       *inputText = PG_GETARG_CSTRING(0);
 582
 583         PG_RETURN_TEXT_P(cstring_to_text(inputText));
 584 }
 585
 586 /*
 587  *              textout                 - converts internal representation to cstring
 588  */
 589 Datum
 590 textout(PG_FUNCTION_ARGS)
 591 {
 592         Datum           txt = PG_GETARG_DATUM(0);
 593
 594         PG_RETURN_CSTRING(TextDatumGetCString(txt));
 595 }
 596
 597 /*
 598  *              textrecv                        - converts external binary format to text
 599  */
 600 Datum
 601 textrecv(PG_FUNCTION_ARGS)
 602 {
 603         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
 604         text       *result;
 605         char       *str;
 606         int                     nbytes;
 607
 608         str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
 609
 610         result = cstring_to_text_with_len(str, nbytes);
 611         pfree(str);
 612         PG_RETURN_TEXT_P(result);
 613 }
 614
 615 /*
 616  *              textsend                        - converts text to binary format
 617  */
 618 Datum
 619 textsend(PG_FUNCTION_ARGS)
 620 {
 621         text       *t = PG_GETARG_TEXT_PP(0);
 622         StringInfoData buf;
 623
 624         pq_begintypsend(&buf);
 625         pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
 626         PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
 627 }
 628
 629
 630 /*
 631  *              unknownin                       - converts cstring to internal representation
 632  */
 633 Datum
 634 unknownin(PG_FUNCTION_ARGS)
 635 {
 636         char       *str = PG_GETARG_CSTRING(0);
 637
 638         /* representation is same as cstring */
 639         PG_RETURN_CSTRING(pstrdup(str));
 640 }
 641
 642 /*
 643  *              unknownout                      - converts internal representation to cstring
 644  */
 645 Datum
 646 unknownout(PG_FUNCTION_ARGS)
 647 {
 648         /* representation is same as cstring */
 649         char       *str = PG_GETARG_CSTRING(0);
 650
 651         PG_RETURN_CSTRING(pstrdup(str));
 652 }
 653
 654 /*
 655  *              unknownrecv                     - converts external binary format to unknown
 656  */
 657 Datum
 658 unknownrecv(PG_FUNCTION_ARGS)
 659 {
 660         StringInfo      buf = (StringInfo) PG_GETARG_POINTER(0);
 661         char       *str;
 662         int                     nbytes;
 663
 664         str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
 665         /* representation is same as cstring */
 666         PG_RETURN_CSTRING(str);
 667 }
 668
 669 /*
 670  *              unknownsend                     - converts unknown to binary format
 671  */
 672 Datum
 673 unknownsend(PG_FUNCTION_ARGS)
 674 {
 675         /* representation is same as cstring */
 676         char       *str = PG_GETARG_CSTRING(0);
 677         StringInfoData buf;
 678
 679         pq_begintypsend(&buf);
 680         pq_sendtext(&buf, str, strlen(str));
 681         PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
 682 }
 683
 684
 685 /* ========== PUBLIC ROUTINES ========== */
 686
 687 /*
 688  * textlen -
 689  *        returns the logical length of a text*
 690  *         (which is less than the VARSIZE of the text*)
 691  */
 692 Datum
 693 textlen(PG_FUNCTION_ARGS)
 694 {
 695         Datum           str = PG_GETARG_DATUM(0);
 696
 697         /* try to avoid decompressing argument */
 698         PG_RETURN_INT32(text_length(str));
 699 }
 700
 701 /*
 702  * text_length -
 703  *      Does the real work for textlen()
 704  *
 705  *      This is broken out so it can be called directly by other string processing
 706  *      functions.  Note that the argument is passed as a Datum, to indicate that
 707  *      it may still be in compressed form.  We can avoid decompressing it at all
 708  *      in some cases.
 709  */
 710 static int32
 711 text_length(Datum str)
 712 {
 713         /* fastpath when max encoding length is one */
 714         if (pg_database_encoding_max_length() == 1)
 715                 PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
 716         else
 717         {
 718                 text       *t = DatumGetTextPP(str);
 719
 720                 PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
 721                                                                                          VARSIZE_ANY_EXHDR(t)));
 722         }
 723 }
 724
 725 /*
 726  * textoctetlen -
 727  *        returns the physical length of a text*
 728  *         (which is less than the VARSIZE of the text*)
 729  */
 730 Datum
 731 textoctetlen(PG_FUNCTION_ARGS)
 732 {
 733         Datum           str = PG_GETARG_DATUM(0);
 734
 735         /* We need not detoast the input at all */
 736         PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
 737 }
 738
 739 /*
 740  * textcat -
 741  *        takes two text* and returns a text* that is the concatenation of
 742  *        the two.
 743  *
 744  * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
 745  * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
 746  * Allocate space for output in all cases.
 747  * XXX - thomas 1997-07-10
 748  */
 749 Datum
 750 textcat(PG_FUNCTION_ARGS)
 751 {
 752         text       *t1 = PG_GETARG_TEXT_PP(0);
 753         text       *t2 = PG_GETARG_TEXT_PP(1);
 754
 755         PG_RETURN_TEXT_P(text_catenate(t1, t2));
 756 }
 757
 758 /*
 759  * text_catenate
 760  *      Guts of textcat(), broken out so it can be used by other functions
 761  *
 762  * Arguments can be in short-header form, but not compressed or out-of-line
 763  */
 764 static text *
 765 text_catenate(text *t1, text *t2)
 766 {
 767         text       *result;
 768         int                     len1,
 769                                 len2,
 770                                 len;
 771         char       *ptr;
 772
 773         len1 = VARSIZE_ANY_EXHDR(t1);
 774         len2 = VARSIZE_ANY_EXHDR(t2);
 775
 776         /* paranoia ... probably should throw error instead? */
 777         if (len1 < 0)
 778                 len1 = 0;
 779         if (len2 < 0)
 780                 len2 = 0;
 781
 782         len = len1 + len2 + VARHDRSZ;
 783         result = (text *) palloc(len);
 784
 785         /* Set size of result string... */
 786         SET_VARSIZE(result, len);
 787
 788         /* Fill data field of result string... */
 789         ptr = VARDATA(result);
 790         if (len1 > 0)
 791                 memcpy(ptr, VARDATA_ANY(t1), len1);
 792         if (len2 > 0)
 793                 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
 794
 795         return result;
 796 }
 797
 798 /*
 799  * charlen_to_bytelen()
 800  *      Compute the number of bytes occupied by n characters starting at *p
 801  *
 802  * It is caller's responsibility that there actually are n characters;
 803  * the string need not be null-terminated.
 804  */
 805 static int
 806 charlen_to_bytelen(const char *p, int n)
 807 {
 808         if (pg_database_encoding_max_length() == 1)
 809         {
 810                 /* Optimization for single-byte encodings */
 811                 return n;
 812         }
 813         else
 814         {
 815                 const char *s;
 816
 817                 for (s = p; n > 0; n--)
 818                         s += pg_mblen(s);
 819
 820                 return s - p;
 821         }
 822 }
 823
 824 /*
 825  * text_substr()
 826  * Return a substring starting at the specified position.
 827  * - thomas 1997-12-31
 828  *
 829  * Input:
 830  *      - string
 831  *      - starting position (is one-based)
 832  *      - string length
 833  *
 834  * If the starting position is zero or less, then return from the start of the string
 835  *      adjusting the length to be consistent with the "negative start" per SQL.
 * If the length is less than zero, an error is thrown.  Returning the rest of
 *	the string is what text_substr_no_len() does when no length is given.
 837  *
 838  * Added multibyte support.
 839  * - Tatsuo Ishii 1998-4-21
 840  * Changed behavior if starting position is less than one to conform to SQL behavior.
 841  * Formerly returned the entire string; now returns a portion.
 842  * - Thomas Lockhart 1998-12-10
 843  * Now uses faster TOAST-slicing interface
 844  * - John Gray 2002-02-22
 845  * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
 846  * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
 847  * error; if E < 1, return '', not entire string). Fixed MB related bug when
 848  * S > LC and < LC + 4 sometimes garbage characters are returned.
 849  * - Joe Conway 2002-08-10
 850  */
 851 Datum
 852 text_substr(PG_FUNCTION_ARGS)
 853 {
 854         PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
 855                                                                         PG_GETARG_INT32(1),
 856                                                                         PG_GETARG_INT32(2),
 857                                                                         false));
 858 }
 859
 860 /*
 861  * text_substr_no_len -
 862  *        Wrapper to avoid opr_sanity failure due to
 863  *        one function accepting a different number of args.
 864  */
 865 Datum
 866 text_substr_no_len(PG_FUNCTION_ARGS)
 867 {
 868         PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
 869                                                                         PG_GETARG_INT32(1),
 870                                                                         -1, true));
 871 }
 872
/*
 * text_substring -
 *	Does the real work for text_substr() and text_substr_no_len()
 *
 *	This is broken out so it can be called directly by other string processing
 *	functions.  Note that the argument is passed as a Datum, to indicate that
 *	it may still be in compressed/toasted form.  We can avoid detoasting all
 *	of it in some cases.
 *
 *	Arguments:
 *		str - source string, possibly still toasted
 *		start - one-based start position in characters; values below 1 are
 *			accepted and shorten the result accordingly
 *		length - substring length in characters; ignored when
 *			length_not_specified is true
 *		length_not_specified - if true, the substring runs to end of string
 *
 *	A negative length (when one is specified) raises ERRCODE_SUBSTRING_ERROR.
 *
 *	The result is always a freshly palloc'd datum.
 */
static text *
text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
{
	int32		eml = pg_database_encoding_max_length();
	int32		S = start;		/* start position */
	int32		S1;				/* adjusted start position */
	int32		L1;				/* adjusted substring length */
	int32		E;				/* end position */

	/*
	 * SQL99 says S can be zero or negative (which we don't document), but we
	 * still must fetch from the start of the string.
	 * https://www.postgresql.org/message-id/170905442373.643.11536838320909376197%40wrigleys.postgresql.org
	 */
	S1 = Max(S, 1);

	/* life is easy if the encoding max length is 1 */
	if (eml == 1)
	{
		/*
		 * Work out L1, the number of bytes to fetch starting at S1.  A value
		 * of -1 means "to end of string".
		 */
		if (length_not_specified)	/* special case - get length to end of
									 * string */
			L1 = -1;
		else if (length < 0)
		{
			/* SQL99 says to throw an error for E < S, i.e., negative length */
			ereport(ERROR,
					(errcode(ERRCODE_SUBSTRING_ERROR),
					 errmsg("negative substring length not allowed")));
			L1 = -1;			/* silence stupider compilers */
		}
		else if (pg_add_s32_overflow(S, length, &E))
		{
			/*
			 * L could be large enough for S + L to overflow, in which case
			 * the substring must run to end of string.
			 */
			L1 = -1;
		}
		else
		{
			/*
			 * E = S + L is the position just past the last wanted character.
			 * Since L is not negative here, E can be less than 1 only if the
			 * start was zero or negative.  SQL99 says to return a zero-length
			 * string.
			 */
			if (E < 1)
				return cstring_to_text("");

			/* count from the clamped start, not the original one */
			L1 = E - S1;
		}

		/*
		 * If the start position is past the end of the string, SQL99 says to
		 * return a zero-length string -- DatumGetTextPSlice() will do that
		 * for us.  We need only convert S1 to zero-based starting position.
		 */
		return DatumGetTextPSlice(str, S1 - 1, L1);
	}
	else if (eml > 1)
	{
		/*
		 * When encoding max length is > 1, we can't get LC without
		 * detoasting, so we'll grab a conservatively large slice now and go
		 * back later to do the right thing
		 */
		int32		slice_start;	/* zero-based byte offset of the slice */
		int32		slice_size;		/* slice length in bytes, -1 for "all" */
		int32		slice_strlen;	/* slice length in characters */
		text	   *slice;
		int32		E1;				/* end position, clamped to the slice */
		int32		i;
		char	   *p;
		char	   *s;
		text	   *ret;

		/*
		 * We need to start at position zero because there is no way to know
		 * in advance which byte offset corresponds to the supplied start
		 * position.
		 */
		slice_start = 0;

		/*
		 * Same case analysis as in the single-byte branch, except that here
		 * L1 counts characters and slice_size is the byte count to fetch.
		 */
		if (length_not_specified)	/* special case - get length to end of
									 * string */
			slice_size = L1 = -1;
		else if (length < 0)
		{
			/* SQL99 says to throw an error for E < S, i.e., negative length */
			ereport(ERROR,
					(errcode(ERRCODE_SUBSTRING_ERROR),
					 errmsg("negative substring length not allowed")));
			slice_size = L1 = -1;	/* silence stupider compilers */
		}
		else if (pg_add_s32_overflow(S, length, &E))
		{
			/*
			 * L could be large enough for S + L to overflow, in which case
			 * the substring must run to end of string.
			 */
			slice_size = L1 = -1;
		}
		else
		{
			/*
			 * Since L is not negative here, E can be less than 1 only if the
			 * start was zero or negative.  SQL99 says to return a zero-length
			 * string.
			 */
			if (E < 1)
				return cstring_to_text("");

			/*
			 * if E is past the end of the string, the tuple toaster will
			 * truncate the length for us
			 */
			L1 = E - S1;

			/*
			 * Total slice size in bytes can't be any longer than the start
			 * position plus substring length times the encoding max length.
			 * If that overflows, we can just use -1.
			 */
			if (pg_mul_s32_overflow(E, eml, &slice_size))
				slice_size = -1;
		}

		/*
		 * If we're working with an untoasted source, no need to do an extra
		 * copying step.  In that case "slice" is the caller's own datum, so
		 * every pfree(slice) below is guarded by a pointer comparison.
		 */
		if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
			VARATT_IS_EXTERNAL(DatumGetPointer(str)))
			slice = DatumGetTextPSlice(str, slice_start, slice_size);
		else
			slice = (text *) DatumGetPointer(str);

		/* see if we got back an empty string */
		if (VARSIZE_ANY_EXHDR(slice) == 0)
		{
			if (slice != (text *) DatumGetPointer(str))
				pfree(slice);
			return cstring_to_text("");
		}

		/* Now we can get the actual length of the slice in MB characters */
		slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
											VARSIZE_ANY_EXHDR(slice));

		/*
		 * Check that the start position wasn't > slice_strlen. If so, SQL99
		 * says to return a zero-length string.
		 */
		if (S1 > slice_strlen)
		{
			if (slice != (text *) DatumGetPointer(str))
				pfree(slice);
			return cstring_to_text("");
		}

		/*
		 * Adjust L1 and E1 now that we know the slice string length. Again
		 * remember that S1 is one based, and slice_start is zero based.
		 */
		if (L1 > -1)
			E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
		else
			E1 = slice_start + 1 + slice_strlen;

		/*
		 * Find the start position in the slice; remember S1 is not zero based
		 */
		p = VARDATA_ANY(slice);
		for (i = 0; i < S1 - 1; i++)
			p += pg_mblen(p);

		/* hang onto a pointer to our start position */
		s = p;

		/*
		 * Count the actual bytes used by the substring of the requested
		 * length.
		 */
		for (i = S1; i < E1; i++)
			p += pg_mblen(p);

		/* Build the result from the bytes between s and p */
		ret = (text *) palloc(VARHDRSZ + (p - s));
		SET_VARSIZE(ret, VARHDRSZ + (p - s));
		memcpy(VARDATA(ret), s, (p - s));

		if (slice != (text *) DatumGetPointer(str))
			pfree(slice);

		return ret;
	}
	else
		elog(ERROR, "invalid backend encoding: encoding max length < 1");

	/* not reached: suppress compiler warning */
	return NULL;
}
1084
1085 /*
1086  * textoverlay
1087  *      Replace specified substring of first string with second
1088  *
1089  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1090  * This code is a direct implementation of what the standard says.
1091  */
1092 Datum
1093 textoverlay(PG_FUNCTION_ARGS)
1094 {
1095         text       *t1 = PG_GETARG_TEXT_PP(0);
1096         text       *t2 = PG_GETARG_TEXT_PP(1);
1097         int                     sp = PG_GETARG_INT32(2);        /* substring start position */
1098         int                     sl = PG_GETARG_INT32(3);        /* substring length */
1099
1100         PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1101 }
1102
1103 Datum
1104 textoverlay_no_len(PG_FUNCTION_ARGS)
1105 {
1106         text       *t1 = PG_GETARG_TEXT_PP(0);
1107         text       *t2 = PG_GETARG_TEXT_PP(1);
1108         int                     sp = PG_GETARG_INT32(2);        /* substring start position */
1109         int                     sl;
1110
1111         sl = text_length(PointerGetDatum(t2));  /* defaults to length(t2) */
1112         PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1113 }
1114
1115 static text *
1116 text_overlay(text *t1, text *t2, int sp, int sl)
1117 {
1118         text       *result;
1119         text       *s1;
1120         text       *s2;
1121         int                     sp_pl_sl;
1122
1123         /*
1124          * Check for possible integer-overflow cases.  For negative sp, throw a
1125          * "substring length" error because that's what should be expected
1126          * according to the spec's definition of OVERLAY().
1127          */
1128         if (sp <= 0)
1129                 ereport(ERROR,
1130                                 (errcode(ERRCODE_SUBSTRING_ERROR),
1131                                  errmsg("negative substring length not allowed")));
1132         if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
1133                 ereport(ERROR,
1134                                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1135                                  errmsg("integer out of range")));
1136
1137         s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1138         s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1139         result = text_catenate(s1, t2);
1140         result = text_catenate(result, s2);
1141
1142         return result;
1143 }
1144
1145 /*
1146  * textpos -
1147  *        Return the position of the specified substring.
1148  *        Implements the SQL POSITION() function.
1149  *        Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1150  * - thomas 1997-07-27
1151  */
1152 Datum
1153 textpos(PG_FUNCTION_ARGS)
1154 {
1155         text       *str = PG_GETARG_TEXT_PP(0);
1156         text       *search_str = PG_GETARG_TEXT_PP(1);
1157
1158         PG_RETURN_INT32((int32) text_position(str, search_str, PG_GET_COLLATION()));
1159 }
1160
1161 /*
1162  * text_position -
1163  *      Does the real work for textpos()
1164  *
1165  * Inputs:
1166  *              t1 - string to be searched
1167  *              t2 - pattern to match within t1
1168  * Result:
1169  *              Character index of the first matched char, starting from 1,
1170  *              or 0 if no match.
1171  *
1172  *      This is broken out so it can be called directly by other string processing
1173  *      functions.
1174  */
1175 static int
1176 text_position(text *t1, text *t2, Oid collid)
1177 {
1178         TextPositionState state;
1179         int                     result;
1180
1181         /* Empty needle always matches at position 1 */
1182         if (VARSIZE_ANY_EXHDR(t2) < 1)
1183                 return 1;
1184
1185         /* Otherwise, can't match if haystack is shorter than needle */
1186         if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1187                 return 0;
1188
1189         text_position_setup(t1, t2, collid, &state);
1190         if (!text_position_next(&state))
1191                 result = 0;
1192         else
1193                 result = text_position_get_match_pos(&state);
1194         text_position_cleanup(&state);
1195         return result;
1196 }
1197
1198
1199 /*
1200  * text_position_setup, text_position_next, text_position_cleanup -
1201  *      Component steps of text_position()
1202  *
1203  * These are broken out so that a string can be efficiently searched for
1204  * multiple occurrences of the same pattern.  text_position_next may be
1205  * called multiple times, and it advances to the next match on each call.
1206  * text_position_get_match_ptr() and text_position_get_match_pos() return
1207  * a pointer or 1-based character position of the last match, respectively.
1208  *
1209  * The "state" variable is normally just a local variable in the caller.
1210  *
1211  * NOTE: text_position_next skips over the matched portion.  For example,
1212  * searching for "xx" in "xxx" returns only one match, not two.
1213  */
1214
1215 static void
1216 text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1217 {
1218         int                     len1 = VARSIZE_ANY_EXHDR(t1);
1219         int                     len2 = VARSIZE_ANY_EXHDR(t2);
1220         pg_locale_t mylocale;
1221
1222         check_collation_set(collid);
1223
1224         mylocale = pg_newlocale_from_collation(collid);
1225
1226         if (!mylocale->deterministic)
1227                 ereport(ERROR,
1228                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1229                                  errmsg("nondeterministic collations are not supported for substring searches")));
1230
1231         Assert(len1 > 0);
1232         Assert(len2 > 0);
1233
1234         /*
1235          * Even with a multi-byte encoding, we perform the search using the raw
1236          * byte sequence, ignoring multibyte issues.  For UTF-8, that works fine,
1237          * because in UTF-8 the byte sequence of one character cannot contain
1238          * another character.  For other multi-byte encodings, we do the search
1239          * initially as a simple byte search, ignoring multibyte issues, but
1240          * verify afterwards that the match we found is at a character boundary,
1241          * and continue the search if it was a false match.
1242          */
1243         if (pg_database_encoding_max_length() == 1)
1244                 state->is_multibyte_char_in_char = false;
1245         else if (GetDatabaseEncoding() == PG_UTF8)
1246                 state->is_multibyte_char_in_char = false;
1247         else
1248                 state->is_multibyte_char_in_char = true;
1249
1250         state->str1 = VARDATA_ANY(t1);
1251         state->str2 = VARDATA_ANY(t2);
1252         state->len1 = len1;
1253         state->len2 = len2;
1254         state->last_match = NULL;
1255         state->refpoint = state->str1;
1256         state->refpos = 0;
1257
1258         /*
1259          * Prepare the skip table for Boyer-Moore-Horspool searching.  In these
1260          * notes we use the terminology that the "haystack" is the string to be
1261          * searched (t1) and the "needle" is the pattern being sought (t2).
1262          *
1263          * If the needle is empty or bigger than the haystack then there is no
1264          * point in wasting cycles initializing the table.  We also choose not to
1265          * use B-M-H for needles of length 1, since the skip table can't possibly
1266          * save anything in that case.
1267          */
1268         if (len1 >= len2 && len2 > 1)
1269         {
1270                 int                     searchlength = len1 - len2;
1271                 int                     skiptablemask;
1272                 int                     last;
1273                 int                     i;
1274                 const char *str2 = state->str2;
1275
1276                 /*
1277                  * First we must determine how much of the skip table to use.  The
1278                  * declaration of TextPositionState allows up to 256 elements, but for
1279                  * short search problems we don't really want to have to initialize so
1280                  * many elements --- it would take too long in comparison to the
1281                  * actual search time.  So we choose a useful skip table size based on
1282                  * the haystack length minus the needle length.  The closer the needle
1283                  * length is to the haystack length the less useful skipping becomes.
1284                  *
1285                  * Note: since we use bit-masking to select table elements, the skip
1286                  * table size MUST be a power of 2, and so the mask must be 2^N-1.
1287                  */
1288                 if (searchlength < 16)
1289                         skiptablemask = 3;
1290                 else if (searchlength < 64)
1291                         skiptablemask = 7;
1292                 else if (searchlength < 128)
1293                         skiptablemask = 15;
1294                 else if (searchlength < 512)
1295                         skiptablemask = 31;
1296                 else if (searchlength < 2048)
1297                         skiptablemask = 63;
1298                 else if (searchlength < 4096)
1299                         skiptablemask = 127;
1300                 else
1301                         skiptablemask = 255;
1302                 state->skiptablemask = skiptablemask;
1303
1304                 /*
1305                  * Initialize the skip table.  We set all elements to the needle
1306                  * length, since this is the correct skip distance for any character
1307                  * not found in the needle.
1308                  */
1309                 for (i = 0; i <= skiptablemask; i++)
1310                         state->skiptable[i] = len2;
1311
1312                 /*
1313                  * Now examine the needle.  For each character except the last one,
1314                  * set the corresponding table element to the appropriate skip
1315                  * distance.  Note that when two characters share the same skip table
1316                  * entry, the one later in the needle must determine the skip
1317                  * distance.
1318                  */
1319                 last = len2 - 1;
1320
1321                 for (i = 0; i < last; i++)
1322                         state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1323         }
1324 }
1325
1326 /*
1327  * Advance to the next match, starting from the end of the previous match
1328  * (or the beginning of the string, on first call).  Returns true if a match
1329  * is found.
1330  *
1331  * Note that this refuses to match an empty-string needle.  Most callers
1332  * will have handled that case specially and we'll never see it here.
1333  */
1334 static bool
1335 text_position_next(TextPositionState *state)
1336 {
1337         int                     needle_len = state->len2;
1338         char       *start_ptr;
1339         char       *matchptr;
1340
1341         if (needle_len <= 0)
1342                 return false;                   /* result for empty pattern */
1343
1344         /* Start from the point right after the previous match. */
1345         if (state->last_match)
1346                 start_ptr = state->last_match + needle_len;
1347         else
1348                 start_ptr = state->str1;
1349
1350 retry:
1351         matchptr = text_position_next_internal(start_ptr, state);
1352
1353         if (!matchptr)
1354                 return false;
1355
1356         /*
1357          * Found a match for the byte sequence.  If this is a multibyte encoding,
1358          * where one character's byte sequence can appear inside a longer
1359          * multi-byte character, we need to verify that the match was at a
1360          * character boundary, not in the middle of a multi-byte character.
1361          */
1362         if (state->is_multibyte_char_in_char)
1363         {
1364                 /* Walk one character at a time, until we reach the match. */
1365
1366                 /* the search should never move backwards. */
1367                 Assert(state->refpoint <= matchptr);
1368
1369                 while (state->refpoint < matchptr)
1370                 {
1371                         /* step to next character. */
1372                         state->refpoint += pg_mblen(state->refpoint);
1373                         state->refpos++;
1374
1375                         /*
1376                          * If we stepped over the match's start position, then it was a
1377                          * false positive, where the byte sequence appeared in the middle
1378                          * of a multi-byte character.  Skip it, and continue the search at
1379                          * the next character boundary.
1380                          */
1381                         if (state->refpoint > matchptr)
1382                         {
1383                                 start_ptr = state->refpoint;
1384                                 goto retry;
1385                         }
1386                 }
1387         }
1388
1389         state->last_match = matchptr;
1390         return true;
1391 }
1392
1393 /*
1394  * Subroutine of text_position_next().  This searches for the raw byte
1395  * sequence, ignoring any multi-byte encoding issues.  Returns the first
1396  * match starting at 'start_ptr', or NULL if no match is found.
1397  */
1398 static char *
1399 text_position_next_internal(char *start_ptr, TextPositionState *state)
1400 {
1401         int                     haystack_len = state->len1;
1402         int                     needle_len = state->len2;
1403         int                     skiptablemask = state->skiptablemask;
1404         const char *haystack = state->str1;
1405         const char *needle = state->str2;
1406         const char *haystack_end = &haystack[haystack_len];
1407         const char *hptr;
1408
1409         Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1410
1411         if (needle_len == 1)
1412         {
1413                 /* No point in using B-M-H for a one-character needle */
1414                 char            nchar = *needle;
1415
1416                 hptr = start_ptr;
1417                 while (hptr < haystack_end)
1418                 {
1419                         if (*hptr == nchar)
1420                                 return (char *) hptr;
1421                         hptr++;
1422                 }
1423         }
1424         else
1425         {
1426                 const char *needle_last = &needle[needle_len - 1];
1427
1428                 /* Start at startpos plus the length of the needle */
1429                 hptr = start_ptr + needle_len - 1;
1430                 while (hptr < haystack_end)
1431                 {
1432                         /* Match the needle scanning *backward* */
1433                         const char *nptr;
1434                         const char *p;
1435
1436                         nptr = needle_last;
1437                         p = hptr;
1438                         while (*nptr == *p)
1439                         {
1440                                 /* Matched it all?      If so, return 1-based position */
1441                                 if (nptr == needle)
1442                                         return (char *) p;
1443                                 nptr--, p--;
1444                         }
1445
1446                         /*
1447                          * No match, so use the haystack char at hptr to decide how far to
1448                          * advance.  If the needle had any occurrence of that character
1449                          * (or more precisely, one sharing the same skiptable entry)
1450                          * before its last character, then we advance far enough to align
1451                          * the last such needle character with that haystack position.
1452                          * Otherwise we can advance by the whole needle length.
1453                          */
1454                         hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1455                 }
1456         }
1457
1458         return 0;                                       /* not found */
1459 }
1460
1461 /*
1462  * Return a pointer to the current match.
1463  *
1464  * The returned pointer points into the original haystack string.
1465  */
1466 static char *
1467 text_position_get_match_ptr(TextPositionState *state)
1468 {
1469         return state->last_match;
1470 }
1471
1472 /*
1473  * Return the offset of the current match.
1474  *
1475  * The offset is in characters, 1-based.
1476  */
1477 static int
1478 text_position_get_match_pos(TextPositionState *state)
1479 {
1480         /* Convert the byte position to char position. */
1481         state->refpos += pg_mbstrlen_with_len(state->refpoint,
1482                                                                                   state->last_match - state->refpoint);
1483         state->refpoint = state->last_match;
1484         return state->refpos + 1;
1485 }
1486
1487 /*
1488  * Reset search state to the initial state installed by text_position_setup.
1489  *
1490  * The next call to text_position_next will search from the beginning
1491  * of the string.
1492  */
1493 static void
1494 text_position_reset(TextPositionState *state)
1495 {
1496         state->last_match = NULL;
1497         state->refpoint = state->str1;
1498         state->refpos = 0;
1499 }
1500
/*
 * Counterpart to text_position_setup, called when a search is finished.
 * It currently does nothing.
 */
static void
text_position_cleanup(TextPositionState *state)
{
	/* no cleanup needed */
}
1506
1507
1508 static void
1509 check_collation_set(Oid collid)
1510 {
1511         if (!OidIsValid(collid))
1512         {
1513                 /*
1514                  * This typically means that the parser could not resolve a conflict
1515                  * of implicit collations, so report it that way.
1516                  */
1517                 ereport(ERROR,
1518                                 (errcode(ERRCODE_INDETERMINATE_COLLATION),
1519                                  errmsg("could not determine which collation to use for string comparison"),
1520                                  errhint("Use the COLLATE clause to set the collation explicitly.")));
1521         }
1522 }
1523
1524 /*
1525  * varstr_cmp()
1526  *
1527  * Comparison function for text strings with given lengths, using the
1528  * appropriate locale. Returns an integer less than, equal to, or greater than
1529  * zero, indicating whether arg1 is less than, equal to, or greater than arg2.
1530  *
1531  * Note: many functions that depend on this are marked leakproof; therefore,
1532  * avoid reporting the actual contents of the input when throwing errors.
1533  * All errors herein should be things that can't happen except on corrupt
1534  * data, anyway; otherwise we will have trouble with indexing strings that
1535  * would cause them.
1536  */
1537 int
1538 varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
1539 {
1540         int                     result;
1541         pg_locale_t mylocale;
1542
1543         check_collation_set(collid);
1544
1545         mylocale = pg_newlocale_from_collation(collid);
1546
1547         if (mylocale->collate_is_c)
1548         {
1549                 result = memcmp(arg1, arg2, Min(len1, len2));
1550                 if ((result == 0) && (len1 != len2))
1551                         result = (len1 < len2) ? -1 : 1;
1552         }
1553         else
1554         {
1555                 /*
1556                  * memcmp() can't tell us which of two unequal strings sorts first,
1557                  * but it's a cheap way to tell if they're equal.  Testing shows that
1558                  * memcmp() followed by strcoll() is only trivially slower than
1559                  * strcoll() by itself, so we don't lose much if this doesn't work out
1560                  * very often, and if it does - for example, because there are many
1561                  * equal strings in the input - then we win big by avoiding expensive
1562                  * collation-aware comparisons.
1563                  */
1564                 if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
1565                         return 0;
1566
1567                 result = pg_strncoll(arg1, len1, arg2, len2, mylocale);
1568
1569                 /* Break tie if necessary. */
1570                 if (result == 0 && mylocale->deterministic)
1571                 {
1572                         result = memcmp(arg1, arg2, Min(len1, len2));
1573                         if ((result == 0) && (len1 != len2))
1574                                 result = (len1 < len2) ? -1 : 1;
1575                 }
1576         }
1577
1578         return result;
1579 }
1580
1581 /* text_cmp()
1582  * Internal comparison function for text strings.
1583  * Returns -1, 0 or 1
1584  */
1585 static int
1586 text_cmp(text *arg1, text *arg2, Oid collid)
1587 {
1588         char       *a1p,
1589                            *a2p;
1590         int                     len1,
1591                                 len2;
1592
1593         a1p = VARDATA_ANY(arg1);
1594         a2p = VARDATA_ANY(arg2);
1595
1596         len1 = VARSIZE_ANY_EXHDR(arg1);
1597         len2 = VARSIZE_ANY_EXHDR(arg2);
1598
1599         return varstr_cmp(a1p, len1, a2p, len2, collid);
1600 }
1601
1602 /*
1603  * Comparison functions for text strings.
1604  *
1605  * Note: btree indexes need these routines not to leak memory; therefore,
1606  * be careful to free working copies of toasted datums.  Most places don't
1607  * need to be so careful.
1608  */
1609
1610 Datum
1611 texteq(PG_FUNCTION_ARGS)
1612 {
1613         Oid                     collid = PG_GET_COLLATION();
1614         pg_locale_t mylocale = 0;
1615         bool            result;
1616
1617         check_collation_set(collid);
1618
1619         mylocale = pg_newlocale_from_collation(collid);
1620
1621         if (mylocale->deterministic)
1622         {
1623                 Datum           arg1 = PG_GETARG_DATUM(0);
1624                 Datum           arg2 = PG_GETARG_DATUM(1);
1625                 Size            len1,
1626                                         len2;
1627
1628                 /*
1629                  * Since we only care about equality or not-equality, we can avoid all
1630                  * the expense of strcoll() here, and just do bitwise comparison.  In
1631                  * fact, we don't even have to do a bitwise comparison if we can show
1632                  * the lengths of the strings are unequal; which might save us from
1633                  * having to detoast one or both values.
1634                  */
1635                 len1 = toast_raw_datum_size(arg1);
1636                 len2 = toast_raw_datum_size(arg2);
1637                 if (len1 != len2)
1638                         result = false;
1639                 else
1640                 {
1641                         text       *targ1 = DatumGetTextPP(arg1);
1642                         text       *targ2 = DatumGetTextPP(arg2);
1643
1644                         result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1645                                                          len1 - VARHDRSZ) == 0);
1646
1647                         PG_FREE_IF_COPY(targ1, 0);
1648                         PG_FREE_IF_COPY(targ2, 1);
1649                 }
1650         }
1651         else
1652         {
1653                 text       *arg1 = PG_GETARG_TEXT_PP(0);
1654                 text       *arg2 = PG_GETARG_TEXT_PP(1);
1655
1656                 result = (text_cmp(arg1, arg2, collid) == 0);
1657
1658                 PG_FREE_IF_COPY(arg1, 0);
1659                 PG_FREE_IF_COPY(arg2, 1);
1660         }
1661
1662         PG_RETURN_BOOL(result);
1663 }
1664
1665 Datum
1666 textne(PG_FUNCTION_ARGS)
1667 {
1668         Oid                     collid = PG_GET_COLLATION();
1669         pg_locale_t mylocale;
1670         bool            result;
1671
1672         check_collation_set(collid);
1673
1674         mylocale = pg_newlocale_from_collation(collid);
1675
1676         if (mylocale->deterministic)
1677         {
1678                 Datum           arg1 = PG_GETARG_DATUM(0);
1679                 Datum           arg2 = PG_GETARG_DATUM(1);
1680                 Size            len1,
1681                                         len2;
1682
1683                 /* See comment in texteq() */
1684                 len1 = toast_raw_datum_size(arg1);
1685                 len2 = toast_raw_datum_size(arg2);
1686                 if (len1 != len2)
1687                         result = true;
1688                 else
1689                 {
1690                         text       *targ1 = DatumGetTextPP(arg1);
1691                         text       *targ2 = DatumGetTextPP(arg2);
1692
1693                         result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1694                                                          len1 - VARHDRSZ) != 0);
1695
1696                         PG_FREE_IF_COPY(targ1, 0);
1697                         PG_FREE_IF_COPY(targ2, 1);
1698                 }
1699         }
1700         else
1701         {
1702                 text       *arg1 = PG_GETARG_TEXT_PP(0);
1703                 text       *arg2 = PG_GETARG_TEXT_PP(1);
1704
1705                 result = (text_cmp(arg1, arg2, collid) != 0);
1706
1707                 PG_FREE_IF_COPY(arg1, 0);
1708                 PG_FREE_IF_COPY(arg2, 1);
1709         }
1710
1711         PG_RETURN_BOOL(result);
1712 }
1713
1714 Datum
1715 text_lt(PG_FUNCTION_ARGS)
1716 {
1717         text       *arg1 = PG_GETARG_TEXT_PP(0);
1718         text       *arg2 = PG_GETARG_TEXT_PP(1);
1719         bool            result;
1720
1721         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1722
1723         PG_FREE_IF_COPY(arg1, 0);
1724         PG_FREE_IF_COPY(arg2, 1);
1725
1726         PG_RETURN_BOOL(result);
1727 }
1728
1729 Datum
1730 text_le(PG_FUNCTION_ARGS)
1731 {
1732         text       *arg1 = PG_GETARG_TEXT_PP(0);
1733         text       *arg2 = PG_GETARG_TEXT_PP(1);
1734         bool            result;
1735
1736         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1737
1738         PG_FREE_IF_COPY(arg1, 0);
1739         PG_FREE_IF_COPY(arg2, 1);
1740
1741         PG_RETURN_BOOL(result);
1742 }
1743
1744 Datum
1745 text_gt(PG_FUNCTION_ARGS)
1746 {
1747         text       *arg1 = PG_GETARG_TEXT_PP(0);
1748         text       *arg2 = PG_GETARG_TEXT_PP(1);
1749         bool            result;
1750
1751         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1752
1753         PG_FREE_IF_COPY(arg1, 0);
1754         PG_FREE_IF_COPY(arg2, 1);
1755
1756         PG_RETURN_BOOL(result);
1757 }
1758
1759 Datum
1760 text_ge(PG_FUNCTION_ARGS)
1761 {
1762         text       *arg1 = PG_GETARG_TEXT_PP(0);
1763         text       *arg2 = PG_GETARG_TEXT_PP(1);
1764         bool            result;
1765
1766         result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1767
1768         PG_FREE_IF_COPY(arg1, 0);
1769         PG_FREE_IF_COPY(arg2, 1);
1770
1771         PG_RETURN_BOOL(result);
1772 }
1773
1774 Datum
1775 text_starts_with(PG_FUNCTION_ARGS)
1776 {
1777         Datum           arg1 = PG_GETARG_DATUM(0);
1778         Datum           arg2 = PG_GETARG_DATUM(1);
1779         Oid                     collid = PG_GET_COLLATION();
1780         pg_locale_t mylocale;
1781         bool            result;
1782         Size            len1,
1783                                 len2;
1784
1785         check_collation_set(collid);
1786
1787         mylocale = pg_newlocale_from_collation(collid);
1788
1789         if (!mylocale->deterministic)
1790                 ereport(ERROR,
1791                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1792                                  errmsg("nondeterministic collations are not supported for substring searches")));
1793
1794         len1 = toast_raw_datum_size(arg1);
1795         len2 = toast_raw_datum_size(arg2);
1796         if (len2 > len1)
1797                 result = false;
1798         else
1799         {
1800                 text       *targ1 = text_substring(arg1, 1, len2, false);
1801                 text       *targ2 = DatumGetTextPP(arg2);
1802
1803                 result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1804                                                  VARSIZE_ANY_EXHDR(targ2)) == 0);
1805
1806                 PG_FREE_IF_COPY(targ1, 0);
1807                 PG_FREE_IF_COPY(targ2, 1);
1808         }
1809
1810         PG_RETURN_BOOL(result);
1811 }
1812
1813 Datum
1814 bttextcmp(PG_FUNCTION_ARGS)
1815 {
1816         text       *arg1 = PG_GETARG_TEXT_PP(0);
1817         text       *arg2 = PG_GETARG_TEXT_PP(1);
1818         int32           result;
1819
1820         result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1821
1822         PG_FREE_IF_COPY(arg1, 0);
1823         PG_FREE_IF_COPY(arg2, 1);
1824
1825         PG_RETURN_INT32(result);
1826 }
1827
1828 Datum
1829 bttextsortsupport(PG_FUNCTION_ARGS)
1830 {
1831         SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1832         Oid                     collid = ssup->ssup_collation;
1833         MemoryContext oldcontext;
1834
1835         oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1836
1837         /* Use generic string SortSupport */
1838         varstr_sortsupport(ssup, TEXTOID, collid);
1839
1840         MemoryContextSwitchTo(oldcontext);
1841
1842         PG_RETURN_VOID();
1843 }
1844
1845 /*
1846  * Generic sortsupport interface for character type's operator classes.
1847  * Includes locale support, and support for BpChar semantics (i.e. removing
1848  * trailing spaces before comparison).
1849  *
1850  * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1851  * same representation.  Callers that always use the C collation (e.g.
1852  * non-collatable type callers like bytea) may have NUL bytes in their strings;
1853  * this will not work with any other collation, though.
1854  */
1855 void
1856 varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
1857 {
1858         bool            abbreviate = ssup->abbreviate;
1859         bool            collate_c = false;
1860         VarStringSortSupport *sss;
1861         pg_locale_t locale;
1862
1863         check_collation_set(collid);
1864
1865         locale = pg_newlocale_from_collation(collid);
1866
1867         /*
1868          * If possible, set ssup->comparator to a function which can be used to
1869          * directly compare two datums.  If we can do this, we'll avoid the
1870          * overhead of a trip through the fmgr layer for every comparison, which
1871          * can be substantial.
1872          *
1873          * Most typically, we'll set the comparator to varlenafastcmp_locale,
1874          * which uses strcoll() to perform comparisons.  We use that for the
1875          * BpChar case too, but type NAME uses namefastcmp_locale. However, if
1876          * LC_COLLATE = C, we can make things quite a bit faster with
1877          * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
1878          * memcmp() rather than strcoll().
1879          */
1880         if (locale->collate_is_c)
1881         {
1882                 if (typid == BPCHAROID)
1883                         ssup->comparator = bpcharfastcmp_c;
1884                 else if (typid == NAMEOID)
1885                 {
1886                         ssup->comparator = namefastcmp_c;
1887                         /* Not supporting abbreviation with type NAME, for now */
1888                         abbreviate = false;
1889                 }
1890                 else
1891                         ssup->comparator = varstrfastcmp_c;
1892
1893                 collate_c = true;
1894         }
1895         else
1896         {
1897                 /*
1898                  * We use varlenafastcmp_locale except for type NAME.
1899                  */
1900                 if (typid == NAMEOID)
1901                 {
1902                         ssup->comparator = namefastcmp_locale;
1903                         /* Not supporting abbreviation with type NAME, for now */
1904                         abbreviate = false;
1905                 }
1906                 else
1907                         ssup->comparator = varlenafastcmp_locale;
1908
1909                 /*
1910                  * Unfortunately, it seems that abbreviation for non-C collations is
1911                  * broken on many common platforms; see pg_strxfrm_enabled().
1912                  *
1913                  * Even apart from the risk of broken locales, it's possible that
1914                  * there are platforms where the use of abbreviated keys should be
1915                  * disabled at compile time.  Having only 4 byte datums could make
1916                  * worst-case performance drastically more likely, for example.
1917                  * Moreover, macOS's strxfrm() implementation is known to not
1918                  * effectively concentrate a significant amount of entropy from the
1919                  * original string in earlier transformed blobs.  It's possible that
1920                  * other supported platforms are similarly encumbered.  So, if we ever
1921                  * get past disabling this categorically, we may still want or need to
1922                  * disable it for particular platforms.
1923                  */
1924                 if (!pg_strxfrm_enabled(locale))
1925                         abbreviate = false;
1926         }
1927
1928         /*
1929          * If we're using abbreviated keys, or if we're using a locale-aware
1930          * comparison, we need to initialize a VarStringSortSupport object. Both
1931          * cases will make use of the temporary buffers we initialize here for
1932          * scratch space (and to detect requirement for BpChar semantics from
1933          * caller), and the abbreviation case requires additional state.
1934          */
1935         if (abbreviate || !collate_c)
1936         {
1937                 sss = palloc(sizeof(VarStringSortSupport));
1938                 sss->buf1 = palloc(TEXTBUFLEN);
1939                 sss->buflen1 = TEXTBUFLEN;
1940                 sss->buf2 = palloc(TEXTBUFLEN);
1941                 sss->buflen2 = TEXTBUFLEN;
1942                 /* Start with invalid values */
1943                 sss->last_len1 = -1;
1944                 sss->last_len2 = -1;
1945                 /* Initialize */
1946                 sss->last_returned = 0;
1947                 if (collate_c)
1948                         sss->locale = NULL;
1949                 else
1950                         sss->locale = locale;
1951
1952                 /*
1953                  * To avoid somehow confusing a strxfrm() blob and an original string,
1954                  * constantly keep track of the variety of data that buf1 and buf2
1955                  * currently contain.
1956                  *
1957                  * Comparisons may be interleaved with conversion calls.  Frequently,
1958                  * conversions and comparisons are batched into two distinct phases,
1959                  * but the correctness of caching cannot hinge upon this.  For
1960                  * comparison caching, buffer state is only trusted if cache_blob is
1961                  * found set to false, whereas strxfrm() caching only trusts the state
1962                  * when cache_blob is found set to true.
1963                  *
1964                  * Arbitrarily initialize cache_blob to true.
1965                  */
1966                 sss->cache_blob = true;
1967                 sss->collate_c = collate_c;
1968                 sss->typid = typid;
1969                 ssup->ssup_extra = sss;
1970
1971                 /*
1972                  * If possible, plan to use the abbreviated keys optimization.  The
1973                  * core code may switch back to authoritative comparator should
1974                  * abbreviation be aborted.
1975                  */
1976                 if (abbreviate)
1977                 {
1978                         sss->prop_card = 0.20;
1979                         initHyperLogLog(&sss->abbr_card, 10);
1980                         initHyperLogLog(&sss->full_card, 10);
1981                         ssup->abbrev_full_comparator = ssup->comparator;
1982                         ssup->comparator = ssup_datum_unsigned_cmp;
1983                         ssup->abbrev_converter = varstr_abbrev_convert;
1984                         ssup->abbrev_abort = varstr_abbrev_abort;
1985                 }
1986         }
1987 }
1988
1989 /*
1990  * sortsupport comparison func (for C locale case)
1991  */
1992 static int
1993 varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
1994 {
1995         VarString  *arg1 = DatumGetVarStringPP(x);
1996         VarString  *arg2 = DatumGetVarStringPP(y);
1997         char       *a1p,
1998                            *a2p;
1999         int                     len1,
2000                                 len2,
2001                                 result;
2002
2003         a1p = VARDATA_ANY(arg1);
2004         a2p = VARDATA_ANY(arg2);
2005
2006         len1 = VARSIZE_ANY_EXHDR(arg1);
2007         len2 = VARSIZE_ANY_EXHDR(arg2);
2008
2009         result = memcmp(a1p, a2p, Min(len1, len2));
2010         if ((result == 0) && (len1 != len2))
2011                 result = (len1 < len2) ? -1 : 1;
2012
2013         /* We can't afford to leak memory here. */
2014         if (PointerGetDatum(arg1) != x)
2015                 pfree(arg1);
2016         if (PointerGetDatum(arg2) != y)
2017                 pfree(arg2);
2018
2019         return result;
2020 }
2021
2022 /*
2023  * sortsupport comparison func (for BpChar C locale case)
2024  *
2025  * BpChar outsources its sortsupport to this module.  Specialization for the
2026  * varstr_sortsupport BpChar case, modeled on
2027  * internal_bpchar_pattern_compare().
2028  */
2029 static int
2030 bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
2031 {
2032         BpChar     *arg1 = DatumGetBpCharPP(x);
2033         BpChar     *arg2 = DatumGetBpCharPP(y);
2034         char       *a1p,
2035                            *a2p;
2036         int                     len1,
2037                                 len2,
2038                                 result;
2039
2040         a1p = VARDATA_ANY(arg1);
2041         a2p = VARDATA_ANY(arg2);
2042
2043         len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2044         len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2045
2046         result = memcmp(a1p, a2p, Min(len1, len2));
2047         if ((result == 0) && (len1 != len2))
2048                 result = (len1 < len2) ? -1 : 1;
2049
2050         /* We can't afford to leak memory here. */
2051         if (PointerGetDatum(arg1) != x)
2052                 pfree(arg1);
2053         if (PointerGetDatum(arg2) != y)
2054                 pfree(arg2);
2055
2056         return result;
2057 }
2058
2059 /*
2060  * sortsupport comparison func (for NAME C locale case)
2061  */
2062 static int
2063 namefastcmp_c(Datum x, Datum y, SortSupport ssup)
2064 {
2065         Name            arg1 = DatumGetName(x);
2066         Name            arg2 = DatumGetName(y);
2067
2068         return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
2069 }
2070
2071 /*
2072  * sortsupport comparison func (for locale case with all varlena types)
2073  */
2074 static int
2075 varlenafastcmp_locale(Datum x, Datum y, SortSupport ssup)
2076 {
2077         VarString  *arg1 = DatumGetVarStringPP(x);
2078         VarString  *arg2 = DatumGetVarStringPP(y);
2079         char       *a1p,
2080                            *a2p;
2081         int                     len1,
2082                                 len2,
2083                                 result;
2084
2085         a1p = VARDATA_ANY(arg1);
2086         a2p = VARDATA_ANY(arg2);
2087
2088         len1 = VARSIZE_ANY_EXHDR(arg1);
2089         len2 = VARSIZE_ANY_EXHDR(arg2);
2090
2091         result = varstrfastcmp_locale(a1p, len1, a2p, len2, ssup);
2092
2093         /* We can't afford to leak memory here. */
2094         if (PointerGetDatum(arg1) != x)
2095                 pfree(arg1);
2096         if (PointerGetDatum(arg2) != y)
2097                 pfree(arg2);
2098
2099         return result;
2100 }
2101
2102 /*
2103  * sortsupport comparison func (for locale case with NAME type)
2104  */
2105 static int
2106 namefastcmp_locale(Datum x, Datum y, SortSupport ssup)
2107 {
2108         Name            arg1 = DatumGetName(x);
2109         Name            arg2 = DatumGetName(y);
2110
2111         return varstrfastcmp_locale(NameStr(*arg1), strlen(NameStr(*arg1)),
2112                                                                 NameStr(*arg2), strlen(NameStr(*arg2)),
2113                                                                 ssup);
2114 }
2115
2116 /*
2117  * sortsupport comparison func for locale cases
2118  */
2119 static int
2120 varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
2121 {
2122         VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2123         int                     result;
2124         bool            arg1_match;
2125
2126         /* Fast pre-check for equality, as discussed in varstr_cmp() */
2127         if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
2128         {
2129                 /*
2130                  * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2131                  * last_len2.  Existing contents of buffers might still be used by
2132                  * next call.
2133                  *
2134                  * It's fine to allow the comparison of BpChar padding bytes here,
2135                  * even though that implies that the memcmp() will usually be
2136                  * performed for BpChar callers (though multibyte characters could
2137                  * still prevent that from occurring).  The memcmp() is still very
2138                  * cheap, and BpChar's funny semantics have us remove trailing spaces
2139                  * (not limited to padding), so we need make no distinction between
2140                  * padding space characters and "real" space characters.
2141                  */
2142                 return 0;
2143         }
2144
2145         if (sss->typid == BPCHAROID)
2146         {
2147                 /* Get true number of bytes, ignoring trailing spaces */
2148                 len1 = bpchartruelen(a1p, len1);
2149                 len2 = bpchartruelen(a2p, len2);
2150         }
2151
2152         if (len1 >= sss->buflen1)
2153         {
2154                 sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2155                 sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2156         }
2157         if (len2 >= sss->buflen2)
2158         {
2159                 sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
2160                 sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2161         }
2162
2163         /*
2164          * We're likely to be asked to compare the same strings repeatedly, and
2165          * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2166          * comparisons, even though in general there is no reason to think that
2167          * that will work out (every string datum may be unique).  Caching does
2168          * not slow things down measurably when it doesn't work out, and can speed
2169          * things up by rather a lot when it does.  In part, this is because the
2170          * memcmp() compares data from cachelines that are needed in L1 cache even
2171          * when the last comparison's result cannot be reused.
2172          */
2173         arg1_match = true;
2174         if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
2175         {
2176                 arg1_match = false;
2177                 memcpy(sss->buf1, a1p, len1);
2178                 sss->buf1[len1] = '\0';
2179                 sss->last_len1 = len1;
2180         }
2181
2182         /*
2183          * If we're comparing the same two strings as last time, we can return the
2184          * same answer without calling strcoll() again.  This is more likely than
2185          * it seems (at least with moderate to low cardinality sets), because
2186          * quicksort compares the same pivot against many values.
2187          */
2188         if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
2189         {
2190                 memcpy(sss->buf2, a2p, len2);
2191                 sss->buf2[len2] = '\0';
2192                 sss->last_len2 = len2;
2193         }
2194         else if (arg1_match && !sss->cache_blob)
2195         {
2196                 /* Use result cached following last actual strcoll() call */
2197                 return sss->last_returned;
2198         }
2199
2200         result = pg_strcoll(sss->buf1, sss->buf2, sss->locale);
2201
2202         /* Break tie if necessary. */
2203         if (result == 0 && sss->locale->deterministic)
2204                 result = strcmp(sss->buf1, sss->buf2);
2205
2206         /* Cache result, perhaps saving an expensive strcoll() call next time */
2207         sss->cache_blob = false;
2208         sss->last_returned = result;
2209         return result;
2210 }
2211
2212 /*
2213  * Conversion routine for sortsupport.  Converts original to abbreviated key
2214  * representation.  Our encoding strategy is simple -- pack the first 8 bytes
2215  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2216  * stored in reverse order), and treat it as an unsigned integer.  When the "C"
2217  * locale is used, or in case of bytea, just memcpy() from original instead.
2218  */
2219 static Datum
2220 varstr_abbrev_convert(Datum original, SortSupport ssup)
2221 {
2222         const size_t max_prefix_bytes = sizeof(Datum);
2223         VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2224         VarString  *authoritative = DatumGetVarStringPP(original);
2225         char       *authoritative_data = VARDATA_ANY(authoritative);
2226
2227         /* working state */
2228         Datum           res;
2229         char       *pres;
2230         int                     len;
2231         uint32          hash;
2232
2233         pres = (char *) &res;
2234         /* memset(), so any non-overwritten bytes are NUL */
2235         memset(pres, 0, max_prefix_bytes);
2236         len = VARSIZE_ANY_EXHDR(authoritative);
2237
2238         /* Get number of bytes, ignoring trailing spaces */
2239         if (sss->typid == BPCHAROID)
2240                 len = bpchartruelen(authoritative_data, len);
2241
2242         /*
2243          * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2244          * abbreviate keys.  The full comparator for the C locale is always
2245          * memcmp().  It would be incorrect to allow bytea callers (callers that
2246          * always force the C collation -- bytea isn't a collatable type, but this
2247          * approach is convenient) to use strxfrm().  This is because bytea
2248          * strings may contain NUL bytes.  Besides, this should be faster, too.
2249          *
2250          * More generally, it's okay that bytea callers can have NUL bytes in
2251          * strings because abbreviated cmp need not make a distinction between
2252          * terminating NUL bytes, and NUL bytes representing actual NULs in the
2253          * authoritative representation.  Hopefully a comparison at or past one
2254          * abbreviated key's terminating NUL byte will resolve the comparison
2255          * without consulting the authoritative representation; specifically, some
2256          * later non-NUL byte in the longer string can resolve the comparison
2257          * against a subsequent terminating NUL in the shorter string.  There will
2258          * usually be what is effectively a "length-wise" resolution there and
2259          * then.
2260          *
2261          * If that doesn't work out -- if all bytes in the longer string
2262          * positioned at or past the offset of the smaller string's (first)
2263          * terminating NUL are actually representative of NUL bytes in the
2264          * authoritative binary string (perhaps with some *terminating* NUL bytes
2265          * towards the end of the longer string iff it happens to still be small)
2266          * -- then an authoritative tie-breaker will happen, and do the right
2267          * thing: explicitly consider string length.
2268          */
2269         if (sss->collate_c)
2270                 memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
2271         else
2272         {
2273                 Size            bsize;
2274
2275                 /*
2276                  * We're not using the C collation, so fall back on strxfrm or ICU
2277                  * analogs.
2278                  */
2279
2280                 /* By convention, we use buffer 1 to store and NUL-terminate */
2281                 if (len >= sss->buflen1)
2282                 {
2283                         sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2284                         sss->buf1 = repalloc(sss->buf1, sss->buflen1);
2285                 }
2286
2287                 /* Might be able to reuse strxfrm() blob from last call */
2288                 if (sss->last_len1 == len && sss->cache_blob &&
2289                         memcmp(sss->buf1, authoritative_data, len) == 0)
2290                 {
2291                         memcpy(pres, sss->buf2, Min(max_prefix_bytes, sss->last_len2));
2292                         /* No change affecting cardinality, so no hashing required */
2293                         goto done;
2294                 }
2295
2296                 memcpy(sss->buf1, authoritative_data, len);
2297
2298                 /*
2299                  * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated strings.
2300                  */
2301                 sss->buf1[len] = '\0';
2302                 sss->last_len1 = len;
2303
2304                 if (pg_strxfrm_prefix_enabled(sss->locale))
2305                 {
2306                         if (sss->buflen2 < max_prefix_bytes)
2307                         {
2308                                 sss->buflen2 = Max(max_prefix_bytes,
2309                                                                    Min(sss->buflen2 * 2, MaxAllocSize));
2310                                 sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2311                         }
2312
2313                         bsize = pg_strxfrm_prefix(sss->buf2, sss->buf1,
2314                                                                           max_prefix_bytes, sss->locale);
2315                         sss->last_len2 = bsize;
2316                 }
2317                 else
2318                 {
2319                         /*
2320                          * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
2321                          * again.  The pg_strxfrm() function leaves the result buffer
2322                          * content undefined if the result did not fit, so we need to
2323                          * retry until everything fits, even though we only need the first
2324                          * few bytes in the end.
2325                          */
2326                         for (;;)
2327                         {
2328                                 bsize = pg_strxfrm(sss->buf2, sss->buf1, sss->buflen2,
2329                                                                    sss->locale);
2330
2331                                 sss->last_len2 = bsize;
2332                                 if (bsize < sss->buflen2)
2333                                         break;
2334
2335                                 /*
2336                                  * Grow buffer and retry.
2337                                  */
2338                                 sss->buflen2 = Max(bsize + 1,
2339                                                                    Min(sss->buflen2 * 2, MaxAllocSize));
2340                                 sss->buf2 = repalloc(sss->buf2, sss->buflen2);
2341                         }
2342                 }
2343
2344                 /*
2345                  * Every Datum byte is always compared.  This is safe because the
2346                  * strxfrm() blob is itself NUL terminated, leaving no danger of
2347                  * misinterpreting any NUL bytes not intended to be interpreted as
2348                  * logically representing termination.
2349                  *
2350                  * (Actually, even if there were NUL bytes in the blob it would be
2351                  * okay.  See remarks on bytea case above.)
2352                  */
2353                 memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
2354         }
2355
2356         /*
2357          * Maintain approximate cardinality of both abbreviated keys and original,
2358          * authoritative keys using HyperLogLog.  Used as cheap insurance against
2359          * the worst case, where we do many string transformations for no saving
2360          * in full strcoll()-based comparisons.  These statistics are used by
2361          * varstr_abbrev_abort().
2362          *
2363          * First, Hash key proper, or a significant fraction of it.  Mix in length
2364          * in order to compensate for cases where differences are past
2365          * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2366          */
2367         hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2368                                                                    Min(len, PG_CACHE_LINE_SIZE)));
2369
2370         if (len > PG_CACHE_LINE_SIZE)
2371                 hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2372
2373         addHyperLogLog(&sss->full_card, hash);
2374
2375         /* Hash abbreviated key */
2376 #if SIZEOF_DATUM == 8
2377         {
2378                 uint32          lohalf,
2379                                         hihalf;
2380
2381                 lohalf = (uint32) res;
2382                 hihalf = (uint32) (res >> 32);
2383                 hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2384         }
2385 #else                                                   /* SIZEOF_DATUM != 8 */
2386         hash = DatumGetUInt32(hash_uint32((uint32) res));
2387 #endif
2388
2389         addHyperLogLog(&sss->abbr_card, hash);
2390
2391         /* Cache result, perhaps saving an expensive strxfrm() call next time */
2392         sss->cache_blob = true;
2393 done:
2394
2395         /*
2396          * Byteswap on little-endian machines.
2397          *
2398          * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
2399          * 3-way comparator) works correctly on all platforms.  If we didn't do
2400          * this, the comparator would have to call memcmp() with a pair of
2401          * pointers to the first byte of each abbreviated key, which is slower.
2402          */
2403         res = DatumBigEndianToNative(res);
2404
2405         /* Don't leak memory here */
2406         if (PointerGetDatum(authoritative) != original)
2407                 pfree(authoritative);
2408
2409         return res;
2410 }
2411
2412 /*
2413  * Callback for estimating effectiveness of abbreviated key optimization, using
2414  * heuristic rules.  Returns value indicating if the abbreviation optimization
2415  * should be aborted, based on its projected effectiveness.
2416  */
2417 static bool
2418 varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2419 {
2420         VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2421         double          abbrev_distinct,
2422                                 key_distinct;
2423
2424         Assert(ssup->abbreviate);
2425
2426         /* Have a little patience */
2427         if (memtupcount < 100)
2428                 return false;
2429
2430         abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2431         key_distinct = estimateHyperLogLog(&sss->full_card);
2432
2433         /*
2434          * Clamp cardinality estimates to at least one distinct value.  While
2435          * NULLs are generally disregarded, if only NULL values were seen so far,
2436          * that might misrepresent costs if we failed to clamp.
2437          */
2438         if (abbrev_distinct <= 1.0)
2439                 abbrev_distinct = 1.0;
2440
2441         if (key_distinct <= 1.0)
2442                 key_distinct = 1.0;
2443
2444         /*
2445          * In the worst case all abbreviated keys are identical, while at the same
2446          * time there are differences within full key strings not captured in
2447          * abbreviations.
2448          */
2449         if (trace_sort)
2450         {
2451                 double          norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2452
2453                 elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2454                          "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2455                          memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2456                          sss->prop_card);
2457         }
2458
2459         /*
2460          * If the number of distinct abbreviated keys approximately matches the
2461          * number of distinct authoritative original keys, that's reason enough to
2462          * proceed.  We can win even with a very low cardinality set if most
2463          * tie-breakers only memcmp().  This is by far the most important
2464          * consideration.
2465          *
2466          * While comparisons that are resolved at the abbreviated key level are
2467          * considerably cheaper than tie-breakers resolved with memcmp(), both of
2468          * those two outcomes are so much cheaper than a full strcoll() once
2469          * sorting is underway that it doesn't seem worth it to weigh abbreviated
2470          * cardinality against the overall size of the set in order to more
2471          * accurately model costs.  Assume that an abbreviated comparison, and an
2472          * abbreviated comparison with a cheap memcmp()-based authoritative
2473          * resolution are equivalent.
2474          */
2475         if (abbrev_distinct > key_distinct * sss->prop_card)
2476         {
2477                 /*
2478                  * When we have exceeded 10,000 tuples, decay required cardinality
2479                  * aggressively for next call.
2480                  *
2481                  * This is useful because the number of comparisons required on
2482                  * average increases at a linearithmic rate, and at roughly 10,000
2483                  * tuples that factor will start to dominate over the linear costs of
2484                  * string transformation (this is a conservative estimate).  The decay
2485                  * rate is chosen to be a little less aggressive than halving -- which
2486                  * (since we're called at points at which memtupcount has doubled)
2487                  * would never see the cost model actually abort past the first call
2488                  * following a decay.  This decay rate is mostly a precaution against
2489                  * a sudden, violent swing in how well abbreviated cardinality tracks
2490                  * full key cardinality.  The decay also serves to prevent a marginal
2491                  * case from being aborted too late, when too much has already been
2492                  * invested in string transformation.
2493                  *
2494                  * It's possible for sets of several million distinct strings with
2495                  * mere tens of thousands of distinct abbreviated keys to still
2496                  * benefit very significantly.  This will generally occur provided
2497                  * each abbreviated key is a proxy for a roughly uniform number of the
2498                  * set's full keys. If it isn't so, we hope to catch that early and
2499                  * abort.  If it isn't caught early, by the time the problem is
2500                  * apparent it's probably not worth aborting.
2501                  */
2502                 if (memtupcount > 10000)
2503                         sss->prop_card *= 0.65;
2504
2505                 return false;
2506         }
2507
2508         /*
2509          * Abort abbreviation strategy.
2510          *
2511          * The worst case, where all abbreviated keys are identical while all
2512          * original strings differ will typically only see a regression of about
2513          * 10% in execution time for small to medium sized lists of strings.
2514          * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2515          * often expect very large improvements, particularly with sets of strings
2516          * of moderately high to high abbreviated cardinality.  There is little to
2517          * lose but much to gain, which our strategy reflects.
2518          */
2519         if (trace_sort)
2520                 elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2521                          "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2522                          memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2523
2524         return true;
2525 }
2526
2527 /*
2528  * Generic equalimage support function for character type's operator classes.
2529  * Disables the use of deduplication with nondeterministic collations.
2530  */
2531 Datum
2532 btvarstrequalimage(PG_FUNCTION_ARGS)
2533 {
2534         /* Oid          opcintype = PG_GETARG_OID(0); */
2535         Oid                     collid = PG_GET_COLLATION();
2536         pg_locale_t locale;
2537
2538         check_collation_set(collid);
2539
2540         locale = pg_newlocale_from_collation(collid);
2541
2542         PG_RETURN_BOOL(locale->deterministic);
2543 }
2544
2545 Datum
2546 text_larger(PG_FUNCTION_ARGS)
2547 {
2548         text       *arg1 = PG_GETARG_TEXT_PP(0);
2549         text       *arg2 = PG_GETARG_TEXT_PP(1);
2550         text       *result;
2551
2552         result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2553
2554         PG_RETURN_TEXT_P(result);
2555 }
2556
2557 Datum
2558 text_smaller(PG_FUNCTION_ARGS)
2559 {
2560         text       *arg1 = PG_GETARG_TEXT_PP(0);
2561         text       *arg2 = PG_GETARG_TEXT_PP(1);
2562         text       *result;
2563
2564         result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2565
2566         PG_RETURN_TEXT_P(result);
2567 }
2568
2569
2570 /*
2571  * Cross-type comparison functions for types text and name.
2572  */
2573
2574 Datum
2575 nameeqtext(PG_FUNCTION_ARGS)
2576 {
2577         Name            arg1 = PG_GETARG_NAME(0);
2578         text       *arg2 = PG_GETARG_TEXT_PP(1);
2579         size_t          len1 = strlen(NameStr(*arg1));
2580         size_t          len2 = VARSIZE_ANY_EXHDR(arg2);
2581         Oid                     collid = PG_GET_COLLATION();
2582         bool            result;
2583
2584         check_collation_set(collid);
2585
2586         if (collid == C_COLLATION_OID)
2587                 result = (len1 == len2 &&
2588                                   memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2589         else
2590                 result = (varstr_cmp(NameStr(*arg1), len1,
2591                                                          VARDATA_ANY(arg2), len2,
2592                                                          collid) == 0);
2593
2594         PG_FREE_IF_COPY(arg2, 1);
2595
2596         PG_RETURN_BOOL(result);
2597 }
2598
2599 Datum
2600 texteqname(PG_FUNCTION_ARGS)
2601 {
2602         text       *arg1 = PG_GETARG_TEXT_PP(0);
2603         Name            arg2 = PG_GETARG_NAME(1);
2604         size_t          len1 = VARSIZE_ANY_EXHDR(arg1);
2605         size_t          len2 = strlen(NameStr(*arg2));
2606         Oid                     collid = PG_GET_COLLATION();
2607         bool            result;
2608
2609         check_collation_set(collid);
2610
2611         if (collid == C_COLLATION_OID)
2612                 result = (len1 == len2 &&
2613                                   memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2614         else
2615                 result = (varstr_cmp(VARDATA_ANY(arg1), len1,
2616                                                          NameStr(*arg2), len2,
2617                                                          collid) == 0);
2618
2619         PG_FREE_IF_COPY(arg1, 0);
2620
2621         PG_RETURN_BOOL(result);
2622 }
2623
2624 Datum
2625 namenetext(PG_FUNCTION_ARGS)
2626 {
2627         Name            arg1 = PG_GETARG_NAME(0);
2628         text       *arg2 = PG_GETARG_TEXT_PP(1);
2629         size_t          len1 = strlen(NameStr(*arg1));
2630         size_t          len2 = VARSIZE_ANY_EXHDR(arg2);
2631         Oid                     collid = PG_GET_COLLATION();
2632         bool            result;
2633
2634         check_collation_set(collid);
2635
2636         if (collid == C_COLLATION_OID)
2637                 result = !(len1 == len2 &&
2638                                    memcmp(NameStr(*arg1), VARDATA_ANY(arg2), len1) == 0);
2639         else
2640                 result = !(varstr_cmp(NameStr(*arg1), len1,
2641                                                           VARDATA_ANY(arg2), len2,
2642                                                           collid) == 0);
2643
2644         PG_FREE_IF_COPY(arg2, 1);
2645
2646         PG_RETURN_BOOL(result);
2647 }
2648
2649 Datum
2650 textnename(PG_FUNCTION_ARGS)
2651 {
2652         text       *arg1 = PG_GETARG_TEXT_PP(0);
2653         Name            arg2 = PG_GETARG_NAME(1);
2654         size_t          len1 = VARSIZE_ANY_EXHDR(arg1);
2655         size_t          len2 = strlen(NameStr(*arg2));
2656         Oid                     collid = PG_GET_COLLATION();
2657         bool            result;
2658
2659         check_collation_set(collid);
2660
2661         if (collid == C_COLLATION_OID)
2662                 result = !(len1 == len2 &&
2663                                    memcmp(VARDATA_ANY(arg1), NameStr(*arg2), len1) == 0);
2664         else
2665                 result = !(varstr_cmp(VARDATA_ANY(arg1), len1,
2666                                                           NameStr(*arg2), len2,
2667                                                           collid) == 0);
2668
2669         PG_FREE_IF_COPY(arg1, 0);
2670
2671         PG_RETURN_BOOL(result);
2672 }
2673
2674 Datum
2675 btnametextcmp(PG_FUNCTION_ARGS)
2676 {
2677         Name            arg1 = PG_GETARG_NAME(0);
2678         text       *arg2 = PG_GETARG_TEXT_PP(1);
2679         int32           result;
2680
2681         result = varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
2682                                                 VARDATA_ANY(arg2), VARSIZE_ANY_EXHDR(arg2),
2683                                                 PG_GET_COLLATION());
2684
2685         PG_FREE_IF_COPY(arg2, 1);
2686
2687         PG_RETURN_INT32(result);
2688 }
2689
2690 Datum
2691 bttextnamecmp(PG_FUNCTION_ARGS)
2692 {
2693         text       *arg1 = PG_GETARG_TEXT_PP(0);
2694         Name            arg2 = PG_GETARG_NAME(1);
2695         int32           result;
2696
2697         result = varstr_cmp(VARDATA_ANY(arg1), VARSIZE_ANY_EXHDR(arg1),
2698                                                 NameStr(*arg2), strlen(NameStr(*arg2)),
2699                                                 PG_GET_COLLATION());
2700
2701         PG_FREE_IF_COPY(arg1, 0);
2702
2703         PG_RETURN_INT32(result);
2704 }
2705
2706 #define CmpCall(cmpfunc) \
2707         DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
2708                                                                                   PG_GET_COLLATION(), \
2709                                                                                   PG_GETARG_DATUM(0), \
2710                                                                                   PG_GETARG_DATUM(1)))
2711
2712 Datum
2713 namelttext(PG_FUNCTION_ARGS)
2714 {
2715         PG_RETURN_BOOL(CmpCall(btnametextcmp) < 0);
2716 }
2717
2718 Datum
2719 nameletext(PG_FUNCTION_ARGS)
2720 {
2721         PG_RETURN_BOOL(CmpCall(btnametextcmp) <= 0);
2722 }
2723
2724 Datum
2725 namegttext(PG_FUNCTION_ARGS)
2726 {
2727         PG_RETURN_BOOL(CmpCall(btnametextcmp) > 0);
2728 }
2729
2730 Datum
2731 namegetext(PG_FUNCTION_ARGS)
2732 {
2733         PG_RETURN_BOOL(CmpCall(btnametextcmp) >= 0);
2734 }
2735
2736 Datum
2737 textltname(PG_FUNCTION_ARGS)
2738 {
2739         PG_RETURN_BOOL(CmpCall(bttextnamecmp) < 0);
2740 }
2741
2742 Datum
2743 textlename(PG_FUNCTION_ARGS)
2744 {
2745         PG_RETURN_BOOL(CmpCall(bttextnamecmp) <= 0);
2746 }
2747
2748 Datum
2749 textgtname(PG_FUNCTION_ARGS)
2750 {
2751         PG_RETURN_BOOL(CmpCall(bttextnamecmp) > 0);
2752 }
2753
2754 Datum
2755 textgename(PG_FUNCTION_ARGS)
2756 {
2757         PG_RETURN_BOOL(CmpCall(bttextnamecmp) >= 0);
2758 }
2759
2760 #undef CmpCall
2761
2762
2763 /*
2764  * The following operators support character-by-character comparison
2765  * of text datums, to allow building indexes suitable for LIKE clauses.
2766  * Note that the regular texteq/textne comparison operators, and regular
2767  * support functions 1 and 2 with "C" collation are assumed to be
2768  * compatible with these!
2769  */
2770
2771 static int
2772 internal_text_pattern_compare(text *arg1, text *arg2)
2773 {
2774         int                     result;
2775         int                     len1,
2776                                 len2;
2777
2778         len1 = VARSIZE_ANY_EXHDR(arg1);
2779         len2 = VARSIZE_ANY_EXHDR(arg2);
2780
2781         result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2782         if (result != 0)
2783                 return result;
2784         else if (len1 < len2)
2785                 return -1;
2786         else if (len1 > len2)
2787                 return 1;
2788         else
2789                 return 0;
2790 }
2791
2792
2793 Datum
2794 text_pattern_lt(PG_FUNCTION_ARGS)
2795 {
2796         text       *arg1 = PG_GETARG_TEXT_PP(0);
2797         text       *arg2 = PG_GETARG_TEXT_PP(1);
2798         int                     result;
2799
2800         result = internal_text_pattern_compare(arg1, arg2);
2801
2802         PG_FREE_IF_COPY(arg1, 0);
2803         PG_FREE_IF_COPY(arg2, 1);
2804
2805         PG_RETURN_BOOL(result < 0);
2806 }
2807
2808
2809 Datum
2810 text_pattern_le(PG_FUNCTION_ARGS)
2811 {
2812         text       *arg1 = PG_GETARG_TEXT_PP(0);
2813         text       *arg2 = PG_GETARG_TEXT_PP(1);
2814         int                     result;
2815
2816         result = internal_text_pattern_compare(arg1, arg2);
2817
2818         PG_FREE_IF_COPY(arg1, 0);
2819         PG_FREE_IF_COPY(arg2, 1);
2820
2821         PG_RETURN_BOOL(result <= 0);
2822 }
2823
2824
2825 Datum
2826 text_pattern_ge(PG_FUNCTION_ARGS)
2827 {
2828         text       *arg1 = PG_GETARG_TEXT_PP(0);
2829         text       *arg2 = PG_GETARG_TEXT_PP(1);
2830         int                     result;
2831
2832         result = internal_text_pattern_compare(arg1, arg2);
2833
2834         PG_FREE_IF_COPY(arg1, 0);
2835         PG_FREE_IF_COPY(arg2, 1);
2836
2837         PG_RETURN_BOOL(result >= 0);
2838 }
2839
2840
2841 Datum
2842 text_pattern_gt(PG_FUNCTION_ARGS)
2843 {
2844         text       *arg1 = PG_GETARG_TEXT_PP(0);
2845         text       *arg2 = PG_GETARG_TEXT_PP(1);
2846         int                     result;
2847
2848         result = internal_text_pattern_compare(arg1, arg2);
2849
2850         PG_FREE_IF_COPY(arg1, 0);
2851         PG_FREE_IF_COPY(arg2, 1);
2852
2853         PG_RETURN_BOOL(result > 0);
2854 }
2855
2856
2857 Datum
2858 bttext_pattern_cmp(PG_FUNCTION_ARGS)
2859 {
2860         text       *arg1 = PG_GETARG_TEXT_PP(0);
2861         text       *arg2 = PG_GETARG_TEXT_PP(1);
2862         int                     result;
2863
2864         result = internal_text_pattern_compare(arg1, arg2);
2865
2866         PG_FREE_IF_COPY(arg1, 0);
2867         PG_FREE_IF_COPY(arg2, 1);
2868
2869         PG_RETURN_INT32(result);
2870 }
2871
2872
2873 Datum
2874 bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
2875 {
2876         SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
2877         MemoryContext oldcontext;
2878
2879         oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2880
2881         /* Use generic string SortSupport, forcing "C" collation */
2882         varstr_sortsupport(ssup, TEXTOID, C_COLLATION_OID);
2883
2884         MemoryContextSwitchTo(oldcontext);
2885
2886         PG_RETURN_VOID();
2887 }
2888
2889
2890 /*-------------------------------------------------------------
2891  * byteaoctetlen
2892  *
2893  * get the number of bytes contained in an instance of type 'bytea'
2894  *-------------------------------------------------------------
2895  */
2896 Datum
2897 byteaoctetlen(PG_FUNCTION_ARGS)
2898 {
2899         Datum           str = PG_GETARG_DATUM(0);
2900
2901         /* We need not detoast the input at all */
2902         PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
2903 }
2904
2905 /*
2906  * byteacat -
2907  *        takes two bytea* and returns a bytea* that is the concatenation of
2908  *        the two.
2909  *
2910  * Cloned from textcat and modified as required.
2911  */
2912 Datum
2913 byteacat(PG_FUNCTION_ARGS)
2914 {
2915         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
2916         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
2917
2918         PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
2919 }
2920
2921 /*
2922  * bytea_catenate
2923  *      Guts of byteacat(), broken out so it can be used by other functions
2924  *
2925  * Arguments can be in short-header form, but not compressed or out-of-line
2926  */
2927 static bytea *
2928 bytea_catenate(bytea *t1, bytea *t2)
2929 {
2930         bytea      *result;
2931         int                     len1,
2932                                 len2,
2933                                 len;
2934         char       *ptr;
2935
2936         len1 = VARSIZE_ANY_EXHDR(t1);
2937         len2 = VARSIZE_ANY_EXHDR(t2);
2938
2939         /* paranoia ... probably should throw error instead? */
2940         if (len1 < 0)
2941                 len1 = 0;
2942         if (len2 < 0)
2943                 len2 = 0;
2944
2945         len = len1 + len2 + VARHDRSZ;
2946         result = (bytea *) palloc(len);
2947
2948         /* Set size of result string... */
2949         SET_VARSIZE(result, len);
2950
2951         /* Fill data field of result string... */
2952         ptr = VARDATA(result);
2953         if (len1 > 0)
2954                 memcpy(ptr, VARDATA_ANY(t1), len1);
2955         if (len2 > 0)
2956                 memcpy(ptr + len1, VARDATA_ANY(t2), len2);
2957
2958         return result;
2959 }
2960
/*
 * PG_STR_GET_BYTEA
 *		Turn a C string into a bytea by passing it through byteain().
 */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, \
										CStringGetDatum((str_))))
2963
2964 /*
2965  * bytea_substr()
2966  * Return a substring starting at the specified position.
2967  * Cloned from text_substr and modified as required.
2968  *
2969  * Input:
2970  *      - string
2971  *      - starting position (is one-based)
2972  *      - string length (optional)
2973  *
2974  * If the starting position is zero or less, then return from the start of the string
2975  * adjusting the length to be consistent with the "negative start" per SQL.
2976  * If the length is less than zero, an ERROR is thrown. If no third argument
2977  * (length) is provided, the length to the end of the string is assumed.
2978  */
2979 Datum
2980 bytea_substr(PG_FUNCTION_ARGS)
2981 {
2982         PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2983                                                                           PG_GETARG_INT32(1),
2984                                                                           PG_GETARG_INT32(2),
2985                                                                           false));
2986 }
2987
2988 /*
2989  * bytea_substr_no_len -
2990  *        Wrapper to avoid opr_sanity failure due to
2991  *        one function accepting a different number of args.
2992  */
2993 Datum
2994 bytea_substr_no_len(PG_FUNCTION_ARGS)
2995 {
2996         PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2997                                                                           PG_GETARG_INT32(1),
2998                                                                           -1,
2999                                                                           true));
3000 }
3001
3002 static bytea *
3003 bytea_substring(Datum str,
3004                                 int S,
3005                                 int L,
3006                                 bool length_not_specified)
3007 {
3008         int32           S1;                             /* adjusted start position */
3009         int32           L1;                             /* adjusted substring length */
3010         int32           E;                              /* end position */
3011
3012         /*
3013          * The logic here should generally match text_substring().
3014          */
3015         S1 = Max(S, 1);
3016
3017         if (length_not_specified)
3018         {
3019                 /*
3020                  * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3021                  * end of the string if we pass it a negative value for length.
3022                  */
3023                 L1 = -1;
3024         }
3025         else if (L < 0)
3026         {
3027                 /* SQL99 says to throw an error for E < S, i.e., negative length */
3028                 ereport(ERROR,
3029                                 (errcode(ERRCODE_SUBSTRING_ERROR),
3030                                  errmsg("negative substring length not allowed")));
3031                 L1 = -1;                                /* silence stupider compilers */
3032         }
3033         else if (pg_add_s32_overflow(S, L, &E))
3034         {
3035                 /*
3036                  * L could be large enough for S + L to overflow, in which case the
3037                  * substring must run to end of string.
3038                  */
3039                 L1 = -1;
3040         }
3041         else
3042         {
3043                 /*
3044                  * A zero or negative value for the end position can happen if the
3045                  * start was negative or one. SQL99 says to return a zero-length
3046                  * string.
3047                  */
3048                 if (E < 1)
3049                         return PG_STR_GET_BYTEA("");
3050
3051                 L1 = E - S1;
3052         }
3053
3054         /*
3055          * If the start position is past the end of the string, SQL99 says to
3056          * return a zero-length string -- DatumGetByteaPSlice() will do that for
3057          * us.  We need only convert S1 to zero-based starting position.
3058          */
3059         return DatumGetByteaPSlice(str, S1 - 1, L1);
3060 }
3061
3062 /*
3063  * byteaoverlay
3064  *      Replace specified substring of first string with second
3065  *
3066  * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3067  * This code is a direct implementation of what the standard says.
3068  */
3069 Datum
3070 byteaoverlay(PG_FUNCTION_ARGS)
3071 {
3072         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
3073         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
3074         int                     sp = PG_GETARG_INT32(2);        /* substring start position */
3075         int                     sl = PG_GETARG_INT32(3);        /* substring length */
3076
3077         PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3078 }
3079
3080 Datum
3081 byteaoverlay_no_len(PG_FUNCTION_ARGS)
3082 {
3083         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
3084         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
3085         int                     sp = PG_GETARG_INT32(2);        /* substring start position */
3086         int                     sl;
3087
3088         sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
3089         PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
3090 }
3091
3092 static bytea *
3093 bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
3094 {
3095         bytea      *result;
3096         bytea      *s1;
3097         bytea      *s2;
3098         int                     sp_pl_sl;
3099
3100         /*
3101          * Check for possible integer-overflow cases.  For negative sp, throw a
3102          * "substring length" error because that's what should be expected
3103          * according to the spec's definition of OVERLAY().
3104          */
3105         if (sp <= 0)
3106                 ereport(ERROR,
3107                                 (errcode(ERRCODE_SUBSTRING_ERROR),
3108                                  errmsg("negative substring length not allowed")));
3109         if (pg_add_s32_overflow(sp, sl, &sp_pl_sl))
3110                 ereport(ERROR,
3111                                 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
3112                                  errmsg("integer out of range")));
3113
3114         s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
3115         s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
3116         result = bytea_catenate(s1, t2);
3117         result = bytea_catenate(result, s2);
3118
3119         return result;
3120 }
3121
3122 /*
3123  * bit_count
3124  */
3125 Datum
3126 bytea_bit_count(PG_FUNCTION_ARGS)
3127 {
3128         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
3129
3130         PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1), VARSIZE_ANY_EXHDR(t1)));
3131 }
3132
3133 /*
3134  * byteapos -
3135  *        Return the position of the specified substring.
3136  *        Implements the SQL POSITION() function.
3137  * Cloned from textpos and modified as required.
3138  */
3139 Datum
3140 byteapos(PG_FUNCTION_ARGS)
3141 {
3142         bytea      *t1 = PG_GETARG_BYTEA_PP(0);
3143         bytea      *t2 = PG_GETARG_BYTEA_PP(1);
3144         int                     pos;
3145         int                     px,
3146                                 p;
3147         int                     len1,
3148                                 len2;
3149         char       *p1,
3150                            *p2;
3151
3152         len1 = VARSIZE_ANY_EXHDR(t1);
3153         len2 = VARSIZE_ANY_EXHDR(t2);
3154
3155         if (len2 <= 0)
3156                 PG_RETURN_INT32(1);             /* result for empty pattern */
3157
3158         p1 = VARDATA_ANY(t1);
3159         p2 = VARDATA_ANY(t2);
3160
3161         pos = 0;
3162         px = (len1 - len2);
3163         for (p = 0; p <= px; p++)
3164         {
3165                 if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
3166                 {
3167                         pos = p + 1;
3168                         break;
3169                 };
3170                 p1++;
3171         };
3172
3173         PG_RETURN_INT32(pos);
3174 }
3175
3176 /*-------------------------------------------------------------
3177  * byteaGetByte
3178  *
3179  * this routine treats "bytea" as an array of bytes.
3180  * It returns the Nth byte (a number between 0 and 255).
3181  *-------------------------------------------------------------
3182  */
3183 Datum
3184 byteaGetByte(PG_FUNCTION_ARGS)
3185 {
3186         bytea      *v = PG_GETARG_BYTEA_PP(0);
3187         int32           n = PG_GETARG_INT32(1);
3188         int                     len;
3189         int                     byte;
3190
3191         len = VARSIZE_ANY_EXHDR(v);
3192
3193         if (n < 0 || n >= len)
3194                 ereport(ERROR,
3195                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3196                                  errmsg("index %d out of valid range, 0..%d",
3197                                                 n, len - 1)));
3198
3199         byte = ((unsigned char *) VARDATA_ANY(v))[n];
3200
3201         PG_RETURN_INT32(byte);
3202 }
3203
3204 /*-------------------------------------------------------------
3205  * byteaGetBit
3206  *
3207  * This routine treats a "bytea" type like an array of bits.
3208  * It returns the value of the Nth bit (0 or 1).
3209  *
3210  *-------------------------------------------------------------
3211  */
3212 Datum
3213 byteaGetBit(PG_FUNCTION_ARGS)
3214 {
3215         bytea      *v = PG_GETARG_BYTEA_PP(0);
3216         int64           n = PG_GETARG_INT64(1);
3217         int                     byteNo,
3218                                 bitNo;
3219         int                     len;
3220         int                     byte;
3221
3222         len = VARSIZE_ANY_EXHDR(v);
3223
3224         if (n < 0 || n >= (int64) len * 8)
3225                 ereport(ERROR,
3226                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3227                                  errmsg("index %lld out of valid range, 0..%lld",
3228                                                 (long long) n, (long long) len * 8 - 1)));
3229
3230         /* n/8 is now known < len, so safe to cast to int */
3231         byteNo = (int) (n / 8);
3232         bitNo = (int) (n % 8);
3233
3234         byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3235
3236         if (byte & (1 << bitNo))
3237                 PG_RETURN_INT32(1);
3238         else
3239                 PG_RETURN_INT32(0);
3240 }
3241
3242 /*-------------------------------------------------------------
3243  * byteaSetByte
3244  *
3245  * Given an instance of type 'bytea' creates a new one with
3246  * the Nth byte set to the given value.
3247  *
3248  *-------------------------------------------------------------
3249  */
3250 Datum
3251 byteaSetByte(PG_FUNCTION_ARGS)
3252 {
3253         bytea      *res = PG_GETARG_BYTEA_P_COPY(0);
3254         int32           n = PG_GETARG_INT32(1);
3255         int32           newByte = PG_GETARG_INT32(2);
3256         int                     len;
3257
3258         len = VARSIZE(res) - VARHDRSZ;
3259
3260         if (n < 0 || n >= len)
3261                 ereport(ERROR,
3262                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3263                                  errmsg("index %d out of valid range, 0..%d",
3264                                                 n, len - 1)));
3265
3266         /*
3267          * Now set the byte.
3268          */
3269         ((unsigned char *) VARDATA(res))[n] = newByte;
3270
3271         PG_RETURN_BYTEA_P(res);
3272 }
3273
3274 /*-------------------------------------------------------------
3275  * byteaSetBit
3276  *
3277  * Given an instance of type 'bytea' creates a new one with
3278  * the Nth bit set to the given value.
3279  *
3280  *-------------------------------------------------------------
3281  */
3282 Datum
3283 byteaSetBit(PG_FUNCTION_ARGS)
3284 {
3285         bytea      *res = PG_GETARG_BYTEA_P_COPY(0);
3286         int64           n = PG_GETARG_INT64(1);
3287         int32           newBit = PG_GETARG_INT32(2);
3288         int                     len;
3289         int                     oldByte,
3290                                 newByte;
3291         int                     byteNo,
3292                                 bitNo;
3293
3294         len = VARSIZE(res) - VARHDRSZ;
3295
3296         if (n < 0 || n >= (int64) len * 8)
3297                 ereport(ERROR,
3298                                 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3299                                  errmsg("index %lld out of valid range, 0..%lld",
3300                                                 (long long) n, (long long) len * 8 - 1)));
3301
3302         /* n/8 is now known < len, so safe to cast to int */
3303         byteNo = (int) (n / 8);
3304         bitNo = (int) (n % 8);
3305
3306         /*
3307          * sanity check!
3308          */
3309         if (newBit != 0 && newBit != 1)
3310                 ereport(ERROR,
3311                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3312                                  errmsg("new bit must be 0 or 1")));
3313
3314         /*
3315          * Update the byte.
3316          */
3317         oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3318
3319         if (newBit == 0)
3320                 newByte = oldByte & (~(1 << bitNo));
3321         else
3322                 newByte = oldByte | (1 << bitNo);
3323
3324         ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3325
3326         PG_RETURN_BYTEA_P(res);
3327 }
3328
3329
3330 /* text_name()
3331  * Converts a text type to a Name type.
3332  */
3333 Datum
3334 text_name(PG_FUNCTION_ARGS)
3335 {
3336         text       *s = PG_GETARG_TEXT_PP(0);
3337         Name            result;
3338         int                     len;
3339
3340         len = VARSIZE_ANY_EXHDR(s);
3341
3342         /* Truncate oversize input */
3343         if (len >= NAMEDATALEN)
3344                 len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3345
3346         /* We use palloc0 here to ensure result is zero-padded */
3347         result = (Name) palloc0(NAMEDATALEN);
3348         memcpy(NameStr(*result), VARDATA_ANY(s), len);
3349
3350         PG_RETURN_NAME(result);
3351 }
3352
3353 /* name_text()
3354  * Converts a Name type to a text type.
3355  */
3356 Datum
3357 name_text(PG_FUNCTION_ARGS)
3358 {
3359         Name            s = PG_GETARG_NAME(0);
3360
3361         PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3362 }
3363
3364
3365 /*
3366  * textToQualifiedNameList - convert a text object to list of names
3367  *
3368  * This implements the input parsing needed by nextval() and other
3369  * functions that take a text parameter representing a qualified name.
3370  * We split the name at dots, downcase if not double-quoted, and
3371  * truncate names if they're too long.
3372  */
3373 List *
3374 textToQualifiedNameList(text *textval)
3375 {
3376         char       *rawname;
3377         List       *result = NIL;
3378         List       *namelist;
3379         ListCell   *l;
3380
3381         /* Convert to C string (handles possible detoasting). */
3382         /* Note we rely on being able to modify rawname below. */
3383         rawname = text_to_cstring(textval);
3384
3385         if (!SplitIdentifierString(rawname, '.', &namelist))
3386                 ereport(ERROR,
3387                                 (errcode(ERRCODE_INVALID_NAME),
3388                                  errmsg("invalid name syntax")));
3389
3390         if (namelist == NIL)
3391                 ereport(ERROR,
3392                                 (errcode(ERRCODE_INVALID_NAME),
3393                                  errmsg("invalid name syntax")));
3394
3395         foreach(l, namelist)
3396         {
3397                 char       *curname = (char *) lfirst(l);
3398
3399                 result = lappend(result, makeString(pstrdup(curname)));
3400         }
3401
3402         pfree(rawname);
3403         list_free(namelist);
3404
3405         return result;
3406 }
3407
3408 /*
3409  * SplitIdentifierString --- parse a string containing identifiers
3410  *
3411  * This is the guts of textToQualifiedNameList, and is exported for use in
3412  * other situations such as parsing GUC variables.  In the GUC case, it's
3413  * important to avoid memory leaks, so the API is designed to minimize the
3414  * amount of stuff that needs to be allocated and freed.
3415  *
3416  * Inputs:
3417  *      rawstring: the input string; must be overwritable!      On return, it's
3418  *                         been modified to contain the separated identifiers.
3419  *      separator: the separator punctuation expected between identifiers
3420  *                         (typically '.' or ',').  Whitespace may also appear around
3421  *                         identifiers.
3422  * Outputs:
3423  *      namelist: filled with a palloc'd list of pointers to identifiers within
3424  *                        rawstring.  Caller should list_free() this even on error return.
3425  *
3426  * Returns true if okay, false if there is a syntax error in the string.
3427  *
3428  * Note that an empty string is considered okay here, though not in
3429  * textToQualifiedNameList.
3430  */
3431 bool
3432 SplitIdentifierString(char *rawstring, char separator,
3433                                           List **namelist)
3434 {
3435         char       *nextp = rawstring;
3436         bool            done = false;
3437
3438         *namelist = NIL;
3439
3440         while (scanner_isspace(*nextp))
3441                 nextp++;                                /* skip leading whitespace */
3442
3443         if (*nextp == '\0')
3444                 return true;                    /* allow empty string */
3445
3446         /* At the top of the loop, we are at start of a new identifier. */
3447         do
3448         {
3449                 char       *curname;
3450                 char       *endp;
3451
3452                 if (*nextp == '"')
3453                 {
3454                         /* Quoted name --- collapse quote-quote pairs, no downcasing */
3455                         curname = nextp + 1;
3456                         for (;;)
3457                         {
3458                                 endp = strchr(nextp + 1, '"');
3459                                 if (endp == NULL)
3460                                         return false;   /* mismatched quotes */
3461                                 if (endp[1] != '"')
3462                                         break;          /* found end of quoted name */
3463                                 /* Collapse adjacent quotes into one quote, and look again */
3464                                 memmove(endp, endp + 1, strlen(endp));
3465                                 nextp = endp;
3466                         }
3467                         /* endp now points at the terminating quote */
3468                         nextp = endp + 1;
3469                 }
3470                 else
3471                 {
3472                         /* Unquoted name --- extends to separator or whitespace */
3473                         char       *downname;
3474                         int                     len;
3475
3476                         curname = nextp;
3477                         while (*nextp && *nextp != separator &&
3478                                    !scanner_isspace(*nextp))
3479                                 nextp++;
3480                         endp = nextp;
3481                         if (curname == nextp)
3482                                 return false;   /* empty unquoted name not allowed */
3483
3484                         /*
3485                          * Downcase the identifier, using same code as main lexer does.
3486                          *
3487                          * XXX because we want to overwrite the input in-place, we cannot
3488                          * support a downcasing transformation that increases the string
3489                          * length.  This is not a problem given the current implementation
3490                          * of downcase_truncate_identifier, but we'll probably have to do
3491                          * something about this someday.
3492                          */
3493                         len = endp - curname;
3494                         downname = downcase_truncate_identifier(curname, len, false);
3495                         Assert(strlen(downname) <= len);
3496                         strncpy(curname, downname, len);        /* strncpy is required here */
3497                         pfree(downname);
3498                 }
3499
3500                 while (scanner_isspace(*nextp))
3501                         nextp++;                        /* skip trailing whitespace */
3502
3503                 if (*nextp == separator)
3504                 {
3505                         nextp++;
3506                         while (scanner_isspace(*nextp))
3507                                 nextp++;                /* skip leading whitespace for next */
3508                         /* we expect another name, so done remains false */
3509                 }
3510                 else if (*nextp == '\0')
3511                         done = true;
3512                 else
3513                         return false;           /* invalid syntax */
3514
3515                 /* Now safe to overwrite separator with a null */
3516                 *endp = '\0';
3517
3518                 /* Truncate name if it's overlength */
3519                 truncate_identifier(curname, strlen(curname), false);
3520
3521                 /*
3522                  * Finished isolating current name --- add it to list
3523                  */
3524                 *namelist = lappend(*namelist, curname);
3525
3526                 /* Loop back if we didn't reach end of string */
3527         } while (!done);
3528
3529         return true;
3530 }
3531
3532
3533 /*
3534  * SplitDirectoriesString --- parse a string containing file/directory names
3535  *
3536  * This works fine on file names too; the function name is historical.
3537  *
3538  * This is similar to SplitIdentifierString, except that the parsing
3539  * rules are meant to handle pathnames instead of identifiers: there is
3540  * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3541  * and we apply canonicalize_path() to each extracted string.  Because of the
3542  * last, the returned strings are separately palloc'd rather than being
3543  * pointers into rawstring --- but we still scribble on rawstring.
3544  *
3545  * Inputs:
3546  *      rawstring: the input string; must be modifiable!
3547  *      separator: the separator punctuation expected between directories
3548  *                         (typically ',' or ';').  Whitespace may also appear around
3549  *                         directories.
3550  * Outputs:
3551  *      namelist: filled with a palloc'd list of directory names.
3552  *                        Caller should list_free_deep() this even on error return.
3553  *
3554  * Returns true if okay, false if there is a syntax error in the string.
3555  *
3556  * Note that an empty string is considered okay here.
3557  */
3558 bool
3559 SplitDirectoriesString(char *rawstring, char separator,
3560                                            List **namelist)
3561 {
3562         char       *nextp = rawstring;
3563         bool            done = false;
3564
3565         *namelist = NIL;
3566
3567         while (scanner_isspace(*nextp))
3568                 nextp++;                                /* skip leading whitespace */
3569
3570         if (*nextp == '\0')
3571                 return true;                    /* allow empty string */
3572
3573         /* At the top of the loop, we are at start of a new directory. */
3574         do
3575         {
3576                 char       *curname;
3577                 char       *endp;
3578
3579                 if (*nextp == '"')
3580                 {
3581                         /* Quoted name --- collapse quote-quote pairs */
3582                         curname = nextp + 1;
3583                         for (;;)
3584                         {
3585                                 endp = strchr(nextp + 1, '"');
3586                                 if (endp == NULL)
3587                                         return false;   /* mismatched quotes */
3588                                 if (endp[1] != '"')
3589                                         break;          /* found end of quoted name */
3590                                 /* Collapse adjacent quotes into one quote, and look again */
3591                                 memmove(endp, endp + 1, strlen(endp));
3592                                 nextp = endp;
3593                         }
3594                         /* endp now points at the terminating quote */
3595                         nextp = endp + 1;
3596                 }
3597                 else
3598                 {
3599                         /* Unquoted name --- extends to separator or end of string */
3600                         curname = endp = nextp;
3601                         while (*nextp && *nextp != separator)
3602                         {
3603                                 /* trailing whitespace should not be included in name */
3604                                 if (!scanner_isspace(*nextp))
3605                                         endp = nextp + 1;
3606                                 nextp++;
3607                         }
3608                         if (curname == endp)
3609                                 return false;   /* empty unquoted name not allowed */
3610                 }
3611
3612                 while (scanner_isspace(*nextp))
3613                         nextp++;                        /* skip trailing whitespace */
3614
3615                 if (*nextp == separator)
3616                 {
3617                         nextp++;
3618                         while (scanner_isspace(*nextp))
3619                                 nextp++;                /* skip leading whitespace for next */
3620                         /* we expect another name, so done remains false */
3621                 }
3622                 else if (*nextp == '\0')
3623                         done = true;
3624                 else
3625                         return false;           /* invalid syntax */
3626
3627                 /* Now safe to overwrite separator with a null */
3628                 *endp = '\0';
3629
3630                 /* Truncate path if it's overlength */
3631                 if (strlen(curname) >= MAXPGPATH)
3632                         curname[MAXPGPATH - 1] = '\0';
3633
3634                 /*
3635                  * Finished isolating current name --- add it to list
3636                  */
3637                 curname = pstrdup(curname);
3638                 canonicalize_path(curname);
3639                 *namelist = lappend(*namelist, curname);
3640
3641                 /* Loop back if we didn't reach end of string */
3642         } while (!done);
3643
3644         return true;
3645 }
3646
3647
3648 /*
3649  * SplitGUCList --- parse a string containing identifiers or file names
3650  *
3651  * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3652  * presuming whether the elements will be taken as identifiers or file names.
3653  * We assume the input has already been through flatten_set_variable_args(),
3654  * so that we need never downcase (if appropriate, that was done already).
3655  * Nor do we ever truncate, since we don't know the correct max length.
3656  * We disallow embedded whitespace for simplicity (it shouldn't matter,
3657  * because any embedded whitespace should have led to double-quoting).
3658  * Otherwise the API is identical to SplitIdentifierString.
3659  *
3660  * XXX it's annoying to have so many copies of this string-splitting logic.
3661  * However, it's not clear that having one function with a bunch of option
3662  * flags would be much better.
3663  *
3664  * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3665  * Be sure to update that if you have to change this.
3666  *
3667  * Inputs:
3668  *      rawstring: the input string; must be overwritable!      On return, it's
3669  *                         been modified to contain the separated identifiers.
3670  *      separator: the separator punctuation expected between identifiers
3671  *                         (typically '.' or ',').  Whitespace may also appear around
3672  *                         identifiers.
3673  * Outputs:
3674  *      namelist: filled with a palloc'd list of pointers to identifiers within
3675  *                        rawstring.  Caller should list_free() this even on error return.
3676  *
3677  * Returns true if okay, false if there is a syntax error in the string.
3678  */
3679 bool
3680 SplitGUCList(char *rawstring, char separator,
3681                          List **namelist)
3682 {
3683         char       *nextp = rawstring;
3684         bool            done = false;
3685
3686         *namelist = NIL;
3687
3688         while (scanner_isspace(*nextp))
3689                 nextp++;                                /* skip leading whitespace */
3690
3691         if (*nextp == '\0')
3692                 return true;                    /* allow empty string */
3693
3694         /* At the top of the loop, we are at start of a new identifier. */
3695         do
3696         {
3697                 char       *curname;
3698                 char       *endp;
3699
3700                 if (*nextp == '"')
3701                 {
3702                         /* Quoted name --- collapse quote-quote pairs */
3703                         curname = nextp + 1;
3704                         for (;;)
3705                         {
3706                                 endp = strchr(nextp + 1, '"');
3707                                 if (endp == NULL)
3708                                         return false;   /* mismatched quotes */
3709                                 if (endp[1] != '"')
3710                                         break;          /* found end of quoted name */
3711                                 /* Collapse adjacent quotes into one quote, and look again */
3712                                 memmove(endp, endp + 1, strlen(endp));
3713                                 nextp = endp;
3714                         }
3715                         /* endp now points at the terminating quote */
3716                         nextp = endp + 1;
3717                 }
3718                 else
3719                 {
3720                         /* Unquoted name --- extends to separator or whitespace */
3721                         curname = nextp;
3722                         while (*nextp && *nextp != separator &&
3723                                    !scanner_isspace(*nextp))
3724                                 nextp++;
3725                         endp = nextp;
3726                         if (curname == nextp)
3727                                 return false;   /* empty unquoted name not allowed */
3728                 }
3729
3730                 while (scanner_isspace(*nextp))
3731                         nextp++;                        /* skip trailing whitespace */
3732
3733                 if (*nextp == separator)
3734                 {
3735                         nextp++;
3736                         while (scanner_isspace(*nextp))
3737                                 nextp++;                /* skip leading whitespace for next */
3738                         /* we expect another name, so done remains false */
3739                 }
3740                 else if (*nextp == '\0')
3741                         done = true;
3742                 else
3743                         return false;           /* invalid syntax */
3744
3745                 /* Now safe to overwrite separator with a null */
3746                 *endp = '\0';
3747
3748                 /*
3749                  * Finished isolating current name --- add it to list
3750                  */
3751                 *namelist = lappend(*namelist, curname);
3752
3753                 /* Loop back if we didn't reach end of string */
3754         } while (!done);
3755
3756         return true;
3757 }
3758
3759
3760 /*****************************************************************************
3761  *      Comparison Functions used for bytea
3762  *
3763  * Note: btree indexes need these routines not to leak memory; therefore,
3764  * be careful to free working copies of toasted datums.  Most places don't
3765  * need to be so careful.
3766  *****************************************************************************/
3767
3768 Datum
3769 byteaeq(PG_FUNCTION_ARGS)
3770 {
3771         Datum           arg1 = PG_GETARG_DATUM(0);
3772         Datum           arg2 = PG_GETARG_DATUM(1);
3773         bool            result;
3774         Size            len1,
3775                                 len2;
3776
3777         /*
3778          * We can use a fast path for unequal lengths, which might save us from
3779          * having to detoast one or both values.
3780          */
3781         len1 = toast_raw_datum_size(arg1);
3782         len2 = toast_raw_datum_size(arg2);
3783         if (len1 != len2)
3784                 result = false;
3785         else
3786         {
3787                 bytea      *barg1 = DatumGetByteaPP(arg1);
3788                 bytea      *barg2 = DatumGetByteaPP(arg2);
3789
3790                 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3791                                                  len1 - VARHDRSZ) == 0);
3792
3793                 PG_FREE_IF_COPY(barg1, 0);
3794                 PG_FREE_IF_COPY(barg2, 1);
3795         }
3796
3797         PG_RETURN_BOOL(result);
3798 }
3799
3800 Datum
3801 byteane(PG_FUNCTION_ARGS)
3802 {
3803         Datum           arg1 = PG_GETARG_DATUM(0);
3804         Datum           arg2 = PG_GETARG_DATUM(1);
3805         bool            result;
3806         Size            len1,
3807                                 len2;
3808
3809         /*
3810          * We can use a fast path for unequal lengths, which might save us from
3811          * having to detoast one or both values.
3812          */
3813         len1 = toast_raw_datum_size(arg1);
3814         len2 = toast_raw_datum_size(arg2);
3815         if (len1 != len2)
3816                 result = true;
3817         else
3818         {
3819                 bytea      *barg1 = DatumGetByteaPP(arg1);
3820                 bytea      *barg2 = DatumGetByteaPP(arg2);
3821
3822                 result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3823                                                  len1 - VARHDRSZ) != 0);
3824
3825                 PG_FREE_IF_COPY(barg1, 0);
3826                 PG_FREE_IF_COPY(barg2, 1);
3827         }
3828
3829         PG_RETURN_BOOL(result);
3830 }
3831
3832 Datum
3833 bytealt(PG_FUNCTION_ARGS)
3834 {
3835         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
3836         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
3837         int                     len1,
3838                                 len2;
3839         int                     cmp;
3840
3841         len1 = VARSIZE_ANY_EXHDR(arg1);
3842         len2 = VARSIZE_ANY_EXHDR(arg2);
3843
3844         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3845
3846         PG_FREE_IF_COPY(arg1, 0);
3847         PG_FREE_IF_COPY(arg2, 1);
3848
3849         PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3850 }
3851
3852 Datum
3853 byteale(PG_FUNCTION_ARGS)
3854 {
3855         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
3856         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
3857         int                     len1,
3858                                 len2;
3859         int                     cmp;
3860
3861         len1 = VARSIZE_ANY_EXHDR(arg1);
3862         len2 = VARSIZE_ANY_EXHDR(arg2);
3863
3864         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3865
3866         PG_FREE_IF_COPY(arg1, 0);
3867         PG_FREE_IF_COPY(arg2, 1);
3868
3869         PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3870 }
3871
3872 Datum
3873 byteagt(PG_FUNCTION_ARGS)
3874 {
3875         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
3876         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
3877         int                     len1,
3878                                 len2;
3879         int                     cmp;
3880
3881         len1 = VARSIZE_ANY_EXHDR(arg1);
3882         len2 = VARSIZE_ANY_EXHDR(arg2);
3883
3884         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3885
3886         PG_FREE_IF_COPY(arg1, 0);
3887         PG_FREE_IF_COPY(arg2, 1);
3888
3889         PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3890 }
3891
3892 Datum
3893 byteage(PG_FUNCTION_ARGS)
3894 {
3895         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
3896         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
3897         int                     len1,
3898                                 len2;
3899         int                     cmp;
3900
3901         len1 = VARSIZE_ANY_EXHDR(arg1);
3902         len2 = VARSIZE_ANY_EXHDR(arg2);
3903
3904         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3905
3906         PG_FREE_IF_COPY(arg1, 0);
3907         PG_FREE_IF_COPY(arg2, 1);
3908
3909         PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
3910 }
3911
3912 Datum
3913 byteacmp(PG_FUNCTION_ARGS)
3914 {
3915         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
3916         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
3917         int                     len1,
3918                                 len2;
3919         int                     cmp;
3920
3921         len1 = VARSIZE_ANY_EXHDR(arg1);
3922         len2 = VARSIZE_ANY_EXHDR(arg2);
3923
3924         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3925         if ((cmp == 0) && (len1 != len2))
3926                 cmp = (len1 < len2) ? -1 : 1;
3927
3928         PG_FREE_IF_COPY(arg1, 0);
3929         PG_FREE_IF_COPY(arg2, 1);
3930
3931         PG_RETURN_INT32(cmp);
3932 }
3933
3934 Datum
3935 bytea_larger(PG_FUNCTION_ARGS)
3936 {
3937         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
3938         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
3939         bytea      *result;
3940         int                     len1,
3941                                 len2;
3942         int                     cmp;
3943
3944         len1 = VARSIZE_ANY_EXHDR(arg1);
3945         len2 = VARSIZE_ANY_EXHDR(arg2);
3946
3947         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3948         result = ((cmp > 0) || ((cmp == 0) && (len1 > len2)) ? arg1 : arg2);
3949
3950         PG_RETURN_BYTEA_P(result);
3951 }
3952
3953 Datum
3954 bytea_smaller(PG_FUNCTION_ARGS)
3955 {
3956         bytea      *arg1 = PG_GETARG_BYTEA_PP(0);
3957         bytea      *arg2 = PG_GETARG_BYTEA_PP(1);
3958         bytea      *result;
3959         int                     len1,
3960                                 len2;
3961         int                     cmp;
3962
3963         len1 = VARSIZE_ANY_EXHDR(arg1);
3964         len2 = VARSIZE_ANY_EXHDR(arg2);
3965
3966         cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3967         result = ((cmp < 0) || ((cmp == 0) && (len1 < len2)) ? arg1 : arg2);
3968
3969         PG_RETURN_BYTEA_P(result);
3970 }
3971
3972 Datum
3973 bytea_sortsupport(PG_FUNCTION_ARGS)
3974 {
3975         SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3976         MemoryContext oldcontext;
3977
3978         oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3979
3980         /* Use generic string SortSupport, forcing "C" collation */
3981         varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
3982
3983         MemoryContextSwitchTo(oldcontext);
3984
3985         PG_RETURN_VOID();
3986 }
3987
3988 /*
3989  * appendStringInfoText
3990  *
3991  * Append a text to str.
3992  * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
3993  */
3994 static void
3995 appendStringInfoText(StringInfo str, const text *t)
3996 {
3997         appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
3998 }
3999
4000 /*
4001  * replace_text
4002  * replace all occurrences of 'old_sub_str' in 'orig_str'
4003  * with 'new_sub_str' to form 'new_str'
4004  *
4005  * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4006  * otherwise returns 'new_str'
4007  */
4008 Datum
4009 replace_text(PG_FUNCTION_ARGS)
4010 {
4011         text       *src_text = PG_GETARG_TEXT_PP(0);
4012         text       *from_sub_text = PG_GETARG_TEXT_PP(1);
4013         text       *to_sub_text = PG_GETARG_TEXT_PP(2);
4014         int                     src_text_len;
4015         int                     from_sub_text_len;
4016         TextPositionState state;
4017         text       *ret_text;
4018         int                     chunk_len;
4019         char       *curr_ptr;
4020         char       *start_ptr;
4021         StringInfoData str;
4022         bool            found;
4023
4024         src_text_len = VARSIZE_ANY_EXHDR(src_text);
4025         from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text);
4026
4027         /* Return unmodified source string if empty source or pattern */
4028         if (src_text_len < 1 || from_sub_text_len < 1)
4029         {
4030                 PG_RETURN_TEXT_P(src_text);
4031         }
4032
4033         text_position_setup(src_text, from_sub_text, PG_GET_COLLATION(), &state);
4034
4035         found = text_position_next(&state);
4036
4037         /* When the from_sub_text is not found, there is nothing to do. */
4038         if (!found)
4039         {
4040                 text_position_cleanup(&state);
4041                 PG_RETURN_TEXT_P(src_text);
4042         }
4043         curr_ptr = text_position_get_match_ptr(&state);
4044         start_ptr = VARDATA_ANY(src_text);
4045
4046         initStringInfo(&str);
4047
4048         do
4049         {
4050                 CHECK_FOR_INTERRUPTS();
4051
4052                 /* copy the data skipped over by last text_position_next() */
4053                 chunk_len = curr_ptr - start_ptr;
4054                 appendBinaryStringInfo(&str, start_ptr, chunk_len);
4055
4056                 appendStringInfoText(&str, to_sub_text);
4057
4058                 start_ptr = curr_ptr + from_sub_text_len;
4059
4060                 found = text_position_next(&state);
4061                 if (found)
4062                         curr_ptr = text_position_get_match_ptr(&state);
4063         }
4064         while (found);
4065
4066         /* copy trailing data */
4067         chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4068         appendBinaryStringInfo(&str, start_ptr, chunk_len);
4069
4070         text_position_cleanup(&state);
4071
4072         ret_text = cstring_to_text_with_len(str.data, str.len);
4073         pfree(str.data);
4074
4075         PG_RETURN_TEXT_P(ret_text);
4076 }
4077
4078 /*
4079  * check_replace_text_has_escape
4080  *
4081  * Returns 0 if text contains no backslashes that need processing.
4082  * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4083  * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4084  */
4085 static int
4086 check_replace_text_has_escape(const text *replace_text)
4087 {
4088         int                     result = 0;
4089         const char *p = VARDATA_ANY(replace_text);
4090         const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4091
4092         while (p < p_end)
4093         {
4094                 /* Find next escape char, if any. */
4095                 p = memchr(p, '\\', p_end - p);
4096                 if (p == NULL)
4097                         break;
4098                 p++;
4099                 /* Note: a backslash at the end doesn't require extra processing. */
4100                 if (p < p_end)
4101                 {
4102                         if (*p >= '1' && *p <= '9')
4103                                 return 2;               /* Found a submatch specifier, so done */
4104                         result = 1;                     /* Found some other sequence, keep looking */
4105                         p++;
4106                 }
4107         }
4108         return result;
4109 }
4110
4111 /*
4112  * appendStringInfoRegexpSubstr
4113  *
4114  * Append replace_text to str, substituting regexp back references for
4115  * \n escapes.  start_ptr is the start of the match in the source string,
4116  * at logical character position data_pos.
4117  */
4118 static void
4119 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
4120                                                          regmatch_t *pmatch,
4121                                                          char *start_ptr, int data_pos)
4122 {
4123         const char *p = VARDATA_ANY(replace_text);
4124         const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4125
4126         while (p < p_end)
4127         {
4128                 const char *chunk_start = p;
4129                 int                     so;
4130                 int                     eo;
4131
4132                 /* Find next escape char, if any. */
4133                 p = memchr(p, '\\', p_end - p);
4134                 if (p == NULL)
4135                         p = p_end;
4136
4137                 /* Copy the text we just scanned over, if any. */
4138                 if (p > chunk_start)
4139                         appendBinaryStringInfo(str, chunk_start, p - chunk_start);
4140
4141                 /* Done if at end of string, else advance over escape char. */
4142                 if (p >= p_end)
4143                         break;
4144                 p++;
4145
4146                 if (p >= p_end)
4147                 {
4148                         /* Escape at very end of input.  Treat same as unexpected char */
4149                         appendStringInfoChar(str, '\\');
4150                         break;
4151                 }
4152
4153                 if (*p >= '1' && *p <= '9')
4154                 {
4155                         /* Use the back reference of regexp. */
4156                         int                     idx = *p - '0';
4157
4158                         so = pmatch[idx].rm_so;
4159                         eo = pmatch[idx].rm_eo;
4160                         p++;
4161                 }
4162                 else if (*p == '&')
4163                 {
4164                         /* Use the entire matched string. */
4165                         so = pmatch[0].rm_so;
4166                         eo = pmatch[0].rm_eo;
4167                         p++;
4168                 }
4169                 else if (*p == '\\')
4170                 {
4171                         /* \\ means transfer one \ to output. */
4172                         appendStringInfoChar(str, '\\');
4173                         p++;
4174                         continue;
4175                 }
4176                 else
4177                 {
4178                         /*
4179                          * If escape char is not followed by any expected char, just treat
4180                          * it as ordinary data to copy.  (XXX would it be better to throw
4181                          * an error?)
4182                          */
4183                         appendStringInfoChar(str, '\\');
4184                         continue;
4185                 }
4186
4187                 if (so >= 0 && eo >= 0)
4188                 {
4189                         /*
4190                          * Copy the text that is back reference of regexp.  Note so and eo
4191                          * are counted in characters not bytes.
4192                          */
4193                         char       *chunk_start;
4194                         int                     chunk_len;
4195
4196                         Assert(so >= data_pos);
4197                         chunk_start = start_ptr;
4198                         chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
4199                         chunk_len = charlen_to_bytelen(chunk_start, eo - so);
4200                         appendBinaryStringInfo(str, chunk_start, chunk_len);
4201                 }
4202         }
4203 }
4204
4205 /*
4206  * replace_text_regexp
4207  *
4208  * replace substring(s) in src_text that match pattern with replace_text.
4209  * The replace_text can contain backslash markers to substitute
4210  * (parts of) the matched text.
4211  *
4212  * cflags: regexp compile flags.
4213  * collation: collation to use.
4214  * search_start: the character (not byte) offset in src_text at which to
4215  * begin searching.
4216  * n: if 0, replace all matches; if > 0, replace only the N'th match.
4217  */
4218 text *
4219 replace_text_regexp(text *src_text, text *pattern_text,
4220                                         text *replace_text,
4221                                         int cflags, Oid collation,
4222                                         int search_start, int n)
4223 {
4224         text       *ret_text;
4225         regex_t    *re;
4226         int                     src_text_len = VARSIZE_ANY_EXHDR(src_text);
4227         int                     nmatches = 0;
4228         StringInfoData buf;
4229         regmatch_t      pmatch[10];             /* main match, plus \1 to \9 */
4230         int                     nmatch = lengthof(pmatch);
4231         pg_wchar   *data;
4232         size_t          data_len;
4233         int                     data_pos;
4234         char       *start_ptr;
4235         int                     escape_status;
4236
4237         initStringInfo(&buf);
4238
4239         /* Convert data string to wide characters. */
4240         data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
4241         data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
4242
4243         /* Check whether replace_text has escapes, especially regexp submatches. */
4244         escape_status = check_replace_text_has_escape(replace_text);
4245
4246         /* If no regexp submatches, we can use REG_NOSUB. */
4247         if (escape_status < 2)
4248         {
4249                 cflags |= REG_NOSUB;
4250                 /* Also tell pg_regexec we only want the whole-match location. */
4251                 nmatch = 1;
4252         }
4253
4254         /* Prepare the regexp. */
4255         re = RE_compile_and_cache(pattern_text, cflags, collation);
4256
4257         /* start_ptr points to the data_pos'th character of src_text */
4258         start_ptr = (char *) VARDATA_ANY(src_text);
4259         data_pos = 0;
4260
4261         while (search_start <= data_len)
4262         {
4263                 int                     regexec_result;
4264
4265                 CHECK_FOR_INTERRUPTS();
4266
4267                 regexec_result = pg_regexec(re,
4268                                                                         data,
4269                                                                         data_len,
4270                                                                         search_start,
4271                                                                         NULL,   /* no details */
4272                                                                         nmatch,
4273                                                                         pmatch,
4274                                                                         0);
4275
4276                 if (regexec_result == REG_NOMATCH)
4277                         break;
4278
4279                 if (regexec_result != REG_OKAY)
4280                 {
4281                         char            errMsg[100];
4282
4283                         pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
4284                         ereport(ERROR,
4285                                         (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
4286                                          errmsg("regular expression failed: %s", errMsg)));
4287                 }
4288
4289                 /*
4290                  * Count matches, and decide whether to replace this match.
4291                  */
4292                 nmatches++;
4293                 if (n > 0 && nmatches != n)
4294                 {
4295                         /*
4296                          * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4297                          * we treat the matched text as if it weren't matched, and copy it
4298                          * to the output later.)
4299                          */
4300                         search_start = pmatch[0].rm_eo;
4301                         if (pmatch[0].rm_so == pmatch[0].rm_eo)
4302                                 search_start++;
4303                         continue;
4304                 }
4305
4306                 /*
4307                  * Copy the text to the left of the match position.  Note we are given
4308                  * character not byte indexes.
4309                  */
4310                 if (pmatch[0].rm_so - data_pos > 0)
4311                 {
4312                         int                     chunk_len;
4313
4314                         chunk_len = charlen_to_bytelen(start_ptr,
4315                                                                                    pmatch[0].rm_so - data_pos);
4316                         appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4317
4318                         /*
4319                          * Advance start_ptr over that text, to avoid multiple rescans of
4320                          * it if the replace_text contains multiple back-references.
4321                          */
4322                         start_ptr += chunk_len;
4323                         data_pos = pmatch[0].rm_so;
4324                 }
4325
4326                 /*
4327                  * Copy the replace_text, processing escapes if any are present.
4328                  */
4329                 if (escape_status > 0)
4330                         appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
4331                                                                                  start_ptr, data_pos);
4332                 else
4333                         appendStringInfoText(&buf, replace_text);
4334
4335                 /* Advance start_ptr and data_pos over the matched text. */
4336                 start_ptr += charlen_to_bytelen(start_ptr,
4337                                                                                 pmatch[0].rm_eo - data_pos);
4338                 data_pos = pmatch[0].rm_eo;
4339
4340                 /*
4341                  * If we only want to replace one occurrence, we're done.
4342                  */
4343                 if (n > 0)
4344                         break;
4345
4346                 /*
4347                  * Advance search position.  Normally we start the next search at the
4348                  * end of the previous match; but if the match was of zero length, we
4349                  * have to advance by one character, or we'd just find the same match
4350                  * again.
4351                  */
4352                 search_start = data_pos;
4353                 if (pmatch[0].rm_so == pmatch[0].rm_eo)
4354                         search_start++;
4355         }
4356
4357         /*
4358          * Copy the text to the right of the last match.
4359          */
4360         if (data_pos < data_len)
4361         {
4362                 int                     chunk_len;
4363
4364                 chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4365                 appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4366         }
4367
4368         ret_text = cstring_to_text_with_len(buf.data, buf.len);
4369         pfree(buf.data);
4370         pfree(data);
4371
4372         return ret_text;
4373 }
4374
4375 /*
4376  * split_part
4377  * parse input string based on provided field separator
4378  * return N'th item (1 based, negative counts from end)
4379  */
4380 Datum
4381 split_part(PG_FUNCTION_ARGS)
4382 {
4383         text       *inputstring = PG_GETARG_TEXT_PP(0);
4384         text       *fldsep = PG_GETARG_TEXT_PP(1);
4385         int                     fldnum = PG_GETARG_INT32(2);
4386         int                     inputstring_len;
4387         int                     fldsep_len;
4388         TextPositionState state;
4389         char       *start_ptr;
4390         char       *end_ptr;
4391         text       *result_text;
4392         bool            found;
4393
4394         /* field number is 1 based */
4395         if (fldnum == 0)
4396                 ereport(ERROR,
4397                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4398                                  errmsg("field position must not be zero")));
4399
4400         inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4401         fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4402
4403         /* return empty string for empty input string */
4404         if (inputstring_len < 1)
4405                 PG_RETURN_TEXT_P(cstring_to_text(""));
4406
4407         /* handle empty field separator */
4408         if (fldsep_len < 1)
4409         {
4410                 /* if first or last field, return input string, else empty string */
4411                 if (fldnum == 1 || fldnum == -1)
4412                         PG_RETURN_TEXT_P(inputstring);
4413                 else
4414                         PG_RETURN_TEXT_P(cstring_to_text(""));
4415         }
4416
4417         /* find the first field separator */
4418         text_position_setup(inputstring, fldsep, PG_GET_COLLATION(), &state);
4419
4420         found = text_position_next(&state);
4421
4422         /* special case if fldsep not found at all */
4423         if (!found)
4424         {
4425                 text_position_cleanup(&state);
4426                 /* if first or last field, return input string, else empty string */
4427                 if (fldnum == 1 || fldnum == -1)
4428                         PG_RETURN_TEXT_P(inputstring);
4429                 else
4430                         PG_RETURN_TEXT_P(cstring_to_text(""));
4431         }
4432
4433         /*
4434          * take care of a negative field number (i.e. count from the right) by
4435          * converting to a positive field number; we need total number of fields
4436          */
4437         if (fldnum < 0)
4438         {
4439                 /* we found a fldsep, so there are at least two fields */
4440                 int                     numfields = 2;
4441
4442                 while (text_position_next(&state))
4443                         numfields++;
4444
4445                 /* special case of last field does not require an extra pass */
4446                 if (fldnum == -1)
4447                 {
4448                         start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4449                         end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
4450                         text_position_cleanup(&state);
4451                         PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
4452                                                                                                           end_ptr - start_ptr));
4453                 }
4454
4455                 /* else, convert fldnum to positive notation */
4456                 fldnum += numfields + 1;
4457
4458                 /* if nonexistent field, return empty string */
4459                 if (fldnum <= 0)
4460                 {
4461                         text_position_cleanup(&state);
4462                         PG_RETURN_TEXT_P(cstring_to_text(""));
4463                 }
4464
4465                 /* reset to pointing at first match, but now with positive fldnum */
4466                 text_position_reset(&state);
4467                 found = text_position_next(&state);
4468                 Assert(found);
4469         }
4470
4471         /* identify bounds of first field */
4472         start_ptr = VARDATA_ANY(inputstring);
4473         end_ptr = text_position_get_match_ptr(&state);
4474
4475         while (found && --fldnum > 0)
4476         {
4477                 /* identify bounds of next field */
4478                 start_ptr = end_ptr + fldsep_len;
4479                 found = text_position_next(&state);
4480                 if (found)
4481                         end_ptr = text_position_get_match_ptr(&state);
4482         }
4483
4484         text_position_cleanup(&state);
4485
4486         if (fldnum > 0)
4487         {
4488                 /* N'th field separator not found */
4489                 /* if last field requested, return it, else empty string */
4490                 if (fldnum == 1)
4491                 {
4492                         int                     last_len = start_ptr - VARDATA_ANY(inputstring);
4493
4494                         result_text = cstring_to_text_with_len(start_ptr,
4495                                                                                                    inputstring_len - last_len);
4496                 }
4497                 else
4498                         result_text = cstring_to_text("");
4499         }
4500         else
4501         {
4502                 /* non-last field requested */
4503                 result_text = cstring_to_text_with_len(start_ptr, end_ptr - start_ptr);
4504         }
4505
4506         PG_RETURN_TEXT_P(result_text);
4507 }
4508
4509 /*
4510  * Convenience function to return true when two text params are equal.
4511  */
4512 static bool
4513 text_isequal(text *txt1, text *txt2, Oid collid)
4514 {
4515         return DatumGetBool(DirectFunctionCall2Coll(texteq,
4516                                                                                                 collid,
4517                                                                                                 PointerGetDatum(txt1),
4518                                                                                                 PointerGetDatum(txt2)));
4519 }
4520
4521 /*
4522  * text_to_array
4523  * parse input string and return text array of elements,
4524  * based on provided field separator
4525  */
4526 Datum
4527 text_to_array(PG_FUNCTION_ARGS)
4528 {
4529         SplitTextOutputData tstate;
4530
4531         /* For array output, tstate should start as all zeroes */
4532         memset(&tstate, 0, sizeof(tstate));
4533
4534         if (!split_text(fcinfo, &tstate))
4535                 PG_RETURN_NULL();
4536
4537         if (tstate.astate == NULL)
4538                 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4539
4540         PG_RETURN_DATUM(makeArrayResult(tstate.astate,
4541                                                                         CurrentMemoryContext));
4542 }
4543
4544 /*
4545  * text_to_array_null
4546  * parse input string and return text array of elements,
4547  * based on provided field separator and null string
4548  *
4549  * This is a separate entry point only to prevent the regression tests from
4550  * complaining about different argument sets for the same internal function.
4551  */
4552 Datum
4553 text_to_array_null(PG_FUNCTION_ARGS)
4554 {
4555         return text_to_array(fcinfo);
4556 }
4557
4558 /*
4559  * text_to_table
4560  * parse input string and return table of elements,
4561  * based on provided field separator
4562  */
4563 Datum
4564 text_to_table(PG_FUNCTION_ARGS)
4565 {
4566         ReturnSetInfo *rsi = (ReturnSetInfo *) fcinfo->resultinfo;
4567         SplitTextOutputData tstate;
4568
4569         tstate.astate = NULL;
4570         InitMaterializedSRF(fcinfo, MAT_SRF_USE_EXPECTED_DESC);
4571         tstate.tupstore = rsi->setResult;
4572         tstate.tupdesc = rsi->setDesc;
4573
4574         (void) split_text(fcinfo, &tstate);
4575
4576         return (Datum) 0;
4577 }
4578
4579 /*
4580  * text_to_table_null
4581  * parse input string and return table of elements,
4582  * based on provided field separator and null string
4583  *
4584  * This is a separate entry point only to prevent the regression tests from
4585  * complaining about different argument sets for the same internal function.
4586  */
4587 Datum
4588 text_to_table_null(PG_FUNCTION_ARGS)
4589 {
4590         return text_to_table(fcinfo);
4591 }
4592
4593 /*
4594  * Common code for text_to_array, text_to_array_null, text_to_table
4595  * and text_to_table_null functions.
4596  *
4597  * These are not strict so we have to test for null inputs explicitly.
4598  * Returns false if result is to be null, else returns true.
4599  *
4600  * Note that if the result is valid but empty (zero elements), we return
4601  * without changing *tstate --- caller must handle that case, too.
4602  */
4603 static bool
4604 split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
4605 {
4606         text       *inputstring;
4607         text       *fldsep;
4608         text       *null_string;
4609         Oid                     collation = PG_GET_COLLATION();
4610         int                     inputstring_len;
4611         int                     fldsep_len;
4612         char       *start_ptr;
4613         text       *result_text;
4614
4615         /* when input string is NULL, then result is NULL too */
4616         if (PG_ARGISNULL(0))
4617                 return false;
4618
4619         inputstring = PG_GETARG_TEXT_PP(0);
4620
4621         /* fldsep can be NULL */
4622         if (!PG_ARGISNULL(1))
4623                 fldsep = PG_GETARG_TEXT_PP(1);
4624         else
4625                 fldsep = NULL;
4626
4627         /* null_string can be NULL or omitted */
4628         if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4629                 null_string = PG_GETARG_TEXT_PP(2);
4630         else
4631                 null_string = NULL;
4632
4633         if (fldsep != NULL)
4634         {
4635                 /*
4636                  * Normal case with non-null fldsep.  Use the text_position machinery
4637                  * to search for occurrences of fldsep.
4638                  */
4639                 TextPositionState state;
4640
4641                 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4642                 fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
4643
4644                 /* return empty set for empty input string */
4645                 if (inputstring_len < 1)
4646                         return true;
4647
4648                 /* empty field separator: return input string as a one-element set */
4649                 if (fldsep_len < 1)
4650                 {
4651                         split_text_accum_result(tstate, inputstring,
4652                                                                         null_string, collation);
4653                         return true;
4654                 }
4655
4656                 text_position_setup(inputstring, fldsep, collation, &state);
4657
4658                 start_ptr = VARDATA_ANY(inputstring);
4659
4660                 for (;;)
4661                 {
4662                         bool            found;
4663                         char       *end_ptr;
4664                         int                     chunk_len;
4665
4666                         CHECK_FOR_INTERRUPTS();
4667
4668                         found = text_position_next(&state);
4669                         if (!found)
4670                         {
4671                                 /* fetch last field */
4672                                 chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4673                                 end_ptr = NULL; /* not used, but some compilers complain */
4674                         }
4675                         else
4676                         {
4677                                 /* fetch non-last field */
4678                                 end_ptr = text_position_get_match_ptr(&state);
4679                                 chunk_len = end_ptr - start_ptr;
4680                         }
4681
4682                         /* build a temp text datum to pass to split_text_accum_result */
4683                         result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4684
4685                         /* stash away this field */
4686                         split_text_accum_result(tstate, result_text,
4687                                                                         null_string, collation);
4688
4689                         pfree(result_text);
4690
4691                         if (!found)
4692                                 break;
4693
4694                         start_ptr = end_ptr + fldsep_len;
4695                 }
4696
4697                 text_position_cleanup(&state);
4698         }
4699         else
4700         {
4701                 /*
4702                  * When fldsep is NULL, each character in the input string becomes a
4703                  * separate element in the result set.  The separator is effectively
4704                  * the space between characters.
4705                  */
4706                 inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4707
4708                 start_ptr = VARDATA_ANY(inputstring);
4709
4710                 while (inputstring_len > 0)
4711                 {
4712                         int                     chunk_len = pg_mblen(start_ptr);
4713
4714                         CHECK_FOR_INTERRUPTS();
4715
4716                         /* build a temp text datum to pass to split_text_accum_result */
4717                         result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4718
4719                         /* stash away this field */
4720                         split_text_accum_result(tstate, result_text,
4721                                                                         null_string, collation);
4722
4723                         pfree(result_text);
4724
4725                         start_ptr += chunk_len;
4726                         inputstring_len -= chunk_len;
4727                 }
4728         }
4729
4730         return true;
4731 }
4732
4733 /*
4734  * Add text item to result set (table or array).
4735  *
4736  * This is also responsible for checking to see if the item matches
4737  * the null_string, in which case we should emit NULL instead.
4738  */
4739 static void
4740 split_text_accum_result(SplitTextOutputData *tstate,
4741                                                 text *field_value,
4742                                                 text *null_string,
4743                                                 Oid collation)
4744 {
4745         bool            is_null = false;
4746
4747         if (null_string && text_isequal(field_value, null_string, collation))
4748                 is_null = true;
4749
4750         if (tstate->tupstore)
4751         {
4752                 Datum           values[1];
4753                 bool            nulls[1];
4754
4755                 values[0] = PointerGetDatum(field_value);
4756                 nulls[0] = is_null;
4757
4758                 tuplestore_putvalues(tstate->tupstore,
4759                                                          tstate->tupdesc,
4760                                                          values,
4761                                                          nulls);
4762         }
4763         else
4764         {
4765                 tstate->astate = accumArrayResult(tstate->astate,
4766                                                                                   PointerGetDatum(field_value),
4767                                                                                   is_null,
4768                                                                                   TEXTOID,
4769                                                                                   CurrentMemoryContext);
4770         }
4771 }
4772
4773 /*
4774  * array_to_text
4775  * concatenate Cstring representation of input array elements
4776  * using provided field separator
4777  */
4778 Datum
4779 array_to_text(PG_FUNCTION_ARGS)
4780 {
4781         ArrayType  *v = PG_GETARG_ARRAYTYPE_P(0);
4782         char       *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4783
4784         PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4785 }
4786
4787 /*
4788  * array_to_text_null
4789  * concatenate Cstring representation of input array elements
4790  * using provided field separator and null string
4791  *
4792  * This version is not strict so we have to test for null inputs explicitly.
4793  */
4794 Datum
4795 array_to_text_null(PG_FUNCTION_ARGS)
4796 {
4797         ArrayType  *v;
4798         char       *fldsep;
4799         char       *null_string;
4800
4801         /* returns NULL when first or second parameter is NULL */
4802         if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4803                 PG_RETURN_NULL();
4804
4805         v = PG_GETARG_ARRAYTYPE_P(0);
4806         fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4807
4808         /* NULL null string is passed through as a null pointer */
4809         if (!PG_ARGISNULL(2))
4810                 null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4811         else
4812                 null_string = NULL;
4813
4814         PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4815 }
4816
4817 /*
4818  * common code for array_to_text and array_to_text_null functions
4819  */
4820 static text *
4821 array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
4822                                            const char *fldsep, const char *null_string)
4823 {
4824         text       *result;
4825         int                     nitems,
4826                            *dims,
4827                                 ndims;
4828         Oid                     element_type;
4829         int                     typlen;
4830         bool            typbyval;
4831         char            typalign;
4832         StringInfoData buf;
4833         bool            printed = false;
4834         char       *p;
4835         bits8      *bitmap;
4836         int                     bitmask;
4837         int                     i;
4838         ArrayMetaState *my_extra;
4839
4840         ndims = ARR_NDIM(v);
4841         dims = ARR_DIMS(v);
4842         nitems = ArrayGetNItems(ndims, dims);
4843
4844         /* if there are no elements, return an empty string */
4845         if (nitems == 0)
4846                 return cstring_to_text_with_len("", 0);
4847
4848         element_type = ARR_ELEMTYPE(v);
4849         initStringInfo(&buf);
4850
4851         /*
4852          * We arrange to look up info about element type, including its output
4853          * conversion proc, only once per series of calls, assuming the element
4854          * type doesn't change underneath us.
4855          */
4856         my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4857         if (my_extra == NULL)
4858         {
4859                 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4860                                                                                                           sizeof(ArrayMetaState));
4861                 my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4862                 my_extra->element_type = ~element_type;
4863         }
4864
4865         if (my_extra->element_type != element_type)
4866         {
4867                 /*
4868                  * Get info about element type, including its output conversion proc
4869                  */
4870                 get_type_io_data(element_type, IOFunc_output,
4871                                                  &my_extra->typlen, &my_extra->typbyval,
4872                                                  &my_extra->typalign, &my_extra->typdelim,
4873                                                  &my_extra->typioparam, &my_extra->typiofunc);
4874                 fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4875                                           fcinfo->flinfo->fn_mcxt);
4876                 my_extra->element_type = element_type;
4877         }
4878         typlen = my_extra->typlen;
4879         typbyval = my_extra->typbyval;
4880         typalign = my_extra->typalign;
4881
4882         p = ARR_DATA_PTR(v);
4883         bitmap = ARR_NULLBITMAP(v);
4884         bitmask = 1;
4885
4886         for (i = 0; i < nitems; i++)
4887         {
4888                 Datum           itemvalue;
4889                 char       *value;
4890
4891                 /* Get source element, checking for NULL */
4892                 if (bitmap && (*bitmap & bitmask) == 0)
4893                 {
4894                         /* if null_string is NULL, we just ignore null elements */
4895                         if (null_string != NULL)
4896                         {
4897                                 if (printed)
4898                                         appendStringInfo(&buf, "%s%s", fldsep, null_string);
4899                                 else
4900                                         appendStringInfoString(&buf, null_string);
4901                                 printed = true;
4902                         }
4903                 }
4904                 else
4905                 {
4906                         itemvalue = fetch_att(p, typbyval, typlen);
4907
4908                         value = OutputFunctionCall(&my_extra->proc, itemvalue);
4909
4910                         if (printed)
4911                                 appendStringInfo(&buf, "%s%s", fldsep, value);
4912                         else
4913                                 appendStringInfoString(&buf, value);
4914                         printed = true;
4915
4916                         p = att_addlength_pointer(p, typlen, p);
4917                         p = (char *) att_align_nominal(p, typalign);
4918                 }
4919
4920                 /* advance bitmap pointer if any */
4921                 if (bitmap)
4922                 {
4923                         bitmask <<= 1;
4924                         if (bitmask == 0x100)
4925                         {
4926                                 bitmap++;
4927                                 bitmask = 1;
4928                         }
4929                 }
4930         }
4931
4932         result = cstring_to_text_with_len(buf.data, buf.len);
4933         pfree(buf.data);
4934
4935         return result;
4936 }
4937
4938 /*
4939  * Workhorse for to_bin, to_oct, and to_hex.  Note that base must be > 1 and <=
4940  * 16.
4941  */
4942 static inline text *
4943 convert_to_base(uint64 value, int base)
4944 {
4945         const char *digits = "0123456789abcdef";
4946
4947         /* We size the buffer for to_bin's longest possible return value. */
4948         char            buf[sizeof(uint64) * BITS_PER_BYTE];
4949         char       *const end = buf + sizeof(buf);
4950         char       *ptr = end;
4951
4952         Assert(base > 1);
4953         Assert(base <= 16);
4954
4955         do
4956         {
4957                 *--ptr = digits[value % base];
4958                 value /= base;
4959         } while (ptr > buf && value);
4960
4961         return cstring_to_text_with_len(ptr, end - ptr);
4962 }
4963
4964 /*
4965  * Convert an integer to a string containing a base-2 (binary) representation
4966  * of the number.
4967  */
4968 Datum
4969 to_bin32(PG_FUNCTION_ARGS)
4970 {
4971         uint64          value = (uint32) PG_GETARG_INT32(0);
4972
4973         PG_RETURN_TEXT_P(convert_to_base(value, 2));
4974 }
4975 Datum
4976 to_bin64(PG_FUNCTION_ARGS)
4977 {
4978         uint64          value = (uint64) PG_GETARG_INT64(0);
4979
4980         PG_RETURN_TEXT_P(convert_to_base(value, 2));
4981 }
4982
4983 /*
4984  * Convert an integer to a string containing a base-8 (oct) representation of
4985  * the number.
4986  */
4987 Datum
4988 to_oct32(PG_FUNCTION_ARGS)
4989 {
4990         uint64          value = (uint32) PG_GETARG_INT32(0);
4991
4992         PG_RETURN_TEXT_P(convert_to_base(value, 8));
4993 }
4994 Datum
4995 to_oct64(PG_FUNCTION_ARGS)
4996 {
4997         uint64          value = (uint64) PG_GETARG_INT64(0);
4998
4999         PG_RETURN_TEXT_P(convert_to_base(value, 8));
5000 }
5001
5002 /*
5003  * Convert an integer to a string containing a base-16 (hex) representation of
5004  * the number.
5005  */
5006 Datum
5007 to_hex32(PG_FUNCTION_ARGS)
5008 {
5009         uint64          value = (uint32) PG_GETARG_INT32(0);
5010
5011         PG_RETURN_TEXT_P(convert_to_base(value, 16));
5012 }
5013 Datum
5014 to_hex64(PG_FUNCTION_ARGS)
5015 {
5016         uint64          value = (uint64) PG_GETARG_INT64(0);
5017
5018         PG_RETURN_TEXT_P(convert_to_base(value, 16));
5019 }
5020
5021 /*
5022  * Return the size of a datum, possibly compressed
5023  *
5024  * Works on any data type
5025  */
5026 Datum
5027 pg_column_size(PG_FUNCTION_ARGS)
5028 {
5029         Datum           value = PG_GETARG_DATUM(0);
5030         int32           result;
5031         int                     typlen;
5032
5033         /* On first call, get the input type's typlen, and save at *fn_extra */
5034         if (fcinfo->flinfo->fn_extra == NULL)
5035         {
5036                 /* Lookup the datatype of the supplied argument */
5037                 Oid                     argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5038
5039                 typlen = get_typlen(argtypeid);
5040                 if (typlen == 0)                /* should not happen */
5041                         elog(ERROR, "cache lookup failed for type %u", argtypeid);
5042
5043                 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5044                                                                                                           sizeof(int));
5045                 *((int *) fcinfo->flinfo->fn_extra) = typlen;
5046         }
5047         else
5048                 typlen = *((int *) fcinfo->flinfo->fn_extra);
5049
5050         if (typlen == -1)
5051         {
5052                 /* varlena type, possibly toasted */
5053                 result = toast_datum_size(value);
5054         }
5055         else if (typlen == -2)
5056         {
5057                 /* cstring */
5058                 result = strlen(DatumGetCString(value)) + 1;
5059         }
5060         else
5061         {
5062                 /* ordinary fixed-width type */
5063                 result = typlen;
5064         }
5065
5066         PG_RETURN_INT32(result);
5067 }
5068
5069 /*
5070  * Return the compression method stored in the compressed attribute.  Return
5071  * NULL for non varlena type or uncompressed data.
5072  */
5073 Datum
5074 pg_column_compression(PG_FUNCTION_ARGS)
5075 {
5076         int                     typlen;
5077         char       *result;
5078         ToastCompressionId cmid;
5079
5080         /* On first call, get the input type's typlen, and save at *fn_extra */
5081         if (fcinfo->flinfo->fn_extra == NULL)
5082         {
5083                 /* Lookup the datatype of the supplied argument */
5084                 Oid                     argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5085
5086                 typlen = get_typlen(argtypeid);
5087                 if (typlen == 0)                /* should not happen */
5088                         elog(ERROR, "cache lookup failed for type %u", argtypeid);
5089
5090                 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5091                                                                                                           sizeof(int));
5092                 *((int *) fcinfo->flinfo->fn_extra) = typlen;
5093         }
5094         else
5095                 typlen = *((int *) fcinfo->flinfo->fn_extra);
5096
5097         if (typlen != -1)
5098                 PG_RETURN_NULL();
5099
5100         /* get the compression method id stored in the compressed varlena */
5101         cmid = toast_get_compression_id((struct varlena *)
5102                                                                         DatumGetPointer(PG_GETARG_DATUM(0)));
5103         if (cmid == TOAST_INVALID_COMPRESSION_ID)
5104                 PG_RETURN_NULL();
5105
5106         /* convert compression method id to compression method name */
5107         switch (cmid)
5108         {
5109                 case TOAST_PGLZ_COMPRESSION_ID:
5110                         result = "pglz";
5111                         break;
5112                 case TOAST_LZ4_COMPRESSION_ID:
5113                         result = "lz4";
5114                         break;
5115                 default:
5116                         elog(ERROR, "invalid compression method id %d", cmid);
5117         }
5118
5119         PG_RETURN_TEXT_P(cstring_to_text(result));
5120 }
5121
5122 /*
5123  * Return the chunk_id of the on-disk TOASTed value.  Return NULL if the value
5124  * is un-TOASTed or not on-disk.
5125  */
5126 Datum
5127 pg_column_toast_chunk_id(PG_FUNCTION_ARGS)
5128 {
5129         int                     typlen;
5130         struct varlena *attr;
5131         struct varatt_external toast_pointer;
5132
5133         /* On first call, get the input type's typlen, and save at *fn_extra */
5134         if (fcinfo->flinfo->fn_extra == NULL)
5135         {
5136                 /* Lookup the datatype of the supplied argument */
5137                 Oid                     argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
5138
5139                 typlen = get_typlen(argtypeid);
5140                 if (typlen == 0)                /* should not happen */
5141                         elog(ERROR, "cache lookup failed for type %u", argtypeid);
5142
5143                 fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5144                                                                                                           sizeof(int));
5145                 *((int *) fcinfo->flinfo->fn_extra) = typlen;
5146         }
5147         else
5148                 typlen = *((int *) fcinfo->flinfo->fn_extra);
5149
5150         if (typlen != -1)
5151                 PG_RETURN_NULL();
5152
5153         attr = (struct varlena *) DatumGetPointer(PG_GETARG_DATUM(0));
5154
5155         if (!VARATT_IS_EXTERNAL_ONDISK(attr))
5156                 PG_RETURN_NULL();
5157
5158         VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
5159
5160         PG_RETURN_OID(toast_pointer.va_valueid);
5161 }
5162
5163 /*
5164  * string_agg - Concatenates values and returns string.
5165  *
5166  * Syntax: string_agg(value text, delimiter text) RETURNS text
5167  *
 * Note: Any NULL values are ignored. The delimiter passed with the first
 * non-NULL value never appears in the result (it is kept in the transition
 * state but stripped off by the final function); on subsequent calls the
 * delimiter precedes the associated value.
5171  */
5172
5173 /* subroutine to initialize state */
5174 static StringInfo
5175 makeStringAggState(FunctionCallInfo fcinfo)
5176 {
5177         StringInfo      state;
5178         MemoryContext aggcontext;
5179         MemoryContext oldcontext;
5180
5181         if (!AggCheckCallContext(fcinfo, &aggcontext))
5182         {
5183                 /* cannot be called directly because of internal-type argument */
5184                 elog(ERROR, "string_agg_transfn called in non-aggregate context");
5185         }
5186
5187         /*
5188          * Create state in aggregate context.  It'll stay there across subsequent
5189          * calls.
5190          */
5191         oldcontext = MemoryContextSwitchTo(aggcontext);
5192         state = makeStringInfo();
5193         MemoryContextSwitchTo(oldcontext);
5194
5195         return state;
5196 }
5197
5198 Datum
5199 string_agg_transfn(PG_FUNCTION_ARGS)
5200 {
5201         StringInfo      state;
5202
5203         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5204
5205         /* Append the value unless null, preceding it with the delimiter. */
5206         if (!PG_ARGISNULL(1))
5207         {
5208                 text       *value = PG_GETARG_TEXT_PP(1);
5209                 bool            isfirst = false;
5210
5211                 /*
5212                  * You might think we can just throw away the first delimiter, however
5213                  * we must keep it as we may be a parallel worker doing partial
5214                  * aggregation building a state to send to the main process.  We need
5215                  * to keep the delimiter of every aggregation so that the combine
5216                  * function can properly join up the strings of two separately
5217                  * partially aggregated results.  The first delimiter is only stripped
5218                  * off in the final function.  To know how much to strip off the front
5219                  * of the string, we store the length of the first delimiter in the
5220                  * StringInfo's cursor field, which we don't otherwise need here.
5221                  */
5222                 if (state == NULL)
5223                 {
5224                         state = makeStringAggState(fcinfo);
5225                         isfirst = true;
5226                 }
5227
5228                 if (!PG_ARGISNULL(2))
5229                 {
5230                         text       *delim = PG_GETARG_TEXT_PP(2);
5231
5232                         appendStringInfoText(state, delim);
5233                         if (isfirst)
5234                                 state->cursor = VARSIZE_ANY_EXHDR(delim);
5235                 }
5236
5237                 appendStringInfoText(state, value);
5238         }
5239
5240         /*
5241          * The transition type for string_agg() is declared to be "internal",
5242          * which is a pass-by-value type the same size as a pointer.
5243          */
5244         if (state)
5245                 PG_RETURN_POINTER(state);
5246         PG_RETURN_NULL();
5247 }
5248
5249 /*
5250  * string_agg_combine
5251  *              Aggregate combine function for string_agg(text) and string_agg(bytea)
5252  */
5253 Datum
5254 string_agg_combine(PG_FUNCTION_ARGS)
5255 {
5256         StringInfo      state1;
5257         StringInfo      state2;
5258         MemoryContext agg_context;
5259
5260         if (!AggCheckCallContext(fcinfo, &agg_context))
5261                 elog(ERROR, "aggregate function called in non-aggregate context");
5262
5263         state1 = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5264         state2 = PG_ARGISNULL(1) ? NULL : (StringInfo) PG_GETARG_POINTER(1);
5265
5266         if (state2 == NULL)
5267         {
5268                 /*
5269                  * NULL state2 is easy, just return state1, which we know is already
5270                  * in the agg_context
5271                  */
5272                 if (state1 == NULL)
5273                         PG_RETURN_NULL();
5274                 PG_RETURN_POINTER(state1);
5275         }
5276
5277         if (state1 == NULL)
5278         {
5279                 /* We must copy state2's data into the agg_context */
5280                 MemoryContext old_context;
5281
5282                 old_context = MemoryContextSwitchTo(agg_context);
5283                 state1 = makeStringAggState(fcinfo);
5284                 appendBinaryStringInfo(state1, state2->data, state2->len);
5285                 state1->cursor = state2->cursor;
5286                 MemoryContextSwitchTo(old_context);
5287         }
5288         else if (state2->len > 0)
5289         {
5290                 /* Combine ... state1->cursor does not change in this case */
5291                 appendBinaryStringInfo(state1, state2->data, state2->len);
5292         }
5293
5294         PG_RETURN_POINTER(state1);
5295 }
5296
5297 /*
5298  * string_agg_serialize
5299  *              Aggregate serialize function for string_agg(text) and string_agg(bytea)
5300  *
5301  * This is strict, so we need not handle NULL input
5302  */
5303 Datum
5304 string_agg_serialize(PG_FUNCTION_ARGS)
5305 {
5306         StringInfo      state;
5307         StringInfoData buf;
5308         bytea      *result;
5309
5310         /* cannot be called directly because of internal-type argument */
5311         Assert(AggCheckCallContext(fcinfo, NULL));
5312
5313         state = (StringInfo) PG_GETARG_POINTER(0);
5314
5315         pq_begintypsend(&buf);
5316
5317         /* cursor */
5318         pq_sendint(&buf, state->cursor, 4);
5319
5320         /* data */
5321         pq_sendbytes(&buf, state->data, state->len);
5322
5323         result = pq_endtypsend(&buf);
5324
5325         PG_RETURN_BYTEA_P(result);
5326 }
5327
5328 /*
5329  * string_agg_deserialize
5330  *              Aggregate deserial function for string_agg(text) and string_agg(bytea)
5331  *
5332  * This is strict, so we need not handle NULL input
5333  */
5334 Datum
5335 string_agg_deserialize(PG_FUNCTION_ARGS)
5336 {
5337         bytea      *sstate;
5338         StringInfo      result;
5339         StringInfoData buf;
5340         char       *data;
5341         int                     datalen;
5342
5343         /* cannot be called directly because of internal-type argument */
5344         Assert(AggCheckCallContext(fcinfo, NULL));
5345
5346         sstate = PG_GETARG_BYTEA_PP(0);
5347
5348         /*
5349          * Initialize a StringInfo so that we can "receive" it using the standard
5350          * recv-function infrastructure.
5351          */
5352         initReadOnlyStringInfo(&buf, VARDATA_ANY(sstate),
5353                                                    VARSIZE_ANY_EXHDR(sstate));
5354
5355         result = makeStringAggState(fcinfo);
5356
5357         /* cursor */
5358         result->cursor = pq_getmsgint(&buf, 4);
5359
5360         /* data */
5361         datalen = VARSIZE_ANY_EXHDR(sstate) - 4;
5362         data = (char *) pq_getmsgbytes(&buf, datalen);
5363         appendBinaryStringInfo(result, data, datalen);
5364
5365         pq_getmsgend(&buf);
5366
5367         PG_RETURN_POINTER(result);
5368 }
5369
5370 Datum
5371 string_agg_finalfn(PG_FUNCTION_ARGS)
5372 {
5373         StringInfo      state;
5374
5375         /* cannot be called directly because of internal-type argument */
5376         Assert(AggCheckCallContext(fcinfo, NULL));
5377
5378         state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
5379
5380         if (state != NULL)
5381         {
5382                 /* As per comment in transfn, strip data before the cursor position */
5383                 PG_RETURN_TEXT_P(cstring_to_text_with_len(&state->data[state->cursor],
5384                                                                                                   state->len - state->cursor));
5385         }
5386         else
5387                 PG_RETURN_NULL();
5388 }
5389
5390 /*
5391  * Prepare cache with fmgr info for the output functions of the datatypes of
5392  * the arguments of a concat-like function, beginning with argument "argidx".
5393  * (Arguments before that will have corresponding slots in the resulting
5394  * FmgrInfo array, but we don't fill those slots.)
5395  */
5396 static FmgrInfo *
5397 build_concat_foutcache(FunctionCallInfo fcinfo, int argidx)
5398 {
5399         FmgrInfo   *foutcache;
5400         int                     i;
5401
5402         /* We keep the info in fn_mcxt so it survives across calls */
5403         foutcache = (FmgrInfo *) MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
5404                                                                                                 PG_NARGS() * sizeof(FmgrInfo));
5405
5406         for (i = argidx; i < PG_NARGS(); i++)
5407         {
5408                 Oid                     valtype;
5409                 Oid                     typOutput;
5410                 bool            typIsVarlena;
5411
5412                 valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
5413                 if (!OidIsValid(valtype))
5414                         elog(ERROR, "could not determine data type of concat() input");
5415
5416                 getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
5417                 fmgr_info_cxt(typOutput, &foutcache[i], fcinfo->flinfo->fn_mcxt);
5418         }
5419
5420         fcinfo->flinfo->fn_extra = foutcache;
5421
5422         return foutcache;
5423 }
5424
5425 /*
5426  * Implementation of both concat() and concat_ws().
5427  *
5428  * sepstr is the separator string to place between values.
5429  * argidx identifies the first argument to concatenate (counting from zero);
5430  * note that this must be constant across any one series of calls.
5431  *
5432  * Returns NULL if result should be NULL, else text value.
5433  */
5434 static text *
5435 concat_internal(const char *sepstr, int argidx,
5436                                 FunctionCallInfo fcinfo)
5437 {
5438         text       *result;
5439         StringInfoData str;
5440         FmgrInfo   *foutcache;
5441         bool            first_arg = true;
5442         int                     i;
5443
5444         /*
5445          * concat(VARIADIC some-array) is essentially equivalent to
5446          * array_to_text(), ie concat the array elements with the given separator.
5447          * So we just pass the case off to that code.
5448          */
5449         if (get_fn_expr_variadic(fcinfo->flinfo))
5450         {
5451                 ArrayType  *arr;
5452
5453                 /* Should have just the one argument */
5454                 Assert(argidx == PG_NARGS() - 1);
5455
5456                 /* concat(VARIADIC NULL) is defined as NULL */
5457                 if (PG_ARGISNULL(argidx))
5458                         return NULL;
5459
5460                 /*
5461                  * Non-null argument had better be an array.  We assume that any call
5462                  * context that could let get_fn_expr_variadic return true will have
5463                  * checked that a VARIADIC-labeled parameter actually is an array.  So
5464                  * it should be okay to just Assert that it's an array rather than
5465                  * doing a full-fledged error check.
5466                  */
5467                 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
5468
5469                 /* OK, safe to fetch the array value */
5470                 arr = PG_GETARG_ARRAYTYPE_P(argidx);
5471
5472                 /*
5473                  * And serialize the array.  We tell array_to_text to ignore null
5474                  * elements, which matches the behavior of the loop below.
5475                  */
5476                 return array_to_text_internal(fcinfo, arr, sepstr, NULL);
5477         }
5478
5479         /* Normal case without explicit VARIADIC marker */
5480         initStringInfo(&str);
5481
5482         /* Get output function info, building it if first time through */
5483         foutcache = (FmgrInfo *) fcinfo->flinfo->fn_extra;
5484         if (foutcache == NULL)
5485                 foutcache = build_concat_foutcache(fcinfo, argidx);
5486
5487         for (i = argidx; i < PG_NARGS(); i++)
5488         {
5489                 if (!PG_ARGISNULL(i))
5490                 {
5491                         Datum           value = PG_GETARG_DATUM(i);
5492
5493                         /* add separator if appropriate */
5494                         if (first_arg)
5495                                 first_arg = false;
5496                         else
5497                                 appendStringInfoString(&str, sepstr);
5498
5499                         /* call the appropriate type output function, append the result */
5500                         appendStringInfoString(&str,
5501                                                                    OutputFunctionCall(&foutcache[i], value));
5502                 }
5503         }
5504
5505         result = cstring_to_text_with_len(str.data, str.len);
5506         pfree(str.data);
5507
5508         return result;
5509 }
5510
5511 /*
5512  * Concatenate all arguments. NULL arguments are ignored.
5513  */
5514 Datum
5515 text_concat(PG_FUNCTION_ARGS)
5516 {
5517         text       *result;
5518
5519         result = concat_internal("", 0, fcinfo);
5520         if (result == NULL)
5521                 PG_RETURN_NULL();
5522         PG_RETURN_TEXT_P(result);
5523 }
5524
5525 /*
5526  * Concatenate all but first argument value with separators. The first
5527  * parameter is used as the separator. NULL arguments are ignored.
5528  */
5529 Datum
5530 text_concat_ws(PG_FUNCTION_ARGS)
5531 {
5532         char       *sep;
5533         text       *result;
5534
5535         /* return NULL when separator is NULL */
5536         if (PG_ARGISNULL(0))
5537                 PG_RETURN_NULL();
5538         sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
5539
5540         result = concat_internal(sep, 1, fcinfo);
5541         if (result == NULL)
5542                 PG_RETURN_NULL();
5543         PG_RETURN_TEXT_P(result);
5544 }
5545
5546 /*
5547  * Return first n characters in the string. When n is negative,
5548  * return all but last |n| characters.
5549  */
5550 Datum
5551 text_left(PG_FUNCTION_ARGS)
5552 {
5553         int                     n = PG_GETARG_INT32(1);
5554
5555         if (n < 0)
5556         {
5557                 text       *str = PG_GETARG_TEXT_PP(0);
5558                 const char *p = VARDATA_ANY(str);
5559                 int                     len = VARSIZE_ANY_EXHDR(str);
5560                 int                     rlen;
5561
5562                 n = pg_mbstrlen_with_len(p, len) + n;
5563                 rlen = pg_mbcharcliplen(p, len, n);
5564                 PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
5565         }
5566         else
5567                 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n, false));
5568 }
5569
5570 /*
5571  * Return last n characters in the string. When n is negative,
5572  * return all but first |n| characters.
5573  */
5574 Datum
5575 text_right(PG_FUNCTION_ARGS)
5576 {
5577         text       *str = PG_GETARG_TEXT_PP(0);
5578         const char *p = VARDATA_ANY(str);
5579         int                     len = VARSIZE_ANY_EXHDR(str);
5580         int                     n = PG_GETARG_INT32(1);
5581         int                     off;
5582
5583         if (n < 0)
5584                 n = -n;
5585         else
5586                 n = pg_mbstrlen_with_len(p, len) - n;
5587         off = pg_mbcharcliplen(p, len, n);
5588
5589         PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
5590 }
5591
5592 /*
5593  * Return reversed string
5594  */
5595 Datum
5596 text_reverse(PG_FUNCTION_ARGS)
5597 {
5598         text       *str = PG_GETARG_TEXT_PP(0);
5599         const char *p = VARDATA_ANY(str);
5600         int                     len = VARSIZE_ANY_EXHDR(str);
5601         const char *endp = p + len;
5602         text       *result;
5603         char       *dst;
5604
5605         result = palloc(len + VARHDRSZ);
5606         dst = (char *) VARDATA(result) + len;
5607         SET_VARSIZE(result, len + VARHDRSZ);
5608
5609         if (pg_database_encoding_max_length() > 1)
5610         {
5611                 /* multibyte version */
5612                 while (p < endp)
5613                 {
5614                         int                     sz;
5615
5616                         sz = pg_mblen(p);
5617                         dst -= sz;
5618                         memcpy(dst, p, sz);
5619                         p += sz;
5620                 }
5621         }
5622         else
5623         {
5624                 /* single byte version */
5625                 while (p < endp)
5626                         *(--dst) = *p++;
5627         }
5628
5629         PG_RETURN_TEXT_P(result);
5630 }
5631
5632
/*
 * Support macros for text_format()
 */

/*
 * Bit in the "flags" bitmask filled in by text_format_parse_format(): set
 * when a '-' flag was seen in the specifier, in which case
 * text_format_append_string() left-justifies the value within its width.
 */
#define TEXT_FORMAT_FLAG_MINUS  0x0001  /* is minus flag present? */

/*
 * Step "ptr" forward by one byte and raise an error if that reaches or
 * passes "end_ptr", i.e. if the format string ends in the middle of a
 * conversion specifier.  "ptr" is incremented in place, so it must be a
 * modifiable lvalue.  The do/while(0) wrapper lets the macro be used as a
 * single statement.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
        do { \
                if (++(ptr) >= (end_ptr)) \
                        ereport(ERROR, \
                                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
                                         errmsg("unterminated format() type specifier"), \
                                         errhint("For a single \"%%\" use \"%%%%\"."))); \
        } while (0)
5646
5647 /*
5648  * Returns a formatted string
5649  */
5650 Datum
5651 text_format(PG_FUNCTION_ARGS)
5652 {
5653         text       *fmt;
5654         StringInfoData str;
5655         const char *cp;
5656         const char *start_ptr;
5657         const char *end_ptr;
5658         text       *result;
5659         int                     arg;
5660         bool            funcvariadic;
5661         int                     nargs;
5662         Datum      *elements = NULL;
5663         bool       *nulls = NULL;
5664         Oid                     element_type = InvalidOid;
5665         Oid                     prev_type = InvalidOid;
5666         Oid                     prev_width_type = InvalidOid;
5667         FmgrInfo        typoutputfinfo;
5668         FmgrInfo        typoutputinfo_width;
5669
5670         /* When format string is null, immediately return null */
5671         if (PG_ARGISNULL(0))
5672                 PG_RETURN_NULL();
5673
5674         /* If argument is marked VARIADIC, expand array into elements */
5675         if (get_fn_expr_variadic(fcinfo->flinfo))
5676         {
5677                 ArrayType  *arr;
5678                 int16           elmlen;
5679                 bool            elmbyval;
5680                 char            elmalign;
5681                 int                     nitems;
5682
5683                 /* Should have just the one argument */
5684                 Assert(PG_NARGS() == 2);
5685
5686                 /* If argument is NULL, we treat it as zero-length array */
5687                 if (PG_ARGISNULL(1))
5688                         nitems = 0;
5689                 else
5690                 {
5691                         /*
5692                          * Non-null argument had better be an array.  We assume that any
5693                          * call context that could let get_fn_expr_variadic return true
5694                          * will have checked that a VARIADIC-labeled parameter actually is
5695                          * an array.  So it should be okay to just Assert that it's an
5696                          * array rather than doing a full-fledged error check.
5697                          */
5698                         Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5699
5700                         /* OK, safe to fetch the array value */
5701                         arr = PG_GETARG_ARRAYTYPE_P(1);
5702
5703                         /* Get info about array element type */
5704                         element_type = ARR_ELEMTYPE(arr);
5705                         get_typlenbyvalalign(element_type,
5706                                                                  &elmlen, &elmbyval, &elmalign);
5707
5708                         /* Extract all array elements */
5709                         deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5710                                                           &elements, &nulls, &nitems);
5711                 }
5712
5713                 nargs = nitems + 1;
5714                 funcvariadic = true;
5715         }
5716         else
5717         {
5718                 /* Non-variadic case, we'll process the arguments individually */
5719                 nargs = PG_NARGS();
5720                 funcvariadic = false;
5721         }
5722
5723         /* Setup for main loop. */
5724         fmt = PG_GETARG_TEXT_PP(0);
5725         start_ptr = VARDATA_ANY(fmt);
5726         end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5727         initStringInfo(&str);
5728         arg = 1;                                        /* next argument position to print */
5729
5730         /* Scan format string, looking for conversion specifiers. */
5731         for (cp = start_ptr; cp < end_ptr; cp++)
5732         {
5733                 int                     argpos;
5734                 int                     widthpos;
5735                 int                     flags;
5736                 int                     width;
5737                 Datum           value;
5738                 bool            isNull;
5739                 Oid                     typid;
5740
5741                 /*
5742                  * If it's not the start of a conversion specifier, just copy it to
5743                  * the output buffer.
5744                  */
5745                 if (*cp != '%')
5746                 {
5747                         appendStringInfoCharMacro(&str, *cp);
5748                         continue;
5749                 }
5750
5751                 ADVANCE_PARSE_POINTER(cp, end_ptr);
5752
5753                 /* Easy case: %% outputs a single % */
5754                 if (*cp == '%')
5755                 {
5756                         appendStringInfoCharMacro(&str, *cp);
5757                         continue;
5758                 }
5759
5760                 /* Parse the optional portions of the format specifier */
5761                 cp = text_format_parse_format(cp, end_ptr,
5762                                                                           &argpos, &widthpos,
5763                                                                           &flags, &width);
5764
5765                 /*
5766                  * Next we should see the main conversion specifier.  Whether or not
5767                  * an argument position was present, it's known that at least one
5768                  * character remains in the string at this point.  Experience suggests
5769                  * that it's worth checking that that character is one of the expected
5770                  * ones before we try to fetch arguments, so as to produce the least
5771                  * confusing response to a mis-formatted specifier.
5772                  */
5773                 if (strchr("sIL", *cp) == NULL)
5774                         ereport(ERROR,
5775                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5776                                          errmsg("unrecognized format() type specifier \"%.*s\"",
5777                                                         pg_mblen(cp), cp),
5778                                          errhint("For a single \"%%\" use \"%%%%\".")));
5779
5780                 /* If indirect width was specified, get its value */
5781                 if (widthpos >= 0)
5782                 {
5783                         /* Collect the specified or next argument position */
5784                         if (widthpos > 0)
5785                                 arg = widthpos;
5786                         if (arg >= nargs)
5787                                 ereport(ERROR,
5788                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5789                                                  errmsg("too few arguments for format()")));
5790
5791                         /* Get the value and type of the selected argument */
5792                         if (!funcvariadic)
5793                         {
5794                                 value = PG_GETARG_DATUM(arg);
5795                                 isNull = PG_ARGISNULL(arg);
5796                                 typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5797                         }
5798                         else
5799                         {
5800                                 value = elements[arg - 1];
5801                                 isNull = nulls[arg - 1];
5802                                 typid = element_type;
5803                         }
5804                         if (!OidIsValid(typid))
5805                                 elog(ERROR, "could not determine data type of format() input");
5806
5807                         arg++;
5808
5809                         /* We can treat NULL width the same as zero */
5810                         if (isNull)
5811                                 width = 0;
5812                         else if (typid == INT4OID)
5813                                 width = DatumGetInt32(value);
5814                         else if (typid == INT2OID)
5815                                 width = DatumGetInt16(value);
5816                         else
5817                         {
5818                                 /* For less-usual datatypes, convert to text then to int */
5819                                 char       *str;
5820
5821                                 if (typid != prev_width_type)
5822                                 {
5823                                         Oid                     typoutputfunc;
5824                                         bool            typIsVarlena;
5825
5826                                         getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5827                                         fmgr_info(typoutputfunc, &typoutputinfo_width);
5828                                         prev_width_type = typid;
5829                                 }
5830
5831                                 str = OutputFunctionCall(&typoutputinfo_width, value);
5832
5833                                 /* pg_strtoint32 will complain about bad data or overflow */
5834                                 width = pg_strtoint32(str);
5835
5836                                 pfree(str);
5837                         }
5838                 }
5839
5840                 /* Collect the specified or next argument position */
5841                 if (argpos > 0)
5842                         arg = argpos;
5843                 if (arg >= nargs)
5844                         ereport(ERROR,
5845                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5846                                          errmsg("too few arguments for format()")));
5847
5848                 /* Get the value and type of the selected argument */
5849                 if (!funcvariadic)
5850                 {
5851                         value = PG_GETARG_DATUM(arg);
5852                         isNull = PG_ARGISNULL(arg);
5853                         typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5854                 }
5855                 else
5856                 {
5857                         value = elements[arg - 1];
5858                         isNull = nulls[arg - 1];
5859                         typid = element_type;
5860                 }
5861                 if (!OidIsValid(typid))
5862                         elog(ERROR, "could not determine data type of format() input");
5863
5864                 arg++;
5865
5866                 /*
5867                  * Get the appropriate typOutput function, reusing previous one if
5868                  * same type as previous argument.  That's particularly useful in the
5869                  * variadic-array case, but often saves work even for ordinary calls.
5870                  */
5871                 if (typid != prev_type)
5872                 {
5873                         Oid                     typoutputfunc;
5874                         bool            typIsVarlena;
5875
5876                         getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5877                         fmgr_info(typoutputfunc, &typoutputfinfo);
5878                         prev_type = typid;
5879                 }
5880
5881                 /*
5882                  * And now we can format the value.
5883                  */
5884                 switch (*cp)
5885                 {
5886                         case 's':
5887                         case 'I':
5888                         case 'L':
5889                                 text_format_string_conversion(&str, *cp, &typoutputfinfo,
5890                                                                                           value, isNull,
5891                                                                                           flags, width);
5892                                 break;
5893                         default:
5894                                 /* should not get here, because of previous check */
5895                                 ereport(ERROR,
5896                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5897                                                  errmsg("unrecognized format() type specifier \"%.*s\"",
5898                                                                 pg_mblen(cp), cp),
5899                                                  errhint("For a single \"%%\" use \"%%%%\".")));
5900                                 break;
5901                 }
5902         }
5903
5904         /* Don't need deconstruct_array results anymore. */
5905         if (elements != NULL)
5906                 pfree(elements);
5907         if (nulls != NULL)
5908                 pfree(nulls);
5909
5910         /* Generate results. */
5911         result = cstring_to_text_with_len(str.data, str.len);
5912         pfree(str.data);
5913
5914         PG_RETURN_TEXT_P(result);
5915 }
5916
5917 /*
5918  * Parse contiguous digits as a decimal number.
5919  *
5920  * Returns true if some digits could be parsed.
5921  * The value is returned into *value, and *ptr is advanced to the next
5922  * character to be parsed.
5923  *
5924  * Note parsing invariant: at least one character is known available before
5925  * string end (end_ptr) at entry, and this is still true at exit.
5926  */
5927 static bool
5928 text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5929 {
5930         bool            found = false;
5931         const char *cp = *ptr;
5932         int                     val = 0;
5933
5934         while (*cp >= '0' && *cp <= '9')
5935         {
5936                 int8            digit = (*cp - '0');
5937
5938                 if (unlikely(pg_mul_s32_overflow(val, 10, &val)) ||
5939                         unlikely(pg_add_s32_overflow(val, digit, &val)))
5940                         ereport(ERROR,
5941                                         (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5942                                          errmsg("number is out of range")));
5943                 ADVANCE_PARSE_POINTER(cp, end_ptr);
5944                 found = true;
5945         }
5946
5947         *ptr = cp;
5948         *value = val;
5949
5950         return found;
5951 }
5952
5953 /*
5954  * Parse a format specifier (generally following the SUS printf spec).
5955  *
5956  * We have already advanced over the initial '%', and we are looking for
5957  * [argpos][flags][width]type (but the type character is not consumed here).
5958  *
5959  * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5960  * Output parameters:
5961  *      argpos: argument position for value to be printed.  -1 means unspecified.
5962  *      widthpos: argument position for width.  Zero means the argument position
5963  *                      was unspecified (ie, take the next arg) and -1 means no width
5964  *                      argument (width was omitted or specified as a constant).
5965  *      flags: bitmask of flags.
5966  *      width: directly-specified width value.  Zero means the width was omitted
5967  *                      (note it's not necessary to distinguish this case from an explicit
5968  *                      zero width value).
5969  *
5970  * The function result is the next character position to be parsed, ie, the
5971  * location where the type character is/should be.
5972  *
5973  * Note parsing invariant: at least one character is known available before
5974  * string end (end_ptr) at entry, and this is still true at exit.
5975  */
5976 static const char *
5977 text_format_parse_format(const char *start_ptr, const char *end_ptr,
5978                                                  int *argpos, int *widthpos,
5979                                                  int *flags, int *width)
5980 {
5981         const char *cp = start_ptr;
5982         int                     n;
5983
5984         /* set defaults for output parameters */
5985         *argpos = -1;
5986         *widthpos = -1;
5987         *flags = 0;
5988         *width = 0;
5989
5990         /* try to identify first number */
5991         if (text_format_parse_digits(&cp, end_ptr, &n))
5992         {
5993                 if (*cp != '$')
5994                 {
5995                         /* Must be just a width and a type, so we're done */
5996                         *width = n;
5997                         return cp;
5998                 }
5999                 /* The number was argument position */
6000                 *argpos = n;
6001                 /* Explicit 0 for argument index is immediately refused */
6002                 if (n == 0)
6003                         ereport(ERROR,
6004                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6005                                          errmsg("format specifies argument 0, but arguments are numbered from 1")));
6006                 ADVANCE_PARSE_POINTER(cp, end_ptr);
6007         }
6008
6009         /* Handle flags (only minus is supported now) */
6010         while (*cp == '-')
6011         {
6012                 *flags |= TEXT_FORMAT_FLAG_MINUS;
6013                 ADVANCE_PARSE_POINTER(cp, end_ptr);
6014         }
6015
6016         if (*cp == '*')
6017         {
6018                 /* Handle indirect width */
6019                 ADVANCE_PARSE_POINTER(cp, end_ptr);
6020                 if (text_format_parse_digits(&cp, end_ptr, &n))
6021                 {
6022                         /* number in this position must be closed by $ */
6023                         if (*cp != '$')
6024                                 ereport(ERROR,
6025                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6026                                                  errmsg("width argument position must be ended by \"$\"")));
6027                         /* The number was width argument position */
6028                         *widthpos = n;
6029                         /* Explicit 0 for argument index is immediately refused */
6030                         if (n == 0)
6031                                 ereport(ERROR,
6032                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6033                                                  errmsg("format specifies argument 0, but arguments are numbered from 1")));
6034                         ADVANCE_PARSE_POINTER(cp, end_ptr);
6035                 }
6036                 else
6037                         *widthpos = 0;          /* width's argument position is unspecified */
6038         }
6039         else
6040         {
6041                 /* Check for direct width specification */
6042                 if (text_format_parse_digits(&cp, end_ptr, &n))
6043                         *width = n;
6044         }
6045
6046         /* cp should now be pointing at type character */
6047         return cp;
6048 }
6049
6050 /*
6051  * Format a %s, %I, or %L conversion
6052  */
6053 static void
6054 text_format_string_conversion(StringInfo buf, char conversion,
6055                                                           FmgrInfo *typOutputInfo,
6056                                                           Datum value, bool isNull,
6057                                                           int flags, int width)
6058 {
6059         char       *str;
6060
6061         /* Handle NULL arguments before trying to stringify the value. */
6062         if (isNull)
6063         {
6064                 if (conversion == 's')
6065                         text_format_append_string(buf, "", flags, width);
6066                 else if (conversion == 'L')
6067                         text_format_append_string(buf, "NULL", flags, width);
6068                 else if (conversion == 'I')
6069                         ereport(ERROR,
6070                                         (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
6071                                          errmsg("null values cannot be formatted as an SQL identifier")));
6072                 return;
6073         }
6074
6075         /* Stringify. */
6076         str = OutputFunctionCall(typOutputInfo, value);
6077
6078         /* Escape. */
6079         if (conversion == 'I')
6080         {
6081                 /* quote_identifier may or may not allocate a new string. */
6082                 text_format_append_string(buf, quote_identifier(str), flags, width);
6083         }
6084         else if (conversion == 'L')
6085         {
6086                 char       *qstr = quote_literal_cstr(str);
6087
6088                 text_format_append_string(buf, qstr, flags, width);
6089                 /* quote_literal_cstr() always allocates a new string */
6090                 pfree(qstr);
6091         }
6092         else
6093                 text_format_append_string(buf, str, flags, width);
6094
6095         /* Cleanup. */
6096         pfree(str);
6097 }
6098
6099 /*
6100  * Append str to buf, padding as directed by flags/width
6101  */
6102 static void
6103 text_format_append_string(StringInfo buf, const char *str,
6104                                                   int flags, int width)
6105 {
6106         bool            align_to_left = false;
6107         int                     len;
6108
6109         /* fast path for typical easy case */
6110         if (width == 0)
6111         {
6112                 appendStringInfoString(buf, str);
6113                 return;
6114         }
6115
6116         if (width < 0)
6117         {
6118                 /* Negative width: implicit '-' flag, then take absolute value */
6119                 align_to_left = true;
6120                 /* -INT_MIN is undefined */
6121                 if (width <= INT_MIN)
6122                         ereport(ERROR,
6123                                         (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
6124                                          errmsg("number is out of range")));
6125                 width = -width;
6126         }
6127         else if (flags & TEXT_FORMAT_FLAG_MINUS)
6128                 align_to_left = true;
6129
6130         len = pg_mbstrlen(str);
6131         if (align_to_left)
6132         {
6133                 /* left justify */
6134                 appendStringInfoString(buf, str);
6135                 if (len < width)
6136                         appendStringInfoSpaces(buf, width - len);
6137         }
6138         else
6139         {
6140                 /* right justify */
6141                 if (len < width)
6142                         appendStringInfoSpaces(buf, width - len);
6143                 appendStringInfoString(buf, str);
6144         }
6145 }
6146
6147 /*
6148  * text_format_nv - nonvariadic wrapper for text_format function.
6149  *
6150  * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6151  * which checks that all built-in functions that share the implementing C
6152  * function take the same number of arguments.
6153  */
6154 Datum
6155 text_format_nv(PG_FUNCTION_ARGS)
6156 {
6157         return text_format(fcinfo);
6158 }
6159
/*
 * Helper for the Levenshtein distance functions: report whether the first
 * len bytes of s1 and s2 are equal, comparing from the last byte backwards.
 * A non-positive len compares nothing and yields true.  Faster than
 * memcmp() for this use case.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	for (int remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
6175
6176 /* Expand each Levenshtein distance variant */
6177 #include "levenshtein.c"
6178 #define LEVENSHTEIN_LESS_EQUAL
6179 #include "levenshtein.c"
6180
6181
6182 /*
6183  * The following *ClosestMatch() functions can be used to determine whether a
6184  * user-provided string resembles any known valid values, which is useful for
6185  * providing hints in log messages, among other things.  Use these functions
6186  * like so:
6187  *
6188  *              initClosestMatch(&state, source_string, max_distance);
6189  *
6190  *              for (int i = 0; i < num_valid_strings; i++)
6191  *                      updateClosestMatch(&state, valid_strings[i]);
6192  *
6193  *              closestMatch = getClosestMatch(&state);
6194  */
6195
6196 /*
6197  * Initialize the given state with the source string and maximum Levenshtein
6198  * distance to consider.
6199  */
6200 void
6201 initClosestMatch(ClosestMatchState *state, const char *source, int max_d)
6202 {
6203         Assert(state);
6204         Assert(max_d >= 0);
6205
6206         state->source = source;
6207         state->min_d = -1;
6208         state->max_d = max_d;
6209         state->match = NULL;
6210 }
6211
6212 /*
6213  * If the candidate string is a closer match than the current one saved (or
6214  * there is no match saved), save it as the closest match.
6215  *
6216  * If the source or candidate string is NULL, empty, or too long, this function
6217  * takes no action.  Likewise, if the Levenshtein distance exceeds the maximum
6218  * allowed or more than half the characters are different, no action is taken.
6219  */
6220 void
6221 updateClosestMatch(ClosestMatchState *state, const char *candidate)
6222 {
6223         int                     dist;
6224
6225         Assert(state);
6226
6227         if (state->source == NULL || state->source[0] == '\0' ||
6228                 candidate == NULL || candidate[0] == '\0')
6229                 return;
6230
6231         /*
6232          * To avoid ERROR-ing, we check the lengths here instead of setting
6233          * 'trusted' to false in the call to varstr_levenshtein_less_equal().
6234          */
6235         if (strlen(state->source) > MAX_LEVENSHTEIN_STRLEN ||
6236                 strlen(candidate) > MAX_LEVENSHTEIN_STRLEN)
6237                 return;
6238
6239         dist = varstr_levenshtein_less_equal(state->source, strlen(state->source),
6240                                                                                  candidate, strlen(candidate), 1, 1, 1,
6241                                                                                  state->max_d, true);
6242         if (dist <= state->max_d &&
6243                 dist <= strlen(state->source) / 2 &&
6244                 (state->min_d == -1 || dist < state->min_d))
6245         {
6246                 state->min_d = dist;
6247                 state->match = candidate;
6248         }
6249 }
6250
6251 /*
6252  * Return the closest match.  If no suitable candidates were provided via
6253  * updateClosestMatch(), return NULL.
6254  */
6255 const char *
6256 getClosestMatch(ClosestMatchState *state)
6257 {
6258         Assert(state);
6259
6260         return state->match;
6261 }
6262
6263
6264 /*
6265  * Unicode support
6266  */
6267
6268 static UnicodeNormalizationForm
6269 unicode_norm_form_from_string(const char *formstr)
6270 {
6271         UnicodeNormalizationForm form = -1;
6272
6273         /*
6274          * Might as well check this while we're here.
6275          */
6276         if (GetDatabaseEncoding() != PG_UTF8)
6277                 ereport(ERROR,
6278                                 (errcode(ERRCODE_SYNTAX_ERROR),
6279                                  errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6280
6281         if (pg_strcasecmp(formstr, "NFC") == 0)
6282                 form = UNICODE_NFC;
6283         else if (pg_strcasecmp(formstr, "NFD") == 0)
6284                 form = UNICODE_NFD;
6285         else if (pg_strcasecmp(formstr, "NFKC") == 0)
6286                 form = UNICODE_NFKC;
6287         else if (pg_strcasecmp(formstr, "NFKD") == 0)
6288                 form = UNICODE_NFKD;
6289         else
6290                 ereport(ERROR,
6291                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6292                                  errmsg("invalid normalization form: %s", formstr)));
6293
6294         return form;
6295 }
6296
6297 /*
6298  * Returns version of Unicode used by Postgres in "major.minor" format (the
6299  * same format as the Unicode version reported by ICU). The third component
6300  * ("update version") never involves additions to the character repertoire and
6301  * is unimportant for most purposes.
6302  *
6303  * See: https://unicode.org/versions/
6304  */
6305 Datum
6306 unicode_version(PG_FUNCTION_ARGS)
6307 {
6308         PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
6309 }
6310
6311 /*
6312  * Returns version of Unicode used by ICU, if enabled; otherwise NULL.
6313  */
6314 Datum
6315 icu_unicode_version(PG_FUNCTION_ARGS)
6316 {
6317 #ifdef USE_ICU
6318         PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION));
6319 #else
6320         PG_RETURN_NULL();
6321 #endif
6322 }
6323
6324 /*
6325  * Check whether the string contains only assigned Unicode code
6326  * points. Requires that the database encoding is UTF-8.
6327  */
6328 Datum
6329 unicode_assigned(PG_FUNCTION_ARGS)
6330 {
6331         text       *input = PG_GETARG_TEXT_PP(0);
6332         unsigned char *p;
6333         int                     size;
6334
6335         if (GetDatabaseEncoding() != PG_UTF8)
6336                 ereport(ERROR,
6337                                 (errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
6338
6339         /* convert to pg_wchar */
6340         size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6341         p = (unsigned char *) VARDATA_ANY(input);
6342         for (int i = 0; i < size; i++)
6343         {
6344                 pg_wchar        uchar = utf8_to_unicode(p);
6345                 int                     category = unicode_category(uchar);
6346
6347                 if (category == PG_U_UNASSIGNED)
6348                         PG_RETURN_BOOL(false);
6349
6350                 p += pg_utf_mblen(p);
6351         }
6352
6353         PG_RETURN_BOOL(true);
6354 }
6355
6356 Datum
6357 unicode_normalize_func(PG_FUNCTION_ARGS)
6358 {
6359         text       *input = PG_GETARG_TEXT_PP(0);
6360         char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6361         UnicodeNormalizationForm form;
6362         int                     size;
6363         pg_wchar   *input_chars;
6364         pg_wchar   *output_chars;
6365         unsigned char *p;
6366         text       *result;
6367         int                     i;
6368
6369         form = unicode_norm_form_from_string(formstr);
6370
6371         /* convert to pg_wchar */
6372         size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6373         input_chars = palloc((size + 1) * sizeof(pg_wchar));
6374         p = (unsigned char *) VARDATA_ANY(input);
6375         for (i = 0; i < size; i++)
6376         {
6377                 input_chars[i] = utf8_to_unicode(p);
6378                 p += pg_utf_mblen(p);
6379         }
6380         input_chars[i] = (pg_wchar) '\0';
6381         Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6382
6383         /* action */
6384         output_chars = unicode_normalize(form, input_chars);
6385
6386         /* convert back to UTF-8 string */
6387         size = 0;
6388         for (pg_wchar *wp = output_chars; *wp; wp++)
6389         {
6390                 unsigned char buf[4];
6391
6392                 unicode_to_utf8(*wp, buf);
6393                 size += pg_utf_mblen(buf);
6394         }
6395
6396         result = palloc(size + VARHDRSZ);
6397         SET_VARSIZE(result, size + VARHDRSZ);
6398
6399         p = (unsigned char *) VARDATA_ANY(result);
6400         for (pg_wchar *wp = output_chars; *wp; wp++)
6401         {
6402                 unicode_to_utf8(*wp, p);
6403                 p += pg_utf_mblen(p);
6404         }
6405         Assert((char *) p == (char *) result + size + VARHDRSZ);
6406
6407         PG_RETURN_TEXT_P(result);
6408 }
6409
6410 /*
6411  * Check whether the string is in the specified Unicode normalization form.
6412  *
6413  * This is done by converting the string to the specified normal form and then
6414  * comparing that to the original string.  To speed that up, we also apply the
6415  * "quick check" algorithm specified in UAX #15, which can give a yes or no
6416  * answer for many strings by just scanning the string once.
6417  *
6418  * This function should generally be optimized for the case where the string
6419  * is in fact normalized.  In that case, we'll end up looking at the entire
6420  * string, so it's probably not worth doing any incremental conversion etc.
6421  */
6422 Datum
6423 unicode_is_normalized(PG_FUNCTION_ARGS)
6424 {
6425         text       *input = PG_GETARG_TEXT_PP(0);
6426         char       *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
6427         UnicodeNormalizationForm form;
6428         int                     size;
6429         pg_wchar   *input_chars;
6430         pg_wchar   *output_chars;
6431         unsigned char *p;
6432         int                     i;
6433         UnicodeNormalizationQC quickcheck;
6434         int                     output_size;
6435         bool            result;
6436
6437         form = unicode_norm_form_from_string(formstr);
6438
6439         /* convert to pg_wchar */
6440         size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6441         input_chars = palloc((size + 1) * sizeof(pg_wchar));
6442         p = (unsigned char *) VARDATA_ANY(input);
6443         for (i = 0; i < size; i++)
6444         {
6445                 input_chars[i] = utf8_to_unicode(p);
6446                 p += pg_utf_mblen(p);
6447         }
6448         input_chars[i] = (pg_wchar) '\0';
6449         Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
6450
6451         /* quick check (see UAX #15) */
6452         quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
6453         if (quickcheck == UNICODE_NORM_QC_YES)
6454                 PG_RETURN_BOOL(true);
6455         else if (quickcheck == UNICODE_NORM_QC_NO)
6456                 PG_RETURN_BOOL(false);
6457
6458         /* normalize and compare with original */
6459         output_chars = unicode_normalize(form, input_chars);
6460
6461         output_size = 0;
6462         for (pg_wchar *wp = output_chars; *wp; wp++)
6463                 output_size++;
6464
6465         result = (size == output_size) &&
6466                 (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
6467
6468         PG_RETURN_BOOL(result);
6469 }
6470
/*
 * Return true if each of the first n bytes at instr is a hexadecimal digit
 * according to isxdigit(); trivially true when n is zero.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	const char *end = instr + n;

	while (instr < end)
	{
		/* cast first: passing a negative char to isxdigit() is undefined */
		if (!isxdigit((unsigned char) *instr))
			return false;
		instr++;
	}

	return true;
}
6483
6484 static unsigned int
6485 hexval(unsigned char c)
6486 {
6487         if (c >= '0' && c <= '9')
6488                 return c - '0';
6489         if (c >= 'a' && c <= 'f')
6490                 return c - 'a' + 0xA;
6491         if (c >= 'A' && c <= 'F')
6492                 return c - 'A' + 0xA;
6493         elog(ERROR, "invalid hexadecimal digit");
6494         return 0;                                       /* not reached */
6495 }
6496
/*
 * Interpret the first n characters at instr as a hexadecimal number, most
 * significant digit first, and return its value.  Each character is
 * converted with hexval().
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;
	const char *stop = instr + n;

	/* make room for four more bits, then add in the next digit */
	for (const char *p = instr; p < stop; p++)
		value = (value << 4) + hexval(*p);

	return value;
}
6510
6511 /*
6512  * Replaces Unicode escape sequences by Unicode characters
6513  */
6514 Datum
6515 unistr(PG_FUNCTION_ARGS)
6516 {
6517         text       *input_text = PG_GETARG_TEXT_PP(0);
6518         char       *instr;
6519         int                     len;
6520         StringInfoData str;
6521         text       *result;
6522         pg_wchar        pair_first = 0;
6523         char            cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
6524
6525         instr = VARDATA_ANY(input_text);
6526         len = VARSIZE_ANY_EXHDR(input_text);
6527
6528         initStringInfo(&str);
6529
6530         while (len > 0)
6531         {
6532                 if (instr[0] == '\\')
6533                 {
6534                         if (len >= 2 &&
6535                                 instr[1] == '\\')
6536                         {
6537                                 if (pair_first)
6538                                         goto invalid_pair;
6539                                 appendStringInfoChar(&str, '\\');
6540                                 instr += 2;
6541                                 len -= 2;
6542                         }
6543                         else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
6544                                          (len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
6545                         {
6546                                 pg_wchar        unicode;
6547                                 int                     offset = instr[1] == 'u' ? 2 : 1;
6548
6549                                 unicode = hexval_n(instr + offset, 4);
6550
6551                                 if (!is_valid_unicode_codepoint(unicode))
6552                                         ereport(ERROR,
6553                                                         errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6554                                                         errmsg("invalid Unicode code point: %04X", unicode));
6555
6556                                 if (pair_first)
6557                                 {
6558                                         if (is_utf16_surrogate_second(unicode))
6559                                         {
6560                                                 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6561                                                 pair_first = 0;
6562                                         }
6563                                         else
6564                                                 goto invalid_pair;
6565                                 }
6566                                 else if (is_utf16_surrogate_second(unicode))
6567                                         goto invalid_pair;
6568
6569                                 if (is_utf16_surrogate_first(unicode))
6570                                         pair_first = unicode;
6571                                 else
6572                                 {
6573                                         pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6574                                         appendStringInfoString(&str, cbuf);
6575                                 }
6576
6577                                 instr += 4 + offset;
6578                                 len -= 4 + offset;
6579                         }
6580                         else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
6581                         {
6582                                 pg_wchar        unicode;
6583
6584                                 unicode = hexval_n(instr + 2, 6);
6585
6586                                 if (!is_valid_unicode_codepoint(unicode))
6587                                         ereport(ERROR,
6588                                                         errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6589                                                         errmsg("invalid Unicode code point: %04X", unicode));
6590
6591                                 if (pair_first)
6592                                 {
6593                                         if (is_utf16_surrogate_second(unicode))
6594                                         {
6595                                                 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6596                                                 pair_first = 0;
6597                                         }
6598                                         else
6599                                                 goto invalid_pair;
6600                                 }
6601                                 else if (is_utf16_surrogate_second(unicode))
6602                                         goto invalid_pair;
6603
6604                                 if (is_utf16_surrogate_first(unicode))
6605                                         pair_first = unicode;
6606                                 else
6607                                 {
6608                                         pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6609                                         appendStringInfoString(&str, cbuf);
6610                                 }
6611
6612                                 instr += 8;
6613                                 len -= 8;
6614                         }
6615                         else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
6616                         {
6617                                 pg_wchar        unicode;
6618
6619                                 unicode = hexval_n(instr + 2, 8);
6620
6621                                 if (!is_valid_unicode_codepoint(unicode))
6622                                         ereport(ERROR,
6623                                                         errcode(ERRCODE_INVALID_PARAMETER_VALUE),
6624                                                         errmsg("invalid Unicode code point: %04X", unicode));
6625
6626                                 if (pair_first)
6627                                 {
6628                                         if (is_utf16_surrogate_second(unicode))
6629                                         {
6630                                                 unicode = surrogate_pair_to_codepoint(pair_first, unicode);
6631                                                 pair_first = 0;
6632                                         }
6633                                         else
6634                                                 goto invalid_pair;
6635                                 }
6636                                 else if (is_utf16_surrogate_second(unicode))
6637                                         goto invalid_pair;
6638
6639                                 if (is_utf16_surrogate_first(unicode))
6640                                         pair_first = unicode;
6641                                 else
6642                                 {
6643                                         pg_unicode_to_server(unicode, (unsigned char *) cbuf);
6644                                         appendStringInfoString(&str, cbuf);
6645                                 }
6646
6647                                 instr += 10;
6648                                 len -= 10;
6649                         }
6650                         else
6651                                 ereport(ERROR,
6652                                                 (errcode(ERRCODE_SYNTAX_ERROR),
6653                                                  errmsg("invalid Unicode escape"),
6654                                                  errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6655                 }
6656                 else
6657                 {
6658                         if (pair_first)
6659                                 goto invalid_pair;
6660
6661                         appendStringInfoChar(&str, *instr++);
6662                         len--;
6663                 }
6664         }
6665
6666         /* unfinished surrogate pair? */
6667         if (pair_first)
6668                 goto invalid_pair;
6669
6670         result = cstring_to_text_with_len(str.data, str.len);
6671         pfree(str.data);
6672
6673         PG_RETURN_TEXT_P(result);
6674
6675 invalid_pair:
6676         ereport(ERROR,
6677                         (errcode(ERRCODE_SYNTAX_ERROR),
6678                          errmsg("invalid Unicode surrogate pair")));
6679         PG_RETURN_NULL();                       /* keep compiler quiet */
6680 }