src/backend/utils/adt/regexp.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * regexp.c
   4  *        Postgres' interface to the regular expression package.
   5  *
   6  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL$
  12  *
  13  *              Alistair Crooks added the code for the regex caching
  14  *              agc - cached the regular expressions used - there's a good chance
  15  *              that we'll get a hit, so this saves a compile step for every
  16  *              attempted match. I haven't actually measured the speed improvement,
  17  *              but it `looks' a lot quicker visually when watching regression
  18  *              test output.
  19  *
  20  *              agc - incorporated Keith Bostic's Berkeley regex code into
  21  *              the tree for all ports. To distinguish this regex code from any that
  22  *              is existent on a platform, I've prepended the string "pg_" to
  23  *              the functions regcomp, regerror, regexec and regfree.
  24  *              Fixed a bug that was originally a typo by me, where `i' was used
  25  *              instead of `oldest' when compiling regular expressions - benign
  26  *              results mostly, although occasionally it bit you...
  27  *
  28  *-------------------------------------------------------------------------
  29  */
  30 #include "postgres.h"
  31
  32 #include "catalog/pg_type.h"
  33 #include "funcapi.h"
  34 #include "regex/regex.h"
  35 #include "utils/builtins.h"
  36 #include "utils/guc.h"
  37
     /*
      * Fetch argument _n with PG_GETARG_TEXT_PP when the call supplied more
      * than _n arguments (per PG_NARGS); otherwise evaluate to NULL.
      */
  38 #define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
  39         (PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)
  40
  41
  42 /*
      * GUC-settable flavor parameter.  Its current value is the base set of
      * compile flags used by the entry points below: the operator functions
      * pass it (optionally OR'ed with REG_ICASE) straight to the compiler, and
      * parse_re_flags() uses it as the starting point before applying option
      * letters.  Starts out as REG_ADVANCED.
      */
  43 int                     regex_flavor = REG_ADVANCED;
  44
  45
  46 /*
      * All the options of interest for regex functions, as filled in by
      * parse_re_flags() from an option-letter string.
      */
  47 typedef struct pg_re_flags
  48 {
  49         int                     cflags;                 /* compile flags for Spencer's regex code; starts from regex_flavor */
  50         bool            glob;                   /* do it globally (for each occurrence); turned on by option 'g' */
  51 } pg_re_flags;
  52
  53 /*
      * Cross-call state for regexp_matches(), also regexp_split().
      * setup_regexp_matches() returns a pointer to one of these, and the
      * cleanup/build helpers declared further down take one as their argument.
      */
  54 typedef struct regexp_matches_ctx
  55 {
  56         text       *orig_str;           /* data string in original TEXT form */
  57         int                     nmatches;               /* number of places where pattern matched */
  58         int                     npatterns;              /* number of capturing subpatterns */
  59         /* We store start char index and end+1 char index for each match */
  60         /* so the number of entries in match_locs is nmatches * npatterns * 2 */
  61         int                *match_locs;         /* 0-based character indexes */
  62         int                     next_match;             /* 0-based index of next match to process */
  63         /* workspace for build_regexp_matches_result() */
  64         Datum      *elems;                      /* has npatterns elements */
  65         bool       *nulls;                      /* has npatterns elements */
  66 } regexp_matches_ctx;
  67
  68 /*
  69  * We cache precompiled regular expressions using a "self organizing list"
  70  * structure, in which recently-used items tend to be near the front.
  71  * Whenever we use an entry, it's moved up to the front of the list.
  72  * Over time, an item's average position corresponds to its frequency of use.
  73  *
  74  * When we first create an entry, it's inserted at the front of
  75  * the array, dropping the entry at the end of the array if necessary to
  76  * make room.  (This might seem to be weighting the new entry too heavily,
  77  * but if we insert new entries further back, we'll be unable to adjust to
  78  * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
  79  * never-before-seen items used circularly.  We ought to be able to handle
  80  * that case, so we have to insert at the front.)
  81  *
  82  * Knuth mentions a variant strategy in which a used item is moved up just
  83  * one place in the list.  Although he says this uses fewer comparisons on
  84  * average, it seems not to adapt very well to the situation where you have
  85  * both some reusable patterns and a steady stream of non-reusable patterns.
  86  * A reusable pattern that isn't used at least as often as non-reusable
  87  * patterns are seen will "fail to keep up" and will drop off the end of the
  88  * cache.  With move-to-front, a reusable pattern is guaranteed to stay in
  89  * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
  90  */
  91
  92 /*
      * This is the maximum number of cached regular expressions; it is the
      * dimension of re_array.  The #ifndef guard lets a definition supplied
      * before this point take precedence over the default of 32.
      */
  93 #ifndef MAX_CACHED_RES
  94 #define MAX_CACHED_RES  32
  95 #endif
  96
  97 /*
      * This structure describes one cached regular expression.
      *
      * RE_compile_and_cache() treats (cre_pat, cre_pat_len, cre_flags) as the
      * lookup key.  cre_pat is obtained with malloc() and is released with
      * free(), and cre_re is released with pg_regfree(), when an entry is
      * dropped from the cache.
      */
  98 typedef struct cached_re_str
  99 {
 100         char       *cre_pat;            /* original RE (not null terminated!) */
 101         int                     cre_pat_len;    /* length of original RE, in bytes */
 102         int                     cre_flags;              /* compile flags: extended,icase etc */
 103         regex_t         cre_re;                 /* the compiled regular expression */
 104 } cached_re_str;
 105
     /*
      * The cache itself.  Entries 0 .. num_res-1 of re_array are in use;
      * RE_compile_and_cache() keeps the most recently used entry in slot 0.
      */
 106 static int      num_res = 0;            /* # of cached re's */
 107 static cached_re_str re_array[MAX_CACHED_RES];  /* cached re's */
 108
 109
 110 /*
      * Local functions: prototypes for the static helpers that create,
      * consume and release a regexp_matches_ctx.
      */
 111 static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
 112                                          text *flags,
 113                                          bool force_glob,
 114                                          bool use_subpatterns,
 115                                          bool ignore_degenerate);
 116 static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
 117 static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
 118 static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
 119
 120
 121 /*
 122  * RE_compile_and_cache - compile a RE, caching if possible
 123  *
 124  * Returns regex_t *
 125  *
 126  *      text_re --- the pattern, expressed as a TEXT object
 127  *      cflags --- compile options for the pattern
 128  *
 129  * Pattern is given in the database encoding.  We internally convert to
 130  * an array of pg_wchar, which is what Spencer's regex package wants.
 131  */
 132 static regex_t *
 133 RE_compile_and_cache(text *text_re, int cflags)
 134 {
 135         int                     text_re_len = VARSIZE_ANY_EXHDR(text_re);
 136         char       *text_re_val = VARDATA_ANY(text_re);
 137         pg_wchar   *pattern;
 138         int                     pattern_len;
 139         int                     i;
 140         int                     regcomp_result;
 141         cached_re_str re_temp;
 142         char            errMsg[100];
 143
 144         /*
 145          * Look for a match among previously compiled REs.      Since the data
 146          * structure is self-organizing with most-used entries at the front, our
 147          * search strategy can just be to scan from the front.
 148          */
 149         for (i = 0; i < num_res; i++)
 150         {
 151                 if (re_array[i].cre_pat_len == text_re_len &&
 152                         re_array[i].cre_flags == cflags &&
 153                         memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)
 154                 {
 155                         /*
 156                          * Found a match; move it to front if not there already.
 157                          */
 158                         if (i > 0)
 159                         {
 160                                 re_temp = re_array[i];
 161                                 memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str));
 162                                 re_array[0] = re_temp;
 163                         }
 164
 165                         return &re_array[0].cre_re;
 166                 }
 167         }
 168
 169         /*
 170          * Couldn't find it, so try to compile the new RE.  To avoid leaking
 171          * resources on failure, we build into the re_temp local.
 172          */
 173
 174         /* Convert pattern string to wide characters */
 175         pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
 176         pattern_len = pg_mb2wchar_with_len(text_re_val,
 177                                                                            pattern,
 178                                                                            text_re_len);
 179
 180         regcomp_result = pg_regcomp(&re_temp.cre_re,
 181                                                                 pattern,
 182                                                                 pattern_len,
 183                                                                 cflags);
 184
 185         pfree(pattern);
 186
 187         if (regcomp_result != REG_OKAY)
 188         {
 189                 /* re didn't compile */
 190                 pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
 191                 /* XXX should we pg_regfree here? */
 192                 ereport(ERROR,
 193                                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
 194                                  errmsg("invalid regular expression: %s", errMsg)));
 195         }
 196
 197         /*
 198          * We use malloc/free for the cre_pat field because the storage has to
 199          * persist across transactions, and because we want to get control back on
 200          * out-of-memory.  The Max() is because some malloc implementations return
 201          * NULL for malloc(0).
 202          */
 203         re_temp.cre_pat = malloc(Max(text_re_len, 1));
 204         if (re_temp.cre_pat == NULL)
 205         {
 206                 pg_regfree(&re_temp.cre_re);
 207                 ereport(ERROR,
 208                                 (errcode(ERRCODE_OUT_OF_MEMORY),
 209                                  errmsg("out of memory")));
 210         }
 211         memcpy(re_temp.cre_pat, text_re_val, text_re_len);
 212         re_temp.cre_pat_len = text_re_len;
 213         re_temp.cre_flags = cflags;
 214
 215         /*
 216          * Okay, we have a valid new item in re_temp; insert it into the storage
 217          * array.  Discard last entry if needed.
 218          */
 219         if (num_res >= MAX_CACHED_RES)
 220         {
 221                 --num_res;
 222                 Assert(num_res < MAX_CACHED_RES);
 223                 pg_regfree(&re_array[num_res].cre_re);
 224                 free(re_array[num_res].cre_pat);
 225         }
 226
 227         if (num_res > 0)
 228                 memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));
 229
 230         re_array[0] = re_temp;
 231         num_res++;
 232
 233         return &re_array[0].cre_re;
 234 }
 235
 236 /*
 237  * RE_wchar_execute - execute a RE on pg_wchar data
 238  *
 239  * Returns TRUE on match, FALSE on no match
 240  *
 241  *      re --- the compiled pattern as returned by RE_compile_and_cache
 242  *      data --- the data to match against (need not be null-terminated)
 243  *      data_len --- the length of the data string
 244  *      start_search -- the offset in the data to start searching
 245  *      nmatch, pmatch  --- optional return area for match details
 246  *
 247  * Data is given as array of pg_wchar which is what Spencer's regex package
 248  * wants.
 249  */
 250 static bool
 251 RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
 252                                  int start_search, int nmatch, regmatch_t *pmatch)
 253 {
 254         int                     regexec_result;
 255         char            errMsg[100];
 256
 257         /* Perform RE match and return result */
 258         regexec_result = pg_regexec(re,
 259                                                                 data,
 260                                                                 data_len,
 261                                                                 start_search,
 262                                                                 NULL,   /* no details */
 263                                                                 nmatch,
 264                                                                 pmatch,
 265                                                                 0);
 266
 267         if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
 268         {
 269                 /* re failed??? */
 270                 pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
 271                 ereport(ERROR,
 272                                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
 273                                  errmsg("regular expression failed: %s", errMsg)));
 274         }
 275
 276         return (regexec_result == REG_OKAY);
 277 }
 278
 279 /*
 280  * RE_execute - execute a RE
 281  *
 282  * Returns TRUE on match, FALSE on no match
 283  *
 284  *      re --- the compiled pattern as returned by RE_compile_and_cache
 285  *      dat --- the data to match against (need not be null-terminated)
 286  *      dat_len --- the length of the data string
 287  *      nmatch, pmatch  --- optional return area for match details
 288  *
 289  * Data is given in the database encoding.      We internally
 290  * convert to array of pg_wchar which is what Spencer's regex package wants.
 291  */
 292 static bool
 293 RE_execute(regex_t *re, char *dat, int dat_len,
 294                    int nmatch, regmatch_t *pmatch)
 295 {
 296         pg_wchar   *data;
 297         int                     data_len;
 298         bool            match;
 299
 300         /* Convert data string to wide characters */
 301         data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
 302         data_len = pg_mb2wchar_with_len(dat, data, dat_len);
 303
 304         /* Perform RE match and return result */
 305         match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
 306
 307         pfree(data);
 308         return match;
 309 }
 310
 311 /*
 312  * RE_compile_and_execute - compile and execute a RE
 313  *
 314  * Returns TRUE on match, FALSE on no match
 315  *
 316  *      text_re --- the pattern, expressed as a TEXT object
 317  *      dat --- the data to match against (need not be null-terminated)
 318  *      dat_len --- the length of the data string
 319  *      cflags --- compile options for the pattern
 320  *      nmatch, pmatch  --- optional return area for match details
 321  *
 322  * Both pattern and data are given in the database encoding.  We internally
 323  * convert to array of pg_wchar which is what Spencer's regex package wants.
 324  */
 325 static bool
 326 RE_compile_and_execute(text *text_re, char *dat, int dat_len,
 327                                            int cflags, int nmatch, regmatch_t *pmatch)
 328 {
 329         regex_t    *re;
 330
 331         /* Compile RE */
 332         re = RE_compile_and_cache(text_re, cflags);
 333
 334         return RE_execute(re, dat, dat_len, nmatch, pmatch);
 335 }
 336
 337
 338 /*
 339  * parse_re_flags - parse the options argument of regexp_matches and friends
 340  *
 341  *      flags --- output argument, filled with desired options
 342  *      opts --- TEXT object, or NULL for defaults
 343  *
 344  * This accepts all the options allowed by any of the callers; callers that
 345  * don't want some have to reject them after the fact.
 346  */
 347 static void
 348 parse_re_flags(pg_re_flags *flags, text *opts)
 349 {
 350         /* regex_flavor is always folded into the compile flags */
 351         flags->cflags = regex_flavor;
 352         flags->glob = false;
 353
 354         if (opts)
 355         {
 356                 char       *opt_p = VARDATA_ANY(opts);
 357                 int                     opt_len = VARSIZE_ANY_EXHDR(opts);
 358                 int                     i;
 359
 360                 for (i = 0; i < opt_len; i++)
 361                 {
 362                         switch (opt_p[i])
 363                         {
 364                                 case 'g':
 365                                         flags->glob = true;
 366                                         break;
 367                                 case 'b':               /* BREs (but why???) */
 368                                         flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
 369                                         break;
 370                                 case 'c':               /* case sensitive */
 371                                         flags->cflags &= ~REG_ICASE;
 372                                         break;
 373                                 case 'e':               /* plain EREs */
 374                                         flags->cflags |= REG_EXTENDED;
 375                                         flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
 376                                         break;
 377                                 case 'i':               /* case insensitive */
 378                                         flags->cflags |= REG_ICASE;
 379                                         break;
 380                                 case 'm':               /* Perloid synonym for n */
 381                                 case 'n':               /* \n affects ^ $ . [^ */
 382                                         flags->cflags |= REG_NEWLINE;
 383                                         break;
 384                                 case 'p':               /* ~Perl, \n affects . [^ */
 385                                         flags->cflags |= REG_NLSTOP;
 386                                         flags->cflags &= ~REG_NLANCH;
 387                                         break;
 388                                 case 'q':               /* literal string */
 389                                         flags->cflags |= REG_QUOTE;
 390                                         flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
 391                                         break;
 392                                 case 's':               /* single line, \n ordinary */
 393                                         flags->cflags &= ~REG_NEWLINE;
 394                                         break;
 395                                 case 't':               /* tight syntax */
 396                                         flags->cflags &= ~REG_EXPANDED;
 397                                         break;
 398                                 case 'w':               /* weird, \n affects ^ $ only */
 399                                         flags->cflags &= ~REG_NLSTOP;
 400                                         flags->cflags |= REG_NLANCH;
 401                                         break;
 402                                 case 'x':               /* expanded syntax */
 403                                         flags->cflags |= REG_EXPANDED;
 404                                         break;
 405                                 default:
 406                                         ereport(ERROR,
 407                                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 408                                                          errmsg("invalid regexp option: \"%c\"",
 409                                                                         opt_p[i])));
 410                                         break;
 411                         }
 412                 }
 413         }
 414 }
 415
 416
 417 /*
 418  * report whether regex_flavor is currently BASIC
 419  */
 420 bool
 421 regex_flavor_is_basic(void)
 422 {
 423         return (regex_flavor == REG_BASIC);
 424 }
 425
 426
 427 /*
 428  *      interface routines called by the function manager
 429  */
 430
 431 Datum
 432 nameregexeq(PG_FUNCTION_ARGS)
 433 {
 434         Name            n = PG_GETARG_NAME(0);
 435         text       *p = PG_GETARG_TEXT_PP(1);
 436
 437         PG_RETURN_BOOL(RE_compile_and_execute(p,
 438                                                                                   NameStr(*n),
 439                                                                                   strlen(NameStr(*n)),
 440                                                                                   regex_flavor,
 441                                                                                   0, NULL));
 442 }
 443
 444 Datum
 445 nameregexne(PG_FUNCTION_ARGS)
 446 {
 447         Name            n = PG_GETARG_NAME(0);
 448         text       *p = PG_GETARG_TEXT_PP(1);
 449
 450         PG_RETURN_BOOL(!RE_compile_and_execute(p,
 451                                                                                    NameStr(*n),
 452                                                                                    strlen(NameStr(*n)),
 453                                                                                    regex_flavor,
 454                                                                                    0, NULL));
 455 }
 456
 457 Datum
 458 textregexeq(PG_FUNCTION_ARGS)
 459 {
 460         text       *s = PG_GETARG_TEXT_PP(0);
 461         text       *p = PG_GETARG_TEXT_PP(1);
 462
 463         PG_RETURN_BOOL(RE_compile_and_execute(p,
 464                                                                                   VARDATA_ANY(s),
 465                                                                                   VARSIZE_ANY_EXHDR(s),
 466                                                                                   regex_flavor,
 467                                                                                   0, NULL));
 468 }
 469
 470 Datum
 471 textregexne(PG_FUNCTION_ARGS)
 472 {
 473         text       *s = PG_GETARG_TEXT_PP(0);
 474         text       *p = PG_GETARG_TEXT_PP(1);
 475
 476         PG_RETURN_BOOL(!RE_compile_and_execute(p,
 477                                                                                    VARDATA_ANY(s),
 478                                                                                    VARSIZE_ANY_EXHDR(s),
 479                                                                                    regex_flavor,
 480                                                                                    0, NULL));
 481 }
 482
 483
 484 /*
 485  *      routines that use the regexp stuff, but ignore the case.
 486  *      for this, we use the REG_ICASE flag to pg_regcomp
 487  */
 488
 489
 490 Datum
 491 nameicregexeq(PG_FUNCTION_ARGS)
 492 {
 493         Name            n = PG_GETARG_NAME(0);
 494         text       *p = PG_GETARG_TEXT_PP(1);
 495
 496         PG_RETURN_BOOL(RE_compile_and_execute(p,
 497                                                                                   NameStr(*n),
 498                                                                                   strlen(NameStr(*n)),
 499                                                                                   regex_flavor | REG_ICASE,
 500                                                                                   0, NULL));
 501 }
 502
 503 Datum
 504 nameicregexne(PG_FUNCTION_ARGS)
 505 {
 506         Name            n = PG_GETARG_NAME(0);
 507         text       *p = PG_GETARG_TEXT_PP(1);
 508
 509         PG_RETURN_BOOL(!RE_compile_and_execute(p,
 510                                                                                    NameStr(*n),
 511                                                                                    strlen(NameStr(*n)),
 512                                                                                    regex_flavor | REG_ICASE,
 513                                                                                    0, NULL));
 514 }
 515
 516 Datum
 517 texticregexeq(PG_FUNCTION_ARGS)
 518 {
 519         text       *s = PG_GETARG_TEXT_PP(0);
 520         text       *p = PG_GETARG_TEXT_PP(1);
 521
 522         PG_RETURN_BOOL(RE_compile_and_execute(p,
 523                                                                                   VARDATA_ANY(s),
 524                                                                                   VARSIZE_ANY_EXHDR(s),
 525                                                                                   regex_flavor | REG_ICASE,
 526                                                                                   0, NULL));
 527 }
 528
 529 Datum
 530 texticregexne(PG_FUNCTION_ARGS)
 531 {
 532         text       *s = PG_GETARG_TEXT_PP(0);
 533         text       *p = PG_GETARG_TEXT_PP(1);
 534
 535         PG_RETURN_BOOL(!RE_compile_and_execute(p,
 536                                                                                    VARDATA_ANY(s),
 537                                                                                    VARSIZE_ANY_EXHDR(s),
 538                                                                                    regex_flavor | REG_ICASE,
 539                                                                                    0, NULL));
 540 }
 541
 542
 543 /*
 544  * textregexsubstr()
 545  *              Return a substring matched by a regular expression.
 546  */
 547 Datum
 548 textregexsubstr(PG_FUNCTION_ARGS)
 549 {
 550         text       *s = PG_GETARG_TEXT_PP(0);
 551         text       *p = PG_GETARG_TEXT_PP(1);
 552         regex_t    *re;
 553         regmatch_t      pmatch[2];
 554         int                     so,
 555                                 eo;
 556
 557         /* Compile RE */
 558         re = RE_compile_and_cache(p, regex_flavor);
 559
 560         /*
 561          * We pass two regmatch_t structs to get info about the overall match and
 562          * the match for the first parenthesized subexpression (if any). If there
 563          * is a parenthesized subexpression, we return what it matched; else
 564          * return what the whole regexp matched.
 565          */
 566         if (!RE_execute(re,
 567                                         VARDATA_ANY(s), VARSIZE_ANY_EXHDR(s),
 568                                         2, pmatch))
 569                 PG_RETURN_NULL();               /* definitely no match */
 570
 571         if (re->re_nsub > 0)
 572         {
 573                 /* has parenthesized subexpressions, use the first one */
 574                 so = pmatch[1].rm_so;
 575                 eo = pmatch[1].rm_eo;
 576         }
 577         else
 578         {
 579                 /* no parenthesized subexpression, use whole match */
 580                 so = pmatch[0].rm_so;
 581                 eo = pmatch[0].rm_eo;
 582         }
 583
 584         /*
 585          * It is possible to have a match to the whole pattern but no match for a
 586          * subexpression; for example 'foo(bar)?' is considered to match 'foo' but
 587          * there is no subexpression match.  So this extra test for match failure
 588          * is not redundant.
 589          */
 590         if (so < 0 || eo < 0)
 591                 PG_RETURN_NULL();
 592
 593         return DirectFunctionCall3(text_substr,
 594                                                            PointerGetDatum(s),
 595                                                            Int32GetDatum(so + 1),
 596                                                            Int32GetDatum(eo - so));
 597 }
 598
 599 /*
 600  * textregexreplace_noopt()
 601  *              Return a string matched by a regular expression, with replacement.
 602  *
 603  * This version doesn't have an option argument: we default to case
 604  * sensitive match, replace the first instance only.
 605  */
 606 Datum
 607 textregexreplace_noopt(PG_FUNCTION_ARGS)
 608 {
 609         text       *s = PG_GETARG_TEXT_PP(0);
 610         text       *p = PG_GETARG_TEXT_PP(1);
 611         text       *r = PG_GETARG_TEXT_PP(2);
 612         regex_t    *re;
 613
 614         re = RE_compile_and_cache(p, regex_flavor);
 615
 616         PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false));
 617 }
 618
 619 /*
 620  * textregexreplace()
 621  *              Return a string matched by a regular expression, with replacement.
 622  */
 623 Datum
 624 textregexreplace(PG_FUNCTION_ARGS)
 625 {
 626         text       *s = PG_GETARG_TEXT_PP(0);
 627         text       *p = PG_GETARG_TEXT_PP(1);
 628         text       *r = PG_GETARG_TEXT_PP(2);
 629         text       *opt = PG_GETARG_TEXT_PP(3);
 630         regex_t    *re;
 631         pg_re_flags flags;
 632
 633         parse_re_flags(&flags, opt);
 634
 635         re = RE_compile_and_cache(p, flags.cflags);
 636
 637         PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
 638 }
 639
 640 /*
 641  * similar_escape()
 642  * Convert a SQL99 regexp pattern to POSIX style, so it can be used by
 643  * our regexp engine.
 644  */
 645 Datum
 646 similar_escape(PG_FUNCTION_ARGS)
 647 {
 648         text       *pat_text;
 649         text       *esc_text;
 650         text       *result;
 651         char       *p,
 652                            *e,
 653                            *r;
 654         int                     plen,
 655                                 elen;
 656         bool            afterescape = false;
 657         int                     nquotes = 0;
 658
 659         /* This function is not strict, so must test explicitly */
 660         if (PG_ARGISNULL(0))
 661                 PG_RETURN_NULL();
 662         pat_text = PG_GETARG_TEXT_PP(0);
 663         p = VARDATA_ANY(pat_text);
 664         plen = VARSIZE_ANY_EXHDR(pat_text);
 665         if (PG_ARGISNULL(1))
 666         {
 667                 /* No ESCAPE clause provided; default to backslash as escape */
 668                 e = "\\";
 669                 elen = 1;
 670         }
 671         else
 672         {
 673                 esc_text = PG_GETARG_TEXT_PP(1);
 674                 e = VARDATA_ANY(esc_text);
 675                 elen = VARSIZE_ANY_EXHDR(esc_text);
 676                 if (elen == 0)
 677                         e = NULL;                       /* no escape character */
 678                 else if (elen != 1)
 679                         ereport(ERROR,
 680                                         (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
 681                                          errmsg("invalid escape string"),
 682                                   errhint("Escape string must be empty or one character.")));
 683         }
 684
 685         /*----------
 686          * We surround the transformed input string with
 687          *                      ***:^(?: ... )$
 688          * which is bizarre enough to require some explanation.  "***:" is a
 689          * director prefix to force the regex to be treated as an ARE regardless
 690          * of the current regex_flavor setting.  We need "^" and "$" to force
 691          * the pattern to match the entire input string as per SQL99 spec.      The
 692          * "(?:" and ")" are a non-capturing set of parens; we have to have
 693          * parens in case the string contains "|", else the "^" and "$" will
 694          * be bound into the first and last alternatives which is not what we
 695          * want, and the parens must be non capturing because we don't want them
 696          * to count when selecting output for SUBSTRING.
 697          *----------
 698          */
 699
 700         /*
 701          * We need room for the prefix/postfix plus as many as 2 output bytes per
 702          * input byte
 703          */
 704         result = (text *) palloc(VARHDRSZ + 10 + 2 * plen);
 705         r = VARDATA(result);
 706
 707         *r++ = '*';
 708         *r++ = '*';
 709         *r++ = '*';
 710         *r++ = ':';
 711         *r++ = '^';
 712         *r++ = '(';
 713         *r++ = '?';
 714         *r++ = ':';
 715
 716         while (plen > 0)
 717         {
 718                 char            pchar = *p;
 719
 720                 if (afterescape)
 721                 {
 722                         if (pchar == '"')       /* for SUBSTRING patterns */
 723                                 *r++ = ((nquotes++ % 2) == 0) ? '(' : ')';
 724                         else
 725                         {
 726                                 *r++ = '\\';
 727                                 *r++ = pchar;
 728                         }
 729                         afterescape = false;
 730                 }
 731                 else if (e && pchar == *e)
 732                 {
 733                         /* SQL99 escape character; do not send to output */
 734                         afterescape = true;
 735                 }
 736                 else if (pchar == '%')
 737                 {
 738                         *r++ = '.';
 739                         *r++ = '*';
 740                 }
 741                 else if (pchar == '_')
 742                         *r++ = '.';
 743                 else if (pchar == '\\' || pchar == '.' || pchar == '?' ||
 744                                  pchar == '{')
 745                 {
 746                         *r++ = '\\';
 747                         *r++ = pchar;
 748                 }
 749                 else
 750                         *r++ = pchar;
 751                 p++, plen--;
 752         }
 753
 754         *r++ = ')';
 755         *r++ = '$';
 756
 757         SET_VARSIZE(result, r - ((char *) result));
 758
 759         PG_RETURN_TEXT_P(result);
 760 }
 761
 762 /*
 763  * regexp_matches()
 764  *              Return a table of matches of a pattern within a string.
 765  */
 766 Datum
 767 regexp_matches(PG_FUNCTION_ARGS)
 768 {
 769         FuncCallContext *funcctx;
 770         regexp_matches_ctx *matchctx;
 771
 772         if (SRF_IS_FIRSTCALL())
 773         {
 774                 text       *pattern = PG_GETARG_TEXT_PP(1);
 775                 text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
 776                 MemoryContext oldcontext;
 777
 778                 funcctx = SRF_FIRSTCALL_INIT();
 779                 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 780
 781                 /* be sure to copy the input string into the multi-call ctx */
 782                 matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
 783                                                                                 flags, false, true, false);
 784
 785                 /* Pre-create workspace that build_regexp_matches_result needs */
 786                 matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
 787                 matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
 788
 789                 MemoryContextSwitchTo(oldcontext);
 790                 funcctx->user_fctx = (void *) matchctx;
 791         }
 792
 793         funcctx = SRF_PERCALL_SETUP();
 794         matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
 795
 796         if (matchctx->next_match < matchctx->nmatches)
 797         {
 798                 ArrayType  *result_ary;
 799
 800                 result_ary = build_regexp_matches_result(matchctx);
 801                 matchctx->next_match++;
 802                 SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
 803         }
 804
 805         /* release space in multi-call ctx to avoid intraquery memory leak */
 806         cleanup_regexp_matches(matchctx);
 807
 808         SRF_RETURN_DONE(funcctx);
 809 }
 810
 811 /* This is separate to keep the opr_sanity regression test from complaining */
 812 Datum
 813 regexp_matches_no_flags(PG_FUNCTION_ARGS)
 814 {
 815         return regexp_matches(fcinfo);
 816 }
 817
 818 /*
 819  * setup_regexp_matches --- do the initial matching for regexp_matches()
 820  *              or regexp_split()
 821  *
 822  * To avoid having to re-find the compiled pattern on each call, we do
 823  * all the matching in one swoop.  The returned regexp_matches_ctx contains
 824  * the locations of all the substrings matching the pattern.
 825  *
 826  * The three bool parameters have only two patterns (one for each caller)
 827  * but it seems clearer to distinguish the functionality this way than to
 828  * key it all off one "is_split" flag.
 829  */
 830 static regexp_matches_ctx *
 831 setup_regexp_matches(text *orig_str, text *pattern, text *flags,
 832                                          bool force_glob, bool use_subpatterns,
 833                                          bool ignore_degenerate)
 834 {
 835         regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
 836         int                     orig_len;
 837         pg_wchar   *wide_str;
 838         int                     wide_len;
 839         pg_re_flags re_flags;
 840         regex_t    *cpattern;
 841         regmatch_t *pmatch;
 842         int                     pmatch_len;
 843         int                     array_len;
 844         int                     array_idx;
 845         int                     prev_match_end;
 846         int                     start_search;
 847
 848         /* save original string --- we'll extract result substrings from it */
 849         matchctx->orig_str = orig_str;
 850
 851         /* convert string to pg_wchar form for matching */
 852         orig_len = VARSIZE_ANY_EXHDR(orig_str);
 853         wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
 854         wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
 855
 856         /* determine options */
 857         parse_re_flags(&re_flags, flags);
 858         if (force_glob)
 859         {
 860                 /* user mustn't specify 'g' for regexp_split */
 861                 if (re_flags.glob)
 862                         ereport(ERROR,
 863                                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 864                                  errmsg("regexp_split does not support the global option")));
 865                 /* but we find all the matches anyway */
 866                 re_flags.glob = true;
 867         }
 868
 869         /* set up the compiled pattern */
 870         cpattern = RE_compile_and_cache(pattern, re_flags.cflags);
 871
 872         /* do we want to remember subpatterns? */
 873         if (use_subpatterns && cpattern->re_nsub > 0)
 874         {
 875                 matchctx->npatterns = cpattern->re_nsub;
 876                 pmatch_len = cpattern->re_nsub + 1;
 877         }
 878         else
 879         {
 880                 use_subpatterns = false;
 881                 matchctx->npatterns = 1;
 882                 pmatch_len = 1;
 883         }
 884
 885         /* temporary output space for RE package */
 886         pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
 887
 888         /* the real output space (grown dynamically if needed) */
 889         array_len = re_flags.glob ? 256 : 32;
 890         matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
 891         array_idx = 0;
 892
 893         /* search for the pattern, perhaps repeatedly */
 894         prev_match_end = 0;
 895         start_search = 0;
 896         while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
 897                                                         pmatch_len, pmatch))
 898         {
 899                 /*
 900                  * If requested, ignore degenerate matches, which are zero-length
 901                  * matches occurring at the start or end of a string or just after a
 902                  * previous match.
 903                  */
 904                 if (!ignore_degenerate ||
 905                         (pmatch[0].rm_so < wide_len &&
 906                          pmatch[0].rm_eo > prev_match_end))
 907                 {
 908                         /* enlarge output space if needed */
 909                         while (array_idx + matchctx->npatterns * 2 > array_len)
 910                         {
 911                                 array_len *= 2;
 912                                 matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
 913                                                                                                         sizeof(int) * array_len);
 914                         }
 915
 916                         /* save this match's locations */
 917                         if (use_subpatterns)
 918                         {
 919                                 int                     i;
 920
 921                                 for (i = 1; i <= matchctx->npatterns; i++)
 922                                 {
 923                                         matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
 924                                         matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
 925                                 }
 926                         }
 927                         else
 928                         {
 929                                 matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
 930                                 matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
 931                         }
 932                         matchctx->nmatches++;
 933                 }
 934                 prev_match_end = pmatch[0].rm_eo;
 935
 936                 /* if not glob, stop after one match */
 937                 if (!re_flags.glob)
 938                         break;
 939
 940                 /*
 941                  * Advance search position.  Normally we start just after the end of
 942                  * the previous match, but always advance at least one character (the
 943                  * special case can occur if the pattern matches zero characters just
 944                  * after the prior match or at the end of the string).
 945                  */
 946                 if (start_search < pmatch[0].rm_eo)
 947                         start_search = pmatch[0].rm_eo;
 948                 else
 949                         start_search++;
 950                 if (start_search > wide_len)
 951                         break;
 952         }
 953
 954         /* Clean up temp storage */
 955         pfree(wide_str);
 956         pfree(pmatch);
 957
 958         return matchctx;
 959 }
 960
 961 /*
 962  * cleanup_regexp_matches - release memory of a regexp_matches_ctx
 963  */
 964 static void
 965 cleanup_regexp_matches(regexp_matches_ctx *matchctx)
 966 {
 967         pfree(matchctx->orig_str);
 968         pfree(matchctx->match_locs);
 969         if (matchctx->elems)
 970                 pfree(matchctx->elems);
 971         if (matchctx->nulls)
 972                 pfree(matchctx->nulls);
 973         pfree(matchctx);
 974 }
 975
 976 /*
 977  * build_regexp_matches_result - build output array for current match
 978  */
 979 static ArrayType *
 980 build_regexp_matches_result(regexp_matches_ctx *matchctx)
 981 {
 982         Datum      *elems = matchctx->elems;
 983         bool       *nulls = matchctx->nulls;
 984         int                     dims[1];
 985         int                     lbs[1];
 986         int                     loc;
 987         int                     i;
 988
 989         /* Extract matching substrings from the original string */
 990         loc = matchctx->next_match * matchctx->npatterns * 2;
 991         for (i = 0; i < matchctx->npatterns; i++)
 992         {
 993                 int                     so = matchctx->match_locs[loc++];
 994                 int                     eo = matchctx->match_locs[loc++];
 995
 996                 if (so < 0 || eo < 0)
 997                 {
 998                         elems[i] = (Datum) 0;
 999                         nulls[i] = true;
1000                 }
1001                 else
1002                 {
1003                         elems[i] = DirectFunctionCall3(text_substr,
1004                                                                                  PointerGetDatum(matchctx->orig_str),
1005                                                                                    Int32GetDatum(so + 1),
1006                                                                                    Int32GetDatum(eo - so));
1007                         nulls[i] = false;
1008                 }
1009         }
1010
1011         /* And form an array */
1012         dims[0] = matchctx->npatterns;
1013         lbs[0] = 1;
1014         /* XXX: this hardcodes assumptions about the text type */
1015         return construct_md_array(elems, nulls, 1, dims, lbs,
1016                                                           TEXTOID, -1, false, 'i');
1017 }
1018
1019 /*
1020  * regexp_split_to_table()
1021  *              Split the string at matches of the pattern, returning the
1022  *              split-out substrings as a table.
1023  */
1024 Datum
1025 regexp_split_to_table(PG_FUNCTION_ARGS)
1026 {
1027         FuncCallContext *funcctx;
1028         regexp_matches_ctx *splitctx;
1029
1030         if (SRF_IS_FIRSTCALL())
1031         {
1032                 text       *pattern = PG_GETARG_TEXT_PP(1);
1033                 text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
1034                 MemoryContext oldcontext;
1035
1036                 funcctx = SRF_FIRSTCALL_INIT();
1037                 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
1038
1039                 /* be sure to copy the input string into the multi-call ctx */
1040                 splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
1041                                                                                 flags, true, false, true);
1042
1043                 MemoryContextSwitchTo(oldcontext);
1044                 funcctx->user_fctx = (void *) splitctx;
1045         }
1046
1047         funcctx = SRF_PERCALL_SETUP();
1048         splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
1049
1050         if (splitctx->next_match <= splitctx->nmatches)
1051         {
1052                 Datum           result = build_regexp_split_result(splitctx);
1053
1054                 splitctx->next_match++;
1055                 SRF_RETURN_NEXT(funcctx, result);
1056         }
1057
1058         /* release space in multi-call ctx to avoid intraquery memory leak */
1059         cleanup_regexp_matches(splitctx);
1060
1061         SRF_RETURN_DONE(funcctx);
1062 }
1063
1064 /* This is separate to keep the opr_sanity regression test from complaining */
1065 Datum
1066 regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
1067 {
1068         return regexp_split_to_table(fcinfo);
1069 }
1070
1071 /*
1072  * regexp_split_to_array()
1073  *              Split the string at matches of the pattern, returning the
1074  *              split-out substrings as an array.
1075  */
1076 Datum
1077 regexp_split_to_array(PG_FUNCTION_ARGS)
1078 {
1079         ArrayBuildState *astate = NULL;
1080         regexp_matches_ctx *splitctx;
1081
1082         splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
1083                                                                         PG_GETARG_TEXT_PP(1),
1084                                                                         PG_GETARG_TEXT_PP_IF_EXISTS(2),
1085                                                                         true, false, true);
1086
1087         while (splitctx->next_match <= splitctx->nmatches)
1088         {
1089                 astate = accumArrayResult(astate,
1090                                                                   build_regexp_split_result(splitctx),
1091                                                                   false,
1092                                                                   TEXTOID,
1093                                                                   CurrentMemoryContext);
1094                 splitctx->next_match++;
1095         }
1096
1097         /*
1098          * We don't call cleanup_regexp_matches here; it would try to pfree the
1099          * input string, which we didn't copy.  The space is not in a long-lived
1100          * memory context anyway.
1101          */
1102
1103         PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
1104 }
1105
1106 /* This is separate to keep the opr_sanity regression test from complaining */
1107 Datum
1108 regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
1109 {
1110         return regexp_split_to_array(fcinfo);
1111 }
1112
1113 /*
1114  * build_regexp_split_result - build output string for current match
1115  *
1116  * We return the string between the current match and the previous one,
1117  * or the string after the last match when next_match == nmatches.
1118  */
1119 static Datum
1120 build_regexp_split_result(regexp_matches_ctx *splitctx)
1121 {
1122         int                     startpos;
1123         int                     endpos;
1124
1125         if (splitctx->next_match > 0)
1126                 startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
1127         else
1128                 startpos = 0;
1129         if (startpos < 0)
1130                 elog(ERROR, "invalid match ending position");
1131
1132         if (splitctx->next_match < splitctx->nmatches)
1133         {
1134                 endpos = splitctx->match_locs[splitctx->next_match * 2];
1135                 if (endpos < startpos)
1136                         elog(ERROR, "invalid match starting position");
1137                 return DirectFunctionCall3(text_substr,
1138                                                                    PointerGetDatum(splitctx->orig_str),
1139                                                                    Int32GetDatum(startpos + 1),
1140                                                                    Int32GetDatum(endpos - startpos));
1141         }
1142         else
1143         {
1144                 /* no more matches, return rest of string */
1145                 return DirectFunctionCall2(text_substr_no_len,
1146                                                                    PointerGetDatum(splitctx->orig_str),
1147                                                                    Int32GetDatum(startpos + 1));
1148         }
1149 }