Don't use 'return' where you should use 'PG_RETURN_xxx'.
[PostgreSQL.git] / src / backend / utils / adt / regexp.c
blob0c58746819b19e9fa95fe31d9a072860d4a7e361
1 /*-------------------------------------------------------------------------
3 * regexp.c
4 * Postgres' interface to the regular expression package.
6 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
10 * IDENTIFICATION
11 * $PostgreSQL$
13 * Alistair Crooks added the code for the regex caching
14 * agc - cached the regular expressions used - there's a good chance
15 * that we'll get a hit, so this saves a compile step for every
16 * attempted match. I haven't actually measured the speed improvement,
17 * but it `looks' a lot quicker visually when watching regression
18 * test output.
20 * agc - incorporated Keith Bostic's Berkeley regex code into
21 * the tree for all ports. To distinguish this regex code from any that
22 * is existent on a platform, I've prepended the string "pg_" to
23 * the functions regcomp, regerror, regexec and regfree.
24 * Fixed a bug that was originally a typo by me, where `i' was used
25 * instead of `oldest' when compiling regular expressions - benign
26 * results mostly, although occasionally it bit you...
28 *-------------------------------------------------------------------------
30 #include "postgres.h"
32 #include "catalog/pg_type.h"
33 #include "funcapi.h"
34 #include "regex/regex.h"
35 #include "utils/builtins.h"
36 #include "utils/guc.h"
/*
 * Yield text argument number _n when the call supplies more than _n
 * arguments; otherwise yield NULL.
 */
#define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
	((_n) < PG_NARGS() ? PG_GETARG_TEXT_PP(_n) : NULL)
42 /* GUC-settable flavor parameter */
43 int regex_flavor = REG_ADVANCED;
/*
 * Every option the regex functions care about, as filled in by
 * parse_re_flags().
 */
typedef struct pg_re_flags
{
	/* compile flags handed to Spencer's regex code */
	int			cflags;
	/* true to act on each occurrence rather than just the first */
	bool		glob;
} pg_re_flags;
53 /* cross-call state for regexp_matches(), also regexp_split() */
54 typedef struct regexp_matches_ctx
56 text *orig_str; /* data string in original TEXT form */
57 int nmatches; /* number of places where pattern matched */
58 int npatterns; /* number of capturing subpatterns */
59 /* We store start char index and end+1 char index for each match */
60 /* so the number of entries in match_locs is nmatches * npatterns * 2 */
61 int *match_locs; /* 0-based character indexes */
62 int next_match; /* 0-based index of next match to process */
63 /* workspace for build_regexp_matches_result() */
64 Datum *elems; /* has npatterns elements */
65 bool *nulls; /* has npatterns elements */
66 } regexp_matches_ctx;
69 * We cache precompiled regular expressions using a "self organizing list"
70 * structure, in which recently-used items tend to be near the front.
71 * Whenever we use an entry, it's moved up to the front of the list.
72 * Over time, an item's average position corresponds to its frequency of use.
74 * When we first create an entry, it's inserted at the front of
75 * the array, dropping the entry at the end of the array if necessary to
76 * make room. (This might seem to be weighting the new entry too heavily,
77 * but if we insert new entries further back, we'll be unable to adjust to
78 * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
79 * never-before-seen items used circularly. We ought to be able to handle
80 * that case, so we have to insert at the front.)
82 * Knuth mentions a variant strategy in which a used item is moved up just
83 * one place in the list. Although he says this uses fewer comparisons on
84 * average, it seems not to adapt very well to the situation where you have
85 * both some reusable patterns and a steady stream of non-reusable patterns.
86 * A reusable pattern that isn't used at least as often as non-reusable
87 * patterns are seen will "fail to keep up" and will drop off the end of the
88 * cache. With move-to-front, a reusable pattern is guaranteed to stay in
89 * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
/*
 * Maximum number of regular expressions kept in the cache.  A definition
 * supplied before this point takes precedence over the default of 32.
 */
#if !defined(MAX_CACHED_RES)
#define MAX_CACHED_RES 32
#endif
97 /* this structure describes one cached regular expression */
98 typedef struct cached_re_str
100 char *cre_pat; /* original RE (not null terminated!) */
101 int cre_pat_len; /* length of original RE, in bytes */
102 int cre_flags; /* compile flags: extended,icase etc */
103 regex_t cre_re; /* the compiled regular expression */
104 } cached_re_str;
106 static int num_res = 0; /* # of cached re's */
107 static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */
110 /* Local functions */
111 static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
112 text *flags,
113 bool force_glob,
114 bool use_subpatterns,
115 bool ignore_degenerate);
116 static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
117 static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
118 static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
122 * RE_compile_and_cache - compile a RE, caching if possible
124 * Returns regex_t *
126 * text_re --- the pattern, expressed as a TEXT object
127 * cflags --- compile options for the pattern
129 * Pattern is given in the database encoding. We internally convert to
130 * an array of pg_wchar, which is what Spencer's regex package wants.
132 static regex_t *
133 RE_compile_and_cache(text *text_re, int cflags)
135 int text_re_len = VARSIZE_ANY_EXHDR(text_re);
136 char *text_re_val = VARDATA_ANY(text_re);
137 pg_wchar *pattern;
138 int pattern_len;
139 int i;
140 int regcomp_result;
141 cached_re_str re_temp;
142 char errMsg[100];
145 * Look for a match among previously compiled REs. Since the data
146 * structure is self-organizing with most-used entries at the front, our
147 * search strategy can just be to scan from the front.
149 for (i = 0; i < num_res; i++)
151 if (re_array[i].cre_pat_len == text_re_len &&
152 re_array[i].cre_flags == cflags &&
153 memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)
156 * Found a match; move it to front if not there already.
158 if (i > 0)
160 re_temp = re_array[i];
161 memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str));
162 re_array[0] = re_temp;
165 return &re_array[0].cre_re;
170 * Couldn't find it, so try to compile the new RE. To avoid leaking
171 * resources on failure, we build into the re_temp local.
174 /* Convert pattern string to wide characters */
175 pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
176 pattern_len = pg_mb2wchar_with_len(text_re_val,
177 pattern,
178 text_re_len);
180 regcomp_result = pg_regcomp(&re_temp.cre_re,
181 pattern,
182 pattern_len,
183 cflags);
185 pfree(pattern);
187 if (regcomp_result != REG_OKAY)
189 /* re didn't compile */
190 pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
191 /* XXX should we pg_regfree here? */
192 ereport(ERROR,
193 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
194 errmsg("invalid regular expression: %s", errMsg)));
198 * We use malloc/free for the cre_pat field because the storage has to
199 * persist across transactions, and because we want to get control back on
200 * out-of-memory. The Max() is because some malloc implementations return
201 * NULL for malloc(0).
203 re_temp.cre_pat = malloc(Max(text_re_len, 1));
204 if (re_temp.cre_pat == NULL)
206 pg_regfree(&re_temp.cre_re);
207 ereport(ERROR,
208 (errcode(ERRCODE_OUT_OF_MEMORY),
209 errmsg("out of memory")));
211 memcpy(re_temp.cre_pat, text_re_val, text_re_len);
212 re_temp.cre_pat_len = text_re_len;
213 re_temp.cre_flags = cflags;
216 * Okay, we have a valid new item in re_temp; insert it into the storage
217 * array. Discard last entry if needed.
219 if (num_res >= MAX_CACHED_RES)
221 --num_res;
222 Assert(num_res < MAX_CACHED_RES);
223 pg_regfree(&re_array[num_res].cre_re);
224 free(re_array[num_res].cre_pat);
227 if (num_res > 0)
228 memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));
230 re_array[0] = re_temp;
231 num_res++;
233 return &re_array[0].cre_re;
237 * RE_wchar_execute - execute a RE on pg_wchar data
239 * Returns TRUE on match, FALSE on no match
241 * re --- the compiled pattern as returned by RE_compile_and_cache
242 * data --- the data to match against (need not be null-terminated)
243 * data_len --- the length of the data string
244 * start_search -- the offset in the data to start searching
245 * nmatch, pmatch --- optional return area for match details
247 * Data is given as array of pg_wchar which is what Spencer's regex package
248 * wants.
250 static bool
251 RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
252 int start_search, int nmatch, regmatch_t *pmatch)
254 int regexec_result;
255 char errMsg[100];
257 /* Perform RE match and return result */
258 regexec_result = pg_regexec(re,
259 data,
260 data_len,
261 start_search,
262 NULL, /* no details */
263 nmatch,
264 pmatch,
267 if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
269 /* re failed??? */
270 pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
271 ereport(ERROR,
272 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
273 errmsg("regular expression failed: %s", errMsg)));
276 return (regexec_result == REG_OKAY);
280 * RE_execute - execute a RE
282 * Returns TRUE on match, FALSE on no match
284 * re --- the compiled pattern as returned by RE_compile_and_cache
285 * dat --- the data to match against (need not be null-terminated)
286 * dat_len --- the length of the data string
287 * nmatch, pmatch --- optional return area for match details
289 * Data is given in the database encoding. We internally
290 * convert to array of pg_wchar which is what Spencer's regex package wants.
292 static bool
293 RE_execute(regex_t *re, char *dat, int dat_len,
294 int nmatch, regmatch_t *pmatch)
296 pg_wchar *data;
297 int data_len;
298 bool match;
300 /* Convert data string to wide characters */
301 data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
302 data_len = pg_mb2wchar_with_len(dat, data, dat_len);
304 /* Perform RE match and return result */
305 match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
307 pfree(data);
308 return match;
312 * RE_compile_and_execute - compile and execute a RE
314 * Returns TRUE on match, FALSE on no match
316 * text_re --- the pattern, expressed as a TEXT object
317 * dat --- the data to match against (need not be null-terminated)
318 * dat_len --- the length of the data string
319 * cflags --- compile options for the pattern
320 * nmatch, pmatch --- optional return area for match details
322 * Both pattern and data are given in the database encoding. We internally
323 * convert to array of pg_wchar which is what Spencer's regex package wants.
325 static bool
326 RE_compile_and_execute(text *text_re, char *dat, int dat_len,
327 int cflags, int nmatch, regmatch_t *pmatch)
329 regex_t *re;
331 /* Compile RE */
332 re = RE_compile_and_cache(text_re, cflags);
334 return RE_execute(re, dat, dat_len, nmatch, pmatch);
339 * parse_re_flags - parse the options argument of regexp_matches and friends
341 * flags --- output argument, filled with desired options
342 * opts --- TEXT object, or NULL for defaults
344 * This accepts all the options allowed by any of the callers; callers that
345 * don't want some have to reject them after the fact.
347 static void
348 parse_re_flags(pg_re_flags *flags, text *opts)
350 /* regex_flavor is always folded into the compile flags */
351 flags->cflags = regex_flavor;
352 flags->glob = false;
354 if (opts)
356 char *opt_p = VARDATA_ANY(opts);
357 int opt_len = VARSIZE_ANY_EXHDR(opts);
358 int i;
360 for (i = 0; i < opt_len; i++)
362 switch (opt_p[i])
364 case 'g':
365 flags->glob = true;
366 break;
367 case 'b': /* BREs (but why???) */
368 flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
369 break;
370 case 'c': /* case sensitive */
371 flags->cflags &= ~REG_ICASE;
372 break;
373 case 'e': /* plain EREs */
374 flags->cflags |= REG_EXTENDED;
375 flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
376 break;
377 case 'i': /* case insensitive */
378 flags->cflags |= REG_ICASE;
379 break;
380 case 'm': /* Perloid synonym for n */
381 case 'n': /* \n affects ^ $ . [^ */
382 flags->cflags |= REG_NEWLINE;
383 break;
384 case 'p': /* ~Perl, \n affects . [^ */
385 flags->cflags |= REG_NLSTOP;
386 flags->cflags &= ~REG_NLANCH;
387 break;
388 case 'q': /* literal string */
389 flags->cflags |= REG_QUOTE;
390 flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
391 break;
392 case 's': /* single line, \n ordinary */
393 flags->cflags &= ~REG_NEWLINE;
394 break;
395 case 't': /* tight syntax */
396 flags->cflags &= ~REG_EXPANDED;
397 break;
398 case 'w': /* weird, \n affects ^ $ only */
399 flags->cflags &= ~REG_NLSTOP;
400 flags->cflags |= REG_NLANCH;
401 break;
402 case 'x': /* expanded syntax */
403 flags->cflags |= REG_EXPANDED;
404 break;
405 default:
406 ereport(ERROR,
407 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
408 errmsg("invalid regexp option: \"%c\"",
409 opt_p[i])));
410 break;
418 * report whether regex_flavor is currently BASIC
420 bool
421 regex_flavor_is_basic(void)
423 return (regex_flavor == REG_BASIC);
428 * interface routines called by the function manager
431 Datum
432 nameregexeq(PG_FUNCTION_ARGS)
434 Name n = PG_GETARG_NAME(0);
435 text *p = PG_GETARG_TEXT_PP(1);
437 PG_RETURN_BOOL(RE_compile_and_execute(p,
438 NameStr(*n),
439 strlen(NameStr(*n)),
440 regex_flavor,
441 0, NULL));
444 Datum
445 nameregexne(PG_FUNCTION_ARGS)
447 Name n = PG_GETARG_NAME(0);
448 text *p = PG_GETARG_TEXT_PP(1);
450 PG_RETURN_BOOL(!RE_compile_and_execute(p,
451 NameStr(*n),
452 strlen(NameStr(*n)),
453 regex_flavor,
454 0, NULL));
457 Datum
458 textregexeq(PG_FUNCTION_ARGS)
460 text *s = PG_GETARG_TEXT_PP(0);
461 text *p = PG_GETARG_TEXT_PP(1);
463 PG_RETURN_BOOL(RE_compile_and_execute(p,
464 VARDATA_ANY(s),
465 VARSIZE_ANY_EXHDR(s),
466 regex_flavor,
467 0, NULL));
470 Datum
471 textregexne(PG_FUNCTION_ARGS)
473 text *s = PG_GETARG_TEXT_PP(0);
474 text *p = PG_GETARG_TEXT_PP(1);
476 PG_RETURN_BOOL(!RE_compile_and_execute(p,
477 VARDATA_ANY(s),
478 VARSIZE_ANY_EXHDR(s),
479 regex_flavor,
480 0, NULL));
485 * routines that use the regexp stuff, but ignore the case.
486 * for this, we use the REG_ICASE flag to pg_regcomp
490 Datum
491 nameicregexeq(PG_FUNCTION_ARGS)
493 Name n = PG_GETARG_NAME(0);
494 text *p = PG_GETARG_TEXT_PP(1);
496 PG_RETURN_BOOL(RE_compile_and_execute(p,
497 NameStr(*n),
498 strlen(NameStr(*n)),
499 regex_flavor | REG_ICASE,
500 0, NULL));
503 Datum
504 nameicregexne(PG_FUNCTION_ARGS)
506 Name n = PG_GETARG_NAME(0);
507 text *p = PG_GETARG_TEXT_PP(1);
509 PG_RETURN_BOOL(!RE_compile_and_execute(p,
510 NameStr(*n),
511 strlen(NameStr(*n)),
512 regex_flavor | REG_ICASE,
513 0, NULL));
516 Datum
517 texticregexeq(PG_FUNCTION_ARGS)
519 text *s = PG_GETARG_TEXT_PP(0);
520 text *p = PG_GETARG_TEXT_PP(1);
522 PG_RETURN_BOOL(RE_compile_and_execute(p,
523 VARDATA_ANY(s),
524 VARSIZE_ANY_EXHDR(s),
525 regex_flavor | REG_ICASE,
526 0, NULL));
529 Datum
530 texticregexne(PG_FUNCTION_ARGS)
532 text *s = PG_GETARG_TEXT_PP(0);
533 text *p = PG_GETARG_TEXT_PP(1);
535 PG_RETURN_BOOL(!RE_compile_and_execute(p,
536 VARDATA_ANY(s),
537 VARSIZE_ANY_EXHDR(s),
538 regex_flavor | REG_ICASE,
539 0, NULL));
544 * textregexsubstr()
545 * Return a substring matched by a regular expression.
547 Datum
548 textregexsubstr(PG_FUNCTION_ARGS)
550 text *s = PG_GETARG_TEXT_PP(0);
551 text *p = PG_GETARG_TEXT_PP(1);
552 regex_t *re;
553 regmatch_t pmatch[2];
554 int so,
557 /* Compile RE */
558 re = RE_compile_and_cache(p, regex_flavor);
561 * We pass two regmatch_t structs to get info about the overall match and
562 * the match for the first parenthesized subexpression (if any). If there
563 * is a parenthesized subexpression, we return what it matched; else
564 * return what the whole regexp matched.
566 if (!RE_execute(re,
567 VARDATA_ANY(s), VARSIZE_ANY_EXHDR(s),
568 2, pmatch))
569 PG_RETURN_NULL(); /* definitely no match */
571 if (re->re_nsub > 0)
573 /* has parenthesized subexpressions, use the first one */
574 so = pmatch[1].rm_so;
575 eo = pmatch[1].rm_eo;
577 else
579 /* no parenthesized subexpression, use whole match */
580 so = pmatch[0].rm_so;
581 eo = pmatch[0].rm_eo;
585 * It is possible to have a match to the whole pattern but no match for a
586 * subexpression; for example 'foo(bar)?' is considered to match 'foo' but
587 * there is no subexpression match. So this extra test for match failure
588 * is not redundant.
590 if (so < 0 || eo < 0)
591 PG_RETURN_NULL();
593 return DirectFunctionCall3(text_substr,
594 PointerGetDatum(s),
595 Int32GetDatum(so + 1),
596 Int32GetDatum(eo - so));
600 * textregexreplace_noopt()
601 * Return a string matched by a regular expression, with replacement.
603 * This version doesn't have an option argument: we default to case
604 * sensitive match, replace the first instance only.
606 Datum
607 textregexreplace_noopt(PG_FUNCTION_ARGS)
609 text *s = PG_GETARG_TEXT_PP(0);
610 text *p = PG_GETARG_TEXT_PP(1);
611 text *r = PG_GETARG_TEXT_PP(2);
612 regex_t *re;
614 re = RE_compile_and_cache(p, regex_flavor);
616 PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false));
620 * textregexreplace()
621 * Return a string matched by a regular expression, with replacement.
623 Datum
624 textregexreplace(PG_FUNCTION_ARGS)
626 text *s = PG_GETARG_TEXT_PP(0);
627 text *p = PG_GETARG_TEXT_PP(1);
628 text *r = PG_GETARG_TEXT_PP(2);
629 text *opt = PG_GETARG_TEXT_PP(3);
630 regex_t *re;
631 pg_re_flags flags;
633 parse_re_flags(&flags, opt);
635 re = RE_compile_and_cache(p, flags.cflags);
637 PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
641 * similar_escape()
642 * Convert a SQL99 regexp pattern to POSIX style, so it can be used by
643 * our regexp engine.
645 Datum
646 similar_escape(PG_FUNCTION_ARGS)
648 text *pat_text;
649 text *esc_text;
650 text *result;
651 char *p,
654 int plen,
655 elen;
656 bool afterescape = false;
657 int nquotes = 0;
659 /* This function is not strict, so must test explicitly */
660 if (PG_ARGISNULL(0))
661 PG_RETURN_NULL();
662 pat_text = PG_GETARG_TEXT_PP(0);
663 p = VARDATA_ANY(pat_text);
664 plen = VARSIZE_ANY_EXHDR(pat_text);
665 if (PG_ARGISNULL(1))
667 /* No ESCAPE clause provided; default to backslash as escape */
668 e = "\\";
669 elen = 1;
671 else
673 esc_text = PG_GETARG_TEXT_PP(1);
674 e = VARDATA_ANY(esc_text);
675 elen = VARSIZE_ANY_EXHDR(esc_text);
676 if (elen == 0)
677 e = NULL; /* no escape character */
678 else if (elen != 1)
679 ereport(ERROR,
680 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
681 errmsg("invalid escape string"),
682 errhint("Escape string must be empty or one character.")));
685 /*----------
686 * We surround the transformed input string with
687 * ***:^(?: ... )$
688 * which is bizarre enough to require some explanation. "***:" is a
689 * director prefix to force the regex to be treated as an ARE regardless
690 * of the current regex_flavor setting. We need "^" and "$" to force
691 * the pattern to match the entire input string as per SQL99 spec. The
692 * "(?:" and ")" are a non-capturing set of parens; we have to have
693 * parens in case the string contains "|", else the "^" and "$" will
694 * be bound into the first and last alternatives which is not what we
695 * want, and the parens must be non capturing because we don't want them
696 * to count when selecting output for SUBSTRING.
697 *----------
701 * We need room for the prefix/postfix plus as many as 2 output bytes per
702 * input byte
704 result = (text *) palloc(VARHDRSZ + 10 + 2 * plen);
705 r = VARDATA(result);
707 *r++ = '*';
708 *r++ = '*';
709 *r++ = '*';
710 *r++ = ':';
711 *r++ = '^';
712 *r++ = '(';
713 *r++ = '?';
714 *r++ = ':';
716 while (plen > 0)
718 char pchar = *p;
720 if (afterescape)
722 if (pchar == '"') /* for SUBSTRING patterns */
723 *r++ = ((nquotes++ % 2) == 0) ? '(' : ')';
724 else
726 *r++ = '\\';
727 *r++ = pchar;
729 afterescape = false;
731 else if (e && pchar == *e)
733 /* SQL99 escape character; do not send to output */
734 afterescape = true;
736 else if (pchar == '%')
738 *r++ = '.';
739 *r++ = '*';
741 else if (pchar == '_')
742 *r++ = '.';
743 else if (pchar == '\\' || pchar == '.' || pchar == '?' ||
744 pchar == '{')
746 *r++ = '\\';
747 *r++ = pchar;
749 else
750 *r++ = pchar;
751 p++, plen--;
754 *r++ = ')';
755 *r++ = '$';
757 SET_VARSIZE(result, r - ((char *) result));
759 PG_RETURN_TEXT_P(result);
763 * regexp_matches()
764 * Return a table of matches of a pattern within a string.
766 Datum
767 regexp_matches(PG_FUNCTION_ARGS)
769 FuncCallContext *funcctx;
770 regexp_matches_ctx *matchctx;
772 if (SRF_IS_FIRSTCALL())
774 text *pattern = PG_GETARG_TEXT_PP(1);
775 text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
776 MemoryContext oldcontext;
778 funcctx = SRF_FIRSTCALL_INIT();
779 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
781 /* be sure to copy the input string into the multi-call ctx */
782 matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
783 flags, false, true, false);
785 /* Pre-create workspace that build_regexp_matches_result needs */
786 matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
787 matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
789 MemoryContextSwitchTo(oldcontext);
790 funcctx->user_fctx = (void *) matchctx;
793 funcctx = SRF_PERCALL_SETUP();
794 matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
796 if (matchctx->next_match < matchctx->nmatches)
798 ArrayType *result_ary;
800 result_ary = build_regexp_matches_result(matchctx);
801 matchctx->next_match++;
802 SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
805 /* release space in multi-call ctx to avoid intraquery memory leak */
806 cleanup_regexp_matches(matchctx);
808 SRF_RETURN_DONE(funcctx);
811 /* This is separate to keep the opr_sanity regression test from complaining */
812 Datum
813 regexp_matches_no_flags(PG_FUNCTION_ARGS)
815 return regexp_matches(fcinfo);
819 * setup_regexp_matches --- do the initial matching for regexp_matches()
820 * or regexp_split()
822 * To avoid having to re-find the compiled pattern on each call, we do
823 * all the matching in one swoop. The returned regexp_matches_ctx contains
824 * the locations of all the substrings matching the pattern.
826 * The three bool parameters have only two patterns (one for each caller)
827 * but it seems clearer to distinguish the functionality this way than to
828 * key it all off one "is_split" flag.
830 static regexp_matches_ctx *
831 setup_regexp_matches(text *orig_str, text *pattern, text *flags,
832 bool force_glob, bool use_subpatterns,
833 bool ignore_degenerate)
835 regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
836 int orig_len;
837 pg_wchar *wide_str;
838 int wide_len;
839 pg_re_flags re_flags;
840 regex_t *cpattern;
841 regmatch_t *pmatch;
842 int pmatch_len;
843 int array_len;
844 int array_idx;
845 int prev_match_end;
846 int start_search;
848 /* save original string --- we'll extract result substrings from it */
849 matchctx->orig_str = orig_str;
851 /* convert string to pg_wchar form for matching */
852 orig_len = VARSIZE_ANY_EXHDR(orig_str);
853 wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
854 wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
856 /* determine options */
857 parse_re_flags(&re_flags, flags);
858 if (force_glob)
860 /* user mustn't specify 'g' for regexp_split */
861 if (re_flags.glob)
862 ereport(ERROR,
863 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
864 errmsg("regexp_split does not support the global option")));
865 /* but we find all the matches anyway */
866 re_flags.glob = true;
869 /* set up the compiled pattern */
870 cpattern = RE_compile_and_cache(pattern, re_flags.cflags);
872 /* do we want to remember subpatterns? */
873 if (use_subpatterns && cpattern->re_nsub > 0)
875 matchctx->npatterns = cpattern->re_nsub;
876 pmatch_len = cpattern->re_nsub + 1;
878 else
880 use_subpatterns = false;
881 matchctx->npatterns = 1;
882 pmatch_len = 1;
885 /* temporary output space for RE package */
886 pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
888 /* the real output space (grown dynamically if needed) */
889 array_len = re_flags.glob ? 256 : 32;
890 matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
891 array_idx = 0;
893 /* search for the pattern, perhaps repeatedly */
894 prev_match_end = 0;
895 start_search = 0;
896 while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
897 pmatch_len, pmatch))
900 * If requested, ignore degenerate matches, which are zero-length
901 * matches occurring at the start or end of a string or just after a
902 * previous match.
904 if (!ignore_degenerate ||
905 (pmatch[0].rm_so < wide_len &&
906 pmatch[0].rm_eo > prev_match_end))
908 /* enlarge output space if needed */
909 while (array_idx + matchctx->npatterns * 2 > array_len)
911 array_len *= 2;
912 matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
913 sizeof(int) * array_len);
916 /* save this match's locations */
917 if (use_subpatterns)
919 int i;
921 for (i = 1; i <= matchctx->npatterns; i++)
923 matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
924 matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
927 else
929 matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
930 matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
932 matchctx->nmatches++;
934 prev_match_end = pmatch[0].rm_eo;
936 /* if not glob, stop after one match */
937 if (!re_flags.glob)
938 break;
941 * Advance search position. Normally we start just after the end of
942 * the previous match, but always advance at least one character (the
943 * special case can occur if the pattern matches zero characters just
944 * after the prior match or at the end of the string).
946 if (start_search < pmatch[0].rm_eo)
947 start_search = pmatch[0].rm_eo;
948 else
949 start_search++;
950 if (start_search > wide_len)
951 break;
954 /* Clean up temp storage */
955 pfree(wide_str);
956 pfree(pmatch);
958 return matchctx;
962 * cleanup_regexp_matches - release memory of a regexp_matches_ctx
964 static void
965 cleanup_regexp_matches(regexp_matches_ctx *matchctx)
967 pfree(matchctx->orig_str);
968 pfree(matchctx->match_locs);
969 if (matchctx->elems)
970 pfree(matchctx->elems);
971 if (matchctx->nulls)
972 pfree(matchctx->nulls);
973 pfree(matchctx);
977 * build_regexp_matches_result - build output array for current match
979 static ArrayType *
980 build_regexp_matches_result(regexp_matches_ctx *matchctx)
982 Datum *elems = matchctx->elems;
983 bool *nulls = matchctx->nulls;
984 int dims[1];
985 int lbs[1];
986 int loc;
987 int i;
989 /* Extract matching substrings from the original string */
990 loc = matchctx->next_match * matchctx->npatterns * 2;
991 for (i = 0; i < matchctx->npatterns; i++)
993 int so = matchctx->match_locs[loc++];
994 int eo = matchctx->match_locs[loc++];
996 if (so < 0 || eo < 0)
998 elems[i] = (Datum) 0;
999 nulls[i] = true;
1001 else
1003 elems[i] = DirectFunctionCall3(text_substr,
1004 PointerGetDatum(matchctx->orig_str),
1005 Int32GetDatum(so + 1),
1006 Int32GetDatum(eo - so));
1007 nulls[i] = false;
1011 /* And form an array */
1012 dims[0] = matchctx->npatterns;
1013 lbs[0] = 1;
1014 /* XXX: this hardcodes assumptions about the text type */
1015 return construct_md_array(elems, nulls, 1, dims, lbs,
1016 TEXTOID, -1, false, 'i');
1020 * regexp_split_to_table()
1021 * Split the string at matches of the pattern, returning the
1022 * split-out substrings as a table.
1024 Datum
1025 regexp_split_to_table(PG_FUNCTION_ARGS)
1027 FuncCallContext *funcctx;
1028 regexp_matches_ctx *splitctx;
1030 if (SRF_IS_FIRSTCALL())
1032 text *pattern = PG_GETARG_TEXT_PP(1);
1033 text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
1034 MemoryContext oldcontext;
1036 funcctx = SRF_FIRSTCALL_INIT();
1037 oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
1039 /* be sure to copy the input string into the multi-call ctx */
1040 splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
1041 flags, true, false, true);
1043 MemoryContextSwitchTo(oldcontext);
1044 funcctx->user_fctx = (void *) splitctx;
1047 funcctx = SRF_PERCALL_SETUP();
1048 splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
1050 if (splitctx->next_match <= splitctx->nmatches)
1052 Datum result = build_regexp_split_result(splitctx);
1054 splitctx->next_match++;
1055 SRF_RETURN_NEXT(funcctx, result);
1058 /* release space in multi-call ctx to avoid intraquery memory leak */
1059 cleanup_regexp_matches(splitctx);
1061 SRF_RETURN_DONE(funcctx);
1064 /* This is separate to keep the opr_sanity regression test from complaining */
1065 Datum
1066 regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
1068 return regexp_split_to_table(fcinfo);
1072 * regexp_split_to_array()
1073 * Split the string at matches of the pattern, returning the
1074 * split-out substrings as an array.
1076 Datum
1077 regexp_split_to_array(PG_FUNCTION_ARGS)
1079 ArrayBuildState *astate = NULL;
1080 regexp_matches_ctx *splitctx;
1082 splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
1083 PG_GETARG_TEXT_PP(1),
1084 PG_GETARG_TEXT_PP_IF_EXISTS(2),
1085 true, false, true);
1087 while (splitctx->next_match <= splitctx->nmatches)
1089 astate = accumArrayResult(astate,
1090 build_regexp_split_result(splitctx),
1091 false,
1092 TEXTOID,
1093 CurrentMemoryContext);
1094 splitctx->next_match++;
1098 * We don't call cleanup_regexp_matches here; it would try to pfree the
1099 * input string, which we didn't copy. The space is not in a long-lived
1100 * memory context anyway.
1103 PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
1106 /* This is separate to keep the opr_sanity regression test from complaining */
1107 Datum
1108 regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
1110 return regexp_split_to_array(fcinfo);
1114 * build_regexp_split_result - build output string for current match
1116 * We return the string between the current match and the previous one,
1117 * or the string after the last match when next_match == nmatches.
1119 static Datum
1120 build_regexp_split_result(regexp_matches_ctx *splitctx)
1122 int startpos;
1123 int endpos;
1125 if (splitctx->next_match > 0)
1126 startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
1127 else
1128 startpos = 0;
1129 if (startpos < 0)
1130 elog(ERROR, "invalid match ending position");
1132 if (splitctx->next_match < splitctx->nmatches)
1134 endpos = splitctx->match_locs[splitctx->next_match * 2];
1135 if (endpos < startpos)
1136 elog(ERROR, "invalid match starting position");
1137 return DirectFunctionCall3(text_substr,
1138 PointerGetDatum(splitctx->orig_str),
1139 Int32GetDatum(startpos + 1),
1140 Int32GetDatum(endpos - startpos));
1142 else
1144 /* no more matches, return rest of string */
1145 return DirectFunctionCall2(text_substr_no_len,
1146 PointerGetDatum(splitctx->orig_str),
1147 Int32GetDatum(startpos + 1));