1 /*-------------------------------------------------------------------------
4 * Postgres' interface to the regular expression package.
6 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
13 * Alistair Crooks added the code for the regex caching
14 * agc - cached the regular expressions used - there's a good chance
15 * that we'll get a hit, so this saves a compile step for every
16 * attempted match. I haven't actually measured the speed improvement,
17 * but it `looks' a lot quicker visually when watching regression
20 * agc - incorporated Keith Bostic's Berkeley regex code into
21 * the tree for all ports. To distinguish this regex code from any that
22 * is existent on a platform, I've prepended the string "pg_" to
23 * the functions regcomp, regerror, regexec and regfree.
24 * Fixed a bug that was originally a typo by me, where `i' was used
25 * instead of `oldest' when compiling regular expressions - benign
26 * results mostly, although occasionally it bit you...
28 *-------------------------------------------------------------------------
32 #include "catalog/pg_type.h"
34 #include "regex/regex.h"
35 #include "utils/builtins.h"
36 #include "utils/guc.h"
38 #define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
39 (PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)
42 /* GUC-settable flavor parameter */
43 int regex_flavor
= REG_ADVANCED
;
46 /* all the options of interest for regex functions */
typedef struct pg_re_flags
{
	int			cflags;			/* compile flags for Spencer's regex code */
	bool		glob;			/* do it globally (for each occurrence) */
} pg_re_flags;
53 /* cross-call state for regexp_matches(), also regexp_split() */
54 typedef struct regexp_matches_ctx
56 text
*orig_str
; /* data string in original TEXT form */
57 int nmatches
; /* number of places where pattern matched */
58 int npatterns
; /* number of capturing subpatterns */
59 /* We store start char index and end+1 char index for each match */
60 /* so the number of entries in match_locs is nmatches * npatterns * 2 */
61 int *match_locs
; /* 0-based character indexes */
62 int next_match
; /* 0-based index of next match to process */
63 /* workspace for build_regexp_matches_result() */
64 Datum
*elems
; /* has npatterns elements */
65 bool *nulls
; /* has npatterns elements */
69 * We cache precompiled regular expressions using a "self organizing list"
70 * structure, in which recently-used items tend to be near the front.
71 * Whenever we use an entry, it's moved up to the front of the list.
72 * Over time, an item's average position corresponds to its frequency of use.
74 * When we first create an entry, it's inserted at the front of
75 * the array, dropping the entry at the end of the array if necessary to
76 * make room. (This might seem to be weighting the new entry too heavily,
77 * but if we insert new entries further back, we'll be unable to adjust to
78 * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
79 * never-before-seen items used circularly. We ought to be able to handle
80 * that case, so we have to insert at the front.)
82 * Knuth mentions a variant strategy in which a used item is moved up just
83 * one place in the list. Although he says this uses fewer comparisons on
84 * average, it seems not to adapt very well to the situation where you have
85 * both some reusable patterns and a steady stream of non-reusable patterns.
86 * A reusable pattern that isn't used at least as often as non-reusable
87 * patterns are seen will "fail to keep up" and will drop off the end of the
88 * cache. With move-to-front, a reusable pattern is guaranteed to stay in
89 * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
92 /* this is the maximum number of cached regular expressions */
#ifndef MAX_CACHED_RES
#define MAX_CACHED_RES	32
#endif
97 /* this structure describes one cached regular expression */
98 typedef struct cached_re_str
100 char *cre_pat
; /* original RE (not null terminated!) */
101 int cre_pat_len
; /* length of original RE, in bytes */
102 int cre_flags
; /* compile flags: extended,icase etc */
103 regex_t cre_re
; /* the compiled regular expression */
106 static int num_res
= 0; /* # of cached re's */
107 static cached_re_str re_array
[MAX_CACHED_RES
]; /* cached re's */
110 /* Local functions */
111 static regexp_matches_ctx
*setup_regexp_matches(text
*orig_str
, text
*pattern
,
114 bool use_subpatterns
,
115 bool ignore_degenerate
);
116 static void cleanup_regexp_matches(regexp_matches_ctx
*matchctx
);
117 static ArrayType
*build_regexp_matches_result(regexp_matches_ctx
*matchctx
);
118 static Datum
build_regexp_split_result(regexp_matches_ctx
*splitctx
);
122 * RE_compile_and_cache - compile a RE, caching if possible
126 * text_re --- the pattern, expressed as a TEXT object
127 * cflags --- compile options for the pattern
129 * Pattern is given in the database encoding. We internally convert to
130 * an array of pg_wchar, which is what Spencer's regex package wants.
133 RE_compile_and_cache(text
*text_re
, int cflags
)
135 int text_re_len
= VARSIZE_ANY_EXHDR(text_re
);
136 char *text_re_val
= VARDATA_ANY(text_re
);
141 cached_re_str re_temp
;
145 * Look for a match among previously compiled REs. Since the data
146 * structure is self-organizing with most-used entries at the front, our
147 * search strategy can just be to scan from the front.
149 for (i
= 0; i
< num_res
; i
++)
151 if (re_array
[i
].cre_pat_len
== text_re_len
&&
152 re_array
[i
].cre_flags
== cflags
&&
153 memcmp(re_array
[i
].cre_pat
, text_re_val
, text_re_len
) == 0)
156 * Found a match; move it to front if not there already.
160 re_temp
= re_array
[i
];
161 memmove(&re_array
[1], &re_array
[0], i
* sizeof(cached_re_str
));
162 re_array
[0] = re_temp
;
165 return &re_array
[0].cre_re
;
170 * Couldn't find it, so try to compile the new RE. To avoid leaking
171 * resources on failure, we build into the re_temp local.
174 /* Convert pattern string to wide characters */
175 pattern
= (pg_wchar
*) palloc((text_re_len
+ 1) * sizeof(pg_wchar
));
176 pattern_len
= pg_mb2wchar_with_len(text_re_val
,
180 regcomp_result
= pg_regcomp(&re_temp
.cre_re
,
187 if (regcomp_result
!= REG_OKAY
)
189 /* re didn't compile */
190 pg_regerror(regcomp_result
, &re_temp
.cre_re
, errMsg
, sizeof(errMsg
));
191 /* XXX should we pg_regfree here? */
193 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION
),
194 errmsg("invalid regular expression: %s", errMsg
)));
198 * We use malloc/free for the cre_pat field because the storage has to
199 * persist across transactions, and because we want to get control back on
200 * out-of-memory. The Max() is because some malloc implementations return
201 * NULL for malloc(0).
203 re_temp
.cre_pat
= malloc(Max(text_re_len
, 1));
204 if (re_temp
.cre_pat
== NULL
)
206 pg_regfree(&re_temp
.cre_re
);
208 (errcode(ERRCODE_OUT_OF_MEMORY
),
209 errmsg("out of memory")));
211 memcpy(re_temp
.cre_pat
, text_re_val
, text_re_len
);
212 re_temp
.cre_pat_len
= text_re_len
;
213 re_temp
.cre_flags
= cflags
;
216 * Okay, we have a valid new item in re_temp; insert it into the storage
217 * array. Discard last entry if needed.
219 if (num_res
>= MAX_CACHED_RES
)
222 Assert(num_res
< MAX_CACHED_RES
);
223 pg_regfree(&re_array
[num_res
].cre_re
);
224 free(re_array
[num_res
].cre_pat
);
228 memmove(&re_array
[1], &re_array
[0], num_res
* sizeof(cached_re_str
));
230 re_array
[0] = re_temp
;
233 return &re_array
[0].cre_re
;
237 * RE_wchar_execute - execute a RE on pg_wchar data
239 * Returns TRUE on match, FALSE on no match
241 * re --- the compiled pattern as returned by RE_compile_and_cache
242 * data --- the data to match against (need not be null-terminated)
243 * data_len --- the length of the data string
244 * start_search -- the offset in the data to start searching
245 * nmatch, pmatch --- optional return area for match details
247 * Data is given as array of pg_wchar which is what Spencer's regex package
251 RE_wchar_execute(regex_t
*re
, pg_wchar
*data
, int data_len
,
252 int start_search
, int nmatch
, regmatch_t
*pmatch
)
257 /* Perform RE match and return result */
258 regexec_result
= pg_regexec(re
,
262 NULL
, /* no details */
267 if (regexec_result
!= REG_OKAY
&& regexec_result
!= REG_NOMATCH
)
270 pg_regerror(regexec_result
, re
, errMsg
, sizeof(errMsg
));
272 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION
),
273 errmsg("regular expression failed: %s", errMsg
)));
276 return (regexec_result
== REG_OKAY
);
280 * RE_execute - execute a RE
282 * Returns TRUE on match, FALSE on no match
284 * re --- the compiled pattern as returned by RE_compile_and_cache
285 * dat --- the data to match against (need not be null-terminated)
286 * dat_len --- the length of the data string
287 * nmatch, pmatch --- optional return area for match details
289 * Data is given in the database encoding. We internally
290 * convert to array of pg_wchar which is what Spencer's regex package wants.
293 RE_execute(regex_t
*re
, char *dat
, int dat_len
,
294 int nmatch
, regmatch_t
*pmatch
)
300 /* Convert data string to wide characters */
301 data
= (pg_wchar
*) palloc((dat_len
+ 1) * sizeof(pg_wchar
));
302 data_len
= pg_mb2wchar_with_len(dat
, data
, dat_len
);
304 /* Perform RE match and return result */
305 match
= RE_wchar_execute(re
, data
, data_len
, 0, nmatch
, pmatch
);
312 * RE_compile_and_execute - compile and execute a RE
314 * Returns TRUE on match, FALSE on no match
316 * text_re --- the pattern, expressed as a TEXT object
317 * dat --- the data to match against (need not be null-terminated)
318 * dat_len --- the length of the data string
319 * cflags --- compile options for the pattern
320 * nmatch, pmatch --- optional return area for match details
322 * Both pattern and data are given in the database encoding. We internally
323 * convert to array of pg_wchar which is what Spencer's regex package wants.
326 RE_compile_and_execute(text
*text_re
, char *dat
, int dat_len
,
327 int cflags
, int nmatch
, regmatch_t
*pmatch
)
332 re
= RE_compile_and_cache(text_re
, cflags
);
334 return RE_execute(re
, dat
, dat_len
, nmatch
, pmatch
);
339 * parse_re_flags - parse the options argument of regexp_matches and friends
341 * flags --- output argument, filled with desired options
342 * opts --- TEXT object, or NULL for defaults
344 * This accepts all the options allowed by any of the callers; callers that
345 * don't want some have to reject them after the fact.
348 parse_re_flags(pg_re_flags
*flags
, text
*opts
)
350 /* regex_flavor is always folded into the compile flags */
351 flags
->cflags
= regex_flavor
;
356 char *opt_p
= VARDATA_ANY(opts
);
357 int opt_len
= VARSIZE_ANY_EXHDR(opts
);
360 for (i
= 0; i
< opt_len
; i
++)
367 case 'b': /* BREs (but why???) */
368 flags
->cflags
&= ~(REG_ADVANCED
| REG_EXTENDED
| REG_QUOTE
);
370 case 'c': /* case sensitive */
371 flags
->cflags
&= ~REG_ICASE
;
373 case 'e': /* plain EREs */
374 flags
->cflags
|= REG_EXTENDED
;
375 flags
->cflags
&= ~(REG_ADVANCED
| REG_QUOTE
);
377 case 'i': /* case insensitive */
378 flags
->cflags
|= REG_ICASE
;
380 case 'm': /* Perloid synonym for n */
381 case 'n': /* \n affects ^ $ . [^ */
382 flags
->cflags
|= REG_NEWLINE
;
384 case 'p': /* ~Perl, \n affects . [^ */
385 flags
->cflags
|= REG_NLSTOP
;
386 flags
->cflags
&= ~REG_NLANCH
;
388 case 'q': /* literal string */
389 flags
->cflags
|= REG_QUOTE
;
390 flags
->cflags
&= ~(REG_ADVANCED
| REG_EXTENDED
);
392 case 's': /* single line, \n ordinary */
393 flags
->cflags
&= ~REG_NEWLINE
;
395 case 't': /* tight syntax */
396 flags
->cflags
&= ~REG_EXPANDED
;
398 case 'w': /* weird, \n affects ^ $ only */
399 flags
->cflags
&= ~REG_NLSTOP
;
400 flags
->cflags
|= REG_NLANCH
;
402 case 'x': /* expanded syntax */
403 flags
->cflags
|= REG_EXPANDED
;
407 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
408 errmsg("invalid regexp option: \"%c\"",
418 * report whether regex_flavor is currently BASIC
421 regex_flavor_is_basic(void)
423 return (regex_flavor
== REG_BASIC
);
428 * interface routines called by the function manager
432 nameregexeq(PG_FUNCTION_ARGS
)
434 Name n
= PG_GETARG_NAME(0);
435 text
*p
= PG_GETARG_TEXT_PP(1);
437 PG_RETURN_BOOL(RE_compile_and_execute(p
,
445 nameregexne(PG_FUNCTION_ARGS
)
447 Name n
= PG_GETARG_NAME(0);
448 text
*p
= PG_GETARG_TEXT_PP(1);
450 PG_RETURN_BOOL(!RE_compile_and_execute(p
,
458 textregexeq(PG_FUNCTION_ARGS
)
460 text
*s
= PG_GETARG_TEXT_PP(0);
461 text
*p
= PG_GETARG_TEXT_PP(1);
463 PG_RETURN_BOOL(RE_compile_and_execute(p
,
465 VARSIZE_ANY_EXHDR(s
),
471 textregexne(PG_FUNCTION_ARGS
)
473 text
*s
= PG_GETARG_TEXT_PP(0);
474 text
*p
= PG_GETARG_TEXT_PP(1);
476 PG_RETURN_BOOL(!RE_compile_and_execute(p
,
478 VARSIZE_ANY_EXHDR(s
),
485 * routines that use the regexp stuff, but ignore the case.
486 * for this, we use the REG_ICASE flag to pg_regcomp
491 nameicregexeq(PG_FUNCTION_ARGS
)
493 Name n
= PG_GETARG_NAME(0);
494 text
*p
= PG_GETARG_TEXT_PP(1);
496 PG_RETURN_BOOL(RE_compile_and_execute(p
,
499 regex_flavor
| REG_ICASE
,
504 nameicregexne(PG_FUNCTION_ARGS
)
506 Name n
= PG_GETARG_NAME(0);
507 text
*p
= PG_GETARG_TEXT_PP(1);
509 PG_RETURN_BOOL(!RE_compile_and_execute(p
,
512 regex_flavor
| REG_ICASE
,
517 texticregexeq(PG_FUNCTION_ARGS
)
519 text
*s
= PG_GETARG_TEXT_PP(0);
520 text
*p
= PG_GETARG_TEXT_PP(1);
522 PG_RETURN_BOOL(RE_compile_and_execute(p
,
524 VARSIZE_ANY_EXHDR(s
),
525 regex_flavor
| REG_ICASE
,
530 texticregexne(PG_FUNCTION_ARGS
)
532 text
*s
= PG_GETARG_TEXT_PP(0);
533 text
*p
= PG_GETARG_TEXT_PP(1);
535 PG_RETURN_BOOL(!RE_compile_and_execute(p
,
537 VARSIZE_ANY_EXHDR(s
),
538 regex_flavor
| REG_ICASE
,
545 * Return a substring matched by a regular expression.
548 textregexsubstr(PG_FUNCTION_ARGS
)
550 text
*s
= PG_GETARG_TEXT_PP(0);
551 text
*p
= PG_GETARG_TEXT_PP(1);
553 regmatch_t pmatch
[2];
558 re
= RE_compile_and_cache(p
, regex_flavor
);
561 * We pass two regmatch_t structs to get info about the overall match and
562 * the match for the first parenthesized subexpression (if any). If there
563 * is a parenthesized subexpression, we return what it matched; else
564 * return what the whole regexp matched.
567 VARDATA_ANY(s
), VARSIZE_ANY_EXHDR(s
),
569 PG_RETURN_NULL(); /* definitely no match */
573 /* has parenthesized subexpressions, use the first one */
574 so
= pmatch
[1].rm_so
;
575 eo
= pmatch
[1].rm_eo
;
579 /* no parenthesized subexpression, use whole match */
580 so
= pmatch
[0].rm_so
;
581 eo
= pmatch
[0].rm_eo
;
585 * It is possible to have a match to the whole pattern but no match for a
586 * subexpression; for example 'foo(bar)?' is considered to match 'foo' but
587 * there is no subexpression match. So this extra test for match failure
590 if (so
< 0 || eo
< 0)
593 return DirectFunctionCall3(text_substr
,
595 Int32GetDatum(so
+ 1),
596 Int32GetDatum(eo
- so
));
600 * textregexreplace_noopt()
601 * Return a string matched by a regular expression, with replacement.
603 * This version doesn't have an option argument: we default to case
604 * sensitive match, replace the first instance only.
607 textregexreplace_noopt(PG_FUNCTION_ARGS
)
609 text
*s
= PG_GETARG_TEXT_PP(0);
610 text
*p
= PG_GETARG_TEXT_PP(1);
611 text
*r
= PG_GETARG_TEXT_PP(2);
614 re
= RE_compile_and_cache(p
, regex_flavor
);
616 PG_RETURN_TEXT_P(replace_text_regexp(s
, (void *) re
, r
, false));
621 * Return a string matched by a regular expression, with replacement.
624 textregexreplace(PG_FUNCTION_ARGS
)
626 text
*s
= PG_GETARG_TEXT_PP(0);
627 text
*p
= PG_GETARG_TEXT_PP(1);
628 text
*r
= PG_GETARG_TEXT_PP(2);
629 text
*opt
= PG_GETARG_TEXT_PP(3);
633 parse_re_flags(&flags
, opt
);
635 re
= RE_compile_and_cache(p
, flags
.cflags
);
637 PG_RETURN_TEXT_P(replace_text_regexp(s
, (void *) re
, r
, flags
.glob
));
642 * Convert a SQL99 regexp pattern to POSIX style, so it can be used by
646 similar_escape(PG_FUNCTION_ARGS
)
656 bool afterescape
= false;
659 /* This function is not strict, so must test explicitly */
662 pat_text
= PG_GETARG_TEXT_PP(0);
663 p
= VARDATA_ANY(pat_text
);
664 plen
= VARSIZE_ANY_EXHDR(pat_text
);
667 /* No ESCAPE clause provided; default to backslash as escape */
673 esc_text
= PG_GETARG_TEXT_PP(1);
674 e
= VARDATA_ANY(esc_text
);
675 elen
= VARSIZE_ANY_EXHDR(esc_text
);
677 e
= NULL
; /* no escape character */
680 (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE
),
681 errmsg("invalid escape string"),
682 errhint("Escape string must be empty or one character.")));
686 * We surround the transformed input string with
688 * which is bizarre enough to require some explanation. "***:" is a
689 * director prefix to force the regex to be treated as an ARE regardless
690 * of the current regex_flavor setting. We need "^" and "$" to force
691 * the pattern to match the entire input string as per SQL99 spec. The
692 * "(?:" and ")" are a non-capturing set of parens; we have to have
693 * parens in case the string contains "|", else the "^" and "$" will
694 * be bound into the first and last alternatives which is not what we
695 * want, and the parens must be non capturing because we don't want them
696 * to count when selecting output for SUBSTRING.
701 * We need room for the prefix/postfix plus as many as 2 output bytes per
704 result
= (text
*) palloc(VARHDRSZ
+ 10 + 2 * plen
);
722 if (pchar
== '"') /* for SUBSTRING patterns */
723 *r
++ = ((nquotes
++ % 2) == 0) ? '(' : ')';
731 else if (e
&& pchar
== *e
)
733 /* SQL99 escape character; do not send to output */
736 else if (pchar
== '%')
741 else if (pchar
== '_')
743 else if (pchar
== '\\' || pchar
== '.' || pchar
== '?' ||
757 SET_VARSIZE(result
, r
- ((char *) result
));
759 PG_RETURN_TEXT_P(result
);
764 * Return a table of matches of a pattern within a string.
767 regexp_matches(PG_FUNCTION_ARGS
)
769 FuncCallContext
*funcctx
;
770 regexp_matches_ctx
*matchctx
;
772 if (SRF_IS_FIRSTCALL())
774 text
*pattern
= PG_GETARG_TEXT_PP(1);
775 text
*flags
= PG_GETARG_TEXT_PP_IF_EXISTS(2);
776 MemoryContext oldcontext
;
778 funcctx
= SRF_FIRSTCALL_INIT();
779 oldcontext
= MemoryContextSwitchTo(funcctx
->multi_call_memory_ctx
);
781 /* be sure to copy the input string into the multi-call ctx */
782 matchctx
= setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern
,
783 flags
, false, true, false);
785 /* Pre-create workspace that build_regexp_matches_result needs */
786 matchctx
->elems
= (Datum
*) palloc(sizeof(Datum
) * matchctx
->npatterns
);
787 matchctx
->nulls
= (bool *) palloc(sizeof(bool) * matchctx
->npatterns
);
789 MemoryContextSwitchTo(oldcontext
);
790 funcctx
->user_fctx
= (void *) matchctx
;
793 funcctx
= SRF_PERCALL_SETUP();
794 matchctx
= (regexp_matches_ctx
*) funcctx
->user_fctx
;
796 if (matchctx
->next_match
< matchctx
->nmatches
)
798 ArrayType
*result_ary
;
800 result_ary
= build_regexp_matches_result(matchctx
);
801 matchctx
->next_match
++;
802 SRF_RETURN_NEXT(funcctx
, PointerGetDatum(result_ary
));
805 /* release space in multi-call ctx to avoid intraquery memory leak */
806 cleanup_regexp_matches(matchctx
);
808 SRF_RETURN_DONE(funcctx
);
811 /* This is separate to keep the opr_sanity regression test from complaining */
813 regexp_matches_no_flags(PG_FUNCTION_ARGS
)
815 return regexp_matches(fcinfo
);
819 * setup_regexp_matches --- do the initial matching for regexp_matches()
822 * To avoid having to re-find the compiled pattern on each call, we do
823 * all the matching in one swoop. The returned regexp_matches_ctx contains
824 * the locations of all the substrings matching the pattern.
826 * The three bool parameters have only two patterns (one for each caller)
827 * but it seems clearer to distinguish the functionality this way than to
828 * key it all off one "is_split" flag.
830 static regexp_matches_ctx
*
831 setup_regexp_matches(text
*orig_str
, text
*pattern
, text
*flags
,
832 bool force_glob
, bool use_subpatterns
,
833 bool ignore_degenerate
)
835 regexp_matches_ctx
*matchctx
= palloc0(sizeof(regexp_matches_ctx
));
839 pg_re_flags re_flags
;
848 /* save original string --- we'll extract result substrings from it */
849 matchctx
->orig_str
= orig_str
;
851 /* convert string to pg_wchar form for matching */
852 orig_len
= VARSIZE_ANY_EXHDR(orig_str
);
853 wide_str
= (pg_wchar
*) palloc(sizeof(pg_wchar
) * (orig_len
+ 1));
854 wide_len
= pg_mb2wchar_with_len(VARDATA_ANY(orig_str
), wide_str
, orig_len
);
856 /* determine options */
857 parse_re_flags(&re_flags
, flags
);
860 /* user mustn't specify 'g' for regexp_split */
863 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
864 errmsg("regexp_split does not support the global option")));
865 /* but we find all the matches anyway */
866 re_flags
.glob
= true;
869 /* set up the compiled pattern */
870 cpattern
= RE_compile_and_cache(pattern
, re_flags
.cflags
);
872 /* do we want to remember subpatterns? */
873 if (use_subpatterns
&& cpattern
->re_nsub
> 0)
875 matchctx
->npatterns
= cpattern
->re_nsub
;
876 pmatch_len
= cpattern
->re_nsub
+ 1;
880 use_subpatterns
= false;
881 matchctx
->npatterns
= 1;
885 /* temporary output space for RE package */
886 pmatch
= palloc(sizeof(regmatch_t
) * pmatch_len
);
888 /* the real output space (grown dynamically if needed) */
889 array_len
= re_flags
.glob
? 256 : 32;
890 matchctx
->match_locs
= (int *) palloc(sizeof(int) * array_len
);
893 /* search for the pattern, perhaps repeatedly */
896 while (RE_wchar_execute(cpattern
, wide_str
, wide_len
, start_search
,
900 * If requested, ignore degenerate matches, which are zero-length
901 * matches occurring at the start or end of a string or just after a
904 if (!ignore_degenerate
||
905 (pmatch
[0].rm_so
< wide_len
&&
906 pmatch
[0].rm_eo
> prev_match_end
))
908 /* enlarge output space if needed */
909 while (array_idx
+ matchctx
->npatterns
* 2 > array_len
)
912 matchctx
->match_locs
= (int *) repalloc(matchctx
->match_locs
,
913 sizeof(int) * array_len
);
916 /* save this match's locations */
921 for (i
= 1; i
<= matchctx
->npatterns
; i
++)
923 matchctx
->match_locs
[array_idx
++] = pmatch
[i
].rm_so
;
924 matchctx
->match_locs
[array_idx
++] = pmatch
[i
].rm_eo
;
929 matchctx
->match_locs
[array_idx
++] = pmatch
[0].rm_so
;
930 matchctx
->match_locs
[array_idx
++] = pmatch
[0].rm_eo
;
932 matchctx
->nmatches
++;
934 prev_match_end
= pmatch
[0].rm_eo
;
936 /* if not glob, stop after one match */
941 * Advance search position. Normally we start just after the end of
942 * the previous match, but always advance at least one character (the
943 * special case can occur if the pattern matches zero characters just
944 * after the prior match or at the end of the string).
946 if (start_search
< pmatch
[0].rm_eo
)
947 start_search
= pmatch
[0].rm_eo
;
950 if (start_search
> wide_len
)
954 /* Clean up temp storage */
962 * cleanup_regexp_matches - release memory of a regexp_matches_ctx
965 cleanup_regexp_matches(regexp_matches_ctx
*matchctx
)
967 pfree(matchctx
->orig_str
);
968 pfree(matchctx
->match_locs
);
970 pfree(matchctx
->elems
);
972 pfree(matchctx
->nulls
);
977 * build_regexp_matches_result - build output array for current match
980 build_regexp_matches_result(regexp_matches_ctx
*matchctx
)
982 Datum
*elems
= matchctx
->elems
;
983 bool *nulls
= matchctx
->nulls
;
989 /* Extract matching substrings from the original string */
990 loc
= matchctx
->next_match
* matchctx
->npatterns
* 2;
991 for (i
= 0; i
< matchctx
->npatterns
; i
++)
993 int so
= matchctx
->match_locs
[loc
++];
994 int eo
= matchctx
->match_locs
[loc
++];
996 if (so
< 0 || eo
< 0)
998 elems
[i
] = (Datum
) 0;
1003 elems
[i
] = DirectFunctionCall3(text_substr
,
1004 PointerGetDatum(matchctx
->orig_str
),
1005 Int32GetDatum(so
+ 1),
1006 Int32GetDatum(eo
- so
));
1011 /* And form an array */
1012 dims
[0] = matchctx
->npatterns
;
1014 /* XXX: this hardcodes assumptions about the text type */
1015 return construct_md_array(elems
, nulls
, 1, dims
, lbs
,
1016 TEXTOID
, -1, false, 'i');
1020 * regexp_split_to_table()
1021 * Split the string at matches of the pattern, returning the
1022 * split-out substrings as a table.
1025 regexp_split_to_table(PG_FUNCTION_ARGS
)
1027 FuncCallContext
*funcctx
;
1028 regexp_matches_ctx
*splitctx
;
1030 if (SRF_IS_FIRSTCALL())
1032 text
*pattern
= PG_GETARG_TEXT_PP(1);
1033 text
*flags
= PG_GETARG_TEXT_PP_IF_EXISTS(2);
1034 MemoryContext oldcontext
;
1036 funcctx
= SRF_FIRSTCALL_INIT();
1037 oldcontext
= MemoryContextSwitchTo(funcctx
->multi_call_memory_ctx
);
1039 /* be sure to copy the input string into the multi-call ctx */
1040 splitctx
= setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern
,
1041 flags
, true, false, true);
1043 MemoryContextSwitchTo(oldcontext
);
1044 funcctx
->user_fctx
= (void *) splitctx
;
1047 funcctx
= SRF_PERCALL_SETUP();
1048 splitctx
= (regexp_matches_ctx
*) funcctx
->user_fctx
;
1050 if (splitctx
->next_match
<= splitctx
->nmatches
)
1052 Datum result
= build_regexp_split_result(splitctx
);
1054 splitctx
->next_match
++;
1055 SRF_RETURN_NEXT(funcctx
, result
);
1058 /* release space in multi-call ctx to avoid intraquery memory leak */
1059 cleanup_regexp_matches(splitctx
);
1061 SRF_RETURN_DONE(funcctx
);
1064 /* This is separate to keep the opr_sanity regression test from complaining */
1066 regexp_split_to_table_no_flags(PG_FUNCTION_ARGS
)
1068 return regexp_split_to_table(fcinfo
);
1072 * regexp_split_to_array()
1073 * Split the string at matches of the pattern, returning the
1074 * split-out substrings as an array.
1077 regexp_split_to_array(PG_FUNCTION_ARGS
)
1079 ArrayBuildState
*astate
= NULL
;
1080 regexp_matches_ctx
*splitctx
;
1082 splitctx
= setup_regexp_matches(PG_GETARG_TEXT_PP(0),
1083 PG_GETARG_TEXT_PP(1),
1084 PG_GETARG_TEXT_PP_IF_EXISTS(2),
1087 while (splitctx
->next_match
<= splitctx
->nmatches
)
1089 astate
= accumArrayResult(astate
,
1090 build_regexp_split_result(splitctx
),
1093 CurrentMemoryContext
);
1094 splitctx
->next_match
++;
1098 * We don't call cleanup_regexp_matches here; it would try to pfree the
1099 * input string, which we didn't copy. The space is not in a long-lived
1100 * memory context anyway.
1103 PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate
, CurrentMemoryContext
));
1106 /* This is separate to keep the opr_sanity regression test from complaining */
1108 regexp_split_to_array_no_flags(PG_FUNCTION_ARGS
)
1110 return regexp_split_to_array(fcinfo
);
1114 * build_regexp_split_result - build output string for current match
1116 * We return the string between the current match and the previous one,
1117 * or the string after the last match when next_match == nmatches.
1120 build_regexp_split_result(regexp_matches_ctx
*splitctx
)
1125 if (splitctx
->next_match
> 0)
1126 startpos
= splitctx
->match_locs
[splitctx
->next_match
* 2 - 1];
1130 elog(ERROR
, "invalid match ending position");
1132 if (splitctx
->next_match
< splitctx
->nmatches
)
1134 endpos
= splitctx
->match_locs
[splitctx
->next_match
* 2];
1135 if (endpos
< startpos
)
1136 elog(ERROR
, "invalid match starting position");
1137 return DirectFunctionCall3(text_substr
,
1138 PointerGetDatum(splitctx
->orig_str
),
1139 Int32GetDatum(startpos
+ 1),
1140 Int32GetDatum(endpos
- startpos
));
1144 /* no more matches, return rest of string */
1145 return DirectFunctionCall2(text_substr_no_len
,
1146 PointerGetDatum(splitctx
->orig_str
),
1147 Int32GetDatum(startpos
+ 1));