1 /*-------------------------------------------------------------------------
4 * Functions for the variable-length built-in types.
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/utils/adt/varlena.c
13 *-------------------------------------------------------------------------
20 #include "access/detoast.h"
21 #include "access/toast_compression.h"
22 #include "catalog/pg_collation.h"
23 #include "catalog/pg_type.h"
24 #include "common/hashfn.h"
25 #include "common/int.h"
26 #include "common/unicode_category.h"
27 #include "common/unicode_norm.h"
28 #include "common/unicode_version.h"
30 #include "lib/hyperloglog.h"
31 #include "libpq/pqformat.h"
32 #include "miscadmin.h"
33 #include "nodes/execnodes.h"
34 #include "parser/scansup.h"
35 #include "port/pg_bswap.h"
36 #include "regex/regex.h"
37 #include "utils/builtins.h"
38 #include "utils/bytea.h"
39 #include "utils/guc.h"
40 #include "utils/lsyscache.h"
41 #include "utils/memutils.h"
42 #include "utils/pg_locale.h"
43 #include "utils/sortsupport.h"
44 #include "utils/varlena.h"
48 int bytea_output
= BYTEA_OUTPUT_HEX
;
50 typedef struct varlena VarString
;
53 * State for text_position_* functions.
57 bool is_multibyte_char_in_char
; /* need to check char boundaries? */
59 char *str1
; /* haystack string */
60 char *str2
; /* needle string */
61 int len1
; /* string lengths in bytes */
64 /* Skip table for Boyer-Moore-Horspool search algorithm: */
65 int skiptablemask
; /* mask for ANDing with skiptable subscripts */
66 int skiptable
[256]; /* skip distance for given mismatched char */
68 char *last_match
; /* pointer to last match in 'str1' */
71 * Sometimes we need to convert the byte position of a match to a
72 * character position. These store the last position that was converted,
73 * so that on the next call, we can continue from that point, rather than
74 * count characters from the very beginning.
76 char *refpoint
; /* pointer within original haystack string */
77 int refpos
; /* 0-based character offset of the same point */
82 char *buf1
; /* 1st string, or abbreviation original string
84 char *buf2
; /* 2nd string, or abbreviation strxfrm() buf */
85 int buflen1
; /* Allocated length of buf1 */
86 int buflen2
; /* Allocated length of buf2 */
87 int last_len1
; /* Length of last buf1 string/strxfrm() input */
88 int last_len2
; /* Length of last buf2 string/strxfrm() blob */
89 int last_returned
; /* Last comparison result (cache) */
90 bool cache_blob
; /* Does buf2 contain strxfrm() blob, etc? */
92 Oid typid
; /* Actual datatype (text/bpchar/bytea/name) */
93 hyperLogLogState abbr_card
; /* Abbreviated key cardinality state */
94 hyperLogLogState full_card
; /* Full key cardinality state */
95 double prop_card
; /* Required cardinality proportion */
97 } VarStringSortSupport
;
100 * Output data for split_text(): we output either to an array or a table.
101 * tupstore and tupdesc must be set up in advance to output to a table.
105 ArrayBuildState
*astate
;
106 Tuplestorestate
*tupstore
;
108 } SplitTextOutputData
;
111 * This should be large enough that most strings will fit, but small enough
112 * that we feel comfortable putting it on the stack
114 #define TEXTBUFLEN 1024
116 #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
117 #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
119 static int varstrfastcmp_c(Datum x
, Datum y
, SortSupport ssup
);
120 static int bpcharfastcmp_c(Datum x
, Datum y
, SortSupport ssup
);
121 static int namefastcmp_c(Datum x
, Datum y
, SortSupport ssup
);
122 static int varlenafastcmp_locale(Datum x
, Datum y
, SortSupport ssup
);
123 static int namefastcmp_locale(Datum x
, Datum y
, SortSupport ssup
);
124 static int varstrfastcmp_locale(char *a1p
, int len1
, char *a2p
, int len2
, SortSupport ssup
);
125 static Datum
varstr_abbrev_convert(Datum original
, SortSupport ssup
);
126 static bool varstr_abbrev_abort(int memtupcount
, SortSupport ssup
);
127 static int32
text_length(Datum str
);
128 static text
*text_catenate(text
*t1
, text
*t2
);
129 static text
*text_substring(Datum str
,
132 bool length_not_specified
);
133 static text
*text_overlay(text
*t1
, text
*t2
, int sp
, int sl
);
134 static int text_position(text
*t1
, text
*t2
, Oid collid
);
135 static void text_position_setup(text
*t1
, text
*t2
, Oid collid
, TextPositionState
*state
);
136 static bool text_position_next(TextPositionState
*state
);
137 static char *text_position_next_internal(char *start_ptr
, TextPositionState
*state
);
138 static char *text_position_get_match_ptr(TextPositionState
*state
);
139 static int text_position_get_match_pos(TextPositionState
*state
);
140 static void text_position_cleanup(TextPositionState
*state
);
141 static void check_collation_set(Oid collid
);
142 static int text_cmp(text
*arg1
, text
*arg2
, Oid collid
);
143 static bytea
*bytea_catenate(bytea
*t1
, bytea
*t2
);
144 static bytea
*bytea_substring(Datum str
,
147 bool length_not_specified
);
148 static bytea
*bytea_overlay(bytea
*t1
, bytea
*t2
, int sp
, int sl
);
149 static void appendStringInfoText(StringInfo str
, const text
*t
);
150 static bool split_text(FunctionCallInfo fcinfo
, SplitTextOutputData
*tstate
);
151 static void split_text_accum_result(SplitTextOutputData
*tstate
,
155 static text
*array_to_text_internal(FunctionCallInfo fcinfo
, ArrayType
*v
,
156 const char *fldsep
, const char *null_string
);
157 static StringInfo
makeStringAggState(FunctionCallInfo fcinfo
);
158 static bool text_format_parse_digits(const char **ptr
, const char *end_ptr
,
160 static const char *text_format_parse_format(const char *start_ptr
,
162 int *argpos
, int *widthpos
,
163 int *flags
, int *width
);
164 static void text_format_string_conversion(StringInfo buf
, char conversion
,
165 FmgrInfo
*typOutputInfo
,
166 Datum value
, bool isNull
,
167 int flags
, int width
);
168 static void text_format_append_string(StringInfo buf
, const char *str
,
169 int flags
, int width
);
172 /*****************************************************************************
173 * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
174 *****************************************************************************/
179 * Create a text value from a null-terminated C string.
181 * The new text value is freshly palloc'd with a full-size VARHDR.
184 cstring_to_text(const char *s
)
186 return cstring_to_text_with_len(s
, strlen(s
));
190 * cstring_to_text_with_len
192 * Same as cstring_to_text except the caller specifies the string length;
193 * the string need not be null_terminated.
196 cstring_to_text_with_len(const char *s
, int len
)
198 text
*result
= (text
*) palloc(len
+ VARHDRSZ
);
200 SET_VARSIZE(result
, len
+ VARHDRSZ
);
201 memcpy(VARDATA(result
), s
, len
);
209 * Create a palloc'd, null-terminated C string from a text value.
211 * We support being passed a compressed or toasted text value.
212 * This is a bit bogus since such values shouldn't really be referred to as
213 * "text *", but it seems useful for robustness. If we didn't handle that
214 * case here, we'd need another routine that did, anyway.
217 text_to_cstring(const text
*t
)
219 /* must cast away the const, unfortunately */
220 text
*tunpacked
= pg_detoast_datum_packed(unconstify(text
*, t
));
221 int len
= VARSIZE_ANY_EXHDR(tunpacked
);
224 result
= (char *) palloc(len
+ 1);
225 memcpy(result
, VARDATA_ANY(tunpacked
), len
);
235 * text_to_cstring_buffer
237 * Copy a text value into a caller-supplied buffer of size dst_len.
239 * The text string is truncated if necessary to fit. The result is
240 * guaranteed null-terminated (unless dst_len == 0).
242 * We support being passed a compressed or toasted text value.
243 * This is a bit bogus since such values shouldn't really be referred to as
244 * "text *", but it seems useful for robustness. If we didn't handle that
245 * case here, we'd need another routine that did, anyway.
248 text_to_cstring_buffer(const text
*src
, char *dst
, size_t dst_len
)
250 /* must cast away the const, unfortunately */
251 text
*srcunpacked
= pg_detoast_datum_packed(unconstify(text
*, src
));
252 size_t src_len
= VARSIZE_ANY_EXHDR(srcunpacked
);
257 if (dst_len
>= src_len
)
259 else /* ensure truncation is encoding-safe */
260 dst_len
= pg_mbcliplen(VARDATA_ANY(srcunpacked
), src_len
, dst_len
);
261 memcpy(dst
, VARDATA_ANY(srcunpacked
), dst_len
);
265 if (srcunpacked
!= src
)
270 /*****************************************************************************
271 * USER I/O ROUTINES *
272 *****************************************************************************/
275 #define VAL(CH) ((CH) - '0')
276 #define DIG(VAL) ((VAL) + '0')
279 * byteain - converts from printable representation of byte array
281 * Non-printable characters must be passed as '\nnn' (octal) and are
282 * converted to internal form. '\' must be passed as '\\'.
283 * ereport(ERROR, ...) if bad form.
286 * The input is scanned twice.
287 * The error checking of input is minimal.
290 byteain(PG_FUNCTION_ARGS
)
292 char *inputText
= PG_GETARG_CSTRING(0);
293 Node
*escontext
= fcinfo
->context
;
299 /* Recognize hex input */
300 if (inputText
[0] == '\\' && inputText
[1] == 'x')
302 size_t len
= strlen(inputText
);
304 bc
= (len
- 2) / 2 + VARHDRSZ
; /* maximum possible length */
306 bc
= hex_decode_safe(inputText
+ 2, len
- 2, VARDATA(result
),
308 SET_VARSIZE(result
, bc
+ VARHDRSZ
); /* actual length */
310 PG_RETURN_BYTEA_P(result
);
313 /* Else, it's the traditional escaped style */
314 for (bc
= 0, tp
= inputText
; *tp
!= '\0'; bc
++)
318 else if ((tp
[0] == '\\') &&
319 (tp
[1] >= '0' && tp
[1] <= '3') &&
320 (tp
[2] >= '0' && tp
[2] <= '7') &&
321 (tp
[3] >= '0' && tp
[3] <= '7'))
323 else if ((tp
[0] == '\\') &&
329 * one backslash, not followed by another or ### valid octal
331 ereturn(escontext
, (Datum
) 0,
332 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION
),
333 errmsg("invalid input syntax for type %s", "bytea")));
339 result
= (bytea
*) palloc(bc
);
340 SET_VARSIZE(result
, bc
);
343 rp
= VARDATA(result
);
348 else if ((tp
[0] == '\\') &&
349 (tp
[1] >= '0' && tp
[1] <= '3') &&
350 (tp
[2] >= '0' && tp
[2] <= '7') &&
351 (tp
[3] >= '0' && tp
[3] <= '7'))
357 *rp
++ = bc
+ VAL(tp
[3]);
361 else if ((tp
[0] == '\\') &&
370 * We should never get here. The first pass should not allow it.
372 ereturn(escontext
, (Datum
) 0,
373 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION
),
374 errmsg("invalid input syntax for type %s", "bytea")));
378 PG_RETURN_BYTEA_P(result
);
382 * byteaout - converts to printable representation of byte array
384 * In the traditional escaped format, non-printable characters are
385 * printed as '\nnn' (octal) and '\' as '\\'.
388 byteaout(PG_FUNCTION_ARGS
)
390 bytea
*vlena
= PG_GETARG_BYTEA_PP(0);
394 if (bytea_output
== BYTEA_OUTPUT_HEX
)
396 /* Print hex format */
397 rp
= result
= palloc(VARSIZE_ANY_EXHDR(vlena
) * 2 + 2 + 1);
400 rp
+= hex_encode(VARDATA_ANY(vlena
), VARSIZE_ANY_EXHDR(vlena
), rp
);
402 else if (bytea_output
== BYTEA_OUTPUT_ESCAPE
)
404 /* Print traditional escaped format */
409 len
= 1; /* empty string has 1 char */
410 vp
= VARDATA_ANY(vlena
);
411 for (i
= VARSIZE_ANY_EXHDR(vlena
); i
!= 0; i
--, vp
++)
415 else if ((unsigned char) *vp
< 0x20 || (unsigned char) *vp
> 0x7e)
422 * In principle len can't overflow uint32 if the input fit in 1GB, but
423 * for safety let's check rather than relying on palloc's internal
426 if (len
> MaxAllocSize
)
428 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED
),
429 errmsg_internal("result of bytea output conversion is too large")));
430 rp
= result
= (char *) palloc(len
);
432 vp
= VARDATA_ANY(vlena
);
433 for (i
= VARSIZE_ANY_EXHDR(vlena
); i
!= 0; i
--, vp
++)
440 else if ((unsigned char) *vp
< 0x20 || (unsigned char) *vp
> 0x7e)
442 int val
; /* holds unprintable chars */
446 rp
[3] = DIG(val
& 07);
448 rp
[2] = DIG(val
& 07);
450 rp
[1] = DIG(val
& 03);
459 elog(ERROR
, "unrecognized \"bytea_output\" setting: %d",
461 rp
= result
= NULL
; /* keep compiler quiet */
464 PG_RETURN_CSTRING(result
);
468 * bytearecv - converts external binary format to bytea
471 bytearecv(PG_FUNCTION_ARGS
)
473 StringInfo buf
= (StringInfo
) PG_GETARG_POINTER(0);
477 nbytes
= buf
->len
- buf
->cursor
;
478 result
= (bytea
*) palloc(nbytes
+ VARHDRSZ
);
479 SET_VARSIZE(result
, nbytes
+ VARHDRSZ
);
480 pq_copymsgbytes(buf
, VARDATA(result
), nbytes
);
481 PG_RETURN_BYTEA_P(result
);
485 * byteasend - converts bytea to binary format
487 * This is a special case: just copy the input...
490 byteasend(PG_FUNCTION_ARGS
)
492 bytea
*vlena
= PG_GETARG_BYTEA_P_COPY(0);
494 PG_RETURN_BYTEA_P(vlena
);
498 bytea_string_agg_transfn(PG_FUNCTION_ARGS
)
502 state
= PG_ARGISNULL(0) ? NULL
: (StringInfo
) PG_GETARG_POINTER(0);
504 /* Append the value unless null, preceding it with the delimiter. */
505 if (!PG_ARGISNULL(1))
507 bytea
*value
= PG_GETARG_BYTEA_PP(1);
508 bool isfirst
= false;
511 * You might think we can just throw away the first delimiter, however
512 * we must keep it as we may be a parallel worker doing partial
513 * aggregation building a state to send to the main process. We need
514 * to keep the delimiter of every aggregation so that the combine
515 * function can properly join up the strings of two separately
516 * partially aggregated results. The first delimiter is only stripped
517 * off in the final function. To know how much to strip off the front
518 * of the string, we store the length of the first delimiter in the
519 * StringInfo's cursor field, which we don't otherwise need here.
523 state
= makeStringAggState(fcinfo
);
527 if (!PG_ARGISNULL(2))
529 bytea
*delim
= PG_GETARG_BYTEA_PP(2);
531 appendBinaryStringInfo(state
, VARDATA_ANY(delim
),
532 VARSIZE_ANY_EXHDR(delim
));
534 state
->cursor
= VARSIZE_ANY_EXHDR(delim
);
537 appendBinaryStringInfo(state
, VARDATA_ANY(value
),
538 VARSIZE_ANY_EXHDR(value
));
542 * The transition type for string_agg() is declared to be "internal",
543 * which is a pass-by-value type the same size as a pointer.
546 PG_RETURN_POINTER(state
);
551 bytea_string_agg_finalfn(PG_FUNCTION_ARGS
)
555 /* cannot be called directly because of internal-type argument */
556 Assert(AggCheckCallContext(fcinfo
, NULL
));
558 state
= PG_ARGISNULL(0) ? NULL
: (StringInfo
) PG_GETARG_POINTER(0);
562 /* As per comment in transfn, strip data before the cursor position */
564 int strippedlen
= state
->len
- state
->cursor
;
566 result
= (bytea
*) palloc(strippedlen
+ VARHDRSZ
);
567 SET_VARSIZE(result
, strippedlen
+ VARHDRSZ
);
568 memcpy(VARDATA(result
), &state
->data
[state
->cursor
], strippedlen
);
569 PG_RETURN_BYTEA_P(result
);
576 * textin - converts cstring to internal representation
579 textin(PG_FUNCTION_ARGS
)
581 char *inputText
= PG_GETARG_CSTRING(0);
583 PG_RETURN_TEXT_P(cstring_to_text(inputText
));
587 * textout - converts internal representation to cstring
590 textout(PG_FUNCTION_ARGS
)
592 Datum txt
= PG_GETARG_DATUM(0);
594 PG_RETURN_CSTRING(TextDatumGetCString(txt
));
598 * textrecv - converts external binary format to text
601 textrecv(PG_FUNCTION_ARGS
)
603 StringInfo buf
= (StringInfo
) PG_GETARG_POINTER(0);
608 str
= pq_getmsgtext(buf
, buf
->len
- buf
->cursor
, &nbytes
);
610 result
= cstring_to_text_with_len(str
, nbytes
);
612 PG_RETURN_TEXT_P(result
);
616 * textsend - converts text to binary format
619 textsend(PG_FUNCTION_ARGS
)
621 text
*t
= PG_GETARG_TEXT_PP(0);
624 pq_begintypsend(&buf
);
625 pq_sendtext(&buf
, VARDATA_ANY(t
), VARSIZE_ANY_EXHDR(t
));
626 PG_RETURN_BYTEA_P(pq_endtypsend(&buf
));
631 * unknownin - converts cstring to internal representation
634 unknownin(PG_FUNCTION_ARGS
)
636 char *str
= PG_GETARG_CSTRING(0);
638 /* representation is same as cstring */
639 PG_RETURN_CSTRING(pstrdup(str
));
643 * unknownout - converts internal representation to cstring
646 unknownout(PG_FUNCTION_ARGS
)
648 /* representation is same as cstring */
649 char *str
= PG_GETARG_CSTRING(0);
651 PG_RETURN_CSTRING(pstrdup(str
));
655 * unknownrecv - converts external binary format to unknown
658 unknownrecv(PG_FUNCTION_ARGS
)
660 StringInfo buf
= (StringInfo
) PG_GETARG_POINTER(0);
664 str
= pq_getmsgtext(buf
, buf
->len
- buf
->cursor
, &nbytes
);
665 /* representation is same as cstring */
666 PG_RETURN_CSTRING(str
);
670 * unknownsend - converts unknown to binary format
673 unknownsend(PG_FUNCTION_ARGS
)
675 /* representation is same as cstring */
676 char *str
= PG_GETARG_CSTRING(0);
679 pq_begintypsend(&buf
);
680 pq_sendtext(&buf
, str
, strlen(str
));
681 PG_RETURN_BYTEA_P(pq_endtypsend(&buf
));
685 /* ========== PUBLIC ROUTINES ========== */
689 * returns the logical length of a text*
690 * (which is less than the VARSIZE of the text*)
693 textlen(PG_FUNCTION_ARGS
)
695 Datum str
= PG_GETARG_DATUM(0);
697 /* try to avoid decompressing argument */
698 PG_RETURN_INT32(text_length(str
));
703 * Does the real work for textlen()
705 * This is broken out so it can be called directly by other string processing
706 * functions. Note that the argument is passed as a Datum, to indicate that
707 * it may still be in compressed form. We can avoid decompressing it at all
711 text_length(Datum str
)
713 /* fastpath when max encoding length is one */
714 if (pg_database_encoding_max_length() == 1)
715 PG_RETURN_INT32(toast_raw_datum_size(str
) - VARHDRSZ
);
718 text
*t
= DatumGetTextPP(str
);
720 PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t
),
721 VARSIZE_ANY_EXHDR(t
)));
727 * returns the physical length of a text*
728 * (which is less than the VARSIZE of the text*)
731 textoctetlen(PG_FUNCTION_ARGS
)
733 Datum str
= PG_GETARG_DATUM(0);
735 /* We need not detoast the input at all */
736 PG_RETURN_INT32(toast_raw_datum_size(str
) - VARHDRSZ
);
741 * takes two text* and returns a text* that is the concatenation of
744 * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
745 * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
746 * Allocate space for output in all cases.
747 * XXX - thomas 1997-07-10
750 textcat(PG_FUNCTION_ARGS
)
752 text
*t1
= PG_GETARG_TEXT_PP(0);
753 text
*t2
= PG_GETARG_TEXT_PP(1);
755 PG_RETURN_TEXT_P(text_catenate(t1
, t2
));
760 * Guts of textcat(), broken out so it can be used by other functions
762 * Arguments can be in short-header form, but not compressed or out-of-line
765 text_catenate(text
*t1
, text
*t2
)
773 len1
= VARSIZE_ANY_EXHDR(t1
);
774 len2
= VARSIZE_ANY_EXHDR(t2
);
776 /* paranoia ... probably should throw error instead? */
782 len
= len1
+ len2
+ VARHDRSZ
;
783 result
= (text
*) palloc(len
);
785 /* Set size of result string... */
786 SET_VARSIZE(result
, len
);
788 /* Fill data field of result string... */
789 ptr
= VARDATA(result
);
791 memcpy(ptr
, VARDATA_ANY(t1
), len1
);
793 memcpy(ptr
+ len1
, VARDATA_ANY(t2
), len2
);
799 * charlen_to_bytelen()
800 * Compute the number of bytes occupied by n characters starting at *p
802 * It is caller's responsibility that there actually are n characters;
803 * the string need not be null-terminated.
806 charlen_to_bytelen(const char *p
, int n
)
808 if (pg_database_encoding_max_length() == 1)
810 /* Optimization for single-byte encodings */
817 for (s
= p
; n
> 0; n
--)
826 * Return a substring starting at the specified position.
827 * - thomas 1997-12-31
831 * - starting position (is one-based)
834 * If the starting position is zero or less, then return from the start of the string
835 * adjusting the length to be consistent with the "negative start" per SQL.
836 * If the length is less than zero, return the remaining string.
838 * Added multibyte support.
839 * - Tatsuo Ishii 1998-4-21
840 * Changed behavior if starting position is less than one to conform to SQL behavior.
841 * Formerly returned the entire string; now returns a portion.
842 * - Thomas Lockhart 1998-12-10
843 * Now uses faster TOAST-slicing interface
844 * - John Gray 2002-02-22
845 * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
846 * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
847 * error; if E < 1, return '', not entire string). Fixed MB related bug when
848 * S > LC and < LC + 4 sometimes garbage characters are returned.
849 * - Joe Conway 2002-08-10
852 text_substr(PG_FUNCTION_ARGS
)
854 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
861 * text_substr_no_len -
862 * Wrapper to avoid opr_sanity failure due to
863 * one function accepting a different number of args.
866 text_substr_no_len(PG_FUNCTION_ARGS
)
868 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
875 * Does the real work for text_substr() and text_substr_no_len()
877 * This is broken out so it can be called directly by other string processing
878 * functions. Note that the argument is passed as a Datum, to indicate that
879 * it may still be in compressed/toasted form. We can avoid detoasting all
880 * of it in some cases.
882 * The result is always a freshly palloc'd datum.
885 text_substring(Datum str
, int32 start
, int32 length
, bool length_not_specified
)
887 int32 eml
= pg_database_encoding_max_length();
888 int32 S
= start
; /* start position */
889 int32 S1
; /* adjusted start position */
890 int32 L1
; /* adjusted substring length */
891 int32 E
; /* end position */
894 * SQL99 says S can be zero or negative (which we don't document), but we
895 * still must fetch from the start of the string.
896 * https://www.postgresql.org/message-id/170905442373.643.11536838320909376197%40wrigleys.postgresql.org
900 /* life is easy if the encoding max length is 1 */
903 if (length_not_specified
) /* special case - get length to end of
908 /* SQL99 says to throw an error for E < S, i.e., negative length */
910 (errcode(ERRCODE_SUBSTRING_ERROR
),
911 errmsg("negative substring length not allowed")));
912 L1
= -1; /* silence stupider compilers */
914 else if (pg_add_s32_overflow(S
, length
, &E
))
917 * L could be large enough for S + L to overflow, in which case
918 * the substring must run to end of string.
925 * A zero or negative value for the end position can happen if the
926 * start was negative or one. SQL99 says to return a zero-length
930 return cstring_to_text("");
936 * If the start position is past the end of the string, SQL99 says to
937 * return a zero-length string -- DatumGetTextPSlice() will do that
938 * for us. We need only convert S1 to zero-based starting position.
940 return DatumGetTextPSlice(str
, S1
- 1, L1
);
945 * When encoding max length is > 1, we can't get LC without
946 * detoasting, so we'll grab a conservatively large slice now and go
947 * back later to do the right thing
960 * We need to start at position zero because there is no way to know
961 * in advance which byte offset corresponds to the supplied start
966 if (length_not_specified
) /* special case - get length to end of
968 slice_size
= L1
= -1;
971 /* SQL99 says to throw an error for E < S, i.e., negative length */
973 (errcode(ERRCODE_SUBSTRING_ERROR
),
974 errmsg("negative substring length not allowed")));
975 slice_size
= L1
= -1; /* silence stupider compilers */
977 else if (pg_add_s32_overflow(S
, length
, &E
))
980 * L could be large enough for S + L to overflow, in which case
981 * the substring must run to end of string.
983 slice_size
= L1
= -1;
988 * A zero or negative value for the end position can happen if the
989 * start was negative or one. SQL99 says to return a zero-length
993 return cstring_to_text("");
996 * if E is past the end of the string, the tuple toaster will
997 * truncate the length for us
1002 * Total slice size in bytes can't be any longer than the start
1003 * position plus substring length times the encoding max length.
1004 * If that overflows, we can just use -1.
1006 if (pg_mul_s32_overflow(E
, eml
, &slice_size
))
1011 * If we're working with an untoasted source, no need to do an extra
1014 if (VARATT_IS_COMPRESSED(DatumGetPointer(str
)) ||
1015 VARATT_IS_EXTERNAL(DatumGetPointer(str
)))
1016 slice
= DatumGetTextPSlice(str
, slice_start
, slice_size
);
1018 slice
= (text
*) DatumGetPointer(str
);
1020 /* see if we got back an empty string */
1021 if (VARSIZE_ANY_EXHDR(slice
) == 0)
1023 if (slice
!= (text
*) DatumGetPointer(str
))
1025 return cstring_to_text("");
1028 /* Now we can get the actual length of the slice in MB characters */
1029 slice_strlen
= pg_mbstrlen_with_len(VARDATA_ANY(slice
),
1030 VARSIZE_ANY_EXHDR(slice
));
1033 * Check that the start position wasn't > slice_strlen. If so, SQL99
1034 * says to return a zero-length string.
1036 if (S1
> slice_strlen
)
1038 if (slice
!= (text
*) DatumGetPointer(str
))
1040 return cstring_to_text("");
1044 * Adjust L1 and E1 now that we know the slice string length. Again
1045 * remember that S1 is one based, and slice_start is zero based.
1048 E1
= Min(S1
+ L1
, slice_start
+ 1 + slice_strlen
);
1050 E1
= slice_start
+ 1 + slice_strlen
;
1053 * Find the start position in the slice; remember S1 is not zero based
1055 p
= VARDATA_ANY(slice
);
1056 for (i
= 0; i
< S1
- 1; i
++)
1059 /* hang onto a pointer to our start position */
1063 * Count the actual bytes used by the substring of the requested
1066 for (i
= S1
; i
< E1
; i
++)
1069 ret
= (text
*) palloc(VARHDRSZ
+ (p
- s
));
1070 SET_VARSIZE(ret
, VARHDRSZ
+ (p
- s
));
1071 memcpy(VARDATA(ret
), s
, (p
- s
));
1073 if (slice
!= (text
*) DatumGetPointer(str
))
1079 elog(ERROR
, "invalid backend encoding: encoding max length < 1");
1081 /* not reached: suppress compiler warning */
1087 * Replace specified substring of first string with second
1089 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1090 * This code is a direct implementation of what the standard says.
1093 textoverlay(PG_FUNCTION_ARGS
)
1095 text
*t1
= PG_GETARG_TEXT_PP(0);
1096 text
*t2
= PG_GETARG_TEXT_PP(1);
1097 int sp
= PG_GETARG_INT32(2); /* substring start position */
1098 int sl
= PG_GETARG_INT32(3); /* substring length */
1100 PG_RETURN_TEXT_P(text_overlay(t1
, t2
, sp
, sl
));
1104 textoverlay_no_len(PG_FUNCTION_ARGS
)
1106 text
*t1
= PG_GETARG_TEXT_PP(0);
1107 text
*t2
= PG_GETARG_TEXT_PP(1);
1108 int sp
= PG_GETARG_INT32(2); /* substring start position */
1111 sl
= text_length(PointerGetDatum(t2
)); /* defaults to length(t2) */
1112 PG_RETURN_TEXT_P(text_overlay(t1
, t2
, sp
, sl
));
1116 text_overlay(text
*t1
, text
*t2
, int sp
, int sl
)
1124 * Check for possible integer-overflow cases. For negative sp, throw a
1125 * "substring length" error because that's what should be expected
1126 * according to the spec's definition of OVERLAY().
1130 (errcode(ERRCODE_SUBSTRING_ERROR
),
1131 errmsg("negative substring length not allowed")));
1132 if (pg_add_s32_overflow(sp
, sl
, &sp_pl_sl
))
1134 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE
),
1135 errmsg("integer out of range")));
1137 s1
= text_substring(PointerGetDatum(t1
), 1, sp
- 1, false);
1138 s2
= text_substring(PointerGetDatum(t1
), sp_pl_sl
, -1, true);
1139 result
= text_catenate(s1
, t2
);
1140 result
= text_catenate(result
, s2
);
1147 * Return the position of the specified substring.
1148 * Implements the SQL POSITION() function.
1149 * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1150 * - thomas 1997-07-27
1153 textpos(PG_FUNCTION_ARGS
)
1155 text
*str
= PG_GETARG_TEXT_PP(0);
1156 text
*search_str
= PG_GETARG_TEXT_PP(1);
1158 PG_RETURN_INT32((int32
) text_position(str
, search_str
, PG_GET_COLLATION()));
1163 * Does the real work for textpos()
1166 * t1 - string to be searched
1167 * t2 - pattern to match within t1
1169 * Character index of the first matched char, starting from 1,
1172 * This is broken out so it can be called directly by other string processing
1176 text_position(text
*t1
, text
*t2
, Oid collid
)
1178 TextPositionState state
;
1181 /* Empty needle always matches at position 1 */
1182 if (VARSIZE_ANY_EXHDR(t2
) < 1)
1185 /* Otherwise, can't match if haystack is shorter than needle */
1186 if (VARSIZE_ANY_EXHDR(t1
) < VARSIZE_ANY_EXHDR(t2
))
1189 text_position_setup(t1
, t2
, collid
, &state
);
1190 if (!text_position_next(&state
))
1193 result
= text_position_get_match_pos(&state
);
1194 text_position_cleanup(&state
);
1200 * text_position_setup, text_position_next, text_position_cleanup -
1201 * Component steps of text_position()
1203 * These are broken out so that a string can be efficiently searched for
1204 * multiple occurrences of the same pattern. text_position_next may be
1205 * called multiple times, and it advances to the next match on each call.
1206 * text_position_get_match_ptr() and text_position_get_match_pos() return
1207 * a pointer or 1-based character position of the last match, respectively.
1209 * The "state" variable is normally just a local variable in the caller.
1211 * NOTE: text_position_next skips over the matched portion. For example,
1212 * searching for "xx" in "xxx" returns only one match, not two.
1216 text_position_setup(text
*t1
, text
*t2
, Oid collid
, TextPositionState
*state
)
1218 int len1
= VARSIZE_ANY_EXHDR(t1
);
1219 int len2
= VARSIZE_ANY_EXHDR(t2
);
1220 pg_locale_t mylocale
;
1222 check_collation_set(collid
);
1224 mylocale
= pg_newlocale_from_collation(collid
);
1226 if (!mylocale
->deterministic
)
1228 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED
),
1229 errmsg("nondeterministic collations are not supported for substring searches")));
1235 * Even with a multi-byte encoding, we perform the search using the raw
1236 * byte sequence, ignoring multibyte issues. For UTF-8, that works fine,
1237 * because in UTF-8 the byte sequence of one character cannot contain
1238 * another character. For other multi-byte encodings, we do the search
1239 * initially as a simple byte search, ignoring multibyte issues, but
1240 * verify afterwards that the match we found is at a character boundary,
1241 * and continue the search if it was a false match.
1243 if (pg_database_encoding_max_length() == 1)
1244 state
->is_multibyte_char_in_char
= false;
1245 else if (GetDatabaseEncoding() == PG_UTF8
)
1246 state
->is_multibyte_char_in_char
= false;
1248 state
->is_multibyte_char_in_char
= true;
1250 state
->str1
= VARDATA_ANY(t1
);
1251 state
->str2
= VARDATA_ANY(t2
);
1254 state
->last_match
= NULL
;
1255 state
->refpoint
= state
->str1
;
1259 * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1260 * notes we use the terminology that the "haystack" is the string to be
1261 * searched (t1) and the "needle" is the pattern being sought (t2).
1263 * If the needle is empty or bigger than the haystack then there is no
1264 * point in wasting cycles initializing the table. We also choose not to
1265 * use B-M-H for needles of length 1, since the skip table can't possibly
1266 * save anything in that case.
1268 if (len1
>= len2
&& len2
> 1)
1270 int searchlength
= len1
- len2
;
1274 const char *str2
= state
->str2
;
1277 * First we must determine how much of the skip table to use. The
1278 * declaration of TextPositionState allows up to 256 elements, but for
1279 * short search problems we don't really want to have to initialize so
1280 * many elements --- it would take too long in comparison to the
1281 * actual search time. So we choose a useful skip table size based on
1282 * the haystack length minus the needle length. The closer the needle
1283 * length is to the haystack length the less useful skipping becomes.
1285 * Note: since we use bit-masking to select table elements, the skip
1286 * table size MUST be a power of 2, and so the mask must be 2^N-1.
1288 if (searchlength
< 16)
1290 else if (searchlength
< 64)
1292 else if (searchlength
< 128)
1294 else if (searchlength
< 512)
1296 else if (searchlength
< 2048)
1298 else if (searchlength
< 4096)
1299 skiptablemask
= 127;
1301 skiptablemask
= 255;
1302 state
->skiptablemask
= skiptablemask
;
1305 * Initialize the skip table. We set all elements to the needle
1306 * length, since this is the correct skip distance for any character
1307 * not found in the needle.
1309 for (i
= 0; i
<= skiptablemask
; i
++)
1310 state
->skiptable
[i
] = len2
;
1313 * Now examine the needle. For each character except the last one,
1314 * set the corresponding table element to the appropriate skip
1315 * distance. Note that when two characters share the same skip table
1316 * entry, the one later in the needle must determine the skip
1321 for (i
= 0; i
< last
; i
++)
1322 state
->skiptable
[(unsigned char) str2
[i
] & skiptablemask
] = last
- i
;
1327 * Advance to the next match, starting from the end of the previous match
1328 * (or the beginning of the string, on first call). Returns true if a match
1331 * Note that this refuses to match an empty-string needle. Most callers
1332 * will have handled that case specially and we'll never see it here.
1335 text_position_next(TextPositionState
*state
)
1337 int needle_len
= state
->len2
;
1341 if (needle_len
<= 0)
1342 return false; /* result for empty pattern */
1344 /* Start from the point right after the previous match. */
1345 if (state
->last_match
)
1346 start_ptr
= state
->last_match
+ needle_len
;
1348 start_ptr
= state
->str1
;
1351 matchptr
= text_position_next_internal(start_ptr
, state
);
1357 * Found a match for the byte sequence. If this is a multibyte encoding,
1358 * where one character's byte sequence can appear inside a longer
1359 * multi-byte character, we need to verify that the match was at a
1360 * character boundary, not in the middle of a multi-byte character.
1362 if (state
->is_multibyte_char_in_char
)
1364 /* Walk one character at a time, until we reach the match. */
1366 /* the search should never move backwards. */
1367 Assert(state
->refpoint
<= matchptr
);
1369 while (state
->refpoint
< matchptr
)
1371 /* step to next character. */
1372 state
->refpoint
+= pg_mblen(state
->refpoint
);
1376 * If we stepped over the match's start position, then it was a
1377 * false positive, where the byte sequence appeared in the middle
1378 * of a multi-byte character. Skip it, and continue the search at
1379 * the next character boundary.
1381 if (state
->refpoint
> matchptr
)
1383 start_ptr
= state
->refpoint
;
1389 state
->last_match
= matchptr
;
1394 * Subroutine of text_position_next(). This searches for the raw byte
1395 * sequence, ignoring any multi-byte encoding issues. Returns the first
1396 * match starting at 'start_ptr', or NULL if no match is found.
1399 text_position_next_internal(char *start_ptr
, TextPositionState
*state
)
1401 int haystack_len
= state
->len1
;
1402 int needle_len
= state
->len2
;
1403 int skiptablemask
= state
->skiptablemask
;
1404 const char *haystack
= state
->str1
;
1405 const char *needle
= state
->str2
;
1406 const char *haystack_end
= &haystack
[haystack_len
];
1409 Assert(start_ptr
>= haystack
&& start_ptr
<= haystack_end
);
1411 if (needle_len
== 1)
1413 /* No point in using B-M-H for a one-character needle */
1414 char nchar
= *needle
;
1417 while (hptr
< haystack_end
)
1420 return (char *) hptr
;
1426 const char *needle_last
= &needle
[needle_len
- 1];
1428 /* Start at startpos plus the length of the needle */
1429 hptr
= start_ptr
+ needle_len
- 1;
1430 while (hptr
< haystack_end
)
1432 /* Match the needle scanning *backward* */
1440 /* Matched it all? If so, return 1-based position */
1447 * No match, so use the haystack char at hptr to decide how far to
1448 * advance. If the needle had any occurrence of that character
1449 * (or more precisely, one sharing the same skiptable entry)
1450 * before its last character, then we advance far enough to align
1451 * the last such needle character with that haystack position.
1452 * Otherwise we can advance by the whole needle length.
1454 hptr
+= state
->skiptable
[(unsigned char) *hptr
& skiptablemask
];
1458 return 0; /* not found */
1462 * Return a pointer to the current match.
1464 * The returned pointer points into the original haystack string.
1467 text_position_get_match_ptr(TextPositionState
*state
)
1469 return state
->last_match
;
1473 * Return the offset of the current match.
1475 * The offset is in characters, 1-based.
1478 text_position_get_match_pos(TextPositionState
*state
)
1480 /* Convert the byte position to char position. */
1481 state
->refpos
+= pg_mbstrlen_with_len(state
->refpoint
,
1482 state
->last_match
- state
->refpoint
);
1483 state
->refpoint
= state
->last_match
;
1484 return state
->refpos
+ 1;
1488 * Reset search state to the initial state installed by text_position_setup.
1490 * The next call to text_position_next will search from the beginning
1494 text_position_reset(TextPositionState
*state
)
1496 state
->last_match
= NULL
;
1497 state
->refpoint
= state
->str1
;
1502 text_position_cleanup(TextPositionState
*state
)
1504 /* no cleanup needed */
1509 check_collation_set(Oid collid
)
1511 if (!OidIsValid(collid
))
1514 * This typically means that the parser could not resolve a conflict
1515 * of implicit collations, so report it that way.
1518 (errcode(ERRCODE_INDETERMINATE_COLLATION
),
1519 errmsg("could not determine which collation to use for string comparison"),
1520 errhint("Use the COLLATE clause to set the collation explicitly.")));
1527 * Comparison function for text strings with given lengths, using the
1528 * appropriate locale. Returns an integer less than, equal to, or greater than
1529 * zero, indicating whether arg1 is less than, equal to, or greater than arg2.
1531 * Note: many functions that depend on this are marked leakproof; therefore,
1532 * avoid reporting the actual contents of the input when throwing errors.
1533 * All errors herein should be things that can't happen except on corrupt
1534 * data, anyway; otherwise we will have trouble with indexing strings that
1538 varstr_cmp(const char *arg1
, int len1
, const char *arg2
, int len2
, Oid collid
)
1541 pg_locale_t mylocale
;
1543 check_collation_set(collid
);
1545 mylocale
= pg_newlocale_from_collation(collid
);
1547 if (mylocale
->collate_is_c
)
1549 result
= memcmp(arg1
, arg2
, Min(len1
, len2
));
1550 if ((result
== 0) && (len1
!= len2
))
1551 result
= (len1
< len2
) ? -1 : 1;
1556 * memcmp() can't tell us which of two unequal strings sorts first,
1557 * but it's a cheap way to tell if they're equal. Testing shows that
1558 * memcmp() followed by strcoll() is only trivially slower than
1559 * strcoll() by itself, so we don't lose much if this doesn't work out
1560 * very often, and if it does - for example, because there are many
1561 * equal strings in the input - then we win big by avoiding expensive
1562 * collation-aware comparisons.
1564 if (len1
== len2
&& memcmp(arg1
, arg2
, len1
) == 0)
1567 result
= pg_strncoll(arg1
, len1
, arg2
, len2
, mylocale
);
1569 /* Break tie if necessary. */
1570 if (result
== 0 && mylocale
->deterministic
)
1572 result
= memcmp(arg1
, arg2
, Min(len1
, len2
));
1573 if ((result
== 0) && (len1
!= len2
))
1574 result
= (len1
< len2
) ? -1 : 1;
1582 * Internal comparison function for text strings.
1583 * Returns -1, 0 or 1
1586 text_cmp(text
*arg1
, text
*arg2
, Oid collid
)
1593 a1p
= VARDATA_ANY(arg1
);
1594 a2p
= VARDATA_ANY(arg2
);
1596 len1
= VARSIZE_ANY_EXHDR(arg1
);
1597 len2
= VARSIZE_ANY_EXHDR(arg2
);
1599 return varstr_cmp(a1p
, len1
, a2p
, len2
, collid
);
1603 * Comparison functions for text strings.
1605 * Note: btree indexes need these routines not to leak memory; therefore,
1606 * be careful to free working copies of toasted datums. Most places don't
1607 * need to be so careful.
1611 texteq(PG_FUNCTION_ARGS
)
1613 Oid collid
= PG_GET_COLLATION();
1614 pg_locale_t mylocale
= 0;
1617 check_collation_set(collid
);
1619 mylocale
= pg_newlocale_from_collation(collid
);
1621 if (mylocale
->deterministic
)
1623 Datum arg1
= PG_GETARG_DATUM(0);
1624 Datum arg2
= PG_GETARG_DATUM(1);
1629 * Since we only care about equality or not-equality, we can avoid all
1630 * the expense of strcoll() here, and just do bitwise comparison. In
1631 * fact, we don't even have to do a bitwise comparison if we can show
1632 * the lengths of the strings are unequal; which might save us from
1633 * having to detoast one or both values.
1635 len1
= toast_raw_datum_size(arg1
);
1636 len2
= toast_raw_datum_size(arg2
);
1641 text
*targ1
= DatumGetTextPP(arg1
);
1642 text
*targ2
= DatumGetTextPP(arg2
);
1644 result
= (memcmp(VARDATA_ANY(targ1
), VARDATA_ANY(targ2
),
1645 len1
- VARHDRSZ
) == 0);
1647 PG_FREE_IF_COPY(targ1
, 0);
1648 PG_FREE_IF_COPY(targ2
, 1);
1653 text
*arg1
= PG_GETARG_TEXT_PP(0);
1654 text
*arg2
= PG_GETARG_TEXT_PP(1);
1656 result
= (text_cmp(arg1
, arg2
, collid
) == 0);
1658 PG_FREE_IF_COPY(arg1
, 0);
1659 PG_FREE_IF_COPY(arg2
, 1);
1662 PG_RETURN_BOOL(result
);
1666 textne(PG_FUNCTION_ARGS
)
1668 Oid collid
= PG_GET_COLLATION();
1669 pg_locale_t mylocale
;
1672 check_collation_set(collid
);
1674 mylocale
= pg_newlocale_from_collation(collid
);
1676 if (mylocale
->deterministic
)
1678 Datum arg1
= PG_GETARG_DATUM(0);
1679 Datum arg2
= PG_GETARG_DATUM(1);
1683 /* See comment in texteq() */
1684 len1
= toast_raw_datum_size(arg1
);
1685 len2
= toast_raw_datum_size(arg2
);
1690 text
*targ1
= DatumGetTextPP(arg1
);
1691 text
*targ2
= DatumGetTextPP(arg2
);
1693 result
= (memcmp(VARDATA_ANY(targ1
), VARDATA_ANY(targ2
),
1694 len1
- VARHDRSZ
) != 0);
1696 PG_FREE_IF_COPY(targ1
, 0);
1697 PG_FREE_IF_COPY(targ2
, 1);
1702 text
*arg1
= PG_GETARG_TEXT_PP(0);
1703 text
*arg2
= PG_GETARG_TEXT_PP(1);
1705 result
= (text_cmp(arg1
, arg2
, collid
) != 0);
1707 PG_FREE_IF_COPY(arg1
, 0);
1708 PG_FREE_IF_COPY(arg2
, 1);
1711 PG_RETURN_BOOL(result
);
1715 text_lt(PG_FUNCTION_ARGS
)
1717 text
*arg1
= PG_GETARG_TEXT_PP(0);
1718 text
*arg2
= PG_GETARG_TEXT_PP(1);
1721 result
= (text_cmp(arg1
, arg2
, PG_GET_COLLATION()) < 0);
1723 PG_FREE_IF_COPY(arg1
, 0);
1724 PG_FREE_IF_COPY(arg2
, 1);
1726 PG_RETURN_BOOL(result
);
1730 text_le(PG_FUNCTION_ARGS
)
1732 text
*arg1
= PG_GETARG_TEXT_PP(0);
1733 text
*arg2
= PG_GETARG_TEXT_PP(1);
1736 result
= (text_cmp(arg1
, arg2
, PG_GET_COLLATION()) <= 0);
1738 PG_FREE_IF_COPY(arg1
, 0);
1739 PG_FREE_IF_COPY(arg2
, 1);
1741 PG_RETURN_BOOL(result
);
1745 text_gt(PG_FUNCTION_ARGS
)
1747 text
*arg1
= PG_GETARG_TEXT_PP(0);
1748 text
*arg2
= PG_GETARG_TEXT_PP(1);
1751 result
= (text_cmp(arg1
, arg2
, PG_GET_COLLATION()) > 0);
1753 PG_FREE_IF_COPY(arg1
, 0);
1754 PG_FREE_IF_COPY(arg2
, 1);
1756 PG_RETURN_BOOL(result
);
1760 text_ge(PG_FUNCTION_ARGS
)
1762 text
*arg1
= PG_GETARG_TEXT_PP(0);
1763 text
*arg2
= PG_GETARG_TEXT_PP(1);
1766 result
= (text_cmp(arg1
, arg2
, PG_GET_COLLATION()) >= 0);
1768 PG_FREE_IF_COPY(arg1
, 0);
1769 PG_FREE_IF_COPY(arg2
, 1);
1771 PG_RETURN_BOOL(result
);
1775 text_starts_with(PG_FUNCTION_ARGS
)
1777 Datum arg1
= PG_GETARG_DATUM(0);
1778 Datum arg2
= PG_GETARG_DATUM(1);
1779 Oid collid
= PG_GET_COLLATION();
1780 pg_locale_t mylocale
;
1785 check_collation_set(collid
);
1787 mylocale
= pg_newlocale_from_collation(collid
);
1789 if (!mylocale
->deterministic
)
1791 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED
),
1792 errmsg("nondeterministic collations are not supported for substring searches")));
1794 len1
= toast_raw_datum_size(arg1
);
1795 len2
= toast_raw_datum_size(arg2
);
1800 text
*targ1
= text_substring(arg1
, 1, len2
, false);
1801 text
*targ2
= DatumGetTextPP(arg2
);
1803 result
= (memcmp(VARDATA_ANY(targ1
), VARDATA_ANY(targ2
),
1804 VARSIZE_ANY_EXHDR(targ2
)) == 0);
1806 PG_FREE_IF_COPY(targ1
, 0);
1807 PG_FREE_IF_COPY(targ2
, 1);
1810 PG_RETURN_BOOL(result
);
1814 bttextcmp(PG_FUNCTION_ARGS
)
1816 text
*arg1
= PG_GETARG_TEXT_PP(0);
1817 text
*arg2
= PG_GETARG_TEXT_PP(1);
1820 result
= text_cmp(arg1
, arg2
, PG_GET_COLLATION());
1822 PG_FREE_IF_COPY(arg1
, 0);
1823 PG_FREE_IF_COPY(arg2
, 1);
1825 PG_RETURN_INT32(result
);
1829 bttextsortsupport(PG_FUNCTION_ARGS
)
1831 SortSupport ssup
= (SortSupport
) PG_GETARG_POINTER(0);
1832 Oid collid
= ssup
->ssup_collation
;
1833 MemoryContext oldcontext
;
1835 oldcontext
= MemoryContextSwitchTo(ssup
->ssup_cxt
);
1837 /* Use generic string SortSupport */
1838 varstr_sortsupport(ssup
, TEXTOID
, collid
);
1840 MemoryContextSwitchTo(oldcontext
);
1846 * Generic sortsupport interface for character type's operator classes.
1847 * Includes locale support, and support for BpChar semantics (i.e. removing
1848 * trailing spaces before comparison).
1850 * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
1851 * same representation. Callers that always use the C collation (e.g.
1852 * non-collatable type callers like bytea) may have NUL bytes in their strings;
1853 * this will not work with any other collation, though.
1856 varstr_sortsupport(SortSupport ssup
, Oid typid
, Oid collid
)
1858 bool abbreviate
= ssup
->abbreviate
;
1859 bool collate_c
= false;
1860 VarStringSortSupport
*sss
;
1863 check_collation_set(collid
);
1865 locale
= pg_newlocale_from_collation(collid
);
1868 * If possible, set ssup->comparator to a function which can be used to
1869 * directly compare two datums. If we can do this, we'll avoid the
1870 * overhead of a trip through the fmgr layer for every comparison, which
1871 * can be substantial.
1873 * Most typically, we'll set the comparator to varlenafastcmp_locale,
1874 * which uses strcoll() to perform comparisons. We use that for the
1875 * BpChar case too, but type NAME uses namefastcmp_locale. However, if
1876 * LC_COLLATE = C, we can make things quite a bit faster with
1877 * varstrfastcmp_c, bpcharfastcmp_c, or namefastcmp_c, all of which use
1878 * memcmp() rather than strcoll().
1880 if (locale
->collate_is_c
)
1882 if (typid
== BPCHAROID
)
1883 ssup
->comparator
= bpcharfastcmp_c
;
1884 else if (typid
== NAMEOID
)
1886 ssup
->comparator
= namefastcmp_c
;
1887 /* Not supporting abbreviation with type NAME, for now */
1891 ssup
->comparator
= varstrfastcmp_c
;
1898 * We use varlenafastcmp_locale except for type NAME.
1900 if (typid
== NAMEOID
)
1902 ssup
->comparator
= namefastcmp_locale
;
1903 /* Not supporting abbreviation with type NAME, for now */
1907 ssup
->comparator
= varlenafastcmp_locale
;
1910 * Unfortunately, it seems that abbreviation for non-C collations is
1911 * broken on many common platforms; see pg_strxfrm_enabled().
1913 * Even apart from the risk of broken locales, it's possible that
1914 * there are platforms where the use of abbreviated keys should be
1915 * disabled at compile time. Having only 4 byte datums could make
1916 * worst-case performance drastically more likely, for example.
1917 * Moreover, macOS's strxfrm() implementation is known to not
1918 * effectively concentrate a significant amount of entropy from the
1919 * original string in earlier transformed blobs. It's possible that
1920 * other supported platforms are similarly encumbered. So, if we ever
1921 * get past disabling this categorically, we may still want or need to
1922 * disable it for particular platforms.
1924 if (!pg_strxfrm_enabled(locale
))
1929 * If we're using abbreviated keys, or if we're using a locale-aware
1930 * comparison, we need to initialize a VarStringSortSupport object. Both
1931 * cases will make use of the temporary buffers we initialize here for
1932 * scratch space (and to detect requirement for BpChar semantics from
1933 * caller), and the abbreviation case requires additional state.
1935 if (abbreviate
|| !collate_c
)
1937 sss
= palloc(sizeof(VarStringSortSupport
));
1938 sss
->buf1
= palloc(TEXTBUFLEN
);
1939 sss
->buflen1
= TEXTBUFLEN
;
1940 sss
->buf2
= palloc(TEXTBUFLEN
);
1941 sss
->buflen2
= TEXTBUFLEN
;
1942 /* Start with invalid values */
1943 sss
->last_len1
= -1;
1944 sss
->last_len2
= -1;
1946 sss
->last_returned
= 0;
1950 sss
->locale
= locale
;
1953 * To avoid somehow confusing a strxfrm() blob and an original string,
1954 * constantly keep track of the variety of data that buf1 and buf2
1955 * currently contain.
1957 * Comparisons may be interleaved with conversion calls. Frequently,
1958 * conversions and comparisons are batched into two distinct phases,
1959 * but the correctness of caching cannot hinge upon this. For
1960 * comparison caching, buffer state is only trusted if cache_blob is
1961 * found set to false, whereas strxfrm() caching only trusts the state
1962 * when cache_blob is found set to true.
1964 * Arbitrarily initialize cache_blob to true.
1966 sss
->cache_blob
= true;
1967 sss
->collate_c
= collate_c
;
1969 ssup
->ssup_extra
= sss
;
1972 * If possible, plan to use the abbreviated keys optimization. The
1973 * core code may switch back to authoritative comparator should
1974 * abbreviation be aborted.
1978 sss
->prop_card
= 0.20;
1979 initHyperLogLog(&sss
->abbr_card
, 10);
1980 initHyperLogLog(&sss
->full_card
, 10);
1981 ssup
->abbrev_full_comparator
= ssup
->comparator
;
1982 ssup
->comparator
= ssup_datum_unsigned_cmp
;
1983 ssup
->abbrev_converter
= varstr_abbrev_convert
;
1984 ssup
->abbrev_abort
= varstr_abbrev_abort
;
1990 * sortsupport comparison func (for C locale case)
1993 varstrfastcmp_c(Datum x
, Datum y
, SortSupport ssup
)
1995 VarString
*arg1
= DatumGetVarStringPP(x
);
1996 VarString
*arg2
= DatumGetVarStringPP(y
);
2003 a1p
= VARDATA_ANY(arg1
);
2004 a2p
= VARDATA_ANY(arg2
);
2006 len1
= VARSIZE_ANY_EXHDR(arg1
);
2007 len2
= VARSIZE_ANY_EXHDR(arg2
);
2009 result
= memcmp(a1p
, a2p
, Min(len1
, len2
));
2010 if ((result
== 0) && (len1
!= len2
))
2011 result
= (len1
< len2
) ? -1 : 1;
2013 /* We can't afford to leak memory here. */
2014 if (PointerGetDatum(arg1
) != x
)
2016 if (PointerGetDatum(arg2
) != y
)
2023 * sortsupport comparison func (for BpChar C locale case)
2025 * BpChar outsources its sortsupport to this module. Specialization for the
2026 * varstr_sortsupport BpChar case, modeled on
2027 * internal_bpchar_pattern_compare().
2030 bpcharfastcmp_c(Datum x
, Datum y
, SortSupport ssup
)
2032 BpChar
*arg1
= DatumGetBpCharPP(x
);
2033 BpChar
*arg2
= DatumGetBpCharPP(y
);
2040 a1p
= VARDATA_ANY(arg1
);
2041 a2p
= VARDATA_ANY(arg2
);
2043 len1
= bpchartruelen(a1p
, VARSIZE_ANY_EXHDR(arg1
));
2044 len2
= bpchartruelen(a2p
, VARSIZE_ANY_EXHDR(arg2
));
2046 result
= memcmp(a1p
, a2p
, Min(len1
, len2
));
2047 if ((result
== 0) && (len1
!= len2
))
2048 result
= (len1
< len2
) ? -1 : 1;
2050 /* We can't afford to leak memory here. */
2051 if (PointerGetDatum(arg1
) != x
)
2053 if (PointerGetDatum(arg2
) != y
)
2060 * sortsupport comparison func (for NAME C locale case)
2063 namefastcmp_c(Datum x
, Datum y
, SortSupport ssup
)
2065 Name arg1
= DatumGetName(x
);
2066 Name arg2
= DatumGetName(y
);
2068 return strncmp(NameStr(*arg1
), NameStr(*arg2
), NAMEDATALEN
);
2072 * sortsupport comparison func (for locale case with all varlena types)
2075 varlenafastcmp_locale(Datum x
, Datum y
, SortSupport ssup
)
2077 VarString
*arg1
= DatumGetVarStringPP(x
);
2078 VarString
*arg2
= DatumGetVarStringPP(y
);
2085 a1p
= VARDATA_ANY(arg1
);
2086 a2p
= VARDATA_ANY(arg2
);
2088 len1
= VARSIZE_ANY_EXHDR(arg1
);
2089 len2
= VARSIZE_ANY_EXHDR(arg2
);
2091 result
= varstrfastcmp_locale(a1p
, len1
, a2p
, len2
, ssup
);
2093 /* We can't afford to leak memory here. */
2094 if (PointerGetDatum(arg1
) != x
)
2096 if (PointerGetDatum(arg2
) != y
)
2103 * sortsupport comparison func (for locale case with NAME type)
2106 namefastcmp_locale(Datum x
, Datum y
, SortSupport ssup
)
2108 Name arg1
= DatumGetName(x
);
2109 Name arg2
= DatumGetName(y
);
2111 return varstrfastcmp_locale(NameStr(*arg1
), strlen(NameStr(*arg1
)),
2112 NameStr(*arg2
), strlen(NameStr(*arg2
)),
2117 * sortsupport comparison func for locale cases
2120 varstrfastcmp_locale(char *a1p
, int len1
, char *a2p
, int len2
, SortSupport ssup
)
2122 VarStringSortSupport
*sss
= (VarStringSortSupport
*) ssup
->ssup_extra
;
2126 /* Fast pre-check for equality, as discussed in varstr_cmp() */
2127 if (len1
== len2
&& memcmp(a1p
, a2p
, len1
) == 0)
2130 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
2131 * last_len2. Existing contents of buffers might still be used by
2134 * It's fine to allow the comparison of BpChar padding bytes here,
2135 * even though that implies that the memcmp() will usually be
2136 * performed for BpChar callers (though multibyte characters could
2137 * still prevent that from occurring). The memcmp() is still very
2138 * cheap, and BpChar's funny semantics have us remove trailing spaces
2139 * (not limited to padding), so we need make no distinction between
2140 * padding space characters and "real" space characters.
2145 if (sss
->typid
== BPCHAROID
)
2147 /* Get true number of bytes, ignoring trailing spaces */
2148 len1
= bpchartruelen(a1p
, len1
);
2149 len2
= bpchartruelen(a2p
, len2
);
2152 if (len1
>= sss
->buflen1
)
2154 sss
->buflen1
= Max(len1
+ 1, Min(sss
->buflen1
* 2, MaxAllocSize
));
2155 sss
->buf1
= repalloc(sss
->buf1
, sss
->buflen1
);
2157 if (len2
>= sss
->buflen2
)
2159 sss
->buflen2
= Max(len2
+ 1, Min(sss
->buflen2
* 2, MaxAllocSize
));
2160 sss
->buf2
= repalloc(sss
->buf2
, sss
->buflen2
);
2164 * We're likely to be asked to compare the same strings repeatedly, and
2165 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
2166 * comparisons, even though in general there is no reason to think that
2167 * that will work out (every string datum may be unique). Caching does
2168 * not slow things down measurably when it doesn't work out, and can speed
2169 * things up by rather a lot when it does. In part, this is because the
2170 * memcmp() compares data from cachelines that are needed in L1 cache even
2171 * when the last comparison's result cannot be reused.
2174 if (len1
!= sss
->last_len1
|| memcmp(sss
->buf1
, a1p
, len1
) != 0)
2177 memcpy(sss
->buf1
, a1p
, len1
);
2178 sss
->buf1
[len1
] = '\0';
2179 sss
->last_len1
= len1
;
2183 * If we're comparing the same two strings as last time, we can return the
2184 * same answer without calling strcoll() again. This is more likely than
2185 * it seems (at least with moderate to low cardinality sets), because
2186 * quicksort compares the same pivot against many values.
2188 if (len2
!= sss
->last_len2
|| memcmp(sss
->buf2
, a2p
, len2
) != 0)
2190 memcpy(sss
->buf2
, a2p
, len2
);
2191 sss
->buf2
[len2
] = '\0';
2192 sss
->last_len2
= len2
;
2194 else if (arg1_match
&& !sss
->cache_blob
)
2196 /* Use result cached following last actual strcoll() call */
2197 return sss
->last_returned
;
2200 result
= pg_strcoll(sss
->buf1
, sss
->buf2
, sss
->locale
);
2202 /* Break tie if necessary. */
2203 if (result
== 0 && sss
->locale
->deterministic
)
2204 result
= strcmp(sss
->buf1
, sss
->buf2
);
2206 /* Cache result, perhaps saving an expensive strcoll() call next time */
2207 sss
->cache_blob
= false;
2208 sss
->last_returned
= result
;
2213 * Conversion routine for sortsupport. Converts original to abbreviated key
2214 * representation. Our encoding strategy is simple -- pack the first 8 bytes
2215 * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2216 * stored in reverse order), and treat it as an unsigned integer. When the "C"
2217 * locale is used, or in case of bytea, just memcpy() from original instead.
2220 varstr_abbrev_convert(Datum original
, SortSupport ssup
)
2222 const size_t max_prefix_bytes
= sizeof(Datum
);
2223 VarStringSortSupport
*sss
= (VarStringSortSupport
*) ssup
->ssup_extra
;
2224 VarString
*authoritative
= DatumGetVarStringPP(original
);
2225 char *authoritative_data
= VARDATA_ANY(authoritative
);
2233 pres
= (char *) &res
;
2234 /* memset(), so any non-overwritten bytes are NUL */
2235 memset(pres
, 0, max_prefix_bytes
);
2236 len
= VARSIZE_ANY_EXHDR(authoritative
);
2238 /* Get number of bytes, ignoring trailing spaces */
2239 if (sss
->typid
== BPCHAROID
)
2240 len
= bpchartruelen(authoritative_data
, len
);
2243 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2244 * abbreviate keys. The full comparator for the C locale is always
2245 * memcmp(). It would be incorrect to allow bytea callers (callers that
2246 * always force the C collation -- bytea isn't a collatable type, but this
2247 * approach is convenient) to use strxfrm(). This is because bytea
2248 * strings may contain NUL bytes. Besides, this should be faster, too.
2250 * More generally, it's okay that bytea callers can have NUL bytes in
2251 * strings because abbreviated cmp need not make a distinction between
2252 * terminating NUL bytes, and NUL bytes representing actual NULs in the
2253 * authoritative representation. Hopefully a comparison at or past one
2254 * abbreviated key's terminating NUL byte will resolve the comparison
2255 * without consulting the authoritative representation; specifically, some
2256 * later non-NUL byte in the longer string can resolve the comparison
2257 * against a subsequent terminating NUL in the shorter string. There will
2258 * usually be what is effectively a "length-wise" resolution there and
2261 * If that doesn't work out -- if all bytes in the longer string
2262 * positioned at or past the offset of the smaller string's (first)
2263 * terminating NUL are actually representative of NUL bytes in the
2264 * authoritative binary string (perhaps with some *terminating* NUL bytes
2265 * towards the end of the longer string iff it happens to still be small)
2266 * -- then an authoritative tie-breaker will happen, and do the right
2267 * thing: explicitly consider string length.
2270 memcpy(pres
, authoritative_data
, Min(len
, max_prefix_bytes
));
2276 * We're not using the C collation, so fall back on strxfrm or ICU
2280 /* By convention, we use buffer 1 to store and NUL-terminate */
2281 if (len
>= sss
->buflen1
)
2283 sss
->buflen1
= Max(len
+ 1, Min(sss
->buflen1
* 2, MaxAllocSize
));
2284 sss
->buf1
= repalloc(sss
->buf1
, sss
->buflen1
);
2287 /* Might be able to reuse strxfrm() blob from last call */
2288 if (sss
->last_len1
== len
&& sss
->cache_blob
&&
2289 memcmp(sss
->buf1
, authoritative_data
, len
) == 0)
2291 memcpy(pres
, sss
->buf2
, Min(max_prefix_bytes
, sss
->last_len2
));
2292 /* No change affecting cardinality, so no hashing required */
2296 memcpy(sss
->buf1
, authoritative_data
, len
);
2299 * pg_strxfrm() and pg_strxfrm_prefix expect NUL-terminated strings.
2301 sss
->buf1
[len
] = '\0';
2302 sss
->last_len1
= len
;
2304 if (pg_strxfrm_prefix_enabled(sss
->locale
))
2306 if (sss
->buflen2
< max_prefix_bytes
)
2308 sss
->buflen2
= Max(max_prefix_bytes
,
2309 Min(sss
->buflen2
* 2, MaxAllocSize
));
2310 sss
->buf2
= repalloc(sss
->buf2
, sss
->buflen2
);
2313 bsize
= pg_strxfrm_prefix(sss
->buf2
, sss
->buf1
,
2314 max_prefix_bytes
, sss
->locale
);
2315 sss
->last_len2
= bsize
;
2320 * Loop: Call pg_strxfrm(), possibly enlarge buffer, and try
2321 * again. The pg_strxfrm() function leaves the result buffer
2322 * content undefined if the result did not fit, so we need to
2323 * retry until everything fits, even though we only need the first
2324 * few bytes in the end.
2328 bsize
= pg_strxfrm(sss
->buf2
, sss
->buf1
, sss
->buflen2
,
2331 sss
->last_len2
= bsize
;
2332 if (bsize
< sss
->buflen2
)
2336 * Grow buffer and retry.
2338 sss
->buflen2
= Max(bsize
+ 1,
2339 Min(sss
->buflen2
* 2, MaxAllocSize
));
2340 sss
->buf2
= repalloc(sss
->buf2
, sss
->buflen2
);
2345 * Every Datum byte is always compared. This is safe because the
2346 * strxfrm() blob is itself NUL terminated, leaving no danger of
2347 * misinterpreting any NUL bytes not intended to be interpreted as
2348 * logically representing termination.
2350 * (Actually, even if there were NUL bytes in the blob it would be
2351 * okay. See remarks on bytea case above.)
2353 memcpy(pres
, sss
->buf2
, Min(max_prefix_bytes
, bsize
));
2357 * Maintain approximate cardinality of both abbreviated keys and original,
2358 * authoritative keys using HyperLogLog. Used as cheap insurance against
2359 * the worst case, where we do many string transformations for no saving
2360 * in full strcoll()-based comparisons. These statistics are used by
2361 * varstr_abbrev_abort().
2363 * First, Hash key proper, or a significant fraction of it. Mix in length
2364 * in order to compensate for cases where differences are past
2365 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2367 hash
= DatumGetUInt32(hash_any((unsigned char *) authoritative_data
,
2368 Min(len
, PG_CACHE_LINE_SIZE
)));
2370 if (len
> PG_CACHE_LINE_SIZE
)
2371 hash
^= DatumGetUInt32(hash_uint32((uint32
) len
));
2373 addHyperLogLog(&sss
->full_card
, hash
);
2375 /* Hash abbreviated key */
2376 #if SIZEOF_DATUM == 8
2381 lohalf
= (uint32
) res
;
2382 hihalf
= (uint32
) (res
>> 32);
2383 hash
= DatumGetUInt32(hash_uint32(lohalf
^ hihalf
));
2385 #else /* SIZEOF_DATUM != 8 */
2386 hash
= DatumGetUInt32(hash_uint32((uint32
) res
));
2389 addHyperLogLog(&sss
->abbr_card
, hash
);
2391 /* Cache result, perhaps saving an expensive strxfrm() call next time */
2392 sss
->cache_blob
= true;
2396 * Byteswap on little-endian machines.
2398 * This is needed so that ssup_datum_unsigned_cmp() (an unsigned integer
2399 * 3-way comparator) works correctly on all platforms. If we didn't do
2400 * this, the comparator would have to call memcmp() with a pair of
2401 * pointers to the first byte of each abbreviated key, which is slower.
2403 res
= DatumBigEndianToNative(res
);
2405 /* Don't leak memory here */
2406 if (PointerGetDatum(authoritative
) != original
)
2407 pfree(authoritative
);
2413 * Callback for estimating effectiveness of abbreviated key optimization, using
2414 * heuristic rules. Returns value indicating if the abbreviation optimization
2415 * should be aborted, based on its projected effectiveness.
2418 varstr_abbrev_abort(int memtupcount
, SortSupport ssup
)
2420 VarStringSortSupport
*sss
= (VarStringSortSupport
*) ssup
->ssup_extra
;
2421 double abbrev_distinct
,
2424 Assert(ssup
->abbreviate
);
2426 /* Have a little patience */
2427 if (memtupcount
< 100)
2430 abbrev_distinct
= estimateHyperLogLog(&sss
->abbr_card
);
2431 key_distinct
= estimateHyperLogLog(&sss
->full_card
);
2434 * Clamp cardinality estimates to at least one distinct value. While
2435 * NULLs are generally disregarded, if only NULL values were seen so far,
2436 * that might misrepresent costs if we failed to clamp.
2438 if (abbrev_distinct
<= 1.0)
2439 abbrev_distinct
= 1.0;
2441 if (key_distinct
<= 1.0)
2445 * In the worst case all abbreviated keys are identical, while at the same
2446 * time there are differences within full key strings not captured in
2451 double norm_abbrev_card
= abbrev_distinct
/ (double) memtupcount
;
2453 elog(LOG
, "varstr_abbrev: abbrev_distinct after %d: %f "
2454 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2455 memtupcount
, abbrev_distinct
, key_distinct
, norm_abbrev_card
,
2460 * If the number of distinct abbreviated keys approximately matches the
2461 * number of distinct authoritative original keys, that's reason enough to
2462 * proceed. We can win even with a very low cardinality set if most
2463 * tie-breakers only memcmp(). This is by far the most important
2466 * While comparisons that are resolved at the abbreviated key level are
2467 * considerably cheaper than tie-breakers resolved with memcmp(), both of
2468 * those two outcomes are so much cheaper than a full strcoll() once
2469 * sorting is underway that it doesn't seem worth it to weigh abbreviated
2470 * cardinality against the overall size of the set in order to more
2471 * accurately model costs. Assume that an abbreviated comparison, and an
2472 * abbreviated comparison with a cheap memcmp()-based authoritative
2473 * resolution are equivalent.
2475 if (abbrev_distinct
> key_distinct
* sss
->prop_card
)
2478 * When we have exceeded 10,000 tuples, decay required cardinality
2479 * aggressively for next call.
2481 * This is useful because the number of comparisons required on
2482 * average increases at a linearithmic rate, and at roughly 10,000
2483 * tuples that factor will start to dominate over the linear costs of
2484 * string transformation (this is a conservative estimate). The decay
2485 * rate is chosen to be a little less aggressive than halving -- which
2486 * (since we're called at points at which memtupcount has doubled)
2487 * would never see the cost model actually abort past the first call
2488 * following a decay. This decay rate is mostly a precaution against
2489 * a sudden, violent swing in how well abbreviated cardinality tracks
2490 * full key cardinality. The decay also serves to prevent a marginal
2491 * case from being aborted too late, when too much has already been
2492 * invested in string transformation.
2494 * It's possible for sets of several million distinct strings with
2495 * mere tens of thousands of distinct abbreviated keys to still
2496 * benefit very significantly. This will generally occur provided
2497 * each abbreviated key is a proxy for a roughly uniform number of the
2498 * set's full keys. If it isn't so, we hope to catch that early and
2499 * abort. If it isn't caught early, by the time the problem is
2500 * apparent it's probably not worth aborting.
2502 if (memtupcount
> 10000)
2503 sss
->prop_card
*= 0.65;
2509 * Abort abbreviation strategy.
2511 * The worst case, where all abbreviated keys are identical while all
2512 * original strings differ will typically only see a regression of about
2513 * 10% in execution time for small to medium sized lists of strings.
2514 * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2515 * often expect very large improvements, particularly with sets of strings
2516 * of moderately high to high abbreviated cardinality. There is little to
2517 * lose but much to gain, which our strategy reflects.
2520 elog(LOG
, "varstr_abbrev: aborted abbreviation at %d "
2521 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2522 memtupcount
, abbrev_distinct
, key_distinct
, sss
->prop_card
);
2528 * Generic equalimage support function for character type's operator classes.
2529 * Disables the use of deduplication with nondeterministic collations.
2532 btvarstrequalimage(PG_FUNCTION_ARGS
)
2534 /* Oid opcintype = PG_GETARG_OID(0); */
2535 Oid collid
= PG_GET_COLLATION();
2538 check_collation_set(collid
);
2540 locale
= pg_newlocale_from_collation(collid
);
2542 PG_RETURN_BOOL(locale
->deterministic
);
2546 text_larger(PG_FUNCTION_ARGS
)
2548 text
*arg1
= PG_GETARG_TEXT_PP(0);
2549 text
*arg2
= PG_GETARG_TEXT_PP(1);
2552 result
= ((text_cmp(arg1
, arg2
, PG_GET_COLLATION()) > 0) ? arg1
: arg2
);
2554 PG_RETURN_TEXT_P(result
);
2558 text_smaller(PG_FUNCTION_ARGS
)
2560 text
*arg1
= PG_GETARG_TEXT_PP(0);
2561 text
*arg2
= PG_GETARG_TEXT_PP(1);
2564 result
= ((text_cmp(arg1
, arg2
, PG_GET_COLLATION()) < 0) ? arg1
: arg2
);
2566 PG_RETURN_TEXT_P(result
);
2571 * Cross-type comparison functions for types text and name.
2575 nameeqtext(PG_FUNCTION_ARGS
)
2577 Name arg1
= PG_GETARG_NAME(0);
2578 text
*arg2
= PG_GETARG_TEXT_PP(1);
2579 size_t len1
= strlen(NameStr(*arg1
));
2580 size_t len2
= VARSIZE_ANY_EXHDR(arg2
);
2581 Oid collid
= PG_GET_COLLATION();
2584 check_collation_set(collid
);
2586 if (collid
== C_COLLATION_OID
)
2587 result
= (len1
== len2
&&
2588 memcmp(NameStr(*arg1
), VARDATA_ANY(arg2
), len1
) == 0);
2590 result
= (varstr_cmp(NameStr(*arg1
), len1
,
2591 VARDATA_ANY(arg2
), len2
,
2594 PG_FREE_IF_COPY(arg2
, 1);
2596 PG_RETURN_BOOL(result
);
2600 texteqname(PG_FUNCTION_ARGS
)
2602 text
*arg1
= PG_GETARG_TEXT_PP(0);
2603 Name arg2
= PG_GETARG_NAME(1);
2604 size_t len1
= VARSIZE_ANY_EXHDR(arg1
);
2605 size_t len2
= strlen(NameStr(*arg2
));
2606 Oid collid
= PG_GET_COLLATION();
2609 check_collation_set(collid
);
2611 if (collid
== C_COLLATION_OID
)
2612 result
= (len1
== len2
&&
2613 memcmp(VARDATA_ANY(arg1
), NameStr(*arg2
), len1
) == 0);
2615 result
= (varstr_cmp(VARDATA_ANY(arg1
), len1
,
2616 NameStr(*arg2
), len2
,
2619 PG_FREE_IF_COPY(arg1
, 0);
2621 PG_RETURN_BOOL(result
);
2625 namenetext(PG_FUNCTION_ARGS
)
2627 Name arg1
= PG_GETARG_NAME(0);
2628 text
*arg2
= PG_GETARG_TEXT_PP(1);
2629 size_t len1
= strlen(NameStr(*arg1
));
2630 size_t len2
= VARSIZE_ANY_EXHDR(arg2
);
2631 Oid collid
= PG_GET_COLLATION();
2634 check_collation_set(collid
);
2636 if (collid
== C_COLLATION_OID
)
2637 result
= !(len1
== len2
&&
2638 memcmp(NameStr(*arg1
), VARDATA_ANY(arg2
), len1
) == 0);
2640 result
= !(varstr_cmp(NameStr(*arg1
), len1
,
2641 VARDATA_ANY(arg2
), len2
,
2644 PG_FREE_IF_COPY(arg2
, 1);
2646 PG_RETURN_BOOL(result
);
2650 textnename(PG_FUNCTION_ARGS
)
2652 text
*arg1
= PG_GETARG_TEXT_PP(0);
2653 Name arg2
= PG_GETARG_NAME(1);
2654 size_t len1
= VARSIZE_ANY_EXHDR(arg1
);
2655 size_t len2
= strlen(NameStr(*arg2
));
2656 Oid collid
= PG_GET_COLLATION();
2659 check_collation_set(collid
);
2661 if (collid
== C_COLLATION_OID
)
2662 result
= !(len1
== len2
&&
2663 memcmp(VARDATA_ANY(arg1
), NameStr(*arg2
), len1
) == 0);
2665 result
= !(varstr_cmp(VARDATA_ANY(arg1
), len1
,
2666 NameStr(*arg2
), len2
,
2669 PG_FREE_IF_COPY(arg1
, 0);
2671 PG_RETURN_BOOL(result
);
2675 btnametextcmp(PG_FUNCTION_ARGS
)
2677 Name arg1
= PG_GETARG_NAME(0);
2678 text
*arg2
= PG_GETARG_TEXT_PP(1);
2681 result
= varstr_cmp(NameStr(*arg1
), strlen(NameStr(*arg1
)),
2682 VARDATA_ANY(arg2
), VARSIZE_ANY_EXHDR(arg2
),
2683 PG_GET_COLLATION());
2685 PG_FREE_IF_COPY(arg2
, 1);
2687 PG_RETURN_INT32(result
);
2691 bttextnamecmp(PG_FUNCTION_ARGS
)
2693 text
*arg1
= PG_GETARG_TEXT_PP(0);
2694 Name arg2
= PG_GETARG_NAME(1);
2697 result
= varstr_cmp(VARDATA_ANY(arg1
), VARSIZE_ANY_EXHDR(arg1
),
2698 NameStr(*arg2
), strlen(NameStr(*arg2
)),
2699 PG_GET_COLLATION());
2701 PG_FREE_IF_COPY(arg1
, 0);
2703 PG_RETURN_INT32(result
);
/*
 * CmpCall - invoke the given comparison function on the current call's two
 * arguments, passing along the current collation, and yield its result as
 * an int32.  Only usable inside a function declared with PG_FUNCTION_ARGS.
 */
#define CmpCall(cmpfunc) \
	DatumGetInt32(DirectFunctionCall2Coll(cmpfunc, \
										  PG_GET_COLLATION(), \
										  PG_GETARG_DATUM(0), \
										  PG_GETARG_DATUM(1)))
2713 namelttext(PG_FUNCTION_ARGS
)
2715 PG_RETURN_BOOL(CmpCall(btnametextcmp
) < 0);
2719 nameletext(PG_FUNCTION_ARGS
)
2721 PG_RETURN_BOOL(CmpCall(btnametextcmp
) <= 0);
2725 namegttext(PG_FUNCTION_ARGS
)
2727 PG_RETURN_BOOL(CmpCall(btnametextcmp
) > 0);
2731 namegetext(PG_FUNCTION_ARGS
)
2733 PG_RETURN_BOOL(CmpCall(btnametextcmp
) >= 0);
2737 textltname(PG_FUNCTION_ARGS
)
2739 PG_RETURN_BOOL(CmpCall(bttextnamecmp
) < 0);
2743 textlename(PG_FUNCTION_ARGS
)
2745 PG_RETURN_BOOL(CmpCall(bttextnamecmp
) <= 0);
2749 textgtname(PG_FUNCTION_ARGS
)
2751 PG_RETURN_BOOL(CmpCall(bttextnamecmp
) > 0);
2755 textgename(PG_FUNCTION_ARGS
)
2757 PG_RETURN_BOOL(CmpCall(bttextnamecmp
) >= 0);
2764 * The following operators support character-by-character comparison
2765 * of text datums, to allow building indexes suitable for LIKE clauses.
2766 * Note that the regular texteq/textne comparison operators, and regular
2767 * support functions 1 and 2 with "C" collation are assumed to be
2768 * compatible with these!
2772 internal_text_pattern_compare(text
*arg1
, text
*arg2
)
2778 len1
= VARSIZE_ANY_EXHDR(arg1
);
2779 len2
= VARSIZE_ANY_EXHDR(arg2
);
2781 result
= memcmp(VARDATA_ANY(arg1
), VARDATA_ANY(arg2
), Min(len1
, len2
));
2784 else if (len1
< len2
)
2786 else if (len1
> len2
)
2794 text_pattern_lt(PG_FUNCTION_ARGS
)
2796 text
*arg1
= PG_GETARG_TEXT_PP(0);
2797 text
*arg2
= PG_GETARG_TEXT_PP(1);
2800 result
= internal_text_pattern_compare(arg1
, arg2
);
2802 PG_FREE_IF_COPY(arg1
, 0);
2803 PG_FREE_IF_COPY(arg2
, 1);
2805 PG_RETURN_BOOL(result
< 0);
2810 text_pattern_le(PG_FUNCTION_ARGS
)
2812 text
*arg1
= PG_GETARG_TEXT_PP(0);
2813 text
*arg2
= PG_GETARG_TEXT_PP(1);
2816 result
= internal_text_pattern_compare(arg1
, arg2
);
2818 PG_FREE_IF_COPY(arg1
, 0);
2819 PG_FREE_IF_COPY(arg2
, 1);
2821 PG_RETURN_BOOL(result
<= 0);
2826 text_pattern_ge(PG_FUNCTION_ARGS
)
2828 text
*arg1
= PG_GETARG_TEXT_PP(0);
2829 text
*arg2
= PG_GETARG_TEXT_PP(1);
2832 result
= internal_text_pattern_compare(arg1
, arg2
);
2834 PG_FREE_IF_COPY(arg1
, 0);
2835 PG_FREE_IF_COPY(arg2
, 1);
2837 PG_RETURN_BOOL(result
>= 0);
2842 text_pattern_gt(PG_FUNCTION_ARGS
)
2844 text
*arg1
= PG_GETARG_TEXT_PP(0);
2845 text
*arg2
= PG_GETARG_TEXT_PP(1);
2848 result
= internal_text_pattern_compare(arg1
, arg2
);
2850 PG_FREE_IF_COPY(arg1
, 0);
2851 PG_FREE_IF_COPY(arg2
, 1);
2853 PG_RETURN_BOOL(result
> 0);
2858 bttext_pattern_cmp(PG_FUNCTION_ARGS
)
2860 text
*arg1
= PG_GETARG_TEXT_PP(0);
2861 text
*arg2
= PG_GETARG_TEXT_PP(1);
2864 result
= internal_text_pattern_compare(arg1
, arg2
);
2866 PG_FREE_IF_COPY(arg1
, 0);
2867 PG_FREE_IF_COPY(arg2
, 1);
2869 PG_RETURN_INT32(result
);
2874 bttext_pattern_sortsupport(PG_FUNCTION_ARGS
)
2876 SortSupport ssup
= (SortSupport
) PG_GETARG_POINTER(0);
2877 MemoryContext oldcontext
;
2879 oldcontext
= MemoryContextSwitchTo(ssup
->ssup_cxt
);
2881 /* Use generic string SortSupport, forcing "C" collation */
2882 varstr_sortsupport(ssup
, TEXTOID
, C_COLLATION_OID
);
2884 MemoryContextSwitchTo(oldcontext
);
2890 /*-------------------------------------------------------------
2893 * get the number of bytes contained in an instance of type 'bytea'
2894 *-------------------------------------------------------------
2897 byteaoctetlen(PG_FUNCTION_ARGS
)
2899 Datum str
= PG_GETARG_DATUM(0);
2901 /* We need not detoast the input at all */
2902 PG_RETURN_INT32(toast_raw_datum_size(str
) - VARHDRSZ
);
2907 * takes two bytea* and returns a bytea* that is the concatenation of
2910 * Cloned from textcat and modified as required.
2913 byteacat(PG_FUNCTION_ARGS
)
2915 bytea
*t1
= PG_GETARG_BYTEA_PP(0);
2916 bytea
*t2
= PG_GETARG_BYTEA_PP(1);
2918 PG_RETURN_BYTEA_P(bytea_catenate(t1
, t2
));
2923 * Guts of byteacat(), broken out so it can be used by other functions
2925 * Arguments can be in short-header form, but not compressed or out-of-line
2928 bytea_catenate(bytea
*t1
, bytea
*t2
)
2936 len1
= VARSIZE_ANY_EXHDR(t1
);
2937 len2
= VARSIZE_ANY_EXHDR(t2
);
2939 /* paranoia ... probably should throw error instead? */
2945 len
= len1
+ len2
+ VARHDRSZ
;
2946 result
= (bytea
*) palloc(len
);
2948 /* Set size of result string... */
2949 SET_VARSIZE(result
, len
);
2951 /* Fill data field of result string... */
2952 ptr
= VARDATA(result
);
2954 memcpy(ptr
, VARDATA_ANY(t1
), len1
);
2956 memcpy(ptr
+ len1
, VARDATA_ANY(t2
), len2
);
/* Build a bytea from a C string by running it through byteain(). */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
2966 * Return a substring starting at the specified position.
2967 * Cloned from text_substr and modified as required.
2971 * - starting position (is one-based)
2972 * - string length (optional)
2974 * If the starting position is zero or less, then return from the start of the string
2975 * adjusting the length to be consistent with the "negative start" per SQL.
2976 * If the length is less than zero, an ERROR is thrown. If no third argument
2977 * (length) is provided, the length to the end of the string is assumed.
2980 bytea_substr(PG_FUNCTION_ARGS
)
2982 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2989 * bytea_substr_no_len -
2990 * Wrapper to avoid opr_sanity failure due to
2991 * one function accepting a different number of args.
2994 bytea_substr_no_len(PG_FUNCTION_ARGS
)
2996 PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
3003 bytea_substring(Datum str
,
3006 bool length_not_specified
)
3008 int32 S1
; /* adjusted start position */
3009 int32 L1
; /* adjusted substring length */
3010 int32 E
; /* end position */
3013 * The logic here should generally match text_substring().
3017 if (length_not_specified
)
3020 * Not passed a length - DatumGetByteaPSlice() grabs everything to the
3021 * end of the string if we pass it a negative value for length.
3027 /* SQL99 says to throw an error for E < S, i.e., negative length */
3029 (errcode(ERRCODE_SUBSTRING_ERROR
),
3030 errmsg("negative substring length not allowed")));
3031 L1
= -1; /* silence stupider compilers */
3033 else if (pg_add_s32_overflow(S
, L
, &E
))
3036 * L could be large enough for S + L to overflow, in which case the
3037 * substring must run to end of string.
3044 * A zero or negative value for the end position can happen if the
3045 * start was negative or one. SQL99 says to return a zero-length
3049 return PG_STR_GET_BYTEA("");
3055 * If the start position is past the end of the string, SQL99 says to
3056 * return a zero-length string -- DatumGetByteaPSlice() will do that for
3057 * us. We need only convert S1 to zero-based starting position.
3059 return DatumGetByteaPSlice(str
, S1
- 1, L1
);
3064 * Replace specified substring of first string with second
3066 * The SQL standard defines OVERLAY() in terms of substring and concatenation.
3067 * This code is a direct implementation of what the standard says.
3070 byteaoverlay(PG_FUNCTION_ARGS
)
3072 bytea
*t1
= PG_GETARG_BYTEA_PP(0);
3073 bytea
*t2
= PG_GETARG_BYTEA_PP(1);
3074 int sp
= PG_GETARG_INT32(2); /* substring start position */
3075 int sl
= PG_GETARG_INT32(3); /* substring length */
3077 PG_RETURN_BYTEA_P(bytea_overlay(t1
, t2
, sp
, sl
));
3081 byteaoverlay_no_len(PG_FUNCTION_ARGS
)
3083 bytea
*t1
= PG_GETARG_BYTEA_PP(0);
3084 bytea
*t2
= PG_GETARG_BYTEA_PP(1);
3085 int sp
= PG_GETARG_INT32(2); /* substring start position */
3088 sl
= VARSIZE_ANY_EXHDR(t2
); /* defaults to length(t2) */
3089 PG_RETURN_BYTEA_P(bytea_overlay(t1
, t2
, sp
, sl
));
3093 bytea_overlay(bytea
*t1
, bytea
*t2
, int sp
, int sl
)
3101 * Check for possible integer-overflow cases. For negative sp, throw a
3102 * "substring length" error because that's what should be expected
3103 * according to the spec's definition of OVERLAY().
3107 (errcode(ERRCODE_SUBSTRING_ERROR
),
3108 errmsg("negative substring length not allowed")));
3109 if (pg_add_s32_overflow(sp
, sl
, &sp_pl_sl
))
3111 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE
),
3112 errmsg("integer out of range")));
3114 s1
= bytea_substring(PointerGetDatum(t1
), 1, sp
- 1, false);
3115 s2
= bytea_substring(PointerGetDatum(t1
), sp_pl_sl
, -1, true);
3116 result
= bytea_catenate(s1
, t2
);
3117 result
= bytea_catenate(result
, s2
);
3126 bytea_bit_count(PG_FUNCTION_ARGS
)
3128 bytea
*t1
= PG_GETARG_BYTEA_PP(0);
3130 PG_RETURN_INT64(pg_popcount(VARDATA_ANY(t1
), VARSIZE_ANY_EXHDR(t1
)));
3135 * Return the position of the specified substring.
3136 * Implements the SQL POSITION() function.
3137 * Cloned from textpos and modified as required.
3140 byteapos(PG_FUNCTION_ARGS
)
3142 bytea
*t1
= PG_GETARG_BYTEA_PP(0);
3143 bytea
*t2
= PG_GETARG_BYTEA_PP(1);
3152 len1
= VARSIZE_ANY_EXHDR(t1
);
3153 len2
= VARSIZE_ANY_EXHDR(t2
);
3156 PG_RETURN_INT32(1); /* result for empty pattern */
3158 p1
= VARDATA_ANY(t1
);
3159 p2
= VARDATA_ANY(t2
);
3163 for (p
= 0; p
<= px
; p
++)
3165 if ((*p2
== *p1
) && (memcmp(p1
, p2
, len2
) == 0))
3173 PG_RETURN_INT32(pos
);
3176 /*-------------------------------------------------------------
3179 * this routine treats "bytea" as an array of bytes.
3180 * It returns the Nth byte (a number between 0 and 255).
3181 *-------------------------------------------------------------
3184 byteaGetByte(PG_FUNCTION_ARGS
)
3186 bytea
*v
= PG_GETARG_BYTEA_PP(0);
3187 int32 n
= PG_GETARG_INT32(1);
3191 len
= VARSIZE_ANY_EXHDR(v
);
3193 if (n
< 0 || n
>= len
)
3195 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR
),
3196 errmsg("index %d out of valid range, 0..%d",
3199 byte
= ((unsigned char *) VARDATA_ANY(v
))[n
];
3201 PG_RETURN_INT32(byte
);
3204 /*-------------------------------------------------------------
3207 * This routine treats a "bytea" type like an array of bits.
3208 * It returns the value of the Nth bit (0 or 1).
3210 *-------------------------------------------------------------
3213 byteaGetBit(PG_FUNCTION_ARGS
)
3215 bytea
*v
= PG_GETARG_BYTEA_PP(0);
3216 int64 n
= PG_GETARG_INT64(1);
3222 len
= VARSIZE_ANY_EXHDR(v
);
3224 if (n
< 0 || n
>= (int64
) len
* 8)
3226 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR
),
3227 errmsg("index %lld out of valid range, 0..%lld",
3228 (long long) n
, (long long) len
* 8 - 1)));
3230 /* n/8 is now known < len, so safe to cast to int */
3231 byteNo
= (int) (n
/ 8);
3232 bitNo
= (int) (n
% 8);
3234 byte
= ((unsigned char *) VARDATA_ANY(v
))[byteNo
];
3236 if (byte
& (1 << bitNo
))
3242 /*-------------------------------------------------------------
3245 * Given an instance of type 'bytea' creates a new one with
3246 * the Nth byte set to the given value.
3248 *-------------------------------------------------------------
3251 byteaSetByte(PG_FUNCTION_ARGS
)
3253 bytea
*res
= PG_GETARG_BYTEA_P_COPY(0);
3254 int32 n
= PG_GETARG_INT32(1);
3255 int32 newByte
= PG_GETARG_INT32(2);
3258 len
= VARSIZE(res
) - VARHDRSZ
;
3260 if (n
< 0 || n
>= len
)
3262 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR
),
3263 errmsg("index %d out of valid range, 0..%d",
3269 ((unsigned char *) VARDATA(res
))[n
] = newByte
;
3271 PG_RETURN_BYTEA_P(res
);
3274 /*-------------------------------------------------------------
3277 * Given an instance of type 'bytea' creates a new one with
3278 * the Nth bit set to the given value.
3280 *-------------------------------------------------------------
3283 byteaSetBit(PG_FUNCTION_ARGS
)
3285 bytea
*res
= PG_GETARG_BYTEA_P_COPY(0);
3286 int64 n
= PG_GETARG_INT64(1);
3287 int32 newBit
= PG_GETARG_INT32(2);
3294 len
= VARSIZE(res
) - VARHDRSZ
;
3296 if (n
< 0 || n
>= (int64
) len
* 8)
3298 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR
),
3299 errmsg("index %lld out of valid range, 0..%lld",
3300 (long long) n
, (long long) len
* 8 - 1)));
3302 /* n/8 is now known < len, so safe to cast to int */
3303 byteNo
= (int) (n
/ 8);
3304 bitNo
= (int) (n
% 8);
3309 if (newBit
!= 0 && newBit
!= 1)
3311 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
3312 errmsg("new bit must be 0 or 1")));
3317 oldByte
= ((unsigned char *) VARDATA(res
))[byteNo
];
3320 newByte
= oldByte
& (~(1 << bitNo
));
3322 newByte
= oldByte
| (1 << bitNo
);
3324 ((unsigned char *) VARDATA(res
))[byteNo
] = newByte
;
3326 PG_RETURN_BYTEA_P(res
);
3331 * Converts a text type to a Name type.
3334 text_name(PG_FUNCTION_ARGS
)
3336 text
*s
= PG_GETARG_TEXT_PP(0);
3340 len
= VARSIZE_ANY_EXHDR(s
);
3342 /* Truncate oversize input */
3343 if (len
>= NAMEDATALEN
)
3344 len
= pg_mbcliplen(VARDATA_ANY(s
), len
, NAMEDATALEN
- 1);
3346 /* We use palloc0 here to ensure result is zero-padded */
3347 result
= (Name
) palloc0(NAMEDATALEN
);
3348 memcpy(NameStr(*result
), VARDATA_ANY(s
), len
);
3350 PG_RETURN_NAME(result
);
3354 * Converts a Name type to a text type.
3357 name_text(PG_FUNCTION_ARGS
)
3359 Name s
= PG_GETARG_NAME(0);
3361 PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s
)));
3366 * textToQualifiedNameList - convert a text object to list of names
3368 * This implements the input parsing needed by nextval() and other
3369 * functions that take a text parameter representing a qualified name.
3370 * We split the name at dots, downcase if not double-quoted, and
3371 * truncate names if they're too long.
3374 textToQualifiedNameList(text
*textval
)
3381 /* Convert to C string (handles possible detoasting). */
3382 /* Note we rely on being able to modify rawname below. */
3383 rawname
= text_to_cstring(textval
);
3385 if (!SplitIdentifierString(rawname
, '.', &namelist
))
3387 (errcode(ERRCODE_INVALID_NAME
),
3388 errmsg("invalid name syntax")));
3390 if (namelist
== NIL
)
3392 (errcode(ERRCODE_INVALID_NAME
),
3393 errmsg("invalid name syntax")));
3395 foreach(l
, namelist
)
3397 char *curname
= (char *) lfirst(l
);
3399 result
= lappend(result
, makeString(pstrdup(curname
)));
3403 list_free(namelist
);
3409 * SplitIdentifierString --- parse a string containing identifiers
3411 * This is the guts of textToQualifiedNameList, and is exported for use in
3412 * other situations such as parsing GUC variables. In the GUC case, it's
3413 * important to avoid memory leaks, so the API is designed to minimize the
3414 * amount of stuff that needs to be allocated and freed.
3417 * rawstring: the input string; must be overwritable! On return, it's
3418 * been modified to contain the separated identifiers.
3419 * separator: the separator punctuation expected between identifiers
3420 * (typically '.' or ','). Whitespace may also appear around
3423 * namelist: filled with a palloc'd list of pointers to identifiers within
3424 * rawstring. Caller should list_free() this even on error return.
3426 * Returns true if okay, false if there is a syntax error in the string.
3428 * Note that an empty string is considered okay here, though not in
3429 * textToQualifiedNameList.
3432 SplitIdentifierString(char *rawstring
, char separator
,
3435 char *nextp
= rawstring
;
3440 while (scanner_isspace(*nextp
))
3441 nextp
++; /* skip leading whitespace */
3444 return true; /* allow empty string */
3446 /* At the top of the loop, we are at start of a new identifier. */
3454 /* Quoted name --- collapse quote-quote pairs, no downcasing */
3455 curname
= nextp
+ 1;
3458 endp
= strchr(nextp
+ 1, '"');
3460 return false; /* mismatched quotes */
3462 break; /* found end of quoted name */
3463 /* Collapse adjacent quotes into one quote, and look again */
3464 memmove(endp
, endp
+ 1, strlen(endp
));
3467 /* endp now points at the terminating quote */
3472 /* Unquoted name --- extends to separator or whitespace */
3477 while (*nextp
&& *nextp
!= separator
&&
3478 !scanner_isspace(*nextp
))
3481 if (curname
== nextp
)
3482 return false; /* empty unquoted name not allowed */
3485 * Downcase the identifier, using same code as main lexer does.
3487 * XXX because we want to overwrite the input in-place, we cannot
3488 * support a downcasing transformation that increases the string
3489 * length. This is not a problem given the current implementation
3490 * of downcase_truncate_identifier, but we'll probably have to do
3491 * something about this someday.
3493 len
= endp
- curname
;
3494 downname
= downcase_truncate_identifier(curname
, len
, false);
3495 Assert(strlen(downname
) <= len
);
3496 strncpy(curname
, downname
, len
); /* strncpy is required here */
3500 while (scanner_isspace(*nextp
))
3501 nextp
++; /* skip trailing whitespace */
3503 if (*nextp
== separator
)
3506 while (scanner_isspace(*nextp
))
3507 nextp
++; /* skip leading whitespace for next */
3508 /* we expect another name, so done remains false */
3510 else if (*nextp
== '\0')
3513 return false; /* invalid syntax */
3515 /* Now safe to overwrite separator with a null */
3518 /* Truncate name if it's overlength */
3519 truncate_identifier(curname
, strlen(curname
), false);
3522 * Finished isolating current name --- add it to list
3524 *namelist
= lappend(*namelist
, curname
);
3526 /* Loop back if we didn't reach end of string */
3534 * SplitDirectoriesString --- parse a string containing file/directory names
3536 * This works fine on file names too; the function name is historical.
3538 * This is similar to SplitIdentifierString, except that the parsing
3539 * rules are meant to handle pathnames instead of identifiers: there is
3540 * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3541 * and we apply canonicalize_path() to each extracted string. Because of the
3542 * last, the returned strings are separately palloc'd rather than being
3543 * pointers into rawstring --- but we still scribble on rawstring.
3546 * rawstring: the input string; must be modifiable!
3547 * separator: the separator punctuation expected between directories
3548 * (typically ',' or ';'). Whitespace may also appear around
3551 * namelist: filled with a palloc'd list of directory names.
3552 * Caller should list_free_deep() this even on error return.
3554 * Returns true if okay, false if there is a syntax error in the string.
3556 * Note that an empty string is considered okay here.
3559 SplitDirectoriesString(char *rawstring
, char separator
,
3562 char *nextp
= rawstring
;
3567 while (scanner_isspace(*nextp
))
3568 nextp
++; /* skip leading whitespace */
3571 return true; /* allow empty string */
3573 /* At the top of the loop, we are at start of a new directory. */
3581 /* Quoted name --- collapse quote-quote pairs */
3582 curname
= nextp
+ 1;
3585 endp
= strchr(nextp
+ 1, '"');
3587 return false; /* mismatched quotes */
3589 break; /* found end of quoted name */
3590 /* Collapse adjacent quotes into one quote, and look again */
3591 memmove(endp
, endp
+ 1, strlen(endp
));
3594 /* endp now points at the terminating quote */
3599 /* Unquoted name --- extends to separator or end of string */
3600 curname
= endp
= nextp
;
3601 while (*nextp
&& *nextp
!= separator
)
3603 /* trailing whitespace should not be included in name */
3604 if (!scanner_isspace(*nextp
))
3608 if (curname
== endp
)
3609 return false; /* empty unquoted name not allowed */
3612 while (scanner_isspace(*nextp
))
3613 nextp
++; /* skip trailing whitespace */
3615 if (*nextp
== separator
)
3618 while (scanner_isspace(*nextp
))
3619 nextp
++; /* skip leading whitespace for next */
3620 /* we expect another name, so done remains false */
3622 else if (*nextp
== '\0')
3625 return false; /* invalid syntax */
3627 /* Now safe to overwrite separator with a null */
3630 /* Truncate path if it's overlength */
3631 if (strlen(curname
) >= MAXPGPATH
)
3632 curname
[MAXPGPATH
- 1] = '\0';
3635 * Finished isolating current name --- add it to list
3637 curname
= pstrdup(curname
);
3638 canonicalize_path(curname
);
3639 *namelist
= lappend(*namelist
, curname
);
3641 /* Loop back if we didn't reach end of string */
3649 * SplitGUCList --- parse a string containing identifiers or file names
3651 * This is used to split the value of a GUC_LIST_QUOTE GUC variable, without
3652 * presuming whether the elements will be taken as identifiers or file names.
3653 * We assume the input has already been through flatten_set_variable_args(),
3654 * so that we need never downcase (if appropriate, that was done already).
3655 * Nor do we ever truncate, since we don't know the correct max length.
3656 * We disallow embedded whitespace for simplicity (it shouldn't matter,
3657 * because any embedded whitespace should have led to double-quoting).
3658 * Otherwise the API is identical to SplitIdentifierString.
3660 * XXX it's annoying to have so many copies of this string-splitting logic.
3661 * However, it's not clear that having one function with a bunch of option
3662 * flags would be much better.
3664 * XXX there is a version of this function in src/bin/pg_dump/dumputils.c.
3665 * Be sure to update that if you have to change this.
3668 * rawstring: the input string; must be overwritable! On return, it's
3669 * been modified to contain the separated identifiers.
3670 * separator: the separator punctuation expected between identifiers
3671 * (typically '.' or ','). Whitespace may also appear around
3674 * namelist: filled with a palloc'd list of pointers to identifiers within
3675 * rawstring. Caller should list_free() this even on error return.
3677 * Returns true if okay, false if there is a syntax error in the string.
3680 SplitGUCList(char *rawstring
, char separator
,
3683 char *nextp
= rawstring
;
3688 while (scanner_isspace(*nextp
))
3689 nextp
++; /* skip leading whitespace */
3692 return true; /* allow empty string */
3694 /* At the top of the loop, we are at start of a new identifier. */
3702 /* Quoted name --- collapse quote-quote pairs */
3703 curname
= nextp
+ 1;
3706 endp
= strchr(nextp
+ 1, '"');
3708 return false; /* mismatched quotes */
3710 break; /* found end of quoted name */
3711 /* Collapse adjacent quotes into one quote, and look again */
3712 memmove(endp
, endp
+ 1, strlen(endp
));
3715 /* endp now points at the terminating quote */
3720 /* Unquoted name --- extends to separator or whitespace */
3722 while (*nextp
&& *nextp
!= separator
&&
3723 !scanner_isspace(*nextp
))
3726 if (curname
== nextp
)
3727 return false; /* empty unquoted name not allowed */
3730 while (scanner_isspace(*nextp
))
3731 nextp
++; /* skip trailing whitespace */
3733 if (*nextp
== separator
)
3736 while (scanner_isspace(*nextp
))
3737 nextp
++; /* skip leading whitespace for next */
3738 /* we expect another name, so done remains false */
3740 else if (*nextp
== '\0')
3743 return false; /* invalid syntax */
3745 /* Now safe to overwrite separator with a null */
3749 * Finished isolating current name --- add it to list
3751 *namelist
= lappend(*namelist
, curname
);
3753 /* Loop back if we didn't reach end of string */
3760 /*****************************************************************************
3761 * Comparison Functions used for bytea
3763 * Note: btree indexes need these routines not to leak memory; therefore,
3764 * be careful to free working copies of toasted datums. Most places don't
3765 * need to be so careful.
3766 *****************************************************************************/
3769 byteaeq(PG_FUNCTION_ARGS
)
3771 Datum arg1
= PG_GETARG_DATUM(0);
3772 Datum arg2
= PG_GETARG_DATUM(1);
3778 * We can use a fast path for unequal lengths, which might save us from
3779 * having to detoast one or both values.
3781 len1
= toast_raw_datum_size(arg1
);
3782 len2
= toast_raw_datum_size(arg2
);
3787 bytea
*barg1
= DatumGetByteaPP(arg1
);
3788 bytea
*barg2
= DatumGetByteaPP(arg2
);
3790 result
= (memcmp(VARDATA_ANY(barg1
), VARDATA_ANY(barg2
),
3791 len1
- VARHDRSZ
) == 0);
3793 PG_FREE_IF_COPY(barg1
, 0);
3794 PG_FREE_IF_COPY(barg2
, 1);
3797 PG_RETURN_BOOL(result
);
3801 byteane(PG_FUNCTION_ARGS
)
3803 Datum arg1
= PG_GETARG_DATUM(0);
3804 Datum arg2
= PG_GETARG_DATUM(1);
3810 * We can use a fast path for unequal lengths, which might save us from
3811 * having to detoast one or both values.
3813 len1
= toast_raw_datum_size(arg1
);
3814 len2
= toast_raw_datum_size(arg2
);
3819 bytea
*barg1
= DatumGetByteaPP(arg1
);
3820 bytea
*barg2
= DatumGetByteaPP(arg2
);
3822 result
= (memcmp(VARDATA_ANY(barg1
), VARDATA_ANY(barg2
),
3823 len1
- VARHDRSZ
) != 0);
3825 PG_FREE_IF_COPY(barg1
, 0);
3826 PG_FREE_IF_COPY(barg2
, 1);
3829 PG_RETURN_BOOL(result
);
3833 bytealt(PG_FUNCTION_ARGS
)
3835 bytea
*arg1
= PG_GETARG_BYTEA_PP(0);
3836 bytea
*arg2
= PG_GETARG_BYTEA_PP(1);
3841 len1
= VARSIZE_ANY_EXHDR(arg1
);
3842 len2
= VARSIZE_ANY_EXHDR(arg2
);
3844 cmp
= memcmp(VARDATA_ANY(arg1
), VARDATA_ANY(arg2
), Min(len1
, len2
));
3846 PG_FREE_IF_COPY(arg1
, 0);
3847 PG_FREE_IF_COPY(arg2
, 1);
3849 PG_RETURN_BOOL((cmp
< 0) || ((cmp
== 0) && (len1
< len2
)));
3853 byteale(PG_FUNCTION_ARGS
)
3855 bytea
*arg1
= PG_GETARG_BYTEA_PP(0);
3856 bytea
*arg2
= PG_GETARG_BYTEA_PP(1);
3861 len1
= VARSIZE_ANY_EXHDR(arg1
);
3862 len2
= VARSIZE_ANY_EXHDR(arg2
);
3864 cmp
= memcmp(VARDATA_ANY(arg1
), VARDATA_ANY(arg2
), Min(len1
, len2
));
3866 PG_FREE_IF_COPY(arg1
, 0);
3867 PG_FREE_IF_COPY(arg2
, 1);
3869 PG_RETURN_BOOL((cmp
< 0) || ((cmp
== 0) && (len1
<= len2
)));
3873 byteagt(PG_FUNCTION_ARGS
)
3875 bytea
*arg1
= PG_GETARG_BYTEA_PP(0);
3876 bytea
*arg2
= PG_GETARG_BYTEA_PP(1);
3881 len1
= VARSIZE_ANY_EXHDR(arg1
);
3882 len2
= VARSIZE_ANY_EXHDR(arg2
);
3884 cmp
= memcmp(VARDATA_ANY(arg1
), VARDATA_ANY(arg2
), Min(len1
, len2
));
3886 PG_FREE_IF_COPY(arg1
, 0);
3887 PG_FREE_IF_COPY(arg2
, 1);
3889 PG_RETURN_BOOL((cmp
> 0) || ((cmp
== 0) && (len1
> len2
)));
3893 byteage(PG_FUNCTION_ARGS
)
3895 bytea
*arg1
= PG_GETARG_BYTEA_PP(0);
3896 bytea
*arg2
= PG_GETARG_BYTEA_PP(1);
3901 len1
= VARSIZE_ANY_EXHDR(arg1
);
3902 len2
= VARSIZE_ANY_EXHDR(arg2
);
3904 cmp
= memcmp(VARDATA_ANY(arg1
), VARDATA_ANY(arg2
), Min(len1
, len2
));
3906 PG_FREE_IF_COPY(arg1
, 0);
3907 PG_FREE_IF_COPY(arg2
, 1);
3909 PG_RETURN_BOOL((cmp
> 0) || ((cmp
== 0) && (len1
>= len2
)));
3913 byteacmp(PG_FUNCTION_ARGS
)
3915 bytea
*arg1
= PG_GETARG_BYTEA_PP(0);
3916 bytea
*arg2
= PG_GETARG_BYTEA_PP(1);
3921 len1
= VARSIZE_ANY_EXHDR(arg1
);
3922 len2
= VARSIZE_ANY_EXHDR(arg2
);
3924 cmp
= memcmp(VARDATA_ANY(arg1
), VARDATA_ANY(arg2
), Min(len1
, len2
));
3925 if ((cmp
== 0) && (len1
!= len2
))
3926 cmp
= (len1
< len2
) ? -1 : 1;
3928 PG_FREE_IF_COPY(arg1
, 0);
3929 PG_FREE_IF_COPY(arg2
, 1);
3931 PG_RETURN_INT32(cmp
);
3935 bytea_larger(PG_FUNCTION_ARGS
)
3937 bytea
*arg1
= PG_GETARG_BYTEA_PP(0);
3938 bytea
*arg2
= PG_GETARG_BYTEA_PP(1);
3944 len1
= VARSIZE_ANY_EXHDR(arg1
);
3945 len2
= VARSIZE_ANY_EXHDR(arg2
);
3947 cmp
= memcmp(VARDATA_ANY(arg1
), VARDATA_ANY(arg2
), Min(len1
, len2
));
3948 result
= ((cmp
> 0) || ((cmp
== 0) && (len1
> len2
)) ? arg1
: arg2
);
3950 PG_RETURN_BYTEA_P(result
);
3954 bytea_smaller(PG_FUNCTION_ARGS
)
3956 bytea
*arg1
= PG_GETARG_BYTEA_PP(0);
3957 bytea
*arg2
= PG_GETARG_BYTEA_PP(1);
3963 len1
= VARSIZE_ANY_EXHDR(arg1
);
3964 len2
= VARSIZE_ANY_EXHDR(arg2
);
3966 cmp
= memcmp(VARDATA_ANY(arg1
), VARDATA_ANY(arg2
), Min(len1
, len2
));
3967 result
= ((cmp
< 0) || ((cmp
== 0) && (len1
< len2
)) ? arg1
: arg2
);
3969 PG_RETURN_BYTEA_P(result
);
3973 bytea_sortsupport(PG_FUNCTION_ARGS
)
3975 SortSupport ssup
= (SortSupport
) PG_GETARG_POINTER(0);
3976 MemoryContext oldcontext
;
3978 oldcontext
= MemoryContextSwitchTo(ssup
->ssup_cxt
);
3980 /* Use generic string SortSupport, forcing "C" collation */
3981 varstr_sortsupport(ssup
, BYTEAOID
, C_COLLATION_OID
);
3983 MemoryContextSwitchTo(oldcontext
);
3989 * appendStringInfoText
3991 * Append a text to str.
3992 * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
3995 appendStringInfoText(StringInfo str
, const text
*t
)
3997 appendBinaryStringInfo(str
, VARDATA_ANY(t
), VARSIZE_ANY_EXHDR(t
));
4002 * replace all occurrences of 'old_sub_str' in 'orig_str'
4003 * with 'new_sub_str' to form 'new_str'
4005 * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
4006 * otherwise returns 'new_str'
4009 replace_text(PG_FUNCTION_ARGS
)
4011 text
*src_text
= PG_GETARG_TEXT_PP(0);
4012 text
*from_sub_text
= PG_GETARG_TEXT_PP(1);
4013 text
*to_sub_text
= PG_GETARG_TEXT_PP(2);
4015 int from_sub_text_len
;
4016 TextPositionState state
;
4024 src_text_len
= VARSIZE_ANY_EXHDR(src_text
);
4025 from_sub_text_len
= VARSIZE_ANY_EXHDR(from_sub_text
);
4027 /* Return unmodified source string if empty source or pattern */
4028 if (src_text_len
< 1 || from_sub_text_len
< 1)
4030 PG_RETURN_TEXT_P(src_text
);
4033 text_position_setup(src_text
, from_sub_text
, PG_GET_COLLATION(), &state
);
4035 found
= text_position_next(&state
);
4037 /* When the from_sub_text is not found, there is nothing to do. */
4040 text_position_cleanup(&state
);
4041 PG_RETURN_TEXT_P(src_text
);
4043 curr_ptr
= text_position_get_match_ptr(&state
);
4044 start_ptr
= VARDATA_ANY(src_text
);
4046 initStringInfo(&str
);
4050 CHECK_FOR_INTERRUPTS();
4052 /* copy the data skipped over by last text_position_next() */
4053 chunk_len
= curr_ptr
- start_ptr
;
4054 appendBinaryStringInfo(&str
, start_ptr
, chunk_len
);
4056 appendStringInfoText(&str
, to_sub_text
);
4058 start_ptr
= curr_ptr
+ from_sub_text_len
;
4060 found
= text_position_next(&state
);
4062 curr_ptr
= text_position_get_match_ptr(&state
);
4066 /* copy trailing data */
4067 chunk_len
= ((char *) src_text
+ VARSIZE_ANY(src_text
)) - start_ptr
;
4068 appendBinaryStringInfo(&str
, start_ptr
, chunk_len
);
4070 text_position_cleanup(&state
);
4072 ret_text
= cstring_to_text_with_len(str
.data
, str
.len
);
4075 PG_RETURN_TEXT_P(ret_text
);
4079 * check_replace_text_has_escape
4081 * Returns 0 if text contains no backslashes that need processing.
4082 * Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4083 * Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
4086 check_replace_text_has_escape(const text
*replace_text
)
4089 const char *p
= VARDATA_ANY(replace_text
);
4090 const char *p_end
= p
+ VARSIZE_ANY_EXHDR(replace_text
);
4094 /* Find next escape char, if any. */
4095 p
= memchr(p
, '\\', p_end
- p
);
4099 /* Note: a backslash at the end doesn't require extra processing. */
4102 if (*p
>= '1' && *p
<= '9')
4103 return 2; /* Found a submatch specifier, so done */
4104 result
= 1; /* Found some other sequence, keep looking */
4112 * appendStringInfoRegexpSubstr
4114 * Append replace_text to str, substituting regexp back references for
4115 * \n escapes. start_ptr is the start of the match in the source string,
4116 * at logical character position data_pos.
4119 appendStringInfoRegexpSubstr(StringInfo str
, text
*replace_text
,
4121 char *start_ptr
, int data_pos
)
4123 const char *p
= VARDATA_ANY(replace_text
);
4124 const char *p_end
= p
+ VARSIZE_ANY_EXHDR(replace_text
);
4128 const char *chunk_start
= p
;
4132 /* Find next escape char, if any. */
4133 p
= memchr(p
, '\\', p_end
- p
);
4137 /* Copy the text we just scanned over, if any. */
4138 if (p
> chunk_start
)
4139 appendBinaryStringInfo(str
, chunk_start
, p
- chunk_start
);
4141 /* Done if at end of string, else advance over escape char. */
4148 /* Escape at very end of input. Treat same as unexpected char */
4149 appendStringInfoChar(str
, '\\');
4153 if (*p
>= '1' && *p
<= '9')
4155 /* Use the back reference of regexp. */
4158 so
= pmatch
[idx
].rm_so
;
4159 eo
= pmatch
[idx
].rm_eo
;
4164 /* Use the entire matched string. */
4165 so
= pmatch
[0].rm_so
;
4166 eo
= pmatch
[0].rm_eo
;
4169 else if (*p
== '\\')
4171 /* \\ means transfer one \ to output. */
4172 appendStringInfoChar(str
, '\\');
4179 * If escape char is not followed by any expected char, just treat
4180 * it as ordinary data to copy. (XXX would it be better to throw
4183 appendStringInfoChar(str
, '\\');
4187 if (so
>= 0 && eo
>= 0)
4190 * Copy the text that is back reference of regexp. Note so and eo
4191 * are counted in characters not bytes.
4196 Assert(so
>= data_pos
);
4197 chunk_start
= start_ptr
;
4198 chunk_start
+= charlen_to_bytelen(chunk_start
, so
- data_pos
);
4199 chunk_len
= charlen_to_bytelen(chunk_start
, eo
- so
);
4200 appendBinaryStringInfo(str
, chunk_start
, chunk_len
);
4206 * replace_text_regexp
4208 * replace substring(s) in src_text that match pattern with replace_text.
4209 * The replace_text can contain backslash markers to substitute
4210 * (parts of) the matched text.
4212 * cflags: regexp compile flags.
4213 * collation: collation to use.
4214 * search_start: the character (not byte) offset in src_text at which to
4216 * n: if 0, replace all matches; if > 0, replace only the N'th match.
4219 replace_text_regexp(text
*src_text
, text
*pattern_text
,
4221 int cflags
, Oid collation
,
4222 int search_start
, int n
)
4226 int src_text_len
= VARSIZE_ANY_EXHDR(src_text
);
4229 regmatch_t pmatch
[10]; /* main match, plus \1 to \9 */
4230 int nmatch
= lengthof(pmatch
);
4237 initStringInfo(&buf
);
4239 /* Convert data string to wide characters. */
4240 data
= (pg_wchar
*) palloc((src_text_len
+ 1) * sizeof(pg_wchar
));
4241 data_len
= pg_mb2wchar_with_len(VARDATA_ANY(src_text
), data
, src_text_len
);
4243 /* Check whether replace_text has escapes, especially regexp submatches. */
4244 escape_status
= check_replace_text_has_escape(replace_text
);
4246 /* If no regexp submatches, we can use REG_NOSUB. */
4247 if (escape_status
< 2)
4249 cflags
|= REG_NOSUB
;
4250 /* Also tell pg_regexec we only want the whole-match location. */
4254 /* Prepare the regexp. */
4255 re
= RE_compile_and_cache(pattern_text
, cflags
, collation
);
4257 /* start_ptr points to the data_pos'th character of src_text */
4258 start_ptr
= (char *) VARDATA_ANY(src_text
);
4261 while (search_start
<= data_len
)
4265 CHECK_FOR_INTERRUPTS();
4267 regexec_result
= pg_regexec(re
,
4271 NULL
, /* no details */
4276 if (regexec_result
== REG_NOMATCH
)
4279 if (regexec_result
!= REG_OKAY
)
4283 pg_regerror(regexec_result
, re
, errMsg
, sizeof(errMsg
));
4285 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION
),
4286 errmsg("regular expression failed: %s", errMsg
)));
4290 * Count matches, and decide whether to replace this match.
4293 if (n
> 0 && nmatches
!= n
)
4296 * No, so advance search_start, but not start_ptr/data_pos. (Thus,
4297 * we treat the matched text as if it weren't matched, and copy it
4298 * to the output later.)
4300 search_start
= pmatch
[0].rm_eo
;
4301 if (pmatch
[0].rm_so
== pmatch
[0].rm_eo
)
4307 * Copy the text to the left of the match position. Note we are given
4308 * character not byte indexes.
4310 if (pmatch
[0].rm_so
- data_pos
> 0)
4314 chunk_len
= charlen_to_bytelen(start_ptr
,
4315 pmatch
[0].rm_so
- data_pos
);
4316 appendBinaryStringInfo(&buf
, start_ptr
, chunk_len
);
4319 * Advance start_ptr over that text, to avoid multiple rescans of
4320 * it if the replace_text contains multiple back-references.
4322 start_ptr
+= chunk_len
;
4323 data_pos
= pmatch
[0].rm_so
;
4327 * Copy the replace_text, processing escapes if any are present.
4329 if (escape_status
> 0)
4330 appendStringInfoRegexpSubstr(&buf
, replace_text
, pmatch
,
4331 start_ptr
, data_pos
);
4333 appendStringInfoText(&buf
, replace_text
);
4335 /* Advance start_ptr and data_pos over the matched text. */
4336 start_ptr
+= charlen_to_bytelen(start_ptr
,
4337 pmatch
[0].rm_eo
- data_pos
);
4338 data_pos
= pmatch
[0].rm_eo
;
4341 * If we only want to replace one occurrence, we're done.
4347 * Advance search position. Normally we start the next search at the
4348 * end of the previous match; but if the match was of zero length, we
4349 * have to advance by one character, or we'd just find the same match
4352 search_start
= data_pos
;
4353 if (pmatch
[0].rm_so
== pmatch
[0].rm_eo
)
4358 * Copy the text to the right of the last match.
4360 if (data_pos
< data_len
)
4364 chunk_len
= ((char *) src_text
+ VARSIZE_ANY(src_text
)) - start_ptr
;
4365 appendBinaryStringInfo(&buf
, start_ptr
, chunk_len
);
4368 ret_text
= cstring_to_text_with_len(buf
.data
, buf
.len
);
4377 * parse input string based on provided field separator
4378 * return N'th item (1 based, negative counts from end)
4381 split_part(PG_FUNCTION_ARGS
)
4383 text
*inputstring
= PG_GETARG_TEXT_PP(0);
4384 text
*fldsep
= PG_GETARG_TEXT_PP(1);
4385 int fldnum
= PG_GETARG_INT32(2);
4386 int inputstring_len
;
4388 TextPositionState state
;
4394 /* field number is 1 based */
4397 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
4398 errmsg("field position must not be zero")));
4400 inputstring_len
= VARSIZE_ANY_EXHDR(inputstring
);
4401 fldsep_len
= VARSIZE_ANY_EXHDR(fldsep
);
4403 /* return empty string for empty input string */
4404 if (inputstring_len
< 1)
4405 PG_RETURN_TEXT_P(cstring_to_text(""));
4407 /* handle empty field separator */
4410 /* if first or last field, return input string, else empty string */
4411 if (fldnum
== 1 || fldnum
== -1)
4412 PG_RETURN_TEXT_P(inputstring
);
4414 PG_RETURN_TEXT_P(cstring_to_text(""));
4417 /* find the first field separator */
4418 text_position_setup(inputstring
, fldsep
, PG_GET_COLLATION(), &state
);
4420 found
= text_position_next(&state
);
4422 /* special case if fldsep not found at all */
4425 text_position_cleanup(&state
);
4426 /* if first or last field, return input string, else empty string */
4427 if (fldnum
== 1 || fldnum
== -1)
4428 PG_RETURN_TEXT_P(inputstring
);
4430 PG_RETURN_TEXT_P(cstring_to_text(""));
4434 * take care of a negative field number (i.e. count from the right) by
4435 * converting to a positive field number; we need total number of fields
4439 /* we found a fldsep, so there are at least two fields */
4442 while (text_position_next(&state
))
4445 /* special case of last field does not require an extra pass */
4448 start_ptr
= text_position_get_match_ptr(&state
) + fldsep_len
;
4449 end_ptr
= VARDATA_ANY(inputstring
) + inputstring_len
;
4450 text_position_cleanup(&state
);
4451 PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr
,
4452 end_ptr
- start_ptr
));
4455 /* else, convert fldnum to positive notation */
4456 fldnum
+= numfields
+ 1;
4458 /* if nonexistent field, return empty string */
4461 text_position_cleanup(&state
);
4462 PG_RETURN_TEXT_P(cstring_to_text(""));
4465 /* reset to pointing at first match, but now with positive fldnum */
4466 text_position_reset(&state
);
4467 found
= text_position_next(&state
);
4471 /* identify bounds of first field */
4472 start_ptr
= VARDATA_ANY(inputstring
);
4473 end_ptr
= text_position_get_match_ptr(&state
);
4475 while (found
&& --fldnum
> 0)
4477 /* identify bounds of next field */
4478 start_ptr
= end_ptr
+ fldsep_len
;
4479 found
= text_position_next(&state
);
4481 end_ptr
= text_position_get_match_ptr(&state
);
4484 text_position_cleanup(&state
);
4488 /* N'th field separator not found */
4489 /* if last field requested, return it, else empty string */
4492 int last_len
= start_ptr
- VARDATA_ANY(inputstring
);
4494 result_text
= cstring_to_text_with_len(start_ptr
,
4495 inputstring_len
- last_len
);
4498 result_text
= cstring_to_text("");
4502 /* non-last field requested */
4503 result_text
= cstring_to_text_with_len(start_ptr
, end_ptr
- start_ptr
);
4506 PG_RETURN_TEXT_P(result_text
);
4510 * Convenience function to return true when two text params are equal.
4513 text_isequal(text
*txt1
, text
*txt2
, Oid collid
)
4515 return DatumGetBool(DirectFunctionCall2Coll(texteq
,
4517 PointerGetDatum(txt1
),
4518 PointerGetDatum(txt2
)));
4523 * parse input string and return text array of elements,
4524 * based on provided field separator
4527 text_to_array(PG_FUNCTION_ARGS
)
4529 SplitTextOutputData tstate
;
4531 /* For array output, tstate should start as all zeroes */
4532 memset(&tstate
, 0, sizeof(tstate
));
4534 if (!split_text(fcinfo
, &tstate
))
4537 if (tstate
.astate
== NULL
)
4538 PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID
));
4540 PG_RETURN_DATUM(makeArrayResult(tstate
.astate
,
4541 CurrentMemoryContext
));
4545 * text_to_array_null
4546 * parse input string and return text array of elements,
4547 * based on provided field separator and null string
4549 * This is a separate entry point only to prevent the regression tests from
4550 * complaining about different argument sets for the same internal function.
4553 text_to_array_null(PG_FUNCTION_ARGS
)
4555 return text_to_array(fcinfo
);
4560 * parse input string and return table of elements,
4561 * based on provided field separator
4564 text_to_table(PG_FUNCTION_ARGS
)
4566 ReturnSetInfo
*rsi
= (ReturnSetInfo
*) fcinfo
->resultinfo
;
4567 SplitTextOutputData tstate
;
4569 tstate
.astate
= NULL
;
4570 InitMaterializedSRF(fcinfo
, MAT_SRF_USE_EXPECTED_DESC
);
4571 tstate
.tupstore
= rsi
->setResult
;
4572 tstate
.tupdesc
= rsi
->setDesc
;
4574 (void) split_text(fcinfo
, &tstate
);
4580 * text_to_table_null
4581 * parse input string and return table of elements,
4582 * based on provided field separator and null string
4584 * This is a separate entry point only to prevent the regression tests from
4585 * complaining about different argument sets for the same internal function.
4588 text_to_table_null(PG_FUNCTION_ARGS
)
4590 return text_to_table(fcinfo
);
4594 * Common code for text_to_array, text_to_array_null, text_to_table
4595 * and text_to_table_null functions.
4597 * These are not strict so we have to test for null inputs explicitly.
4598 * Returns false if result is to be null, else returns true.
4600 * Note that if the result is valid but empty (zero elements), we return
4601 * without changing *tstate --- caller must handle that case, too.
4604 split_text(FunctionCallInfo fcinfo
, SplitTextOutputData
*tstate
)
4609 Oid collation
= PG_GET_COLLATION();
4610 int inputstring_len
;
4615 /* when input string is NULL, then result is NULL too */
4616 if (PG_ARGISNULL(0))
4619 inputstring
= PG_GETARG_TEXT_PP(0);
4621 /* fldsep can be NULL */
4622 if (!PG_ARGISNULL(1))
4623 fldsep
= PG_GETARG_TEXT_PP(1);
4627 /* null_string can be NULL or omitted */
4628 if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4629 null_string
= PG_GETARG_TEXT_PP(2);
4636 * Normal case with non-null fldsep. Use the text_position machinery
4637 * to search for occurrences of fldsep.
4639 TextPositionState state
;
4641 inputstring_len
= VARSIZE_ANY_EXHDR(inputstring
);
4642 fldsep_len
= VARSIZE_ANY_EXHDR(fldsep
);
4644 /* return empty set for empty input string */
4645 if (inputstring_len
< 1)
4648 /* empty field separator: return input string as a one-element set */
4651 split_text_accum_result(tstate
, inputstring
,
4652 null_string
, collation
);
4656 text_position_setup(inputstring
, fldsep
, collation
, &state
);
4658 start_ptr
= VARDATA_ANY(inputstring
);
4666 CHECK_FOR_INTERRUPTS();
4668 found
= text_position_next(&state
);
4671 /* fetch last field */
4672 chunk_len
= ((char *) inputstring
+ VARSIZE_ANY(inputstring
)) - start_ptr
;
4673 end_ptr
= NULL
; /* not used, but some compilers complain */
4677 /* fetch non-last field */
4678 end_ptr
= text_position_get_match_ptr(&state
);
4679 chunk_len
= end_ptr
- start_ptr
;
4682 /* build a temp text datum to pass to split_text_accum_result */
4683 result_text
= cstring_to_text_with_len(start_ptr
, chunk_len
);
4685 /* stash away this field */
4686 split_text_accum_result(tstate
, result_text
,
4687 null_string
, collation
);
4694 start_ptr
= end_ptr
+ fldsep_len
;
4697 text_position_cleanup(&state
);
4702 * When fldsep is NULL, each character in the input string becomes a
4703 * separate element in the result set. The separator is effectively
4704 * the space between characters.
4706 inputstring_len
= VARSIZE_ANY_EXHDR(inputstring
);
4708 start_ptr
= VARDATA_ANY(inputstring
);
4710 while (inputstring_len
> 0)
4712 int chunk_len
= pg_mblen(start_ptr
);
4714 CHECK_FOR_INTERRUPTS();
4716 /* build a temp text datum to pass to split_text_accum_result */
4717 result_text
= cstring_to_text_with_len(start_ptr
, chunk_len
);
4719 /* stash away this field */
4720 split_text_accum_result(tstate
, result_text
,
4721 null_string
, collation
);
4725 start_ptr
+= chunk_len
;
4726 inputstring_len
-= chunk_len
;
4734 * Add text item to result set (table or array).
4736 * This is also responsible for checking to see if the item matches
4737 * the null_string, in which case we should emit NULL instead.
4740 split_text_accum_result(SplitTextOutputData
*tstate
,
4745 bool is_null
= false;
4747 if (null_string
&& text_isequal(field_value
, null_string
, collation
))
4750 if (tstate
->tupstore
)
4755 values
[0] = PointerGetDatum(field_value
);
4758 tuplestore_putvalues(tstate
->tupstore
,
4765 tstate
->astate
= accumArrayResult(tstate
->astate
,
4766 PointerGetDatum(field_value
),
4769 CurrentMemoryContext
);
4775 * concatenate Cstring representation of input array elements
4776 * using provided field separator
4779 array_to_text(PG_FUNCTION_ARGS
)
4781 ArrayType
*v
= PG_GETARG_ARRAYTYPE_P(0);
4782 char *fldsep
= text_to_cstring(PG_GETARG_TEXT_PP(1));
4784 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo
, v
, fldsep
, NULL
));
4788 * array_to_text_null
4789 * concatenate Cstring representation of input array elements
4790 * using provided field separator and null string
4792 * This version is not strict so we have to test for null inputs explicitly.
4795 array_to_text_null(PG_FUNCTION_ARGS
)
4801 /* returns NULL when first or second parameter is NULL */
4802 if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4805 v
= PG_GETARG_ARRAYTYPE_P(0);
4806 fldsep
= text_to_cstring(PG_GETARG_TEXT_PP(1));
4808 /* NULL null string is passed through as a null pointer */
4809 if (!PG_ARGISNULL(2))
4810 null_string
= text_to_cstring(PG_GETARG_TEXT_PP(2));
4814 PG_RETURN_TEXT_P(array_to_text_internal(fcinfo
, v
, fldsep
, null_string
));
4818 * common code for array_to_text and array_to_text_null functions
4821 array_to_text_internal(FunctionCallInfo fcinfo
, ArrayType
*v
,
4822 const char *fldsep
, const char *null_string
)
4833 bool printed
= false;
4838 ArrayMetaState
*my_extra
;
4840 ndims
= ARR_NDIM(v
);
4842 nitems
= ArrayGetNItems(ndims
, dims
);
4844 /* if there are no elements, return an empty string */
4846 return cstring_to_text_with_len("", 0);
4848 element_type
= ARR_ELEMTYPE(v
);
4849 initStringInfo(&buf
);
4852 * We arrange to look up info about element type, including its output
4853 * conversion proc, only once per series of calls, assuming the element
4854 * type doesn't change underneath us.
4856 my_extra
= (ArrayMetaState
*) fcinfo
->flinfo
->fn_extra
;
4857 if (my_extra
== NULL
)
4859 fcinfo
->flinfo
->fn_extra
= MemoryContextAlloc(fcinfo
->flinfo
->fn_mcxt
,
4860 sizeof(ArrayMetaState
));
4861 my_extra
= (ArrayMetaState
*) fcinfo
->flinfo
->fn_extra
;
4862 my_extra
->element_type
= ~element_type
;
4865 if (my_extra
->element_type
!= element_type
)
4868 * Get info about element type, including its output conversion proc
4870 get_type_io_data(element_type
, IOFunc_output
,
4871 &my_extra
->typlen
, &my_extra
->typbyval
,
4872 &my_extra
->typalign
, &my_extra
->typdelim
,
4873 &my_extra
->typioparam
, &my_extra
->typiofunc
);
4874 fmgr_info_cxt(my_extra
->typiofunc
, &my_extra
->proc
,
4875 fcinfo
->flinfo
->fn_mcxt
);
4876 my_extra
->element_type
= element_type
;
4878 typlen
= my_extra
->typlen
;
4879 typbyval
= my_extra
->typbyval
;
4880 typalign
= my_extra
->typalign
;
4882 p
= ARR_DATA_PTR(v
);
4883 bitmap
= ARR_NULLBITMAP(v
);
4886 for (i
= 0; i
< nitems
; i
++)
4891 /* Get source element, checking for NULL */
4892 if (bitmap
&& (*bitmap
& bitmask
) == 0)
4894 /* if null_string is NULL, we just ignore null elements */
4895 if (null_string
!= NULL
)
4898 appendStringInfo(&buf
, "%s%s", fldsep
, null_string
);
4900 appendStringInfoString(&buf
, null_string
);
4906 itemvalue
= fetch_att(p
, typbyval
, typlen
);
4908 value
= OutputFunctionCall(&my_extra
->proc
, itemvalue
);
4911 appendStringInfo(&buf
, "%s%s", fldsep
, value
);
4913 appendStringInfoString(&buf
, value
);
4916 p
= att_addlength_pointer(p
, typlen
, p
);
4917 p
= (char *) att_align_nominal(p
, typalign
);
4920 /* advance bitmap pointer if any */
4924 if (bitmask
== 0x100)
4932 result
= cstring_to_text_with_len(buf
.data
, buf
.len
);
4939 * Workhorse for to_bin, to_oct, and to_hex. Note that base must be > 1 and <=
4942 static inline text
*
4943 convert_to_base(uint64 value
, int base
)
4945 const char *digits
= "0123456789abcdef";
4947 /* We size the buffer for to_bin's longest possible return value. */
4948 char buf
[sizeof(uint64
) * BITS_PER_BYTE
];
4949 char *const end
= buf
+ sizeof(buf
);
4957 *--ptr
= digits
[value
% base
];
4959 } while (ptr
> buf
&& value
);
4961 return cstring_to_text_with_len(ptr
, end
- ptr
);
4965 * Convert an integer to a string containing a base-2 (binary) representation
4969 to_bin32(PG_FUNCTION_ARGS
)
4971 uint64 value
= (uint32
) PG_GETARG_INT32(0);
4973 PG_RETURN_TEXT_P(convert_to_base(value
, 2));
4976 to_bin64(PG_FUNCTION_ARGS
)
4978 uint64 value
= (uint64
) PG_GETARG_INT64(0);
4980 PG_RETURN_TEXT_P(convert_to_base(value
, 2));
4984 * Convert an integer to a string containing a base-8 (oct) representation of
4988 to_oct32(PG_FUNCTION_ARGS
)
4990 uint64 value
= (uint32
) PG_GETARG_INT32(0);
4992 PG_RETURN_TEXT_P(convert_to_base(value
, 8));
4995 to_oct64(PG_FUNCTION_ARGS
)
4997 uint64 value
= (uint64
) PG_GETARG_INT64(0);
4999 PG_RETURN_TEXT_P(convert_to_base(value
, 8));
5003 * Convert an integer to a string containing a base-16 (hex) representation of
5007 to_hex32(PG_FUNCTION_ARGS
)
5009 uint64 value
= (uint32
) PG_GETARG_INT32(0);
5011 PG_RETURN_TEXT_P(convert_to_base(value
, 16));
5014 to_hex64(PG_FUNCTION_ARGS
)
5016 uint64 value
= (uint64
) PG_GETARG_INT64(0);
5018 PG_RETURN_TEXT_P(convert_to_base(value
, 16));
5022 * Return the size of a datum, possibly compressed
5024 * Works on any data type
5027 pg_column_size(PG_FUNCTION_ARGS
)
5029 Datum value
= PG_GETARG_DATUM(0);
5033 /* On first call, get the input type's typlen, and save at *fn_extra */
5034 if (fcinfo
->flinfo
->fn_extra
== NULL
)
5036 /* Lookup the datatype of the supplied argument */
5037 Oid argtypeid
= get_fn_expr_argtype(fcinfo
->flinfo
, 0);
5039 typlen
= get_typlen(argtypeid
);
5040 if (typlen
== 0) /* should not happen */
5041 elog(ERROR
, "cache lookup failed for type %u", argtypeid
);
5043 fcinfo
->flinfo
->fn_extra
= MemoryContextAlloc(fcinfo
->flinfo
->fn_mcxt
,
5045 *((int *) fcinfo
->flinfo
->fn_extra
) = typlen
;
5048 typlen
= *((int *) fcinfo
->flinfo
->fn_extra
);
5052 /* varlena type, possibly toasted */
5053 result
= toast_datum_size(value
);
5055 else if (typlen
== -2)
5058 result
= strlen(DatumGetCString(value
)) + 1;
5062 /* ordinary fixed-width type */
5066 PG_RETURN_INT32(result
);
5070 * Return the compression method stored in the compressed attribute. Return
5071 * NULL for non varlena type or uncompressed data.
5074 pg_column_compression(PG_FUNCTION_ARGS
)
5078 ToastCompressionId cmid
;
5080 /* On first call, get the input type's typlen, and save at *fn_extra */
5081 if (fcinfo
->flinfo
->fn_extra
== NULL
)
5083 /* Lookup the datatype of the supplied argument */
5084 Oid argtypeid
= get_fn_expr_argtype(fcinfo
->flinfo
, 0);
5086 typlen
= get_typlen(argtypeid
);
5087 if (typlen
== 0) /* should not happen */
5088 elog(ERROR
, "cache lookup failed for type %u", argtypeid
);
5090 fcinfo
->flinfo
->fn_extra
= MemoryContextAlloc(fcinfo
->flinfo
->fn_mcxt
,
5092 *((int *) fcinfo
->flinfo
->fn_extra
) = typlen
;
5095 typlen
= *((int *) fcinfo
->flinfo
->fn_extra
);
5100 /* get the compression method id stored in the compressed varlena */
5101 cmid
= toast_get_compression_id((struct varlena
*)
5102 DatumGetPointer(PG_GETARG_DATUM(0)));
5103 if (cmid
== TOAST_INVALID_COMPRESSION_ID
)
5106 /* convert compression method id to compression method name */
5109 case TOAST_PGLZ_COMPRESSION_ID
:
5112 case TOAST_LZ4_COMPRESSION_ID
:
5116 elog(ERROR
, "invalid compression method id %d", cmid
);
5119 PG_RETURN_TEXT_P(cstring_to_text(result
));
5123 * Return the chunk_id of the on-disk TOASTed value. Return NULL if the value
5124 * is un-TOASTed or not on-disk.
5127 pg_column_toast_chunk_id(PG_FUNCTION_ARGS
)
5130 struct varlena
*attr
;
5131 struct varatt_external toast_pointer
;
5133 /* On first call, get the input type's typlen, and save at *fn_extra */
5134 if (fcinfo
->flinfo
->fn_extra
== NULL
)
5136 /* Lookup the datatype of the supplied argument */
5137 Oid argtypeid
= get_fn_expr_argtype(fcinfo
->flinfo
, 0);
5139 typlen
= get_typlen(argtypeid
);
5140 if (typlen
== 0) /* should not happen */
5141 elog(ERROR
, "cache lookup failed for type %u", argtypeid
);
5143 fcinfo
->flinfo
->fn_extra
= MemoryContextAlloc(fcinfo
->flinfo
->fn_mcxt
,
5145 *((int *) fcinfo
->flinfo
->fn_extra
) = typlen
;
5148 typlen
= *((int *) fcinfo
->flinfo
->fn_extra
);
5153 attr
= (struct varlena
*) DatumGetPointer(PG_GETARG_DATUM(0));
5155 if (!VARATT_IS_EXTERNAL_ONDISK(attr
))
5158 VARATT_EXTERNAL_GET_POINTER(toast_pointer
, attr
);
5160 PG_RETURN_OID(toast_pointer
.va_valueid
);
5164 * string_agg - Concatenates values and returns string.
5166 * Syntax: string_agg(value text, delimiter text) RETURNS text
5168 * Note: Any NULL values are ignored. The first-call delimiter isn't
5169 * actually used at all, and on subsequent calls the delimiter precedes
5170 * the associated value.
5173 /* subroutine to initialize state */
5175 makeStringAggState(FunctionCallInfo fcinfo
)
5178 MemoryContext aggcontext
;
5179 MemoryContext oldcontext
;
5181 if (!AggCheckCallContext(fcinfo
, &aggcontext
))
5183 /* cannot be called directly because of internal-type argument */
5184 elog(ERROR
, "string_agg_transfn called in non-aggregate context");
5188 * Create state in aggregate context. It'll stay there across subsequent
5191 oldcontext
= MemoryContextSwitchTo(aggcontext
);
5192 state
= makeStringInfo();
5193 MemoryContextSwitchTo(oldcontext
);
5199 string_agg_transfn(PG_FUNCTION_ARGS
)
5203 state
= PG_ARGISNULL(0) ? NULL
: (StringInfo
) PG_GETARG_POINTER(0);
5205 /* Append the value unless null, preceding it with the delimiter. */
5206 if (!PG_ARGISNULL(1))
5208 text
*value
= PG_GETARG_TEXT_PP(1);
5209 bool isfirst
= false;
5212 * You might think we can just throw away the first delimiter, however
5213 * we must keep it as we may be a parallel worker doing partial
5214 * aggregation building a state to send to the main process. We need
5215 * to keep the delimiter of every aggregation so that the combine
5216 * function can properly join up the strings of two separately
5217 * partially aggregated results. The first delimiter is only stripped
5218 * off in the final function. To know how much to strip off the front
5219 * of the string, we store the length of the first delimiter in the
5220 * StringInfo's cursor field, which we don't otherwise need here.
5224 state
= makeStringAggState(fcinfo
);
5228 if (!PG_ARGISNULL(2))
5230 text
*delim
= PG_GETARG_TEXT_PP(2);
5232 appendStringInfoText(state
, delim
);
5234 state
->cursor
= VARSIZE_ANY_EXHDR(delim
);
5237 appendStringInfoText(state
, value
);
5241 * The transition type for string_agg() is declared to be "internal",
5242 * which is a pass-by-value type the same size as a pointer.
5245 PG_RETURN_POINTER(state
);
5250 * string_agg_combine
5251 * Aggregate combine function for string_agg(text) and string_agg(bytea)
5254 string_agg_combine(PG_FUNCTION_ARGS
)
5258 MemoryContext agg_context
;
5260 if (!AggCheckCallContext(fcinfo
, &agg_context
))
5261 elog(ERROR
, "aggregate function called in non-aggregate context");
5263 state1
= PG_ARGISNULL(0) ? NULL
: (StringInfo
) PG_GETARG_POINTER(0);
5264 state2
= PG_ARGISNULL(1) ? NULL
: (StringInfo
) PG_GETARG_POINTER(1);
5269 * NULL state2 is easy, just return state1, which we know is already
5270 * in the agg_context
5274 PG_RETURN_POINTER(state1
);
5279 /* We must copy state2's data into the agg_context */
5280 MemoryContext old_context
;
5282 old_context
= MemoryContextSwitchTo(agg_context
);
5283 state1
= makeStringAggState(fcinfo
);
5284 appendBinaryStringInfo(state1
, state2
->data
, state2
->len
);
5285 state1
->cursor
= state2
->cursor
;
5286 MemoryContextSwitchTo(old_context
);
5288 else if (state2
->len
> 0)
5290 /* Combine ... state1->cursor does not change in this case */
5291 appendBinaryStringInfo(state1
, state2
->data
, state2
->len
);
5294 PG_RETURN_POINTER(state1
);
5298 * string_agg_serialize
5299 * Aggregate serialize function for string_agg(text) and string_agg(bytea)
5301 * This is strict, so we need not handle NULL input
5304 string_agg_serialize(PG_FUNCTION_ARGS
)
5310 /* cannot be called directly because of internal-type argument */
5311 Assert(AggCheckCallContext(fcinfo
, NULL
));
5313 state
= (StringInfo
) PG_GETARG_POINTER(0);
5315 pq_begintypsend(&buf
);
5318 pq_sendint(&buf
, state
->cursor
, 4);
5321 pq_sendbytes(&buf
, state
->data
, state
->len
);
5323 result
= pq_endtypsend(&buf
);
5325 PG_RETURN_BYTEA_P(result
);
5329 * string_agg_deserialize
5330 * Aggregate deserial function for string_agg(text) and string_agg(bytea)
5332 * This is strict, so we need not handle NULL input
5335 string_agg_deserialize(PG_FUNCTION_ARGS
)
5343 /* cannot be called directly because of internal-type argument */
5344 Assert(AggCheckCallContext(fcinfo
, NULL
));
5346 sstate
= PG_GETARG_BYTEA_PP(0);
5349 * Initialize a StringInfo so that we can "receive" it using the standard
5350 * recv-function infrastructure.
5352 initReadOnlyStringInfo(&buf
, VARDATA_ANY(sstate
),
5353 VARSIZE_ANY_EXHDR(sstate
));
5355 result
= makeStringAggState(fcinfo
);
5358 result
->cursor
= pq_getmsgint(&buf
, 4);
5361 datalen
= VARSIZE_ANY_EXHDR(sstate
) - 4;
5362 data
= (char *) pq_getmsgbytes(&buf
, datalen
);
5363 appendBinaryStringInfo(result
, data
, datalen
);
5367 PG_RETURN_POINTER(result
);
5371 string_agg_finalfn(PG_FUNCTION_ARGS
)
5375 /* cannot be called directly because of internal-type argument */
5376 Assert(AggCheckCallContext(fcinfo
, NULL
));
5378 state
= PG_ARGISNULL(0) ? NULL
: (StringInfo
) PG_GETARG_POINTER(0);
5382 /* As per comment in transfn, strip data before the cursor position */
5383 PG_RETURN_TEXT_P(cstring_to_text_with_len(&state
->data
[state
->cursor
],
5384 state
->len
- state
->cursor
));
5391 * Prepare cache with fmgr info for the output functions of the datatypes of
5392 * the arguments of a concat-like function, beginning with argument "argidx".
5393 * (Arguments before that will have corresponding slots in the resulting
5394 * FmgrInfo array, but we don't fill those slots.)
5397 build_concat_foutcache(FunctionCallInfo fcinfo
, int argidx
)
5399 FmgrInfo
*foutcache
;
5402 /* We keep the info in fn_mcxt so it survives across calls */
5403 foutcache
= (FmgrInfo
*) MemoryContextAlloc(fcinfo
->flinfo
->fn_mcxt
,
5404 PG_NARGS() * sizeof(FmgrInfo
));
5406 for (i
= argidx
; i
< PG_NARGS(); i
++)
5412 valtype
= get_fn_expr_argtype(fcinfo
->flinfo
, i
);
5413 if (!OidIsValid(valtype
))
5414 elog(ERROR
, "could not determine data type of concat() input");
5416 getTypeOutputInfo(valtype
, &typOutput
, &typIsVarlena
);
5417 fmgr_info_cxt(typOutput
, &foutcache
[i
], fcinfo
->flinfo
->fn_mcxt
);
5420 fcinfo
->flinfo
->fn_extra
= foutcache
;
5426 * Implementation of both concat() and concat_ws().
5428 * sepstr is the separator string to place between values.
5429 * argidx identifies the first argument to concatenate (counting from zero);
5430 * note that this must be constant across any one series of calls.
5432 * Returns NULL if result should be NULL, else text value.
5435 concat_internal(const char *sepstr
, int argidx
,
5436 FunctionCallInfo fcinfo
)
5440 FmgrInfo
*foutcache
;
5441 bool first_arg
= true;
5445 * concat(VARIADIC some-array) is essentially equivalent to
5446 * array_to_text(), ie concat the array elements with the given separator.
5447 * So we just pass the case off to that code.
5449 if (get_fn_expr_variadic(fcinfo
->flinfo
))
5453 /* Should have just the one argument */
5454 Assert(argidx
== PG_NARGS() - 1);
5456 /* concat(VARIADIC NULL) is defined as NULL */
5457 if (PG_ARGISNULL(argidx
))
5461 * Non-null argument had better be an array. We assume that any call
5462 * context that could let get_fn_expr_variadic return true will have
5463 * checked that a VARIADIC-labeled parameter actually is an array. So
5464 * it should be okay to just Assert that it's an array rather than
5465 * doing a full-fledged error check.
5467 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo
->flinfo
, argidx
))));
5469 /* OK, safe to fetch the array value */
5470 arr
= PG_GETARG_ARRAYTYPE_P(argidx
);
5473 * And serialize the array. We tell array_to_text to ignore null
5474 * elements, which matches the behavior of the loop below.
5476 return array_to_text_internal(fcinfo
, arr
, sepstr
, NULL
);
5479 /* Normal case without explicit VARIADIC marker */
5480 initStringInfo(&str
);
5482 /* Get output function info, building it if first time through */
5483 foutcache
= (FmgrInfo
*) fcinfo
->flinfo
->fn_extra
;
5484 if (foutcache
== NULL
)
5485 foutcache
= build_concat_foutcache(fcinfo
, argidx
);
5487 for (i
= argidx
; i
< PG_NARGS(); i
++)
5489 if (!PG_ARGISNULL(i
))
5491 Datum value
= PG_GETARG_DATUM(i
);
5493 /* add separator if appropriate */
5497 appendStringInfoString(&str
, sepstr
);
5499 /* call the appropriate type output function, append the result */
5500 appendStringInfoString(&str
,
5501 OutputFunctionCall(&foutcache
[i
], value
));
5505 result
= cstring_to_text_with_len(str
.data
, str
.len
);
5512 * Concatenate all arguments. NULL arguments are ignored.
5515 text_concat(PG_FUNCTION_ARGS
)
5519 result
= concat_internal("", 0, fcinfo
);
5522 PG_RETURN_TEXT_P(result
);
5526 * Concatenate all but first argument value with separators. The first
5527 * parameter is used as the separator. NULL arguments are ignored.
5530 text_concat_ws(PG_FUNCTION_ARGS
)
5535 /* return NULL when separator is NULL */
5536 if (PG_ARGISNULL(0))
5538 sep
= text_to_cstring(PG_GETARG_TEXT_PP(0));
5540 result
= concat_internal(sep
, 1, fcinfo
);
5543 PG_RETURN_TEXT_P(result
);
5547 * Return first n characters in the string. When n is negative,
5548 * return all but last |n| characters.
5551 text_left(PG_FUNCTION_ARGS
)
5553 int n
= PG_GETARG_INT32(1);
5557 text
*str
= PG_GETARG_TEXT_PP(0);
5558 const char *p
= VARDATA_ANY(str
);
5559 int len
= VARSIZE_ANY_EXHDR(str
);
5562 n
= pg_mbstrlen_with_len(p
, len
) + n
;
5563 rlen
= pg_mbcharcliplen(p
, len
, n
);
5564 PG_RETURN_TEXT_P(cstring_to_text_with_len(p
, rlen
));
5567 PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0), 1, n
, false));
5571 * Return last n characters in the string. When n is negative,
5572 * return all but first |n| characters.
5575 text_right(PG_FUNCTION_ARGS
)
5577 text
*str
= PG_GETARG_TEXT_PP(0);
5578 const char *p
= VARDATA_ANY(str
);
5579 int len
= VARSIZE_ANY_EXHDR(str
);
5580 int n
= PG_GETARG_INT32(1);
5586 n
= pg_mbstrlen_with_len(p
, len
) - n
;
5587 off
= pg_mbcharcliplen(p
, len
, n
);
5589 PG_RETURN_TEXT_P(cstring_to_text_with_len(p
+ off
, len
- off
));
5593 * Return reversed string
5596 text_reverse(PG_FUNCTION_ARGS
)
5598 text
*str
= PG_GETARG_TEXT_PP(0);
5599 const char *p
= VARDATA_ANY(str
);
5600 int len
= VARSIZE_ANY_EXHDR(str
);
5601 const char *endp
= p
+ len
;
5605 result
= palloc(len
+ VARHDRSZ
);
5606 dst
= (char *) VARDATA(result
) + len
;
5607 SET_VARSIZE(result
, len
+ VARHDRSZ
);
5609 if (pg_database_encoding_max_length() > 1)
5611 /* multibyte version */
5624 /* single byte version */
5629 PG_RETURN_TEXT_P(result
);
/*
 * Support macros for text_format()
 */

/* flag bit: the '-' (left-justify) flag was given */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * Step "ptr" to the next byte of the format string, raising an error if
 * that would reach "end_ptr", i.e. the specifier is cut off by the end of
 * the string.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
5648 * Returns a formatted string
5651 text_format(PG_FUNCTION_ARGS
)
5656 const char *start_ptr
;
5657 const char *end_ptr
;
5662 Datum
*elements
= NULL
;
5664 Oid element_type
= InvalidOid
;
5665 Oid prev_type
= InvalidOid
;
5666 Oid prev_width_type
= InvalidOid
;
5667 FmgrInfo typoutputfinfo
;
5668 FmgrInfo typoutputinfo_width
;
5670 /* When format string is null, immediately return null */
5671 if (PG_ARGISNULL(0))
5674 /* If argument is marked VARIADIC, expand array into elements */
5675 if (get_fn_expr_variadic(fcinfo
->flinfo
))
5683 /* Should have just the one argument */
5684 Assert(PG_NARGS() == 2);
5686 /* If argument is NULL, we treat it as zero-length array */
5687 if (PG_ARGISNULL(1))
5692 * Non-null argument had better be an array. We assume that any
5693 * call context that could let get_fn_expr_variadic return true
5694 * will have checked that a VARIADIC-labeled parameter actually is
5695 * an array. So it should be okay to just Assert that it's an
5696 * array rather than doing a full-fledged error check.
5698 Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo
->flinfo
, 1))));
5700 /* OK, safe to fetch the array value */
5701 arr
= PG_GETARG_ARRAYTYPE_P(1);
5703 /* Get info about array element type */
5704 element_type
= ARR_ELEMTYPE(arr
);
5705 get_typlenbyvalalign(element_type
,
5706 &elmlen
, &elmbyval
, &elmalign
);
5708 /* Extract all array elements */
5709 deconstruct_array(arr
, element_type
, elmlen
, elmbyval
, elmalign
,
5710 &elements
, &nulls
, &nitems
);
5714 funcvariadic
= true;
5718 /* Non-variadic case, we'll process the arguments individually */
5720 funcvariadic
= false;
5723 /* Setup for main loop. */
5724 fmt
= PG_GETARG_TEXT_PP(0);
5725 start_ptr
= VARDATA_ANY(fmt
);
5726 end_ptr
= start_ptr
+ VARSIZE_ANY_EXHDR(fmt
);
5727 initStringInfo(&str
);
5728 arg
= 1; /* next argument position to print */
5730 /* Scan format string, looking for conversion specifiers. */
5731 for (cp
= start_ptr
; cp
< end_ptr
; cp
++)
5742 * If it's not the start of a conversion specifier, just copy it to
5743 * the output buffer.
5747 appendStringInfoCharMacro(&str
, *cp
);
5751 ADVANCE_PARSE_POINTER(cp
, end_ptr
);
5753 /* Easy case: %% outputs a single % */
5756 appendStringInfoCharMacro(&str
, *cp
);
5760 /* Parse the optional portions of the format specifier */
5761 cp
= text_format_parse_format(cp
, end_ptr
,
5766 * Next we should see the main conversion specifier. Whether or not
5767 * an argument position was present, it's known that at least one
5768 * character remains in the string at this point. Experience suggests
5769 * that it's worth checking that that character is one of the expected
5770 * ones before we try to fetch arguments, so as to produce the least
5771 * confusing response to a mis-formatted specifier.
5773 if (strchr("sIL", *cp
) == NULL
)
5775 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
5776 errmsg("unrecognized format() type specifier \"%.*s\"",
5778 errhint("For a single \"%%\" use \"%%%%\".")));
5780 /* If indirect width was specified, get its value */
5783 /* Collect the specified or next argument position */
5788 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
5789 errmsg("too few arguments for format()")));
5791 /* Get the value and type of the selected argument */
5794 value
= PG_GETARG_DATUM(arg
);
5795 isNull
= PG_ARGISNULL(arg
);
5796 typid
= get_fn_expr_argtype(fcinfo
->flinfo
, arg
);
5800 value
= elements
[arg
- 1];
5801 isNull
= nulls
[arg
- 1];
5802 typid
= element_type
;
5804 if (!OidIsValid(typid
))
5805 elog(ERROR
, "could not determine data type of format() input");
5809 /* We can treat NULL width the same as zero */
5812 else if (typid
== INT4OID
)
5813 width
= DatumGetInt32(value
);
5814 else if (typid
== INT2OID
)
5815 width
= DatumGetInt16(value
);
5818 /* For less-usual datatypes, convert to text then to int */
5821 if (typid
!= prev_width_type
)
5826 getTypeOutputInfo(typid
, &typoutputfunc
, &typIsVarlena
);
5827 fmgr_info(typoutputfunc
, &typoutputinfo_width
);
5828 prev_width_type
= typid
;
5831 str
= OutputFunctionCall(&typoutputinfo_width
, value
);
5833 /* pg_strtoint32 will complain about bad data or overflow */
5834 width
= pg_strtoint32(str
);
5840 /* Collect the specified or next argument position */
5845 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
5846 errmsg("too few arguments for format()")));
5848 /* Get the value and type of the selected argument */
5851 value
= PG_GETARG_DATUM(arg
);
5852 isNull
= PG_ARGISNULL(arg
);
5853 typid
= get_fn_expr_argtype(fcinfo
->flinfo
, arg
);
5857 value
= elements
[arg
- 1];
5858 isNull
= nulls
[arg
- 1];
5859 typid
= element_type
;
5861 if (!OidIsValid(typid
))
5862 elog(ERROR
, "could not determine data type of format() input");
5867 * Get the appropriate typOutput function, reusing previous one if
5868 * same type as previous argument. That's particularly useful in the
5869 * variadic-array case, but often saves work even for ordinary calls.
5871 if (typid
!= prev_type
)
5876 getTypeOutputInfo(typid
, &typoutputfunc
, &typIsVarlena
);
5877 fmgr_info(typoutputfunc
, &typoutputfinfo
);
5882 * And now we can format the value.
5889 text_format_string_conversion(&str
, *cp
, &typoutputfinfo
,
5894 /* should not get here, because of previous check */
5896 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
5897 errmsg("unrecognized format() type specifier \"%.*s\"",
5899 errhint("For a single \"%%\" use \"%%%%\".")));
5904 /* Don't need deconstruct_array results anymore. */
5905 if (elements
!= NULL
)
5910 /* Generate results. */
5911 result
= cstring_to_text_with_len(str
.data
, str
.len
);
5914 PG_RETURN_TEXT_P(result
);
5918 * Parse contiguous digits as a decimal number.
5920 * Returns true if some digits could be parsed.
5921 * The value is returned into *value, and *ptr is advanced to the next
5922 * character to be parsed.
5924 * Note parsing invariant: at least one character is known available before
5925 * string end (end_ptr) at entry, and this is still true at exit.
5928 text_format_parse_digits(const char **ptr
, const char *end_ptr
, int *value
)
5931 const char *cp
= *ptr
;
5934 while (*cp
>= '0' && *cp
<= '9')
5936 int8 digit
= (*cp
- '0');
5938 if (unlikely(pg_mul_s32_overflow(val
, 10, &val
)) ||
5939 unlikely(pg_add_s32_overflow(val
, digit
, &val
)))
5941 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE
),
5942 errmsg("number is out of range")));
5943 ADVANCE_PARSE_POINTER(cp
, end_ptr
);
5954 * Parse a format specifier (generally following the SUS printf spec).
5956 * We have already advanced over the initial '%', and we are looking for
5957 * [argpos][flags][width]type (but the type character is not consumed here).
5959 * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5960 * Output parameters:
5961 * argpos: argument position for value to be printed. -1 means unspecified.
5962 * widthpos: argument position for width. Zero means the argument position
5963 * was unspecified (ie, take the next arg) and -1 means no width
5964 * argument (width was omitted or specified as a constant).
5965 * flags: bitmask of flags.
5966 * width: directly-specified width value. Zero means the width was omitted
5967 * (note it's not necessary to distinguish this case from an explicit
5968 * zero width value).
5970 * The function result is the next character position to be parsed, ie, the
5971 * location where the type character is/should be.
5973 * Note parsing invariant: at least one character is known available before
5974 * string end (end_ptr) at entry, and this is still true at exit.
5977 text_format_parse_format(const char *start_ptr
, const char *end_ptr
,
5978 int *argpos
, int *widthpos
,
5979 int *flags
, int *width
)
5981 const char *cp
= start_ptr
;
5984 /* set defaults for output parameters */
5990 /* try to identify first number */
5991 if (text_format_parse_digits(&cp
, end_ptr
, &n
))
5995 /* Must be just a width and a type, so we're done */
5999 /* The number was argument position */
6001 /* Explicit 0 for argument index is immediately refused */
6004 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
6005 errmsg("format specifies argument 0, but arguments are numbered from 1")));
6006 ADVANCE_PARSE_POINTER(cp
, end_ptr
);
6009 /* Handle flags (only minus is supported now) */
6012 *flags
|= TEXT_FORMAT_FLAG_MINUS
;
6013 ADVANCE_PARSE_POINTER(cp
, end_ptr
);
6018 /* Handle indirect width */
6019 ADVANCE_PARSE_POINTER(cp
, end_ptr
);
6020 if (text_format_parse_digits(&cp
, end_ptr
, &n
))
6022 /* number in this position must be closed by $ */
6025 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
6026 errmsg("width argument position must be ended by \"$\"")));
6027 /* The number was width argument position */
6029 /* Explicit 0 for argument index is immediately refused */
6032 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
6033 errmsg("format specifies argument 0, but arguments are numbered from 1")));
6034 ADVANCE_PARSE_POINTER(cp
, end_ptr
);
6037 *widthpos
= 0; /* width's argument position is unspecified */
6041 /* Check for direct width specification */
6042 if (text_format_parse_digits(&cp
, end_ptr
, &n
))
6046 /* cp should now be pointing at type character */
6051 * Format a %s, %I, or %L conversion
6054 text_format_string_conversion(StringInfo buf
, char conversion
,
6055 FmgrInfo
*typOutputInfo
,
6056 Datum value
, bool isNull
,
6057 int flags
, int width
)
6061 /* Handle NULL arguments before trying to stringify the value. */
6064 if (conversion
== 's')
6065 text_format_append_string(buf
, "", flags
, width
);
6066 else if (conversion
== 'L')
6067 text_format_append_string(buf
, "NULL", flags
, width
);
6068 else if (conversion
== 'I')
6070 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED
),
6071 errmsg("null values cannot be formatted as an SQL identifier")));
6076 str
= OutputFunctionCall(typOutputInfo
, value
);
6079 if (conversion
== 'I')
6081 /* quote_identifier may or may not allocate a new string. */
6082 text_format_append_string(buf
, quote_identifier(str
), flags
, width
);
6084 else if (conversion
== 'L')
6086 char *qstr
= quote_literal_cstr(str
);
6088 text_format_append_string(buf
, qstr
, flags
, width
);
6089 /* quote_literal_cstr() always allocates a new string */
6093 text_format_append_string(buf
, str
, flags
, width
);
6100 * Append str to buf, padding as directed by flags/width
6103 text_format_append_string(StringInfo buf
, const char *str
,
6104 int flags
, int width
)
6106 bool align_to_left
= false;
6109 /* fast path for typical easy case */
6112 appendStringInfoString(buf
, str
);
6118 /* Negative width: implicit '-' flag, then take absolute value */
6119 align_to_left
= true;
6120 /* -INT_MIN is undefined */
6121 if (width
<= INT_MIN
)
6123 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE
),
6124 errmsg("number is out of range")));
6127 else if (flags
& TEXT_FORMAT_FLAG_MINUS
)
6128 align_to_left
= true;
6130 len
= pg_mbstrlen(str
);
6134 appendStringInfoString(buf
, str
);
6136 appendStringInfoSpaces(buf
, width
- len
);
6142 appendStringInfoSpaces(buf
, width
- len
);
6143 appendStringInfoString(buf
, str
);
6148 * text_format_nv - nonvariadic wrapper for text_format function.
6150 * note: this wrapper is necessary to pass the sanity check in opr_sanity,
6151 * which checks that all built-in functions that share the implementing C
6152 * function take the same number of arguments.
6155 text_format_nv(PG_FUNCTION_ARGS
)
6157 return text_format(fcinfo
);
/*
 * Helper function for Levenshtein distance functions.  Faster than memcmp(),
 * for this use case.
 *
 * Compares the first "len" bytes of s1 and s2, starting from the last byte
 * and working backwards; returns true if they are all equal.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			pos;

	for (pos = len - 1; pos >= 0; pos--)
	{
		if (s1[pos] != s2[pos])
			return false;
	}

	return true;
}
6176 /* Expand each Levenshtein distance variant */
6177 #include "levenshtein.c"
6178 #define LEVENSHTEIN_LESS_EQUAL
6179 #include "levenshtein.c"
/*
 * The following *ClosestMatch() functions can be used to determine whether a
 * user-provided string resembles any known valid values, which is useful for
 * providing hints in log messages, among other things.  Use these functions
 * as follows:
 *
 *		initClosestMatch(&state, source_string, max_distance);
 *
 *		for (int i = 0; i < num_valid_strings; i++)
 *			updateClosestMatch(&state, valid_strings[i]);
 *
 *		closestMatch = getClosestMatch(&state);
 */
6197 * Initialize the given state with the source string and maximum Levenshtein
6198 * distance to consider.
6201 initClosestMatch(ClosestMatchState
*state
, const char *source
, int max_d
)
6206 state
->source
= source
;
6208 state
->max_d
= max_d
;
6209 state
->match
= NULL
;
6213 * If the candidate string is a closer match than the current one saved (or
6214 * there is no match saved), save it as the closest match.
6216 * If the source or candidate string is NULL, empty, or too long, this function
6217 * takes no action. Likewise, if the Levenshtein distance exceeds the maximum
6218 * allowed or more than half the characters are different, no action is taken.
6221 updateClosestMatch(ClosestMatchState
*state
, const char *candidate
)
6227 if (state
->source
== NULL
|| state
->source
[0] == '\0' ||
6228 candidate
== NULL
|| candidate
[0] == '\0')
6232 * To avoid ERROR-ing, we check the lengths here instead of setting
6233 * 'trusted' to false in the call to varstr_levenshtein_less_equal().
6235 if (strlen(state
->source
) > MAX_LEVENSHTEIN_STRLEN
||
6236 strlen(candidate
) > MAX_LEVENSHTEIN_STRLEN
)
6239 dist
= varstr_levenshtein_less_equal(state
->source
, strlen(state
->source
),
6240 candidate
, strlen(candidate
), 1, 1, 1,
6241 state
->max_d
, true);
6242 if (dist
<= state
->max_d
&&
6243 dist
<= strlen(state
->source
) / 2 &&
6244 (state
->min_d
== -1 || dist
< state
->min_d
))
6246 state
->min_d
= dist
;
6247 state
->match
= candidate
;
6252 * Return the closest match. If no suitable candidates were provided via
6253 * updateClosestMatch(), return NULL.
6256 getClosestMatch(ClosestMatchState
*state
)
6260 return state
->match
;
6268 static UnicodeNormalizationForm
6269 unicode_norm_form_from_string(const char *formstr
)
6271 UnicodeNormalizationForm form
= -1;
6274 * Might as well check this while we're here.
6276 if (GetDatabaseEncoding() != PG_UTF8
)
6278 (errcode(ERRCODE_SYNTAX_ERROR
),
6279 errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
6281 if (pg_strcasecmp(formstr
, "NFC") == 0)
6283 else if (pg_strcasecmp(formstr
, "NFD") == 0)
6285 else if (pg_strcasecmp(formstr
, "NFKC") == 0)
6286 form
= UNICODE_NFKC
;
6287 else if (pg_strcasecmp(formstr
, "NFKD") == 0)
6288 form
= UNICODE_NFKD
;
6291 (errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
6292 errmsg("invalid normalization form: %s", formstr
)));
6298 * Returns version of Unicode used by Postgres in "major.minor" format (the
6299 * same format as the Unicode version reported by ICU). The third component
6300 * ("update version") never involves additions to the character repertoire and
6301 * is unimportant for most purposes.
6303 * See: https://unicode.org/versions/
6306 unicode_version(PG_FUNCTION_ARGS
)
6308 PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION
));
6312 * Returns version of Unicode used by ICU, if enabled; otherwise NULL.
6315 icu_unicode_version(PG_FUNCTION_ARGS
)
6318 PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION
));
6325 * Check whether the string contains only assigned Unicode code
6326 * points. Requires that the database encoding is UTF-8.
6329 unicode_assigned(PG_FUNCTION_ARGS
)
6331 text
*input
= PG_GETARG_TEXT_PP(0);
6335 if (GetDatabaseEncoding() != PG_UTF8
)
6337 (errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
6339 /* convert to pg_wchar */
6340 size
= pg_mbstrlen_with_len(VARDATA_ANY(input
), VARSIZE_ANY_EXHDR(input
));
6341 p
= (unsigned char *) VARDATA_ANY(input
);
6342 for (int i
= 0; i
< size
; i
++)
6344 pg_wchar uchar
= utf8_to_unicode(p
);
6345 int category
= unicode_category(uchar
);
6347 if (category
== PG_U_UNASSIGNED
)
6348 PG_RETURN_BOOL(false);
6350 p
+= pg_utf_mblen(p
);
6353 PG_RETURN_BOOL(true);
6357 unicode_normalize_func(PG_FUNCTION_ARGS
)
6359 text
*input
= PG_GETARG_TEXT_PP(0);
6360 char *formstr
= text_to_cstring(PG_GETARG_TEXT_PP(1));
6361 UnicodeNormalizationForm form
;
6363 pg_wchar
*input_chars
;
6364 pg_wchar
*output_chars
;
6369 form
= unicode_norm_form_from_string(formstr
);
6371 /* convert to pg_wchar */
6372 size
= pg_mbstrlen_with_len(VARDATA_ANY(input
), VARSIZE_ANY_EXHDR(input
));
6373 input_chars
= palloc((size
+ 1) * sizeof(pg_wchar
));
6374 p
= (unsigned char *) VARDATA_ANY(input
);
6375 for (i
= 0; i
< size
; i
++)
6377 input_chars
[i
] = utf8_to_unicode(p
);
6378 p
+= pg_utf_mblen(p
);
6380 input_chars
[i
] = (pg_wchar
) '\0';
6381 Assert((char *) p
== VARDATA_ANY(input
) + VARSIZE_ANY_EXHDR(input
));
6384 output_chars
= unicode_normalize(form
, input_chars
);
6386 /* convert back to UTF-8 string */
6388 for (pg_wchar
*wp
= output_chars
; *wp
; wp
++)
6390 unsigned char buf
[4];
6392 unicode_to_utf8(*wp
, buf
);
6393 size
+= pg_utf_mblen(buf
);
6396 result
= palloc(size
+ VARHDRSZ
);
6397 SET_VARSIZE(result
, size
+ VARHDRSZ
);
6399 p
= (unsigned char *) VARDATA_ANY(result
);
6400 for (pg_wchar
*wp
= output_chars
; *wp
; wp
++)
6402 unicode_to_utf8(*wp
, p
);
6403 p
+= pg_utf_mblen(p
);
6405 Assert((char *) p
== (char *) result
+ size
+ VARHDRSZ
);
6407 PG_RETURN_TEXT_P(result
);
6411 * Check whether the string is in the specified Unicode normalization form.
6413 * This is done by converting the string to the specified normal form and then
6414 * comparing that to the original string. To speed that up, we also apply the
6415 * "quick check" algorithm specified in UAX #15, which can give a yes or no
6416 * answer for many strings by just scanning the string once.
6418 * This function should generally be optimized for the case where the string
6419 * is in fact normalized. In that case, we'll end up looking at the entire
6420 * string, so it's probably not worth doing any incremental conversion etc.
6423 unicode_is_normalized(PG_FUNCTION_ARGS
)
6425 text
*input
= PG_GETARG_TEXT_PP(0);
6426 char *formstr
= text_to_cstring(PG_GETARG_TEXT_PP(1));
6427 UnicodeNormalizationForm form
;
6429 pg_wchar
*input_chars
;
6430 pg_wchar
*output_chars
;
6433 UnicodeNormalizationQC quickcheck
;
6437 form
= unicode_norm_form_from_string(formstr
);
6439 /* convert to pg_wchar */
6440 size
= pg_mbstrlen_with_len(VARDATA_ANY(input
), VARSIZE_ANY_EXHDR(input
));
6441 input_chars
= palloc((size
+ 1) * sizeof(pg_wchar
));
6442 p
= (unsigned char *) VARDATA_ANY(input
);
6443 for (i
= 0; i
< size
; i
++)
6445 input_chars
[i
] = utf8_to_unicode(p
);
6446 p
+= pg_utf_mblen(p
);
6448 input_chars
[i
] = (pg_wchar
) '\0';
6449 Assert((char *) p
== VARDATA_ANY(input
) + VARSIZE_ANY_EXHDR(input
));
6451 /* quick check (see UAX #15) */
6452 quickcheck
= unicode_is_normalized_quickcheck(form
, input_chars
);
6453 if (quickcheck
== UNICODE_NORM_QC_YES
)
6454 PG_RETURN_BOOL(true);
6455 else if (quickcheck
== UNICODE_NORM_QC_NO
)
6456 PG_RETURN_BOOL(false);
6458 /* normalize and compare with original */
6459 output_chars
= unicode_normalize(form
, input_chars
);
6462 for (pg_wchar
*wp
= output_chars
; *wp
; wp
++)
6465 result
= (size
== output_size
) &&
6466 (memcmp(input_chars
, output_chars
, size
* sizeof(pg_wchar
)) == 0);
6468 PG_RETURN_BOOL(result
);
/*
 * Check if first n chars are hexadecimal digits
 *
 * Exactly n bytes of instr are examined; the caller must make sure that
 * many are available.
 */
static bool
isxdigits_n(const char *instr, size_t n)
{
	const char *stop = instr + n;

	while (instr < stop)
	{
		if (!isxdigit((unsigned char) *instr))
			return false;
		instr++;
	}

	return true;
}
6485 hexval(unsigned char c
)
6487 if (c
>= '0' && c
<= '9')
6489 if (c
>= 'a' && c
<= 'f')
6490 return c
- 'a' + 0xA;
6491 if (c
>= 'A' && c
<= 'F')
6492 return c
- 'A' + 0xA;
6493 elog(ERROR
, "invalid hexadecimal digit");
6494 return 0; /* not reached */
/*
 * Translate string with hexadecimal digits to number
 *
 * The first n characters of instr are read as a big-endian hex number: the
 * first character is the most significant digit.
 */
static unsigned int
hexval_n(const char *instr, size_t n)
{
	unsigned int value = 0;
	size_t		shift = 4 * n;
	const char *stop = instr + n;

	while (instr < stop)
	{
		/* each digit sits four bits below the previous one */
		shift -= 4;
		value += hexval(*instr) << shift;
		instr++;
	}

	return value;
}
6512 * Replaces Unicode escape sequences by Unicode characters
6515 unistr(PG_FUNCTION_ARGS
)
6517 text
*input_text
= PG_GETARG_TEXT_PP(0);
6522 pg_wchar pair_first
= 0;
6523 char cbuf
[MAX_UNICODE_EQUIVALENT_STRING
+ 1];
6525 instr
= VARDATA_ANY(input_text
);
6526 len
= VARSIZE_ANY_EXHDR(input_text
);
6528 initStringInfo(&str
);
6532 if (instr
[0] == '\\')
6539 appendStringInfoChar(&str
, '\\');
6543 else if ((len
>= 5 && isxdigits_n(instr
+ 1, 4)) ||
6544 (len
>= 6 && instr
[1] == 'u' && isxdigits_n(instr
+ 2, 4)))
6547 int offset
= instr
[1] == 'u' ? 2 : 1;
6549 unicode
= hexval_n(instr
+ offset
, 4);
6551 if (!is_valid_unicode_codepoint(unicode
))
6553 errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
6554 errmsg("invalid Unicode code point: %04X", unicode
));
6558 if (is_utf16_surrogate_second(unicode
))
6560 unicode
= surrogate_pair_to_codepoint(pair_first
, unicode
);
6566 else if (is_utf16_surrogate_second(unicode
))
6569 if (is_utf16_surrogate_first(unicode
))
6570 pair_first
= unicode
;
6573 pg_unicode_to_server(unicode
, (unsigned char *) cbuf
);
6574 appendStringInfoString(&str
, cbuf
);
6577 instr
+= 4 + offset
;
6580 else if (len
>= 8 && instr
[1] == '+' && isxdigits_n(instr
+ 2, 6))
6584 unicode
= hexval_n(instr
+ 2, 6);
6586 if (!is_valid_unicode_codepoint(unicode
))
6588 errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
6589 errmsg("invalid Unicode code point: %04X", unicode
));
6593 if (is_utf16_surrogate_second(unicode
))
6595 unicode
= surrogate_pair_to_codepoint(pair_first
, unicode
);
6601 else if (is_utf16_surrogate_second(unicode
))
6604 if (is_utf16_surrogate_first(unicode
))
6605 pair_first
= unicode
;
6608 pg_unicode_to_server(unicode
, (unsigned char *) cbuf
);
6609 appendStringInfoString(&str
, cbuf
);
6615 else if (len
>= 10 && instr
[1] == 'U' && isxdigits_n(instr
+ 2, 8))
6619 unicode
= hexval_n(instr
+ 2, 8);
6621 if (!is_valid_unicode_codepoint(unicode
))
6623 errcode(ERRCODE_INVALID_PARAMETER_VALUE
),
6624 errmsg("invalid Unicode code point: %04X", unicode
));
6628 if (is_utf16_surrogate_second(unicode
))
6630 unicode
= surrogate_pair_to_codepoint(pair_first
, unicode
);
6636 else if (is_utf16_surrogate_second(unicode
))
6639 if (is_utf16_surrogate_first(unicode
))
6640 pair_first
= unicode
;
6643 pg_unicode_to_server(unicode
, (unsigned char *) cbuf
);
6644 appendStringInfoString(&str
, cbuf
);
6652 (errcode(ERRCODE_SYNTAX_ERROR
),
6653 errmsg("invalid Unicode escape"),
6654 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX, or \\UXXXXXXXX.")));
6661 appendStringInfoChar(&str
, *instr
++);
6666 /* unfinished surrogate pair? */
6670 result
= cstring_to_text_with_len(str
.data
, str
.len
);
6673 PG_RETURN_TEXT_P(result
);
6677 (errcode(ERRCODE_SYNTAX_ERROR
),
6678 errmsg("invalid Unicode surrogate pair")));
6679 PG_RETURN_NULL(); /* keep compiler quiet */