jsonpath scanner: reentrant scanner
[pgsql.git] / src / common / unicode_case.c
blob542e7ea537810c3e82297fed3b080f6f6120c995
1 /*-------------------------------------------------------------------------
2 * unicode_case.c
3 * Unicode case mapping and case conversion.
5 * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
7 * IDENTIFICATION
8 * src/common/unicode_case.c
10 *-------------------------------------------------------------------------
12 #ifndef FRONTEND
13 #include "postgres.h"
14 #else
15 #include "postgres_fe.h"
16 #endif
18 #include "common/unicode_case.h"
19 #include "common/unicode_case_table.h"
20 #include "mb/pg_wchar.h"
22 static const pg_case_map *find_case_map(pg_wchar ucs);
23 static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
24 CaseKind str_casekind, WordBoundaryNext wbnext,
25 void *wbstate);
27 pg_wchar
28 unicode_lowercase_simple(pg_wchar code)
30 const pg_case_map *map = find_case_map(code);
32 return map ? map->simplemap[CaseLower] : code;
35 pg_wchar
36 unicode_titlecase_simple(pg_wchar code)
38 const pg_case_map *map = find_case_map(code);
40 return map ? map->simplemap[CaseTitle] : code;
43 pg_wchar
44 unicode_uppercase_simple(pg_wchar code)
46 const pg_case_map *map = find_case_map(code);
48 return map ? map->simplemap[CaseUpper] : code;
52 * unicode_strlower()
54 * Convert src to lowercase, and return the result length (not including
55 * terminating NUL).
57 * String src must be encoded in UTF-8. If srclen < 0, src must be
58 * NUL-terminated.
60 * Result string is stored in dst, truncating if larger than dstsize. If
61 * dstsize is greater than the result length, dst will be NUL-terminated;
62 * otherwise not.
64 * If dstsize is zero, dst may be NULL. This is useful for calculating the
65 * required buffer size before allocating.
67 size_t
68 unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
70 return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
74 * unicode_strtitle()
76 * Convert src to titlecase, and return the result length (not including
77 * terminating NUL).
79 * String src must be encoded in UTF-8. If srclen < 0, src must be
80 * NUL-terminated.
82 * Result string is stored in dst, truncating if larger than dstsize. If
83 * dstsize is greater than the result length, dst will be NUL-terminated;
84 * otherwise not.
86 * If dstsize is zero, dst may be NULL. This is useful for calculating the
87 * required buffer size before allocating.
89 * Titlecasing requires knowledge about word boundaries, which is provided by
90 * the callback wbnext. A word boundary is the offset of the start of a word
91 * or the offset of the character immediately following a word.
93 * The caller is expected to initialize and free the callback state
94 * wbstate. The callback should first return offset 0 for the first boundary;
95 * then the offset of each subsequent word boundary; then the total length of
96 * the string to indicate the final boundary.
98 size_t
99 unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
100 WordBoundaryNext wbnext, void *wbstate)
102 return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
103 wbstate);
107 * unicode_strupper()
109 * Convert src to uppercase, and return the result length (not including
110 * terminating NUL).
112 * String src must be encoded in UTF-8. If srclen < 0, src must be
113 * NUL-terminated.
115 * Result string is stored in dst, truncating if larger than dstsize. If
116 * dstsize is greater than the result length, dst will be NUL-terminated;
117 * otherwise not.
119 * If dstsize is zero, dst may be NULL. This is useful for calculating the
120 * required buffer size before allocating.
122 size_t
123 unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
125 return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
129 * If str_casekind is CaseLower or CaseUpper, map each character in the string
130 * for which a mapping is available.
132 * If str_casekind is CaseTitle, maps characters found on a word boundary to
133 * uppercase and other characters to lowercase.
135 static size_t
136 convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
137 CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
139 /* character CaseKind varies while titlecasing */
140 CaseKind chr_casekind = str_casekind;
141 size_t srcoff = 0;
142 size_t result_len = 0;
143 size_t boundary = 0;
145 Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
146 (str_casekind != CaseTitle && !wbnext && !wbstate));
148 if (str_casekind == CaseTitle)
150 boundary = wbnext(wbstate);
151 Assert(boundary == 0); /* start of text is always a boundary */
154 while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
156 pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
157 int u1len = unicode_utf8len(u1);
158 const pg_case_map *casemap = find_case_map(u1);
160 if (str_casekind == CaseTitle)
162 if (srcoff == boundary)
164 chr_casekind = CaseUpper;
165 boundary = wbnext(wbstate);
167 else
168 chr_casekind = CaseLower;
171 /* perform mapping, update result_len, and write to dst */
172 if (casemap)
174 pg_wchar u2 = casemap->simplemap[chr_casekind];
175 pg_wchar u2len = unicode_utf8len(u2);
177 if (result_len + u2len <= dstsize)
178 unicode_to_utf8(u2, (unsigned char *) dst + result_len);
180 result_len += u2len;
182 else
184 /* no mapping; copy bytes from src */
185 if (result_len + u1len <= dstsize)
186 memcpy(dst + result_len, src + srcoff, u1len);
188 result_len += u1len;
191 srcoff += u1len;
194 if (result_len < dstsize)
195 dst[result_len] = '\0';
197 return result_len;
200 /* find entry in simple case map, if any */
201 static const pg_case_map *
202 find_case_map(pg_wchar ucs)
204 int min;
205 int mid;
206 int max;
208 /* all chars <= 0x80 are stored in array for fast lookup */
209 Assert(lengthof(case_map) >= 0x80);
210 if (ucs < 0x80)
212 const pg_case_map *map = &case_map[ucs];
214 Assert(map->codepoint == ucs);
215 return map;
218 /* otherwise, binary search */
219 min = 0x80;
220 max = lengthof(case_map) - 1;
221 while (max >= min)
223 mid = (min + max) / 2;
224 if (ucs > case_map[mid].codepoint)
225 min = mid + 1;
226 else if (ucs < case_map[mid].codepoint)
227 max = mid - 1;
228 else
229 return &case_map[mid];
232 return NULL;