1 /*-------------------------------------------------------------------------
3 * Unicode case mapping and case conversion.
5 * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
8 * src/common/unicode_case.c
10 *-------------------------------------------------------------------------
15 #include "postgres_fe.h"
18 #include "common/unicode_case.h"
19 #include "common/unicode_case_table.h"
20 #include "mb/pg_wchar.h"
22 static const pg_case_map
*find_case_map(pg_wchar ucs
);
23 static size_t convert_case(char *dst
, size_t dstsize
, const char *src
, ssize_t srclen
,
24 CaseKind str_casekind
, WordBoundaryNext wbnext
,
/*
 * unicode_lowercase_simple()
 *
 * Look up code with find_case_map() and return the CaseLower entry of its
 * simplemap. If no map entry exists, code is returned unchanged.
 */
28 unicode_lowercase_simple(pg_wchar code
)
30 const pg_case_map
*map
= find_case_map(code
);
/* a NULL map means there is no case-map entry for this code point */
32 return map
? map
->simplemap
[CaseLower
] : code
;
/*
 * unicode_titlecase_simple()
 *
 * Look up code with find_case_map() and return the CaseTitle entry of its
 * simplemap. If no map entry exists, code is returned unchanged.
 */
36 unicode_titlecase_simple(pg_wchar code
)
38 const pg_case_map
*map
= find_case_map(code
);
/* a NULL map means there is no case-map entry for this code point */
40 return map
? map
->simplemap
[CaseTitle
] : code
;
/*
 * unicode_uppercase_simple()
 *
 * Look up code with find_case_map() and return the CaseUpper entry of its
 * simplemap. If no map entry exists, code is returned unchanged.
 */
44 unicode_uppercase_simple(pg_wchar code
)
46 const pg_case_map
*map
= find_case_map(code
);
/* a NULL map means there is no case-map entry for this code point */
48 return map
? map
->simplemap
[CaseUpper
] : code
;
54 * Convert src to lowercase, and return the result length (not including
55 * the terminating NUL).
57 * String src must be encoded in UTF-8. If srclen < 0, src must be
58 * NUL-terminated.
60 * Result string is stored in dst, truncating if larger than dstsize. If
61 * dstsize is greater than the result length, dst will be NUL-terminated;
62 * otherwise it is not.
64 * If dstsize is zero, dst may be NULL. This is useful for calculating the
65 * required buffer size before allocating.
/*
 * Delegates to convert_case() with CaseLower. Lowercasing takes no
 * word-boundary callback, so NULL is passed for both wbnext and wbstate.
 */
68 unicode_strlower(char *dst
, size_t dstsize
, const char *src
, ssize_t srclen
)
70 return convert_case(dst
, dstsize
, src
, srclen
, CaseLower
, NULL
, NULL
);
76 * Convert src to titlecase, and return the result length (not including
77 * the terminating NUL).
79 * String src must be encoded in UTF-8. If srclen < 0, src must be
80 * NUL-terminated.
82 * Result string is stored in dst, truncating if larger than dstsize. If
83 * dstsize is greater than the result length, dst will be NUL-terminated;
84 * otherwise it is not.
86 * If dstsize is zero, dst may be NULL. This is useful for calculating the
87 * required buffer size before allocating.
89 * Titlecasing requires knowledge about word boundaries, which is provided by
90 * the callback wbnext. A word boundary is the offset of the start of a word
91 * or the offset of the character immediately following a word.
93 * The caller is expected to initialize and free the callback state
94 * wbstate. The callback should first return offset 0 for the first boundary;
95 * then the offset of each subsequent word boundary; then the total length of
96 * the string to indicate the final boundary.
/*
 * Delegates to convert_case() with CaseTitle, forwarding the caller's
 * word-boundary callback wbnext.
 */
99 unicode_strtitle(char *dst
, size_t dstsize
, const char *src
, ssize_t srclen
,
100 WordBoundaryNext wbnext
, void *wbstate
)
102 return convert_case(dst
, dstsize
, src
, srclen
, CaseTitle
, wbnext
,
109 * Convert src to uppercase, and return the result length (not including
110 * the terminating NUL).
112 * String src must be encoded in UTF-8. If srclen < 0, src must be
113 * NUL-terminated.
115 * Result string is stored in dst, truncating if larger than dstsize. If
116 * dstsize is greater than the result length, dst will be NUL-terminated;
117 * otherwise it is not.
119 * If dstsize is zero, dst may be NULL. This is useful for calculating the
120 * required buffer size before allocating.
/*
 * Delegates to convert_case() with CaseUpper. Uppercasing takes no
 * word-boundary callback, so NULL is passed for both wbnext and wbstate.
 */
123 unicode_strupper(char *dst
, size_t dstsize
, const char *src
, ssize_t srclen
)
125 return convert_case(dst
, dstsize
, src
, srclen
, CaseUpper
, NULL
, NULL
);
129 * If str_casekind is CaseLower or CaseUpper, map each character in the string
130 * for which a mapping is available.
132 * If str_casekind is CaseTitle, map characters found on a word boundary to
133 * uppercase and other characters to lowercase.
/*
 * Shared worker behind unicode_strlower(), unicode_strtitle() and
 * unicode_strupper().
 *
 * Each code point of src is decoded with utf8_to_unicode() and looked up
 * with find_case_map(). A mapped code point is re-encoded into dst with
 * unicode_to_utf8(); an unmapped one has its source bytes copied with
 * memcpy(). A write is skipped when it would not fit within dstsize, and a
 * terminating NUL is stored only when result_len < dstsize.
 */
136 convert_case(char *dst
, size_t dstsize
, const char *src
, ssize_t srclen
,
137 CaseKind str_casekind
, WordBoundaryNext wbnext
, void *wbstate
)
139 /* character CaseKind varies while titlecasing */
140 CaseKind chr_casekind
= str_casekind
;
142 size_t result_len
= 0;
/*
 * wbnext and wbstate must both be supplied for CaseTitle, and must both be
 * absent for any other case kind.
 */
145 Assert((str_casekind
== CaseTitle
&& wbnext
&& wbstate
) ||
146 (str_casekind
!= CaseTitle
&& !wbnext
&& !wbstate
));
/* when titlecasing, fetch the first word boundary from the callback */
148 if (str_casekind
== CaseTitle
)
150 boundary
= wbnext(wbstate
);
151 Assert(boundary
== 0); /* start of text is always a boundary */
/*
 * A non-negative srclen bounds the scan; in either mode the scan also stops
 * at the first NUL byte.
 */
154 while ((srclen
< 0 || srcoff
< srclen
) && src
[srcoff
] != '\0')
156 pg_wchar u1
= utf8_to_unicode((unsigned char *) src
+ srcoff
);
157 int u1len
= unicode_utf8len(u1
);
158 const pg_case_map
*casemap
= find_case_map(u1
);
/*
 * Titlecasing: a character at a word boundary uses CaseUpper, and the next
 * boundary is then fetched from wbnext; other characters use CaseLower.
 */
160 if (str_casekind
== CaseTitle
)
162 if (srcoff
== boundary
)
164 chr_casekind
= CaseUpper
;
165 boundary
= wbnext(wbstate
);
168 chr_casekind
= CaseLower
;
171 /* perform mapping, update result_len, and write to dst */
174 pg_wchar u2
= casemap
->simplemap
[chr_casekind
];
/*
 * NOTE(review): u2len is declared pg_wchar while u1len, which also holds a
 * unicode_utf8len() result, is int -- consider making these consistent.
 */
175 pg_wchar u2len
= unicode_utf8len(u2
);
/* write the mapped character only if all of its bytes fit in dst */
177 if (result_len
+ u2len
<= dstsize
)
178 unicode_to_utf8(u2
, (unsigned char *) dst
+ result_len
);
184 /* no mapping; copy bytes from src */
185 if (result_len
+ u1len
<= dstsize
)
186 memcpy(dst
+ result_len
, src
+ srcoff
, u1len
);
/* NUL-terminate only when there is room left after the result */
194 if (result_len
< dstsize
)
195 dst
[result_len
] = '\0';
200 /* find entry in simple case map, if any */
201 static const pg_case_map
*
202 find_case_map(pg_wchar ucs
)
208 /* all chars < 0x80 are stored in array for fast lookup */
209 Assert(lengthof(case_map
) >= 0x80);
212 const pg_case_map
*map
= &case_map
[ucs
];
214 Assert(map
->codepoint
== ucs
);
218 /* otherwise, binary search */
220 max
= lengthof(case_map
) - 1;
223 mid
= (min
+ max
) / 2;
224 if (ucs
> case_map
[mid
].codepoint
)
226 else if (ucs
< case_map
[mid
].codepoint
)
229 return &case_map
[mid
];