jsonpath scanner: reentrant scanner
[pgsql.git] / src / common / unicode_category.c
bloba615a9056787cc6d8121d900e0d99d026c519afe
/*-------------------------------------------------------------------------
 * unicode_category.c
 *		Determine general category and character properties of Unicode
 *		characters. Encoding must be UTF8, where we assume that the pg_wchar
 *		representation is a code point.
 *
 * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/unicode_category.c
 *
 *-------------------------------------------------------------------------
 */
14 #ifndef FRONTEND
15 #include "postgres.h"
16 #else
17 #include "postgres_fe.h"
18 #endif
20 #include "common/unicode_category.h"
21 #include "common/unicode_category_table.h"
/*
 * Create bitmasks from pg_unicode_category values for efficient comparison of
 * multiple categories. For instance, PG_U_MN_MASK is a bitmask representing
 * the general category Mn; and PG_U_M_MASK represents general categories Mn,
 * Me, and Mc.
 *
 * The number of Unicode General Categories should never grow, so a 32-bit
 * mask is fine.
 *
 * The constant 1 is converted to uint32 before shifting, so the shift is
 * carried out on an unsigned operand and stays well defined for every bit
 * position of the 32-bit mask.
 */
#define PG_U_CATEGORY_MASK(X) (((uint32) 1) << (X))

/* Letters (L*); PG_U_LC_MASK groups the three cased-letter categories. */
#define PG_U_LU_MASK PG_U_CATEGORY_MASK(PG_U_UPPERCASE_LETTER)
#define PG_U_LL_MASK PG_U_CATEGORY_MASK(PG_U_LOWERCASE_LETTER)
#define PG_U_LT_MASK PG_U_CATEGORY_MASK(PG_U_TITLECASE_LETTER)
#define PG_U_LC_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK)
#define PG_U_LM_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_LETTER)
#define PG_U_LO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_LETTER)
#define PG_U_L_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK|PG_U_LM_MASK|\
					 PG_U_LO_MASK)

/* Marks (M*) */
#define PG_U_MN_MASK PG_U_CATEGORY_MASK(PG_U_NONSPACING_MARK)
#define PG_U_ME_MASK PG_U_CATEGORY_MASK(PG_U_ENCLOSING_MARK)
#define PG_U_MC_MASK PG_U_CATEGORY_MASK(PG_U_SPACING_MARK)
#define PG_U_M_MASK (PG_U_MN_MASK|PG_U_MC_MASK|PG_U_ME_MASK)

/* Numbers (N*) */
#define PG_U_ND_MASK PG_U_CATEGORY_MASK(PG_U_DECIMAL_NUMBER)
#define PG_U_NL_MASK PG_U_CATEGORY_MASK(PG_U_LETTER_NUMBER)
#define PG_U_NO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_NUMBER)
#define PG_U_N_MASK (PG_U_ND_MASK|PG_U_NL_MASK|PG_U_NO_MASK)

/* Punctuation (P*) */
#define PG_U_PC_MASK PG_U_CATEGORY_MASK(PG_U_CONNECTOR_PUNCTUATION)
#define PG_U_PD_MASK PG_U_CATEGORY_MASK(PG_U_DASH_PUNCTUATION)
#define PG_U_PS_MASK PG_U_CATEGORY_MASK(PG_U_OPEN_PUNCTUATION)
#define PG_U_PE_MASK PG_U_CATEGORY_MASK(PG_U_CLOSE_PUNCTUATION)
#define PG_U_PI_MASK PG_U_CATEGORY_MASK(PG_U_INITIAL_PUNCTUATION)
#define PG_U_PF_MASK PG_U_CATEGORY_MASK(PG_U_FINAL_PUNCTUATION)
#define PG_U_PO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_PUNCTUATION)
#define PG_U_P_MASK (PG_U_PC_MASK|PG_U_PD_MASK|PG_U_PS_MASK|PG_U_PE_MASK|\
					 PG_U_PI_MASK|PG_U_PF_MASK|PG_U_PO_MASK)

/* Symbols (S*) */
#define PG_U_SM_MASK PG_U_CATEGORY_MASK(PG_U_MATH_SYMBOL)
#define PG_U_SC_MASK PG_U_CATEGORY_MASK(PG_U_CURRENCY_SYMBOL)
#define PG_U_SK_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_SYMBOL)
#define PG_U_SO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_SYMBOL)
#define PG_U_S_MASK (PG_U_SM_MASK|PG_U_SC_MASK|PG_U_SK_MASK|PG_U_SO_MASK)

/* Separators (Z*) */
#define PG_U_ZS_MASK PG_U_CATEGORY_MASK(PG_U_SPACE_SEPARATOR)
#define PG_U_ZL_MASK PG_U_CATEGORY_MASK(PG_U_LINE_SEPARATOR)
#define PG_U_ZP_MASK PG_U_CATEGORY_MASK(PG_U_PARAGRAPH_SEPARATOR)
#define PG_U_Z_MASK (PG_U_ZS_MASK|PG_U_ZL_MASK|PG_U_ZP_MASK)

/* Other (C*) */
#define PG_U_CC_MASK PG_U_CATEGORY_MASK(PG_U_CONTROL)
#define PG_U_CF_MASK PG_U_CATEGORY_MASK(PG_U_FORMAT)
#define PG_U_CS_MASK PG_U_CATEGORY_MASK(PG_U_SURROGATE)
#define PG_U_CO_MASK PG_U_CATEGORY_MASK(PG_U_PRIVATE_USE)
#define PG_U_CN_MASK PG_U_CATEGORY_MASK(PG_U_UNASSIGNED)
#define PG_U_C_MASK (PG_U_CC_MASK|PG_U_CF_MASK|PG_U_CS_MASK|PG_U_CO_MASK|\
					 PG_U_CN_MASK)

/* Code point U+0009, the horizontal tab; treated as blank by pg_u_isblank(). */
#define PG_U_CHARACTER_TAB 0x09
78 static bool range_search(const pg_unicode_range *tbl, size_t size,
79 pg_wchar code);
82 * Unicode general category for the given codepoint.
84 pg_unicode_category
85 unicode_category(pg_wchar code)
87 int min = 0;
88 int mid;
89 int max = lengthof(unicode_categories) - 1;
91 Assert(code <= 0x10ffff);
93 if (code < 0x80)
94 return unicode_opt_ascii[code].category;
96 while (max >= min)
98 mid = (min + max) / 2;
99 if (code > unicode_categories[mid].last)
100 min = mid + 1;
101 else if (code < unicode_categories[mid].first)
102 max = mid - 1;
103 else
104 return unicode_categories[mid].category;
107 return PG_U_UNASSIGNED;
110 bool
111 pg_u_prop_alphabetic(pg_wchar code)
113 if (code < 0x80)
114 return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC;
116 return range_search(unicode_alphabetic,
117 lengthof(unicode_alphabetic),
118 code);
121 bool
122 pg_u_prop_lowercase(pg_wchar code)
124 if (code < 0x80)
125 return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE;
127 return range_search(unicode_lowercase,
128 lengthof(unicode_lowercase),
129 code);
132 bool
133 pg_u_prop_uppercase(pg_wchar code)
135 if (code < 0x80)
136 return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE;
138 return range_search(unicode_uppercase,
139 lengthof(unicode_uppercase),
140 code);
143 bool
144 pg_u_prop_cased(pg_wchar code)
146 uint32 category_mask;
148 if (code < 0x80)
149 return unicode_opt_ascii[code].properties & PG_U_PROP_CASED;
151 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
153 return category_mask & PG_U_LT_MASK ||
154 pg_u_prop_lowercase(code) ||
155 pg_u_prop_uppercase(code);
158 bool
159 pg_u_prop_case_ignorable(pg_wchar code)
161 if (code < 0x80)
162 return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE;
164 return range_search(unicode_case_ignorable,
165 lengthof(unicode_case_ignorable),
166 code);
169 bool
170 pg_u_prop_white_space(pg_wchar code)
172 if (code < 0x80)
173 return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE;
175 return range_search(unicode_white_space,
176 lengthof(unicode_white_space),
177 code);
180 bool
181 pg_u_prop_hex_digit(pg_wchar code)
183 if (code < 0x80)
184 return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT;
186 return range_search(unicode_hex_digit,
187 lengthof(unicode_hex_digit),
188 code);
191 bool
192 pg_u_prop_join_control(pg_wchar code)
194 if (code < 0x80)
195 return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL;
197 return range_search(unicode_join_control,
198 lengthof(unicode_join_control),
199 code);
/*
 * The following functions implement the Compatibility Properties described
 * at: http://www.unicode.org/reports/tr18/#Compatibility_Properties
 *
 * If 'posix' is true, implements the "POSIX Compatible" variant, otherwise
 * the "Standard" variant.
 */
210 bool
211 pg_u_isdigit(pg_wchar code, bool posix)
213 if (posix)
214 return ('0' <= code && code <= '9');
215 else
216 return unicode_category(code) == PG_U_DECIMAL_NUMBER;
219 bool
220 pg_u_isalpha(pg_wchar code)
222 return pg_u_prop_alphabetic(code);
225 bool
226 pg_u_isalnum(pg_wchar code, bool posix)
228 return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
231 bool
232 pg_u_isword(pg_wchar code)
234 uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
236 return
237 category_mask & (PG_U_M_MASK | PG_U_ND_MASK | PG_U_PC_MASK) ||
238 pg_u_isalpha(code) ||
239 pg_u_prop_join_control(code);
242 bool
243 pg_u_isupper(pg_wchar code)
245 return pg_u_prop_uppercase(code);
248 bool
249 pg_u_islower(pg_wchar code)
251 return pg_u_prop_lowercase(code);
254 bool
255 pg_u_isblank(pg_wchar code)
257 return code == PG_U_CHARACTER_TAB ||
258 unicode_category(code) == PG_U_SPACE_SEPARATOR;
261 bool
262 pg_u_iscntrl(pg_wchar code)
264 return unicode_category(code) == PG_U_CONTROL;
267 bool
268 pg_u_isgraph(pg_wchar code)
270 uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
272 if (category_mask & (PG_U_CC_MASK | PG_U_CS_MASK | PG_U_CN_MASK) ||
273 pg_u_isspace(code))
274 return false;
275 return true;
278 bool
279 pg_u_isprint(pg_wchar code)
281 pg_unicode_category category = unicode_category(code);
283 if (category == PG_U_CONTROL)
284 return false;
286 return pg_u_isgraph(code) || pg_u_isblank(code);
289 bool
290 pg_u_ispunct(pg_wchar code, bool posix)
292 uint32 category_mask;
294 if (posix)
296 if (pg_u_isalpha(code))
297 return false;
299 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
300 return category_mask & (PG_U_P_MASK | PG_U_S_MASK);
302 else
304 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
306 return category_mask & PG_U_P_MASK;
310 bool
311 pg_u_isspace(pg_wchar code)
313 return pg_u_prop_white_space(code);
316 bool
317 pg_u_isxdigit(pg_wchar code, bool posix)
319 if (posix)
320 return (('0' <= code && code <= '9') ||
321 ('A' <= code && code <= 'F') ||
322 ('a' <= code && code <= 'f'));
323 else
324 return unicode_category(code) == PG_U_DECIMAL_NUMBER ||
325 pg_u_prop_hex_digit(code);
329 * Description of Unicode general category.
331 const char *
332 unicode_category_string(pg_unicode_category category)
334 switch (category)
336 case PG_U_UNASSIGNED:
337 return "Unassigned";
338 case PG_U_UPPERCASE_LETTER:
339 return "Uppercase_Letter";
340 case PG_U_LOWERCASE_LETTER:
341 return "Lowercase_Letter";
342 case PG_U_TITLECASE_LETTER:
343 return "Titlecase_Letter";
344 case PG_U_MODIFIER_LETTER:
345 return "Modifier_Letter";
346 case PG_U_OTHER_LETTER:
347 return "Other_Letter";
348 case PG_U_NONSPACING_MARK:
349 return "Nonspacing_Mark";
350 case PG_U_ENCLOSING_MARK:
351 return "Enclosing_Mark";
352 case PG_U_SPACING_MARK:
353 return "Spacing_Mark";
354 case PG_U_DECIMAL_NUMBER:
355 return "Decimal_Number";
356 case PG_U_LETTER_NUMBER:
357 return "Letter_Number";
358 case PG_U_OTHER_NUMBER:
359 return "Other_Number";
360 case PG_U_SPACE_SEPARATOR:
361 return "Space_Separator";
362 case PG_U_LINE_SEPARATOR:
363 return "Line_Separator";
364 case PG_U_PARAGRAPH_SEPARATOR:
365 return "Paragraph_Separator";
366 case PG_U_CONTROL:
367 return "Control";
368 case PG_U_FORMAT:
369 return "Format";
370 case PG_U_PRIVATE_USE:
371 return "Private_Use";
372 case PG_U_SURROGATE:
373 return "Surrogate";
374 case PG_U_DASH_PUNCTUATION:
375 return "Dash_Punctuation";
376 case PG_U_OPEN_PUNCTUATION:
377 return "Open_Punctuation";
378 case PG_U_CLOSE_PUNCTUATION:
379 return "Close_Punctuation";
380 case PG_U_CONNECTOR_PUNCTUATION:
381 return "Connector_Punctuation";
382 case PG_U_OTHER_PUNCTUATION:
383 return "Other_Punctuation";
384 case PG_U_MATH_SYMBOL:
385 return "Math_Symbol";
386 case PG_U_CURRENCY_SYMBOL:
387 return "Currency_Symbol";
388 case PG_U_MODIFIER_SYMBOL:
389 return "Modifier_Symbol";
390 case PG_U_OTHER_SYMBOL:
391 return "Other_Symbol";
392 case PG_U_INITIAL_PUNCTUATION:
393 return "Initial_Punctuation";
394 case PG_U_FINAL_PUNCTUATION:
395 return "Final_Punctuation";
398 Assert(false);
399 return "Unrecognized"; /* keep compiler quiet */
403 * Short code for Unicode general category.
405 const char *
406 unicode_category_abbrev(pg_unicode_category category)
408 switch (category)
410 case PG_U_UNASSIGNED:
411 return "Cn";
412 case PG_U_UPPERCASE_LETTER:
413 return "Lu";
414 case PG_U_LOWERCASE_LETTER:
415 return "Ll";
416 case PG_U_TITLECASE_LETTER:
417 return "Lt";
418 case PG_U_MODIFIER_LETTER:
419 return "Lm";
420 case PG_U_OTHER_LETTER:
421 return "Lo";
422 case PG_U_NONSPACING_MARK:
423 return "Mn";
424 case PG_U_ENCLOSING_MARK:
425 return "Me";
426 case PG_U_SPACING_MARK:
427 return "Mc";
428 case PG_U_DECIMAL_NUMBER:
429 return "Nd";
430 case PG_U_LETTER_NUMBER:
431 return "Nl";
432 case PG_U_OTHER_NUMBER:
433 return "No";
434 case PG_U_SPACE_SEPARATOR:
435 return "Zs";
436 case PG_U_LINE_SEPARATOR:
437 return "Zl";
438 case PG_U_PARAGRAPH_SEPARATOR:
439 return "Zp";
440 case PG_U_CONTROL:
441 return "Cc";
442 case PG_U_FORMAT:
443 return "Cf";
444 case PG_U_PRIVATE_USE:
445 return "Co";
446 case PG_U_SURROGATE:
447 return "Cs";
448 case PG_U_DASH_PUNCTUATION:
449 return "Pd";
450 case PG_U_OPEN_PUNCTUATION:
451 return "Ps";
452 case PG_U_CLOSE_PUNCTUATION:
453 return "Pe";
454 case PG_U_CONNECTOR_PUNCTUATION:
455 return "Pc";
456 case PG_U_OTHER_PUNCTUATION:
457 return "Po";
458 case PG_U_MATH_SYMBOL:
459 return "Sm";
460 case PG_U_CURRENCY_SYMBOL:
461 return "Sc";
462 case PG_U_MODIFIER_SYMBOL:
463 return "Sk";
464 case PG_U_OTHER_SYMBOL:
465 return "So";
466 case PG_U_INITIAL_PUNCTUATION:
467 return "Pi";
468 case PG_U_FINAL_PUNCTUATION:
469 return "Pf";
472 Assert(false);
473 return "??"; /* keep compiler quiet */
477 * Binary search to test if given codepoint exists in one of the ranges in the
478 * given table.
480 static bool
481 range_search(const pg_unicode_range *tbl, size_t size, pg_wchar code)
483 int min = 0;
484 int mid;
485 int max = size - 1;
487 Assert(code <= 0x10ffff);
489 while (max >= min)
491 mid = (min + max) / 2;
492 if (code > tbl[mid].last)
493 min = mid + 1;
494 else if (code < tbl[mid].first)
495 max = mid - 1;
496 else
497 return true;
500 return false;