1 /*-------------------------------------------------------------------------
3 * Normalize a Unicode string
5 * This implements Unicode normalization, per the documentation at
6 * https://www.unicode.org/reports/tr15/.
8 * Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
11 * src/common/unicode_norm.c
13 *-------------------------------------------------------------------------
18 #include "postgres_fe.h"
21 #include "common/unicode_norm.h"
23 #include "common/unicode_norm_hashfunc.h"
24 #include "common/unicode_normprops_table.h"
25 #include "port/pg_bswap.h"
27 #include "common/unicode_norm_table.h"
31 #define ALLOC(size) palloc(size)
32 #define FREE(size) pfree(size)
34 #define ALLOC(size) malloc(size)
35 #define FREE(size) free(size)
38 /* Constants for calculations with Hangul characters */
39 #define SBASE 0xAC00 /* U+AC00 */
40 #define LBASE 0x1100 /* U+1100 */
41 #define VBASE 0x1161 /* U+1161 */
42 #define TBASE 0x11A7 /* U+11A7 */
46 #define NCOUNT VCOUNT * TCOUNT
47 #define SCOUNT LCOUNT * NCOUNT
50 /* comparison routine for bsearch() of decomposition lookup table. */
52 conv_compare(const void *p1
, const void *p2
)
57 v1
= *(const uint32
*) p1
;
58 v2
= ((const pg_unicode_decomposition
*) p2
)->codepoint
;
59 return (v1
> v2
) ? 1 : ((v1
== v2
) ? 0 : -1);
67 * Get the entry corresponding to code in the decomposition lookup table.
68 * The backend version of this code uses a perfect hash function for the
69 * lookup, while the frontend version uses a binary search.
71 static const pg_unicode_decomposition
*
72 get_code_entry(pg_wchar code
)
77 pg_unicode_decompinfo decompinfo
= UnicodeDecompInfo
;
80 * Compute the hash function. The hash key is the codepoint with the bytes
83 hashkey
= pg_hton32(code
);
84 h
= decompinfo
.hash(&hashkey
);
86 /* An out-of-range result implies no match */
87 if (h
< 0 || h
>= decompinfo
.num_decomps
)
91 * Since it's a perfect hash, we need only match to the specific codepoint
94 if (code
!= decompinfo
.decomps
[h
].codepoint
)
98 return &decompinfo
.decomps
[h
];
100 return bsearch(&(code
),
102 lengthof(UnicodeDecompMain
),
103 sizeof(pg_unicode_decomposition
),
109 * Get the combining class of the given codepoint.
112 get_canonical_class(pg_wchar code
)
114 const pg_unicode_decomposition
*entry
= get_code_entry(code
);
117 * If no entries are found, the character used is either an Hangul
118 * character or a character with a class of 0 and no decompositions.
123 return entry
->comb_class
;
127 * Given a decomposition entry looked up earlier, get the decomposed
130 * Note: the returned pointer can point to statically allocated buffer, and
131 * is only valid until next call to this function!
133 static const pg_wchar
*
134 get_code_decomposition(const pg_unicode_decomposition
*entry
, int *dec_size
)
138 if (DECOMPOSITION_IS_INLINE(entry
))
140 Assert(DECOMPOSITION_SIZE(entry
) == 1);
141 x
= (pg_wchar
) entry
->dec_index
;
147 *dec_size
= DECOMPOSITION_SIZE(entry
);
148 return &UnicodeDecomp_codepoints
[entry
->dec_index
];
153 * Calculate how many characters a given character will decompose to.
155 * This needs to recurse, if the character decomposes into characters that
156 * are, in turn, decomposable.
159 get_decomposed_size(pg_wchar code
, bool compat
)
161 const pg_unicode_decomposition
*entry
;
164 const uint32
*decomp
;
168 * Fast path for Hangul characters not stored in tables to save memory as
169 * decomposition is algorithmic. See
170 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
173 if (code
>= SBASE
&& code
< SBASE
+ SCOUNT
)
178 sindex
= code
- SBASE
;
179 tindex
= sindex
% TCOUNT
;
186 entry
= get_code_entry(code
);
189 * Just count current code if no other decompositions. A NULL entry is
190 * equivalent to a character with class 0 and no decompositions.
192 if (entry
== NULL
|| DECOMPOSITION_SIZE(entry
) == 0 ||
193 (!compat
&& DECOMPOSITION_IS_COMPAT(entry
)))
197 * If this entry has other decomposition codes look at them as well. First
198 * get its decomposition in the list of tables available.
200 decomp
= get_code_decomposition(entry
, &dec_size
);
201 for (i
= 0; i
< dec_size
; i
++)
203 uint32 lcode
= decomp
[i
];
205 size
+= get_decomposed_size(lcode
, compat
);
212 * Recompose a set of characters. For hangul characters, the calculation
213 * is algorithmic. For others, an inverse lookup at the decomposition
214 * table is necessary. Returns true if a recomposition can be done, and
218 recompose_code(uint32 start
, uint32 code
, uint32
*result
)
221 * Handle Hangul characters algorithmically, per the Unicode spec.
223 * Check if two current characters are L and V.
225 if (start
>= LBASE
&& start
< LBASE
+ LCOUNT
&&
226 code
>= VBASE
&& code
< VBASE
+ VCOUNT
)
228 /* make syllable of form LV */
229 uint32 lindex
= start
- LBASE
;
230 uint32 vindex
= code
- VBASE
;
232 *result
= SBASE
+ (lindex
* VCOUNT
+ vindex
) * TCOUNT
;
235 /* Check if two current characters are LV and T */
236 else if (start
>= SBASE
&& start
< (SBASE
+ SCOUNT
) &&
237 ((start
- SBASE
) % TCOUNT
) == 0 &&
238 code
>= TBASE
&& code
< (TBASE
+ TCOUNT
))
240 /* make syllable of form LVT */
241 uint32 tindex
= code
- TBASE
;
243 *result
= start
+ tindex
;
248 const pg_unicode_decomposition
*entry
;
251 * Do an inverse lookup of the decomposition tables to see if anything
252 * matches. The comparison just needs to be a perfect match on the
253 * sub-table of size two, because the start character has already been
254 * recomposed partially. This lookup uses a perfect hash function for
262 pg_unicode_recompinfo recompinfo
= UnicodeRecompInfo
;
265 * Compute the hash function. The hash key is formed by concatenating
266 * bytes of the two codepoints in network order. See also
267 * src/common/unicode/generate-unicode_norm_table.pl.
269 hashkey
= pg_hton64(((uint64
) start
<< 32) | (uint64
) code
);
270 h
= recompinfo
.hash(&hashkey
);
272 /* An out-of-range result implies no match */
273 if (h
< 0 || h
>= recompinfo
.num_recomps
)
276 inv_lookup_index
= recompinfo
.inverse_lookup
[h
];
277 entry
= &UnicodeDecompMain
[inv_lookup_index
];
279 if (start
== UnicodeDecomp_codepoints
[entry
->dec_index
] &&
280 code
== UnicodeDecomp_codepoints
[entry
->dec_index
+ 1])
282 *result
= entry
->codepoint
;
290 for (i
= 0; i
< lengthof(UnicodeDecompMain
); i
++)
292 entry
= &UnicodeDecompMain
[i
];
294 if (DECOMPOSITION_SIZE(entry
) != 2)
297 if (DECOMPOSITION_NO_COMPOSE(entry
))
300 if (start
== UnicodeDecomp_codepoints
[entry
->dec_index
] &&
301 code
== UnicodeDecomp_codepoints
[entry
->dec_index
+ 1])
303 *result
= entry
->codepoint
;
307 #endif /* !FRONTEND */
314 * Decompose the given code into the array given by caller. The
315 * decomposition begins at the position given by caller, saving one
316 * lookup on the decomposition table. The current position needs to be
317 * updated here to let the caller know from where to continue filling
318 * in the array result.
321 decompose_code(pg_wchar code
, bool compat
, pg_wchar
**result
, int *current
)
323 const pg_unicode_decomposition
*entry
;
325 const uint32
*decomp
;
329 * Fast path for Hangul characters not stored in tables to save memory as
330 * decomposition is algorithmic. See
331 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details
334 if (code
>= SBASE
&& code
< SBASE
+ SCOUNT
)
340 pg_wchar
*res
= *result
;
342 sindex
= code
- SBASE
;
343 l
= LBASE
+ sindex
/ (VCOUNT
* TCOUNT
);
344 v
= VBASE
+ (sindex
% (VCOUNT
* TCOUNT
)) / TCOUNT
;
345 tindex
= sindex
% TCOUNT
;
354 res
[*current
] = TBASE
+ tindex
;
361 entry
= get_code_entry(code
);
364 * Just fill in with the current decomposition if there are no
365 * decomposition codes to recurse to. A NULL entry is equivalent to a
366 * character with class 0 and no decompositions, so just leave also in
369 if (entry
== NULL
|| DECOMPOSITION_SIZE(entry
) == 0 ||
370 (!compat
&& DECOMPOSITION_IS_COMPAT(entry
)))
372 pg_wchar
*res
= *result
;
374 res
[*current
] = code
;
380 * If this entry has other decomposition codes look at them as well.
382 decomp
= get_code_decomposition(entry
, &dec_size
);
383 for (i
= 0; i
< dec_size
; i
++)
385 pg_wchar lcode
= (pg_wchar
) decomp
[i
];
387 /* Leave if no more decompositions */
388 decompose_code(lcode
, compat
, result
, current
);
393 * unicode_normalize - Normalize a Unicode string to the specified form.
395 * The input is a 0-terminated array of codepoints.
397 * In frontend, returns a 0-terminated array of codepoints, allocated with
398 * malloc. Or NULL if we run out of memory. In backend, the returned
399 * string is palloc'd instead, and OOM is reported with ereport().
402 unicode_normalize(UnicodeNormalizationForm form
, const pg_wchar
*input
)
404 bool compat
= (form
== UNICODE_NFKC
|| form
== UNICODE_NFKD
);
405 bool recompose
= (form
== UNICODE_NFC
|| form
== UNICODE_NFKC
);
406 pg_wchar
*decomp_chars
;
407 pg_wchar
*recomp_chars
;
413 /* variables for recomposition */
419 /* First, do character decomposition */
422 * Calculate how many characters long the decomposed version will be.
425 for (p
= input
; *p
; p
++)
426 decomp_size
+= get_decomposed_size(*p
, compat
);
428 decomp_chars
= (pg_wchar
*) ALLOC((decomp_size
+ 1) * sizeof(pg_wchar
));
429 if (decomp_chars
== NULL
)
433 * Now fill in each entry recursively. This needs a second pass on the
434 * decomposition table.
437 for (p
= input
; *p
; p
++)
438 decompose_code(*p
, compat
, &decomp_chars
, ¤t_size
);
439 decomp_chars
[decomp_size
] = '\0';
440 Assert(decomp_size
== current_size
);
442 /* Leave if there is nothing to decompose */
443 if (decomp_size
== 0)
447 * Now apply canonical ordering.
449 for (count
= 1; count
< decomp_size
; count
++)
451 pg_wchar prev
= decomp_chars
[count
- 1];
452 pg_wchar next
= decomp_chars
[count
];
454 const uint8 prevClass
= get_canonical_class(prev
);
455 const uint8 nextClass
= get_canonical_class(next
);
458 * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html)
459 * annex 4, a sequence of two adjacent characters in a string is an
460 * exchangeable pair if the combining class (from the Unicode
461 * Character Database) for the first character is greater than the
462 * combining class for the second, and the second is not a starter. A
463 * character is a starter if its combining class is 0.
465 if (prevClass
== 0 || nextClass
== 0)
468 if (prevClass
<= nextClass
)
471 /* exchange can happen */
472 tmp
= decomp_chars
[count
- 1];
473 decomp_chars
[count
- 1] = decomp_chars
[count
];
474 decomp_chars
[count
] = tmp
;
476 /* backtrack to check again */
485 * The last phase of NFC and NFKC is the recomposition of the reordered
486 * Unicode string using combining classes. The recomposed string cannot be
487 * longer than the decomposed one, so make the allocation of the output
488 * string based on that assumption.
490 recomp_chars
= (pg_wchar
*) ALLOC((decomp_size
+ 1) * sizeof(pg_wchar
));
497 last_class
= -1; /* this eliminates a special check */
500 starter_ch
= recomp_chars
[0] = decomp_chars
[0];
502 for (count
= 1; count
< decomp_size
; count
++)
504 pg_wchar ch
= decomp_chars
[count
];
505 int ch_class
= get_canonical_class(ch
);
508 if (last_class
< ch_class
&&
509 recompose_code(starter_ch
, ch
, &composite
))
511 recomp_chars
[starter_pos
] = composite
;
512 starter_ch
= composite
;
514 else if (ch_class
== 0)
516 starter_pos
= target_pos
;
519 recomp_chars
[target_pos
++] = ch
;
523 last_class
= ch_class
;
524 recomp_chars
[target_pos
++] = ch
;
527 recomp_chars
[target_pos
] = (pg_wchar
) '\0';
535 * Normalization "quick check" algorithm; see
536 * <http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms>
539 /* We only need this in the backend. */
542 static const pg_unicode_normprops
*
543 qc_hash_lookup(pg_wchar ch
, const pg_unicode_norminfo
*norminfo
)
549 * Compute the hash function. The hash key is the codepoint with the bytes
552 hashkey
= pg_hton32(ch
);
553 h
= norminfo
->hash(&hashkey
);
555 /* An out-of-range result implies no match */
556 if (h
< 0 || h
>= norminfo
->num_normprops
)
560 * Since it's a perfect hash, we need only match to the specific codepoint
563 if (ch
!= norminfo
->normprops
[h
].codepoint
)
567 return &norminfo
->normprops
[h
];
571 * Look up the normalization quick check character property
573 static UnicodeNormalizationQC
574 qc_is_allowed(UnicodeNormalizationForm form
, pg_wchar ch
)
576 const pg_unicode_normprops
*found
= NULL
;
581 found
= qc_hash_lookup(ch
, &UnicodeNormInfo_NFC_QC
);
584 found
= qc_hash_lookup(ch
, &UnicodeNormInfo_NFKC_QC
);
592 return found
->quickcheck
;
594 return UNICODE_NORM_QC_YES
;
597 UnicodeNormalizationQC
598 unicode_is_normalized_quickcheck(UnicodeNormalizationForm form
, const pg_wchar
*input
)
600 uint8 lastCanonicalClass
= 0;
601 UnicodeNormalizationQC result
= UNICODE_NORM_QC_YES
;
604 * For the "D" forms, we don't run the quickcheck. We don't include the
605 * lookup tables for those because they are huge, checking for these
606 * particular forms is less common, and running the slow path is faster
607 * for the "D" forms than the "C" forms because you don't need to
608 * recompose, which is slow.
610 if (form
== UNICODE_NFD
|| form
== UNICODE_NFKD
)
611 return UNICODE_NORM_QC_MAYBE
;
613 for (const pg_wchar
*p
= input
; *p
; p
++)
616 uint8 canonicalClass
;
617 UnicodeNormalizationQC check
;
619 canonicalClass
= get_canonical_class(ch
);
620 if (lastCanonicalClass
> canonicalClass
&& canonicalClass
!= 0)
621 return UNICODE_NORM_QC_NO
;
623 check
= qc_is_allowed(form
, ch
);
624 if (check
== UNICODE_NORM_QC_NO
)
625 return UNICODE_NORM_QC_NO
;
626 else if (check
== UNICODE_NORM_QC_MAYBE
)
627 result
= UNICODE_NORM_QC_MAYBE
;
629 lastCanonicalClass
= canonicalClass
;
634 #endif /* !FRONTEND */