2 * Copyright (c) 2014 SGI.
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 /* Generator for a compact trie for unicode normalization */
21 #include <sys/types.h>
30 /* Default names of the in- and output files. */
32 #define AGE_NAME "DerivedAge.txt"
33 #define CCC_NAME "DerivedCombiningClass.txt"
34 #define PROP_NAME "DerivedCoreProperties.txt"
35 #define DATA_NAME "UnicodeData.txt"
36 #define FOLD_NAME "CaseFolding.txt"
37 #define NORM_NAME "NormalizationCorrections.txt"
38 #define TEST_NAME "NormalizationTest.txt"
39 #define UTF8_NAME "utf8data.h"
41 const char *age_name
= AGE_NAME
;
42 const char *ccc_name
= CCC_NAME
;
43 const char *prop_name
= PROP_NAME
;
44 const char *data_name
= DATA_NAME
;
45 const char *fold_name
= FOLD_NAME
;
46 const char *norm_name
= NORM_NAME
;
47 const char *test_name
= TEST_NAME
;
48 const char *utf8_name
= UTF8_NAME
;
52 /* An arbitrary line size limit on input lines. */
63 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
65 /* ------------------------------------------------------------------ */
68 * Unicode version numbers consist of three parts: major, minor, and a
69 * revision. These numbers are packed into an unsigned int to obtain
70 * a single version number.
72 * To save space in the generated trie, the unicode version is not
73 * stored directly, instead we calculate a generation number from the
74 * unicode versions seen in the DerivedAge file, and use that as an
75 * index into a table of unicode versions.
77 #define UNICODE_MAJ_SHIFT (16)
78 #define UNICODE_MIN_SHIFT (8)
80 #define UNICODE_MAJ_MAX ((unsigned short)-1)
81 #define UNICODE_MIN_MAX ((unsigned char)-1)
82 #define UNICODE_REV_MAX ((unsigned char)-1)
84 #define UNICODE_AGE(MAJ,MIN,REV) \
85 (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
86 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
87 ((unsigned int)(REV)))
92 unsigned int unicode_maxage
;
94 static int age_valid(unsigned int major
, unsigned int minor
,
95 unsigned int revision
)
97 if (major
> UNICODE_MAJ_MAX
)
99 if (minor
> UNICODE_MIN_MAX
)
101 if (revision
> UNICODE_REV_MAX
)
106 /* ------------------------------------------------------------------ */
111 * A compact binary tree, used to decode UTF-8 characters.
113 * Internal nodes are one byte for the node itself, and up to three
114 * bytes for an offset into the tree. The first byte contains the
115 * following information:
116 * NEXTBYTE - flag - advance to next byte if set
117 * BITNUM - 3 bit field - the bit number to tested
118 * OFFLEN - 2 bit field - number of bytes in the offset
119 * if offlen == 0 (non-branching node)
120 * RIGHTPATH - 1 bit field - set if the following node is for the
121 * right-hand path (tested bit is set)
122 * TRIENODE - 1 bit field - set if the following node is an internal
123 * node, otherwise it is a leaf node
124 * if offlen != 0 (branching node)
125 * LEFTNODE - 1 bit field - set if the left-hand node is internal
126 * RIGHTNODE - 1 bit field - set if the right-hand node is internal
128 * Due to the way utf8 works, there cannot be branching nodes with
129 * NEXTBYTE set, and moreover those nodes always have a righthand
132 typedef unsigned char utf8trie_t
;
134 #define NEXTBYTE 0x08
136 #define OFFLEN_SHIFT 4
137 #define RIGHTPATH 0x40
138 #define TRIENODE 0x80
139 #define RIGHTNODE 0x40
140 #define LEFTNODE 0x80
145 * The leaves of the trie are embedded in the trie, and so the same
146 * underlying datatype, unsigned char.
148 * leaf[0]: The unicode version, stored as a generation number that is
149 * an index into utf8agetab[]. With this we can filter code
150 * points based on the unicode version in which they were
151 * defined. The CCC of a non-defined code point is 0.
152 * leaf[1]: Canonical Combining Class. During normalization, we need
153 * to do a stable sort into ascending order of all characters
154 * with a non-zero CCC that occur between two characters with
155 * a CCC of 0, or at the begin or end of a string.
156 * The unicode standard guarantees that all CCC values are
157 * between 0 and 254 inclusive, which leaves 255 available as
159 * Code points with CCC 0 are known as stoppers.
160 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
161 * start of a NUL-terminated string that is the decomposition
163 * The CCC of a decomposable character is the same as the CCC
164 * of the first character of its decomposition.
165 * Some characters decompose as the empty string: these are
166 * characters with the Default_Ignorable_Code_Point property.
167 * These do affect normalization, as they all have CCC 0.
169 * The decompositions in the trie have been fully expanded.
171 * Casefolding, if applicable, is also done using decompositions.
173 typedef unsigned char utf8leaf_t
;
175 #define LEAF_GEN(LEAF) ((LEAF)[0])
176 #define LEAF_CCC(LEAF) ((LEAF)[1])
177 #define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
184 #define DECOMPOSE (255)
185 #define HANGUL ((char)(255))
187 #define UTF8HANGULLEAF (12)
190 static utf8leaf_t
*utf8nlookup(struct tree
*, unsigned char *,
191 const char *, size_t);
192 static utf8leaf_t
*utf8lookup(struct tree
*, unsigned char *, const char *);
194 unsigned char *utf8data
;
195 size_t utf8data_size
;
200 /* ------------------------------------------------------------------ */
205 * The UTF-8 encoding spreads the bits of a 32bit word over several
206 * bytes. This table gives the ranges that can be held and how they'd
209 * 0x00000000 0x0000007F: 0xxxxxxx
210 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
211 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
212 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
213 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
214 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
216 * There is an additional requirement on UTF-8, in that only the
217 * shortest representation of a 32bit value is to be used. A decoder
218 * must not decode sequences that do not satisfy this requirement.
219 * Thus the allowed ranges have a lower bound.
221 * 0x00000000 0x0000007F: 0xxxxxxx
222 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
223 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
224 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
225 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
226 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
228 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
229 * 17 planes of 65536 values. This limits the sequences actually seen
230 * even more, to just the following.
233 * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
234 * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
235 * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
237 * Even within those ranges not all values are allowed: the surrogates
238 * 0xd800 - 0xdfff should never be seen.
240 * Note that the longest sequence seen with valid usage is 4 bytes,
241 * the same a single UTF-32 character. This makes the UTF-8
242 * representation of Unicode strictly smaller than UTF-32.
244 * The shortest sequence requirement was introduced by:
245 * Corrigendum #1: UTF-8 Shortest Form
246 * It can be found here:
247 * http://www.unicode.org/versions/corrigendum1.html
251 #define UTF8_2_BITS 0xC0
252 #define UTF8_3_BITS 0xE0
253 #define UTF8_4_BITS 0xF0
254 #define UTF8_N_BITS 0x80
255 #define UTF8_2_MASK 0xE0
256 #define UTF8_3_MASK 0xF0
257 #define UTF8_4_MASK 0xF8
258 #define UTF8_N_MASK 0xC0
259 #define UTF8_V_MASK 0x3F
260 #define UTF8_V_SHIFT 6
262 static int utf8encode(char *str
, unsigned int val
)
269 } else if (val
< 0x800) {
270 str
[1] = val
& UTF8_V_MASK
;
271 str
[1] |= UTF8_N_BITS
;
272 val
>>= UTF8_V_SHIFT
;
274 str
[0] |= UTF8_2_BITS
;
276 } else if (val
< 0x10000) {
277 str
[2] = val
& UTF8_V_MASK
;
278 str
[2] |= UTF8_N_BITS
;
279 val
>>= UTF8_V_SHIFT
;
280 str
[1] = val
& UTF8_V_MASK
;
281 str
[1] |= UTF8_N_BITS
;
282 val
>>= UTF8_V_SHIFT
;
284 str
[0] |= UTF8_3_BITS
;
286 } else if (val
< 0x110000) {
287 str
[3] = val
& UTF8_V_MASK
;
288 str
[3] |= UTF8_N_BITS
;
289 val
>>= UTF8_V_SHIFT
;
290 str
[2] = val
& UTF8_V_MASK
;
291 str
[2] |= UTF8_N_BITS
;
292 val
>>= UTF8_V_SHIFT
;
293 str
[1] = val
& UTF8_V_MASK
;
294 str
[1] |= UTF8_N_BITS
;
295 val
>>= UTF8_V_SHIFT
;
297 str
[0] |= UTF8_4_BITS
;
300 printf("%#x: illegal val\n", val
);
306 static unsigned int utf8decode(const char *str
)
308 const unsigned char *s
= (const unsigned char*)str
;
309 unsigned int unichar
= 0;
313 } else if (*s
< UTF8_3_BITS
) {
314 unichar
= *s
++ & 0x1F;
315 unichar
<<= UTF8_V_SHIFT
;
316 unichar
|= *s
& 0x3F;
317 } else if (*s
< UTF8_4_BITS
) {
318 unichar
= *s
++ & 0x0F;
319 unichar
<<= UTF8_V_SHIFT
;
320 unichar
|= *s
++ & 0x3F;
321 unichar
<<= UTF8_V_SHIFT
;
322 unichar
|= *s
& 0x3F;
324 unichar
= *s
++ & 0x0F;
325 unichar
<<= UTF8_V_SHIFT
;
326 unichar
|= *s
++ & 0x3F;
327 unichar
<<= UTF8_V_SHIFT
;
328 unichar
|= *s
++ & 0x3F;
329 unichar
<<= UTF8_V_SHIFT
;
330 unichar
|= *s
& 0x3F;
335 static int utf32valid(unsigned int unichar
)
337 return unichar
< 0x110000;
340 #define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
351 int (*leaf_equal
)(void *, void *);
352 void (*leaf_print
)(void *, int);
353 int (*leaf_mark
)(void *);
354 int (*leaf_size
)(void *);
355 int *(*leaf_index
)(struct tree
*, void *);
356 unsigned char *(*leaf_emit
)(void *, unsigned char *);
357 int leafindex
[0x110000];
369 unsigned char bitnum
;
370 unsigned char nextbyte
;
371 unsigned char leftnode
;
372 unsigned char rightnode
;
373 unsigned int keybits
;
374 unsigned int keymask
;
378 * Example lookup function for a tree.
380 static void *lookup(struct tree
*tree
, const char *key
)
386 while (!leaf
&& node
) {
389 if (*key
& (1 << (node
->bitnum
& 7))) {
391 if (node
->rightnode
== NODE
) {
393 } else if (node
->rightnode
== LEAF
) {
400 if (node
->leftnode
== NODE
) {
402 } else if (node
->leftnode
== LEAF
) {
414 * A simple non-recursive tree walker: keep track of visits to the
415 * left and right branches in the leftmask and rightmask.
417 static void tree_walk(struct tree
*tree
)
420 unsigned int leftmask
;
421 unsigned int rightmask
;
422 unsigned int bitmask
;
424 int nodes
, singletons
, leaves
;
426 nodes
= singletons
= leaves
= 0;
428 printf("%s_%x root %p\n", tree
->type
, tree
->maxage
, tree
->root
);
429 if (tree
->childnode
== LEAF
) {
431 tree
->leaf_print(tree
->root
, indent
);
434 assert(tree
->childnode
== NODE
);
436 leftmask
= rightmask
= 0;
438 printf("%*snode @ %p bitnum %d nextbyte %d"
439 " left %p right %p mask %x bits %x\n",
441 node
->bitnum
, node
->nextbyte
,
442 node
->left
, node
->right
,
443 node
->keymask
, node
->keybits
);
445 if (!(node
->left
&& node
->right
))
449 bitmask
= 1 << node
->bitnum
;
450 if ((leftmask
& bitmask
) == 0) {
452 if (node
->leftnode
== LEAF
) {
454 tree
->leaf_print(node
->left
,
457 } else if (node
->left
) {
458 assert(node
->leftnode
== NODE
);
464 if ((rightmask
& bitmask
) == 0) {
465 rightmask
|= bitmask
;
466 if (node
->rightnode
== LEAF
) {
468 tree
->leaf_print(node
->right
,
471 } else if (node
->right
) {
472 assert(node
->rightnode
== NODE
);
478 leftmask
&= ~bitmask
;
479 rightmask
&= ~bitmask
;
485 printf("nodes %d leaves %d singletons %d\n",
486 nodes
, leaves
, singletons
);
490 * Allocate an initialize a new internal node.
492 static struct node
*alloc_node(struct node
*parent
)
497 node
= malloc(sizeof(*node
));
498 node
->left
= node
->right
= NULL
;
499 node
->parent
= parent
;
500 node
->leftnode
= NODE
;
501 node
->rightnode
= NODE
;
510 bitnum
= parent
->bitnum
;
511 if ((bitnum
& 7) == 0) {
512 node
->bitnum
= bitnum
+ 7 + 8;
515 node
->bitnum
= bitnum
- 1;
527 * Insert a new leaf into the tree, and collapse any subtrees that are
528 * fully populated and end in identical leaves. A nextbyte tagged
529 * internal node will not be removed to preserve the tree's integrity.
530 * Note that due to the structure of utf8, no nextbyte tagged node
531 * will be a candidate for removal.
533 static int insert(struct tree
*tree
, char *key
, int keylen
, void *leaf
)
540 assert(keylen
>= 1 && keylen
<= 4);
543 cursor
= &tree
->root
;
544 keybits
= 8 * keylen
;
546 /* Insert, creating path along the way. */
549 *cursor
= alloc_node(node
);
553 if (*key
& (1 << (node
->bitnum
& 7)))
554 cursor
= &node
->right
;
556 cursor
= &node
->left
;
561 /* Merge subtrees if possible. */
563 if (*key
& (1 << (node
->bitnum
& 7)))
564 node
->rightnode
= LEAF
;
566 node
->leftnode
= LEAF
;
569 if (node
->leftnode
== NODE
|| node
->rightnode
== NODE
)
574 if (! tree
->leaf_equal(node
->left
, node
->right
))
576 /* Keep left, drop right leaf. */
578 /* Check in parent */
579 parent
= node
->parent
;
583 tree
->childnode
= LEAF
;
584 } else if (parent
->left
== node
) {
586 parent
->leftnode
= LEAF
;
591 parent
->keymask
|= (1 << node
->bitnum
);
593 } else if (parent
->right
== node
) {
594 parent
->right
= leaf
;
595 parent
->rightnode
= LEAF
;
600 parent
->keymask
|= (1 << node
->bitnum
);
601 parent
->keybits
|= (1 << node
->bitnum
);
604 /* internal tree error */
611 /* Propagate keymasks up along singleton chains. */
613 parent
= node
->parent
;
616 /* Nix the mask for parents with two children. */
617 if (node
->keymask
== 0) {
620 } else if (parent
->left
&& parent
->right
) {
624 assert((parent
->keymask
& node
->keymask
) == 0);
625 parent
->keymask
|= node
->keymask
;
626 parent
->keymask
|= (1 << parent
->bitnum
);
627 parent
->keybits
|= node
->keybits
;
629 parent
->keybits
|= (1 << parent
->bitnum
);
638 * Prune internal nodes.
640 * Fully populated subtrees that end at the same leaf have already
641 * been collapsed. There are still internal nodes that have for both
642 * their left and right branches a sequence of singletons that make
643 * identical choices and end in identical leaves. The keymask and
644 * keybits collected in the nodes describe the choices made in these
645 * singleton chains. When they are identical for the left and right
646 * branch of a node, and the two leaves comare identical, the node in
647 * question can be removed.
649 * Note that nodes with the nextbyte tag set will not be removed by
650 * this to ensure tree integrity. Note as well that the structure of
651 * utf8 ensures that these nodes would not have been candidates for
652 * removal in any case.
654 static void prune(struct tree
*tree
)
662 unsigned int leftmask
;
663 unsigned int rightmask
;
664 unsigned int bitmask
;
668 printf("Pruning %s_%x\n", tree
->type
, tree
->maxage
);
671 if (tree
->childnode
== LEAF
)
676 leftmask
= rightmask
= 0;
681 if (node
->leftnode
== LEAF
)
683 if (node
->rightnode
== LEAF
)
691 if (left
->keymask
== 0)
693 if (right
->keymask
== 0)
695 if (left
->keymask
!= right
->keymask
)
697 if (left
->keybits
!= right
->keybits
)
701 assert(left
->left
|| left
->right
);
702 if (left
->leftnode
== LEAF
)
703 leftleaf
= left
->left
;
704 else if (left
->rightnode
== LEAF
)
705 leftleaf
= left
->right
;
708 else if (left
->right
)
715 assert(right
->left
|| right
->right
);
716 if (right
->leftnode
== LEAF
)
717 rightleaf
= right
->left
;
718 else if (right
->rightnode
== LEAF
)
719 rightleaf
= right
->right
;
720 else if (right
->left
)
722 else if (right
->right
)
723 right
= right
->right
;
727 if (! tree
->leaf_equal(leftleaf
, rightleaf
))
730 * This node has identical singleton-only subtrees.
733 parent
= node
->parent
;
736 if (parent
->left
== node
)
738 else if (parent
->right
== node
)
739 parent
->right
= left
;
742 left
->parent
= parent
;
743 left
->keymask
|= (1 << node
->bitnum
);
746 bitmask
= 1 << node
->bitnum
;
747 leftmask
&= ~bitmask
;
748 rightmask
&= ~bitmask
;
749 if (node
->leftnode
== NODE
&& node
->left
) {
754 } else if (node
->rightnode
== NODE
&& node
->right
) {
763 /* Propagate keymasks up along singleton chains. */
766 bitmask
= 1 << node
->bitnum
;
767 leftmask
&= ~bitmask
;
768 rightmask
&= ~bitmask
;
770 if (node
->left
&& node
->right
)
774 node
->keymask
|= left
->keymask
;
775 node
->keybits
|= left
->keybits
;
779 node
->keymask
|= right
->keymask
;
780 node
->keybits
|= right
->keybits
;
782 node
->keymask
|= (1 << node
->bitnum
);
785 bitmask
= 1 << node
->bitnum
;
786 leftmask
&= ~bitmask
;
787 rightmask
&= ~bitmask
;
790 bitmask
= 1 << node
->bitnum
;
791 if ((leftmask
& bitmask
) == 0 &&
792 node
->leftnode
== NODE
&&
796 } else if ((rightmask
& bitmask
) == 0 &&
797 node
->rightnode
== NODE
&&
799 rightmask
|= bitmask
;
802 leftmask
&= ~bitmask
;
803 rightmask
&= ~bitmask
;
808 printf("Pruned %d nodes\n", count
);
812 * Mark the nodes in the tree that lead to leaves that must be
815 static void mark_nodes(struct tree
*tree
)
819 unsigned int leftmask
;
820 unsigned int rightmask
;
821 unsigned int bitmask
;
826 printf("Marking %s_%x\n", tree
->type
, tree
->maxage
);
827 if (tree
->childnode
== LEAF
)
830 assert(tree
->childnode
== NODE
);
832 leftmask
= rightmask
= 0;
834 bitmask
= 1 << node
->bitnum
;
835 if ((leftmask
& bitmask
) == 0) {
837 if (node
->leftnode
== LEAF
) {
839 if (tree
->leaf_mark(node
->left
)) {
841 while (n
&& !n
->mark
) {
847 } else if (node
->left
) {
848 assert(node
->leftnode
== NODE
);
853 if ((rightmask
& bitmask
) == 0) {
854 rightmask
|= bitmask
;
855 if (node
->rightnode
== LEAF
) {
857 if (tree
->leaf_mark(node
->right
)) {
859 while (n
&& !n
->mark
) {
865 } else if (node
->right
) {
866 assert(node
->rightnode
== NODE
);
871 leftmask
&= ~bitmask
;
872 rightmask
&= ~bitmask
;
876 /* second pass: left siblings and singletons */
878 assert(tree
->childnode
== NODE
);
880 leftmask
= rightmask
= 0;
882 bitmask
= 1 << node
->bitnum
;
883 if ((leftmask
& bitmask
) == 0) {
885 if (node
->leftnode
== LEAF
) {
887 if (tree
->leaf_mark(node
->left
)) {
889 while (n
&& !n
->mark
) {
895 } else if (node
->left
) {
896 assert(node
->leftnode
== NODE
);
898 if (!node
->mark
&& node
->parent
->mark
) {
905 if ((rightmask
& bitmask
) == 0) {
906 rightmask
|= bitmask
;
907 if (node
->rightnode
== LEAF
) {
909 if (tree
->leaf_mark(node
->right
)) {
911 while (n
&& !n
->mark
) {
917 } else if (node
->right
) {
918 assert(node
->rightnode
== NODE
);
920 if (!node
->mark
&& node
->parent
->mark
&&
921 !node
->parent
->left
) {
928 leftmask
&= ~bitmask
;
929 rightmask
&= ~bitmask
;
934 printf("Marked %d nodes\n", marked
);
938 * Compute the index of each node and leaf, which is the offset in the
939 * emitted trie. These values must be pre-computed because relative
940 * offsets between nodes are used to navigate the tree.
942 static int index_nodes(struct tree
*tree
, int index
)
945 unsigned int leftmask
;
946 unsigned int rightmask
;
947 unsigned int bitmask
;
951 /* Align to a cache line (or half a cache line?). */
959 printf("Indexing %s_%x: %d\n", tree
->type
, tree
->maxage
, index
);
960 if (tree
->childnode
== LEAF
) {
961 index
+= tree
->leaf_size(tree
->root
);
965 assert(tree
->childnode
== NODE
);
967 leftmask
= rightmask
= 0;
972 if (node
->index
!= index
)
977 bitmask
= 1 << node
->bitnum
;
978 if (node
->mark
&& (leftmask
& bitmask
) == 0) {
980 if (node
->leftnode
== LEAF
) {
982 *tree
->leaf_index(tree
, node
->left
) =
984 index
+= tree
->leaf_size(node
->left
);
986 } else if (node
->left
) {
987 assert(node
->leftnode
== NODE
);
993 if (node
->mark
&& (rightmask
& bitmask
) == 0) {
994 rightmask
|= bitmask
;
995 if (node
->rightnode
== LEAF
) {
997 *tree
->leaf_index(tree
, node
->right
) = index
;
998 index
+= tree
->leaf_size(node
->right
);
1000 } else if (node
->right
) {
1001 assert(node
->rightnode
== NODE
);
1007 leftmask
&= ~bitmask
;
1008 rightmask
&= ~bitmask
;
1009 node
= node
->parent
;
1014 /* Round up to a multiple of 16 */
1018 printf("Final index %d\n", index
);
1023 * Mark the nodes in a subtree, helper for size_nodes().
1025 static int mark_subtree(struct node
*node
)
1029 if (!node
|| node
->mark
)
1032 node
->index
= node
->parent
->index
;
1034 if (node
->leftnode
== NODE
)
1035 changed
+= mark_subtree(node
->left
);
1036 if (node
->rightnode
== NODE
)
1037 changed
+= mark_subtree(node
->right
);
1042 * Compute the size of nodes and leaves. We start by assuming that
1043 * each node needs to store a three-byte offset. The indexes of the
1044 * nodes are calculated based on that, and then this function is
1045 * called to see if the sizes of some nodes can be reduced. This is
1046 * repeated until no more changes are seen.
1048 static int size_nodes(struct tree
*tree
)
1054 unsigned int leftmask
;
1055 unsigned int rightmask
;
1056 unsigned int bitmask
;
1057 unsigned int pathbits
;
1058 unsigned int pathmask
;
1070 printf("Sizing %s_%x\n", tree
->type
, tree
->maxage
);
1071 if (tree
->childnode
== LEAF
)
1074 assert(tree
->childnode
== NODE
);
1078 leftmask
= rightmask
= 0;
1083 if (!node
->left
|| !node
->right
) {
1086 if (node
->rightnode
== NODE
) {
1088 * If the right node is not marked,
1089 * look for a corresponding node in
1090 * the next tree. Such a node need
1093 right
= node
->right
;
1095 while (!right
->mark
) {
1098 while (n
->bitnum
!= node
->bitnum
) {
1099 nbit
= 1 << n
->bitnum
;
1100 if (!(pathmask
& nbit
))
1102 if (pathbits
& nbit
) {
1103 if (n
->rightnode
== LEAF
)
1107 if (n
->leftnode
== LEAF
)
1112 if (n
->bitnum
!= node
->bitnum
)
1118 /* Make sure the right node is marked. */
1120 changed
+= mark_subtree(right
);
1121 offset
= right
->index
- node
->index
;
1123 offset
= *tree
->leaf_index(tree
, node
->right
);
1124 offset
-= node
->index
;
1126 assert(offset
>= 0);
1127 assert(offset
<= 0xffffff);
1128 if (offset
<= 0xff) {
1130 } else if (offset
<= 0xffff) {
1132 } else { /* offset <= 0xffffff */
1136 if (node
->size
!= size
|| node
->offset
!= offset
) {
1138 node
->offset
= offset
;
1143 bitmask
= 1 << node
->bitnum
;
1144 pathmask
|= bitmask
;
1145 if (node
->mark
&& (leftmask
& bitmask
) == 0) {
1146 leftmask
|= bitmask
;
1147 if (node
->leftnode
== LEAF
) {
1149 } else if (node
->left
) {
1150 assert(node
->leftnode
== NODE
);
1156 if (node
->mark
&& (rightmask
& bitmask
) == 0) {
1157 rightmask
|= bitmask
;
1158 pathbits
|= bitmask
;
1159 if (node
->rightnode
== LEAF
) {
1160 assert(node
->right
);
1161 } else if (node
->right
) {
1162 assert(node
->rightnode
== NODE
);
1168 leftmask
&= ~bitmask
;
1169 rightmask
&= ~bitmask
;
1170 pathmask
&= ~bitmask
;
1171 pathbits
&= ~bitmask
;
1172 node
= node
->parent
;
1178 printf("Found %d changes\n", changed
);
1183 * Emit a trie for the given tree into the data array.
1185 static void emit(struct tree
*tree
, unsigned char *data
)
1188 unsigned int leftmask
;
1189 unsigned int rightmask
;
1190 unsigned int bitmask
;
1201 nodes
[0] = nodes
[1] = nodes
[2] = nodes
[3] = 0;
1204 index
= tree
->index
;
1208 printf("Emitting %s_%x\n", tree
->type
, tree
->maxage
);
1209 if (tree
->childnode
== LEAF
) {
1211 tree
->leaf_emit(tree
->root
, data
);
1212 size
= tree
->leaf_size(tree
->root
);
1218 assert(tree
->childnode
== NODE
);
1220 leftmask
= rightmask
= 0;
1224 assert(node
->offset
!= -1);
1225 assert(node
->index
== index
);
1230 byte
|= (node
->bitnum
& BITNUM
);
1231 if (node
->left
&& node
->right
) {
1232 if (node
->leftnode
== NODE
)
1234 if (node
->rightnode
== NODE
)
1236 if (node
->offset
<= 0xff)
1238 else if (node
->offset
<= 0xffff)
1243 offset
= node
->offset
;
1244 byte
|= offlen
<< OFFLEN_SHIFT
;
1248 *data
++ = offset
& 0xff;
1252 } else if (node
->left
) {
1253 if (node
->leftnode
== NODE
)
1258 } else if (node
->right
) {
1260 if (node
->rightnode
== NODE
)
1270 bitmask
= 1 << node
->bitnum
;
1271 if (node
->mark
&& (leftmask
& bitmask
) == 0) {
1272 leftmask
|= bitmask
;
1273 if (node
->leftnode
== LEAF
) {
1275 data
= tree
->leaf_emit(node
->left
,
1277 size
= tree
->leaf_size(node
->left
);
1281 } else if (node
->left
) {
1282 assert(node
->leftnode
== NODE
);
1288 if (node
->mark
&& (rightmask
& bitmask
) == 0) {
1289 rightmask
|= bitmask
;
1290 if (node
->rightnode
== LEAF
) {
1291 assert(node
->right
);
1292 data
= tree
->leaf_emit(node
->right
,
1294 size
= tree
->leaf_size(node
->right
);
1298 } else if (node
->right
) {
1299 assert(node
->rightnode
== NODE
);
1305 leftmask
&= ~bitmask
;
1306 rightmask
&= ~bitmask
;
1307 node
= node
->parent
;
1313 printf("Emitted %d (%d) leaves",
1315 printf(" %d (%d+%d+%d+%d) nodes",
1316 nodes
[0] + nodes
[1] + nodes
[2] + nodes
[3],
1317 nodes
[0], nodes
[1], nodes
[2], nodes
[3]);
1318 printf(" %d total\n", index
- tree
->index
);
1322 /* ------------------------------------------------------------------ */
1327 * We need to keep track of the Canonical Combining Class, the Age,
1328 * and decompositions for a code point.
1330 * For the Age, we store the index into the ages table. Effectively
1331 * this is a generation number that the table maps to a unicode
1334 * The correction field is used to indicate that this entry is in the
1335 * corrections array, which contains decompositions that were
1336 * corrected in later revisions. The value of the correction field is
1337 * the Unicode version in which the mapping was corrected.
1339 struct unicode_data
{
1344 unsigned int *utf32nfdi
;
1345 unsigned int *utf32nfdicf
;
1350 struct unicode_data unicode_data
[0x110000];
1351 struct unicode_data
*corrections
;
1352 int corrections_count
;
1354 struct tree
*nfdi_tree
;
1355 struct tree
*nfdicf_tree
;
1361 * Check the corrections array to see if this entry was corrected at
1364 static struct unicode_data
*corrections_lookup(struct unicode_data
*u
)
1368 for (i
= 0; i
!= corrections_count
; i
++)
1369 if (u
->code
== corrections
[i
].code
)
1370 return &corrections
[i
];
1374 static int nfdi_equal(void *l
, void *r
)
1376 struct unicode_data
*left
= l
;
1377 struct unicode_data
*right
= r
;
1379 if (left
->gen
!= right
->gen
)
1381 if (left
->ccc
!= right
->ccc
)
1383 if (left
->utf8nfdi
&& right
->utf8nfdi
&&
1384 strcmp(left
->utf8nfdi
, right
->utf8nfdi
) == 0)
1386 if (left
->utf8nfdi
|| right
->utf8nfdi
)
1391 static int nfdicf_equal(void *l
, void *r
)
1393 struct unicode_data
*left
= l
;
1394 struct unicode_data
*right
= r
;
1396 if (left
->gen
!= right
->gen
)
1398 if (left
->ccc
!= right
->ccc
)
1400 if (left
->utf8nfdicf
&& right
->utf8nfdicf
&&
1401 strcmp(left
->utf8nfdicf
, right
->utf8nfdicf
) == 0)
1403 if (left
->utf8nfdicf
&& right
->utf8nfdicf
)
1405 if (left
->utf8nfdicf
|| right
->utf8nfdicf
)
1407 if (left
->utf8nfdi
&& right
->utf8nfdi
&&
1408 strcmp(left
->utf8nfdi
, right
->utf8nfdi
) == 0)
1410 if (left
->utf8nfdi
|| right
->utf8nfdi
)
1415 static void nfdi_print(void *l
, int indent
)
1417 struct unicode_data
*leaf
= l
;
1419 printf("%*sleaf @ %p code %X ccc %d gen %d", indent
, "", leaf
,
1420 leaf
->code
, leaf
->ccc
, leaf
->gen
);
1422 if (leaf
->utf8nfdi
&& leaf
->utf8nfdi
[0] == HANGUL
)
1423 printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1424 else if (leaf
->utf8nfdi
)
1425 printf(" nfdi \"%s\"", (const char*)leaf
->utf8nfdi
);
1430 static void nfdicf_print(void *l
, int indent
)
1432 struct unicode_data
*leaf
= l
;
1434 printf("%*sleaf @ %p code %X ccc %d gen %d", indent
, "", leaf
,
1435 leaf
->code
, leaf
->ccc
, leaf
->gen
);
1437 if (leaf
->utf8nfdicf
)
1438 printf(" nfdicf \"%s\"", (const char*)leaf
->utf8nfdicf
);
1439 else if (leaf
->utf8nfdi
&& leaf
->utf8nfdi
[0] == HANGUL
)
1440 printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1441 else if (leaf
->utf8nfdi
)
1442 printf(" nfdi \"%s\"", (const char*)leaf
->utf8nfdi
);
1446 static int nfdi_mark(void *l
)
1451 static int nfdicf_mark(void *l
)
1453 struct unicode_data
*leaf
= l
;
1455 if (leaf
->utf8nfdicf
)
1460 static int correction_mark(void *l
)
1462 struct unicode_data
*leaf
= l
;
1464 return leaf
->correction
;
1467 static int nfdi_size(void *l
)
1469 struct unicode_data
*leaf
= l
;
1472 if (HANGUL_SYLLABLE(leaf
->code
))
1474 else if (leaf
->utf8nfdi
)
1475 size
+= strlen(leaf
->utf8nfdi
) + 1;
1479 static int nfdicf_size(void *l
)
1481 struct unicode_data
*leaf
= l
;
1484 if (HANGUL_SYLLABLE(leaf
->code
))
1486 else if (leaf
->utf8nfdicf
)
1487 size
+= strlen(leaf
->utf8nfdicf
) + 1;
1488 else if (leaf
->utf8nfdi
)
1489 size
+= strlen(leaf
->utf8nfdi
) + 1;
1493 static int *nfdi_index(struct tree
*tree
, void *l
)
1495 struct unicode_data
*leaf
= l
;
1497 return &tree
->leafindex
[leaf
->code
];
1500 static int *nfdicf_index(struct tree
*tree
, void *l
)
1502 struct unicode_data
*leaf
= l
;
1504 return &tree
->leafindex
[leaf
->code
];
1507 static unsigned char *nfdi_emit(void *l
, unsigned char *data
)
1509 struct unicode_data
*leaf
= l
;
1512 *data
++ = leaf
->gen
;
1514 if (HANGUL_SYLLABLE(leaf
->code
)) {
1515 *data
++ = DECOMPOSE
;
1517 } else if (leaf
->utf8nfdi
) {
1518 *data
++ = DECOMPOSE
;
1519 s
= (unsigned char*)leaf
->utf8nfdi
;
1520 while ((*data
++ = *s
++) != 0)
1523 *data
++ = leaf
->ccc
;
1528 static unsigned char *nfdicf_emit(void *l
, unsigned char *data
)
1530 struct unicode_data
*leaf
= l
;
1533 *data
++ = leaf
->gen
;
1535 if (HANGUL_SYLLABLE(leaf
->code
)) {
1536 *data
++ = DECOMPOSE
;
1538 } else if (leaf
->utf8nfdicf
) {
1539 *data
++ = DECOMPOSE
;
1540 s
= (unsigned char*)leaf
->utf8nfdicf
;
1541 while ((*data
++ = *s
++) != 0)
1543 } else if (leaf
->utf8nfdi
) {
1544 *data
++ = DECOMPOSE
;
1545 s
= (unsigned char*)leaf
->utf8nfdi
;
1546 while ((*data
++ = *s
++) != 0)
1549 *data
++ = leaf
->ccc
;
1554 static void utf8_create(struct unicode_data
*data
)
1561 if (data
->utf8nfdi
) {
1562 assert(data
->utf8nfdi
[0] == HANGUL
);
1567 um
= data
->utf32nfdi
;
1569 for (i
= 0; um
[i
]; i
++)
1570 u
+= utf8encode(u
, um
[i
]);
1572 data
->utf8nfdi
= strdup(utf
);
1575 um
= data
->utf32nfdicf
;
1577 for (i
= 0; um
[i
]; i
++)
1578 u
+= utf8encode(u
, um
[i
]);
1580 if (!data
->utf8nfdi
|| strcmp(data
->utf8nfdi
, utf
))
1581 data
->utf8nfdicf
= strdup(utf
);
1585 static void utf8_init(void)
1587 unsigned int unichar
;
1590 for (unichar
= 0; unichar
!= 0x110000; unichar
++)
1591 utf8_create(&unicode_data
[unichar
]);
1593 for (i
= 0; i
!= corrections_count
; i
++)
1594 utf8_create(&corrections
[i
]);
1597 static void trees_init(void)
1599 struct unicode_data
*data
;
1600 unsigned int maxage
;
1601 unsigned int nextage
;
1606 /* Count the number of different ages. */
1608 nextage
= (unsigned int)-1;
1612 for (i
= 0; i
<= corrections_count
; i
++) {
1613 data
= &corrections
[i
];
1614 if (nextage
< data
->correction
&&
1615 data
->correction
< maxage
)
1616 nextage
= data
->correction
;
1621 /* Two trees per age: nfdi and nfdicf */
1622 trees_count
= count
* 2;
1623 trees
= calloc(trees_count
, sizeof(struct tree
));
1625 /* Assign ages to the trees. */
1626 count
= trees_count
;
1627 nextage
= (unsigned int)-1;
1630 trees
[--count
].maxage
= maxage
;
1631 trees
[--count
].maxage
= maxage
;
1633 for (i
= 0; i
<= corrections_count
; i
++) {
1634 data
= &corrections
[i
];
1635 if (nextage
< data
->correction
&&
1636 data
->correction
< maxage
)
1637 nextage
= data
->correction
;
1641 /* The ages assigned above are off by one. */
1642 for (i
= 0; i
!= trees_count
; i
++) {
1644 while (ages
[j
] < trees
[i
].maxage
)
1646 trees
[i
].maxage
= ages
[j
-1];
1649 /* Set up the forwarding between trees. */
1650 trees
[trees_count
-2].next
= &trees
[trees_count
-1];
1651 trees
[trees_count
-1].leaf_mark
= nfdi_mark
;
1652 trees
[trees_count
-2].leaf_mark
= nfdicf_mark
;
1653 for (i
= 0; i
!= trees_count
-2; i
+= 2) {
1654 trees
[i
].next
= &trees
[trees_count
-2];
1655 trees
[i
].leaf_mark
= correction_mark
;
1656 trees
[i
+1].next
= &trees
[trees_count
-1];
1657 trees
[i
+1].leaf_mark
= correction_mark
;
1660 /* Assign the callouts. */
1661 for (i
= 0; i
!= trees_count
; i
+= 2) {
1662 trees
[i
].type
= "nfdicf";
1663 trees
[i
].leaf_equal
= nfdicf_equal
;
1664 trees
[i
].leaf_print
= nfdicf_print
;
1665 trees
[i
].leaf_size
= nfdicf_size
;
1666 trees
[i
].leaf_index
= nfdicf_index
;
1667 trees
[i
].leaf_emit
= nfdicf_emit
;
1669 trees
[i
+1].type
= "nfdi";
1670 trees
[i
+1].leaf_equal
= nfdi_equal
;
1671 trees
[i
+1].leaf_print
= nfdi_print
;
1672 trees
[i
+1].leaf_size
= nfdi_size
;
1673 trees
[i
+1].leaf_index
= nfdi_index
;
1674 trees
[i
+1].leaf_emit
= nfdi_emit
;
1678 for (i
= 0; i
!= trees_count
; i
++)
1679 trees
[i
].childnode
= NODE
;
1682 static void trees_populate(void)
1684 struct unicode_data
*data
;
1685 unsigned int unichar
;
1690 for (i
= 0; i
!= trees_count
; i
++) {
1692 printf("Populating %s_%x\n",
1693 trees
[i
].type
, trees
[i
].maxage
);
1695 for (unichar
= 0; unichar
!= 0x110000; unichar
++) {
1696 if (unicode_data
[unichar
].gen
< 0)
1698 keylen
= utf8encode(keyval
, unichar
);
1699 data
= corrections_lookup(&unicode_data
[unichar
]);
1700 if (data
->correction
<= trees
[i
].maxage
)
1701 data
= &unicode_data
[unichar
];
1702 insert(&trees
[i
], keyval
, keylen
, data
);
1707 static void trees_reduce(void)
1713 for (i
= 0; i
!= trees_count
; i
++)
1715 for (i
= 0; i
!= trees_count
; i
++)
1716 mark_nodes(&trees
[i
]);
1719 for (i
= 0; i
!= trees_count
; i
++)
1720 size
= index_nodes(&trees
[i
], size
);
1722 for (i
= 0; i
!= trees_count
; i
++)
1723 changed
+= size_nodes(&trees
[i
]);
1726 utf8data
= calloc(size
, 1);
1727 utf8data_size
= size
;
1728 for (i
= 0; i
!= trees_count
; i
++)
1729 emit(&trees
[i
], utf8data
);
1732 for (i
= 0; i
!= trees_count
; i
++) {
1733 printf("%s_%x idx %d\n",
1734 trees
[i
].type
, trees
[i
].maxage
, trees
[i
].index
);
1738 nfdi
= utf8data
+ trees
[trees_count
-1].index
;
1739 nfdicf
= utf8data
+ trees
[trees_count
-2].index
;
1741 nfdi_tree
= &trees
[trees_count
-1];
1742 nfdicf_tree
= &trees
[trees_count
-2];
1745 static void verify(struct tree
*tree
)
1747 struct unicode_data
*data
;
1749 unsigned int unichar
;
1751 unsigned char hangul
[UTF8HANGULLEAF
];
1756 printf("Verifying %s_%x\n", tree
->type
, tree
->maxage
);
1757 nocf
= strcmp(tree
->type
, "nfdicf");
1759 for (unichar
= 0; unichar
!= 0x110000; unichar
++) {
1761 data
= corrections_lookup(&unicode_data
[unichar
]);
1762 if (data
->correction
<= tree
->maxage
)
1763 data
= &unicode_data
[unichar
];
1764 utf8encode(key
,unichar
);
1765 leaf
= utf8lookup(tree
, hangul
, key
);
1768 if (data
->gen
!= -1)
1770 if (unichar
< 0xd800 || unichar
> 0xdfff)
1773 if (unichar
>= 0xd800 && unichar
<= 0xdfff)
1775 if (data
->gen
== -1)
1777 if (data
->gen
!= LEAF_GEN(leaf
))
1779 if (LEAF_CCC(leaf
) == DECOMPOSE
) {
1780 if (HANGUL_SYLLABLE(data
->code
)) {
1781 if (data
->utf8nfdi
[0] != HANGUL
)
1784 if (!data
->utf8nfdi
) {
1786 } else if (strcmp(data
->utf8nfdi
,
1791 if (!data
->utf8nfdicf
&&
1794 } else if (data
->utf8nfdicf
) {
1795 if (strcmp(data
->utf8nfdicf
,
1798 } else if (strcmp(data
->utf8nfdi
,
1803 } else if (data
->ccc
!= LEAF_CCC(leaf
)) {
1808 printf("%X code %X gen %d ccc %d"
1810 unichar
, data
->code
, data
->gen
,
1814 printf(" gen %d ccc %d"
1818 LEAF_CCC(leaf
) == DECOMPOSE
?
1819 LEAF_STR(leaf
) : "");
1826 static void trees_verify(void)
1830 for (i
= 0; i
!= trees_count
; i
++)
1834 /* ------------------------------------------------------------------ */
1836 static void help(void)
1838 printf("Usage: %s [options]\n", argv0
);
1840 printf("This program creates an a data trie used for parsing and\n");
1841 printf("normalization of UTF-8 strings. The trie is derived from\n");
1842 printf("a set of input files from the Unicode character database\n");
1843 printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1845 printf("The generated tree supports two normalization forms:\n");
1847 printf("\tnfdi:\n");
1848 printf("\t- Apply unicode normalization form NFD.\n");
1849 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1851 printf("\tnfdicf:\n");
1852 printf("\t- Apply unicode normalization form NFD.\n");
1853 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1854 printf("\t- Apply a full casefold (C + F).\n");
1856 printf("These forms were chosen as being most useful when dealing\n");
1857 printf("with file names: NFD catches most cases where characters\n");
1858 printf("should be considered equivalent. The ignorables are mostly\n");
1859 printf("invisible, making names hard to type.\n");
1861 printf("The options to specify the files to be used are listed\n");
1862 printf("below with their default values, which are the names used\n");
1863 printf("by version 11.0.0 of the Unicode Character Database.\n");
1865 printf("The input files:\n");
1866 printf("\t-a %s\n", AGE_NAME
);
1867 printf("\t-c %s\n", CCC_NAME
);
1868 printf("\t-p %s\n", PROP_NAME
);
1869 printf("\t-d %s\n", DATA_NAME
);
1870 printf("\t-f %s\n", FOLD_NAME
);
1871 printf("\t-n %s\n", NORM_NAME
);
1873 printf("Additionally, the generated tables are tested using:\n");
1874 printf("\t-t %s\n", TEST_NAME
);
1876 printf("Finally, the output file:\n");
1877 printf("\t-o %s\n", UTF8_NAME
);
1881 static void usage(void)
1887 static void open_fail(const char *name
, int error
)
1889 printf("Error %d opening %s: %s\n", error
, name
, strerror(error
));
1893 static void file_fail(const char *filename
)
1895 printf("Error parsing %s\n", filename
);
1899 static void line_fail(const char *filename
, const char *line
)
1901 printf("Error parsing %s:%s\n", filename
, line
);
1905 /* ------------------------------------------------------------------ */
1907 static void print_utf32(unsigned int *utf32str
)
1911 for (i
= 0; utf32str
[i
]; i
++)
1912 printf(" %X", utf32str
[i
]);
1915 static void print_utf32nfdi(unsigned int unichar
)
1917 printf(" %X ->", unichar
);
1918 print_utf32(unicode_data
[unichar
].utf32nfdi
);
1922 static void print_utf32nfdicf(unsigned int unichar
)
1924 printf(" %X ->", unichar
);
1925 print_utf32(unicode_data
[unichar
].utf32nfdicf
);
1929 /* ------------------------------------------------------------------ */
1931 static void age_init(void)
1936 unsigned int unichar
;
1939 unsigned int revision
;
1945 printf("Parsing %s\n", age_name
);
1947 file
= fopen(age_name
, "r");
1949 open_fail(age_name
, errno
);
1953 while (fgets(line
, LINESIZE
, file
)) {
1954 ret
= sscanf(line
, "# Age=V%d_%d_%d",
1955 &major
, &minor
, &revision
);
1959 printf(" Age V%d_%d_%d\n",
1960 major
, minor
, revision
);
1961 if (!age_valid(major
, minor
, revision
))
1962 line_fail(age_name
, line
);
1965 ret
= sscanf(line
, "# Age=V%d_%d", &major
, &minor
);
1969 printf(" Age V%d_%d\n", major
, minor
);
1970 if (!age_valid(major
, minor
, 0))
1971 line_fail(age_name
, line
);
1976 /* We must have found something above. */
1978 printf("%d age entries\n", ages_count
);
1979 if (ages_count
== 0 || ages_count
> MAXGEN
)
1980 file_fail(age_name
);
1982 /* There is a 0 entry. */
1984 ages
= calloc(ages_count
+ 1, sizeof(*ages
));
1985 /* And a guard entry. */
1986 ages
[ages_count
] = (unsigned int)-1;
1991 while (fgets(line
, LINESIZE
, file
)) {
1992 ret
= sscanf(line
, "# Age=V%d_%d_%d",
1993 &major
, &minor
, &revision
);
1996 UNICODE_AGE(major
, minor
, revision
);
1998 printf(" Age V%d_%d_%d = gen %d\n",
1999 major
, minor
, revision
, gen
);
2000 if (!age_valid(major
, minor
, revision
))
2001 line_fail(age_name
, line
);
2004 ret
= sscanf(line
, "# Age=V%d_%d", &major
, &minor
);
2006 ages
[++gen
] = UNICODE_AGE(major
, minor
, 0);
2008 printf(" Age V%d_%d = %d\n",
2010 if (!age_valid(major
, minor
, 0))
2011 line_fail(age_name
, line
);
2014 ret
= sscanf(line
, "%X..%X ; %d.%d #",
2015 &first
, &last
, &major
, &minor
);
2017 for (unichar
= first
; unichar
<= last
; unichar
++)
2018 unicode_data
[unichar
].gen
= gen
;
2019 count
+= 1 + last
- first
;
2021 printf(" %X..%X gen %d\n", first
, last
, gen
);
2022 if (!utf32valid(first
) || !utf32valid(last
))
2023 line_fail(age_name
, line
);
2026 ret
= sscanf(line
, "%X ; %d.%d #", &unichar
, &major
, &minor
);
2028 unicode_data
[unichar
].gen
= gen
;
2031 printf(" %X gen %d\n", unichar
, gen
);
2032 if (!utf32valid(unichar
))
2033 line_fail(age_name
, line
);
2037 unicode_maxage
= ages
[gen
];
2040 /* Nix surrogate block */
2042 printf(" Removing surrogate block D800..DFFF\n");
2043 for (unichar
= 0xd800; unichar
<= 0xdfff; unichar
++)
2044 unicode_data
[unichar
].gen
= -1;
2047 printf("Found %d entries\n", count
);
2049 file_fail(age_name
);
2052 static void ccc_init(void)
2057 unsigned int unichar
;
2063 printf("Parsing %s\n", ccc_name
);
2065 file
= fopen(ccc_name
, "r");
2067 open_fail(ccc_name
, errno
);
2070 while (fgets(line
, LINESIZE
, file
)) {
2071 ret
= sscanf(line
, "%X..%X ; %d #", &first
, &last
, &value
);
2073 for (unichar
= first
; unichar
<= last
; unichar
++) {
2074 unicode_data
[unichar
].ccc
= value
;
2078 printf(" %X..%X ccc %d\n", first
, last
, value
);
2079 if (!utf32valid(first
) || !utf32valid(last
))
2080 line_fail(ccc_name
, line
);
2083 ret
= sscanf(line
, "%X ; %d #", &unichar
, &value
);
2085 unicode_data
[unichar
].ccc
= value
;
2088 printf(" %X ccc %d\n", unichar
, value
);
2089 if (!utf32valid(unichar
))
2090 line_fail(ccc_name
, line
);
2097 printf("Found %d entries\n", count
);
2099 file_fail(ccc_name
);
2102 static int ignore_compatibility_form(char *type
)
2105 char *ignored_types
[] = {"font", "noBreak", "initial", "medial",
2106 "final", "isolated", "circle", "super",
2107 "sub", "vertical", "wide", "narrow",
2108 "small", "square", "fraction", "compat"};
2110 for (i
= 0 ; i
< ARRAY_SIZE(ignored_types
); i
++)
2111 if (strcmp(type
, ignored_types
[i
]) == 0)
2116 static void nfdi_init(void)
2119 unsigned int unichar
;
2120 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2129 printf("Parsing %s\n", data_name
);
2130 file
= fopen(data_name
, "r");
2132 open_fail(data_name
, errno
);
2135 while (fgets(line
, LINESIZE
, file
)) {
2136 ret
= sscanf(line
, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2140 if (!utf32valid(unichar
))
2141 line_fail(data_name
, line
);
2144 /* skip over <tag> */
2147 while (*++s
!= '>');
2149 if(ignore_compatibility_form(type
))
2152 /* decode the decomposition into UTF-32 */
2155 mapping
[i
] = strtoul(s
, &s
, 16);
2156 if (!utf32valid(mapping
[i
]))
2157 line_fail(data_name
, line
);
2162 um
= malloc(i
* sizeof(unsigned int));
2163 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2164 unicode_data
[unichar
].utf32nfdi
= um
;
2167 print_utf32nfdi(unichar
);
2172 printf("Found %d entries\n", count
);
2174 file_fail(data_name
);
2177 static void nfdicf_init(void)
2180 unsigned int unichar
;
2181 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2190 printf("Parsing %s\n", fold_name
);
2191 file
= fopen(fold_name
, "r");
2193 open_fail(fold_name
, errno
);
2196 while (fgets(line
, LINESIZE
, file
)) {
2197 ret
= sscanf(line
, "%X; %c; %[^;];", &unichar
, &status
, buf0
);
2200 if (!utf32valid(unichar
))
2201 line_fail(fold_name
, line
);
2202 /* Use the C+F casefold. */
2203 if (status
!= 'C' && status
!= 'F')
2211 mapping
[i
] = strtoul(s
, &s
, 16);
2212 if (!utf32valid(mapping
[i
]))
2213 line_fail(fold_name
, line
);
2218 um
= malloc(i
* sizeof(unsigned int));
2219 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2220 unicode_data
[unichar
].utf32nfdicf
= um
;
2223 print_utf32nfdicf(unichar
);
2228 printf("Found %d entries\n", count
);
2230 file_fail(fold_name
);
2233 static void ignore_init(void)
2236 unsigned int unichar
;
2244 printf("Parsing %s\n", prop_name
);
2245 file
= fopen(prop_name
, "r");
2247 open_fail(prop_name
, errno
);
2250 while (fgets(line
, LINESIZE
, file
)) {
2251 ret
= sscanf(line
, "%X..%X ; %s # ", &first
, &last
, buf0
);
2253 if (strcmp(buf0
, "Default_Ignorable_Code_Point"))
2255 if (!utf32valid(first
) || !utf32valid(last
))
2256 line_fail(prop_name
, line
);
2257 for (unichar
= first
; unichar
<= last
; unichar
++) {
2258 free(unicode_data
[unichar
].utf32nfdi
);
2259 um
= malloc(sizeof(unsigned int));
2261 unicode_data
[unichar
].utf32nfdi
= um
;
2262 free(unicode_data
[unichar
].utf32nfdicf
);
2263 um
= malloc(sizeof(unsigned int));
2265 unicode_data
[unichar
].utf32nfdicf
= um
;
2269 printf(" %X..%X Default_Ignorable_Code_Point\n",
2273 ret
= sscanf(line
, "%X ; %s # ", &unichar
, buf0
);
2275 if (strcmp(buf0
, "Default_Ignorable_Code_Point"))
2277 if (!utf32valid(unichar
))
2278 line_fail(prop_name
, line
);
2279 free(unicode_data
[unichar
].utf32nfdi
);
2280 um
= malloc(sizeof(unsigned int));
2282 unicode_data
[unichar
].utf32nfdi
= um
;
2283 free(unicode_data
[unichar
].utf32nfdicf
);
2284 um
= malloc(sizeof(unsigned int));
2286 unicode_data
[unichar
].utf32nfdicf
= um
;
2288 printf(" %X Default_Ignorable_Code_Point\n",
2297 printf("Found %d entries\n", count
);
2299 file_fail(prop_name
);
2302 static void corrections_init(void)
2305 unsigned int unichar
;
2308 unsigned int revision
;
2311 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2318 printf("Parsing %s\n", norm_name
);
2319 file
= fopen(norm_name
, "r");
2321 open_fail(norm_name
, errno
);
2324 while (fgets(line
, LINESIZE
, file
)) {
2325 ret
= sscanf(line
, "%X;%[^;];%[^;];%d.%d.%d #",
2326 &unichar
, buf0
, buf1
,
2327 &major
, &minor
, &revision
);
2330 if (!utf32valid(unichar
) || !age_valid(major
, minor
, revision
))
2331 line_fail(norm_name
, line
);
2334 corrections
= calloc(count
, sizeof(struct unicode_data
));
2335 corrections_count
= count
;
2339 while (fgets(line
, LINESIZE
, file
)) {
2340 ret
= sscanf(line
, "%X;%[^;];%[^;];%d.%d.%d #",
2341 &unichar
, buf0
, buf1
,
2342 &major
, &minor
, &revision
);
2345 if (!utf32valid(unichar
) || !age_valid(major
, minor
, revision
))
2346 line_fail(norm_name
, line
);
2347 corrections
[count
] = unicode_data
[unichar
];
2348 assert(corrections
[count
].code
== unichar
);
2349 age
= UNICODE_AGE(major
, minor
, revision
);
2350 corrections
[count
].correction
= age
;
2355 mapping
[i
] = strtoul(s
, &s
, 16);
2356 if (!utf32valid(mapping
[i
]))
2357 line_fail(norm_name
, line
);
2362 um
= malloc(i
* sizeof(unsigned int));
2363 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2364 corrections
[count
].utf32nfdi
= um
;
2367 printf(" %X -> %s -> %s V%d_%d_%d\n",
2368 unichar
, buf0
, buf1
, major
, minor
, revision
);
2374 printf("Found %d entries\n", count
);
2376 file_fail(norm_name
);
2379 /* ------------------------------------------------------------------ */
2382 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2384 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2385 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2394 * NCount = 588 (VCount * TCount)
2395 * SCount = 11172 (LCount * NCount)
2398 * SIndex = s - SBase
2400 * LV (Canonical/Full)
2401 * LIndex = SIndex / NCount
2402 * VIndex = (Sindex % NCount) / TCount
2403 * LPart = LBase + LIndex
2404 * VPart = VBase + VIndex
2407 * LVIndex = (SIndex / TCount) * TCount
2408 * TIndex = (Sindex % TCount)
2409 * LVPart = SBase + LVIndex
2410 * TPart = TBase + TIndex
2413 * LIndex = SIndex / NCount
2414 * VIndex = (Sindex % NCount) / TCount
2415 * TIndex = (Sindex % TCount)
2416 * LPart = LBase + LIndex
2417 * VPart = VBase + VIndex
2418 * if (TIndex == 0) {
2419 * d = <LPart, VPart>
2421 * TPart = TBase + TIndex
2422 * d = <LPart, VPart, TPart>
2427 static void hangul_decompose(void)
2429 unsigned int sb
= 0xAC00;
2430 unsigned int lb
= 0x1100;
2431 unsigned int vb
= 0x1161;
2432 unsigned int tb
= 0x11a7;
2433 /* unsigned int lc = 19; */
2434 unsigned int vc
= 21;
2435 unsigned int tc
= 28;
2436 unsigned int nc
= (vc
* tc
);
2437 /* unsigned int sc = (lc * nc); */
2438 unsigned int unichar
;
2439 unsigned int mapping
[4];
2445 printf("Decomposing hangul\n");
2448 for (unichar
= 0xAC00; unichar
<= 0xD7A3; unichar
++) {
2449 unsigned int si
= unichar
- sb
;
2450 unsigned int li
= si
/ nc
;
2451 unsigned int vi
= (si
% nc
) / tc
;
2452 unsigned int ti
= si
% tc
;
2455 mapping
[i
++] = lb
+ li
;
2456 mapping
[i
++] = vb
+ vi
;
2458 mapping
[i
++] = tb
+ ti
;
2461 assert(!unicode_data
[unichar
].utf32nfdi
);
2462 um
= malloc(i
* sizeof(unsigned int));
2463 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2464 unicode_data
[unichar
].utf32nfdi
= um
;
2466 assert(!unicode_data
[unichar
].utf32nfdicf
);
2467 um
= malloc(i
* sizeof(unsigned int));
2468 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2469 unicode_data
[unichar
].utf32nfdicf
= um
;
2472 * Add a cookie as a reminder that the hangul syllable
2473 * decompositions must not be stored in the generated
2476 unicode_data
[unichar
].utf8nfdi
= malloc(2);
2477 unicode_data
[unichar
].utf8nfdi
[0] = HANGUL
;
2478 unicode_data
[unichar
].utf8nfdi
[1] = '\0';
2481 print_utf32nfdi(unichar
);
2486 printf("Created %d entries\n", count
);
2489 static void nfdi_decompose(void)
2491 unsigned int unichar
;
2492 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2501 printf("Decomposing nfdi\n");
2504 for (unichar
= 0; unichar
!= 0x110000; unichar
++) {
2505 if (!unicode_data
[unichar
].utf32nfdi
)
2510 um
= unicode_data
[unichar
].utf32nfdi
;
2512 dc
= unicode_data
[*um
].utf32nfdi
;
2514 for (j
= 0; dc
[j
]; j
++)
2515 mapping
[i
++] = dc
[j
];
2525 free(unicode_data
[unichar
].utf32nfdi
);
2526 um
= malloc(i
* sizeof(unsigned int));
2527 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2528 unicode_data
[unichar
].utf32nfdi
= um
;
2530 /* Add this decomposition to nfdicf if there is no entry. */
2531 if (!unicode_data
[unichar
].utf32nfdicf
) {
2532 um
= malloc(i
* sizeof(unsigned int));
2533 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2534 unicode_data
[unichar
].utf32nfdicf
= um
;
2537 print_utf32nfdi(unichar
);
2541 printf("Processed %d entries\n", count
);
2544 static void nfdicf_decompose(void)
2546 unsigned int unichar
;
2547 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2556 printf("Decomposing nfdicf\n");
2558 for (unichar
= 0; unichar
!= 0x110000; unichar
++) {
2559 if (!unicode_data
[unichar
].utf32nfdicf
)
2564 um
= unicode_data
[unichar
].utf32nfdicf
;
2566 dc
= unicode_data
[*um
].utf32nfdicf
;
2568 for (j
= 0; dc
[j
]; j
++)
2569 mapping
[i
++] = dc
[j
];
2579 free(unicode_data
[unichar
].utf32nfdicf
);
2580 um
= malloc(i
* sizeof(unsigned int));
2581 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2582 unicode_data
[unichar
].utf32nfdicf
= um
;
2585 print_utf32nfdicf(unichar
);
2589 printf("Processed %d entries\n", count
);
2592 /* ------------------------------------------------------------------ */
2594 int utf8agemax(struct tree
*, const char *);
2595 int utf8nagemax(struct tree
*, const char *, size_t);
2596 int utf8agemin(struct tree
*, const char *);
2597 int utf8nagemin(struct tree
*, const char *, size_t);
2598 ssize_t
utf8len(struct tree
*, const char *);
2599 ssize_t
utf8nlen(struct tree
*, const char *, size_t);
2601 int utf8cursor(struct utf8cursor
*, struct tree
*, const char *);
2602 int utf8ncursor(struct utf8cursor
*, struct tree
*, const char *, size_t);
2603 int utf8byte(struct utf8cursor
*);
2606 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2608 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2609 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2618 * NCount = 588 (VCount * TCount)
2619 * SCount = 11172 (LCount * NCount)
2622 * SIndex = s - SBase
2624 * LV (Canonical/Full)
2625 * LIndex = SIndex / NCount
2626 * VIndex = (Sindex % NCount) / TCount
2627 * LPart = LBase + LIndex
2628 * VPart = VBase + VIndex
2631 * LVIndex = (SIndex / TCount) * TCount
2632 * TIndex = (Sindex % TCount)
2633 * LVPart = SBase + LVIndex
2634 * TPart = TBase + TIndex
2637 * LIndex = SIndex / NCount
2638 * VIndex = (Sindex % NCount) / TCount
2639 * TIndex = (Sindex % TCount)
2640 * LPart = LBase + LIndex
2641 * VPart = VBase + VIndex
2642 * if (TIndex == 0) {
2643 * d = <LPart, VPart>
2645 * TPart = TBase + TIndex
2646 * d = <LPart, VPart, TPart>
2658 #define NC (VC * TC)
2659 #define SC (LC * NC)
2661 /* Algorithmic decomposition of hangul syllable. */
2662 static utf8leaf_t
*utf8hangul(const char *str
, unsigned char *hangul
)
2670 /* Calculate the SI, LI, VI, and TI values. */
2671 si
= utf8decode(str
) - SB
;
2673 vi
= (si
% NC
) / TC
;
2676 /* Fill in base of leaf. */
2679 LEAF_CCC(h
) = DECOMPOSE
;
2682 /* Add LPart, a 3-byte UTF-8 sequence. */
2683 h
+= utf8encode((char *)h
, li
+ LB
);
2685 /* Add VPart, a 3-byte UTF-8 sequence. */
2686 h
+= utf8encode((char *)h
, vi
+ VB
);
2688 /* Add TPart if required, also a 3-byte UTF-8 sequence. */
2690 h
+= utf8encode((char *)h
, ti
+ TB
);
2692 /* Terminate string. */
2699 * Use trie to scan s, touching at most len bytes.
2700 * Returns the leaf if one exists, NULL otherwise.
2702 * A non-NULL return guarantees that the UTF-8 sequence starting at s
2703 * is well-formed and corresponds to a known unicode code point. The
2704 * shorthand for this will be "is valid UTF-8 unicode".
2706 static utf8leaf_t
*utf8nlookup(struct tree
*tree
, unsigned char *hangul
,
2707 const char *s
, size_t len
)
2720 trie
= utf8data
+ tree
->index
;
2722 offlen
= (*trie
& OFFLEN
) >> OFFLEN_SHIFT
;
2723 if (*trie
& NEXTBYTE
) {
2728 mask
= 1 << (*trie
& BITNUM
);
2732 /* Right node at offset of trie */
2733 node
= (*trie
& RIGHTNODE
);
2734 offset
= trie
[offlen
];
2737 offset
|= trie
[offlen
];
2740 } else if (*trie
& RIGHTPATH
) {
2741 /* Right node after this node */
2742 node
= (*trie
& TRIENODE
);
2745 /* No right node. */
2751 /* Left node after this node. */
2752 node
= (*trie
& LEFTNODE
);
2754 } else if (*trie
& RIGHTPATH
) {
2758 /* Left node after this node */
2759 node
= (*trie
& TRIENODE
);
2765 * Hangul decomposition is done algorithmically. These are the
2766 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2767 * always 3 bytes long, so s has been advanced twice, and the
2768 * start of the sequence is at s-2.
2770 if (LEAF_CCC(trie
) == DECOMPOSE
&& LEAF_STR(trie
)[0] == HANGUL
)
2771 trie
= utf8hangul(s
- 2, hangul
);
2776 * Use trie to scan s.
2777 * Returns the leaf if one exists, NULL otherwise.
2779 * Forwards to trie_nlookup().
2781 static utf8leaf_t
*utf8lookup(struct tree
*tree
, unsigned char *hangul
,
2784 return utf8nlookup(tree
, hangul
, s
, (size_t)-1);
2788 * Return the number of bytes used by the current UTF-8 sequence.
2789 * Assumes the input points to the first byte of a valid UTF-8
2792 static inline int utf8clen(const char *s
)
2794 unsigned char c
= *s
;
2795 return 1 + (c
>= 0xC0) + (c
>= 0xE0) + (c
>= 0xF0);
2799 * Maximum age of any character in s.
2800 * Return -1 if s is not valid UTF-8 unicode.
2801 * Return 0 if only non-assigned code points are used.
2803 int utf8agemax(struct tree
*tree
, const char *s
)
2808 unsigned char hangul
[UTF8HANGULLEAF
];
2814 leaf
= utf8lookup(tree
, hangul
, s
);
2817 leaf_age
= ages
[LEAF_GEN(leaf
)];
2818 if (leaf_age
<= tree
->maxage
&& leaf_age
> age
)
2826 * Minimum age of any character in s.
2827 * Return -1 if s is not valid UTF-8 unicode.
2828 * Return 0 if non-assigned code points are used.
2830 int utf8agemin(struct tree
*tree
, const char *s
)
2835 unsigned char hangul
[UTF8HANGULLEAF
];
2841 leaf
= utf8lookup(tree
, hangul
, s
);
2844 leaf_age
= ages
[LEAF_GEN(leaf
)];
2845 if (leaf_age
<= tree
->maxage
&& leaf_age
< age
)
2853 * Maximum age of any character in s, touch at most len bytes.
2854 * Return -1 if s is not valid UTF-8 unicode.
2856 int utf8nagemax(struct tree
*tree
, const char *s
, size_t len
)
2861 unsigned char hangul
[UTF8HANGULLEAF
];
2867 leaf
= utf8nlookup(tree
, hangul
, s
, len
);
2870 leaf_age
= ages
[LEAF_GEN(leaf
)];
2871 if (leaf_age
<= tree
->maxage
&& leaf_age
> age
)
2880 * Maximum age of any character in s, touch at most len bytes.
2881 * Return -1 if s is not valid UTF-8 unicode.
2883 int utf8nagemin(struct tree
*tree
, const char *s
, size_t len
)
2888 unsigned char hangul
[UTF8HANGULLEAF
];
2894 leaf
= utf8nlookup(tree
, hangul
, s
, len
);
2897 leaf_age
= ages
[LEAF_GEN(leaf
)];
2898 if (leaf_age
<= tree
->maxage
&& leaf_age
< age
)
2907 * Length of the normalization of s.
2908 * Return -1 if s is not valid UTF-8 unicode.
2910 * A string of Default_Ignorable_Code_Point has length 0.
2912 ssize_t
utf8len(struct tree
*tree
, const char *s
)
2916 unsigned char hangul
[UTF8HANGULLEAF
];
2921 leaf
= utf8lookup(tree
, hangul
, s
);
2924 if (ages
[LEAF_GEN(leaf
)] > tree
->maxage
)
2926 else if (LEAF_CCC(leaf
) == DECOMPOSE
)
2927 ret
+= strlen(LEAF_STR(leaf
));
2936 * Length of the normalization of s, touch at most len bytes.
2937 * Return -1 if s is not valid UTF-8 unicode.
2939 ssize_t
utf8nlen(struct tree
*tree
, const char *s
, size_t len
)
2943 unsigned char hangul
[UTF8HANGULLEAF
];
2948 leaf
= utf8nlookup(tree
, hangul
, s
, len
);
2951 if (ages
[LEAF_GEN(leaf
)] > tree
->maxage
)
2953 else if (LEAF_CCC(leaf
) == DECOMPOSE
)
2954 ret
+= strlen(LEAF_STR(leaf
));
2964 * Cursor structure used by the normalizer.
2976 unsigned int unichar
;
2977 unsigned char hangul
[UTF8HANGULLEAF
];
2981 * Set up an utf8cursor for use by utf8byte().
2984 * len : length of s.
2985 * u8c : pointer to cursor.
2986 * trie : utf8trie_t to use for normalization.
2988 * Returns -1 on error, 0 on success.
2990 int utf8ncursor(struct utf8cursor
*u8c
, struct tree
*tree
, const char *s
,
3005 u8c
->nccc
= STOPPER
;
3007 /* Check we didn't clobber the maximum length. */
3008 if (u8c
->len
!= len
)
3010 /* The first byte of s may not be an utf8 continuation. */
3011 if (len
> 0 && (*s
& 0xC0) == 0x80)
3017 * Set up an utf8cursor for use by utf8byte().
3019 * s : NUL-terminated string.
3020 * u8c : pointer to cursor.
3021 * trie : utf8trie_t to use for normalization.
3023 * Returns -1 on error, 0 on success.
3025 int utf8cursor(struct utf8cursor
*u8c
, struct tree
*tree
, const char *s
)
3027 return utf8ncursor(u8c
, tree
, s
, (unsigned int)-1);
3031 * Get one byte from the normalized form of the string described by u8c.
3033 * Returns the byte cast to an unsigned char on succes, and -1 on failure.
3035 * The cursor keeps track of the location in the string in u8c->s.
3036 * When a character is decomposed, the current location is stored in
3037 * u8c->p, and u8c->s is set to the start of the decomposition. Note
3038 * that bytes from a decomposition do not count against u8c->len.
3040 * Characters are emitted if they match the current CCC in u8c->ccc.
3041 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3042 * and the function returns 0 in that case.
3044 * Sorting by CCC is done by repeatedly scanning the string. The
3045 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3046 * the start of the scan. The first pass finds the lowest CCC to be
3047 * emitted and stores it in u8c->nccc, the second pass emits the
3048 * characters with this CCC and finds the next lowest CCC. This limits
3049 * the number of passes to 1 + the number of different CCCs in the
3050 * sequence being scanned.
3053 * u8c->p != NULL -> a decomposition is being scanned.
3054 * u8c->ss != NULL -> this is a repeating scan.
3055 * u8c->ccc == -1 -> this is the first scan of a repeating scan.
3057 int utf8byte(struct utf8cursor
*u8c
)
3063 /* Check for the end of a decomposed character. */
3064 if (u8c
->p
&& *u8c
->s
== '\0') {
3069 /* Check for end-of-string. */
3070 if (!u8c
->p
&& (u8c
->len
== 0 || *u8c
->s
== '\0')) {
3071 /* There is no next byte. */
3072 if (u8c
->ccc
== STOPPER
)
3074 /* End-of-string during a scan counts as a stopper. */
3077 } else if ((*u8c
->s
& 0xC0) == 0x80) {
3078 /* This is a continuation of the current character. */
3081 return (unsigned char)*u8c
->s
++;
3084 /* Look up the data for the current character. */
3086 leaf
= utf8lookup(u8c
->tree
, u8c
->hangul
, u8c
->s
);
3088 leaf
= utf8nlookup(u8c
->tree
, u8c
->hangul
,
3092 /* No leaf found implies that the input is a binary blob. */
3096 /* Characters that are too new have CCC 0. */
3097 if (ages
[LEAF_GEN(leaf
)] > u8c
->tree
->maxage
) {
3099 } else if ((ccc
= LEAF_CCC(leaf
)) == DECOMPOSE
) {
3100 u8c
->len
-= utf8clen(u8c
->s
);
3101 u8c
->p
= u8c
->s
+ utf8clen(u8c
->s
);
3102 u8c
->s
= LEAF_STR(leaf
);
3103 /* Empty decomposition implies CCC 0. */
3104 if (*u8c
->s
== '\0') {
3105 if (u8c
->ccc
== STOPPER
)
3110 leaf
= utf8lookup(u8c
->tree
, u8c
->hangul
, u8c
->s
);
3111 ccc
= LEAF_CCC(leaf
);
3113 u8c
->unichar
= utf8decode(u8c
->s
);
3116 * If this is not a stopper, then see if it updates
3117 * the next canonical class to be emitted.
3119 if (ccc
!= STOPPER
&& u8c
->ccc
< ccc
&& ccc
< u8c
->nccc
)
3123 * Return the current byte if this is the current
3126 if (ccc
== u8c
->ccc
) {
3129 return (unsigned char)*u8c
->s
++;
3132 /* Current combining class mismatch. */
3134 if (u8c
->nccc
== STOPPER
) {
3136 * Scan forward for the first canonical class
3137 * to be emitted. Save the position from
3140 assert(u8c
->ccc
== STOPPER
);
3141 u8c
->ccc
= MINCCC
- 1;
3145 u8c
->slen
= u8c
->len
;
3147 u8c
->len
-= utf8clen(u8c
->s
);
3148 u8c
->s
+= utf8clen(u8c
->s
);
3149 } else if (ccc
!= STOPPER
) {
3150 /* Not a stopper, and not the ccc we're emitting. */
3152 u8c
->len
-= utf8clen(u8c
->s
);
3153 u8c
->s
+= utf8clen(u8c
->s
);
3154 } else if (u8c
->nccc
!= MAXCCC
+ 1) {
3155 /* At a stopper, restart for next ccc. */
3156 u8c
->ccc
= u8c
->nccc
;
3157 u8c
->nccc
= MAXCCC
+ 1;
3160 u8c
->len
= u8c
->slen
;
3162 /* All done, proceed from here. */
3164 u8c
->nccc
= STOPPER
;
3172 /* ------------------------------------------------------------------ */
3174 static int normalize_line(struct tree
*tree
)
3179 struct utf8cursor u8c
;
3181 /* First test: null-terminated string. */
3184 if (utf8cursor(&u8c
, tree
, s
))
3186 while ((c
= utf8byte(&u8c
)) > 0)
3187 if (c
!= (unsigned char)*t
++)
3194 /* Second test: length-limited string. */
3196 /* Replace NUL with a value that will cause an error if seen. */
3197 s
[strlen(s
) + 1] = -1;
3199 if (utf8cursor(&u8c
, tree
, s
))
3201 while ((c
= utf8byte(&u8c
)) > 0)
3202 if (c
!= (unsigned char)*t
++)
3212 static void normalization_test(void)
3215 unsigned int unichar
;
3216 struct unicode_data
*data
;
3225 printf("Parsing %s\n", test_name
);
3226 /* Step one, read data from file. */
3227 file
= fopen(test_name
, "r");
3229 open_fail(test_name
, errno
);
3231 while (fgets(line
, LINESIZE
, file
)) {
3232 ret
= sscanf(line
, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3234 if (ret
!= 2 || *line
== '#')
3239 unichar
= strtoul(s
, &s
, 16);
3240 t
+= utf8encode(t
, unichar
);
3248 unichar
= strtoul(s
, &s
, 16);
3249 data
= &unicode_data
[unichar
];
3250 if (data
->utf8nfdi
&& !*data
->utf8nfdi
)
3253 t
+= utf8encode(t
, unichar
);
3258 if (normalize_line(nfdi_tree
) < 0) {
3259 printf("Line %s -> %s", buf0
, buf1
);
3261 printf(" (ignorables removed)");
3262 printf(" failure\n");
3268 printf("Ran %d tests with %d failures\n", tests
, failures
);
3270 file_fail(test_name
);
3273 /* ------------------------------------------------------------------ */
3275 static void write_file(void)
3284 printf("Writing %s\n", utf8_name
);
3285 file
= fopen(utf8_name
, "w");
3287 open_fail(utf8_name
, errno
);
3289 fprintf(file
, "/* This file is generated code, do not edit. */\n");
3290 fprintf(file
, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3291 fprintf(file
, "#error Only nls_utf8-norm.c should include this file.\n");
3292 fprintf(file
, "#endif\n");
3293 fprintf(file
, "\n");
3294 fprintf(file
, "static const unsigned int utf8vers = %#x;\n",
3296 fprintf(file
, "\n");
3297 fprintf(file
, "static const unsigned int utf8agetab[] = {\n");
3298 for (i
= 0; i
!= ages_count
; i
++)
3299 fprintf(file
, "\t%#x%s\n", ages
[i
],
3300 ages
[i
] == unicode_maxage
? "" : ",");
3301 fprintf(file
, "};\n");
3302 fprintf(file
, "\n");
3303 fprintf(file
, "static const struct utf8data utf8nfdicfdata[] = {\n");
3305 for (gen
= 0; gen
< ages_count
; gen
++) {
3306 fprintf(file
, "\t{ %#x, %d }%s\n",
3307 ages
[gen
], trees
[t
].index
,
3308 ages
[gen
] == unicode_maxage
? "" : ",");
3309 if (trees
[t
].maxage
== ages
[gen
])
3312 fprintf(file
, "};\n");
3313 fprintf(file
, "\n");
3314 fprintf(file
, "static const struct utf8data utf8nfdidata[] = {\n");
3316 for (gen
= 0; gen
< ages_count
; gen
++) {
3317 fprintf(file
, "\t{ %#x, %d }%s\n",
3318 ages
[gen
], trees
[t
].index
,
3319 ages
[gen
] == unicode_maxage
? "" : ",");
3320 if (trees
[t
].maxage
== ages
[gen
])
3323 fprintf(file
, "};\n");
3324 fprintf(file
, "\n");
3325 fprintf(file
, "static const unsigned char utf8data[%zd] = {\n",
3328 for (i
= 0; i
!= utf8data_size
; i
+= 16) {
3329 if (i
== trees
[t
].index
) {
3330 fprintf(file
, "\t/* %s_%x */\n",
3331 trees
[t
].type
, trees
[t
].maxage
);
3332 if (t
< trees_count
-1)
3335 fprintf(file
, "\t");
3336 for (j
= i
; j
!= i
+ 16; j
++)
3337 fprintf(file
, "0x%.2x%s", utf8data
[j
],
3338 (j
< utf8data_size
-1 ? "," : ""));
3339 fprintf(file
, "\n");
3341 fprintf(file
, "};\n");
3345 /* ------------------------------------------------------------------ */
3347 int main(int argc
, char *argv
[])
3349 unsigned int unichar
;
3354 while ((opt
= getopt(argc
, argv
, "a:c:d:f:hn:o:p:t:v")) != -1) {
3393 for (unichar
= 0; unichar
!= 0x110000; unichar
++)
3394 unicode_data
[unichar
].code
= unichar
;
3409 /* Prevent "unused function" warning. */
3410 (void)lookup(nfdi_tree
, " ");
3412 tree_walk(nfdi_tree
);
3414 tree_walk(nfdicf_tree
);
3415 normalization_test();