2 * Copyright (c) 2014 SGI.
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 /* Generator for a compact trie for unicode normalization */
21 #include <sys/types.h>
30 /* Default names of the in- and output files. */
32 #define AGE_NAME "DerivedAge.txt"
33 #define CCC_NAME "DerivedCombiningClass.txt"
34 #define PROP_NAME "DerivedCoreProperties.txt"
35 #define DATA_NAME "UnicodeData.txt"
36 #define FOLD_NAME "CaseFolding.txt"
37 #define NORM_NAME "NormalizationCorrections.txt"
38 #define TEST_NAME "NormalizationTest.txt"
39 #define UTF8_NAME "utf8data.c"
41 const char *age_name
= AGE_NAME
;
42 const char *ccc_name
= CCC_NAME
;
43 const char *prop_name
= PROP_NAME
;
44 const char *data_name
= DATA_NAME
;
45 const char *fold_name
= FOLD_NAME
;
46 const char *norm_name
= NORM_NAME
;
47 const char *test_name
= TEST_NAME
;
48 const char *utf8_name
= UTF8_NAME
;
52 /* An arbitrary line size limit on input lines. */
63 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
65 /* ------------------------------------------------------------------ */
68 * Unicode version numbers consist of three parts: major, minor, and a
69 * revision. These numbers are packed into an unsigned int to obtain
70 * a single version number.
72 * To save space in the generated trie, the unicode version is not
73 * stored directly, instead we calculate a generation number from the
74 * unicode versions seen in the DerivedAge file, and use that as an
75 * index into a table of unicode versions.
77 #define UNICODE_MAJ_SHIFT (16)
78 #define UNICODE_MIN_SHIFT (8)
80 #define UNICODE_MAJ_MAX ((unsigned short)-1)
81 #define UNICODE_MIN_MAX ((unsigned char)-1)
82 #define UNICODE_REV_MAX ((unsigned char)-1)
84 #define UNICODE_AGE(MAJ,MIN,REV) \
85 (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
86 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
87 ((unsigned int)(REV)))
92 unsigned int unicode_maxage
;
94 static int age_valid(unsigned int major
, unsigned int minor
,
95 unsigned int revision
)
97 if (major
> UNICODE_MAJ_MAX
)
99 if (minor
> UNICODE_MIN_MAX
)
101 if (revision
> UNICODE_REV_MAX
)
106 /* ------------------------------------------------------------------ */
111 * A compact binary tree, used to decode UTF-8 characters.
113 * Internal nodes are one byte for the node itself, and up to three
114 * bytes for an offset into the tree. The first byte contains the
115 * following information:
116 * NEXTBYTE - flag - advance to next byte if set
117 * BITNUM - 3 bit field - the bit number to tested
118 * OFFLEN - 2 bit field - number of bytes in the offset
119 * if offlen == 0 (non-branching node)
120 * RIGHTPATH - 1 bit field - set if the following node is for the
121 * right-hand path (tested bit is set)
122 * TRIENODE - 1 bit field - set if the following node is an internal
123 * node, otherwise it is a leaf node
124 * if offlen != 0 (branching node)
125 * LEFTNODE - 1 bit field - set if the left-hand node is internal
126 * RIGHTNODE - 1 bit field - set if the right-hand node is internal
128 * Due to the way utf8 works, there cannot be branching nodes with
129 * NEXTBYTE set, and moreover those nodes always have a righthand
132 typedef unsigned char utf8trie_t
;
134 #define NEXTBYTE 0x08
136 #define OFFLEN_SHIFT 4
137 #define RIGHTPATH 0x40
138 #define TRIENODE 0x80
139 #define RIGHTNODE 0x40
140 #define LEFTNODE 0x80
145 * The leaves of the trie are embedded in the trie, and so the same
146 * underlying datatype, unsigned char.
148 * leaf[0]: The unicode version, stored as a generation number that is
149 * an index into utf8agetab[]. With this we can filter code
150 * points based on the unicode version in which they were
151 * defined. The CCC of a non-defined code point is 0.
152 * leaf[1]: Canonical Combining Class. During normalization, we need
153 * to do a stable sort into ascending order of all characters
154 * with a non-zero CCC that occur between two characters with
155 * a CCC of 0, or at the begin or end of a string.
156 * The unicode standard guarantees that all CCC values are
157 * between 0 and 254 inclusive, which leaves 255 available as
159 * Code points with CCC 0 are known as stoppers.
160 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
161 * start of a NUL-terminated string that is the decomposition
163 * The CCC of a decomposable character is the same as the CCC
164 * of the first character of its decomposition.
165 * Some characters decompose as the empty string: these are
166 * characters with the Default_Ignorable_Code_Point property.
167 * These do affect normalization, as they all have CCC 0.
169 * The decompositions in the trie have been fully expanded.
171 * Casefolding, if applicable, is also done using decompositions.
173 typedef unsigned char utf8leaf_t
;
175 #define LEAF_GEN(LEAF) ((LEAF)[0])
176 #define LEAF_CCC(LEAF) ((LEAF)[1])
177 #define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
184 #define DECOMPOSE (255)
185 #define HANGUL ((char)(255))
187 #define UTF8HANGULLEAF (12)
190 static utf8leaf_t
*utf8nlookup(struct tree
*, unsigned char *,
191 const char *, size_t);
192 static utf8leaf_t
*utf8lookup(struct tree
*, unsigned char *, const char *);
194 unsigned char *utf8data
;
195 size_t utf8data_size
;
200 /* ------------------------------------------------------------------ */
205 * The UTF-8 encoding spreads the bits of a 32bit word over several
206 * bytes. This table gives the ranges that can be held and how they'd
209 * 0x00000000 0x0000007F: 0xxxxxxx
210 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
211 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
212 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
213 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
214 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
216 * There is an additional requirement on UTF-8, in that only the
217 * shortest representation of a 32bit value is to be used. A decoder
218 * must not decode sequences that do not satisfy this requirement.
219 * Thus the allowed ranges have a lower bound.
221 * 0x00000000 0x0000007F: 0xxxxxxx
222 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
223 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
224 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
225 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
226 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
228 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
229 * 17 planes of 65536 values. This limits the sequences actually seen
230 * even more, to just the following.
233 * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
234 * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
235 * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
237 * Even within those ranges not all values are allowed: the surrogates
238 * 0xd800 - 0xdfff should never be seen.
240 * Note that the longest sequence seen with valid usage is 4 bytes,
241 * the same a single UTF-32 character. This makes the UTF-8
242 * representation of Unicode strictly smaller than UTF-32.
244 * The shortest sequence requirement was introduced by:
245 * Corrigendum #1: UTF-8 Shortest Form
246 * It can be found here:
247 * http://www.unicode.org/versions/corrigendum1.html
251 #define UTF8_2_BITS 0xC0
252 #define UTF8_3_BITS 0xE0
253 #define UTF8_4_BITS 0xF0
254 #define UTF8_N_BITS 0x80
255 #define UTF8_2_MASK 0xE0
256 #define UTF8_3_MASK 0xF0
257 #define UTF8_4_MASK 0xF8
258 #define UTF8_N_MASK 0xC0
259 #define UTF8_V_MASK 0x3F
260 #define UTF8_V_SHIFT 6
262 static int utf8encode(char *str
, unsigned int val
)
269 } else if (val
< 0x800) {
270 str
[1] = val
& UTF8_V_MASK
;
271 str
[1] |= UTF8_N_BITS
;
272 val
>>= UTF8_V_SHIFT
;
274 str
[0] |= UTF8_2_BITS
;
276 } else if (val
< 0x10000) {
277 str
[2] = val
& UTF8_V_MASK
;
278 str
[2] |= UTF8_N_BITS
;
279 val
>>= UTF8_V_SHIFT
;
280 str
[1] = val
& UTF8_V_MASK
;
281 str
[1] |= UTF8_N_BITS
;
282 val
>>= UTF8_V_SHIFT
;
284 str
[0] |= UTF8_3_BITS
;
286 } else if (val
< 0x110000) {
287 str
[3] = val
& UTF8_V_MASK
;
288 str
[3] |= UTF8_N_BITS
;
289 val
>>= UTF8_V_SHIFT
;
290 str
[2] = val
& UTF8_V_MASK
;
291 str
[2] |= UTF8_N_BITS
;
292 val
>>= UTF8_V_SHIFT
;
293 str
[1] = val
& UTF8_V_MASK
;
294 str
[1] |= UTF8_N_BITS
;
295 val
>>= UTF8_V_SHIFT
;
297 str
[0] |= UTF8_4_BITS
;
300 printf("%#x: illegal val\n", val
);
306 static unsigned int utf8decode(const char *str
)
308 const unsigned char *s
= (const unsigned char*)str
;
309 unsigned int unichar
= 0;
313 } else if (*s
< UTF8_3_BITS
) {
314 unichar
= *s
++ & 0x1F;
315 unichar
<<= UTF8_V_SHIFT
;
316 unichar
|= *s
& 0x3F;
317 } else if (*s
< UTF8_4_BITS
) {
318 unichar
= *s
++ & 0x0F;
319 unichar
<<= UTF8_V_SHIFT
;
320 unichar
|= *s
++ & 0x3F;
321 unichar
<<= UTF8_V_SHIFT
;
322 unichar
|= *s
& 0x3F;
324 unichar
= *s
++ & 0x0F;
325 unichar
<<= UTF8_V_SHIFT
;
326 unichar
|= *s
++ & 0x3F;
327 unichar
<<= UTF8_V_SHIFT
;
328 unichar
|= *s
++ & 0x3F;
329 unichar
<<= UTF8_V_SHIFT
;
330 unichar
|= *s
& 0x3F;
335 static int utf32valid(unsigned int unichar
)
337 return unichar
< 0x110000;
340 #define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
351 int (*leaf_equal
)(void *, void *);
352 void (*leaf_print
)(void *, int);
353 int (*leaf_mark
)(void *);
354 int (*leaf_size
)(void *);
355 int *(*leaf_index
)(struct tree
*, void *);
356 unsigned char *(*leaf_emit
)(void *, unsigned char *);
357 int leafindex
[0x110000];
369 unsigned char bitnum
;
370 unsigned char nextbyte
;
371 unsigned char leftnode
;
372 unsigned char rightnode
;
373 unsigned int keybits
;
374 unsigned int keymask
;
378 * Example lookup function for a tree.
380 static void *lookup(struct tree
*tree
, const char *key
)
386 while (!leaf
&& node
) {
389 if (*key
& (1 << (node
->bitnum
& 7))) {
391 if (node
->rightnode
== NODE
) {
393 } else if (node
->rightnode
== LEAF
) {
400 if (node
->leftnode
== NODE
) {
402 } else if (node
->leftnode
== LEAF
) {
414 * A simple non-recursive tree walker: keep track of visits to the
415 * left and right branches in the leftmask and rightmask.
417 static void tree_walk(struct tree
*tree
)
420 unsigned int leftmask
;
421 unsigned int rightmask
;
422 unsigned int bitmask
;
424 int nodes
, singletons
, leaves
;
426 nodes
= singletons
= leaves
= 0;
428 printf("%s_%x root %p\n", tree
->type
, tree
->maxage
, tree
->root
);
429 if (tree
->childnode
== LEAF
) {
431 tree
->leaf_print(tree
->root
, indent
);
434 assert(tree
->childnode
== NODE
);
436 leftmask
= rightmask
= 0;
438 printf("%*snode @ %p bitnum %d nextbyte %d"
439 " left %p right %p mask %x bits %x\n",
441 node
->bitnum
, node
->nextbyte
,
442 node
->left
, node
->right
,
443 node
->keymask
, node
->keybits
);
445 if (!(node
->left
&& node
->right
))
449 bitmask
= 1 << node
->bitnum
;
450 if ((leftmask
& bitmask
) == 0) {
452 if (node
->leftnode
== LEAF
) {
454 tree
->leaf_print(node
->left
,
457 } else if (node
->left
) {
458 assert(node
->leftnode
== NODE
);
464 if ((rightmask
& bitmask
) == 0) {
465 rightmask
|= bitmask
;
466 if (node
->rightnode
== LEAF
) {
468 tree
->leaf_print(node
->right
,
471 } else if (node
->right
) {
472 assert(node
->rightnode
== NODE
);
478 leftmask
&= ~bitmask
;
479 rightmask
&= ~bitmask
;
485 printf("nodes %d leaves %d singletons %d\n",
486 nodes
, leaves
, singletons
);
490 * Allocate an initialize a new internal node.
492 static struct node
*alloc_node(struct node
*parent
)
497 node
= malloc(sizeof(*node
));
498 node
->left
= node
->right
= NULL
;
499 node
->parent
= parent
;
500 node
->leftnode
= NODE
;
501 node
->rightnode
= NODE
;
510 bitnum
= parent
->bitnum
;
511 if ((bitnum
& 7) == 0) {
512 node
->bitnum
= bitnum
+ 7 + 8;
515 node
->bitnum
= bitnum
- 1;
527 * Insert a new leaf into the tree, and collapse any subtrees that are
528 * fully populated and end in identical leaves. A nextbyte tagged
529 * internal node will not be removed to preserve the tree's integrity.
530 * Note that due to the structure of utf8, no nextbyte tagged node
531 * will be a candidate for removal.
533 static int insert(struct tree
*tree
, char *key
, int keylen
, void *leaf
)
540 assert(keylen
>= 1 && keylen
<= 4);
543 cursor
= &tree
->root
;
544 keybits
= 8 * keylen
;
546 /* Insert, creating path along the way. */
549 *cursor
= alloc_node(node
);
553 if (*key
& (1 << (node
->bitnum
& 7)))
554 cursor
= &node
->right
;
556 cursor
= &node
->left
;
561 /* Merge subtrees if possible. */
563 if (*key
& (1 << (node
->bitnum
& 7)))
564 node
->rightnode
= LEAF
;
566 node
->leftnode
= LEAF
;
569 if (node
->leftnode
== NODE
|| node
->rightnode
== NODE
)
574 if (! tree
->leaf_equal(node
->left
, node
->right
))
576 /* Keep left, drop right leaf. */
578 /* Check in parent */
579 parent
= node
->parent
;
583 tree
->childnode
= LEAF
;
584 } else if (parent
->left
== node
) {
586 parent
->leftnode
= LEAF
;
591 parent
->keymask
|= (1 << node
->bitnum
);
593 } else if (parent
->right
== node
) {
594 parent
->right
= leaf
;
595 parent
->rightnode
= LEAF
;
600 parent
->keymask
|= (1 << node
->bitnum
);
601 parent
->keybits
|= (1 << node
->bitnum
);
604 /* internal tree error */
611 /* Propagate keymasks up along singleton chains. */
613 parent
= node
->parent
;
616 /* Nix the mask for parents with two children. */
617 if (node
->keymask
== 0) {
620 } else if (parent
->left
&& parent
->right
) {
624 assert((parent
->keymask
& node
->keymask
) == 0);
625 parent
->keymask
|= node
->keymask
;
626 parent
->keymask
|= (1 << parent
->bitnum
);
627 parent
->keybits
|= node
->keybits
;
629 parent
->keybits
|= (1 << parent
->bitnum
);
638 * Prune internal nodes.
640 * Fully populated subtrees that end at the same leaf have already
641 * been collapsed. There are still internal nodes that have for both
642 * their left and right branches a sequence of singletons that make
643 * identical choices and end in identical leaves. The keymask and
644 * keybits collected in the nodes describe the choices made in these
645 * singleton chains. When they are identical for the left and right
646 * branch of a node, and the two leaves comare identical, the node in
647 * question can be removed.
649 * Note that nodes with the nextbyte tag set will not be removed by
650 * this to ensure tree integrity. Note as well that the structure of
651 * utf8 ensures that these nodes would not have been candidates for
652 * removal in any case.
654 static void prune(struct tree
*tree
)
662 unsigned int leftmask
;
663 unsigned int rightmask
;
664 unsigned int bitmask
;
668 printf("Pruning %s_%x\n", tree
->type
, tree
->maxage
);
671 if (tree
->childnode
== LEAF
)
676 leftmask
= rightmask
= 0;
681 if (node
->leftnode
== LEAF
)
683 if (node
->rightnode
== LEAF
)
691 if (left
->keymask
== 0)
693 if (right
->keymask
== 0)
695 if (left
->keymask
!= right
->keymask
)
697 if (left
->keybits
!= right
->keybits
)
701 assert(left
->left
|| left
->right
);
702 if (left
->leftnode
== LEAF
)
703 leftleaf
= left
->left
;
704 else if (left
->rightnode
== LEAF
)
705 leftleaf
= left
->right
;
708 else if (left
->right
)
715 assert(right
->left
|| right
->right
);
716 if (right
->leftnode
== LEAF
)
717 rightleaf
= right
->left
;
718 else if (right
->rightnode
== LEAF
)
719 rightleaf
= right
->right
;
720 else if (right
->left
)
722 else if (right
->right
)
723 right
= right
->right
;
727 if (! tree
->leaf_equal(leftleaf
, rightleaf
))
730 * This node has identical singleton-only subtrees.
733 parent
= node
->parent
;
736 if (parent
->left
== node
)
738 else if (parent
->right
== node
)
739 parent
->right
= left
;
742 left
->parent
= parent
;
743 left
->keymask
|= (1 << node
->bitnum
);
746 bitmask
= 1 << node
->bitnum
;
747 leftmask
&= ~bitmask
;
748 rightmask
&= ~bitmask
;
749 if (node
->leftnode
== NODE
&& node
->left
) {
754 } else if (node
->rightnode
== NODE
&& node
->right
) {
763 /* Propagate keymasks up along singleton chains. */
766 bitmask
= 1 << node
->bitnum
;
767 leftmask
&= ~bitmask
;
768 rightmask
&= ~bitmask
;
770 if (node
->left
&& node
->right
)
774 node
->keymask
|= left
->keymask
;
775 node
->keybits
|= left
->keybits
;
779 node
->keymask
|= right
->keymask
;
780 node
->keybits
|= right
->keybits
;
782 node
->keymask
|= (1 << node
->bitnum
);
785 bitmask
= 1 << node
->bitnum
;
786 leftmask
&= ~bitmask
;
787 rightmask
&= ~bitmask
;
790 bitmask
= 1 << node
->bitnum
;
791 if ((leftmask
& bitmask
) == 0 &&
792 node
->leftnode
== NODE
&&
796 } else if ((rightmask
& bitmask
) == 0 &&
797 node
->rightnode
== NODE
&&
799 rightmask
|= bitmask
;
802 leftmask
&= ~bitmask
;
803 rightmask
&= ~bitmask
;
808 printf("Pruned %d nodes\n", count
);
812 * Mark the nodes in the tree that lead to leaves that must be
815 static void mark_nodes(struct tree
*tree
)
819 unsigned int leftmask
;
820 unsigned int rightmask
;
821 unsigned int bitmask
;
826 printf("Marking %s_%x\n", tree
->type
, tree
->maxage
);
827 if (tree
->childnode
== LEAF
)
830 assert(tree
->childnode
== NODE
);
832 leftmask
= rightmask
= 0;
834 bitmask
= 1 << node
->bitnum
;
835 if ((leftmask
& bitmask
) == 0) {
837 if (node
->leftnode
== LEAF
) {
839 if (tree
->leaf_mark(node
->left
)) {
841 while (n
&& !n
->mark
) {
847 } else if (node
->left
) {
848 assert(node
->leftnode
== NODE
);
853 if ((rightmask
& bitmask
) == 0) {
854 rightmask
|= bitmask
;
855 if (node
->rightnode
== LEAF
) {
857 if (tree
->leaf_mark(node
->right
)) {
859 while (n
&& !n
->mark
) {
865 } else if (node
->right
) {
866 assert(node
->rightnode
== NODE
);
871 leftmask
&= ~bitmask
;
872 rightmask
&= ~bitmask
;
876 /* second pass: left siblings and singletons */
878 assert(tree
->childnode
== NODE
);
880 leftmask
= rightmask
= 0;
882 bitmask
= 1 << node
->bitnum
;
883 if ((leftmask
& bitmask
) == 0) {
885 if (node
->leftnode
== LEAF
) {
887 if (tree
->leaf_mark(node
->left
)) {
889 while (n
&& !n
->mark
) {
895 } else if (node
->left
) {
896 assert(node
->leftnode
== NODE
);
898 if (!node
->mark
&& node
->parent
->mark
) {
905 if ((rightmask
& bitmask
) == 0) {
906 rightmask
|= bitmask
;
907 if (node
->rightnode
== LEAF
) {
909 if (tree
->leaf_mark(node
->right
)) {
911 while (n
&& !n
->mark
) {
917 } else if (node
->right
) {
918 assert(node
->rightnode
== NODE
);
920 if (!node
->mark
&& node
->parent
->mark
&&
921 !node
->parent
->left
) {
928 leftmask
&= ~bitmask
;
929 rightmask
&= ~bitmask
;
934 printf("Marked %d nodes\n", marked
);
938 * Compute the index of each node and leaf, which is the offset in the
939 * emitted trie. These values must be pre-computed because relative
940 * offsets between nodes are used to navigate the tree.
942 static int index_nodes(struct tree
*tree
, int index
)
945 unsigned int leftmask
;
946 unsigned int rightmask
;
947 unsigned int bitmask
;
951 /* Align to a cache line (or half a cache line?). */
959 printf("Indexing %s_%x: %d\n", tree
->type
, tree
->maxage
, index
);
960 if (tree
->childnode
== LEAF
) {
961 index
+= tree
->leaf_size(tree
->root
);
965 assert(tree
->childnode
== NODE
);
967 leftmask
= rightmask
= 0;
972 if (node
->index
!= index
)
977 bitmask
= 1 << node
->bitnum
;
978 if (node
->mark
&& (leftmask
& bitmask
) == 0) {
980 if (node
->leftnode
== LEAF
) {
982 *tree
->leaf_index(tree
, node
->left
) =
984 index
+= tree
->leaf_size(node
->left
);
986 } else if (node
->left
) {
987 assert(node
->leftnode
== NODE
);
993 if (node
->mark
&& (rightmask
& bitmask
) == 0) {
994 rightmask
|= bitmask
;
995 if (node
->rightnode
== LEAF
) {
997 *tree
->leaf_index(tree
, node
->right
) = index
;
998 index
+= tree
->leaf_size(node
->right
);
1000 } else if (node
->right
) {
1001 assert(node
->rightnode
== NODE
);
1007 leftmask
&= ~bitmask
;
1008 rightmask
&= ~bitmask
;
1009 node
= node
->parent
;
1014 /* Round up to a multiple of 16 */
1018 printf("Final index %d\n", index
);
1023 * Mark the nodes in a subtree, helper for size_nodes().
1025 static int mark_subtree(struct node
*node
)
1029 if (!node
|| node
->mark
)
1032 node
->index
= node
->parent
->index
;
1034 if (node
->leftnode
== NODE
)
1035 changed
+= mark_subtree(node
->left
);
1036 if (node
->rightnode
== NODE
)
1037 changed
+= mark_subtree(node
->right
);
1042 * Compute the size of nodes and leaves. We start by assuming that
1043 * each node needs to store a three-byte offset. The indexes of the
1044 * nodes are calculated based on that, and then this function is
1045 * called to see if the sizes of some nodes can be reduced. This is
1046 * repeated until no more changes are seen.
1048 static int size_nodes(struct tree
*tree
)
1054 unsigned int leftmask
;
1055 unsigned int rightmask
;
1056 unsigned int bitmask
;
1057 unsigned int pathbits
;
1058 unsigned int pathmask
;
1070 printf("Sizing %s_%x\n", tree
->type
, tree
->maxage
);
1071 if (tree
->childnode
== LEAF
)
1074 assert(tree
->childnode
== NODE
);
1078 leftmask
= rightmask
= 0;
1083 if (!node
->left
|| !node
->right
) {
1086 if (node
->rightnode
== NODE
) {
1088 * If the right node is not marked,
1089 * look for a corresponding node in
1090 * the next tree. Such a node need
1093 right
= node
->right
;
1095 while (!right
->mark
) {
1098 while (n
->bitnum
!= node
->bitnum
) {
1099 nbit
= 1 << n
->bitnum
;
1100 if (!(pathmask
& nbit
))
1102 if (pathbits
& nbit
) {
1103 if (n
->rightnode
== LEAF
)
1107 if (n
->leftnode
== LEAF
)
1112 if (n
->bitnum
!= node
->bitnum
)
1118 /* Make sure the right node is marked. */
1120 changed
+= mark_subtree(right
);
1121 offset
= right
->index
- node
->index
;
1123 offset
= *tree
->leaf_index(tree
, node
->right
);
1124 offset
-= node
->index
;
1126 assert(offset
>= 0);
1127 assert(offset
<= 0xffffff);
1128 if (offset
<= 0xff) {
1130 } else if (offset
<= 0xffff) {
1132 } else { /* offset <= 0xffffff */
1136 if (node
->size
!= size
|| node
->offset
!= offset
) {
1138 node
->offset
= offset
;
1143 bitmask
= 1 << node
->bitnum
;
1144 pathmask
|= bitmask
;
1145 if (node
->mark
&& (leftmask
& bitmask
) == 0) {
1146 leftmask
|= bitmask
;
1147 if (node
->leftnode
== LEAF
) {
1149 } else if (node
->left
) {
1150 assert(node
->leftnode
== NODE
);
1156 if (node
->mark
&& (rightmask
& bitmask
) == 0) {
1157 rightmask
|= bitmask
;
1158 pathbits
|= bitmask
;
1159 if (node
->rightnode
== LEAF
) {
1160 assert(node
->right
);
1161 } else if (node
->right
) {
1162 assert(node
->rightnode
== NODE
);
1168 leftmask
&= ~bitmask
;
1169 rightmask
&= ~bitmask
;
1170 pathmask
&= ~bitmask
;
1171 pathbits
&= ~bitmask
;
1172 node
= node
->parent
;
1178 printf("Found %d changes\n", changed
);
1183 * Emit a trie for the given tree into the data array.
1185 static void emit(struct tree
*tree
, unsigned char *data
)
1188 unsigned int leftmask
;
1189 unsigned int rightmask
;
1190 unsigned int bitmask
;
1201 nodes
[0] = nodes
[1] = nodes
[2] = nodes
[3] = 0;
1204 index
= tree
->index
;
1208 printf("Emitting %s_%x\n", tree
->type
, tree
->maxage
);
1209 if (tree
->childnode
== LEAF
) {
1211 tree
->leaf_emit(tree
->root
, data
);
1212 size
= tree
->leaf_size(tree
->root
);
1218 assert(tree
->childnode
== NODE
);
1220 leftmask
= rightmask
= 0;
1224 assert(node
->offset
!= -1);
1225 assert(node
->index
== index
);
1230 byte
|= (node
->bitnum
& BITNUM
);
1231 if (node
->left
&& node
->right
) {
1232 if (node
->leftnode
== NODE
)
1234 if (node
->rightnode
== NODE
)
1236 if (node
->offset
<= 0xff)
1238 else if (node
->offset
<= 0xffff)
1243 offset
= node
->offset
;
1244 byte
|= offlen
<< OFFLEN_SHIFT
;
1248 *data
++ = offset
& 0xff;
1252 } else if (node
->left
) {
1253 if (node
->leftnode
== NODE
)
1258 } else if (node
->right
) {
1260 if (node
->rightnode
== NODE
)
1270 bitmask
= 1 << node
->bitnum
;
1271 if (node
->mark
&& (leftmask
& bitmask
) == 0) {
1272 leftmask
|= bitmask
;
1273 if (node
->leftnode
== LEAF
) {
1275 data
= tree
->leaf_emit(node
->left
,
1277 size
= tree
->leaf_size(node
->left
);
1281 } else if (node
->left
) {
1282 assert(node
->leftnode
== NODE
);
1288 if (node
->mark
&& (rightmask
& bitmask
) == 0) {
1289 rightmask
|= bitmask
;
1290 if (node
->rightnode
== LEAF
) {
1291 assert(node
->right
);
1292 data
= tree
->leaf_emit(node
->right
,
1294 size
= tree
->leaf_size(node
->right
);
1298 } else if (node
->right
) {
1299 assert(node
->rightnode
== NODE
);
1305 leftmask
&= ~bitmask
;
1306 rightmask
&= ~bitmask
;
1307 node
= node
->parent
;
1313 printf("Emitted %d (%d) leaves",
1315 printf(" %d (%d+%d+%d+%d) nodes",
1316 nodes
[0] + nodes
[1] + nodes
[2] + nodes
[3],
1317 nodes
[0], nodes
[1], nodes
[2], nodes
[3]);
1318 printf(" %d total\n", index
- tree
->index
);
1322 /* ------------------------------------------------------------------ */
1327 * We need to keep track of the Canonical Combining Class, the Age,
1328 * and decompositions for a code point.
1330 * For the Age, we store the index into the ages table. Effectively
1331 * this is a generation number that the table maps to a unicode
1334 * The correction field is used to indicate that this entry is in the
1335 * corrections array, which contains decompositions that were
1336 * corrected in later revisions. The value of the correction field is
1337 * the Unicode version in which the mapping was corrected.
1339 struct unicode_data
{
1344 unsigned int *utf32nfdi
;
1345 unsigned int *utf32nfdicf
;
1350 struct unicode_data unicode_data
[0x110000];
1351 struct unicode_data
*corrections
;
1352 int corrections_count
;
1354 struct tree
*nfdi_tree
;
1355 struct tree
*nfdicf_tree
;
1361 * Check the corrections array to see if this entry was corrected at
1364 static struct unicode_data
*corrections_lookup(struct unicode_data
*u
)
1368 for (i
= 0; i
!= corrections_count
; i
++)
1369 if (u
->code
== corrections
[i
].code
)
1370 return &corrections
[i
];
1374 static int nfdi_equal(void *l
, void *r
)
1376 struct unicode_data
*left
= l
;
1377 struct unicode_data
*right
= r
;
1379 if (left
->gen
!= right
->gen
)
1381 if (left
->ccc
!= right
->ccc
)
1383 if (left
->utf8nfdi
&& right
->utf8nfdi
&&
1384 strcmp(left
->utf8nfdi
, right
->utf8nfdi
) == 0)
1386 if (left
->utf8nfdi
|| right
->utf8nfdi
)
1391 static int nfdicf_equal(void *l
, void *r
)
1393 struct unicode_data
*left
= l
;
1394 struct unicode_data
*right
= r
;
1396 if (left
->gen
!= right
->gen
)
1398 if (left
->ccc
!= right
->ccc
)
1400 if (left
->utf8nfdicf
&& right
->utf8nfdicf
&&
1401 strcmp(left
->utf8nfdicf
, right
->utf8nfdicf
) == 0)
1403 if (left
->utf8nfdicf
&& right
->utf8nfdicf
)
1405 if (left
->utf8nfdicf
|| right
->utf8nfdicf
)
1407 if (left
->utf8nfdi
&& right
->utf8nfdi
&&
1408 strcmp(left
->utf8nfdi
, right
->utf8nfdi
) == 0)
1410 if (left
->utf8nfdi
|| right
->utf8nfdi
)
1415 static void nfdi_print(void *l
, int indent
)
1417 struct unicode_data
*leaf
= l
;
1419 printf("%*sleaf @ %p code %X ccc %d gen %d", indent
, "", leaf
,
1420 leaf
->code
, leaf
->ccc
, leaf
->gen
);
1422 if (leaf
->utf8nfdi
&& leaf
->utf8nfdi
[0] == HANGUL
)
1423 printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1424 else if (leaf
->utf8nfdi
)
1425 printf(" nfdi \"%s\"", (const char*)leaf
->utf8nfdi
);
1430 static void nfdicf_print(void *l
, int indent
)
1432 struct unicode_data
*leaf
= l
;
1434 printf("%*sleaf @ %p code %X ccc %d gen %d", indent
, "", leaf
,
1435 leaf
->code
, leaf
->ccc
, leaf
->gen
);
1437 if (leaf
->utf8nfdicf
)
1438 printf(" nfdicf \"%s\"", (const char*)leaf
->utf8nfdicf
);
1439 else if (leaf
->utf8nfdi
&& leaf
->utf8nfdi
[0] == HANGUL
)
1440 printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1441 else if (leaf
->utf8nfdi
)
1442 printf(" nfdi \"%s\"", (const char*)leaf
->utf8nfdi
);
1446 static int nfdi_mark(void *l
)
1451 static int nfdicf_mark(void *l
)
1453 struct unicode_data
*leaf
= l
;
1455 if (leaf
->utf8nfdicf
)
1460 static int correction_mark(void *l
)
1462 struct unicode_data
*leaf
= l
;
1464 return leaf
->correction
;
1467 static int nfdi_size(void *l
)
1469 struct unicode_data
*leaf
= l
;
1472 if (HANGUL_SYLLABLE(leaf
->code
))
1474 else if (leaf
->utf8nfdi
)
1475 size
+= strlen(leaf
->utf8nfdi
) + 1;
1479 static int nfdicf_size(void *l
)
1481 struct unicode_data
*leaf
= l
;
1484 if (HANGUL_SYLLABLE(leaf
->code
))
1486 else if (leaf
->utf8nfdicf
)
1487 size
+= strlen(leaf
->utf8nfdicf
) + 1;
1488 else if (leaf
->utf8nfdi
)
1489 size
+= strlen(leaf
->utf8nfdi
) + 1;
1493 static int *nfdi_index(struct tree
*tree
, void *l
)
1495 struct unicode_data
*leaf
= l
;
1497 return &tree
->leafindex
[leaf
->code
];
1500 static int *nfdicf_index(struct tree
*tree
, void *l
)
1502 struct unicode_data
*leaf
= l
;
1504 return &tree
->leafindex
[leaf
->code
];
1507 static unsigned char *nfdi_emit(void *l
, unsigned char *data
)
1509 struct unicode_data
*leaf
= l
;
1512 *data
++ = leaf
->gen
;
1514 if (HANGUL_SYLLABLE(leaf
->code
)) {
1515 *data
++ = DECOMPOSE
;
1517 } else if (leaf
->utf8nfdi
) {
1518 *data
++ = DECOMPOSE
;
1519 s
= (unsigned char*)leaf
->utf8nfdi
;
1520 while ((*data
++ = *s
++) != 0)
1523 *data
++ = leaf
->ccc
;
1528 static unsigned char *nfdicf_emit(void *l
, unsigned char *data
)
1530 struct unicode_data
*leaf
= l
;
1533 *data
++ = leaf
->gen
;
1535 if (HANGUL_SYLLABLE(leaf
->code
)) {
1536 *data
++ = DECOMPOSE
;
1538 } else if (leaf
->utf8nfdicf
) {
1539 *data
++ = DECOMPOSE
;
1540 s
= (unsigned char*)leaf
->utf8nfdicf
;
1541 while ((*data
++ = *s
++) != 0)
1543 } else if (leaf
->utf8nfdi
) {
1544 *data
++ = DECOMPOSE
;
1545 s
= (unsigned char*)leaf
->utf8nfdi
;
1546 while ((*data
++ = *s
++) != 0)
1549 *data
++ = leaf
->ccc
;
1554 static void utf8_create(struct unicode_data
*data
)
1561 if (data
->utf8nfdi
) {
1562 assert(data
->utf8nfdi
[0] == HANGUL
);
1567 um
= data
->utf32nfdi
;
1569 for (i
= 0; um
[i
]; i
++)
1570 u
+= utf8encode(u
, um
[i
]);
1572 data
->utf8nfdi
= strdup(utf
);
1575 um
= data
->utf32nfdicf
;
1577 for (i
= 0; um
[i
]; i
++)
1578 u
+= utf8encode(u
, um
[i
]);
1580 if (!data
->utf8nfdi
|| strcmp(data
->utf8nfdi
, utf
))
1581 data
->utf8nfdicf
= strdup(utf
);
1585 static void utf8_init(void)
1587 unsigned int unichar
;
1590 for (unichar
= 0; unichar
!= 0x110000; unichar
++)
1591 utf8_create(&unicode_data
[unichar
]);
1593 for (i
= 0; i
!= corrections_count
; i
++)
1594 utf8_create(&corrections
[i
]);
1597 static void trees_init(void)
1599 struct unicode_data
*data
;
1600 unsigned int maxage
;
1601 unsigned int nextage
;
1606 /* Count the number of different ages. */
1608 nextage
= (unsigned int)-1;
1612 for (i
= 0; i
<= corrections_count
; i
++) {
1613 data
= &corrections
[i
];
1614 if (nextage
< data
->correction
&&
1615 data
->correction
< maxage
)
1616 nextage
= data
->correction
;
1621 /* Two trees per age: nfdi and nfdicf */
1622 trees_count
= count
* 2;
1623 trees
= calloc(trees_count
, sizeof(struct tree
));
1625 /* Assign ages to the trees. */
1626 count
= trees_count
;
1627 nextage
= (unsigned int)-1;
1630 trees
[--count
].maxage
= maxage
;
1631 trees
[--count
].maxage
= maxage
;
1633 for (i
= 0; i
<= corrections_count
; i
++) {
1634 data
= &corrections
[i
];
1635 if (nextage
< data
->correction
&&
1636 data
->correction
< maxage
)
1637 nextage
= data
->correction
;
1641 /* The ages assigned above are off by one. */
1642 for (i
= 0; i
!= trees_count
; i
++) {
1644 while (ages
[j
] < trees
[i
].maxage
)
1646 trees
[i
].maxage
= ages
[j
-1];
1649 /* Set up the forwarding between trees. */
1650 trees
[trees_count
-2].next
= &trees
[trees_count
-1];
1651 trees
[trees_count
-1].leaf_mark
= nfdi_mark
;
1652 trees
[trees_count
-2].leaf_mark
= nfdicf_mark
;
1653 for (i
= 0; i
!= trees_count
-2; i
+= 2) {
1654 trees
[i
].next
= &trees
[trees_count
-2];
1655 trees
[i
].leaf_mark
= correction_mark
;
1656 trees
[i
+1].next
= &trees
[trees_count
-1];
1657 trees
[i
+1].leaf_mark
= correction_mark
;
1660 /* Assign the callouts. */
1661 for (i
= 0; i
!= trees_count
; i
+= 2) {
1662 trees
[i
].type
= "nfdicf";
1663 trees
[i
].leaf_equal
= nfdicf_equal
;
1664 trees
[i
].leaf_print
= nfdicf_print
;
1665 trees
[i
].leaf_size
= nfdicf_size
;
1666 trees
[i
].leaf_index
= nfdicf_index
;
1667 trees
[i
].leaf_emit
= nfdicf_emit
;
1669 trees
[i
+1].type
= "nfdi";
1670 trees
[i
+1].leaf_equal
= nfdi_equal
;
1671 trees
[i
+1].leaf_print
= nfdi_print
;
1672 trees
[i
+1].leaf_size
= nfdi_size
;
1673 trees
[i
+1].leaf_index
= nfdi_index
;
1674 trees
[i
+1].leaf_emit
= nfdi_emit
;
1678 for (i
= 0; i
!= trees_count
; i
++)
1679 trees
[i
].childnode
= NODE
;
1682 static void trees_populate(void)
1684 struct unicode_data
*data
;
1685 unsigned int unichar
;
1690 for (i
= 0; i
!= trees_count
; i
++) {
1692 printf("Populating %s_%x\n",
1693 trees
[i
].type
, trees
[i
].maxage
);
1695 for (unichar
= 0; unichar
!= 0x110000; unichar
++) {
1696 if (unicode_data
[unichar
].gen
< 0)
1698 keylen
= utf8encode(keyval
, unichar
);
1699 data
= corrections_lookup(&unicode_data
[unichar
]);
1700 if (data
->correction
<= trees
[i
].maxage
)
1701 data
= &unicode_data
[unichar
];
1702 insert(&trees
[i
], keyval
, keylen
, data
);
1707 static void trees_reduce(void)
1713 for (i
= 0; i
!= trees_count
; i
++)
1715 for (i
= 0; i
!= trees_count
; i
++)
1716 mark_nodes(&trees
[i
]);
1719 for (i
= 0; i
!= trees_count
; i
++)
1720 size
= index_nodes(&trees
[i
], size
);
1722 for (i
= 0; i
!= trees_count
; i
++)
1723 changed
+= size_nodes(&trees
[i
]);
1726 utf8data
= calloc(size
, 1);
1727 utf8data_size
= size
;
1728 for (i
= 0; i
!= trees_count
; i
++)
1729 emit(&trees
[i
], utf8data
);
1732 for (i
= 0; i
!= trees_count
; i
++) {
1733 printf("%s_%x idx %d\n",
1734 trees
[i
].type
, trees
[i
].maxage
, trees
[i
].index
);
1738 nfdi
= utf8data
+ trees
[trees_count
-1].index
;
1739 nfdicf
= utf8data
+ trees
[trees_count
-2].index
;
1741 nfdi_tree
= &trees
[trees_count
-1];
1742 nfdicf_tree
= &trees
[trees_count
-2];
1745 static void verify(struct tree
*tree
)
1747 struct unicode_data
*data
;
1749 unsigned int unichar
;
1751 unsigned char hangul
[UTF8HANGULLEAF
];
1756 printf("Verifying %s_%x\n", tree
->type
, tree
->maxage
);
1757 nocf
= strcmp(tree
->type
, "nfdicf");
1759 for (unichar
= 0; unichar
!= 0x110000; unichar
++) {
1761 data
= corrections_lookup(&unicode_data
[unichar
]);
1762 if (data
->correction
<= tree
->maxage
)
1763 data
= &unicode_data
[unichar
];
1764 utf8encode(key
,unichar
);
1765 leaf
= utf8lookup(tree
, hangul
, key
);
1768 if (data
->gen
!= -1)
1770 if (unichar
< 0xd800 || unichar
> 0xdfff)
1773 if (unichar
>= 0xd800 && unichar
<= 0xdfff)
1775 if (data
->gen
== -1)
1777 if (data
->gen
!= LEAF_GEN(leaf
))
1779 if (LEAF_CCC(leaf
) == DECOMPOSE
) {
1780 if (HANGUL_SYLLABLE(data
->code
)) {
1781 if (data
->utf8nfdi
[0] != HANGUL
)
1784 if (!data
->utf8nfdi
) {
1786 } else if (strcmp(data
->utf8nfdi
,
1791 if (!data
->utf8nfdicf
&&
1794 } else if (data
->utf8nfdicf
) {
1795 if (strcmp(data
->utf8nfdicf
,
1798 } else if (strcmp(data
->utf8nfdi
,
1803 } else if (data
->ccc
!= LEAF_CCC(leaf
)) {
1808 printf("%X code %X gen %d ccc %d"
1810 unichar
, data
->code
, data
->gen
,
1814 printf(" gen %d ccc %d"
1818 LEAF_CCC(leaf
) == DECOMPOSE
?
1819 LEAF_STR(leaf
) : "");
1826 static void trees_verify(void)
1830 for (i
= 0; i
!= trees_count
; i
++)
1834 /* ------------------------------------------------------------------ */
1836 static void help(void)
1838 printf("Usage: %s [options]\n", argv0
);
1840 printf("This program creates an a data trie used for parsing and\n");
1841 printf("normalization of UTF-8 strings. The trie is derived from\n");
1842 printf("a set of input files from the Unicode character database\n");
1843 printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1845 printf("The generated tree supports two normalization forms:\n");
1847 printf("\tnfdi:\n");
1848 printf("\t- Apply unicode normalization form NFD.\n");
1849 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1851 printf("\tnfdicf:\n");
1852 printf("\t- Apply unicode normalization form NFD.\n");
1853 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1854 printf("\t- Apply a full casefold (C + F).\n");
1856 printf("These forms were chosen as being most useful when dealing\n");
1857 printf("with file names: NFD catches most cases where characters\n");
1858 printf("should be considered equivalent. The ignorables are mostly\n");
1859 printf("invisible, making names hard to type.\n");
1861 printf("The options to specify the files to be used are listed\n");
1862 printf("below with their default values, which are the names used\n");
1863 printf("by version 11.0.0 of the Unicode Character Database.\n");
1865 printf("The input files:\n");
1866 printf("\t-a %s\n", AGE_NAME
);
1867 printf("\t-c %s\n", CCC_NAME
);
1868 printf("\t-p %s\n", PROP_NAME
);
1869 printf("\t-d %s\n", DATA_NAME
);
1870 printf("\t-f %s\n", FOLD_NAME
);
1871 printf("\t-n %s\n", NORM_NAME
);
1873 printf("Additionally, the generated tables are tested using:\n");
1874 printf("\t-t %s\n", TEST_NAME
);
1876 printf("Finally, the output file:\n");
1877 printf("\t-o %s\n", UTF8_NAME
);
1881 static void usage(void)
1887 static void open_fail(const char *name
, int error
)
1889 printf("Error %d opening %s: %s\n", error
, name
, strerror(error
));
1893 static void file_fail(const char *filename
)
1895 printf("Error parsing %s\n", filename
);
1899 static void line_fail(const char *filename
, const char *line
)
1901 printf("Error parsing %s:%s\n", filename
, line
);
1905 /* ------------------------------------------------------------------ */
1907 static void print_utf32(unsigned int *utf32str
)
1911 for (i
= 0; utf32str
[i
]; i
++)
1912 printf(" %X", utf32str
[i
]);
1915 static void print_utf32nfdi(unsigned int unichar
)
1917 printf(" %X ->", unichar
);
1918 print_utf32(unicode_data
[unichar
].utf32nfdi
);
1922 static void print_utf32nfdicf(unsigned int unichar
)
1924 printf(" %X ->", unichar
);
1925 print_utf32(unicode_data
[unichar
].utf32nfdicf
);
1929 /* ------------------------------------------------------------------ */
1931 static void age_init(void)
1936 unsigned int unichar
;
1939 unsigned int revision
;
1945 printf("Parsing %s\n", age_name
);
1947 file
= fopen(age_name
, "r");
1949 open_fail(age_name
, errno
);
1953 while (fgets(line
, LINESIZE
, file
)) {
1954 ret
= sscanf(line
, "# Age=V%d_%d_%d",
1955 &major
, &minor
, &revision
);
1959 printf(" Age V%d_%d_%d\n",
1960 major
, minor
, revision
);
1961 if (!age_valid(major
, minor
, revision
))
1962 line_fail(age_name
, line
);
1965 ret
= sscanf(line
, "# Age=V%d_%d", &major
, &minor
);
1969 printf(" Age V%d_%d\n", major
, minor
);
1970 if (!age_valid(major
, minor
, 0))
1971 line_fail(age_name
, line
);
1976 /* We must have found something above. */
1978 printf("%d age entries\n", ages_count
);
1979 if (ages_count
== 0 || ages_count
> MAXGEN
)
1980 file_fail(age_name
);
1982 /* There is a 0 entry. */
1984 ages
= calloc(ages_count
+ 1, sizeof(*ages
));
1985 /* And a guard entry. */
1986 ages
[ages_count
] = (unsigned int)-1;
1991 while (fgets(line
, LINESIZE
, file
)) {
1992 ret
= sscanf(line
, "# Age=V%d_%d_%d",
1993 &major
, &minor
, &revision
);
1996 UNICODE_AGE(major
, minor
, revision
);
1998 printf(" Age V%d_%d_%d = gen %d\n",
1999 major
, minor
, revision
, gen
);
2000 if (!age_valid(major
, minor
, revision
))
2001 line_fail(age_name
, line
);
2004 ret
= sscanf(line
, "# Age=V%d_%d", &major
, &minor
);
2006 ages
[++gen
] = UNICODE_AGE(major
, minor
, 0);
2008 printf(" Age V%d_%d = %d\n",
2010 if (!age_valid(major
, minor
, 0))
2011 line_fail(age_name
, line
);
2014 ret
= sscanf(line
, "%X..%X ; %d.%d #",
2015 &first
, &last
, &major
, &minor
);
2017 for (unichar
= first
; unichar
<= last
; unichar
++)
2018 unicode_data
[unichar
].gen
= gen
;
2019 count
+= 1 + last
- first
;
2021 printf(" %X..%X gen %d\n", first
, last
, gen
);
2022 if (!utf32valid(first
) || !utf32valid(last
))
2023 line_fail(age_name
, line
);
2026 ret
= sscanf(line
, "%X ; %d.%d #", &unichar
, &major
, &minor
);
2028 unicode_data
[unichar
].gen
= gen
;
2031 printf(" %X gen %d\n", unichar
, gen
);
2032 if (!utf32valid(unichar
))
2033 line_fail(age_name
, line
);
2037 unicode_maxage
= ages
[gen
];
2040 /* Nix surrogate block */
2042 printf(" Removing surrogate block D800..DFFF\n");
2043 for (unichar
= 0xd800; unichar
<= 0xdfff; unichar
++)
2044 unicode_data
[unichar
].gen
= -1;
2047 printf("Found %d entries\n", count
);
2049 file_fail(age_name
);
2052 static void ccc_init(void)
2057 unsigned int unichar
;
2063 printf("Parsing %s\n", ccc_name
);
2065 file
= fopen(ccc_name
, "r");
2067 open_fail(ccc_name
, errno
);
2070 while (fgets(line
, LINESIZE
, file
)) {
2071 ret
= sscanf(line
, "%X..%X ; %d #", &first
, &last
, &value
);
2073 for (unichar
= first
; unichar
<= last
; unichar
++) {
2074 unicode_data
[unichar
].ccc
= value
;
2078 printf(" %X..%X ccc %d\n", first
, last
, value
);
2079 if (!utf32valid(first
) || !utf32valid(last
))
2080 line_fail(ccc_name
, line
);
2083 ret
= sscanf(line
, "%X ; %d #", &unichar
, &value
);
2085 unicode_data
[unichar
].ccc
= value
;
2088 printf(" %X ccc %d\n", unichar
, value
);
2089 if (!utf32valid(unichar
))
2090 line_fail(ccc_name
, line
);
2097 printf("Found %d entries\n", count
);
2099 file_fail(ccc_name
);
2102 static int ignore_compatibility_form(char *type
)
2105 char *ignored_types
[] = {"font", "noBreak", "initial", "medial",
2106 "final", "isolated", "circle", "super",
2107 "sub", "vertical", "wide", "narrow",
2108 "small", "square", "fraction", "compat"};
2110 for (i
= 0 ; i
< ARRAY_SIZE(ignored_types
); i
++)
2111 if (strcmp(type
, ignored_types
[i
]) == 0)
2116 static void nfdi_init(void)
2119 unsigned int unichar
;
2120 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2129 printf("Parsing %s\n", data_name
);
2130 file
= fopen(data_name
, "r");
2132 open_fail(data_name
, errno
);
2135 while (fgets(line
, LINESIZE
, file
)) {
2136 ret
= sscanf(line
, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2140 if (!utf32valid(unichar
))
2141 line_fail(data_name
, line
);
2144 /* skip over <tag> */
2147 while (*++s
!= '>');
2149 if(ignore_compatibility_form(type
))
2152 /* decode the decomposition into UTF-32 */
2155 mapping
[i
] = strtoul(s
, &s
, 16);
2156 if (!utf32valid(mapping
[i
]))
2157 line_fail(data_name
, line
);
2162 um
= malloc(i
* sizeof(unsigned int));
2163 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2164 unicode_data
[unichar
].utf32nfdi
= um
;
2167 print_utf32nfdi(unichar
);
2172 printf("Found %d entries\n", count
);
2174 file_fail(data_name
);
2177 static void nfdicf_init(void)
2180 unsigned int unichar
;
2181 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2190 printf("Parsing %s\n", fold_name
);
2191 file
= fopen(fold_name
, "r");
2193 open_fail(fold_name
, errno
);
2196 while (fgets(line
, LINESIZE
, file
)) {
2197 ret
= sscanf(line
, "%X; %c; %[^;];", &unichar
, &status
, buf0
);
2200 if (!utf32valid(unichar
))
2201 line_fail(fold_name
, line
);
2202 /* Use the C+F casefold. */
2203 if (status
!= 'C' && status
!= 'F')
2211 mapping
[i
] = strtoul(s
, &s
, 16);
2212 if (!utf32valid(mapping
[i
]))
2213 line_fail(fold_name
, line
);
2218 um
= malloc(i
* sizeof(unsigned int));
2219 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2220 unicode_data
[unichar
].utf32nfdicf
= um
;
2223 print_utf32nfdicf(unichar
);
2228 printf("Found %d entries\n", count
);
2230 file_fail(fold_name
);
2233 static void corrections_init(void)
2236 unsigned int unichar
;
2239 unsigned int revision
;
2242 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2249 printf("Parsing %s\n", norm_name
);
2250 file
= fopen(norm_name
, "r");
2252 open_fail(norm_name
, errno
);
2255 while (fgets(line
, LINESIZE
, file
)) {
2256 ret
= sscanf(line
, "%X;%[^;];%[^;];%d.%d.%d #",
2257 &unichar
, buf0
, buf1
,
2258 &major
, &minor
, &revision
);
2261 if (!utf32valid(unichar
) || !age_valid(major
, minor
, revision
))
2262 line_fail(norm_name
, line
);
2265 corrections
= calloc(count
, sizeof(struct unicode_data
));
2266 corrections_count
= count
;
2270 while (fgets(line
, LINESIZE
, file
)) {
2271 ret
= sscanf(line
, "%X;%[^;];%[^;];%d.%d.%d #",
2272 &unichar
, buf0
, buf1
,
2273 &major
, &minor
, &revision
);
2276 if (!utf32valid(unichar
) || !age_valid(major
, minor
, revision
))
2277 line_fail(norm_name
, line
);
2278 corrections
[count
] = unicode_data
[unichar
];
2279 assert(corrections
[count
].code
== unichar
);
2280 age
= UNICODE_AGE(major
, minor
, revision
);
2281 corrections
[count
].correction
= age
;
2286 mapping
[i
] = strtoul(s
, &s
, 16);
2287 if (!utf32valid(mapping
[i
]))
2288 line_fail(norm_name
, line
);
2293 um
= malloc(i
* sizeof(unsigned int));
2294 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2295 corrections
[count
].utf32nfdi
= um
;
2298 printf(" %X -> %s -> %s V%d_%d_%d\n",
2299 unichar
, buf0
, buf1
, major
, minor
, revision
);
2305 printf("Found %d entries\n", count
);
2307 file_fail(norm_name
);
2310 /* ------------------------------------------------------------------ */
2313 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2315 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2316 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2325 * NCount = 588 (VCount * TCount)
2326 * SCount = 11172 (LCount * NCount)
2329 * SIndex = s - SBase
2331 * LV (Canonical/Full)
2332 * LIndex = SIndex / NCount
2333 * VIndex = (Sindex % NCount) / TCount
2334 * LPart = LBase + LIndex
2335 * VPart = VBase + VIndex
2338 * LVIndex = (SIndex / TCount) * TCount
2339 * TIndex = (Sindex % TCount)
2340 * LVPart = SBase + LVIndex
2341 * TPart = TBase + TIndex
2344 * LIndex = SIndex / NCount
2345 * VIndex = (Sindex % NCount) / TCount
2346 * TIndex = (Sindex % TCount)
2347 * LPart = LBase + LIndex
2348 * VPart = VBase + VIndex
2349 * if (TIndex == 0) {
2350 * d = <LPart, VPart>
2352 * TPart = TBase + TIndex
2353 * d = <LPart, VPart, TPart>
2358 static void hangul_decompose(void)
2360 unsigned int sb
= 0xAC00;
2361 unsigned int lb
= 0x1100;
2362 unsigned int vb
= 0x1161;
2363 unsigned int tb
= 0x11a7;
2364 /* unsigned int lc = 19; */
2365 unsigned int vc
= 21;
2366 unsigned int tc
= 28;
2367 unsigned int nc
= (vc
* tc
);
2368 /* unsigned int sc = (lc * nc); */
2369 unsigned int unichar
;
2370 unsigned int mapping
[4];
2376 printf("Decomposing hangul\n");
2379 for (unichar
= 0xAC00; unichar
<= 0xD7A3; unichar
++) {
2380 unsigned int si
= unichar
- sb
;
2381 unsigned int li
= si
/ nc
;
2382 unsigned int vi
= (si
% nc
) / tc
;
2383 unsigned int ti
= si
% tc
;
2386 mapping
[i
++] = lb
+ li
;
2387 mapping
[i
++] = vb
+ vi
;
2389 mapping
[i
++] = tb
+ ti
;
2392 assert(!unicode_data
[unichar
].utf32nfdi
);
2393 um
= malloc(i
* sizeof(unsigned int));
2394 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2395 unicode_data
[unichar
].utf32nfdi
= um
;
2397 assert(!unicode_data
[unichar
].utf32nfdicf
);
2398 um
= malloc(i
* sizeof(unsigned int));
2399 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2400 unicode_data
[unichar
].utf32nfdicf
= um
;
2403 * Add a cookie as a reminder that the hangul syllable
2404 * decompositions must not be stored in the generated
2407 unicode_data
[unichar
].utf8nfdi
= malloc(2);
2408 unicode_data
[unichar
].utf8nfdi
[0] = HANGUL
;
2409 unicode_data
[unichar
].utf8nfdi
[1] = '\0';
2412 print_utf32nfdi(unichar
);
2417 printf("Created %d entries\n", count
);
2420 static void nfdi_decompose(void)
2422 unsigned int unichar
;
2423 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2432 printf("Decomposing nfdi\n");
2435 for (unichar
= 0; unichar
!= 0x110000; unichar
++) {
2436 if (!unicode_data
[unichar
].utf32nfdi
)
2441 um
= unicode_data
[unichar
].utf32nfdi
;
2443 dc
= unicode_data
[*um
].utf32nfdi
;
2445 for (j
= 0; dc
[j
]; j
++)
2446 mapping
[i
++] = dc
[j
];
2456 free(unicode_data
[unichar
].utf32nfdi
);
2457 um
= malloc(i
* sizeof(unsigned int));
2458 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2459 unicode_data
[unichar
].utf32nfdi
= um
;
2461 /* Add this decomposition to nfdicf if there is no entry. */
2462 if (!unicode_data
[unichar
].utf32nfdicf
) {
2463 um
= malloc(i
* sizeof(unsigned int));
2464 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2465 unicode_data
[unichar
].utf32nfdicf
= um
;
2468 print_utf32nfdi(unichar
);
2472 printf("Processed %d entries\n", count
);
2475 static void nfdicf_decompose(void)
2477 unsigned int unichar
;
2478 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2487 printf("Decomposing nfdicf\n");
2489 for (unichar
= 0; unichar
!= 0x110000; unichar
++) {
2490 if (!unicode_data
[unichar
].utf32nfdicf
)
2495 um
= unicode_data
[unichar
].utf32nfdicf
;
2497 dc
= unicode_data
[*um
].utf32nfdicf
;
2499 for (j
= 0; dc
[j
]; j
++)
2500 mapping
[i
++] = dc
[j
];
2510 free(unicode_data
[unichar
].utf32nfdicf
);
2511 um
= malloc(i
* sizeof(unsigned int));
2512 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2513 unicode_data
[unichar
].utf32nfdicf
= um
;
2516 print_utf32nfdicf(unichar
);
2520 printf("Processed %d entries\n", count
);
2523 /* ------------------------------------------------------------------ */
2525 int utf8agemax(struct tree
*, const char *);
2526 int utf8nagemax(struct tree
*, const char *, size_t);
2527 int utf8agemin(struct tree
*, const char *);
2528 int utf8nagemin(struct tree
*, const char *, size_t);
2529 ssize_t
utf8len(struct tree
*, const char *);
2530 ssize_t
utf8nlen(struct tree
*, const char *, size_t);
2532 int utf8cursor(struct utf8cursor
*, struct tree
*, const char *);
2533 int utf8ncursor(struct utf8cursor
*, struct tree
*, const char *, size_t);
2534 int utf8byte(struct utf8cursor
*);
2537 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2539 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2540 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2549 * NCount = 588 (VCount * TCount)
2550 * SCount = 11172 (LCount * NCount)
2553 * SIndex = s - SBase
2555 * LV (Canonical/Full)
2556 * LIndex = SIndex / NCount
2557 * VIndex = (Sindex % NCount) / TCount
2558 * LPart = LBase + LIndex
2559 * VPart = VBase + VIndex
2562 * LVIndex = (SIndex / TCount) * TCount
2563 * TIndex = (Sindex % TCount)
2564 * LVPart = SBase + LVIndex
2565 * TPart = TBase + TIndex
2568 * LIndex = SIndex / NCount
2569 * VIndex = (Sindex % NCount) / TCount
2570 * TIndex = (Sindex % TCount)
2571 * LPart = LBase + LIndex
2572 * VPart = VBase + VIndex
2573 * if (TIndex == 0) {
2574 * d = <LPart, VPart>
2576 * TPart = TBase + TIndex
2577 * d = <LPart, VPart, TPart>
2589 #define NC (VC * TC)
2590 #define SC (LC * NC)
2592 /* Algorithmic decomposition of hangul syllable. */
2593 static utf8leaf_t
*utf8hangul(const char *str
, unsigned char *hangul
)
2601 /* Calculate the SI, LI, VI, and TI values. */
2602 si
= utf8decode(str
) - SB
;
2604 vi
= (si
% NC
) / TC
;
2607 /* Fill in base of leaf. */
2610 LEAF_CCC(h
) = DECOMPOSE
;
2613 /* Add LPart, a 3-byte UTF-8 sequence. */
2614 h
+= utf8encode((char *)h
, li
+ LB
);
2616 /* Add VPart, a 3-byte UTF-8 sequence. */
2617 h
+= utf8encode((char *)h
, vi
+ VB
);
2619 /* Add TPart if required, also a 3-byte UTF-8 sequence. */
2621 h
+= utf8encode((char *)h
, ti
+ TB
);
2623 /* Terminate string. */
2630 * Use trie to scan s, touching at most len bytes.
2631 * Returns the leaf if one exists, NULL otherwise.
2633 * A non-NULL return guarantees that the UTF-8 sequence starting at s
2634 * is well-formed and corresponds to a known unicode code point. The
2635 * shorthand for this will be "is valid UTF-8 unicode".
2637 static utf8leaf_t
*utf8nlookup(struct tree
*tree
, unsigned char *hangul
,
2638 const char *s
, size_t len
)
2651 trie
= utf8data
+ tree
->index
;
2653 offlen
= (*trie
& OFFLEN
) >> OFFLEN_SHIFT
;
2654 if (*trie
& NEXTBYTE
) {
2659 mask
= 1 << (*trie
& BITNUM
);
2663 /* Right node at offset of trie */
2664 node
= (*trie
& RIGHTNODE
);
2665 offset
= trie
[offlen
];
2668 offset
|= trie
[offlen
];
2671 } else if (*trie
& RIGHTPATH
) {
2672 /* Right node after this node */
2673 node
= (*trie
& TRIENODE
);
2676 /* No right node. */
2682 /* Left node after this node. */
2683 node
= (*trie
& LEFTNODE
);
2685 } else if (*trie
& RIGHTPATH
) {
2689 /* Left node after this node */
2690 node
= (*trie
& TRIENODE
);
2696 * Hangul decomposition is done algorithmically. These are the
2697 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2698 * always 3 bytes long, so s has been advanced twice, and the
2699 * start of the sequence is at s-2.
2701 if (LEAF_CCC(trie
) == DECOMPOSE
&& LEAF_STR(trie
)[0] == HANGUL
)
2702 trie
= utf8hangul(s
- 2, hangul
);
2707 * Use trie to scan s.
2708 * Returns the leaf if one exists, NULL otherwise.
2710 * Forwards to trie_nlookup().
2712 static utf8leaf_t
*utf8lookup(struct tree
*tree
, unsigned char *hangul
,
2715 return utf8nlookup(tree
, hangul
, s
, (size_t)-1);
2719 * Return the number of bytes used by the current UTF-8 sequence.
2720 * Assumes the input points to the first byte of a valid UTF-8
2723 static inline int utf8clen(const char *s
)
2725 unsigned char c
= *s
;
2726 return 1 + (c
>= 0xC0) + (c
>= 0xE0) + (c
>= 0xF0);
2730 * Maximum age of any character in s.
2731 * Return -1 if s is not valid UTF-8 unicode.
2732 * Return 0 if only non-assigned code points are used.
2734 int utf8agemax(struct tree
*tree
, const char *s
)
2739 unsigned char hangul
[UTF8HANGULLEAF
];
2745 leaf
= utf8lookup(tree
, hangul
, s
);
2748 leaf_age
= ages
[LEAF_GEN(leaf
)];
2749 if (leaf_age
<= tree
->maxage
&& leaf_age
> age
)
2757 * Minimum age of any character in s.
2758 * Return -1 if s is not valid UTF-8 unicode.
2759 * Return 0 if non-assigned code points are used.
2761 int utf8agemin(struct tree
*tree
, const char *s
)
2766 unsigned char hangul
[UTF8HANGULLEAF
];
2772 leaf
= utf8lookup(tree
, hangul
, s
);
2775 leaf_age
= ages
[LEAF_GEN(leaf
)];
2776 if (leaf_age
<= tree
->maxage
&& leaf_age
< age
)
2784 * Maximum age of any character in s, touch at most len bytes.
2785 * Return -1 if s is not valid UTF-8 unicode.
2787 int utf8nagemax(struct tree
*tree
, const char *s
, size_t len
)
2792 unsigned char hangul
[UTF8HANGULLEAF
];
2798 leaf
= utf8nlookup(tree
, hangul
, s
, len
);
2801 leaf_age
= ages
[LEAF_GEN(leaf
)];
2802 if (leaf_age
<= tree
->maxage
&& leaf_age
> age
)
2811 * Maximum age of any character in s, touch at most len bytes.
2812 * Return -1 if s is not valid UTF-8 unicode.
2814 int utf8nagemin(struct tree
*tree
, const char *s
, size_t len
)
2819 unsigned char hangul
[UTF8HANGULLEAF
];
2825 leaf
= utf8nlookup(tree
, hangul
, s
, len
);
2828 leaf_age
= ages
[LEAF_GEN(leaf
)];
2829 if (leaf_age
<= tree
->maxage
&& leaf_age
< age
)
2838 * Length of the normalization of s.
2839 * Return -1 if s is not valid UTF-8 unicode.
2841 * A string of Default_Ignorable_Code_Point has length 0.
2843 ssize_t
utf8len(struct tree
*tree
, const char *s
)
2847 unsigned char hangul
[UTF8HANGULLEAF
];
2852 leaf
= utf8lookup(tree
, hangul
, s
);
2855 if (ages
[LEAF_GEN(leaf
)] > tree
->maxage
)
2857 else if (LEAF_CCC(leaf
) == DECOMPOSE
)
2858 ret
+= strlen(LEAF_STR(leaf
));
2867 * Length of the normalization of s, touch at most len bytes.
2868 * Return -1 if s is not valid UTF-8 unicode.
2870 ssize_t
utf8nlen(struct tree
*tree
, const char *s
, size_t len
)
2874 unsigned char hangul
[UTF8HANGULLEAF
];
2879 leaf
= utf8nlookup(tree
, hangul
, s
, len
);
2882 if (ages
[LEAF_GEN(leaf
)] > tree
->maxage
)
2884 else if (LEAF_CCC(leaf
) == DECOMPOSE
)
2885 ret
+= strlen(LEAF_STR(leaf
));
2895 * Cursor structure used by the normalizer.
2907 unsigned int unichar
;
2908 unsigned char hangul
[UTF8HANGULLEAF
];
2912 * Set up an utf8cursor for use by utf8byte().
2915 * len : length of s.
2916 * u8c : pointer to cursor.
2917 * trie : utf8trie_t to use for normalization.
2919 * Returns -1 on error, 0 on success.
2921 int utf8ncursor(struct utf8cursor
*u8c
, struct tree
*tree
, const char *s
,
2936 u8c
->nccc
= STOPPER
;
2938 /* Check we didn't clobber the maximum length. */
2939 if (u8c
->len
!= len
)
2941 /* The first byte of s may not be an utf8 continuation. */
2942 if (len
> 0 && (*s
& 0xC0) == 0x80)
2948 * Set up an utf8cursor for use by utf8byte().
2950 * s : NUL-terminated string.
2951 * u8c : pointer to cursor.
2952 * trie : utf8trie_t to use for normalization.
2954 * Returns -1 on error, 0 on success.
2956 int utf8cursor(struct utf8cursor
*u8c
, struct tree
*tree
, const char *s
)
2958 return utf8ncursor(u8c
, tree
, s
, (unsigned int)-1);
2962 * Get one byte from the normalized form of the string described by u8c.
2964 * Returns the byte cast to an unsigned char on succes, and -1 on failure.
2966 * The cursor keeps track of the location in the string in u8c->s.
2967 * When a character is decomposed, the current location is stored in
2968 * u8c->p, and u8c->s is set to the start of the decomposition. Note
2969 * that bytes from a decomposition do not count against u8c->len.
2971 * Characters are emitted if they match the current CCC in u8c->ccc.
2972 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
2973 * and the function returns 0 in that case.
2975 * Sorting by CCC is done by repeatedly scanning the string. The
2976 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
2977 * the start of the scan. The first pass finds the lowest CCC to be
2978 * emitted and stores it in u8c->nccc, the second pass emits the
2979 * characters with this CCC and finds the next lowest CCC. This limits
2980 * the number of passes to 1 + the number of different CCCs in the
2981 * sequence being scanned.
2984 * u8c->p != NULL -> a decomposition is being scanned.
2985 * u8c->ss != NULL -> this is a repeating scan.
2986 * u8c->ccc == -1 -> this is the first scan of a repeating scan.
2988 int utf8byte(struct utf8cursor
*u8c
)
2994 /* Check for the end of a decomposed character. */
2995 if (u8c
->p
&& *u8c
->s
== '\0') {
3000 /* Check for end-of-string. */
3001 if (!u8c
->p
&& (u8c
->len
== 0 || *u8c
->s
== '\0')) {
3002 /* There is no next byte. */
3003 if (u8c
->ccc
== STOPPER
)
3005 /* End-of-string during a scan counts as a stopper. */
3008 } else if ((*u8c
->s
& 0xC0) == 0x80) {
3009 /* This is a continuation of the current character. */
3012 return (unsigned char)*u8c
->s
++;
3015 /* Look up the data for the current character. */
3017 leaf
= utf8lookup(u8c
->tree
, u8c
->hangul
, u8c
->s
);
3019 leaf
= utf8nlookup(u8c
->tree
, u8c
->hangul
,
3023 /* No leaf found implies that the input is a binary blob. */
3027 /* Characters that are too new have CCC 0. */
3028 if (ages
[LEAF_GEN(leaf
)] > u8c
->tree
->maxage
) {
3030 } else if ((ccc
= LEAF_CCC(leaf
)) == DECOMPOSE
) {
3031 u8c
->len
-= utf8clen(u8c
->s
);
3032 u8c
->p
= u8c
->s
+ utf8clen(u8c
->s
);
3033 u8c
->s
= LEAF_STR(leaf
);
3034 /* Empty decomposition implies CCC 0. */
3035 if (*u8c
->s
== '\0') {
3036 if (u8c
->ccc
== STOPPER
)
3041 leaf
= utf8lookup(u8c
->tree
, u8c
->hangul
, u8c
->s
);
3042 ccc
= LEAF_CCC(leaf
);
3044 u8c
->unichar
= utf8decode(u8c
->s
);
3047 * If this is not a stopper, then see if it updates
3048 * the next canonical class to be emitted.
3050 if (ccc
!= STOPPER
&& u8c
->ccc
< ccc
&& ccc
< u8c
->nccc
)
3054 * Return the current byte if this is the current
3057 if (ccc
== u8c
->ccc
) {
3060 return (unsigned char)*u8c
->s
++;
3063 /* Current combining class mismatch. */
3065 if (u8c
->nccc
== STOPPER
) {
3067 * Scan forward for the first canonical class
3068 * to be emitted. Save the position from
3071 assert(u8c
->ccc
== STOPPER
);
3072 u8c
->ccc
= MINCCC
- 1;
3076 u8c
->slen
= u8c
->len
;
3078 u8c
->len
-= utf8clen(u8c
->s
);
3079 u8c
->s
+= utf8clen(u8c
->s
);
3080 } else if (ccc
!= STOPPER
) {
3081 /* Not a stopper, and not the ccc we're emitting. */
3083 u8c
->len
-= utf8clen(u8c
->s
);
3084 u8c
->s
+= utf8clen(u8c
->s
);
3085 } else if (u8c
->nccc
!= MAXCCC
+ 1) {
3086 /* At a stopper, restart for next ccc. */
3087 u8c
->ccc
= u8c
->nccc
;
3088 u8c
->nccc
= MAXCCC
+ 1;
3091 u8c
->len
= u8c
->slen
;
3093 /* All done, proceed from here. */
3095 u8c
->nccc
= STOPPER
;
3103 /* ------------------------------------------------------------------ */
3105 static int normalize_line(struct tree
*tree
)
3110 struct utf8cursor u8c
;
3112 /* First test: null-terminated string. */
3115 if (utf8cursor(&u8c
, tree
, s
))
3117 while ((c
= utf8byte(&u8c
)) > 0)
3118 if (c
!= (unsigned char)*t
++)
3125 /* Second test: length-limited string. */
3127 /* Replace NUL with a value that will cause an error if seen. */
3128 s
[strlen(s
) + 1] = -1;
3130 if (utf8cursor(&u8c
, tree
, s
))
3132 while ((c
= utf8byte(&u8c
)) > 0)
3133 if (c
!= (unsigned char)*t
++)
3143 static void normalization_test(void)
3146 unsigned int unichar
;
3147 struct unicode_data
*data
;
3156 printf("Parsing %s\n", test_name
);
3157 /* Step one, read data from file. */
3158 file
= fopen(test_name
, "r");
3160 open_fail(test_name
, errno
);
3162 while (fgets(line
, LINESIZE
, file
)) {
3163 ret
= sscanf(line
, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3165 if (ret
!= 2 || *line
== '#')
3170 unichar
= strtoul(s
, &s
, 16);
3171 t
+= utf8encode(t
, unichar
);
3179 unichar
= strtoul(s
, &s
, 16);
3180 data
= &unicode_data
[unichar
];
3181 if (data
->utf8nfdi
&& !*data
->utf8nfdi
)
3184 t
+= utf8encode(t
, unichar
);
3189 if (normalize_line(nfdi_tree
) < 0) {
3190 printf("Line %s -> %s", buf0
, buf1
);
3192 printf(" (ignorables removed)");
3193 printf(" failure\n");
3199 printf("Ran %d tests with %d failures\n", tests
, failures
);
3201 file_fail(test_name
);
3204 /* ------------------------------------------------------------------ */
3206 static void write_file(void)
3215 printf("Writing %s\n", utf8_name
);
3216 file
= fopen(utf8_name
, "w");
3218 open_fail(utf8_name
, errno
);
3220 fprintf(file
, "/* This file is generated code, do not edit. */\n");
3221 fprintf(file
, "\n");
3222 fprintf(file
, "#include <linux/module.h>\n");
3223 fprintf(file
, "#include <linux/kernel.h>\n");
3224 fprintf(file
, "#include \"utf8n.h\"\n");
3225 fprintf(file
, "\n");
3226 fprintf(file
, "static const unsigned int utf8agetab[] = {\n");
3227 for (i
= 0; i
!= ages_count
; i
++)
3228 fprintf(file
, "\t%#x%s\n", ages
[i
],
3229 ages
[i
] == unicode_maxage
? "" : ",");
3230 fprintf(file
, "};\n");
3231 fprintf(file
, "\n");
3232 fprintf(file
, "static const struct utf8data utf8nfdicfdata[] = {\n");
3234 for (gen
= 0; gen
< ages_count
; gen
++) {
3235 fprintf(file
, "\t{ %#x, %d }%s\n",
3236 ages
[gen
], trees
[t
].index
,
3237 ages
[gen
] == unicode_maxage
? "" : ",");
3238 if (trees
[t
].maxage
== ages
[gen
])
3241 fprintf(file
, "};\n");
3242 fprintf(file
, "\n");
3243 fprintf(file
, "static const struct utf8data utf8nfdidata[] = {\n");
3245 for (gen
= 0; gen
< ages_count
; gen
++) {
3246 fprintf(file
, "\t{ %#x, %d }%s\n",
3247 ages
[gen
], trees
[t
].index
,
3248 ages
[gen
] == unicode_maxage
? "" : ",");
3249 if (trees
[t
].maxage
== ages
[gen
])
3252 fprintf(file
, "};\n");
3253 fprintf(file
, "\n");
3254 fprintf(file
, "static const unsigned char utf8data[%zd] = {\n",
3257 for (i
= 0; i
!= utf8data_size
; i
+= 16) {
3258 if (i
== trees
[t
].index
) {
3259 fprintf(file
, "\t/* %s_%x */\n",
3260 trees
[t
].type
, trees
[t
].maxage
);
3261 if (t
< trees_count
-1)
3264 fprintf(file
, "\t");
3265 for (j
= i
; j
!= i
+ 16; j
++)
3266 fprintf(file
, "0x%.2x%s", utf8data
[j
],
3267 (j
< utf8data_size
-1 ? "," : ""));
3268 fprintf(file
, "\n");
3270 fprintf(file
, "};\n");
3271 fprintf(file
, "\n");
3272 fprintf(file
, "const struct utf8data_table utf8_data_table = {\n");
3273 fprintf(file
, "\t.utf8agetab = utf8agetab,\n");
3274 fprintf(file
, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
3275 fprintf(file
, "\n");
3276 fprintf(file
, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
3277 fprintf(file
, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
3278 fprintf(file
, "\n");
3279 fprintf(file
, "\t.utf8nfdidata = utf8nfdidata,\n");
3280 fprintf(file
, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
3281 fprintf(file
, "\n");
3282 fprintf(file
, "\t.utf8data = utf8data,\n");
3283 fprintf(file
, "};\n");
3284 fprintf(file
, "EXPORT_SYMBOL_GPL(utf8_data_table);");
3285 fprintf(file
, "\n");
3286 fprintf(file
, "MODULE_DESCRIPTION(\"UTF8 data table\");\n");
3287 fprintf(file
, "MODULE_LICENSE(\"GPL v2\");\n");
3291 /* ------------------------------------------------------------------ */
3293 int main(int argc
, char *argv
[])
3295 unsigned int unichar
;
3300 while ((opt
= getopt(argc
, argv
, "a:c:d:f:hn:o:p:t:v")) != -1) {
3339 for (unichar
= 0; unichar
!= 0x110000; unichar
++)
3340 unicode_data
[unichar
].code
= unichar
;
3354 /* Prevent "unused function" warning. */
3355 (void)lookup(nfdi_tree
, " ");
3357 tree_walk(nfdi_tree
);
3359 tree_walk(nfdicf_tree
);
3360 normalization_test();