fs/unicode/mkutf8data.c

   1 /*
   2  * Copyright (c) 2014 SGI.
   3  * All rights reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 /* Generator for a compact trie for unicode normalization */
  20
  21 #include <sys/types.h>
  22 #include <stddef.h>
  23 #include <stdlib.h>
  24 #include <stdio.h>
  25 #include <assert.h>
  26 #include <string.h>
  27 #include <unistd.h>
  28 #include <errno.h>
  29
  30 /* Default names of the in- and output files. */
  31
  32 #define AGE_NAME        "DerivedAge.txt"
  33 #define CCC_NAME        "DerivedCombiningClass.txt"
  34 #define PROP_NAME       "DerivedCoreProperties.txt"
  35 #define DATA_NAME       "UnicodeData.txt"
  36 #define FOLD_NAME       "CaseFolding.txt"
  37 #define NORM_NAME       "NormalizationCorrections.txt"
  38 #define TEST_NAME       "NormalizationTest.txt"
  39 #define UTF8_NAME       "utf8data.h"
  40
  41 const char      *age_name  = AGE_NAME;
  42 const char      *ccc_name  = CCC_NAME;
  43 const char      *prop_name = PROP_NAME;
  44 const char      *data_name = DATA_NAME;
  45 const char      *fold_name = FOLD_NAME;
  46 const char      *norm_name = NORM_NAME;
  47 const char      *test_name = TEST_NAME;
  48 const char      *utf8_name = UTF8_NAME;
  49
  50 int verbose = 0;
  51
  52 /* An arbitrary line size limit on input lines. */
  53
  54 #define LINESIZE        1024
  55 char line[LINESIZE];
  56 char buf0[LINESIZE];
  57 char buf1[LINESIZE];
  58 char buf2[LINESIZE];
  59 char buf3[LINESIZE];
  60
  61 const char *argv0;
  62
  63 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
  64
  65 /* ------------------------------------------------------------------ */
  66
  67 /*
  68  * Unicode version numbers consist of three parts: major, minor, and a
  69  * revision.  These numbers are packed into an unsigned int to obtain
  70  * a single version number.
  71  *
  72  * To save space in the generated trie, the unicode version is not
  73  * stored directly, instead we calculate a generation number from the
  74  * unicode versions seen in the DerivedAge file, and use that as an
  75  * index into a table of unicode versions.
  76  */
  77 #define UNICODE_MAJ_SHIFT               (16)
  78 #define UNICODE_MIN_SHIFT               (8)
  79
  80 #define UNICODE_MAJ_MAX                 ((unsigned short)-1)
  81 #define UNICODE_MIN_MAX                 ((unsigned char)-1)
  82 #define UNICODE_REV_MAX                 ((unsigned char)-1)
  83
  84 #define UNICODE_AGE(MAJ,MIN,REV)                        \
  85         (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |   \
  86          ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |   \
  87          ((unsigned int)(REV)))
  88
  89 unsigned int *ages;
  90 int ages_count;
  91
  92 unsigned int unicode_maxage;
  93
  94 static int age_valid(unsigned int major, unsigned int minor,
  95                      unsigned int revision)
  96 {
  97         if (major > UNICODE_MAJ_MAX)
  98                 return 0;
  99         if (minor > UNICODE_MIN_MAX)
 100                 return 0;
 101         if (revision > UNICODE_REV_MAX)
 102                 return 0;
 103         return 1;
 104 }
 105
 106 /* ------------------------------------------------------------------ */
 107
 108 /*
 109  * utf8trie_t
 110  *
 111  * A compact binary tree, used to decode UTF-8 characters.
 112  *
 113  * Internal nodes are one byte for the node itself, and up to three
 114  * bytes for an offset into the tree.  The first byte contains the
 115  * following information:
 116  *  NEXTBYTE  - flag        - advance to next byte if set
  117  *  BITNUM    - 3 bit field - the bit number to be tested
 118  *  OFFLEN    - 2 bit field - number of bytes in the offset
 119  * if offlen == 0 (non-branching node)
 120  *  RIGHTPATH - 1 bit field - set if the following node is for the
 121  *                            right-hand path (tested bit is set)
 122  *  TRIENODE  - 1 bit field - set if the following node is an internal
 123  *                            node, otherwise it is a leaf node
 124  * if offlen != 0 (branching node)
 125  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 126  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 127  *
 128  * Due to the way utf8 works, there cannot be branching nodes with
 129  * NEXTBYTE set, and moreover those nodes always have a righthand
 130  * descendant.
 131  */
 132 typedef unsigned char utf8trie_t;
 133 #define BITNUM          0x07
 134 #define NEXTBYTE        0x08
 135 #define OFFLEN          0x30
 136 #define OFFLEN_SHIFT    4
 137 #define RIGHTPATH       0x40
 138 #define TRIENODE        0x80
 139 #define RIGHTNODE       0x40
 140 #define LEFTNODE        0x80
 141
 142 /*
 143  * utf8leaf_t
 144  *
 145  * The leaves of the trie are embedded in the trie, and so the same
 146  * underlying datatype, unsigned char.
 147  *
 148  * leaf[0]: The unicode version, stored as a generation number that is
 149  *          an index into utf8agetab[].  With this we can filter code
 150  *          points based on the unicode version in which they were
 151  *          defined.  The CCC of a non-defined code point is 0.
 152  * leaf[1]: Canonical Combining Class. During normalization, we need
 153  *          to do a stable sort into ascending order of all characters
 154  *          with a non-zero CCC that occur between two characters with
  155  *          a CCC of 0, or at the beginning or end of a string.
 156  *          The unicode standard guarantees that all CCC values are
 157  *          between 0 and 254 inclusive, which leaves 255 available as
 158  *          a special value.
 159  *          Code points with CCC 0 are known as stoppers.
 160  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
 161  *          start of a NUL-terminated string that is the decomposition
 162  *          of the character.
 163  *          The CCC of a decomposable character is the same as the CCC
 164  *          of the first character of its decomposition.
 165  *          Some characters decompose as the empty string: these are
 166  *          characters with the Default_Ignorable_Code_Point property.
 167  *          These do affect normalization, as they all have CCC 0.
 168  *
 169  * The decompositions in the trie have been fully expanded.
 170  *
 171  * Casefolding, if applicable, is also done using decompositions.
 172  */
 173 typedef unsigned char utf8leaf_t;
 174
 175 #define LEAF_GEN(LEAF)  ((LEAF)[0])
 176 #define LEAF_CCC(LEAF)  ((LEAF)[1])
 177 #define LEAF_STR(LEAF)  ((const char*)((LEAF) + 2))
 178
 179 #define MAXGEN          (255)
 180
 181 #define MINCCC          (0)
 182 #define MAXCCC          (254)
 183 #define STOPPER         (0)
 184 #define DECOMPOSE       (255)
 185 #define HANGUL          ((char)(255))
 186
 187 #define UTF8HANGULLEAF  (12)
 188
 189 struct tree;
 190 static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
 191                                const char *, size_t);
 192 static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
 193
 194 unsigned char *utf8data;
 195 size_t utf8data_size;
 196
 197 utf8trie_t *nfdi;
 198 utf8trie_t *nfdicf;
 199
 200 /* ------------------------------------------------------------------ */
 201
 202 /*
 203  * UTF8 valid ranges.
 204  *
 205  * The UTF-8 encoding spreads the bits of a 32bit word over several
 206  * bytes. This table gives the ranges that can be held and how they'd
 207  * be represented.
 208  *
 209  * 0x00000000 0x0000007F: 0xxxxxxx
 210  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
 211  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 212  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 213  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 214  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 215  *
 216  * There is an additional requirement on UTF-8, in that only the
 217  * shortest representation of a 32bit value is to be used.  A decoder
 218  * must not decode sequences that do not satisfy this requirement.
 219  * Thus the allowed ranges have a lower bound.
 220  *
 221  * 0x00000000 0x0000007F: 0xxxxxxx
 222  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
 223  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 224  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 225  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 226  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 227  *
 228  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
 229  * 17 planes of 65536 values.  This limits the sequences actually seen
 230  * even more, to just the following.
 231  *
 232  *          0 -     0x7f: 0                     0x7f
 233  *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
 234  *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
 235  *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
 236  *
 237  * Even within those ranges not all values are allowed: the surrogates
 238  * 0xd800 - 0xdfff should never be seen.
 239  *
 240  * Note that the longest sequence seen with valid usage is 4 bytes,
  241  * the same as a single UTF-32 character.  This makes the UTF-8
 242  * representation of Unicode strictly smaller than UTF-32.
 243  *
 244  * The shortest sequence requirement was introduced by:
 245  *    Corrigendum #1: UTF-8 Shortest Form
 246  * It can be found here:
 247  *    http://www.unicode.org/versions/corrigendum1.html
 248  *
 249  */
 250
 251 #define UTF8_2_BITS     0xC0
 252 #define UTF8_3_BITS     0xE0
 253 #define UTF8_4_BITS     0xF0
 254 #define UTF8_N_BITS     0x80
 255 #define UTF8_2_MASK     0xE0
 256 #define UTF8_3_MASK     0xF0
 257 #define UTF8_4_MASK     0xF8
 258 #define UTF8_N_MASK     0xC0
 259 #define UTF8_V_MASK     0x3F
 260 #define UTF8_V_SHIFT    6
 261
 262 static int utf8encode(char *str, unsigned int val)
 263 {
 264         int len;
 265
 266         if (val < 0x80) {
 267                 str[0] = val;
 268                 len = 1;
 269         } else if (val < 0x800) {
 270                 str[1] = val & UTF8_V_MASK;
 271                 str[1] |= UTF8_N_BITS;
 272                 val >>= UTF8_V_SHIFT;
 273                 str[0] = val;
 274                 str[0] |= UTF8_2_BITS;
 275                 len = 2;
 276         } else if (val < 0x10000) {
 277                 str[2] = val & UTF8_V_MASK;
 278                 str[2] |= UTF8_N_BITS;
 279                 val >>= UTF8_V_SHIFT;
 280                 str[1] = val & UTF8_V_MASK;
 281                 str[1] |= UTF8_N_BITS;
 282                 val >>= UTF8_V_SHIFT;
 283                 str[0] = val;
 284                 str[0] |= UTF8_3_BITS;
 285                 len = 3;
 286         } else if (val < 0x110000) {
 287                 str[3] = val & UTF8_V_MASK;
 288                 str[3] |= UTF8_N_BITS;
 289                 val >>= UTF8_V_SHIFT;
 290                 str[2] = val & UTF8_V_MASK;
 291                 str[2] |= UTF8_N_BITS;
 292                 val >>= UTF8_V_SHIFT;
 293                 str[1] = val & UTF8_V_MASK;
 294                 str[1] |= UTF8_N_BITS;
 295                 val >>= UTF8_V_SHIFT;
 296                 str[0] = val;
 297                 str[0] |= UTF8_4_BITS;
 298                 len = 4;
 299         } else {
 300                 printf("%#x: illegal val\n", val);
 301                 len = 0;
 302         }
 303         return len;
 304 }
 305
 306 static unsigned int utf8decode(const char *str)
 307 {
 308         const unsigned char *s = (const unsigned char*)str;
 309         unsigned int unichar = 0;
 310
 311         if (*s < 0x80) {
 312                 unichar = *s;
 313         } else if (*s < UTF8_3_BITS) {
 314                 unichar = *s++ & 0x1F;
 315                 unichar <<= UTF8_V_SHIFT;
 316                 unichar |= *s & 0x3F;
 317         } else if (*s < UTF8_4_BITS) {
 318                 unichar = *s++ & 0x0F;
 319                 unichar <<= UTF8_V_SHIFT;
 320                 unichar |= *s++ & 0x3F;
 321                 unichar <<= UTF8_V_SHIFT;
 322                 unichar |= *s & 0x3F;
 323         } else {
 324                 unichar = *s++ & 0x0F;
 325                 unichar <<= UTF8_V_SHIFT;
 326                 unichar |= *s++ & 0x3F;
 327                 unichar <<= UTF8_V_SHIFT;
 328                 unichar |= *s++ & 0x3F;
 329                 unichar <<= UTF8_V_SHIFT;
 330                 unichar |= *s & 0x3F;
 331         }
 332         return unichar;
 333 }
 334
 335 static int utf32valid(unsigned int unichar)
 336 {
 337         return unichar < 0x110000;
 338 }
 339
 340 #define HANGUL_SYLLABLE(U)      ((U) >= 0xAC00 && (U) <= 0xD7A3)
 341
 342 #define NODE 1
 343 #define LEAF 0
 344
 345 struct tree {
 346         void *root;
 347         int childnode;
 348         const char *type;
 349         unsigned int maxage;
 350         struct tree *next;
 351         int (*leaf_equal)(void *, void *);
 352         void (*leaf_print)(void *, int);
 353         int (*leaf_mark)(void *);
 354         int (*leaf_size)(void *);
 355         int *(*leaf_index)(struct tree *, void *);
 356         unsigned char *(*leaf_emit)(void *, unsigned char *);
 357         int leafindex[0x110000];
 358         int index;
 359 };
 360
 361 struct node {
 362         int index;
 363         int offset;
 364         int mark;
 365         int size;
 366         struct node *parent;
 367         void *left;
 368         void *right;
 369         unsigned char bitnum;
 370         unsigned char nextbyte;
 371         unsigned char leftnode;
 372         unsigned char rightnode;
 373         unsigned int keybits;
 374         unsigned int keymask;
 375 };
 376
 377 /*
 378  * Example lookup function for a tree.
 379  */
 380 static void *lookup(struct tree *tree, const char *key)
 381 {
 382         struct node *node;
 383         void *leaf = NULL;
 384
 385         node = tree->root;
 386         while (!leaf && node) {
 387                 if (node->nextbyte)
 388                         key++;
 389                 if (*key & (1 << (node->bitnum & 7))) {
 390                         /* Right leg */
 391                         if (node->rightnode == NODE) {
 392                                 node = node->right;
 393                         } else if (node->rightnode == LEAF) {
 394                                 leaf = node->right;
 395                         } else {
 396                                 node = NULL;
 397                         }
 398                 } else {
 399                         /* Left leg */
 400                         if (node->leftnode == NODE) {
 401                                 node = node->left;
 402                         } else if (node->leftnode == LEAF) {
 403                                 leaf = node->left;
 404                         } else {
 405                                 node = NULL;
 406                         }
 407                 }
 408         }
 409
 410         return leaf;
 411 }
 412
 413 /*
 414  * A simple non-recursive tree walker: keep track of visits to the
 415  * left and right branches in the leftmask and rightmask.
 416  */
 417 static void tree_walk(struct tree *tree)
 418 {
 419         struct node *node;
 420         unsigned int leftmask;
 421         unsigned int rightmask;
 422         unsigned int bitmask;
 423         int indent = 1;
 424         int nodes, singletons, leaves;
 425
 426         nodes = singletons = leaves = 0;
 427
 428         printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
 429         if (tree->childnode == LEAF) {
 430                 assert(tree->root);
 431                 tree->leaf_print(tree->root, indent);
 432                 leaves = 1;
 433         } else {
 434                 assert(tree->childnode == NODE);
 435                 node = tree->root;
 436                 leftmask = rightmask = 0;
 437                 while (node) {
 438                         printf("%*snode @ %p bitnum %d nextbyte %d"
 439                                " left %p right %p mask %x bits %x\n",
 440                                 indent, "", node,
 441                                 node->bitnum, node->nextbyte,
 442                                 node->left, node->right,
 443                                 node->keymask, node->keybits);
 444                         nodes += 1;
 445                         if (!(node->left && node->right))
 446                                 singletons += 1;
 447
 448                         while (node) {
 449                                 bitmask = 1 << node->bitnum;
 450                                 if ((leftmask & bitmask) == 0) {
 451                                         leftmask |= bitmask;
 452                                         if (node->leftnode == LEAF) {
 453                                                 assert(node->left);
 454                                                 tree->leaf_print(node->left,
 455                                                                  indent+1);
 456                                                 leaves += 1;
 457                                         } else if (node->left) {
 458                                                 assert(node->leftnode == NODE);
 459                                                 indent += 1;
 460                                                 node = node->left;
 461                                                 break;
 462                                         }
 463                                 }
 464                                 if ((rightmask & bitmask) == 0) {
 465                                         rightmask |= bitmask;
 466                                         if (node->rightnode == LEAF) {
 467                                                 assert(node->right);
 468                                                 tree->leaf_print(node->right,
 469                                                                  indent+1);
 470                                                 leaves += 1;
 471                                         } else if (node->right) {
 472                                                 assert(node->rightnode == NODE);
 473                                                 indent += 1;
 474                                                 node = node->right;
 475                                                 break;
 476                                         }
 477                                 }
 478                                 leftmask &= ~bitmask;
 479                                 rightmask &= ~bitmask;
 480                                 node = node->parent;
 481                                 indent -= 1;
 482                         }
 483                 }
 484         }
 485         printf("nodes %d leaves %d singletons %d\n",
 486                nodes, leaves, singletons);
 487 }
 488
 489 /*
 490  * Allocate an initialize a new internal node.
 491  */
 492 static struct node *alloc_node(struct node *parent)
 493 {
 494         struct node *node;
 495         int bitnum;
 496
 497         node = malloc(sizeof(*node));
 498         node->left = node->right = NULL;
 499         node->parent = parent;
 500         node->leftnode = NODE;
 501         node->rightnode = NODE;
 502         node->keybits = 0;
 503         node->keymask = 0;
 504         node->mark = 0;
 505         node->index = 0;
 506         node->offset = -1;
 507         node->size = 4;
 508
 509         if (node->parent) {
 510                 bitnum = parent->bitnum;
 511                 if ((bitnum & 7) == 0) {
 512                         node->bitnum = bitnum + 7 + 8;
 513                         node->nextbyte = 1;
 514                 } else {
 515                         node->bitnum = bitnum - 1;
 516                         node->nextbyte = 0;
 517                 }
 518         } else {
 519                 node->bitnum = 7;
 520                 node->nextbyte = 0;
 521         }
 522
 523         return node;
 524 }
 525
 526 /*
 527  * Insert a new leaf into the tree, and collapse any subtrees that are
 528  * fully populated and end in identical leaves. A nextbyte tagged
 529  * internal node will not be removed to preserve the tree's integrity.
 530  * Note that due to the structure of utf8, no nextbyte tagged node
 531  * will be a candidate for removal.
 532  */
 533 static int insert(struct tree *tree, char *key, int keylen, void *leaf)
 534 {
 535         struct node *node;
 536         struct node *parent;
 537         void **cursor;
 538         int keybits;
 539
 540         assert(keylen >= 1 && keylen <= 4);
 541
 542         node = NULL;
 543         cursor = &tree->root;
 544         keybits = 8 * keylen;
 545
 546         /* Insert, creating path along the way. */
 547         while (keybits) {
 548                 if (!*cursor)
 549                         *cursor = alloc_node(node);
 550                 node = *cursor;
 551                 if (node->nextbyte)
 552                         key++;
 553                 if (*key & (1 << (node->bitnum & 7)))
 554                         cursor = &node->right;
 555                 else
 556                         cursor = &node->left;
 557                 keybits--;
 558         }
 559         *cursor = leaf;
 560
 561         /* Merge subtrees if possible. */
 562         while (node) {
 563                 if (*key & (1 << (node->bitnum & 7)))
 564                         node->rightnode = LEAF;
 565                 else
 566                         node->leftnode = LEAF;
 567                 if (node->nextbyte)
 568                         break;
 569                 if (node->leftnode == NODE || node->rightnode == NODE)
 570                         break;
 571                 assert(node->left);
 572                 assert(node->right);
 573                 /* Compare */
 574                 if (! tree->leaf_equal(node->left, node->right))
 575                         break;
 576                 /* Keep left, drop right leaf. */
 577                 leaf = node->left;
 578                 /* Check in parent */
 579                 parent = node->parent;
 580                 if (!parent) {
 581                         /* root of tree! */
 582                         tree->root = leaf;
 583                         tree->childnode = LEAF;
 584                 } else if (parent->left == node) {
 585                         parent->left = leaf;
 586                         parent->leftnode = LEAF;
 587                         if (parent->right) {
 588                                 parent->keymask = 0;
 589                                 parent->keybits = 0;
 590                         } else {
 591                                 parent->keymask |= (1 << node->bitnum);
 592                         }
 593                 } else if (parent->right == node) {
 594                         parent->right = leaf;
 595                         parent->rightnode = LEAF;
 596                         if (parent->left) {
 597                                 parent->keymask = 0;
 598                                 parent->keybits = 0;
 599                         } else {
 600                                 parent->keymask |= (1 << node->bitnum);
 601                                 parent->keybits |= (1 << node->bitnum);
 602                         }
 603                 } else {
 604                         /* internal tree error */
 605                         assert(0);
 606                 }
 607                 free(node);
 608                 node = parent;
 609         }
 610
 611         /* Propagate keymasks up along singleton chains. */
 612         while (node) {
 613                 parent = node->parent;
 614                 if (!parent)
 615                         break;
 616                 /* Nix the mask for parents with two children. */
 617                 if (node->keymask == 0) {
 618                         parent->keymask = 0;
 619                         parent->keybits = 0;
 620                 } else if (parent->left && parent->right) {
 621                         parent->keymask = 0;
 622                         parent->keybits = 0;
 623                 } else {
 624                         assert((parent->keymask & node->keymask) == 0);
 625                         parent->keymask |= node->keymask;
 626                         parent->keymask |= (1 << parent->bitnum);
 627                         parent->keybits |= node->keybits;
 628                         if (parent->right)
 629                                 parent->keybits |= (1 << parent->bitnum);
 630                 }
 631                 node = parent;
 632         }
 633
 634         return 0;
 635 }
 636
 637 /*
 638  * Prune internal nodes.
 639  *
 640  * Fully populated subtrees that end at the same leaf have already
 641  * been collapsed.  There are still internal nodes that have for both
 642  * their left and right branches a sequence of singletons that make
 643  * identical choices and end in identical leaves.  The keymask and
 644  * keybits collected in the nodes describe the choices made in these
 645  * singleton chains.  When they are identical for the left and right
 646  * branch of a node, and the two leaves comare identical, the node in
 647  * question can be removed.
 648  *
 649  * Note that nodes with the nextbyte tag set will not be removed by
 650  * this to ensure tree integrity.  Note as well that the structure of
 651  * utf8 ensures that these nodes would not have been candidates for
 652  * removal in any case.
 653  */
 654 static void prune(struct tree *tree)
 655 {
 656         struct node *node;
 657         struct node *left;
 658         struct node *right;
 659         struct node *parent;
 660         void *leftleaf;
 661         void *rightleaf;
 662         unsigned int leftmask;
 663         unsigned int rightmask;
 664         unsigned int bitmask;
 665         int count;
 666
 667         if (verbose > 0)
 668                 printf("Pruning %s_%x\n", tree->type, tree->maxage);
 669
 670         count = 0;
 671         if (tree->childnode == LEAF)
 672                 return;
 673         if (!tree->root)
 674                 return;
 675
 676         leftmask = rightmask = 0;
 677         node = tree->root;
 678         while (node) {
 679                 if (node->nextbyte)
 680                         goto advance;
 681                 if (node->leftnode == LEAF)
 682                         goto advance;
 683                 if (node->rightnode == LEAF)
 684                         goto advance;
 685                 if (!node->left)
 686                         goto advance;
 687                 if (!node->right)
 688                         goto advance;
 689                 left = node->left;
 690                 right = node->right;
 691                 if (left->keymask == 0)
 692                         goto advance;
 693                 if (right->keymask == 0)
 694                         goto advance;
 695                 if (left->keymask != right->keymask)
 696                         goto advance;
 697                 if (left->keybits != right->keybits)
 698                         goto advance;
 699                 leftleaf = NULL;
 700                 while (!leftleaf) {
 701                         assert(left->left || left->right);
 702                         if (left->leftnode == LEAF)
 703                                 leftleaf = left->left;
 704                         else if (left->rightnode == LEAF)
 705                                 leftleaf = left->right;
 706                         else if (left->left)
 707                                 left = left->left;
 708                         else if (left->right)
 709                                 left = left->right;
 710                         else
 711                                 assert(0);
 712                 }
 713                 rightleaf = NULL;
 714                 while (!rightleaf) {
 715                         assert(right->left || right->right);
 716                         if (right->leftnode == LEAF)
 717                                 rightleaf = right->left;
 718                         else if (right->rightnode == LEAF)
 719                                 rightleaf = right->right;
 720                         else if (right->left)
 721                                 right = right->left;
 722                         else if (right->right)
 723                                 right = right->right;
 724                         else
 725                                 assert(0);
 726                 }
 727                 if (! tree->leaf_equal(leftleaf, rightleaf))
 728                         goto advance;
 729                 /*
 730                  * This node has identical singleton-only subtrees.
 731                  * Remove it.
 732                  */
 733                 parent = node->parent;
 734                 left = node->left;
 735                 right = node->right;
 736                 if (parent->left == node)
 737                         parent->left = left;
 738                 else if (parent->right == node)
 739                         parent->right = left;
 740                 else
 741                         assert(0);
 742                 left->parent = parent;
 743                 left->keymask |= (1 << node->bitnum);
 744                 node->left = NULL;
 745                 while (node) {
 746                         bitmask = 1 << node->bitnum;
 747                         leftmask &= ~bitmask;
 748                         rightmask &= ~bitmask;
 749                         if (node->leftnode == NODE && node->left) {
 750                                 left = node->left;
 751                                 free(node);
 752                                 count++;
 753                                 node = left;
 754                         } else if (node->rightnode == NODE && node->right) {
 755                                 right = node->right;
 756                                 free(node);
 757                                 count++;
 758                                 node = right;
 759                         } else {
 760                                 node = NULL;
 761                         }
 762                 }
 763                 /* Propagate keymasks up along singleton chains. */
 764                 node = parent;
 765                 /* Force re-check */
 766                 bitmask = 1 << node->bitnum;
 767                 leftmask &= ~bitmask;
 768                 rightmask &= ~bitmask;
 769                 for (;;) {
 770                         if (node->left && node->right)
 771                                 break;
 772                         if (node->left) {
 773                                 left = node->left;
 774                                 node->keymask |= left->keymask;
 775                                 node->keybits |= left->keybits;
 776                         }
 777                         if (node->right) {
 778                                 right = node->right;
 779                                 node->keymask |= right->keymask;
 780                                 node->keybits |= right->keybits;
 781                         }
 782                         node->keymask |= (1 << node->bitnum);
 783                         node = node->parent;
 784                         /* Force re-check */
 785                         bitmask = 1 << node->bitnum;
 786                         leftmask &= ~bitmask;
 787                         rightmask &= ~bitmask;
 788                 }
 789         advance:
 790                 bitmask = 1 << node->bitnum;
 791                 if ((leftmask & bitmask) == 0 &&
 792                     node->leftnode == NODE &&
 793                     node->left) {
 794                         leftmask |= bitmask;
 795                         node = node->left;
 796                 } else if ((rightmask & bitmask) == 0 &&
 797                            node->rightnode == NODE &&
 798                            node->right) {
 799                         rightmask |= bitmask;
 800                         node = node->right;
 801                 } else {
 802                         leftmask &= ~bitmask;
 803                         rightmask &= ~bitmask;
 804                         node = node->parent;
 805                 }
 806         }
 807         if (verbose > 0)
 808                 printf("Pruned %d nodes\n", count);
 809 }
 810
 811 /*
 812  * Mark the nodes in the tree that lead to leaves that must be
 813  * emitted.
 814  */
 815 static void mark_nodes(struct tree *tree)
 816 {
 817         struct node *node;
 818         struct node *n;
 819         unsigned int leftmask;
 820         unsigned int rightmask;
 821         unsigned int bitmask;
 822         int marked;
 823
 824         marked = 0;
 825         if (verbose > 0)
 826                 printf("Marking %s_%x\n", tree->type, tree->maxage);
 827         if (tree->childnode == LEAF)
 828                 goto done;
 829
 830         assert(tree->childnode == NODE);
 831         node = tree->root;
 832         leftmask = rightmask = 0;
 833         while (node) {
 834                 bitmask = 1 << node->bitnum;
 835                 if ((leftmask & bitmask) == 0) {
 836                         leftmask |= bitmask;
 837                         if (node->leftnode == LEAF) {
 838                                 assert(node->left);
 839                                 if (tree->leaf_mark(node->left)) {
 840                                         n = node;
 841                                         while (n && !n->mark) {
 842                                                 marked++;
 843                                                 n->mark = 1;
 844                                                 n = n->parent;
 845                                         }
 846                                 }
 847                         } else if (node->left) {
 848                                 assert(node->leftnode == NODE);
 849                                 node = node->left;
 850                                 continue;
 851                         }
 852                 }
 853                 if ((rightmask & bitmask) == 0) {
 854                         rightmask |= bitmask;
 855                         if (node->rightnode == LEAF) {
 856                                 assert(node->right);
 857                                 if (tree->leaf_mark(node->right)) {
 858                                         n = node;
 859                                         while (n && !n->mark) {
 860                                                 marked++;
 861                                                 n->mark = 1;
 862                                                 n = n->parent;
 863                                         }
 864                                 }
 865                         } else if (node->right) {
 866                                 assert(node->rightnode == NODE);
 867                                 node = node->right;
 868                                 continue;
 869                         }
 870                 }
 871                 leftmask &= ~bitmask;
 872                 rightmask &= ~bitmask;
 873                 node = node->parent;
 874         }
 875
 876         /* second pass: left siblings and singletons */
 877
 878         assert(tree->childnode == NODE);
 879         node = tree->root;
 880         leftmask = rightmask = 0;
 881         while (node) {
 882                 bitmask = 1 << node->bitnum;
 883                 if ((leftmask & bitmask) == 0) {
 884                         leftmask |= bitmask;
 885                         if (node->leftnode == LEAF) {
 886                                 assert(node->left);
 887                                 if (tree->leaf_mark(node->left)) {
 888                                         n = node;
 889                                         while (n && !n->mark) {
 890                                                 marked++;
 891                                                 n->mark = 1;
 892                                                 n = n->parent;
 893                                         }
 894                                 }
 895                         } else if (node->left) {
 896                                 assert(node->leftnode == NODE);
 897                                 node = node->left;
 898                                 if (!node->mark && node->parent->mark) {
 899                                         marked++;
 900                                         node->mark = 1;
 901                                 }
 902                                 continue;
 903                         }
 904                 }
 905                 if ((rightmask & bitmask) == 0) {
 906                         rightmask |= bitmask;
 907                         if (node->rightnode == LEAF) {
 908                                 assert(node->right);
 909                                 if (tree->leaf_mark(node->right)) {
 910                                         n = node;
 911                                         while (n && !n->mark) {
 912                                                 marked++;
 913                                                 n->mark = 1;
 914                                                 n = n->parent;
 915                                         }
 916                                 }
 917                         } else if (node->right) {
 918                                 assert(node->rightnode == NODE);
 919                                 node = node->right;
 920                                 if (!node->mark && node->parent->mark &&
 921                                     !node->parent->left) {
 922                                         marked++;
 923                                         node->mark = 1;
 924                                 }
 925                                 continue;
 926                         }
 927                 }
 928                 leftmask &= ~bitmask;
 929                 rightmask &= ~bitmask;
 930                 node = node->parent;
 931         }
 932 done:
 933         if (verbose > 0)
 934                 printf("Marked %d nodes\n", marked);
 935 }
 936
 937 /*
 938  * Compute the index of each node and leaf, which is the offset in the
 939  * emitted trie.  These values must be pre-computed because relative
 940  * offsets between nodes are used to navigate the tree.
 941  */
 942 static int index_nodes(struct tree *tree, int index)
 943 {
 944         struct node *node;
 945         unsigned int leftmask;
 946         unsigned int rightmask;
 947         unsigned int bitmask;
 948         int count;
 949         int indent;
 950
 951         /* Align to a cache line (or half a cache line?). */
 952         while (index % 64)
 953                 index++;
 954         tree->index = index;
 955         indent = 1;
 956         count = 0;
 957
 958         if (verbose > 0)
 959                 printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
 960         if (tree->childnode == LEAF) {
 961                 index += tree->leaf_size(tree->root);
 962                 goto done;
 963         }
 964
 965         assert(tree->childnode == NODE);
 966         node = tree->root;
 967         leftmask = rightmask = 0;
 968         while (node) {
 969                 if (!node->mark)
 970                         goto skip;
 971                 count++;
 972                 if (node->index != index)
 973                         node->index = index;
 974                 index += node->size;
 975 skip:
 976                 while (node) {
 977                         bitmask = 1 << node->bitnum;
 978                         if (node->mark && (leftmask & bitmask) == 0) {
 979                                 leftmask |= bitmask;
 980                                 if (node->leftnode == LEAF) {
 981                                         assert(node->left);
 982                                         *tree->leaf_index(tree, node->left) =
 983                                                                         index;
 984                                         index += tree->leaf_size(node->left);
 985                                         count++;
 986                                 } else if (node->left) {
 987                                         assert(node->leftnode == NODE);
 988                                         indent += 1;
 989                                         node = node->left;
 990                                         break;
 991                                 }
 992                         }
 993                         if (node->mark && (rightmask & bitmask) == 0) {
 994                                 rightmask |= bitmask;
 995                                 if (node->rightnode == LEAF) {
 996                                         assert(node->right);
 997                                         *tree->leaf_index(tree, node->right) = index;
 998                                         index += tree->leaf_size(node->right);
 999                                         count++;
1000                                 } else if (node->right) {
1001                                         assert(node->rightnode == NODE);
1002                                         indent += 1;
1003                                         node = node->right;
1004                                         break;
1005                                 }
1006                         }
1007                         leftmask &= ~bitmask;
1008                         rightmask &= ~bitmask;
1009                         node = node->parent;
1010                         indent -= 1;
1011                 }
1012         }
1013 done:
1014         /* Round up to a multiple of 16 */
1015         while (index % 16)
1016                 index++;
1017         if (verbose > 0)
1018                 printf("Final index %d\n", index);
1019         return index;
1020 }
1021
1022 /*
1023  * Mark the nodes in a subtree, helper for size_nodes().
1024  */
1025 static int mark_subtree(struct node *node)
1026 {
1027         int changed;
1028
1029         if (!node || node->mark)
1030                 return 0;
1031         node->mark = 1;
1032         node->index = node->parent->index;
1033         changed = 1;
1034         if (node->leftnode == NODE)
1035                 changed += mark_subtree(node->left);
1036         if (node->rightnode == NODE)
1037                 changed += mark_subtree(node->right);
1038         return changed;
1039 }
1040
1041 /*
1042  * Compute the size of nodes and leaves. We start by assuming that
1043  * each node needs to store a three-byte offset. The indexes of the
1044  * nodes are calculated based on that, and then this function is
1045  * called to see if the sizes of some nodes can be reduced.  This is
1046  * repeated until no more changes are seen.
1047  */
1048 static int size_nodes(struct tree *tree)
1049 {
1050         struct tree *next;
1051         struct node *node;
1052         struct node *right;
1053         struct node *n;
1054         unsigned int leftmask;
1055         unsigned int rightmask;
1056         unsigned int bitmask;
1057         unsigned int pathbits;
1058         unsigned int pathmask;
1059         unsigned int nbit;
1060         int changed;
1061         int offset;
1062         int size;
1063         int indent;
1064
1065         indent = 1;
1066         changed = 0;
1067         size = 0;
1068
1069         if (verbose > 0)
1070                 printf("Sizing %s_%x\n", tree->type, tree->maxage);
1071         if (tree->childnode == LEAF)
1072                 goto done;
1073
1074         assert(tree->childnode == NODE);
1075         pathbits = 0;
1076         pathmask = 0;
1077         node = tree->root;
1078         leftmask = rightmask = 0;
1079         while (node) {
1080                 if (!node->mark)
1081                         goto skip;
1082                 offset = 0;
1083                 if (!node->left || !node->right) {
1084                         size = 1;
1085                 } else {
1086                         if (node->rightnode == NODE) {
1087                                 /*
1088                                  * If the right node is not marked,
1089                                  * look for a corresponding node in
1090                                  * the next tree.  Such a node need
1091                                  * not exist.
1092                                  */
1093                                 right = node->right;
1094                                 next = tree->next;
1095                                 while (!right->mark) {
1096                                         assert(next);
1097                                         n = next->root;
1098                                         while (n->bitnum != node->bitnum) {
1099                                                 nbit = 1 << n->bitnum;
1100                                                 if (!(pathmask & nbit))
1101                                                         break;
1102                                                 if (pathbits & nbit) {
1103                                                         if (n->rightnode == LEAF)
1104                                                                 break;
1105                                                         n = n->right;
1106                                                 } else {
1107                                                         if (n->leftnode == LEAF)
1108                                                                 break;
1109                                                         n = n->left;
1110                                                 }
1111                                         }
1112                                         if (n->bitnum != node->bitnum)
1113                                                 break;
1114                                         n = n->right;
1115                                         right = n;
1116                                         next = next->next;
1117                                 }
1118                                 /* Make sure the right node is marked. */
1119                                 if (!right->mark)
1120                                         changed += mark_subtree(right);
1121                                 offset = right->index - node->index;
1122                         } else {
1123                                 offset = *tree->leaf_index(tree, node->right);
1124                                 offset -= node->index;
1125                         }
1126                         assert(offset >= 0);
1127                         assert(offset <= 0xffffff);
1128                         if (offset <= 0xff) {
1129                                 size = 2;
1130                         } else if (offset <= 0xffff) {
1131                                 size = 3;
1132                         } else { /* offset <= 0xffffff */
1133                                 size = 4;
1134                         }
1135                 }
1136                 if (node->size != size || node->offset != offset) {
1137                         node->size = size;
1138                         node->offset = offset;
1139                         changed++;
1140                 }
1141 skip:
1142                 while (node) {
1143                         bitmask = 1 << node->bitnum;
1144                         pathmask |= bitmask;
1145                         if (node->mark && (leftmask & bitmask) == 0) {
1146                                 leftmask |= bitmask;
1147                                 if (node->leftnode == LEAF) {
1148                                         assert(node->left);
1149                                 } else if (node->left) {
1150                                         assert(node->leftnode == NODE);
1151                                         indent += 1;
1152                                         node = node->left;
1153                                         break;
1154                                 }
1155                         }
1156                         if (node->mark && (rightmask & bitmask) == 0) {
1157                                 rightmask |= bitmask;
1158                                 pathbits |= bitmask;
1159                                 if (node->rightnode == LEAF) {
1160                                         assert(node->right);
1161                                 } else if (node->right) {
1162                                         assert(node->rightnode == NODE);
1163                                         indent += 1;
1164                                         node = node->right;
1165                                         break;
1166                                 }
1167                         }
1168                         leftmask &= ~bitmask;
1169                         rightmask &= ~bitmask;
1170                         pathmask &= ~bitmask;
1171                         pathbits &= ~bitmask;
1172                         node = node->parent;
1173                         indent -= 1;
1174                 }
1175         }
1176 done:
1177         if (verbose > 0)
1178                 printf("Found %d changes\n", changed);
1179         return changed;
1180 }
1181
1182 /*
1183  * Emit a trie for the given tree into the data array.
1184  */
1185 static void emit(struct tree *tree, unsigned char *data)
1186 {
1187         struct node *node;
1188         unsigned int leftmask;
1189         unsigned int rightmask;
1190         unsigned int bitmask;
1191         int offlen;
1192         int offset;
1193         int index;
1194         int indent;
1195         int size;
1196         int bytes;
1197         int leaves;
1198         int nodes[4];
1199         unsigned char byte;
1200
1201         nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
1202         leaves = 0;
1203         bytes = 0;
1204         index = tree->index;
1205         data += index;
1206         indent = 1;
1207         if (verbose > 0)
1208                 printf("Emitting %s_%x\n", tree->type, tree->maxage);
1209         if (tree->childnode == LEAF) {
1210                 assert(tree->root);
1211                 tree->leaf_emit(tree->root, data);
1212                 size = tree->leaf_size(tree->root);
1213                 index += size;
1214                 leaves++;
1215                 goto done;
1216         }
1217
1218         assert(tree->childnode == NODE);
1219         node = tree->root;
1220         leftmask = rightmask = 0;
1221         while (node) {
1222                 if (!node->mark)
1223                         goto skip;
1224                 assert(node->offset != -1);
1225                 assert(node->index == index);
1226
1227                 byte = 0;
1228                 if (node->nextbyte)
1229                         byte |= NEXTBYTE;
1230                 byte |= (node->bitnum & BITNUM);
1231                 if (node->left && node->right) {
1232                         if (node->leftnode == NODE)
1233                                 byte |= LEFTNODE;
1234                         if (node->rightnode == NODE)
1235                                 byte |= RIGHTNODE;
1236                         if (node->offset <= 0xff)
1237                                 offlen = 1;
1238                         else if (node->offset <= 0xffff)
1239                                 offlen = 2;
1240                         else
1241                                 offlen = 3;
1242                         nodes[offlen]++;
1243                         offset = node->offset;
1244                         byte |= offlen << OFFLEN_SHIFT;
1245                         *data++ = byte;
1246                         index++;
1247                         while (offlen--) {
1248                                 *data++ = offset & 0xff;
1249                                 index++;
1250                                 offset >>= 8;
1251                         }
1252                 } else if (node->left) {
1253                         if (node->leftnode == NODE)
1254                                 byte |= TRIENODE;
1255                         nodes[0]++;
1256                         *data++ = byte;
1257                         index++;
1258                 } else if (node->right) {
1259                         byte |= RIGHTNODE;
1260                         if (node->rightnode == NODE)
1261                                 byte |= TRIENODE;
1262                         nodes[0]++;
1263                         *data++ = byte;
1264                         index++;
1265                 } else {
1266                         assert(0);
1267                 }
1268 skip:
1269                 while (node) {
1270                         bitmask = 1 << node->bitnum;
1271                         if (node->mark && (leftmask & bitmask) == 0) {
1272                                 leftmask |= bitmask;
1273                                 if (node->leftnode == LEAF) {
1274                                         assert(node->left);
1275                                         data = tree->leaf_emit(node->left,
1276                                                                data);
1277                                         size = tree->leaf_size(node->left);
1278                                         index += size;
1279                                         bytes += size;
1280                                         leaves++;
1281                                 } else if (node->left) {
1282                                         assert(node->leftnode == NODE);
1283                                         indent += 1;
1284                                         node = node->left;
1285                                         break;
1286                                 }
1287                         }
1288                         if (node->mark && (rightmask & bitmask) == 0) {
1289                                 rightmask |= bitmask;
1290                                 if (node->rightnode == LEAF) {
1291                                         assert(node->right);
1292                                         data = tree->leaf_emit(node->right,
1293                                                                data);
1294                                         size = tree->leaf_size(node->right);
1295                                         index += size;
1296                                         bytes += size;
1297                                         leaves++;
1298                                 } else if (node->right) {
1299                                         assert(node->rightnode == NODE);
1300                                         indent += 1;
1301                                         node = node->right;
1302                                         break;
1303                                 }
1304                         }
1305                         leftmask &= ~bitmask;
1306                         rightmask &= ~bitmask;
1307                         node = node->parent;
1308                         indent -= 1;
1309                 }
1310         }
1311 done:
1312         if (verbose > 0) {
1313                 printf("Emitted %d (%d) leaves",
1314                         leaves, bytes);
1315                 printf(" %d (%d+%d+%d+%d) nodes",
1316                         nodes[0] + nodes[1] + nodes[2] + nodes[3],
1317                         nodes[0], nodes[1], nodes[2], nodes[3]);
1318                 printf(" %d total\n", index - tree->index);
1319         }
1320 }
1321
1322 /* ------------------------------------------------------------------ */
1323
/*
 * Unicode data.
 *
 * We need to keep track of the Canonical Combining Class, the Age,
 * and decompositions for a code point.
 *
 * For the Age, we store the index into the ages table.  Effectively
 * this is a generation number that the table maps to a unicode
 * version.
 *
 * The correction field is used to indicate that this entry is in the
 * corrections array, which contains decompositions that were
 * corrected in later revisions.  The value of the correction field is
 * the Unicode version in which the mapping was corrected.
 *
 * The utf32* members are zero-terminated arrays of code points and
 * may be NULL; utf8_create() turns them into the NUL-terminated
 * utf8* strings.
 */
struct unicode_data {
        /* Code point this entry describes. */
        unsigned int code;
        /* Canonical Combining Class. */
        int ccc;
        /* Age, as an index into the ages table. */
        int gen;
        /* Nonzero for an entry of the corrections array (see above). */
        int correction;
        /* nfdi mapping as zero-terminated code points, or NULL. */
        unsigned int *utf32nfdi;
        /* nfdicf mapping as zero-terminated code points, or NULL. */
        unsigned int *utf32nfdicf;
        /* UTF-8 form of utf32nfdi, or a string starting with HANGUL. */
        char *utf8nfdi;
        /* UTF-8 form of utf32nfdicf; only set when it differs from utf8nfdi. */
        char *utf8nfdicf;
};
1349
/* One entry per code point, indexed by the code point value. */
struct unicode_data unicode_data[0x110000];
/* Array of corrections_count corrected entries, searched by code. */
struct unicode_data *corrections;
int    corrections_count;

/* NOTE(review): presumably the trees currently being worked on - confirm. */
struct tree *nfdi_tree;
struct tree *nfdicf_tree;

/* NOTE(review): presumably an array of trees_count trees - confirm. */
struct tree *trees;
int          trees_count;
1359
1360 /*
1361  * Check the corrections array to see if this entry was corrected at
1362  * some point.
1363  */
1364 static struct unicode_data *corrections_lookup(struct unicode_data *u)
1365 {
1366         int i;
1367
1368         for (i = 0; i != corrections_count; i++)
1369                 if (u->code == corrections[i].code)
1370                         return &corrections[i];
1371         return u;
1372 }
1373
1374 static int nfdi_equal(void *l, void *r)
1375 {
1376         struct unicode_data *left = l;
1377         struct unicode_data *right = r;
1378
1379         if (left->gen != right->gen)
1380                 return 0;
1381         if (left->ccc != right->ccc)
1382                 return 0;
1383         if (left->utf8nfdi && right->utf8nfdi &&
1384             strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1385                 return 1;
1386         if (left->utf8nfdi || right->utf8nfdi)
1387                 return 0;
1388         return 1;
1389 }
1390
1391 static int nfdicf_equal(void *l, void *r)
1392 {
1393         struct unicode_data *left = l;
1394         struct unicode_data *right = r;
1395
1396         if (left->gen != right->gen)
1397                 return 0;
1398         if (left->ccc != right->ccc)
1399                 return 0;
1400         if (left->utf8nfdicf && right->utf8nfdicf &&
1401             strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
1402                 return 1;
1403         if (left->utf8nfdicf && right->utf8nfdicf)
1404                 return 0;
1405         if (left->utf8nfdicf || right->utf8nfdicf)
1406                 return 0;
1407         if (left->utf8nfdi && right->utf8nfdi &&
1408             strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1409                 return 1;
1410         if (left->utf8nfdi || right->utf8nfdi)
1411                 return 0;
1412         return 1;
1413 }
1414
1415 static void nfdi_print(void *l, int indent)
1416 {
1417         struct unicode_data *leaf = l;
1418
1419         printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1420                 leaf->code, leaf->ccc, leaf->gen);
1421
1422         if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1423                 printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1424         else if (leaf->utf8nfdi)
1425                 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1426
1427         printf("\n");
1428 }
1429
1430 static void nfdicf_print(void *l, int indent)
1431 {
1432         struct unicode_data *leaf = l;
1433
1434         printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1435                 leaf->code, leaf->ccc, leaf->gen);
1436
1437         if (leaf->utf8nfdicf)
1438                 printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
1439         else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1440                 printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1441         else if (leaf->utf8nfdi)
1442                 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1443         printf("\n");
1444 }
1445
/*
 * Leaf filter for the nfdi tree: every leaf is marked, so the
 * argument is not inspected.  Always returns 1.
 */
static int nfdi_mark(void *l)
{
        (void)l;

        return 1;
}
1450
1451 static int nfdicf_mark(void *l)
1452 {
1453         struct unicode_data *leaf = l;
1454
1455         if (leaf->utf8nfdicf)
1456                 return 1;
1457         return 0;
1458 }
1459
1460 static int correction_mark(void *l)
1461 {
1462         struct unicode_data *leaf = l;
1463
1464         return leaf->correction;
1465 }
1466
1467 static int nfdi_size(void *l)
1468 {
1469         struct unicode_data *leaf = l;
1470         int size = 2;
1471
1472         if (HANGUL_SYLLABLE(leaf->code))
1473                 size += 1;
1474         else if (leaf->utf8nfdi)
1475                 size += strlen(leaf->utf8nfdi) + 1;
1476         return size;
1477 }
1478
1479 static int nfdicf_size(void *l)
1480 {
1481         struct unicode_data *leaf = l;
1482         int size = 2;
1483
1484         if (HANGUL_SYLLABLE(leaf->code))
1485                 size += 1;
1486         else if (leaf->utf8nfdicf)
1487                 size += strlen(leaf->utf8nfdicf) + 1;
1488         else if (leaf->utf8nfdi)
1489                 size += strlen(leaf->utf8nfdi) + 1;
1490         return size;
1491 }
1492
1493 static int *nfdi_index(struct tree *tree, void *l)
1494 {
1495         struct unicode_data *leaf = l;
1496
1497         return &tree->leafindex[leaf->code];
1498 }
1499
1500 static int *nfdicf_index(struct tree *tree, void *l)
1501 {
1502         struct unicode_data *leaf = l;
1503
1504         return &tree->leafindex[leaf->code];
1505 }
1506
1507 static unsigned char *nfdi_emit(void *l, unsigned char *data)
1508 {
1509         struct unicode_data *leaf = l;
1510         unsigned char *s;
1511
1512         *data++ = leaf->gen;
1513
1514         if (HANGUL_SYLLABLE(leaf->code)) {
1515                 *data++ = DECOMPOSE;
1516                 *data++ = HANGUL;
1517         } else if (leaf->utf8nfdi) {
1518                 *data++ = DECOMPOSE;
1519                 s = (unsigned char*)leaf->utf8nfdi;
1520                 while ((*data++ = *s++) != 0)
1521                         ;
1522         } else {
1523                 *data++ = leaf->ccc;
1524         }
1525         return data;
1526 }
1527
1528 static unsigned char *nfdicf_emit(void *l, unsigned char *data)
1529 {
1530         struct unicode_data *leaf = l;
1531         unsigned char *s;
1532
1533         *data++ = leaf->gen;
1534
1535         if (HANGUL_SYLLABLE(leaf->code)) {
1536                 *data++ = DECOMPOSE;
1537                 *data++ = HANGUL;
1538         } else if (leaf->utf8nfdicf) {
1539                 *data++ = DECOMPOSE;
1540                 s = (unsigned char*)leaf->utf8nfdicf;
1541                 while ((*data++ = *s++) != 0)
1542                         ;
1543         } else if (leaf->utf8nfdi) {
1544                 *data++ = DECOMPOSE;
1545                 s = (unsigned char*)leaf->utf8nfdi;
1546                 while ((*data++ = *s++) != 0)
1547                         ;
1548         } else {
1549                 *data++ = leaf->ccc;
1550         }
1551         return data;
1552 }
1553
1554 static void utf8_create(struct unicode_data *data)
1555 {
1556         char utf[18*4+1];
1557         char *u;
1558         unsigned int *um;
1559         int i;
1560
1561         if (data->utf8nfdi) {
1562                 assert(data->utf8nfdi[0] == HANGUL);
1563                 return;
1564         }
1565
1566         u = utf;
1567         um = data->utf32nfdi;
1568         if (um) {
1569                 for (i = 0; um[i]; i++)
1570                         u += utf8encode(u, um[i]);
1571                 *u = '\0';
1572                 data->utf8nfdi = strdup(utf);
1573         }
1574         u = utf;
1575         um = data->utf32nfdicf;
1576         if (um) {
1577                 for (i = 0; um[i]; i++)
1578                         u += utf8encode(u, um[i]);
1579                 *u = '\0';
1580                 if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
1581                         data->utf8nfdicf = strdup(utf);
1582         }
1583 }
1584
1585 static void utf8_init(void)
1586 {
1587         unsigned int unichar;
1588         int i;
1589
1590         for (unichar = 0; unichar != 0x110000; unichar++)
1591                 utf8_create(&unicode_data[unichar]);
1592
1593         for (i = 0; i != corrections_count; i++)
1594                 utf8_create(&corrections[i]);
1595 }
1596
1597 static void trees_init(void)
1598 {
1599         struct unicode_data *data;
1600         unsigned int maxage;
1601         unsigned int nextage;
1602         int count;
1603         int i;
1604         int j;
1605
1606         /* Count the number of different ages. */
1607         count = 0;
1608         nextage = (unsigned int)-1;
1609         do {
1610                 maxage = nextage;
1611                 nextage = 0;
1612                 for (i = 0; i <= corrections_count; i++) {
1613                         data = &corrections[i];
1614                         if (nextage < data->correction &&
1615                             data->correction < maxage)
1616                                 nextage = data->correction;
1617                 }
1618                 count++;
1619         } while (nextage);
1620
1621         /* Two trees per age: nfdi and nfdicf */
1622         trees_count = count * 2;
1623         trees = calloc(trees_count, sizeof(struct tree));
1624
1625         /* Assign ages to the trees. */
1626         count = trees_count;
1627         nextage = (unsigned int)-1;
1628         do {
1629                 maxage = nextage;
1630                 trees[--count].maxage = maxage;
1631                 trees[--count].maxage = maxage;
1632                 nextage = 0;
1633                 for (i = 0; i <= corrections_count; i++) {
1634                         data = &corrections[i];
1635                         if (nextage < data->correction &&
1636                             data->correction < maxage)
1637                                 nextage = data->correction;
1638                 }
1639         } while (nextage);
1640
1641         /* The ages assigned above are off by one. */
1642         for (i = 0; i != trees_count; i++) {
1643                 j = 0;
1644                 while (ages[j] < trees[i].maxage)
1645                         j++;
1646                 trees[i].maxage = ages[j-1];
1647         }
1648
1649         /* Set up the forwarding between trees. */
1650         trees[trees_count-2].next = &trees[trees_count-1];
1651         trees[trees_count-1].leaf_mark = nfdi_mark;
1652         trees[trees_count-2].leaf_mark = nfdicf_mark;
1653         for (i = 0; i != trees_count-2; i += 2) {
1654                 trees[i].next = &trees[trees_count-2];
1655                 trees[i].leaf_mark = correction_mark;
1656                 trees[i+1].next = &trees[trees_count-1];
1657                 trees[i+1].leaf_mark = correction_mark;
1658         }
1659
1660         /* Assign the callouts. */
1661         for (i = 0; i != trees_count; i += 2) {
1662                 trees[i].type = "nfdicf";
1663                 trees[i].leaf_equal = nfdicf_equal;
1664                 trees[i].leaf_print = nfdicf_print;
1665                 trees[i].leaf_size = nfdicf_size;
1666                 trees[i].leaf_index = nfdicf_index;
1667                 trees[i].leaf_emit = nfdicf_emit;
1668
1669                 trees[i+1].type = "nfdi";
1670                 trees[i+1].leaf_equal = nfdi_equal;
1671                 trees[i+1].leaf_print = nfdi_print;
1672                 trees[i+1].leaf_size = nfdi_size;
1673                 trees[i+1].leaf_index = nfdi_index;
1674                 trees[i+1].leaf_emit = nfdi_emit;
1675         }
1676
1677         /* Finish init. */
1678         for (i = 0; i != trees_count; i++)
1679                 trees[i].childnode = NODE;
1680 }
1681
1682 static void trees_populate(void)
1683 {
1684         struct unicode_data *data;
1685         unsigned int unichar;
1686         char keyval[4];
1687         int keylen;
1688         int i;
1689
1690         for (i = 0; i != trees_count; i++) {
1691                 if (verbose > 0) {
1692                         printf("Populating %s_%x\n",
1693                                 trees[i].type, trees[i].maxage);
1694                 }
1695                 for (unichar = 0; unichar != 0x110000; unichar++) {
1696                         if (unicode_data[unichar].gen < 0)
1697                                 continue;
1698                         keylen = utf8encode(keyval, unichar);
1699                         data = corrections_lookup(&unicode_data[unichar]);
1700                         if (data->correction <= trees[i].maxage)
1701                                 data = &unicode_data[unichar];
1702                         insert(&trees[i], keyval, keylen, data);
1703                 }
1704         }
1705 }
1706
1707 static void trees_reduce(void)
1708 {
1709         int i;
1710         int size;
1711         int changed;
1712
1713         for (i = 0; i != trees_count; i++)
1714                 prune(&trees[i]);
1715         for (i = 0; i != trees_count; i++)
1716                 mark_nodes(&trees[i]);
1717         do {
1718                 size = 0;
1719                 for (i = 0; i != trees_count; i++)
1720                         size = index_nodes(&trees[i], size);
1721                 changed = 0;
1722                 for (i = 0; i != trees_count; i++)
1723                         changed += size_nodes(&trees[i]);
1724         } while (changed);
1725
1726         utf8data = calloc(size, 1);
1727         utf8data_size = size;
1728         for (i = 0; i != trees_count; i++)
1729                 emit(&trees[i], utf8data);
1730
1731         if (verbose > 0) {
1732                 for (i = 0; i != trees_count; i++) {
1733                         printf("%s_%x idx %d\n",
1734                                 trees[i].type, trees[i].maxage, trees[i].index);
1735                 }
1736         }
1737
1738         nfdi = utf8data + trees[trees_count-1].index;
1739         nfdicf = utf8data + trees[trees_count-2].index;
1740
1741         nfdi_tree = &trees[trees_count-1];
1742         nfdicf_tree = &trees[trees_count-2];
1743 }
1744
1745 static void verify(struct tree *tree)
1746 {
1747         struct unicode_data *data;
1748         utf8leaf_t      *leaf;
1749         unsigned int    unichar;
1750         char            key[4];
1751         unsigned char   hangul[UTF8HANGULLEAF];
1752         int             report;
1753         int             nocf;
1754
1755         if (verbose > 0)
1756                 printf("Verifying %s_%x\n", tree->type, tree->maxage);
1757         nocf = strcmp(tree->type, "nfdicf");
1758
1759         for (unichar = 0; unichar != 0x110000; unichar++) {
1760                 report = 0;
1761                 data = corrections_lookup(&unicode_data[unichar]);
1762                 if (data->correction <= tree->maxage)
1763                         data = &unicode_data[unichar];
1764                 utf8encode(key,unichar);
1765                 leaf = utf8lookup(tree, hangul, key);
1766
1767                 if (!leaf) {
1768                         if (data->gen != -1)
1769                                 report++;
1770                         if (unichar < 0xd800 || unichar > 0xdfff)
1771                                 report++;
1772                 } else {
1773                         if (unichar >= 0xd800 && unichar <= 0xdfff)
1774                                 report++;
1775                         if (data->gen == -1)
1776                                 report++;
1777                         if (data->gen != LEAF_GEN(leaf))
1778                                 report++;
1779                         if (LEAF_CCC(leaf) == DECOMPOSE) {
1780                                 if (HANGUL_SYLLABLE(data->code)) {
1781                                         if (data->utf8nfdi[0] != HANGUL)
1782                                                 report++;
1783                                 } else if (nocf) {
1784                                         if (!data->utf8nfdi) {
1785                                                 report++;
1786                                         } else if (strcmp(data->utf8nfdi,
1787                                                           LEAF_STR(leaf))) {
1788                                                 report++;
1789                                         }
1790                                 } else {
1791                                         if (!data->utf8nfdicf &&
1792                                             !data->utf8nfdi) {
1793                                                 report++;
1794                                         } else if (data->utf8nfdicf) {
1795                                                 if (strcmp(data->utf8nfdicf,
1796                                                            LEAF_STR(leaf)))
1797                                                         report++;
1798                                         } else if (strcmp(data->utf8nfdi,
1799                                                           LEAF_STR(leaf))) {
1800                                                 report++;
1801                                         }
1802                                 }
1803                         } else if (data->ccc != LEAF_CCC(leaf)) {
1804                                 report++;
1805                         }
1806                 }
1807                 if (report) {
1808                         printf("%X code %X gen %d ccc %d"
1809                                 " nfdi -> \"%s\"",
1810                                 unichar, data->code, data->gen,
1811                                 data->ccc,
1812                                 data->utf8nfdi);
1813                         if (leaf) {
1814                                 printf(" gen %d ccc %d"
1815                                         " nfdi -> \"%s\"",
1816                                         LEAF_GEN(leaf),
1817                                         LEAF_CCC(leaf),
1818                                         LEAF_CCC(leaf) == DECOMPOSE ?
1819                                                 LEAF_STR(leaf) : "");
1820                         }
1821                         printf("\n");
1822                 }
1823         }
1824 }
1825
1826 static void trees_verify(void)
1827 {
1828         int i;
1829
1830         for (i = 0; i != trees_count; i++)
1831                 verify(&trees[i]);
1832 }
1833
1834 /* ------------------------------------------------------------------ */
1835
1836 static void help(void)
1837 {
1838         printf("Usage: %s [options]\n", argv0);
1839         printf("\n");
1840         printf("This program creates an a data trie used for parsing and\n");
1841         printf("normalization of UTF-8 strings. The trie is derived from\n");
1842         printf("a set of input files from the Unicode character database\n");
1843         printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1844         printf("\n");
1845         printf("The generated tree supports two normalization forms:\n");
1846         printf("\n");
1847         printf("\tnfdi:\n");
1848         printf("\t- Apply unicode normalization form NFD.\n");
1849         printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1850         printf("\n");
1851         printf("\tnfdicf:\n");
1852         printf("\t- Apply unicode normalization form NFD.\n");
1853         printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1854         printf("\t- Apply a full casefold (C + F).\n");
1855         printf("\n");
1856         printf("These forms were chosen as being most useful when dealing\n");
1857         printf("with file names: NFD catches most cases where characters\n");
1858         printf("should be considered equivalent. The ignorables are mostly\n");
1859         printf("invisible, making names hard to type.\n");
1860         printf("\n");
1861         printf("The options to specify the files to be used are listed\n");
1862         printf("below with their default values, which are the names used\n");
1863         printf("by version 11.0.0 of the Unicode Character Database.\n");
1864         printf("\n");
1865         printf("The input files:\n");
1866         printf("\t-a %s\n", AGE_NAME);
1867         printf("\t-c %s\n", CCC_NAME);
1868         printf("\t-p %s\n", PROP_NAME);
1869         printf("\t-d %s\n", DATA_NAME);
1870         printf("\t-f %s\n", FOLD_NAME);
1871         printf("\t-n %s\n", NORM_NAME);
1872         printf("\n");
1873         printf("Additionally, the generated tables are tested using:\n");
1874         printf("\t-t %s\n", TEST_NAME);
1875         printf("\n");
1876         printf("Finally, the output file:\n");
1877         printf("\t-o %s\n", UTF8_NAME);
1878         printf("\n");
1879 }
1880
/* Print the help text, then terminate the program with exit status 1. */
static void usage(void)
{
        help();
        exit(1);
}
1886
/*
 * Report that file `name` could not be opened, showing the errno value
 * `error` and its strerror() text, then exit with status 1.
 */
static void open_fail(const char *name, int error)
{
        const char *reason = strerror(error);

        printf("Error %d opening %s: %s\n", error, name, reason);
        exit(1);
}
1892
/* Report a parse failure for a whole file, then exit with status 1. */
static void file_fail(const char *filename)
{
        printf("Error parsing %s\n", filename);
        exit(1);
}
1898
/*
 * Report a parse failure for one input line, echoing the line after the
 * file name, then exit with status 1.
 */
static void line_fail(const char *filename, const char *line)
{
        printf("Error parsing %s:%s\n", filename, line);
        exit(1);
}
1904
1905 /* ------------------------------------------------------------------ */
1906
/*
 * Print each element of a zero-terminated UTF-32 string as " %X"
 * (space, upper-case hex).  Nothing is printed for the terminator.
 */
static void print_utf32(unsigned int *utf32str)
{
        unsigned int *cp;

        for (cp = utf32str; *cp != 0; cp++)
                printf(" %X", *cp);
}
1914
1915 static void print_utf32nfdi(unsigned int unichar)
1916 {
1917         printf(" %X ->", unichar);
1918         print_utf32(unicode_data[unichar].utf32nfdi);
1919         printf("\n");
1920 }
1921
1922 static void print_utf32nfdicf(unsigned int unichar)
1923 {
1924         printf(" %X ->", unichar);
1925         print_utf32(unicode_data[unichar].utf32nfdicf);
1926         printf("\n");
1927 }
1928
1929 /* ------------------------------------------------------------------ */
1930
1931 static void age_init(void)
1932 {
1933         FILE *file;
1934         unsigned int first;
1935         unsigned int last;
1936         unsigned int unichar;
1937         unsigned int major;
1938         unsigned int minor;
1939         unsigned int revision;
1940         int gen;
1941         int count;
1942         int ret;
1943
1944         if (verbose > 0)
1945                 printf("Parsing %s\n", age_name);
1946
1947         file = fopen(age_name, "r");
1948         if (!file)
1949                 open_fail(age_name, errno);
1950         count = 0;
1951
1952         gen = 0;
1953         while (fgets(line, LINESIZE, file)) {
1954                 ret = sscanf(line, "# Age=V%d_%d_%d",
1955                                 &major, &minor, &revision);
1956                 if (ret == 3) {
1957                         ages_count++;
1958                         if (verbose > 1)
1959                                 printf(" Age V%d_%d_%d\n",
1960                                         major, minor, revision);
1961                         if (!age_valid(major, minor, revision))
1962                                 line_fail(age_name, line);
1963                         continue;
1964                 }
1965                 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1966                 if (ret == 2) {
1967                         ages_count++;
1968                         if (verbose > 1)
1969                                 printf(" Age V%d_%d\n", major, minor);
1970                         if (!age_valid(major, minor, 0))
1971                                 line_fail(age_name, line);
1972                         continue;
1973                 }
1974         }
1975
1976         /* We must have found something above. */
1977         if (verbose > 1)
1978                 printf("%d age entries\n", ages_count);
1979         if (ages_count == 0 || ages_count > MAXGEN)
1980                 file_fail(age_name);
1981
1982         /* There is a 0 entry. */
1983         ages_count++;
1984         ages = calloc(ages_count + 1, sizeof(*ages));
1985         /* And a guard entry. */
1986         ages[ages_count] = (unsigned int)-1;
1987
1988         rewind(file);
1989         count = 0;
1990         gen = 0;
1991         while (fgets(line, LINESIZE, file)) {
1992                 ret = sscanf(line, "# Age=V%d_%d_%d",
1993                                 &major, &minor, &revision);
1994                 if (ret == 3) {
1995                         ages[++gen] =
1996                                 UNICODE_AGE(major, minor, revision);
1997                         if (verbose > 1)
1998                                 printf(" Age V%d_%d_%d = gen %d\n",
1999                                         major, minor, revision, gen);
2000                         if (!age_valid(major, minor, revision))
2001                                 line_fail(age_name, line);
2002                         continue;
2003                 }
2004                 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
2005                 if (ret == 2) {
2006                         ages[++gen] = UNICODE_AGE(major, minor, 0);
2007                         if (verbose > 1)
2008                                 printf(" Age V%d_%d = %d\n",
2009                                         major, minor, gen);
2010                         if (!age_valid(major, minor, 0))
2011                                 line_fail(age_name, line);
2012                         continue;
2013                 }
2014                 ret = sscanf(line, "%X..%X ; %d.%d #",
2015                              &first, &last, &major, &minor);
2016                 if (ret == 4) {
2017                         for (unichar = first; unichar <= last; unichar++)
2018                                 unicode_data[unichar].gen = gen;
2019                         count += 1 + last - first;
2020                         if (verbose > 1)
2021                                 printf("  %X..%X gen %d\n", first, last, gen);
2022                         if (!utf32valid(first) || !utf32valid(last))
2023                                 line_fail(age_name, line);
2024                         continue;
2025                 }
2026                 ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2027                 if (ret == 3) {
2028                         unicode_data[unichar].gen = gen;
2029                         count++;
2030                         if (verbose > 1)
2031                                 printf("  %X gen %d\n", unichar, gen);
2032                         if (!utf32valid(unichar))
2033                                 line_fail(age_name, line);
2034                         continue;
2035                 }
2036         }
2037         unicode_maxage = ages[gen];
2038         fclose(file);
2039
2040         /* Nix surrogate block */
2041         if (verbose > 1)
2042                 printf(" Removing surrogate block D800..DFFF\n");
2043         for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
2044                 unicode_data[unichar].gen = -1;
2045
2046         if (verbose > 0)
2047                 printf("Found %d entries\n", count);
2048         if (count == 0)
2049                 file_fail(age_name);
2050 }
2051
2052 static void ccc_init(void)
2053 {
2054         FILE *file;
2055         unsigned int first;
2056         unsigned int last;
2057         unsigned int unichar;
2058         unsigned int value;
2059         int count;
2060         int ret;
2061
2062         if (verbose > 0)
2063                 printf("Parsing %s\n", ccc_name);
2064
2065         file = fopen(ccc_name, "r");
2066         if (!file)
2067                 open_fail(ccc_name, errno);
2068
2069         count = 0;
2070         while (fgets(line, LINESIZE, file)) {
2071                 ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2072                 if (ret == 3) {
2073                         for (unichar = first; unichar <= last; unichar++) {
2074                                 unicode_data[unichar].ccc = value;
2075                                 count++;
2076                         }
2077                         if (verbose > 1)
2078                                 printf(" %X..%X ccc %d\n", first, last, value);
2079                         if (!utf32valid(first) || !utf32valid(last))
2080                                 line_fail(ccc_name, line);
2081                         continue;
2082                 }
2083                 ret = sscanf(line, "%X ; %d #", &unichar, &value);
2084                 if (ret == 2) {
2085                         unicode_data[unichar].ccc = value;
2086                         count++;
2087                         if (verbose > 1)
2088                                 printf(" %X ccc %d\n", unichar, value);
2089                         if (!utf32valid(unichar))
2090                                 line_fail(ccc_name, line);
2091                         continue;
2092                 }
2093         }
2094         fclose(file);
2095
2096         if (verbose > 0)
2097                 printf("Found %d entries\n", count);
2098         if (count == 0)
2099                 file_fail(ccc_name);
2100 }
2101
/*
 * Tell whether a decomposition tag names one of the formatting types
 * whose decompositions are skipped.
 *
 * type: the tag text without the surrounding '<' and '>'.
 * Returns 1 on an exact, case-sensitive match, 0 otherwise.
 */
static int ignore_compatibility_form(char *type)
{
        /* Read-only table, built once rather than on every call. */
        static const char *const ignored_types[] = {
                "font", "noBreak", "initial", "medial",
                "final", "isolated", "circle", "super",
                "sub", "vertical", "wide", "narrow",
                "small", "square", "fraction", "compat"
        };
        size_t i;

        for (i = 0; i < sizeof(ignored_types) / sizeof(ignored_types[0]); i++)
                if (strcmp(type, ignored_types[i]) == 0)
                        return 1;
        return 0;
}
2115
2116 static void nfdi_init(void)
2117 {
2118         FILE *file;
2119         unsigned int unichar;
2120         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2121         char *s;
2122         char *type;
2123         unsigned int *um;
2124         int count;
2125         int i;
2126         int ret;
2127
2128         if (verbose > 0)
2129                 printf("Parsing %s\n", data_name);
2130         file = fopen(data_name, "r");
2131         if (!file)
2132                 open_fail(data_name, errno);
2133
2134         count = 0;
2135         while (fgets(line, LINESIZE, file)) {
2136                 ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2137                              &unichar, buf0);
2138                 if (ret != 2)
2139                         continue;
2140                 if (!utf32valid(unichar))
2141                         line_fail(data_name, line);
2142
2143                 s = buf0;
2144                 /* skip over <tag> */
2145                 if (*s == '<') {
2146                         type = ++s;
2147                         while (*++s != '>');
2148                         *s++ = '\0';
2149                         if(ignore_compatibility_form(type))
2150                                 continue;
2151                 }
2152                 /* decode the decomposition into UTF-32 */
2153                 i = 0;
2154                 while (*s) {
2155                         mapping[i] = strtoul(s, &s, 16);
2156                         if (!utf32valid(mapping[i]))
2157                                 line_fail(data_name, line);
2158                         i++;
2159                 }
2160                 mapping[i++] = 0;
2161
2162                 um = malloc(i * sizeof(unsigned int));
2163                 memcpy(um, mapping, i * sizeof(unsigned int));
2164                 unicode_data[unichar].utf32nfdi = um;
2165
2166                 if (verbose > 1)
2167                         print_utf32nfdi(unichar);
2168                 count++;
2169         }
2170         fclose(file);
2171         if (verbose > 0)
2172                 printf("Found %d entries\n", count);
2173         if (count == 0)
2174                 file_fail(data_name);
2175 }
2176
2177 static void nfdicf_init(void)
2178 {
2179         FILE *file;
2180         unsigned int unichar;
2181         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2182         char status;
2183         char *s;
2184         unsigned int *um;
2185         int i;
2186         int count;
2187         int ret;
2188
2189         if (verbose > 0)
2190                 printf("Parsing %s\n", fold_name);
2191         file = fopen(fold_name, "r");
2192         if (!file)
2193                 open_fail(fold_name, errno);
2194
2195         count = 0;
2196         while (fgets(line, LINESIZE, file)) {
2197                 ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2198                 if (ret != 3)
2199                         continue;
2200                 if (!utf32valid(unichar))
2201                         line_fail(fold_name, line);
2202                 /* Use the C+F casefold. */
2203                 if (status != 'C' && status != 'F')
2204                         continue;
2205                 s = buf0;
2206                 if (*s == '<')
2207                         while (*s++ != ' ')
2208                                 ;
2209                 i = 0;
2210                 while (*s) {
2211                         mapping[i] = strtoul(s, &s, 16);
2212                         if (!utf32valid(mapping[i]))
2213                                 line_fail(fold_name, line);
2214                         i++;
2215                 }
2216                 mapping[i++] = 0;
2217
2218                 um = malloc(i * sizeof(unsigned int));
2219                 memcpy(um, mapping, i * sizeof(unsigned int));
2220                 unicode_data[unichar].utf32nfdicf = um;
2221
2222                 if (verbose > 1)
2223                         print_utf32nfdicf(unichar);
2224                 count++;
2225         }
2226         fclose(file);
2227         if (verbose > 0)
2228                 printf("Found %d entries\n", count);
2229         if (count == 0)
2230                 file_fail(fold_name);
2231 }
2232
2233 static void ignore_init(void)
2234 {
2235         FILE *file;
2236         unsigned int unichar;
2237         unsigned int first;
2238         unsigned int last;
2239         unsigned int *um;
2240         int count;
2241         int ret;
2242
2243         if (verbose > 0)
2244                 printf("Parsing %s\n", prop_name);
2245         file = fopen(prop_name, "r");
2246         if (!file)
2247                 open_fail(prop_name, errno);
2248         assert(file);
2249         count = 0;
2250         while (fgets(line, LINESIZE, file)) {
2251                 ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2252                 if (ret == 3) {
2253                         if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2254                                 continue;
2255                         if (!utf32valid(first) || !utf32valid(last))
2256                                 line_fail(prop_name, line);
2257                         for (unichar = first; unichar <= last; unichar++) {
2258                                 free(unicode_data[unichar].utf32nfdi);
2259                                 um = malloc(sizeof(unsigned int));
2260                                 *um = 0;
2261                                 unicode_data[unichar].utf32nfdi = um;
2262                                 free(unicode_data[unichar].utf32nfdicf);
2263                                 um = malloc(sizeof(unsigned int));
2264                                 *um = 0;
2265                                 unicode_data[unichar].utf32nfdicf = um;
2266                                 count++;
2267                         }
2268                         if (verbose > 1)
2269                                 printf(" %X..%X Default_Ignorable_Code_Point\n",
2270                                         first, last);
2271                         continue;
2272                 }
2273                 ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2274                 if (ret == 2) {
2275                         if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2276                                 continue;
2277                         if (!utf32valid(unichar))
2278                                 line_fail(prop_name, line);
2279                         free(unicode_data[unichar].utf32nfdi);
2280                         um = malloc(sizeof(unsigned int));
2281                         *um = 0;
2282                         unicode_data[unichar].utf32nfdi = um;
2283                         free(unicode_data[unichar].utf32nfdicf);
2284                         um = malloc(sizeof(unsigned int));
2285                         *um = 0;
2286                         unicode_data[unichar].utf32nfdicf = um;
2287                         if (verbose > 1)
2288                                 printf(" %X Default_Ignorable_Code_Point\n",
2289                                         unichar);
2290                         count++;
2291                         continue;
2292                 }
2293         }
2294         fclose(file);
2295
2296         if (verbose > 0)
2297                 printf("Found %d entries\n", count);
2298         if (count == 0)
2299                 file_fail(prop_name);
2300 }
2301
2302 static void corrections_init(void)
2303 {
2304         FILE *file;
2305         unsigned int unichar;
2306         unsigned int major;
2307         unsigned int minor;
2308         unsigned int revision;
2309         unsigned int age;
2310         unsigned int *um;
2311         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2312         char *s;
2313         int i;
2314         int count;
2315         int ret;
2316
2317         if (verbose > 0)
2318                 printf("Parsing %s\n", norm_name);
2319         file = fopen(norm_name, "r");
2320         if (!file)
2321                 open_fail(norm_name, errno);
2322
2323         count = 0;
2324         while (fgets(line, LINESIZE, file)) {
2325                 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2326                                 &unichar, buf0, buf1,
2327                                 &major, &minor, &revision);
2328                 if (ret != 6)
2329                         continue;
2330                 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2331                         line_fail(norm_name, line);
2332                 count++;
2333         }
2334         corrections = calloc(count, sizeof(struct unicode_data));
2335         corrections_count = count;
2336         rewind(file);
2337
2338         count = 0;
2339         while (fgets(line, LINESIZE, file)) {
2340                 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2341                                 &unichar, buf0, buf1,
2342                                 &major, &minor, &revision);
2343                 if (ret != 6)
2344                         continue;
2345                 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2346                         line_fail(norm_name, line);
2347                 corrections[count] = unicode_data[unichar];
2348                 assert(corrections[count].code == unichar);
2349                 age = UNICODE_AGE(major, minor, revision);
2350                 corrections[count].correction = age;
2351
2352                 i = 0;
2353                 s = buf0;
2354                 while (*s) {
2355                         mapping[i] = strtoul(s, &s, 16);
2356                         if (!utf32valid(mapping[i]))
2357                                 line_fail(norm_name, line);
2358                         i++;
2359                 }
2360                 mapping[i++] = 0;
2361
2362                 um = malloc(i * sizeof(unsigned int));
2363                 memcpy(um, mapping, i * sizeof(unsigned int));
2364                 corrections[count].utf32nfdi = um;
2365
2366                 if (verbose > 1)
2367                         printf(" %X -> %s -> %s V%d_%d_%d\n",
2368                                 unichar, buf0, buf1, major, minor, revision);
2369                 count++;
2370         }
2371         fclose(file);
2372
2373         if (verbose > 0)
2374                 printf("Found %d entries\n", count);
2375         if (count == 0)
2376                 file_fail(norm_name);
2377 }
2378
2379 /* ------------------------------------------------------------------ */
2380
2381 /*
2382  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2383  *
2384  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2385  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2386  *
2387  * SBase = 0xAC00
2388  * LBase = 0x1100
2389  * VBase = 0x1161
2390  * TBase = 0x11A7
2391  * LCount = 19
2392  * VCount = 21
2393  * TCount = 28
2394  * NCount = 588 (VCount * TCount)
2395  * SCount = 11172 (LCount * NCount)
2396  *
2397  * Decomposition:
2398  *   SIndex = s - SBase
2399  *
2400  * LV (Canonical/Full)
2401  *   LIndex = SIndex / NCount
2402  *   VIndex = (SIndex % NCount) / TCount
2403  *   LPart = LBase + LIndex
2404  *   VPart = VBase + VIndex
2405  *
2406  * LVT (Canonical)
2407  *   LVIndex = (SIndex / TCount) * TCount
2408  *   TIndex = (SIndex % TCount)
2409  *   LVPart = SBase + LVIndex
2410  *   TPart = TBase + TIndex
2411  *
2412  * LVT (Full)
2413  *   LIndex = SIndex / NCount
2414  *   VIndex = (SIndex % NCount) / TCount
2415  *   TIndex = (SIndex % TCount)
2416  *   LPart = LBase + LIndex
2417  *   VPart = VBase + VIndex
2418  *   if (TIndex == 0) {
2419  *          d = <LPart, VPart>
2420  *   } else {
2421  *          TPart = TBase + TIndex
2422  *          d = <LPart, VPart, TPart>
2423  *   }
2424  *
2425  */
2426
2427 static void hangul_decompose(void)
2428 {
2429         unsigned int sb = 0xAC00;
2430         unsigned int lb = 0x1100;
2431         unsigned int vb = 0x1161;
2432         unsigned int tb = 0x11a7;
2433         /* unsigned int lc = 19; */
2434         unsigned int vc = 21;
2435         unsigned int tc = 28;
2436         unsigned int nc = (vc * tc);
2437         /* unsigned int sc = (lc * nc); */
2438         unsigned int unichar;
2439         unsigned int mapping[4];
2440         unsigned int *um;
2441         int count;
2442         int i;
2443
2444         if (verbose > 0)
2445                 printf("Decomposing hangul\n");
2446         /* Hangul */
2447         count = 0;
2448         for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2449                 unsigned int si = unichar - sb;
2450                 unsigned int li = si / nc;
2451                 unsigned int vi = (si % nc) / tc;
2452                 unsigned int ti = si % tc;
2453
2454                 i = 0;
2455                 mapping[i++] = lb + li;
2456                 mapping[i++] = vb + vi;
2457                 if (ti)
2458                         mapping[i++] = tb + ti;
2459                 mapping[i++] = 0;
2460
2461                 assert(!unicode_data[unichar].utf32nfdi);
2462                 um = malloc(i * sizeof(unsigned int));
2463                 memcpy(um, mapping, i * sizeof(unsigned int));
2464                 unicode_data[unichar].utf32nfdi = um;
2465
2466                 assert(!unicode_data[unichar].utf32nfdicf);
2467                 um = malloc(i * sizeof(unsigned int));
2468                 memcpy(um, mapping, i * sizeof(unsigned int));
2469                 unicode_data[unichar].utf32nfdicf = um;
2470
2471                 /*
2472                  * Add a cookie as a reminder that the hangul syllable
2473                  * decompositions must not be stored in the generated
2474                  * trie.
2475                  */
2476                 unicode_data[unichar].utf8nfdi = malloc(2);
2477                 unicode_data[unichar].utf8nfdi[0] = HANGUL;
2478                 unicode_data[unichar].utf8nfdi[1] = '\0';
2479
2480                 if (verbose > 1)
2481                         print_utf32nfdi(unichar);
2482
2483                 count++;
2484         }
2485         if (verbose > 0)
2486                 printf("Created %d entries\n", count);
2487 }
2488
2489 static void nfdi_decompose(void)
2490 {
2491         unsigned int unichar;
2492         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2493         unsigned int *um;
2494         unsigned int *dc;
2495         int count;
2496         int i;
2497         int j;
2498         int ret;
2499
2500         if (verbose > 0)
2501                 printf("Decomposing nfdi\n");
2502
2503         count = 0;
2504         for (unichar = 0; unichar != 0x110000; unichar++) {
2505                 if (!unicode_data[unichar].utf32nfdi)
2506                         continue;
2507                 for (;;) {
2508                         ret = 1;
2509                         i = 0;
2510                         um = unicode_data[unichar].utf32nfdi;
2511                         while (*um) {
2512                                 dc = unicode_data[*um].utf32nfdi;
2513                                 if (dc) {
2514                                         for (j = 0; dc[j]; j++)
2515                                                 mapping[i++] = dc[j];
2516                                         ret = 0;
2517                                 } else {
2518                                         mapping[i++] = *um;
2519                                 }
2520                                 um++;
2521                         }
2522                         mapping[i++] = 0;
2523                         if (ret)
2524                                 break;
2525                         free(unicode_data[unichar].utf32nfdi);
2526                         um = malloc(i * sizeof(unsigned int));
2527                         memcpy(um, mapping, i * sizeof(unsigned int));
2528                         unicode_data[unichar].utf32nfdi = um;
2529                 }
2530                 /* Add this decomposition to nfdicf if there is no entry. */
2531                 if (!unicode_data[unichar].utf32nfdicf) {
2532                         um = malloc(i * sizeof(unsigned int));
2533                         memcpy(um, mapping, i * sizeof(unsigned int));
2534                         unicode_data[unichar].utf32nfdicf = um;
2535                 }
2536                 if (verbose > 1)
2537                         print_utf32nfdi(unichar);
2538                 count++;
2539         }
2540         if (verbose > 0)
2541                 printf("Processed %d entries\n", count);
2542 }
2543
2544 static void nfdicf_decompose(void)
2545 {
2546         unsigned int unichar;
2547         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2548         unsigned int *um;
2549         unsigned int *dc;
2550         int count;
2551         int i;
2552         int j;
2553         int ret;
2554
2555         if (verbose > 0)
2556                 printf("Decomposing nfdicf\n");
2557         count = 0;
2558         for (unichar = 0; unichar != 0x110000; unichar++) {
2559                 if (!unicode_data[unichar].utf32nfdicf)
2560                         continue;
2561                 for (;;) {
2562                         ret = 1;
2563                         i = 0;
2564                         um = unicode_data[unichar].utf32nfdicf;
2565                         while (*um) {
2566                                 dc = unicode_data[*um].utf32nfdicf;
2567                                 if (dc) {
2568                                         for (j = 0; dc[j]; j++)
2569                                                 mapping[i++] = dc[j];
2570                                         ret = 0;
2571                                 } else {
2572                                         mapping[i++] = *um;
2573                                 }
2574                                 um++;
2575                         }
2576                         mapping[i++] = 0;
2577                         if (ret)
2578                                 break;
2579                         free(unicode_data[unichar].utf32nfdicf);
2580                         um = malloc(i * sizeof(unsigned int));
2581                         memcpy(um, mapping, i * sizeof(unsigned int));
2582                         unicode_data[unichar].utf32nfdicf = um;
2583                 }
2584                 if (verbose > 1)
2585                         print_utf32nfdicf(unichar);
2586                 count++;
2587         }
2588         if (verbose > 0)
2589                 printf("Processed %d entries\n", count);
2590 }
2591
2592 /* ------------------------------------------------------------------ */
2593
/* Forward declarations for the trie-based normalizer defined below. */
struct utf8cursor;

/* Age queries over a string. */
int utf8agemax(struct tree *tree, const char *s);
int utf8nagemax(struct tree *tree, const char *s, size_t len);
int utf8agemin(struct tree *tree, const char *s);
int utf8nagemin(struct tree *tree, const char *s, size_t len);

/* Length of the normalized form of a string. */
ssize_t utf8len(struct tree *tree, const char *s);
ssize_t utf8nlen(struct tree *tree, const char *s, size_t len);

/* Byte-at-a-time normalization through a cursor. */
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s);
int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
		size_t len);
int utf8byte(struct utf8cursor *u8c);
2604
2605 /*
2606  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2607  *
2608  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2609  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2610  *
2611  * SBase = 0xAC00
2612  * LBase = 0x1100
2613  * VBase = 0x1161
2614  * TBase = 0x11A7
2615  * LCount = 19
2616  * VCount = 21
2617  * TCount = 28
2618  * NCount = 588 (VCount * TCount)
2619  * SCount = 11172 (LCount * NCount)
2620  *
2621  * Decomposition:
2622  *   SIndex = s - SBase
2623  *
2624  * LV (Canonical/Full)
2625  *   LIndex = SIndex / NCount
2626  *   VIndex = (SIndex % NCount) / TCount
2627  *   LPart = LBase + LIndex
2628  *   VPart = VBase + VIndex
2629  *
2630  * LVT (Canonical)
2631  *   LVIndex = (SIndex / TCount) * TCount
2632  *   TIndex = (SIndex % TCount)
2633  *   LVPart = SBase + LVIndex
2634  *   TPart = TBase + TIndex
2635  *
2636  * LVT (Full)
2637  *   LIndex = SIndex / NCount
2638  *   VIndex = (SIndex % NCount) / TCount
2639  *   TIndex = (SIndex % TCount)
2640  *   LPart = LBase + LIndex
2641  *   VPart = VBase + VIndex
2642  *   if (TIndex == 0) {
2643  *          d = <LPart, VPart>
2644  *   } else {
2645  *          TPart = TBase + TIndex
2646  *          d = <LPart, VPart, TPart>
2647  *   }
2648  */
2649
/* Hangul constants; the names abbreviate those in the description above. */
#define SB	0xAC00		/* SBase */
#define LB	0x1100		/* LBase */
#define VB	0x1161		/* VBase */
#define TB	0x11A7		/* TBase */
#define LC	19		/* LCount */
#define VC	21		/* VCount */
#define TC	28		/* TCount */
#define NC	(VC * TC)	/* NCount */
#define SC	(LC * NC)	/* SCount */
2660
2661 /* Algorithmic decomposition of hangul syllable. */
2662 static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
2663 {
2664         unsigned int    si;
2665         unsigned int    li;
2666         unsigned int    vi;
2667         unsigned int    ti;
2668         unsigned char   *h;
2669
2670         /* Calculate the SI, LI, VI, and TI values. */
2671         si = utf8decode(str) - SB;
2672         li = si / NC;
2673         vi = (si % NC) / TC;
2674         ti = si % TC;
2675
2676         /* Fill in base of leaf. */
2677         h = hangul;
2678         LEAF_GEN(h) = 2;
2679         LEAF_CCC(h) = DECOMPOSE;
2680         h += 2;
2681
2682         /* Add LPart, a 3-byte UTF-8 sequence. */
2683         h += utf8encode((char *)h, li + LB);
2684
2685         /* Add VPart, a 3-byte UTF-8 sequence. */
2686         h += utf8encode((char *)h, vi + VB);
2687
2688         /* Add TPart if required, also a 3-byte UTF-8 sequence. */
2689         if (ti)
2690                 h += utf8encode((char *)h, ti + TB);
2691
2692         /* Terminate string. */
2693         h[0] = '\0';
2694
2695         return hangul;
2696 }
2697
2698 /*
2699  * Use trie to scan s, touching at most len bytes.
2700  * Returns the leaf if one exists, NULL otherwise.
2701  *
2702  * A non-NULL return guarantees that the UTF-8 sequence starting at s
2703  * is well-formed and corresponds to a known unicode code point.  The
2704  * shorthand for this will be "is valid UTF-8 unicode".
2705  */
2706 static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
2707                                const char *s, size_t len)
2708 {
2709         utf8trie_t      *trie;
2710         int             offlen;
2711         int             offset;
2712         int             mask;
2713         int             node;
2714
2715         if (!tree)
2716                 return NULL;
2717         if (len == 0)
2718                 return NULL;
2719         node = 1;
2720         trie = utf8data + tree->index;
2721         while (node) {
2722                 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2723                 if (*trie & NEXTBYTE) {
2724                         if (--len == 0)
2725                                 return NULL;
2726                         s++;
2727                 }
2728                 mask = 1 << (*trie & BITNUM);
2729                 if (*s & mask) {
2730                         /* Right leg */
2731                         if (offlen) {
2732                                 /* Right node at offset of trie */
2733                                 node = (*trie & RIGHTNODE);
2734                                 offset = trie[offlen];
2735                                 while (--offlen) {
2736                                         offset <<= 8;
2737                                         offset |= trie[offlen];
2738                                 }
2739                                 trie += offset;
2740                         } else if (*trie & RIGHTPATH) {
2741                                 /* Right node after this node */
2742                                 node = (*trie & TRIENODE);
2743                                 trie++;
2744                         } else {
2745                                 /* No right node. */
2746                                 return NULL;
2747                         }
2748                 } else {
2749                         /* Left leg */
2750                         if (offlen) {
2751                                 /* Left node after this node. */
2752                                 node = (*trie & LEFTNODE);
2753                                 trie += offlen + 1;
2754                         } else if (*trie & RIGHTPATH) {
2755                                 /* No left node. */
2756                                 return NULL;
2757                         } else {
2758                                 /* Left node after this node */
2759                                 node = (*trie & TRIENODE);
2760                                 trie++;
2761                         }
2762                 }
2763         }
2764         /*
2765          * Hangul decomposition is done algorithmically. These are the
2766          * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2767          * always 3 bytes long, so s has been advanced twice, and the
2768          * start of the sequence is at s-2.
2769          */
2770         if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
2771                 trie = utf8hangul(s - 2, hangul);
2772         return trie;
2773 }
2774
2775 /*
2776  * Use trie to scan s.
2777  * Returns the leaf if one exists, NULL otherwise.
2778  *
2779  * Forwards to trie_nlookup().
2780  */
2781 static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
2782                               const char *s)
2783 {
2784         return utf8nlookup(tree, hangul, s, (size_t)-1);
2785 }
2786
/*
 * Number of bytes in the UTF-8 sequence that starts at s, judged from
 * the leading byte alone.  The caller must pass a pointer to the first
 * byte of a valid UTF-8 sequence.
 */
static inline int utf8clen(const char *s)
{
	unsigned char lead = *s;

	if (lead < 0xC0)
		return 1;
	if (lead < 0xE0)
		return 2;
	if (lead < 0xF0)
		return 3;
	return 4;
}
2797
2798 /*
2799  * Maximum age of any character in s.
2800  * Return -1 if s is not valid UTF-8 unicode.
2801  * Return 0 if only non-assigned code points are used.
2802  */
2803 int utf8agemax(struct tree *tree, const char *s)
2804 {
2805         utf8leaf_t      *leaf;
2806         int             age = 0;
2807         int             leaf_age;
2808         unsigned char   hangul[UTF8HANGULLEAF];
2809
2810         if (!tree)
2811                 return -1;
2812
2813         while (*s) {
2814                 leaf = utf8lookup(tree, hangul, s);
2815                 if (!leaf)
2816                         return -1;
2817                 leaf_age = ages[LEAF_GEN(leaf)];
2818                 if (leaf_age <= tree->maxage && leaf_age > age)
2819                         age = leaf_age;
2820                 s += utf8clen(s);
2821         }
2822         return age;
2823 }
2824
2825 /*
2826  * Minimum age of any character in s.
2827  * Return -1 if s is not valid UTF-8 unicode.
2828  * Return 0 if non-assigned code points are used.
2829  */
2830 int utf8agemin(struct tree *tree, const char *s)
2831 {
2832         utf8leaf_t      *leaf;
2833         int             age;
2834         int             leaf_age;
2835         unsigned char   hangul[UTF8HANGULLEAF];
2836
2837         if (!tree)
2838                 return -1;
2839         age = tree->maxage;
2840         while (*s) {
2841                 leaf = utf8lookup(tree, hangul, s);
2842                 if (!leaf)
2843                         return -1;
2844                 leaf_age = ages[LEAF_GEN(leaf)];
2845                 if (leaf_age <= tree->maxage && leaf_age < age)
2846                         age = leaf_age;
2847                 s += utf8clen(s);
2848         }
2849         return age;
2850 }
2851
2852 /*
2853  * Maximum age of any character in s, touch at most len bytes.
2854  * Return -1 if s is not valid UTF-8 unicode.
2855  */
2856 int utf8nagemax(struct tree *tree, const char *s, size_t len)
2857 {
2858         utf8leaf_t      *leaf;
2859         int             age = 0;
2860         int             leaf_age;
2861         unsigned char   hangul[UTF8HANGULLEAF];
2862
2863         if (!tree)
2864                 return -1;
2865
2866         while (len && *s) {
2867                 leaf = utf8nlookup(tree, hangul, s, len);
2868                 if (!leaf)
2869                         return -1;
2870                 leaf_age = ages[LEAF_GEN(leaf)];
2871                 if (leaf_age <= tree->maxage && leaf_age > age)
2872                         age = leaf_age;
2873                 len -= utf8clen(s);
2874                 s += utf8clen(s);
2875         }
2876         return age;
2877 }
2878
2879 /*
2880  * Maximum age of any character in s, touch at most len bytes.
2881  * Return -1 if s is not valid UTF-8 unicode.
2882  */
2883 int utf8nagemin(struct tree *tree, const char *s, size_t len)
2884 {
2885         utf8leaf_t      *leaf;
2886         int             leaf_age;
2887         int             age;
2888         unsigned char   hangul[UTF8HANGULLEAF];
2889
2890         if (!tree)
2891                 return -1;
2892         age = tree->maxage;
2893         while (len && *s) {
2894                 leaf = utf8nlookup(tree, hangul, s, len);
2895                 if (!leaf)
2896                         return -1;
2897                 leaf_age = ages[LEAF_GEN(leaf)];
2898                 if (leaf_age <= tree->maxage && leaf_age < age)
2899                         age = leaf_age;
2900                 len -= utf8clen(s);
2901                 s += utf8clen(s);
2902         }
2903         return age;
2904 }
2905
2906 /*
2907  * Length of the normalization of s.
2908  * Return -1 if s is not valid UTF-8 unicode.
2909  *
2910  * A string of Default_Ignorable_Code_Point has length 0.
2911  */
2912 ssize_t utf8len(struct tree *tree, const char *s)
2913 {
2914         utf8leaf_t      *leaf;
2915         size_t          ret = 0;
2916         unsigned char   hangul[UTF8HANGULLEAF];
2917
2918         if (!tree)
2919                 return -1;
2920         while (*s) {
2921                 leaf = utf8lookup(tree, hangul, s);
2922                 if (!leaf)
2923                         return -1;
2924                 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2925                         ret += utf8clen(s);
2926                 else if (LEAF_CCC(leaf) == DECOMPOSE)
2927                         ret += strlen(LEAF_STR(leaf));
2928                 else
2929                         ret += utf8clen(s);
2930                 s += utf8clen(s);
2931         }
2932         return ret;
2933 }
2934
2935 /*
2936  * Length of the normalization of s, touch at most len bytes.
2937  * Return -1 if s is not valid UTF-8 unicode.
2938  */
2939 ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2940 {
2941         utf8leaf_t      *leaf;
2942         size_t          ret = 0;
2943         unsigned char   hangul[UTF8HANGULLEAF];
2944
2945         if (!tree)
2946                 return -1;
2947         while (len && *s) {
2948                 leaf = utf8nlookup(tree, hangul, s, len);
2949                 if (!leaf)
2950                         return -1;
2951                 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2952                         ret += utf8clen(s);
2953                 else if (LEAF_CCC(leaf) == DECOMPOSE)
2954                         ret += strlen(LEAF_STR(leaf));
2955                 else
2956                         ret += utf8clen(s);
2957                 len -= utf8clen(s);
2958                 s += utf8clen(s);
2959         }
2960         return ret;
2961 }
2962
2963 /*
2964  * Cursor structure used by the normalizer.
2965  */
2966 struct utf8cursor {
2967         struct tree     *tree;
2968         const char      *s;
2969         const char      *p;
2970         const char      *ss;
2971         const char      *sp;
2972         unsigned int    len;
2973         unsigned int    slen;
2974         short int       ccc;
2975         short int       nccc;
2976         unsigned int    unichar;
2977         unsigned char   hangul[UTF8HANGULLEAF];
2978 };
2979
2980 /*
2981  * Set up an utf8cursor for use by utf8byte().
2982  *
2983  *   s      : string.
2984  *   len    : length of s.
2985  *   u8c    : pointer to cursor.
2986  *   trie   : utf8trie_t to use for normalization.
2987  *
2988  * Returns -1 on error, 0 on success.
2989  */
2990 int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2991                 size_t len)
2992 {
2993         if (!tree)
2994                 return -1;
2995         if (!s)
2996                 return -1;
2997         u8c->tree = tree;
2998         u8c->s = s;
2999         u8c->p = NULL;
3000         u8c->ss = NULL;
3001         u8c->sp = NULL;
3002         u8c->len = len;
3003         u8c->slen = 0;
3004         u8c->ccc = STOPPER;
3005         u8c->nccc = STOPPER;
3006         u8c->unichar = 0;
3007         /* Check we didn't clobber the maximum length. */
3008         if (u8c->len != len)
3009                 return -1;
3010         /* The first byte of s may not be an utf8 continuation. */
3011         if (len > 0 && (*s & 0xC0) == 0x80)
3012                 return -1;
3013         return 0;
3014 }
3015
/*
 * Prepare a cursor so that utf8byte() can normalize the NUL-terminated
 * string s.
 *
 *   u8c    : cursor to initialize.
 *   tree   : tree to use for normalization.
 *   s      : NUL-terminated string.
 *
 * Forwards to utf8ncursor() with the largest unsigned int as length.
 * Returns -1 on error, 0 on success.
 */
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
{
	const unsigned int unbounded = (unsigned int)-1;

	return utf8ncursor(u8c, tree, s, unbounded);
}
3029
3030 /*
3031  * Get one byte from the normalized form of the string described by u8c.
3032  *
3033  * Returns the byte cast to an unsigned char on succes, and -1 on failure.
3034  *
3035  * The cursor keeps track of the location in the string in u8c->s.
3036  * When a character is decomposed, the current location is stored in
3037  * u8c->p, and u8c->s is set to the start of the decomposition. Note
3038  * that bytes from a decomposition do not count against u8c->len.
3039  *
3040  * Characters are emitted if they match the current CCC in u8c->ccc.
3041  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3042  * and the function returns 0 in that case.
3043  *
3044  * Sorting by CCC is done by repeatedly scanning the string.  The
3045  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3046  * the start of the scan.  The first pass finds the lowest CCC to be
3047  * emitted and stores it in u8c->nccc, the second pass emits the
3048  * characters with this CCC and finds the next lowest CCC. This limits
3049  * the number of passes to 1 + the number of different CCCs in the
3050  * sequence being scanned.
3051  *
3052  * Therefore:
3053  *  u8c->p  != NULL -> a decomposition is being scanned.
3054  *  u8c->ss != NULL -> this is a repeating scan.
3055  *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
3056  */
3057 int utf8byte(struct utf8cursor *u8c)
3058 {
3059         utf8leaf_t *leaf;
3060         int ccc;
3061
3062         for (;;) {
3063                 /* Check for the end of a decomposed character. */
3064                 if (u8c->p && *u8c->s == '\0') {
3065                         u8c->s = u8c->p;
3066                         u8c->p = NULL;
3067                 }
3068
3069                 /* Check for end-of-string. */
3070                 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
3071                         /* There is no next byte. */
3072                         if (u8c->ccc == STOPPER)
3073                                 return 0;
3074                         /* End-of-string during a scan counts as a stopper. */
3075                         ccc = STOPPER;
3076                         goto ccc_mismatch;
3077                 } else if ((*u8c->s & 0xC0) == 0x80) {
3078                         /* This is a continuation of the current character. */
3079                         if (!u8c->p)
3080                                 u8c->len--;
3081                         return (unsigned char)*u8c->s++;
3082                 }
3083
3084                 /* Look up the data for the current character. */
3085                 if (u8c->p) {
3086                         leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3087                 } else {
3088                         leaf = utf8nlookup(u8c->tree, u8c->hangul,
3089                                            u8c->s, u8c->len);
3090                 }
3091
3092                 /* No leaf found implies that the input is a binary blob. */
3093                 if (!leaf)
3094                         return -1;
3095
3096                 /* Characters that are too new have CCC 0. */
3097                 if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
3098                         ccc = STOPPER;
3099                 } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
3100                         u8c->len -= utf8clen(u8c->s);
3101                         u8c->p = u8c->s + utf8clen(u8c->s);
3102                         u8c->s = LEAF_STR(leaf);
3103                         /* Empty decomposition implies CCC 0. */
3104                         if (*u8c->s == '\0') {
3105                                 if (u8c->ccc == STOPPER)
3106                                         continue;
3107                                 ccc = STOPPER;
3108                                 goto ccc_mismatch;
3109                         }
3110                         leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3111                         ccc = LEAF_CCC(leaf);
3112                 }
3113                 u8c->unichar = utf8decode(u8c->s);
3114
3115                 /*
3116                  * If this is not a stopper, then see if it updates
3117                  * the next canonical class to be emitted.
3118                  */
3119                 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
3120                         u8c->nccc = ccc;
3121
3122                 /*
3123                  * Return the current byte if this is the current
3124                  * combining class.
3125                  */
3126                 if (ccc == u8c->ccc) {
3127                         if (!u8c->p)
3128                                 u8c->len--;
3129                         return (unsigned char)*u8c->s++;
3130                 }
3131
3132                 /* Current combining class mismatch. */
3133         ccc_mismatch:
3134                 if (u8c->nccc == STOPPER) {
3135                         /*
3136                          * Scan forward for the first canonical class
3137                          * to be emitted.  Save the position from
3138                          * which to restart.
3139                          */
3140                         assert(u8c->ccc == STOPPER);
3141                         u8c->ccc = MINCCC - 1;
3142                         u8c->nccc = ccc;
3143                         u8c->sp = u8c->p;
3144                         u8c->ss = u8c->s;
3145                         u8c->slen = u8c->len;
3146                         if (!u8c->p)
3147                                 u8c->len -= utf8clen(u8c->s);
3148                         u8c->s += utf8clen(u8c->s);
3149                 } else if (ccc != STOPPER) {
3150                         /* Not a stopper, and not the ccc we're emitting. */
3151                         if (!u8c->p)
3152                                 u8c->len -= utf8clen(u8c->s);
3153                         u8c->s += utf8clen(u8c->s);
3154                 } else if (u8c->nccc != MAXCCC + 1) {
3155                         /* At a stopper, restart for next ccc. */
3156                         u8c->ccc = u8c->nccc;
3157                         u8c->nccc = MAXCCC + 1;
3158                         u8c->s = u8c->ss;
3159                         u8c->p = u8c->sp;
3160                         u8c->len = u8c->slen;
3161                 } else {
3162                         /* All done, proceed from here. */
3163                         u8c->ccc = STOPPER;
3164                         u8c->nccc = STOPPER;
3165                         u8c->sp = NULL;
3166                         u8c->ss = NULL;
3167                         u8c->slen = 0;
3168                 }
3169         }
3170 }
3171
3172 /* ------------------------------------------------------------------ */
3173
3174 static int normalize_line(struct tree *tree)
3175 {
3176         char *s;
3177         char *t;
3178         int c;
3179         struct utf8cursor u8c;
3180
3181         /* First test: null-terminated string. */
3182         s = buf2;
3183         t = buf3;
3184         if (utf8cursor(&u8c, tree, s))
3185                 return -1;
3186         while ((c = utf8byte(&u8c)) > 0)
3187                 if (c != (unsigned char)*t++)
3188                         return -1;
3189         if (c < 0)
3190                 return -1;
3191         if (*t != 0)
3192                 return -1;
3193
3194         /* Second test: length-limited string. */
3195         s = buf2;
3196         /* Replace NUL with a value that will cause an error if seen. */
3197         s[strlen(s) + 1] = -1;
3198         t = buf3;
3199         if (utf8cursor(&u8c, tree, s))
3200                 return -1;
3201         while ((c = utf8byte(&u8c)) > 0)
3202                 if (c != (unsigned char)*t++)
3203                         return -1;
3204         if (c < 0)
3205                 return -1;
3206         if (*t != 0)
3207                 return -1;
3208
3209         return 0;
3210 }
3211
3212 static void normalization_test(void)
3213 {
3214         FILE *file;
3215         unsigned int unichar;
3216         struct unicode_data *data;
3217         char *s;
3218         char *t;
3219         int ret;
3220         int ignorables;
3221         int tests = 0;
3222         int failures = 0;
3223
3224         if (verbose > 0)
3225                 printf("Parsing %s\n", test_name);
3226         /* Step one, read data from file. */
3227         file = fopen(test_name, "r");
3228         if (!file)
3229                 open_fail(test_name, errno);
3230
3231         while (fgets(line, LINESIZE, file)) {
3232                 ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3233                              buf0, buf1);
3234                 if (ret != 2 || *line == '#')
3235                         continue;
3236                 s = buf0;
3237                 t = buf2;
3238                 while (*s) {
3239                         unichar = strtoul(s, &s, 16);
3240                         t += utf8encode(t, unichar);
3241                 }
3242                 *t = '\0';
3243
3244                 ignorables = 0;
3245                 s = buf1;
3246                 t = buf3;
3247                 while (*s) {
3248                         unichar = strtoul(s, &s, 16);
3249                         data = &unicode_data[unichar];
3250                         if (data->utf8nfdi && !*data->utf8nfdi)
3251                                 ignorables = 1;
3252                         else
3253                                 t += utf8encode(t, unichar);
3254                 }
3255                 *t = '\0';
3256
3257                 tests++;
3258                 if (normalize_line(nfdi_tree) < 0) {
3259                         printf("Line %s -> %s", buf0, buf1);
3260                         if (ignorables)
3261                                 printf(" (ignorables removed)");
3262                         printf(" failure\n");
3263                         failures++;
3264                 }
3265         }
3266         fclose(file);
3267         if (verbose > 0)
3268                 printf("Ran %d tests with %d failures\n", tests, failures);
3269         if (failures)
3270                 file_fail(test_name);
3271 }
3272
3273 /* ------------------------------------------------------------------ */
3274
3275 static void write_file(void)
3276 {
3277         FILE *file;
3278         int i;
3279         int j;
3280         int t;
3281         int gen;
3282
3283         if (verbose > 0)
3284                 printf("Writing %s\n", utf8_name);
3285         file = fopen(utf8_name, "w");
3286         if (!file)
3287                 open_fail(utf8_name, errno);
3288
3289         fprintf(file, "/* This file is generated code, do not edit. */\n");
3290         fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3291         fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
3292         fprintf(file, "#endif\n");
3293         fprintf(file, "\n");
3294         fprintf(file, "static const unsigned int utf8vers = %#x;\n",
3295                 unicode_maxage);
3296         fprintf(file, "\n");
3297         fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3298         for (i = 0; i != ages_count; i++)
3299                 fprintf(file, "\t%#x%s\n", ages[i],
3300                         ages[i] == unicode_maxage ? "" : ",");
3301         fprintf(file, "};\n");
3302         fprintf(file, "\n");
3303         fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
3304         t = 0;
3305         for (gen = 0; gen < ages_count; gen++) {
3306                 fprintf(file, "\t{ %#x, %d }%s\n",
3307                         ages[gen], trees[t].index,
3308                         ages[gen] == unicode_maxage ? "" : ",");
3309                 if (trees[t].maxage == ages[gen])
3310                         t += 2;
3311         }
3312         fprintf(file, "};\n");
3313         fprintf(file, "\n");
3314         fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
3315         t = 1;
3316         for (gen = 0; gen < ages_count; gen++) {
3317                 fprintf(file, "\t{ %#x, %d }%s\n",
3318                         ages[gen], trees[t].index,
3319                         ages[gen] == unicode_maxage ? "" : ",");
3320                 if (trees[t].maxage == ages[gen])
3321                         t += 2;
3322         }
3323         fprintf(file, "};\n");
3324         fprintf(file, "\n");
3325         fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3326                 utf8data_size);
3327         t = 0;
3328         for (i = 0; i != utf8data_size; i += 16) {
3329                 if (i == trees[t].index) {
3330                         fprintf(file, "\t/* %s_%x */\n",
3331                                 trees[t].type, trees[t].maxage);
3332                         if (t < trees_count-1)
3333                                 t++;
3334                 }
3335                 fprintf(file, "\t");
3336                 for (j = i; j != i + 16; j++)
3337                         fprintf(file, "0x%.2x%s", utf8data[j],
3338                                 (j < utf8data_size -1 ? "," : ""));
3339                 fprintf(file, "\n");
3340         }
3341         fprintf(file, "};\n");
3342         fclose(file);
3343 }
3344
3345 /* ------------------------------------------------------------------ */
3346
3347 int main(int argc, char *argv[])
3348 {
3349         unsigned int unichar;
3350         int opt;
3351
3352         argv0 = argv[0];
3353
3354         while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3355                 switch (opt) {
3356                 case 'a':
3357                         age_name = optarg;
3358                         break;
3359                 case 'c':
3360                         ccc_name = optarg;
3361                         break;
3362                 case 'd':
3363                         data_name = optarg;
3364                         break;
3365                 case 'f':
3366                         fold_name = optarg;
3367                         break;
3368                 case 'n':
3369                         norm_name = optarg;
3370                         break;
3371                 case 'o':
3372                         utf8_name = optarg;
3373                         break;
3374                 case 'p':
3375                         prop_name = optarg;
3376                         break;
3377                 case 't':
3378                         test_name = optarg;
3379                         break;
3380                 case 'v':
3381                         verbose++;
3382                         break;
3383                 case 'h':
3384                         help();
3385                         exit(0);
3386                 default:
3387                         usage();
3388                 }
3389         }
3390
3391         if (verbose > 1)
3392                 help();
3393         for (unichar = 0; unichar != 0x110000; unichar++)
3394                 unicode_data[unichar].code = unichar;
3395         age_init();
3396         ccc_init();
3397         nfdi_init();
3398         nfdicf_init();
3399         ignore_init();
3400         corrections_init();
3401         hangul_decompose();
3402         nfdi_decompose();
3403         nfdicf_decompose();
3404         utf8_init();
3405         trees_init();
3406         trees_populate();
3407         trees_reduce();
3408         trees_verify();
3409         /* Prevent "unused function" warning. */
3410         (void)lookup(nfdi_tree, " ");
3411         if (verbose > 2)
3412                 tree_walk(nfdi_tree);
3413         if (verbose > 2)
3414                 tree_walk(nfdicf_tree);
3415         normalization_test();
3416         write_file();
3417
3418         return 0;
3419 }