fs/unicode/mkutf8data.c

   1 /*
   2  * Copyright (c) 2014 SGI.
   3  * All rights reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 /* Generator for a compact trie for unicode normalization */
  20
  21 #include <sys/types.h>
  22 #include <stddef.h>
  23 #include <stdlib.h>
  24 #include <stdio.h>
  25 #include <assert.h>
  26 #include <string.h>
  27 #include <unistd.h>
  28 #include <errno.h>
  29
  30 /* Default names of the in- and output files. */
  31
   32 #define AGE_NAME        "DerivedAge.txt"
   33 #define CCC_NAME        "DerivedCombiningClass.txt"
   34 #define PROP_NAME       "DerivedCoreProperties.txt"
   35 #define DATA_NAME       "UnicodeData.txt"
   36 #define FOLD_NAME       "CaseFolding.txt"
   37 #define NORM_NAME       "NormalizationCorrections.txt"
   38 #define TEST_NAME       "NormalizationTest.txt"
   39 #define UTF8_NAME       "utf8data.c"
   40
      /*
       * File names actually in use, each initialised to its default above.
       * The pointers themselves are not const, so they can be re-pointed at
       * run time.
       * NOTE(review): presumably these are overridden from the command line
       * and utf8_name is the generated output file; neither is visible in
       * this part of the file - confirm.
       */
   41 const char      *age_name  = AGE_NAME;
   42 const char      *ccc_name  = CCC_NAME;
   43 const char      *prop_name = PROP_NAME;
   44 const char      *data_name = DATA_NAME;
   45 const char      *fold_name = FOLD_NAME;
   46 const char      *norm_name = NORM_NAME;
   47 const char      *test_name = TEST_NAME;
   48 const char      *utf8_name = UTF8_NAME;
  49
   50 int verbose = 0;        /* values > 0 make prune() and mark_nodes() print progress */
   51
   52 /* An arbitrary line size limit on input lines. */
   53
   54 #define LINESIZE        1024
      /*
       * Five global scratch buffers of LINESIZE bytes each.
       * NOTE(review): presumably 'line' holds the current input line and
       * buf0..buf3 hold fields parsed from it; the users are not visible
       * in this part of the file - confirm.
       */
   55 char line[LINESIZE];
   56 char buf0[LINESIZE];
   57 char buf1[LINESIZE];
   58 char buf2[LINESIZE];
   59 char buf3[LINESIZE];
   60
   61 const char *argv0;      /* NOTE(review): presumably the program name for diagnostics - confirm */
   62
      /* Element count of a real array x; gives a wrong answer if x is a pointer. */
   63 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
  64
  65 /* ------------------------------------------------------------------ */
  66
  67 /*
  68  * Unicode version numbers consist of three parts: major, minor, and a
  69  * revision.  These numbers are packed into an unsigned int to obtain
  70  * a single version number.
  71  *
  72  * To save space in the generated trie, the unicode version is not
  73  * stored directly, instead we calculate a generation number from the
  74  * unicode versions seen in the DerivedAge file, and use that as an
  75  * index into a table of unicode versions.
  76  */
  77 #define UNICODE_MAJ_SHIFT               (16)
  78 #define UNICODE_MIN_SHIFT               (8)
  79
  80 #define UNICODE_MAJ_MAX                 ((unsigned short)-1)
  81 #define UNICODE_MIN_MAX                 ((unsigned char)-1)
  82 #define UNICODE_REV_MAX                 ((unsigned char)-1)
  83
  84 #define UNICODE_AGE(MAJ,MIN,REV)                        \
  85         (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |   \
  86          ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |   \
  87          ((unsigned int)(REV)))
  88
  89 unsigned int *ages;
  90 int ages_count;
  91
  92 unsigned int unicode_maxage;
  93
  94 static int age_valid(unsigned int major, unsigned int minor,
  95                      unsigned int revision)
  96 {
  97         if (major > UNICODE_MAJ_MAX)
  98                 return 0;
  99         if (minor > UNICODE_MIN_MAX)
 100                 return 0;
 101         if (revision > UNICODE_REV_MAX)
 102                 return 0;
 103         return 1;
 104 }
 105
 106 /* ------------------------------------------------------------------ */
 107
 108 /*
 109  * utf8trie_t
 110  *
 111  * A compact binary tree, used to decode UTF-8 characters.
 112  *
 113  * Internal nodes are one byte for the node itself, and up to three
 114  * bytes for an offset into the tree.  The first byte contains the
 115  * following information:
 116  *  NEXTBYTE  - flag        - advance to next byte if set
  117  *  BITNUM    - 3 bit field - the bit number to be tested
 118  *  OFFLEN    - 2 bit field - number of bytes in the offset
 119  * if offlen == 0 (non-branching node)
 120  *  RIGHTPATH - 1 bit field - set if the following node is for the
 121  *                            right-hand path (tested bit is set)
 122  *  TRIENODE  - 1 bit field - set if the following node is an internal
 123  *                            node, otherwise it is a leaf node
 124  * if offlen != 0 (branching node)
 125  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 126  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 127  *
 128  * Due to the way utf8 works, there cannot be branching nodes with
 129  * NEXTBYTE set, and moreover those nodes always have a righthand
 130  * descendant.
 131  */
  132 typedef unsigned char utf8trie_t;
      /* Fields found in the first byte of every internal node. */
  133 #define BITNUM          0x07    /* 3-bit field: number of the bit to test */
  134 #define NEXTBYTE        0x08    /* flag: advance to the next byte if set */
  135 #define OFFLEN          0x30    /* 2-bit field: number of bytes in the offset */
  136 #define OFFLEN_SHIFT    4       /* shift that moves the OFFLEN field down to bit 0 */
      /* Bits 6 and 7 when offlen == 0 (non-branching node). */
  137 #define RIGHTPATH       0x40    /* following node is for the right-hand path */
  138 #define TRIENODE        0x80    /* following node is internal, otherwise a leaf */
      /* The same two bits when offlen != 0 (branching node). */
  139 #define RIGHTNODE       0x40    /* right-hand node is internal */
  140 #define LEFTNODE        0x80    /* left-hand node is internal */
 141
 142 /*
 143  * utf8leaf_t
 144  *
 145  * The leaves of the trie are embedded in the trie, and so the same
 146  * underlying datatype, unsigned char.
 147  *
 148  * leaf[0]: The unicode version, stored as a generation number that is
 149  *          an index into utf8agetab[].  With this we can filter code
 150  *          points based on the unicode version in which they were
 151  *          defined.  The CCC of a non-defined code point is 0.
 152  * leaf[1]: Canonical Combining Class. During normalization, we need
 153  *          to do a stable sort into ascending order of all characters
 154  *          with a non-zero CCC that occur between two characters with
  155  *          a CCC of 0, or at the beginning or end of a string.
 156  *          The unicode standard guarantees that all CCC values are
 157  *          between 0 and 254 inclusive, which leaves 255 available as
 158  *          a special value.
 159  *          Code points with CCC 0 are known as stoppers.
 160  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
 161  *          start of a NUL-terminated string that is the decomposition
 162  *          of the character.
 163  *          The CCC of a decomposable character is the same as the CCC
 164  *          of the first character of its decomposition.
 165  *          Some characters decompose as the empty string: these are
 166  *          characters with the Default_Ignorable_Code_Point property.
 167  *          These do affect normalization, as they all have CCC 0.
 168  *
 169  * The decompositions in the trie have been fully expanded.
 170  *
 171  * Casefolding, if applicable, is also done using decompositions.
 172  */
  173 typedef unsigned char utf8leaf_t;
  174
      /* Accessors for the leaf layout: generation, CCC, decomposition. */
  175 #define LEAF_GEN(LEAF)  ((LEAF)[0])     /* leaf[0]: generation number */
  176 #define LEAF_CCC(LEAF)  ((LEAF)[1])     /* leaf[1]: CCC, or DECOMPOSE */
  177 #define LEAF_STR(LEAF)  ((const char*)((LEAF) + 2))     /* leaf[2]...: NUL-terminated string */
  178
      /* NOTE(review): presumably the largest generation a one-byte leaf[0] can hold - confirm. */
  179 #define MAXGEN          (255)
  180
  181 #define MINCCC          (0)     /* smallest CCC value */
  182 #define MAXCCC          (254)   /* largest real CCC value */
  183 #define STOPPER         (0)     /* code points with CCC 0 are stoppers */
  184 #define DECOMPOSE       (255)   /* special leaf[1] value: leaf[2] starts a decomposition */
      /*
       * NOTE(review): HANGUL and UTF8HANGULLEAF are not used in the visible
       * part of the file; presumably a marker byte and a leaf size for
       * Hangul syllables - confirm against their users.
       */
  185 #define HANGUL          ((char)(255))
  186
  187 #define UTF8HANGULLEAF  (12)
  188
      /* Forward declarations; the definitions are not in this part of the file. */
  189 struct tree;
  190 static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
  191                                const char *, size_t);
  192 static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
  193
      /* NOTE(review): presumably the generated trie data and its size in bytes - confirm. */
  194 unsigned char *utf8data;
  195 size_t utf8data_size;
  196
      /* NOTE(review): presumably the NFD+ignorable and NFD+ignorable+casefold tries - confirm. */
  197 utf8trie_t *nfdi;
  198 utf8trie_t *nfdicf;
 199
 200 /* ------------------------------------------------------------------ */
 201
 202 /*
 203  * UTF8 valid ranges.
 204  *
 205  * The UTF-8 encoding spreads the bits of a 32bit word over several
 206  * bytes. This table gives the ranges that can be held and how they'd
 207  * be represented.
 208  *
 209  * 0x00000000 0x0000007F: 0xxxxxxx
 210  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
 211  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 212  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 213  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 214  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 215  *
 216  * There is an additional requirement on UTF-8, in that only the
 217  * shortest representation of a 32bit value is to be used.  A decoder
 218  * must not decode sequences that do not satisfy this requirement.
 219  * Thus the allowed ranges have a lower bound.
 220  *
 221  * 0x00000000 0x0000007F: 0xxxxxxx
 222  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
 223  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 224  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 225  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 226  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 227  *
 228  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
 229  * 17 planes of 65536 values.  This limits the sequences actually seen
 230  * even more, to just the following.
 231  *
 232  *          0 -     0x7f: 0                     0x7f
 233  *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
 234  *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
 235  *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
 236  *
 237  * Even within those ranges not all values are allowed: the surrogates
 238  * 0xd800 - 0xdfff should never be seen.
 239  *
 240  * Note that the longest sequence seen with valid usage is 4 bytes,
  241  * the same as a single UTF-32 character.  This makes the UTF-8
 242  * representation of Unicode strictly smaller than UTF-32.
 243  *
 244  * The shortest sequence requirement was introduced by:
 245  *    Corrigendum #1: UTF-8 Shortest Form
 246  * It can be found here:
 247  *    http://www.unicode.org/versions/corrigendum1.html
 248  *
 249  */
 250
 251 #define UTF8_2_BITS     0xC0
 252 #define UTF8_3_BITS     0xE0
 253 #define UTF8_4_BITS     0xF0
 254 #define UTF8_N_BITS     0x80
 255 #define UTF8_2_MASK     0xE0
 256 #define UTF8_3_MASK     0xF0
 257 #define UTF8_4_MASK     0xF8
 258 #define UTF8_N_MASK     0xC0
 259 #define UTF8_V_MASK     0x3F
 260 #define UTF8_V_SHIFT    6
 261
 262 static int utf8encode(char *str, unsigned int val)
 263 {
 264         int len;
 265
 266         if (val < 0x80) {
 267                 str[0] = val;
 268                 len = 1;
 269         } else if (val < 0x800) {
 270                 str[1] = val & UTF8_V_MASK;
 271                 str[1] |= UTF8_N_BITS;
 272                 val >>= UTF8_V_SHIFT;
 273                 str[0] = val;
 274                 str[0] |= UTF8_2_BITS;
 275                 len = 2;
 276         } else if (val < 0x10000) {
 277                 str[2] = val & UTF8_V_MASK;
 278                 str[2] |= UTF8_N_BITS;
 279                 val >>= UTF8_V_SHIFT;
 280                 str[1] = val & UTF8_V_MASK;
 281                 str[1] |= UTF8_N_BITS;
 282                 val >>= UTF8_V_SHIFT;
 283                 str[0] = val;
 284                 str[0] |= UTF8_3_BITS;
 285                 len = 3;
 286         } else if (val < 0x110000) {
 287                 str[3] = val & UTF8_V_MASK;
 288                 str[3] |= UTF8_N_BITS;
 289                 val >>= UTF8_V_SHIFT;
 290                 str[2] = val & UTF8_V_MASK;
 291                 str[2] |= UTF8_N_BITS;
 292                 val >>= UTF8_V_SHIFT;
 293                 str[1] = val & UTF8_V_MASK;
 294                 str[1] |= UTF8_N_BITS;
 295                 val >>= UTF8_V_SHIFT;
 296                 str[0] = val;
 297                 str[0] |= UTF8_4_BITS;
 298                 len = 4;
 299         } else {
 300                 printf("%#x: illegal val\n", val);
 301                 len = 0;
 302         }
 303         return len;
 304 }
 305
 306 static unsigned int utf8decode(const char *str)
 307 {
 308         const unsigned char *s = (const unsigned char*)str;
 309         unsigned int unichar = 0;
 310
 311         if (*s < 0x80) {
 312                 unichar = *s;
 313         } else if (*s < UTF8_3_BITS) {
 314                 unichar = *s++ & 0x1F;
 315                 unichar <<= UTF8_V_SHIFT;
 316                 unichar |= *s & 0x3F;
 317         } else if (*s < UTF8_4_BITS) {
 318                 unichar = *s++ & 0x0F;
 319                 unichar <<= UTF8_V_SHIFT;
 320                 unichar |= *s++ & 0x3F;
 321                 unichar <<= UTF8_V_SHIFT;
 322                 unichar |= *s & 0x3F;
 323         } else {
 324                 unichar = *s++ & 0x0F;
 325                 unichar <<= UTF8_V_SHIFT;
 326                 unichar |= *s++ & 0x3F;
 327                 unichar <<= UTF8_V_SHIFT;
 328                 unichar |= *s++ & 0x3F;
 329                 unichar <<= UTF8_V_SHIFT;
 330                 unichar |= *s & 0x3F;
 331         }
 332         return unichar;
 333 }
 334
 335 static int utf32valid(unsigned int unichar)
 336 {
 337         return unichar < 0x110000;
 338 }
 339
  /* True when U is in 0xAC00..0xD7A3 inclusive.  U is evaluated twice. */
  340 #define HANGUL_SYLLABLE(U)      ((U) >= 0xAC00 && (U) <= 0xD7A3)
  341
      /*
       * Tags stored in tree.childnode and node.leftnode/rightnode that say
       * whether the matching void pointer is a struct node or a leaf.
       */
  342 #define NODE 1
  343 #define LEAF 0
  344
      /*
       * One trie under construction, plus the callbacks that handle its
       * leaves, which the tree code itself treats as opaque pointers.
       */
  345 struct tree {
  346         void *root;             /* struct node or leaf, see childnode; NULL while empty */
  347         int childnode;          /* NODE or LEAF: what root points at */
  348         const char *type;       /* printed with maxage as "%s_%x" to name the tree */
  349         unsigned int maxage;
  350         struct tree *next;      /* NOTE(review): not used in the visible code - confirm purpose */
  351         int (*leaf_equal)(void *, void *);      /* non-zero if two leaves may be merged */
  352         void (*leaf_print)(void *, int);        /* called with a leaf and an indent */
              /* NOTE(review): the next four callbacks are not called in the visible code. */
  353         int (*leaf_mark)(void *);
  354         int (*leaf_size)(void *);
  355         int *(*leaf_index)(struct tree *, void *);
  356         unsigned char *(*leaf_emit)(void *, unsigned char *);
  357         int leafindex[0x110000];        /* NOTE(review): presumably one slot per code point - confirm */
  358         int index;
  359 };
  360
      /*
       * Internal node of the binary trie.  Each node tests one bit of the
       * key and selects its left (bit clear) or right (bit set) child.
       */
  361 struct node {
              /* alloc_node() starts these four at 0, -1, 0 and 4. */
              /* NOTE(review): presumably used when the tree is laid out and emitted - confirm. */
  362         int index;
  363         int offset;
  364         int mark;
  365         int size;
  366         struct node *parent;    /* NULL for the root */
  367         void *left;             /* child for a clear bit; NULL if absent */
  368         void *right;            /* child for a set bit; NULL if absent */
  369         unsigned char bitnum;   /* tested bit; (bitnum & 7) is the bit within the key byte */
  370         unsigned char nextbyte; /* non-zero: step to the next key byte before testing */
  371         unsigned char leftnode; /* NODE or LEAF: what left points at */
  372         unsigned char rightnode;        /* NODE or LEAF: what right points at */
              /* Choices made along a singleton chain below this node, see prune(). */
  373         unsigned int keybits;
  374         unsigned int keymask;
  375 };
 376
 377 /*
 378  * Example lookup function for a tree.
 379  */
 380 static void *lookup(struct tree *tree, const char *key)
 381 {
 382         struct node *node;
 383         void *leaf = NULL;
 384
 385         node = tree->root;
 386         while (!leaf && node) {
 387                 if (node->nextbyte)
 388                         key++;
 389                 if (*key & (1 << (node->bitnum & 7))) {
 390                         /* Right leg */
 391                         if (node->rightnode == NODE) {
 392                                 node = node->right;
 393                         } else if (node->rightnode == LEAF) {
 394                                 leaf = node->right;
 395                         } else {
 396                                 node = NULL;
 397                         }
 398                 } else {
 399                         /* Left leg */
 400                         if (node->leftnode == NODE) {
 401                                 node = node->left;
 402                         } else if (node->leftnode == LEAF) {
 403                                 leaf = node->left;
 404                         } else {
 405                                 node = NULL;
 406                         }
 407                 }
 408         }
 409
 410         return leaf;
 411 }
 412
 413 /*
 414  * A simple non-recursive tree walker: keep track of visits to the
 415  * left and right branches in the leftmask and rightmask.
 416  */
 417 static void tree_walk(struct tree *tree)
 418 {
 419         struct node *node;
 420         unsigned int leftmask;
 421         unsigned int rightmask;
 422         unsigned int bitmask;
 423         int indent = 1;
 424         int nodes, singletons, leaves;
 425
 426         nodes = singletons = leaves = 0;
 427
 428         printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
 429         if (tree->childnode == LEAF) {
 430                 assert(tree->root);
 431                 tree->leaf_print(tree->root, indent);
 432                 leaves = 1;
 433         } else {
 434                 assert(tree->childnode == NODE);
 435                 node = tree->root;
 436                 leftmask = rightmask = 0;
 437                 while (node) {
 438                         printf("%*snode @ %p bitnum %d nextbyte %d"
 439                                " left %p right %p mask %x bits %x\n",
 440                                 indent, "", node,
 441                                 node->bitnum, node->nextbyte,
 442                                 node->left, node->right,
 443                                 node->keymask, node->keybits);
 444                         nodes += 1;
 445                         if (!(node->left && node->right))
 446                                 singletons += 1;
 447
 448                         while (node) {
 449                                 bitmask = 1 << node->bitnum;
 450                                 if ((leftmask & bitmask) == 0) {
 451                                         leftmask |= bitmask;
 452                                         if (node->leftnode == LEAF) {
 453                                                 assert(node->left);
 454                                                 tree->leaf_print(node->left,
 455                                                                  indent+1);
 456                                                 leaves += 1;
 457                                         } else if (node->left) {
 458                                                 assert(node->leftnode == NODE);
 459                                                 indent += 1;
 460                                                 node = node->left;
 461                                                 break;
 462                                         }
 463                                 }
 464                                 if ((rightmask & bitmask) == 0) {
 465                                         rightmask |= bitmask;
 466                                         if (node->rightnode == LEAF) {
 467                                                 assert(node->right);
 468                                                 tree->leaf_print(node->right,
 469                                                                  indent+1);
 470                                                 leaves += 1;
 471                                         } else if (node->right) {
 472                                                 assert(node->rightnode == NODE);
 473                                                 indent += 1;
 474                                                 node = node->right;
 475                                                 break;
 476                                         }
 477                                 }
 478                                 leftmask &= ~bitmask;
 479                                 rightmask &= ~bitmask;
 480                                 node = node->parent;
 481                                 indent -= 1;
 482                         }
 483                 }
 484         }
 485         printf("nodes %d leaves %d singletons %d\n",
 486                nodes, leaves, singletons);
 487 }
 488
 489 /*
 490  * Allocate an initialize a new internal node.
 491  */
 492 static struct node *alloc_node(struct node *parent)
 493 {
 494         struct node *node;
 495         int bitnum;
 496
 497         node = malloc(sizeof(*node));
 498         node->left = node->right = NULL;
 499         node->parent = parent;
 500         node->leftnode = NODE;
 501         node->rightnode = NODE;
 502         node->keybits = 0;
 503         node->keymask = 0;
 504         node->mark = 0;
 505         node->index = 0;
 506         node->offset = -1;
 507         node->size = 4;
 508
 509         if (node->parent) {
 510                 bitnum = parent->bitnum;
 511                 if ((bitnum & 7) == 0) {
 512                         node->bitnum = bitnum + 7 + 8;
 513                         node->nextbyte = 1;
 514                 } else {
 515                         node->bitnum = bitnum - 1;
 516                         node->nextbyte = 0;
 517                 }
 518         } else {
 519                 node->bitnum = 7;
 520                 node->nextbyte = 0;
 521         }
 522
 523         return node;
 524 }
 525
 526 /*
 527  * Insert a new leaf into the tree, and collapse any subtrees that are
 528  * fully populated and end in identical leaves. A nextbyte tagged
 529  * internal node will not be removed to preserve the tree's integrity.
 530  * Note that due to the structure of utf8, no nextbyte tagged node
 531  * will be a candidate for removal.
 532  */
 533 static int insert(struct tree *tree, char *key, int keylen, void *leaf)
 534 {
 535         struct node *node;
 536         struct node *parent;
 537         void **cursor;
 538         int keybits;
 539
 540         assert(keylen >= 1 && keylen <= 4);
 541
 542         node = NULL;
 543         cursor = &tree->root;
 544         keybits = 8 * keylen;
 545
 546         /* Insert, creating path along the way. */
 547         while (keybits) {
 548                 if (!*cursor)
 549                         *cursor = alloc_node(node);
 550                 node = *cursor;
 551                 if (node->nextbyte)
 552                         key++;
 553                 if (*key & (1 << (node->bitnum & 7)))
 554                         cursor = &node->right;
 555                 else
 556                         cursor = &node->left;
 557                 keybits--;
 558         }
 559         *cursor = leaf;
 560
 561         /* Merge subtrees if possible. */
 562         while (node) {
 563                 if (*key & (1 << (node->bitnum & 7)))
 564                         node->rightnode = LEAF;
 565                 else
 566                         node->leftnode = LEAF;
 567                 if (node->nextbyte)
 568                         break;
 569                 if (node->leftnode == NODE || node->rightnode == NODE)
 570                         break;
 571                 assert(node->left);
 572                 assert(node->right);
 573                 /* Compare */
 574                 if (! tree->leaf_equal(node->left, node->right))
 575                         break;
 576                 /* Keep left, drop right leaf. */
 577                 leaf = node->left;
 578                 /* Check in parent */
 579                 parent = node->parent;
 580                 if (!parent) {
 581                         /* root of tree! */
 582                         tree->root = leaf;
 583                         tree->childnode = LEAF;
 584                 } else if (parent->left == node) {
 585                         parent->left = leaf;
 586                         parent->leftnode = LEAF;
 587                         if (parent->right) {
 588                                 parent->keymask = 0;
 589                                 parent->keybits = 0;
 590                         } else {
 591                                 parent->keymask |= (1 << node->bitnum);
 592                         }
 593                 } else if (parent->right == node) {
 594                         parent->right = leaf;
 595                         parent->rightnode = LEAF;
 596                         if (parent->left) {
 597                                 parent->keymask = 0;
 598                                 parent->keybits = 0;
 599                         } else {
 600                                 parent->keymask |= (1 << node->bitnum);
 601                                 parent->keybits |= (1 << node->bitnum);
 602                         }
 603                 } else {
 604                         /* internal tree error */
 605                         assert(0);
 606                 }
 607                 free(node);
 608                 node = parent;
 609         }
 610
 611         /* Propagate keymasks up along singleton chains. */
 612         while (node) {
 613                 parent = node->parent;
 614                 if (!parent)
 615                         break;
 616                 /* Nix the mask for parents with two children. */
 617                 if (node->keymask == 0) {
 618                         parent->keymask = 0;
 619                         parent->keybits = 0;
 620                 } else if (parent->left && parent->right) {
 621                         parent->keymask = 0;
 622                         parent->keybits = 0;
 623                 } else {
 624                         assert((parent->keymask & node->keymask) == 0);
 625                         parent->keymask |= node->keymask;
 626                         parent->keymask |= (1 << parent->bitnum);
 627                         parent->keybits |= node->keybits;
 628                         if (parent->right)
 629                                 parent->keybits |= (1 << parent->bitnum);
 630                 }
 631                 node = parent;
 632         }
 633
 634         return 0;
 635 }
 636
  637 /*
  638  * Prune internal nodes.
  639  *
  640  * Fully populated subtrees that end at the same leaf have already
  641  * been collapsed.  There are still internal nodes that have for both
  642  * their left and right branches a sequence of singletons that make
  643  * identical choices and end in identical leaves.  The keymask and
  644  * keybits collected in the nodes describe the choices made in these
  645  * singleton chains.  When they are identical for the left and right
  646  * branch of a node, and the two leaves compare identical, the node in
  647  * question can be removed.
  648  *
  649  * Note that nodes with the nextbyte tag set will not be removed by
  650  * this to ensure tree integrity.  Note as well that the structure of
  651  * utf8 ensures that these nodes would not have been candidates for
  652  * removal in any case.
       *
       * The tree is walked without recursion.  leftmask and rightmask hold
       * one bit per node->bitnum and record which branches of the nodes on
       * the current path have been entered already.  Nothing is done for a
       * tree that is empty or consists of a single leaf.
  653  */
  654 static void prune(struct tree *tree)
  655 {
  656         struct node *node;
  657         struct node *left;
  658         struct node *right;
  659         struct node *parent;
  660         void *leftleaf;
  661         void *rightleaf;
  662         unsigned int leftmask;
  663         unsigned int rightmask;
  664         unsigned int bitmask;
  665         int count;      /* number of nodes freed, reported when verbose */
  666
  667         if (verbose > 0)
  668                 printf("Pruning %s_%x\n", tree->type, tree->maxage);
  669
  670         count = 0;
  671         if (tree->childnode == LEAF)
  672                 return;
  673         if (!tree->root)
  674                 return;
  675
  676         leftmask = rightmask = 0;
  677         node = tree->root;
  678         while (node) {
                      /*
                       * A node is a candidate only if it is not tagged
                       * nextbyte, has two children that are both internal
                       * nodes, and those children carry the same non-zero
                       * keymask and the same keybits.
                       */
  679                 if (node->nextbyte)
  680                         goto advance;
  681                 if (node->leftnode == LEAF)
  682                         goto advance;
  683                 if (node->rightnode == LEAF)
  684                         goto advance;
  685                 if (!node->left)
  686                         goto advance;
  687                 if (!node->right)
  688                         goto advance;
  689                 left = node->left;
  690                 right = node->right;
  691                 if (left->keymask == 0)
  692                         goto advance;
  693                 if (right->keymask == 0)
  694                         goto advance;
  695                 if (left->keymask != right->keymask)
  696                         goto advance;
  697                 if (left->keybits != right->keybits)
  698                         goto advance;
                      /* Follow the left-hand chain down to its first leaf. */
  699                 leftleaf = NULL;
  700                 while (!leftleaf) {
  701                         assert(left->left || left->right);
  702                         if (left->leftnode == LEAF)
  703                                 leftleaf = left->left;
  704                         else if (left->rightnode == LEAF)
  705                                 leftleaf = left->right;
  706                         else if (left->left)
  707                                 left = left->left;
  708                         else if (left->right)
  709                                 left = left->right;
  710                         else
  711                                 assert(0);
  712                 }
                      /* Same for the right-hand chain. */
  713                 rightleaf = NULL;
  714                 while (!rightleaf) {
  715                         assert(right->left || right->right);
  716                         if (right->leftnode == LEAF)
  717                                 rightleaf = right->left;
  718                         else if (right->rightnode == LEAF)
  719                                 rightleaf = right->right;
  720                         else if (right->left)
  721                                 right = right->left;
  722                         else if (right->right)
  723                                 right = right->right;
  724                         else
  725                                 assert(0);
  726                 }
  727                 if (! tree->leaf_equal(leftleaf, rightleaf))
  728                         goto advance;
  729                 /*
  730                  * This node has identical singleton-only subtrees.
  731                  * Remove it.
                       *
                       * The left child takes the node's place below the
                       * parent and records the skipped bit in its keymask.
                       * NOTE(review): parent is dereferenced unchecked, so
                       * this assumes the root is never a candidate - confirm.
  732                  */
  733                 parent = node->parent;
  734                 left = node->left;
  735                 right = node->right;
  736                 if (parent->left == node)
  737                         parent->left = left;
  738                 else if (parent->right == node)
  739                         parent->right = left;
  740                 else
  741                         assert(0);
  742                 left->parent = parent;
  743                 left->keymask |= (1 << node->bitnum);
                      /*
                       * With node->left cleared, the loop below frees the
                       * node and then walks down its right-hand chain,
                       * freeing as it goes and clearing the visit bits.
                       * NOTE(review): the last node of that chain, whose
                       * only child is a leaf, takes the final else branch
                       * and is neither freed nor counted - confirm intent.
                       */
  744                 node->left = NULL;
  745                 while (node) {
  746                         bitmask = 1 << node->bitnum;
  747                         leftmask &= ~bitmask;
  748                         rightmask &= ~bitmask;
  749                         if (node->leftnode == NODE && node->left) {
  750                                 left = node->left;
  751                                 free(node);
  752                                 count++;
  753                                 node = left;
  754                         } else if (node->rightnode == NODE && node->right) {
  755                                 right = node->right;
  756                                 free(node);
  757                                 count++;
  758                                 node = right;
  759                         } else {
  760                                 node = NULL;
  761                         }
  762                 }
  763                 /* Propagate keymasks up along singleton chains. */
  764                 node = parent;
  765                 /* Force re-check */
  766                 bitmask = 1 << node->bitnum;
  767                 leftmask &= ~bitmask;
  768                 rightmask &= ~bitmask;
                      /*
                       * Climb until a node with two children is found.
                       * NOTE(review): node->parent is dereferenced unchecked
                       * after each step, so this assumes such a node exists
                       * at or below the root - confirm.
                       */
  769                 for (;;) {
  770                         if (node->left && node->right)
  771                                 break;
  772                         if (node->left) {
  773                                 left = node->left;
  774                                 node->keymask |= left->keymask;
  775                                 node->keybits |= left->keybits;
  776                         }
  777                         if (node->right) {
  778                                 right = node->right;
  779                                 node->keymask |= right->keymask;
  780                                 node->keybits |= right->keybits;
  781                         }
  782                         node->keymask |= (1 << node->bitnum);
  783                         node = node->parent;
  784                         /* Force re-check */
  785                         bitmask = 1 << node->bitnum;
  786                         leftmask &= ~bitmask;
  787                         rightmask &= ~bitmask;
  788                 }
  789         advance:
                      /*
                       * Next step of the walk: enter the left child node if
                       * not done yet, else the right one, else forget this
                       * level and go back up to the parent.
                       */
  790                 bitmask = 1 << node->bitnum;
  791                 if ((leftmask & bitmask) == 0 &&
  792                     node->leftnode == NODE &&
  793                     node->left) {
  794                         leftmask |= bitmask;
  795                         node = node->left;
  796                 } else if ((rightmask & bitmask) == 0 &&
  797                            node->rightnode == NODE &&
  798                            node->right) {
  799                         rightmask |= bitmask;
  800                         node = node->right;
  801                 } else {
  802                         leftmask &= ~bitmask;
  803                         rightmask &= ~bitmask;
  804                         node = node->parent;
  805                 }
  806         }
  807         if (verbose > 0)
  808                 printf("Pruned %d nodes\n", count);
  809 }
 810
 811 /*
 812  * Mark the nodes in the tree that lead to leaves that must be
 813  * emitted.
 814  */
 815 static void mark_nodes(struct tree *tree)
 816 {
 817         struct node *node;
 818         struct node *n;
 819         unsigned int leftmask;
 820         unsigned int rightmask;
 821         unsigned int bitmask;
 822         int marked;
 823
 824         marked = 0;
 825         if (verbose > 0)
 826                 printf("Marking %s_%x\n", tree->type, tree->maxage);
 827         if (tree->childnode == LEAF)
 828                 goto done;
 829
 830         assert(tree->childnode == NODE);
 831         node = tree->root;
 832         leftmask = rightmask = 0;
 833         while (node) {
 834                 bitmask = 1 << node->bitnum;
 835                 if ((leftmask & bitmask) == 0) {
 836                         leftmask |= bitmask;
 837                         if (node->leftnode == LEAF) {
 838                                 assert(node->left);
 839                                 if (tree->leaf_mark(node->left)) {
 840                                         n = node;
 841                                         while (n && !n->mark) {
 842                                                 marked++;
 843                                                 n->mark = 1;
 844                                                 n = n->parent;
 845                                         }
 846                                 }
 847                         } else if (node->left) {
 848                                 assert(node->leftnode == NODE);
 849                                 node = node->left;
 850                                 continue;
 851                         }
 852                 }
 853                 if ((rightmask & bitmask) == 0) {
 854                         rightmask |= bitmask;
 855                         if (node->rightnode == LEAF) {
 856                                 assert(node->right);
 857                                 if (tree->leaf_mark(node->right)) {
 858                                         n = node;
 859                                         while (n && !n->mark) {
 860                                                 marked++;
 861                                                 n->mark = 1;
 862                                                 n = n->parent;
 863                                         }
 864                                 }
 865                         } else if (node->right) {
 866                                 assert(node->rightnode == NODE);
 867                                 node = node->right;
 868                                 continue;
 869                         }
 870                 }
 871                 leftmask &= ~bitmask;
 872                 rightmask &= ~bitmask;
 873                 node = node->parent;
 874         }
 875
 876         /* second pass: left siblings and singletons */
 877
 878         assert(tree->childnode == NODE);
 879         node = tree->root;
 880         leftmask = rightmask = 0;
 881         while (node) {
 882                 bitmask = 1 << node->bitnum;
 883                 if ((leftmask & bitmask) == 0) {
 884                         leftmask |= bitmask;
 885                         if (node->leftnode == LEAF) {
 886                                 assert(node->left);
 887                                 if (tree->leaf_mark(node->left)) {
 888                                         n = node;
 889                                         while (n && !n->mark) {
 890                                                 marked++;
 891                                                 n->mark = 1;
 892                                                 n = n->parent;
 893                                         }
 894                                 }
 895                         } else if (node->left) {
 896                                 assert(node->leftnode == NODE);
 897                                 node = node->left;
 898                                 if (!node->mark && node->parent->mark) {
 899                                         marked++;
 900                                         node->mark = 1;
 901                                 }
 902                                 continue;
 903                         }
 904                 }
 905                 if ((rightmask & bitmask) == 0) {
 906                         rightmask |= bitmask;
 907                         if (node->rightnode == LEAF) {
 908                                 assert(node->right);
 909                                 if (tree->leaf_mark(node->right)) {
 910                                         n = node;
 911                                         while (n && !n->mark) {
 912                                                 marked++;
 913                                                 n->mark = 1;
 914                                                 n = n->parent;
 915                                         }
 916                                 }
 917                         } else if (node->right) {
 918                                 assert(node->rightnode == NODE);
 919                                 node = node->right;
 920                                 if (!node->mark && node->parent->mark &&
 921                                     !node->parent->left) {
 922                                         marked++;
 923                                         node->mark = 1;
 924                                 }
 925                                 continue;
 926                         }
 927                 }
 928                 leftmask &= ~bitmask;
 929                 rightmask &= ~bitmask;
 930                 node = node->parent;
 931         }
 932 done:
 933         if (verbose > 0)
 934                 printf("Marked %d nodes\n", marked);
 935 }
 936
 937 /*
 938  * Compute the index of each node and leaf, which is the offset in the
 939  * emitted trie.  These values must be pre-computed because relative
 940  * offsets between nodes are used to navigate the tree.
 941  */
 942 static int index_nodes(struct tree *tree, int index)
 943 {
 944         struct node *node;
 945         unsigned int leftmask;
 946         unsigned int rightmask;
 947         unsigned int bitmask;
 948         int count;
 949         int indent;
 950
 951         /* Align to a cache line (or half a cache line?). */
 952         while (index % 64)
 953                 index++;
 954         tree->index = index;
 955         indent = 1;
 956         count = 0;
 957
 958         if (verbose > 0)
 959                 printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
 960         if (tree->childnode == LEAF) {
 961                 index += tree->leaf_size(tree->root);
 962                 goto done;
 963         }
 964
 965         assert(tree->childnode == NODE);
 966         node = tree->root;
 967         leftmask = rightmask = 0;
 968         while (node) {
 969                 if (!node->mark)
 970                         goto skip;
 971                 count++;
 972                 if (node->index != index)
 973                         node->index = index;
 974                 index += node->size;
 975 skip:
 976                 while (node) {
 977                         bitmask = 1 << node->bitnum;
 978                         if (node->mark && (leftmask & bitmask) == 0) {
 979                                 leftmask |= bitmask;
 980                                 if (node->leftnode == LEAF) {
 981                                         assert(node->left);
 982                                         *tree->leaf_index(tree, node->left) =
 983                                                                         index;
 984                                         index += tree->leaf_size(node->left);
 985                                         count++;
 986                                 } else if (node->left) {
 987                                         assert(node->leftnode == NODE);
 988                                         indent += 1;
 989                                         node = node->left;
 990                                         break;
 991                                 }
 992                         }
 993                         if (node->mark && (rightmask & bitmask) == 0) {
 994                                 rightmask |= bitmask;
 995                                 if (node->rightnode == LEAF) {
 996                                         assert(node->right);
 997                                         *tree->leaf_index(tree, node->right) = index;
 998                                         index += tree->leaf_size(node->right);
 999                                         count++;
1000                                 } else if (node->right) {
1001                                         assert(node->rightnode == NODE);
1002                                         indent += 1;
1003                                         node = node->right;
1004                                         break;
1005                                 }
1006                         }
1007                         leftmask &= ~bitmask;
1008                         rightmask &= ~bitmask;
1009                         node = node->parent;
1010                         indent -= 1;
1011                 }
1012         }
1013 done:
1014         /* Round up to a multiple of 16 */
1015         while (index % 16)
1016                 index++;
1017         if (verbose > 0)
1018                 printf("Final index %d\n", index);
1019         return index;
1020 }
1021
1022 /*
1023  * Mark the nodes in a subtree, helper for size_nodes().
1024  */
1025 static int mark_subtree(struct node *node)
1026 {
1027         int changed;
1028
1029         if (!node || node->mark)
1030                 return 0;
1031         node->mark = 1;
1032         node->index = node->parent->index;
1033         changed = 1;
1034         if (node->leftnode == NODE)
1035                 changed += mark_subtree(node->left);
1036         if (node->rightnode == NODE)
1037                 changed += mark_subtree(node->right);
1038         return changed;
1039 }
1040
1041 /*
1042  * Compute the size of nodes and leaves. We start by assuming that
1043  * each node needs to store a three-byte offset. The indexes of the
1044  * nodes are calculated based on that, and then this function is
1045  * called to see if the sizes of some nodes can be reduced.  This is
1046  * repeated until no more changes are seen.
1047  */
1048 static int size_nodes(struct tree *tree)
1049 {
1050         struct tree *next;
1051         struct node *node;
1052         struct node *right;
1053         struct node *n;
1054         unsigned int leftmask;
1055         unsigned int rightmask;
1056         unsigned int bitmask;
1057         unsigned int pathbits;
1058         unsigned int pathmask;
1059         unsigned int nbit;
1060         int changed;
1061         int offset;
1062         int size;
1063         int indent;
1064
1065         indent = 1;
1066         changed = 0;
1067         size = 0;
1068
1069         if (verbose > 0)
1070                 printf("Sizing %s_%x\n", tree->type, tree->maxage);
1071         if (tree->childnode == LEAF)
1072                 goto done;
1073
1074         assert(tree->childnode == NODE);
1075         pathbits = 0;
1076         pathmask = 0;
1077         node = tree->root;
1078         leftmask = rightmask = 0;
1079         while (node) {
1080                 if (!node->mark)
1081                         goto skip;
1082                 offset = 0;
1083                 if (!node->left || !node->right) {
1084                         size = 1;
1085                 } else {
1086                         if (node->rightnode == NODE) {
1087                                 /*
1088                                  * If the right node is not marked,
1089                                  * look for a corresponding node in
1090                                  * the next tree.  Such a node need
1091                                  * not exist.
1092                                  */
1093                                 right = node->right;
1094                                 next = tree->next;
1095                                 while (!right->mark) {
1096                                         assert(next);
1097                                         n = next->root;
1098                                         while (n->bitnum != node->bitnum) {
1099                                                 nbit = 1 << n->bitnum;
1100                                                 if (!(pathmask & nbit))
1101                                                         break;
1102                                                 if (pathbits & nbit) {
1103                                                         if (n->rightnode == LEAF)
1104                                                                 break;
1105                                                         n = n->right;
1106                                                 } else {
1107                                                         if (n->leftnode == LEAF)
1108                                                                 break;
1109                                                         n = n->left;
1110                                                 }
1111                                         }
1112                                         if (n->bitnum != node->bitnum)
1113                                                 break;
1114                                         n = n->right;
1115                                         right = n;
1116                                         next = next->next;
1117                                 }
1118                                 /* Make sure the right node is marked. */
1119                                 if (!right->mark)
1120                                         changed += mark_subtree(right);
1121                                 offset = right->index - node->index;
1122                         } else {
1123                                 offset = *tree->leaf_index(tree, node->right);
1124                                 offset -= node->index;
1125                         }
1126                         assert(offset >= 0);
1127                         assert(offset <= 0xffffff);
1128                         if (offset <= 0xff) {
1129                                 size = 2;
1130                         } else if (offset <= 0xffff) {
1131                                 size = 3;
1132                         } else { /* offset <= 0xffffff */
1133                                 size = 4;
1134                         }
1135                 }
1136                 if (node->size != size || node->offset != offset) {
1137                         node->size = size;
1138                         node->offset = offset;
1139                         changed++;
1140                 }
1141 skip:
1142                 while (node) {
1143                         bitmask = 1 << node->bitnum;
1144                         pathmask |= bitmask;
1145                         if (node->mark && (leftmask & bitmask) == 0) {
1146                                 leftmask |= bitmask;
1147                                 if (node->leftnode == LEAF) {
1148                                         assert(node->left);
1149                                 } else if (node->left) {
1150                                         assert(node->leftnode == NODE);
1151                                         indent += 1;
1152                                         node = node->left;
1153                                         break;
1154                                 }
1155                         }
1156                         if (node->mark && (rightmask & bitmask) == 0) {
1157                                 rightmask |= bitmask;
1158                                 pathbits |= bitmask;
1159                                 if (node->rightnode == LEAF) {
1160                                         assert(node->right);
1161                                 } else if (node->right) {
1162                                         assert(node->rightnode == NODE);
1163                                         indent += 1;
1164                                         node = node->right;
1165                                         break;
1166                                 }
1167                         }
1168                         leftmask &= ~bitmask;
1169                         rightmask &= ~bitmask;
1170                         pathmask &= ~bitmask;
1171                         pathbits &= ~bitmask;
1172                         node = node->parent;
1173                         indent -= 1;
1174                 }
1175         }
1176 done:
1177         if (verbose > 0)
1178                 printf("Found %d changes\n", changed);
1179         return changed;
1180 }
1181
1182 /*
1183  * Emit a trie for the given tree into the data array.
1184  */
1185 static void emit(struct tree *tree, unsigned char *data)
1186 {
1187         struct node *node;
1188         unsigned int leftmask;
1189         unsigned int rightmask;
1190         unsigned int bitmask;
1191         int offlen;
1192         int offset;
1193         int index;
1194         int indent;
1195         int size;
1196         int bytes;
1197         int leaves;
1198         int nodes[4];
1199         unsigned char byte;
1200
1201         nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
1202         leaves = 0;
1203         bytes = 0;
1204         index = tree->index;
1205         data += index;
1206         indent = 1;
1207         if (verbose > 0)
1208                 printf("Emitting %s_%x\n", tree->type, tree->maxage);
1209         if (tree->childnode == LEAF) {
1210                 assert(tree->root);
1211                 tree->leaf_emit(tree->root, data);
1212                 size = tree->leaf_size(tree->root);
1213                 index += size;
1214                 leaves++;
1215                 goto done;
1216         }
1217
1218         assert(tree->childnode == NODE);
1219         node = tree->root;
1220         leftmask = rightmask = 0;
1221         while (node) {
1222                 if (!node->mark)
1223                         goto skip;
1224                 assert(node->offset != -1);
1225                 assert(node->index == index);
1226
1227                 byte = 0;
1228                 if (node->nextbyte)
1229                         byte |= NEXTBYTE;
1230                 byte |= (node->bitnum & BITNUM);
1231                 if (node->left && node->right) {
1232                         if (node->leftnode == NODE)
1233                                 byte |= LEFTNODE;
1234                         if (node->rightnode == NODE)
1235                                 byte |= RIGHTNODE;
1236                         if (node->offset <= 0xff)
1237                                 offlen = 1;
1238                         else if (node->offset <= 0xffff)
1239                                 offlen = 2;
1240                         else
1241                                 offlen = 3;
1242                         nodes[offlen]++;
1243                         offset = node->offset;
1244                         byte |= offlen << OFFLEN_SHIFT;
1245                         *data++ = byte;
1246                         index++;
1247                         while (offlen--) {
1248                                 *data++ = offset & 0xff;
1249                                 index++;
1250                                 offset >>= 8;
1251                         }
1252                 } else if (node->left) {
1253                         if (node->leftnode == NODE)
1254                                 byte |= TRIENODE;
1255                         nodes[0]++;
1256                         *data++ = byte;
1257                         index++;
1258                 } else if (node->right) {
1259                         byte |= RIGHTNODE;
1260                         if (node->rightnode == NODE)
1261                                 byte |= TRIENODE;
1262                         nodes[0]++;
1263                         *data++ = byte;
1264                         index++;
1265                 } else {
1266                         assert(0);
1267                 }
1268 skip:
1269                 while (node) {
1270                         bitmask = 1 << node->bitnum;
1271                         if (node->mark && (leftmask & bitmask) == 0) {
1272                                 leftmask |= bitmask;
1273                                 if (node->leftnode == LEAF) {
1274                                         assert(node->left);
1275                                         data = tree->leaf_emit(node->left,
1276                                                                data);
1277                                         size = tree->leaf_size(node->left);
1278                                         index += size;
1279                                         bytes += size;
1280                                         leaves++;
1281                                 } else if (node->left) {
1282                                         assert(node->leftnode == NODE);
1283                                         indent += 1;
1284                                         node = node->left;
1285                                         break;
1286                                 }
1287                         }
1288                         if (node->mark && (rightmask & bitmask) == 0) {
1289                                 rightmask |= bitmask;
1290                                 if (node->rightnode == LEAF) {
1291                                         assert(node->right);
1292                                         data = tree->leaf_emit(node->right,
1293                                                                data);
1294                                         size = tree->leaf_size(node->right);
1295                                         index += size;
1296                                         bytes += size;
1297                                         leaves++;
1298                                 } else if (node->right) {
1299                                         assert(node->rightnode == NODE);
1300                                         indent += 1;
1301                                         node = node->right;
1302                                         break;
1303                                 }
1304                         }
1305                         leftmask &= ~bitmask;
1306                         rightmask &= ~bitmask;
1307                         node = node->parent;
1308                         indent -= 1;
1309                 }
1310         }
1311 done:
1312         if (verbose > 0) {
1313                 printf("Emitted %d (%d) leaves",
1314                         leaves, bytes);
1315                 printf(" %d (%d+%d+%d+%d) nodes",
1316                         nodes[0] + nodes[1] + nodes[2] + nodes[3],
1317                         nodes[0], nodes[1], nodes[2], nodes[3]);
1318                 printf(" %d total\n", index - tree->index);
1319         }
1320 }
1321
1322 /* ------------------------------------------------------------------ */
1323
1324 /*
1325  * Unicode data.
1326  *
1327  * We need to keep track of the Canonical Combining Class, the Age,
1328  * and decompositions for a code point.
1329  *
1330  * For the Age, we store the index into the ages table.  Effectively
1331  * this is a generation number that the table maps to a unicode
1332  * version.
1333  *
1334  * The correction field is used to indicate that this entry is in the
1335  * corrections array, which contains decompositions that were
1336  * corrected in later revisions.  The value of the correction field is
1337  * the Unicode version in which the mapping was corrected.
1338  */
struct unicode_data {
        /* Code point value; compared by corrections_lookup() and used to index leafindex[]. */
        unsigned int code;
        /* Canonical Combining Class; emitted for leaves that have no decomposition. */
        int ccc;
        /* Generation (index into the ages table); emitted as the first byte of a leaf. */
        int gen;
        /* Unicode version in which the mapping was corrected; returned by correction_mark(). */
        int correction;
        /* Zero-terminated code point arrays, or NULL; input to utf8_create(). */
        unsigned int *utf32nfdi;
        unsigned int *utf32nfdicf;
        /*
         * NUL-terminated UTF-8 strings, or NULL.  utf8_create() fills
         * utf8nfdi from utf32nfdi, and fills utf8nfdicf from utf32nfdicf
         * only when the result differs from utf8nfdi (or utf8nfdi is NULL).
         */
        char *utf8nfdi;
        char *utf8nfdicf;
};
1349
/* One slot per value in 0 .. 0x10ffff; utf8_init() walks it by code point. */
struct unicode_data unicode_data[0x110000];
/* Array of corrections_count entries, searched by corrections_lookup(). */
struct unicode_data *corrections;
int    corrections_count;

/* NOTE(review): presumably point into trees[]; set up outside this section. */
struct tree *nfdi_tree;
struct tree *nfdicf_tree;

/* NOTE(review): presumably the array of all trees and its length; confirm in trees_init(). */
struct tree *trees;
int          trees_count;
1359
1360 /*
1361  * Check the corrections array to see if this entry was corrected at
1362  * some point.
1363  */
1364 static struct unicode_data *corrections_lookup(struct unicode_data *u)
1365 {
1366         int i;
1367
1368         for (i = 0; i != corrections_count; i++)
1369                 if (u->code == corrections[i].code)
1370                         return &corrections[i];
1371         return u;
1372 }
1373
1374 static int nfdi_equal(void *l, void *r)
1375 {
1376         struct unicode_data *left = l;
1377         struct unicode_data *right = r;
1378
1379         if (left->gen != right->gen)
1380                 return 0;
1381         if (left->ccc != right->ccc)
1382                 return 0;
1383         if (left->utf8nfdi && right->utf8nfdi &&
1384             strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1385                 return 1;
1386         if (left->utf8nfdi || right->utf8nfdi)
1387                 return 0;
1388         return 1;
1389 }
1390
1391 static int nfdicf_equal(void *l, void *r)
1392 {
1393         struct unicode_data *left = l;
1394         struct unicode_data *right = r;
1395
1396         if (left->gen != right->gen)
1397                 return 0;
1398         if (left->ccc != right->ccc)
1399                 return 0;
1400         if (left->utf8nfdicf && right->utf8nfdicf &&
1401             strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
1402                 return 1;
1403         if (left->utf8nfdicf && right->utf8nfdicf)
1404                 return 0;
1405         if (left->utf8nfdicf || right->utf8nfdicf)
1406                 return 0;
1407         if (left->utf8nfdi && right->utf8nfdi &&
1408             strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1409                 return 1;
1410         if (left->utf8nfdi || right->utf8nfdi)
1411                 return 0;
1412         return 1;
1413 }
1414
1415 static void nfdi_print(void *l, int indent)
1416 {
1417         struct unicode_data *leaf = l;
1418
1419         printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1420                 leaf->code, leaf->ccc, leaf->gen);
1421
1422         if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1423                 printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1424         else if (leaf->utf8nfdi)
1425                 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1426
1427         printf("\n");
1428 }
1429
1430 static void nfdicf_print(void *l, int indent)
1431 {
1432         struct unicode_data *leaf = l;
1433
1434         printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1435                 leaf->code, leaf->ccc, leaf->gen);
1436
1437         if (leaf->utf8nfdicf)
1438                 printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
1439         else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1440                 printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1441         else if (leaf->utf8nfdi)
1442                 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1443         printf("\n");
1444 }
1445
/* Leaf-mark callback that selects every leaf: always returns 1. */
static int nfdi_mark(void *l)
{
        (void)l;        /* the leaf itself is not inspected */
        return 1;
}
1450
1451 static int nfdicf_mark(void *l)
1452 {
1453         struct unicode_data *leaf = l;
1454
1455         if (leaf->utf8nfdicf)
1456                 return 1;
1457         return 0;
1458 }
1459
1460 static int correction_mark(void *l)
1461 {
1462         struct unicode_data *leaf = l;
1463
1464         return leaf->correction;
1465 }
1466
1467 static int nfdi_size(void *l)
1468 {
1469         struct unicode_data *leaf = l;
1470         int size = 2;
1471
1472         if (HANGUL_SYLLABLE(leaf->code))
1473                 size += 1;
1474         else if (leaf->utf8nfdi)
1475                 size += strlen(leaf->utf8nfdi) + 1;
1476         return size;
1477 }
1478
1479 static int nfdicf_size(void *l)
1480 {
1481         struct unicode_data *leaf = l;
1482         int size = 2;
1483
1484         if (HANGUL_SYLLABLE(leaf->code))
1485                 size += 1;
1486         else if (leaf->utf8nfdicf)
1487                 size += strlen(leaf->utf8nfdicf) + 1;
1488         else if (leaf->utf8nfdi)
1489                 size += strlen(leaf->utf8nfdi) + 1;
1490         return size;
1491 }
1492
1493 static int *nfdi_index(struct tree *tree, void *l)
1494 {
1495         struct unicode_data *leaf = l;
1496
1497         return &tree->leafindex[leaf->code];
1498 }
1499
1500 static int *nfdicf_index(struct tree *tree, void *l)
1501 {
1502         struct unicode_data *leaf = l;
1503
1504         return &tree->leafindex[leaf->code];
1505 }
1506
1507 static unsigned char *nfdi_emit(void *l, unsigned char *data)
1508 {
1509         struct unicode_data *leaf = l;
1510         unsigned char *s;
1511
1512         *data++ = leaf->gen;
1513
1514         if (HANGUL_SYLLABLE(leaf->code)) {
1515                 *data++ = DECOMPOSE;
1516                 *data++ = HANGUL;
1517         } else if (leaf->utf8nfdi) {
1518                 *data++ = DECOMPOSE;
1519                 s = (unsigned char*)leaf->utf8nfdi;
1520                 while ((*data++ = *s++) != 0)
1521                         ;
1522         } else {
1523                 *data++ = leaf->ccc;
1524         }
1525         return data;
1526 }
1527
1528 static unsigned char *nfdicf_emit(void *l, unsigned char *data)
1529 {
1530         struct unicode_data *leaf = l;
1531         unsigned char *s;
1532
1533         *data++ = leaf->gen;
1534
1535         if (HANGUL_SYLLABLE(leaf->code)) {
1536                 *data++ = DECOMPOSE;
1537                 *data++ = HANGUL;
1538         } else if (leaf->utf8nfdicf) {
1539                 *data++ = DECOMPOSE;
1540                 s = (unsigned char*)leaf->utf8nfdicf;
1541                 while ((*data++ = *s++) != 0)
1542                         ;
1543         } else if (leaf->utf8nfdi) {
1544                 *data++ = DECOMPOSE;
1545                 s = (unsigned char*)leaf->utf8nfdi;
1546                 while ((*data++ = *s++) != 0)
1547                         ;
1548         } else {
1549                 *data++ = leaf->ccc;
1550         }
1551         return data;
1552 }
1553
1554 static void utf8_create(struct unicode_data *data)
1555 {
1556         char utf[18*4+1];
1557         char *u;
1558         unsigned int *um;
1559         int i;
1560
1561         if (data->utf8nfdi) {
1562                 assert(data->utf8nfdi[0] == HANGUL);
1563                 return;
1564         }
1565
1566         u = utf;
1567         um = data->utf32nfdi;
1568         if (um) {
1569                 for (i = 0; um[i]; i++)
1570                         u += utf8encode(u, um[i]);
1571                 *u = '\0';
1572                 data->utf8nfdi = strdup(utf);
1573         }
1574         u = utf;
1575         um = data->utf32nfdicf;
1576         if (um) {
1577                 for (i = 0; um[i]; i++)
1578                         u += utf8encode(u, um[i]);
1579                 *u = '\0';
1580                 if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
1581                         data->utf8nfdicf = strdup(utf);
1582         }
1583 }
1584
1585 static void utf8_init(void)
1586 {
1587         unsigned int unichar;
1588         int i;
1589
1590         for (unichar = 0; unichar != 0x110000; unichar++)
1591                 utf8_create(&unicode_data[unichar]);
1592
1593         for (i = 0; i != corrections_count; i++)
1594                 utf8_create(&corrections[i]);
1595 }
1596
1597 static void trees_init(void)
1598 {
1599         struct unicode_data *data;
1600         unsigned int maxage;
1601         unsigned int nextage;
1602         int count;
1603         int i;
1604         int j;
1605
1606         /* Count the number of different ages. */
1607         count = 0;
1608         nextage = (unsigned int)-1;
1609         do {
1610                 maxage = nextage;
1611                 nextage = 0;
1612                 for (i = 0; i <= corrections_count; i++) {
1613                         data = &corrections[i];
1614                         if (nextage < data->correction &&
1615                             data->correction < maxage)
1616                                 nextage = data->correction;
1617                 }
1618                 count++;
1619         } while (nextage);
1620
1621         /* Two trees per age: nfdi and nfdicf */
1622         trees_count = count * 2;
1623         trees = calloc(trees_count, sizeof(struct tree));
1624
1625         /* Assign ages to the trees. */
1626         count = trees_count;
1627         nextage = (unsigned int)-1;
1628         do {
1629                 maxage = nextage;
1630                 trees[--count].maxage = maxage;
1631                 trees[--count].maxage = maxage;
1632                 nextage = 0;
1633                 for (i = 0; i <= corrections_count; i++) {
1634                         data = &corrections[i];
1635                         if (nextage < data->correction &&
1636                             data->correction < maxage)
1637                                 nextage = data->correction;
1638                 }
1639         } while (nextage);
1640
1641         /* The ages assigned above are off by one. */
1642         for (i = 0; i != trees_count; i++) {
1643                 j = 0;
1644                 while (ages[j] < trees[i].maxage)
1645                         j++;
1646                 trees[i].maxage = ages[j-1];
1647         }
1648
1649         /* Set up the forwarding between trees. */
1650         trees[trees_count-2].next = &trees[trees_count-1];
1651         trees[trees_count-1].leaf_mark = nfdi_mark;
1652         trees[trees_count-2].leaf_mark = nfdicf_mark;
1653         for (i = 0; i != trees_count-2; i += 2) {
1654                 trees[i].next = &trees[trees_count-2];
1655                 trees[i].leaf_mark = correction_mark;
1656                 trees[i+1].next = &trees[trees_count-1];
1657                 trees[i+1].leaf_mark = correction_mark;
1658         }
1659
1660         /* Assign the callouts. */
1661         for (i = 0; i != trees_count; i += 2) {
1662                 trees[i].type = "nfdicf";
1663                 trees[i].leaf_equal = nfdicf_equal;
1664                 trees[i].leaf_print = nfdicf_print;
1665                 trees[i].leaf_size = nfdicf_size;
1666                 trees[i].leaf_index = nfdicf_index;
1667                 trees[i].leaf_emit = nfdicf_emit;
1668
1669                 trees[i+1].type = "nfdi";
1670                 trees[i+1].leaf_equal = nfdi_equal;
1671                 trees[i+1].leaf_print = nfdi_print;
1672                 trees[i+1].leaf_size = nfdi_size;
1673                 trees[i+1].leaf_index = nfdi_index;
1674                 trees[i+1].leaf_emit = nfdi_emit;
1675         }
1676
1677         /* Finish init. */
1678         for (i = 0; i != trees_count; i++)
1679                 trees[i].childnode = NODE;
1680 }
1681
1682 static void trees_populate(void)
1683 {
1684         struct unicode_data *data;
1685         unsigned int unichar;
1686         char keyval[4];
1687         int keylen;
1688         int i;
1689
1690         for (i = 0; i != trees_count; i++) {
1691                 if (verbose > 0) {
1692                         printf("Populating %s_%x\n",
1693                                 trees[i].type, trees[i].maxage);
1694                 }
1695                 for (unichar = 0; unichar != 0x110000; unichar++) {
1696                         if (unicode_data[unichar].gen < 0)
1697                                 continue;
1698                         keylen = utf8encode(keyval, unichar);
1699                         data = corrections_lookup(&unicode_data[unichar]);
1700                         if (data->correction <= trees[i].maxage)
1701                                 data = &unicode_data[unichar];
1702                         insert(&trees[i], keyval, keylen, data);
1703                 }
1704         }
1705 }
1706
1707 static void trees_reduce(void)
1708 {
1709         int i;
1710         int size;
1711         int changed;
1712
1713         for (i = 0; i != trees_count; i++)
1714                 prune(&trees[i]);
1715         for (i = 0; i != trees_count; i++)
1716                 mark_nodes(&trees[i]);
1717         do {
1718                 size = 0;
1719                 for (i = 0; i != trees_count; i++)
1720                         size = index_nodes(&trees[i], size);
1721                 changed = 0;
1722                 for (i = 0; i != trees_count; i++)
1723                         changed += size_nodes(&trees[i]);
1724         } while (changed);
1725
1726         utf8data = calloc(size, 1);
1727         utf8data_size = size;
1728         for (i = 0; i != trees_count; i++)
1729                 emit(&trees[i], utf8data);
1730
1731         if (verbose > 0) {
1732                 for (i = 0; i != trees_count; i++) {
1733                         printf("%s_%x idx %d\n",
1734                                 trees[i].type, trees[i].maxage, trees[i].index);
1735                 }
1736         }
1737
1738         nfdi = utf8data + trees[trees_count-1].index;
1739         nfdicf = utf8data + trees[trees_count-2].index;
1740
1741         nfdi_tree = &trees[trees_count-1];
1742         nfdicf_tree = &trees[trees_count-2];
1743 }
1744
1745 static void verify(struct tree *tree)
1746 {
1747         struct unicode_data *data;
1748         utf8leaf_t      *leaf;
1749         unsigned int    unichar;
1750         char            key[4];
1751         unsigned char   hangul[UTF8HANGULLEAF];
1752         int             report;
1753         int             nocf;
1754
1755         if (verbose > 0)
1756                 printf("Verifying %s_%x\n", tree->type, tree->maxage);
1757         nocf = strcmp(tree->type, "nfdicf");
1758
1759         for (unichar = 0; unichar != 0x110000; unichar++) {
1760                 report = 0;
1761                 data = corrections_lookup(&unicode_data[unichar]);
1762                 if (data->correction <= tree->maxage)
1763                         data = &unicode_data[unichar];
1764                 utf8encode(key,unichar);
1765                 leaf = utf8lookup(tree, hangul, key);
1766
1767                 if (!leaf) {
1768                         if (data->gen != -1)
1769                                 report++;
1770                         if (unichar < 0xd800 || unichar > 0xdfff)
1771                                 report++;
1772                 } else {
1773                         if (unichar >= 0xd800 && unichar <= 0xdfff)
1774                                 report++;
1775                         if (data->gen == -1)
1776                                 report++;
1777                         if (data->gen != LEAF_GEN(leaf))
1778                                 report++;
1779                         if (LEAF_CCC(leaf) == DECOMPOSE) {
1780                                 if (HANGUL_SYLLABLE(data->code)) {
1781                                         if (data->utf8nfdi[0] != HANGUL)
1782                                                 report++;
1783                                 } else if (nocf) {
1784                                         if (!data->utf8nfdi) {
1785                                                 report++;
1786                                         } else if (strcmp(data->utf8nfdi,
1787                                                           LEAF_STR(leaf))) {
1788                                                 report++;
1789                                         }
1790                                 } else {
1791                                         if (!data->utf8nfdicf &&
1792                                             !data->utf8nfdi) {
1793                                                 report++;
1794                                         } else if (data->utf8nfdicf) {
1795                                                 if (strcmp(data->utf8nfdicf,
1796                                                            LEAF_STR(leaf)))
1797                                                         report++;
1798                                         } else if (strcmp(data->utf8nfdi,
1799                                                           LEAF_STR(leaf))) {
1800                                                 report++;
1801                                         }
1802                                 }
1803                         } else if (data->ccc != LEAF_CCC(leaf)) {
1804                                 report++;
1805                         }
1806                 }
1807                 if (report) {
1808                         printf("%X code %X gen %d ccc %d"
1809                                 " nfdi -> \"%s\"",
1810                                 unichar, data->code, data->gen,
1811                                 data->ccc,
1812                                 data->utf8nfdi);
1813                         if (leaf) {
1814                                 printf(" gen %d ccc %d"
1815                                         " nfdi -> \"%s\"",
1816                                         LEAF_GEN(leaf),
1817                                         LEAF_CCC(leaf),
1818                                         LEAF_CCC(leaf) == DECOMPOSE ?
1819                                                 LEAF_STR(leaf) : "");
1820                         }
1821                         printf("\n");
1822                 }
1823         }
1824 }
1825
1826 static void trees_verify(void)
1827 {
1828         int i;
1829
1830         for (i = 0; i != trees_count; i++)
1831                 verify(&trees[i]);
1832 }
1833
1834 /* ------------------------------------------------------------------ */
1835
1836 static void help(void)
1837 {
1838         printf("Usage: %s [options]\n", argv0);
1839         printf("\n");
1840         printf("This program creates an a data trie used for parsing and\n");
1841         printf("normalization of UTF-8 strings. The trie is derived from\n");
1842         printf("a set of input files from the Unicode character database\n");
1843         printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1844         printf("\n");
1845         printf("The generated tree supports two normalization forms:\n");
1846         printf("\n");
1847         printf("\tnfdi:\n");
1848         printf("\t- Apply unicode normalization form NFD.\n");
1849         printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1850         printf("\n");
1851         printf("\tnfdicf:\n");
1852         printf("\t- Apply unicode normalization form NFD.\n");
1853         printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1854         printf("\t- Apply a full casefold (C + F).\n");
1855         printf("\n");
1856         printf("These forms were chosen as being most useful when dealing\n");
1857         printf("with file names: NFD catches most cases where characters\n");
1858         printf("should be considered equivalent. The ignorables are mostly\n");
1859         printf("invisible, making names hard to type.\n");
1860         printf("\n");
1861         printf("The options to specify the files to be used are listed\n");
1862         printf("below with their default values, which are the names used\n");
1863         printf("by version 11.0.0 of the Unicode Character Database.\n");
1864         printf("\n");
1865         printf("The input files:\n");
1866         printf("\t-a %s\n", AGE_NAME);
1867         printf("\t-c %s\n", CCC_NAME);
1868         printf("\t-p %s\n", PROP_NAME);
1869         printf("\t-d %s\n", DATA_NAME);
1870         printf("\t-f %s\n", FOLD_NAME);
1871         printf("\t-n %s\n", NORM_NAME);
1872         printf("\n");
1873         printf("Additionally, the generated tables are tested using:\n");
1874         printf("\t-t %s\n", TEST_NAME);
1875         printf("\n");
1876         printf("Finally, the output file:\n");
1877         printf("\t-o %s\n", UTF8_NAME);
1878         printf("\n");
1879 }
1880
/* Print the help text, then terminate with exit status 1. */
static void usage(void)
{
        help();

        exit(1);
}
1886
/*
 * Report that opening name failed with the given errno value, including
 * its strerror() text, and terminate with exit status 1.
 */
static void open_fail(const char *name, int error)
{
        const char *reason = strerror(error);

        printf("Error %d opening %s: %s\n", error, name, reason);
        exit(1);
}
1892
/*
 * Report a parse failure that concerns filename as a whole and
 * terminate with exit status 1.
 */
static void file_fail(const char *filename)
{
        printf("Error parsing %s\n",
                filename);

        exit(1);
}
1898
/*
 * Report a parse failure on one specific line of filename, echoing the
 * offending line, and terminate with exit status 1.
 */
static void line_fail(const char *filename, const char *line)
{
        printf("Error parsing %s:%s\n",
                filename, line);

        exit(1);
}
1904
1905 /* ------------------------------------------------------------------ */
1906
/*
 * Print each element of a zero-terminated UTF-32 sequence as upper-case
 * hex, each preceded by a single space.  No newline is written.
 */
static void print_utf32(unsigned int *utf32str)
{
        const unsigned int *cp = utf32str;

        while (*cp != 0) {
                printf(" %X", *cp);
                cp++;
        }
}
1914
1915 static void print_utf32nfdi(unsigned int unichar)
1916 {
1917         printf(" %X ->", unichar);
1918         print_utf32(unicode_data[unichar].utf32nfdi);
1919         printf("\n");
1920 }
1921
1922 static void print_utf32nfdicf(unsigned int unichar)
1923 {
1924         printf(" %X ->", unichar);
1925         print_utf32(unicode_data[unichar].utf32nfdicf);
1926         printf("\n");
1927 }
1928
1929 /* ------------------------------------------------------------------ */
1930
1931 static void age_init(void)
1932 {
1933         FILE *file;
1934         unsigned int first;
1935         unsigned int last;
1936         unsigned int unichar;
1937         unsigned int major;
1938         unsigned int minor;
1939         unsigned int revision;
1940         int gen;
1941         int count;
1942         int ret;
1943
1944         if (verbose > 0)
1945                 printf("Parsing %s\n", age_name);
1946
1947         file = fopen(age_name, "r");
1948         if (!file)
1949                 open_fail(age_name, errno);
1950         count = 0;
1951
1952         gen = 0;
1953         while (fgets(line, LINESIZE, file)) {
1954                 ret = sscanf(line, "# Age=V%d_%d_%d",
1955                                 &major, &minor, &revision);
1956                 if (ret == 3) {
1957                         ages_count++;
1958                         if (verbose > 1)
1959                                 printf(" Age V%d_%d_%d\n",
1960                                         major, minor, revision);
1961                         if (!age_valid(major, minor, revision))
1962                                 line_fail(age_name, line);
1963                         continue;
1964                 }
1965                 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1966                 if (ret == 2) {
1967                         ages_count++;
1968                         if (verbose > 1)
1969                                 printf(" Age V%d_%d\n", major, minor);
1970                         if (!age_valid(major, minor, 0))
1971                                 line_fail(age_name, line);
1972                         continue;
1973                 }
1974         }
1975
1976         /* We must have found something above. */
1977         if (verbose > 1)
1978                 printf("%d age entries\n", ages_count);
1979         if (ages_count == 0 || ages_count > MAXGEN)
1980                 file_fail(age_name);
1981
1982         /* There is a 0 entry. */
1983         ages_count++;
1984         ages = calloc(ages_count + 1, sizeof(*ages));
1985         /* And a guard entry. */
1986         ages[ages_count] = (unsigned int)-1;
1987
1988         rewind(file);
1989         count = 0;
1990         gen = 0;
1991         while (fgets(line, LINESIZE, file)) {
1992                 ret = sscanf(line, "# Age=V%d_%d_%d",
1993                                 &major, &minor, &revision);
1994                 if (ret == 3) {
1995                         ages[++gen] =
1996                                 UNICODE_AGE(major, minor, revision);
1997                         if (verbose > 1)
1998                                 printf(" Age V%d_%d_%d = gen %d\n",
1999                                         major, minor, revision, gen);
2000                         if (!age_valid(major, minor, revision))
2001                                 line_fail(age_name, line);
2002                         continue;
2003                 }
2004                 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
2005                 if (ret == 2) {
2006                         ages[++gen] = UNICODE_AGE(major, minor, 0);
2007                         if (verbose > 1)
2008                                 printf(" Age V%d_%d = %d\n",
2009                                         major, minor, gen);
2010                         if (!age_valid(major, minor, 0))
2011                                 line_fail(age_name, line);
2012                         continue;
2013                 }
2014                 ret = sscanf(line, "%X..%X ; %d.%d #",
2015                              &first, &last, &major, &minor);
2016                 if (ret == 4) {
2017                         for (unichar = first; unichar <= last; unichar++)
2018                                 unicode_data[unichar].gen = gen;
2019                         count += 1 + last - first;
2020                         if (verbose > 1)
2021                                 printf("  %X..%X gen %d\n", first, last, gen);
2022                         if (!utf32valid(first) || !utf32valid(last))
2023                                 line_fail(age_name, line);
2024                         continue;
2025                 }
2026                 ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2027                 if (ret == 3) {
2028                         unicode_data[unichar].gen = gen;
2029                         count++;
2030                         if (verbose > 1)
2031                                 printf("  %X gen %d\n", unichar, gen);
2032                         if (!utf32valid(unichar))
2033                                 line_fail(age_name, line);
2034                         continue;
2035                 }
2036         }
2037         unicode_maxage = ages[gen];
2038         fclose(file);
2039
2040         /* Nix surrogate block */
2041         if (verbose > 1)
2042                 printf(" Removing surrogate block D800..DFFF\n");
2043         for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
2044                 unicode_data[unichar].gen = -1;
2045
2046         if (verbose > 0)
2047                 printf("Found %d entries\n", count);
2048         if (count == 0)
2049                 file_fail(age_name);
2050 }
2051
2052 static void ccc_init(void)
2053 {
2054         FILE *file;
2055         unsigned int first;
2056         unsigned int last;
2057         unsigned int unichar;
2058         unsigned int value;
2059         int count;
2060         int ret;
2061
2062         if (verbose > 0)
2063                 printf("Parsing %s\n", ccc_name);
2064
2065         file = fopen(ccc_name, "r");
2066         if (!file)
2067                 open_fail(ccc_name, errno);
2068
2069         count = 0;
2070         while (fgets(line, LINESIZE, file)) {
2071                 ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2072                 if (ret == 3) {
2073                         for (unichar = first; unichar <= last; unichar++) {
2074                                 unicode_data[unichar].ccc = value;
2075                                 count++;
2076                         }
2077                         if (verbose > 1)
2078                                 printf(" %X..%X ccc %d\n", first, last, value);
2079                         if (!utf32valid(first) || !utf32valid(last))
2080                                 line_fail(ccc_name, line);
2081                         continue;
2082                 }
2083                 ret = sscanf(line, "%X ; %d #", &unichar, &value);
2084                 if (ret == 2) {
2085                         unicode_data[unichar].ccc = value;
2086                         count++;
2087                         if (verbose > 1)
2088                                 printf(" %X ccc %d\n", unichar, value);
2089                         if (!utf32valid(unichar))
2090                                 line_fail(ccc_name, line);
2091                         continue;
2092                 }
2093         }
2094         fclose(file);
2095
2096         if (verbose > 0)
2097                 printf("Found %d entries\n", count);
2098         if (count == 0)
2099                 file_fail(ccc_name);
2100 }
2101
/*
 * Return 1 when type exactly matches one of the decomposition tag names
 * listed below, 0 otherwise.  The comparison is case-sensitive.
 */
static int ignore_compatibility_form(char *type)
{
        static const char *const ignored[] = {
                "font", "noBreak", "initial", "medial",
                "final", "isolated", "circle", "super",
                "sub", "vertical", "wide", "narrow",
                "small", "square", "fraction", "compat",
                NULL
        };
        const char *const *entry;

        for (entry = ignored; *entry; entry++) {
                if (!strcmp(type, *entry))
                        return 1;
        }
        return 0;
}
2115
2116 static void nfdi_init(void)
2117 {
2118         FILE *file;
2119         unsigned int unichar;
2120         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2121         char *s;
2122         char *type;
2123         unsigned int *um;
2124         int count;
2125         int i;
2126         int ret;
2127
2128         if (verbose > 0)
2129                 printf("Parsing %s\n", data_name);
2130         file = fopen(data_name, "r");
2131         if (!file)
2132                 open_fail(data_name, errno);
2133
2134         count = 0;
2135         while (fgets(line, LINESIZE, file)) {
2136                 ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2137                              &unichar, buf0);
2138                 if (ret != 2)
2139                         continue;
2140                 if (!utf32valid(unichar))
2141                         line_fail(data_name, line);
2142
2143                 s = buf0;
2144                 /* skip over <tag> */
2145                 if (*s == '<') {
2146                         type = ++s;
2147                         while (*++s != '>');
2148                         *s++ = '\0';
2149                         if(ignore_compatibility_form(type))
2150                                 continue;
2151                 }
2152                 /* decode the decomposition into UTF-32 */
2153                 i = 0;
2154                 while (*s) {
2155                         mapping[i] = strtoul(s, &s, 16);
2156                         if (!utf32valid(mapping[i]))
2157                                 line_fail(data_name, line);
2158                         i++;
2159                 }
2160                 mapping[i++] = 0;
2161
2162                 um = malloc(i * sizeof(unsigned int));
2163                 memcpy(um, mapping, i * sizeof(unsigned int));
2164                 unicode_data[unichar].utf32nfdi = um;
2165
2166                 if (verbose > 1)
2167                         print_utf32nfdi(unichar);
2168                 count++;
2169         }
2170         fclose(file);
2171         if (verbose > 0)
2172                 printf("Found %d entries\n", count);
2173         if (count == 0)
2174                 file_fail(data_name);
2175 }
2176
2177 static void nfdicf_init(void)
2178 {
2179         FILE *file;
2180         unsigned int unichar;
2181         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2182         char status;
2183         char *s;
2184         unsigned int *um;
2185         int i;
2186         int count;
2187         int ret;
2188
2189         if (verbose > 0)
2190                 printf("Parsing %s\n", fold_name);
2191         file = fopen(fold_name, "r");
2192         if (!file)
2193                 open_fail(fold_name, errno);
2194
2195         count = 0;
2196         while (fgets(line, LINESIZE, file)) {
2197                 ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2198                 if (ret != 3)
2199                         continue;
2200                 if (!utf32valid(unichar))
2201                         line_fail(fold_name, line);
2202                 /* Use the C+F casefold. */
2203                 if (status != 'C' && status != 'F')
2204                         continue;
2205                 s = buf0;
2206                 if (*s == '<')
2207                         while (*s++ != ' ')
2208                                 ;
2209                 i = 0;
2210                 while (*s) {
2211                         mapping[i] = strtoul(s, &s, 16);
2212                         if (!utf32valid(mapping[i]))
2213                                 line_fail(fold_name, line);
2214                         i++;
2215                 }
2216                 mapping[i++] = 0;
2217
2218                 um = malloc(i * sizeof(unsigned int));
2219                 memcpy(um, mapping, i * sizeof(unsigned int));
2220                 unicode_data[unichar].utf32nfdicf = um;
2221
2222                 if (verbose > 1)
2223                         print_utf32nfdicf(unichar);
2224                 count++;
2225         }
2226         fclose(file);
2227         if (verbose > 0)
2228                 printf("Found %d entries\n", count);
2229         if (count == 0)
2230                 file_fail(fold_name);
2231 }
2232
2233 static void corrections_init(void)
2234 {
2235         FILE *file;
2236         unsigned int unichar;
2237         unsigned int major;
2238         unsigned int minor;
2239         unsigned int revision;
2240         unsigned int age;
2241         unsigned int *um;
2242         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2243         char *s;
2244         int i;
2245         int count;
2246         int ret;
2247
2248         if (verbose > 0)
2249                 printf("Parsing %s\n", norm_name);
2250         file = fopen(norm_name, "r");
2251         if (!file)
2252                 open_fail(norm_name, errno);
2253
2254         count = 0;
2255         while (fgets(line, LINESIZE, file)) {
2256                 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2257                                 &unichar, buf0, buf1,
2258                                 &major, &minor, &revision);
2259                 if (ret != 6)
2260                         continue;
2261                 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2262                         line_fail(norm_name, line);
2263                 count++;
2264         }
2265         corrections = calloc(count, sizeof(struct unicode_data));
2266         corrections_count = count;
2267         rewind(file);
2268
2269         count = 0;
2270         while (fgets(line, LINESIZE, file)) {
2271                 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2272                                 &unichar, buf0, buf1,
2273                                 &major, &minor, &revision);
2274                 if (ret != 6)
2275                         continue;
2276                 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2277                         line_fail(norm_name, line);
2278                 corrections[count] = unicode_data[unichar];
2279                 assert(corrections[count].code == unichar);
2280                 age = UNICODE_AGE(major, minor, revision);
2281                 corrections[count].correction = age;
2282
2283                 i = 0;
2284                 s = buf0;
2285                 while (*s) {
2286                         mapping[i] = strtoul(s, &s, 16);
2287                         if (!utf32valid(mapping[i]))
2288                                 line_fail(norm_name, line);
2289                         i++;
2290                 }
2291                 mapping[i++] = 0;
2292
2293                 um = malloc(i * sizeof(unsigned int));
2294                 memcpy(um, mapping, i * sizeof(unsigned int));
2295                 corrections[count].utf32nfdi = um;
2296
2297                 if (verbose > 1)
2298                         printf(" %X -> %s -> %s V%d_%d_%d\n",
2299                                 unichar, buf0, buf1, major, minor, revision);
2300                 count++;
2301         }
2302         fclose(file);
2303
2304         if (verbose > 0)
2305                 printf("Found %d entries\n", count);
2306         if (count == 0)
2307                 file_fail(norm_name);
2308 }
2309
2310 /* ------------------------------------------------------------------ */
2311
2312 /*
2313  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2314  *
2315  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2316  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2317  *
2318  * SBase = 0xAC00
2319  * LBase = 0x1100
2320  * VBase = 0x1161
2321  * TBase = 0x11A7
2322  * LCount = 19
2323  * VCount = 21
2324  * TCount = 28
2325  * NCount = 588 (VCount * TCount)
2326  * SCount = 11172 (LCount * NCount)
2327  *
2328  * Decomposition:
2329  *   SIndex = s - SBase
2330  *
2331  * LV (Canonical/Full)
2332  *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
2334  *   LPart = LBase + LIndex
2335  *   VPart = VBase + VIndex
2336  *
2337  * LVT (Canonical)
2338  *   LVIndex = (SIndex / TCount) * TCount
 *   TIndex = (SIndex % TCount)
2340  *   LVPart = SBase + LVIndex
2341  *   TPart = TBase + TIndex
2342  *
2343  * LVT (Full)
2344  *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   TIndex = (SIndex % TCount)
2347  *   LPart = LBase + LIndex
2348  *   VPart = VBase + VIndex
2349  *   if (TIndex == 0) {
2350  *          d = <LPart, VPart>
2351  *   } else {
2352  *          TPart = TBase + TIndex
2353  *          d = <LPart, VPart, TPart>
2354  *   }
2355  *
2356  */
2357
2358 static void hangul_decompose(void)
2359 {
2360         unsigned int sb = 0xAC00;
2361         unsigned int lb = 0x1100;
2362         unsigned int vb = 0x1161;
2363         unsigned int tb = 0x11a7;
2364         /* unsigned int lc = 19; */
2365         unsigned int vc = 21;
2366         unsigned int tc = 28;
2367         unsigned int nc = (vc * tc);
2368         /* unsigned int sc = (lc * nc); */
2369         unsigned int unichar;
2370         unsigned int mapping[4];
2371         unsigned int *um;
2372         int count;
2373         int i;
2374
2375         if (verbose > 0)
2376                 printf("Decomposing hangul\n");
2377         /* Hangul */
2378         count = 0;
2379         for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2380                 unsigned int si = unichar - sb;
2381                 unsigned int li = si / nc;
2382                 unsigned int vi = (si % nc) / tc;
2383                 unsigned int ti = si % tc;
2384
2385                 i = 0;
2386                 mapping[i++] = lb + li;
2387                 mapping[i++] = vb + vi;
2388                 if (ti)
2389                         mapping[i++] = tb + ti;
2390                 mapping[i++] = 0;
2391
2392                 assert(!unicode_data[unichar].utf32nfdi);
2393                 um = malloc(i * sizeof(unsigned int));
2394                 memcpy(um, mapping, i * sizeof(unsigned int));
2395                 unicode_data[unichar].utf32nfdi = um;
2396
2397                 assert(!unicode_data[unichar].utf32nfdicf);
2398                 um = malloc(i * sizeof(unsigned int));
2399                 memcpy(um, mapping, i * sizeof(unsigned int));
2400                 unicode_data[unichar].utf32nfdicf = um;
2401
2402                 /*
2403                  * Add a cookie as a reminder that the hangul syllable
2404                  * decompositions must not be stored in the generated
2405                  * trie.
2406                  */
2407                 unicode_data[unichar].utf8nfdi = malloc(2);
2408                 unicode_data[unichar].utf8nfdi[0] = HANGUL;
2409                 unicode_data[unichar].utf8nfdi[1] = '\0';
2410
2411                 if (verbose > 1)
2412                         print_utf32nfdi(unichar);
2413
2414                 count++;
2415         }
2416         if (verbose > 0)
2417                 printf("Created %d entries\n", count);
2418 }
2419
2420 static void nfdi_decompose(void)
2421 {
2422         unsigned int unichar;
2423         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2424         unsigned int *um;
2425         unsigned int *dc;
2426         int count;
2427         int i;
2428         int j;
2429         int ret;
2430
2431         if (verbose > 0)
2432                 printf("Decomposing nfdi\n");
2433
2434         count = 0;
2435         for (unichar = 0; unichar != 0x110000; unichar++) {
2436                 if (!unicode_data[unichar].utf32nfdi)
2437                         continue;
2438                 for (;;) {
2439                         ret = 1;
2440                         i = 0;
2441                         um = unicode_data[unichar].utf32nfdi;
2442                         while (*um) {
2443                                 dc = unicode_data[*um].utf32nfdi;
2444                                 if (dc) {
2445                                         for (j = 0; dc[j]; j++)
2446                                                 mapping[i++] = dc[j];
2447                                         ret = 0;
2448                                 } else {
2449                                         mapping[i++] = *um;
2450                                 }
2451                                 um++;
2452                         }
2453                         mapping[i++] = 0;
2454                         if (ret)
2455                                 break;
2456                         free(unicode_data[unichar].utf32nfdi);
2457                         um = malloc(i * sizeof(unsigned int));
2458                         memcpy(um, mapping, i * sizeof(unsigned int));
2459                         unicode_data[unichar].utf32nfdi = um;
2460                 }
2461                 /* Add this decomposition to nfdicf if there is no entry. */
2462                 if (!unicode_data[unichar].utf32nfdicf) {
2463                         um = malloc(i * sizeof(unsigned int));
2464                         memcpy(um, mapping, i * sizeof(unsigned int));
2465                         unicode_data[unichar].utf32nfdicf = um;
2466                 }
2467                 if (verbose > 1)
2468                         print_utf32nfdi(unichar);
2469                 count++;
2470         }
2471         if (verbose > 0)
2472                 printf("Processed %d entries\n", count);
2473 }
2474
2475 static void nfdicf_decompose(void)
2476 {
2477         unsigned int unichar;
2478         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2479         unsigned int *um;
2480         unsigned int *dc;
2481         int count;
2482         int i;
2483         int j;
2484         int ret;
2485
2486         if (verbose > 0)
2487                 printf("Decomposing nfdicf\n");
2488         count = 0;
2489         for (unichar = 0; unichar != 0x110000; unichar++) {
2490                 if (!unicode_data[unichar].utf32nfdicf)
2491                         continue;
2492                 for (;;) {
2493                         ret = 1;
2494                         i = 0;
2495                         um = unicode_data[unichar].utf32nfdicf;
2496                         while (*um) {
2497                                 dc = unicode_data[*um].utf32nfdicf;
2498                                 if (dc) {
2499                                         for (j = 0; dc[j]; j++)
2500                                                 mapping[i++] = dc[j];
2501                                         ret = 0;
2502                                 } else {
2503                                         mapping[i++] = *um;
2504                                 }
2505                                 um++;
2506                         }
2507                         mapping[i++] = 0;
2508                         if (ret)
2509                                 break;
2510                         free(unicode_data[unichar].utf32nfdicf);
2511                         um = malloc(i * sizeof(unsigned int));
2512                         memcpy(um, mapping, i * sizeof(unsigned int));
2513                         unicode_data[unichar].utf32nfdicf = um;
2514                 }
2515                 if (verbose > 1)
2516                         print_utf32nfdicf(unichar);
2517                 count++;
2518         }
2519         if (verbose > 0)
2520                 printf("Processed %d entries\n", count);
2521 }
2522
2523 /* ------------------------------------------------------------------ */
2524
/* Forward declarations for the normalization routines defined below. */
struct utf8cursor;

/* Age queries: NUL-terminated and length-limited variants. */
int utf8agemax(struct tree *tree, const char *s);
int utf8agemin(struct tree *tree, const char *s);
int utf8nagemax(struct tree *tree, const char *s, size_t len);
int utf8nagemin(struct tree *tree, const char *s, size_t len);

/* Length of the normalized form. */
ssize_t utf8len(struct tree *tree, const char *s);
ssize_t utf8nlen(struct tree *tree, const char *s, size_t len);

/* Byte-at-a-time normalizer. */
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s);
int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
		size_t len);
int utf8byte(struct utf8cursor *u8c);
2535
2536 /*
2537  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2538  *
2539  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2540  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2541  *
2542  * SBase = 0xAC00
2543  * LBase = 0x1100
2544  * VBase = 0x1161
2545  * TBase = 0x11A7
2546  * LCount = 19
2547  * VCount = 21
2548  * TCount = 28
2549  * NCount = 588 (VCount * TCount)
2550  * SCount = 11172 (LCount * NCount)
2551  *
2552  * Decomposition:
2553  *   SIndex = s - SBase
2554  *
2555  * LV (Canonical/Full)
2556  *   LIndex = SIndex / NCount
2557  *   VIndex = (Sindex % NCount) / TCount
2558  *   LPart = LBase + LIndex
2559  *   VPart = VBase + VIndex
2560  *
2561  * LVT (Canonical)
2562  *   LVIndex = (SIndex / TCount) * TCount
2563  *   TIndex = (Sindex % TCount)
2564  *   LVPart = SBase + LVIndex
2565  *   TPart = TBase + TIndex
2566  *
2567  * LVT (Full)
2568  *   LIndex = SIndex / NCount
2569  *   VIndex = (Sindex % NCount) / TCount
2570  *   TIndex = (Sindex % TCount)
2571  *   LPart = LBase + LIndex
2572  *   VPart = VBase + VIndex
2573  *   if (TIndex == 0) {
2574  *          d = <LPart, VPart>
2575  *   } else {
2576  *          TPart = TBase + TIndex
2577  *          d = <LPart, VPart, TPart>
2578  *   }
2579  */
2580
/* Hangul decomposition constants (bases are code points, the rest counts). */
#define SB	0xAC00		/* SBase: first precomposed syllable */
#define LB	0x1100		/* LBase */
#define VB	0x1161		/* VBase */
#define TB	0x11A7		/* TBase */
#define LC	19		/* LCount */
#define VC	21		/* VCount */
#define TC	28		/* TCount */
#define NC	(VC * TC)	/* NCount = 588 */
#define SC	(LC * NC)	/* SCount = 11172 */
2591
2592 /* Algorithmic decomposition of hangul syllable. */
2593 static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
2594 {
2595         unsigned int    si;
2596         unsigned int    li;
2597         unsigned int    vi;
2598         unsigned int    ti;
2599         unsigned char   *h;
2600
2601         /* Calculate the SI, LI, VI, and TI values. */
2602         si = utf8decode(str) - SB;
2603         li = si / NC;
2604         vi = (si % NC) / TC;
2605         ti = si % TC;
2606
2607         /* Fill in base of leaf. */
2608         h = hangul;
2609         LEAF_GEN(h) = 2;
2610         LEAF_CCC(h) = DECOMPOSE;
2611         h += 2;
2612
2613         /* Add LPart, a 3-byte UTF-8 sequence. */
2614         h += utf8encode((char *)h, li + LB);
2615
2616         /* Add VPart, a 3-byte UTF-8 sequence. */
2617         h += utf8encode((char *)h, vi + VB);
2618
2619         /* Add TPart if required, also a 3-byte UTF-8 sequence. */
2620         if (ti)
2621                 h += utf8encode((char *)h, ti + TB);
2622
2623         /* Terminate string. */
2624         h[0] = '\0';
2625
2626         return hangul;
2627 }
2628
2629 /*
2630  * Use trie to scan s, touching at most len bytes.
2631  * Returns the leaf if one exists, NULL otherwise.
2632  *
2633  * A non-NULL return guarantees that the UTF-8 sequence starting at s
2634  * is well-formed and corresponds to a known unicode code point.  The
2635  * shorthand for this will be "is valid UTF-8 unicode".
2636  */
2637 static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
2638                                const char *s, size_t len)
2639 {
2640         utf8trie_t      *trie;
2641         int             offlen;
2642         int             offset;
2643         int             mask;
2644         int             node;
2645
2646         if (!tree)
2647                 return NULL;
2648         if (len == 0)
2649                 return NULL;
2650         node = 1;
2651         trie = utf8data + tree->index;
2652         while (node) {
2653                 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2654                 if (*trie & NEXTBYTE) {
2655                         if (--len == 0)
2656                                 return NULL;
2657                         s++;
2658                 }
2659                 mask = 1 << (*trie & BITNUM);
2660                 if (*s & mask) {
2661                         /* Right leg */
2662                         if (offlen) {
2663                                 /* Right node at offset of trie */
2664                                 node = (*trie & RIGHTNODE);
2665                                 offset = trie[offlen];
2666                                 while (--offlen) {
2667                                         offset <<= 8;
2668                                         offset |= trie[offlen];
2669                                 }
2670                                 trie += offset;
2671                         } else if (*trie & RIGHTPATH) {
2672                                 /* Right node after this node */
2673                                 node = (*trie & TRIENODE);
2674                                 trie++;
2675                         } else {
2676                                 /* No right node. */
2677                                 return NULL;
2678                         }
2679                 } else {
2680                         /* Left leg */
2681                         if (offlen) {
2682                                 /* Left node after this node. */
2683                                 node = (*trie & LEFTNODE);
2684                                 trie += offlen + 1;
2685                         } else if (*trie & RIGHTPATH) {
2686                                 /* No left node. */
2687                                 return NULL;
2688                         } else {
2689                                 /* Left node after this node */
2690                                 node = (*trie & TRIENODE);
2691                                 trie++;
2692                         }
2693                 }
2694         }
2695         /*
2696          * Hangul decomposition is done algorithmically. These are the
2697          * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2698          * always 3 bytes long, so s has been advanced twice, and the
2699          * start of the sequence is at s-2.
2700          */
2701         if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
2702                 trie = utf8hangul(s - 2, hangul);
2703         return trie;
2704 }
2705
2706 /*
2707  * Use trie to scan s.
2708  * Returns the leaf if one exists, NULL otherwise.
2709  *
2710  * Forwards to trie_nlookup().
2711  */
2712 static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
2713                               const char *s)
2714 {
2715         return utf8nlookup(tree, hangul, s, (size_t)-1);
2716 }
2717
/*
 * Number of bytes in the UTF-8 sequence whose first byte is *s.
 * Only the lead byte is inspected: the caller must pass a pointer to
 * the first byte of a valid sequence.
 */
static inline int utf8clen(const char *s)
{
	const unsigned char lead = *s;
	int nbytes = 1;

	/* Lead bytes 0xC0.., 0xE0.. and 0xF0.. each add one more byte. */
	if (lead >= 0xC0)
		nbytes++;
	if (lead >= 0xE0)
		nbytes++;
	if (lead >= 0xF0)
		nbytes++;
	return nbytes;
}
2728
2729 /*
2730  * Maximum age of any character in s.
2731  * Return -1 if s is not valid UTF-8 unicode.
2732  * Return 0 if only non-assigned code points are used.
2733  */
2734 int utf8agemax(struct tree *tree, const char *s)
2735 {
2736         utf8leaf_t      *leaf;
2737         int             age = 0;
2738         int             leaf_age;
2739         unsigned char   hangul[UTF8HANGULLEAF];
2740
2741         if (!tree)
2742                 return -1;
2743
2744         while (*s) {
2745                 leaf = utf8lookup(tree, hangul, s);
2746                 if (!leaf)
2747                         return -1;
2748                 leaf_age = ages[LEAF_GEN(leaf)];
2749                 if (leaf_age <= tree->maxage && leaf_age > age)
2750                         age = leaf_age;
2751                 s += utf8clen(s);
2752         }
2753         return age;
2754 }
2755
2756 /*
2757  * Minimum age of any character in s.
2758  * Return -1 if s is not valid UTF-8 unicode.
2759  * Return 0 if non-assigned code points are used.
2760  */
2761 int utf8agemin(struct tree *tree, const char *s)
2762 {
2763         utf8leaf_t      *leaf;
2764         int             age;
2765         int             leaf_age;
2766         unsigned char   hangul[UTF8HANGULLEAF];
2767
2768         if (!tree)
2769                 return -1;
2770         age = tree->maxage;
2771         while (*s) {
2772                 leaf = utf8lookup(tree, hangul, s);
2773                 if (!leaf)
2774                         return -1;
2775                 leaf_age = ages[LEAF_GEN(leaf)];
2776                 if (leaf_age <= tree->maxage && leaf_age < age)
2777                         age = leaf_age;
2778                 s += utf8clen(s);
2779         }
2780         return age;
2781 }
2782
2783 /*
2784  * Maximum age of any character in s, touch at most len bytes.
2785  * Return -1 if s is not valid UTF-8 unicode.
2786  */
2787 int utf8nagemax(struct tree *tree, const char *s, size_t len)
2788 {
2789         utf8leaf_t      *leaf;
2790         int             age = 0;
2791         int             leaf_age;
2792         unsigned char   hangul[UTF8HANGULLEAF];
2793
2794         if (!tree)
2795                 return -1;
2796
2797         while (len && *s) {
2798                 leaf = utf8nlookup(tree, hangul, s, len);
2799                 if (!leaf)
2800                         return -1;
2801                 leaf_age = ages[LEAF_GEN(leaf)];
2802                 if (leaf_age <= tree->maxage && leaf_age > age)
2803                         age = leaf_age;
2804                 len -= utf8clen(s);
2805                 s += utf8clen(s);
2806         }
2807         return age;
2808 }
2809
2810 /*
2811  * Maximum age of any character in s, touch at most len bytes.
2812  * Return -1 if s is not valid UTF-8 unicode.
2813  */
2814 int utf8nagemin(struct tree *tree, const char *s, size_t len)
2815 {
2816         utf8leaf_t      *leaf;
2817         int             leaf_age;
2818         int             age;
2819         unsigned char   hangul[UTF8HANGULLEAF];
2820
2821         if (!tree)
2822                 return -1;
2823         age = tree->maxage;
2824         while (len && *s) {
2825                 leaf = utf8nlookup(tree, hangul, s, len);
2826                 if (!leaf)
2827                         return -1;
2828                 leaf_age = ages[LEAF_GEN(leaf)];
2829                 if (leaf_age <= tree->maxage && leaf_age < age)
2830                         age = leaf_age;
2831                 len -= utf8clen(s);
2832                 s += utf8clen(s);
2833         }
2834         return age;
2835 }
2836
2837 /*
2838  * Length of the normalization of s.
2839  * Return -1 if s is not valid UTF-8 unicode.
2840  *
2841  * A string of Default_Ignorable_Code_Point has length 0.
2842  */
2843 ssize_t utf8len(struct tree *tree, const char *s)
2844 {
2845         utf8leaf_t      *leaf;
2846         size_t          ret = 0;
2847         unsigned char   hangul[UTF8HANGULLEAF];
2848
2849         if (!tree)
2850                 return -1;
2851         while (*s) {
2852                 leaf = utf8lookup(tree, hangul, s);
2853                 if (!leaf)
2854                         return -1;
2855                 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2856                         ret += utf8clen(s);
2857                 else if (LEAF_CCC(leaf) == DECOMPOSE)
2858                         ret += strlen(LEAF_STR(leaf));
2859                 else
2860                         ret += utf8clen(s);
2861                 s += utf8clen(s);
2862         }
2863         return ret;
2864 }
2865
2866 /*
2867  * Length of the normalization of s, touch at most len bytes.
2868  * Return -1 if s is not valid UTF-8 unicode.
2869  */
2870 ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2871 {
2872         utf8leaf_t      *leaf;
2873         size_t          ret = 0;
2874         unsigned char   hangul[UTF8HANGULLEAF];
2875
2876         if (!tree)
2877                 return -1;
2878         while (len && *s) {
2879                 leaf = utf8nlookup(tree, hangul, s, len);
2880                 if (!leaf)
2881                         return -1;
2882                 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2883                         ret += utf8clen(s);
2884                 else if (LEAF_CCC(leaf) == DECOMPOSE)
2885                         ret += strlen(LEAF_STR(leaf));
2886                 else
2887                         ret += utf8clen(s);
2888                 len -= utf8clen(s);
2889                 s += utf8clen(s);
2890         }
2891         return ret;
2892 }
2893
2894 /*
2895  * Cursor structure used by the normalizer.
2896  */
2897 struct utf8cursor {
2898         struct tree     *tree;
2899         const char      *s;
2900         const char      *p;
2901         const char      *ss;
2902         const char      *sp;
2903         unsigned int    len;
2904         unsigned int    slen;
2905         short int       ccc;
2906         short int       nccc;
2907         unsigned int    unichar;
2908         unsigned char   hangul[UTF8HANGULLEAF];
2909 };
2910
2911 /*
2912  * Set up an utf8cursor for use by utf8byte().
2913  *
2914  *   s      : string.
2915  *   len    : length of s.
2916  *   u8c    : pointer to cursor.
2917  *   trie   : utf8trie_t to use for normalization.
2918  *
2919  * Returns -1 on error, 0 on success.
2920  */
2921 int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2922                 size_t len)
2923 {
2924         if (!tree)
2925                 return -1;
2926         if (!s)
2927                 return -1;
2928         u8c->tree = tree;
2929         u8c->s = s;
2930         u8c->p = NULL;
2931         u8c->ss = NULL;
2932         u8c->sp = NULL;
2933         u8c->len = len;
2934         u8c->slen = 0;
2935         u8c->ccc = STOPPER;
2936         u8c->nccc = STOPPER;
2937         u8c->unichar = 0;
2938         /* Check we didn't clobber the maximum length. */
2939         if (u8c->len != len)
2940                 return -1;
2941         /* The first byte of s may not be an utf8 continuation. */
2942         if (len > 0 && (*s & 0xC0) == 0x80)
2943                 return -1;
2944         return 0;
2945 }
2946
/*
 * Initialise a cursor for utf8byte() over a NUL-terminated string.
 *
 *   u8c    : cursor to initialise.
 *   tree   : tree to use for normalization.
 *   s      : NUL-terminated string.
 *
 * Delegates to utf8ncursor() with the largest length the cursor can
 * hold.  Returns -1 on error, 0 on success.
 */
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
{
	const unsigned int unbounded = (unsigned int)-1;

	return utf8ncursor(u8c, tree, s, unbounded);
}
2960
2961 /*
2962  * Get one byte from the normalized form of the string described by u8c.
2963  *
2964  * Returns the byte cast to an unsigned char on succes, and -1 on failure.
2965  *
2966  * The cursor keeps track of the location in the string in u8c->s.
2967  * When a character is decomposed, the current location is stored in
2968  * u8c->p, and u8c->s is set to the start of the decomposition. Note
2969  * that bytes from a decomposition do not count against u8c->len.
2970  *
2971  * Characters are emitted if they match the current CCC in u8c->ccc.
2972  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
2973  * and the function returns 0 in that case.
2974  *
2975  * Sorting by CCC is done by repeatedly scanning the string.  The
2976  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
2977  * the start of the scan.  The first pass finds the lowest CCC to be
2978  * emitted and stores it in u8c->nccc, the second pass emits the
2979  * characters with this CCC and finds the next lowest CCC. This limits
2980  * the number of passes to 1 + the number of different CCCs in the
2981  * sequence being scanned.
2982  *
2983  * Therefore:
2984  *  u8c->p  != NULL -> a decomposition is being scanned.
2985  *  u8c->ss != NULL -> this is a repeating scan.
2986  *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
2987  */
2988 int utf8byte(struct utf8cursor *u8c)
2989 {
2990         utf8leaf_t *leaf;
2991         int ccc;
2992
2993         for (;;) {
2994                 /* Check for the end of a decomposed character. */
2995                 if (u8c->p && *u8c->s == '\0') {
2996                         u8c->s = u8c->p;
2997                         u8c->p = NULL;
2998                 }
2999
3000                 /* Check for end-of-string. */
3001                 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
3002                         /* There is no next byte. */
3003                         if (u8c->ccc == STOPPER)
3004                                 return 0;
3005                         /* End-of-string during a scan counts as a stopper. */
3006                         ccc = STOPPER;
3007                         goto ccc_mismatch;
3008                 } else if ((*u8c->s & 0xC0) == 0x80) {
3009                         /* This is a continuation of the current character. */
3010                         if (!u8c->p)
3011                                 u8c->len--;
3012                         return (unsigned char)*u8c->s++;
3013                 }
3014
3015                 /* Look up the data for the current character. */
3016                 if (u8c->p) {
3017                         leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3018                 } else {
3019                         leaf = utf8nlookup(u8c->tree, u8c->hangul,
3020                                            u8c->s, u8c->len);
3021                 }
3022
3023                 /* No leaf found implies that the input is a binary blob. */
3024                 if (!leaf)
3025                         return -1;
3026
3027                 /* Characters that are too new have CCC 0. */
3028                 if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
3029                         ccc = STOPPER;
3030                 } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
3031                         u8c->len -= utf8clen(u8c->s);
3032                         u8c->p = u8c->s + utf8clen(u8c->s);
3033                         u8c->s = LEAF_STR(leaf);
3034                         /* Empty decomposition implies CCC 0. */
3035                         if (*u8c->s == '\0') {
3036                                 if (u8c->ccc == STOPPER)
3037                                         continue;
3038                                 ccc = STOPPER;
3039                                 goto ccc_mismatch;
3040                         }
3041                         leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3042                         ccc = LEAF_CCC(leaf);
3043                 }
3044                 u8c->unichar = utf8decode(u8c->s);
3045
3046                 /*
3047                  * If this is not a stopper, then see if it updates
3048                  * the next canonical class to be emitted.
3049                  */
3050                 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
3051                         u8c->nccc = ccc;
3052
3053                 /*
3054                  * Return the current byte if this is the current
3055                  * combining class.
3056                  */
3057                 if (ccc == u8c->ccc) {
3058                         if (!u8c->p)
3059                                 u8c->len--;
3060                         return (unsigned char)*u8c->s++;
3061                 }
3062
3063                 /* Current combining class mismatch. */
3064         ccc_mismatch:
3065                 if (u8c->nccc == STOPPER) {
3066                         /*
3067                          * Scan forward for the first canonical class
3068                          * to be emitted.  Save the position from
3069                          * which to restart.
3070                          */
3071                         assert(u8c->ccc == STOPPER);
3072                         u8c->ccc = MINCCC - 1;
3073                         u8c->nccc = ccc;
3074                         u8c->sp = u8c->p;
3075                         u8c->ss = u8c->s;
3076                         u8c->slen = u8c->len;
3077                         if (!u8c->p)
3078                                 u8c->len -= utf8clen(u8c->s);
3079                         u8c->s += utf8clen(u8c->s);
3080                 } else if (ccc != STOPPER) {
3081                         /* Not a stopper, and not the ccc we're emitting. */
3082                         if (!u8c->p)
3083                                 u8c->len -= utf8clen(u8c->s);
3084                         u8c->s += utf8clen(u8c->s);
3085                 } else if (u8c->nccc != MAXCCC + 1) {
3086                         /* At a stopper, restart for next ccc. */
3087                         u8c->ccc = u8c->nccc;
3088                         u8c->nccc = MAXCCC + 1;
3089                         u8c->s = u8c->ss;
3090                         u8c->p = u8c->sp;
3091                         u8c->len = u8c->slen;
3092                 } else {
3093                         /* All done, proceed from here. */
3094                         u8c->ccc = STOPPER;
3095                         u8c->nccc = STOPPER;
3096                         u8c->sp = NULL;
3097                         u8c->ss = NULL;
3098                         u8c->slen = 0;
3099                 }
3100         }
3101 }
3102
3103 /* ------------------------------------------------------------------ */
3104
3105 static int normalize_line(struct tree *tree)
3106 {
3107         char *s;
3108         char *t;
3109         int c;
3110         struct utf8cursor u8c;
3111
3112         /* First test: null-terminated string. */
3113         s = buf2;
3114         t = buf3;
3115         if (utf8cursor(&u8c, tree, s))
3116                 return -1;
3117         while ((c = utf8byte(&u8c)) > 0)
3118                 if (c != (unsigned char)*t++)
3119                         return -1;
3120         if (c < 0)
3121                 return -1;
3122         if (*t != 0)
3123                 return -1;
3124
3125         /* Second test: length-limited string. */
3126         s = buf2;
3127         /* Replace NUL with a value that will cause an error if seen. */
3128         s[strlen(s) + 1] = -1;
3129         t = buf3;
3130         if (utf8cursor(&u8c, tree, s))
3131                 return -1;
3132         while ((c = utf8byte(&u8c)) > 0)
3133                 if (c != (unsigned char)*t++)
3134                         return -1;
3135         if (c < 0)
3136                 return -1;
3137         if (*t != 0)
3138                 return -1;
3139
3140         return 0;
3141 }
3142
3143 static void normalization_test(void)
3144 {
3145         FILE *file;
3146         unsigned int unichar;
3147         struct unicode_data *data;
3148         char *s;
3149         char *t;
3150         int ret;
3151         int ignorables;
3152         int tests = 0;
3153         int failures = 0;
3154
3155         if (verbose > 0)
3156                 printf("Parsing %s\n", test_name);
3157         /* Step one, read data from file. */
3158         file = fopen(test_name, "r");
3159         if (!file)
3160                 open_fail(test_name, errno);
3161
3162         while (fgets(line, LINESIZE, file)) {
3163                 ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3164                              buf0, buf1);
3165                 if (ret != 2 || *line == '#')
3166                         continue;
3167                 s = buf0;
3168                 t = buf2;
3169                 while (*s) {
3170                         unichar = strtoul(s, &s, 16);
3171                         t += utf8encode(t, unichar);
3172                 }
3173                 *t = '\0';
3174
3175                 ignorables = 0;
3176                 s = buf1;
3177                 t = buf3;
3178                 while (*s) {
3179                         unichar = strtoul(s, &s, 16);
3180                         data = &unicode_data[unichar];
3181                         if (data->utf8nfdi && !*data->utf8nfdi)
3182                                 ignorables = 1;
3183                         else
3184                                 t += utf8encode(t, unichar);
3185                 }
3186                 *t = '\0';
3187
3188                 tests++;
3189                 if (normalize_line(nfdi_tree) < 0) {
3190                         printf("Line %s -> %s", buf0, buf1);
3191                         if (ignorables)
3192                                 printf(" (ignorables removed)");
3193                         printf(" failure\n");
3194                         failures++;
3195                 }
3196         }
3197         fclose(file);
3198         if (verbose > 0)
3199                 printf("Ran %d tests with %d failures\n", tests, failures);
3200         if (failures)
3201                 file_fail(test_name);
3202 }
3203
3204 /* ------------------------------------------------------------------ */
3205
3206 static void write_file(void)
3207 {
3208         FILE *file;
3209         int i;
3210         int j;
3211         int t;
3212         int gen;
3213
3214         if (verbose > 0)
3215                 printf("Writing %s\n", utf8_name);
3216         file = fopen(utf8_name, "w");
3217         if (!file)
3218                 open_fail(utf8_name, errno);
3219
3220         fprintf(file, "/* This file is generated code, do not edit. */\n");
3221         fprintf(file, "\n");
3222         fprintf(file, "#include <linux/module.h>\n");
3223         fprintf(file, "#include <linux/kernel.h>\n");
3224         fprintf(file, "#include \"utf8n.h\"\n");
3225         fprintf(file, "\n");
3226         fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3227         for (i = 0; i != ages_count; i++)
3228                 fprintf(file, "\t%#x%s\n", ages[i],
3229                         ages[i] == unicode_maxage ? "" : ",");
3230         fprintf(file, "};\n");
3231         fprintf(file, "\n");
3232         fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
3233         t = 0;
3234         for (gen = 0; gen < ages_count; gen++) {
3235                 fprintf(file, "\t{ %#x, %d }%s\n",
3236                         ages[gen], trees[t].index,
3237                         ages[gen] == unicode_maxage ? "" : ",");
3238                 if (trees[t].maxage == ages[gen])
3239                         t += 2;
3240         }
3241         fprintf(file, "};\n");
3242         fprintf(file, "\n");
3243         fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
3244         t = 1;
3245         for (gen = 0; gen < ages_count; gen++) {
3246                 fprintf(file, "\t{ %#x, %d }%s\n",
3247                         ages[gen], trees[t].index,
3248                         ages[gen] == unicode_maxage ? "" : ",");
3249                 if (trees[t].maxage == ages[gen])
3250                         t += 2;
3251         }
3252         fprintf(file, "};\n");
3253         fprintf(file, "\n");
3254         fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3255                 utf8data_size);
3256         t = 0;
3257         for (i = 0; i != utf8data_size; i += 16) {
3258                 if (i == trees[t].index) {
3259                         fprintf(file, "\t/* %s_%x */\n",
3260                                 trees[t].type, trees[t].maxage);
3261                         if (t < trees_count-1)
3262                                 t++;
3263                 }
3264                 fprintf(file, "\t");
3265                 for (j = i; j != i + 16; j++)
3266                         fprintf(file, "0x%.2x%s", utf8data[j],
3267                                 (j < utf8data_size -1 ? "," : ""));
3268                 fprintf(file, "\n");
3269         }
3270         fprintf(file, "};\n");
3271         fprintf(file, "\n");
3272         fprintf(file, "const struct utf8data_table utf8_data_table = {\n");
3273         fprintf(file, "\t.utf8agetab = utf8agetab,\n");
3274         fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
3275         fprintf(file, "\n");
3276         fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
3277         fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
3278         fprintf(file, "\n");
3279         fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
3280         fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
3281         fprintf(file, "\n");
3282         fprintf(file, "\t.utf8data = utf8data,\n");
3283         fprintf(file, "};\n");
3284         fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
3285         fprintf(file, "\n");
3286         fprintf(file, "MODULE_DESCRIPTION(\"UTF8 data table\");\n");
3287         fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
3288         fclose(file);
3289 }
3290
3291 /* ------------------------------------------------------------------ */
3292
3293 int main(int argc, char *argv[])
3294 {
3295         unsigned int unichar;
3296         int opt;
3297
3298         argv0 = argv[0];
3299
3300         while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3301                 switch (opt) {
3302                 case 'a':
3303                         age_name = optarg;
3304                         break;
3305                 case 'c':
3306                         ccc_name = optarg;
3307                         break;
3308                 case 'd':
3309                         data_name = optarg;
3310                         break;
3311                 case 'f':
3312                         fold_name = optarg;
3313                         break;
3314                 case 'n':
3315                         norm_name = optarg;
3316                         break;
3317                 case 'o':
3318                         utf8_name = optarg;
3319                         break;
3320                 case 'p':
3321                         prop_name = optarg;
3322                         break;
3323                 case 't':
3324                         test_name = optarg;
3325                         break;
3326                 case 'v':
3327                         verbose++;
3328                         break;
3329                 case 'h':
3330                         help();
3331                         exit(0);
3332                 default:
3333                         usage();
3334                 }
3335         }
3336
3337         if (verbose > 1)
3338                 help();
3339         for (unichar = 0; unichar != 0x110000; unichar++)
3340                 unicode_data[unichar].code = unichar;
3341         age_init();
3342         ccc_init();
3343         nfdi_init();
3344         nfdicf_init();
3345         corrections_init();
3346         hangul_decompose();
3347         nfdi_decompose();
3348         nfdicf_decompose();
3349         utf8_init();
3350         trees_init();
3351         trees_populate();
3352         trees_reduce();
3353         trees_verify();
3354         /* Prevent "unused function" warning. */
3355         (void)lookup(nfdi_tree, " ");
3356         if (verbose > 2)
3357                 tree_walk(nfdi_tree);
3358         if (verbose > 2)
3359                 tree_walk(nfdicf_tree);
3360         normalization_test();
3361         write_file();
3362
3363         return 0;
3364 }