intl/unicharutil/tools/ucgendat.c

   1 /*
   2  * Copyright 1996, 1997, 1998 Computing Research Labs,
   3  * New Mexico State University
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice shall be included in
  13  * all copies or substantial portions of the Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
  19  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
  20  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
  21  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  */
  23 #ifndef lint
  24 #ifdef __GNUC__
  25 static char rcsid[] __attribute__ ((unused)) = "$Id: ucgendat.c,v 1.1 1999/01/08 00:19:21 ftang%netscape.com Exp $";
  26 #else
  27 static char rcsid[] = "$Id: ucgendat.c,v 1.1 1999/01/08 00:19:21 ftang%netscape.com Exp $";
  28 #endif
  29 #endif
  30
  31 #include <stdio.h>
  32 #include <stdlib.h>
  33 #include <string.h>
  34 #ifndef WIN32
  35 #include <unistd.h>
  36 #endif
  37
  38 #define ishdigit(cc) (((cc) >= '0' && (cc) <= '9') ||\
  39                       ((cc) >= 'A' && (cc) <= 'F') ||\
  40                       ((cc) >= 'a' && (cc) <= 'f'))
  41
  42 /*
  43  * A header written to the output file with the byte-order-mark and the number
  44  * of property nodes.
  45  */
  46 static unsigned short hdr[2] = {0xfeff, 0};
  47
  48 #define NUMPROPS 49
  49 #define NEEDPROPS (NUMPROPS + (4 - (NUMPROPS & 3)))
  50
  51 typedef struct {
  52     char *name;
  53     int len;
  54 } _prop_t;
  55
  56 /*
  57  * List of properties expected to be found in the Unicode Character Database
  58  * including some implementation specific properties.
  59  *
  60  * The implementation specific properties are:
  61  * Cm = Composed (can be decomposed)
  62  * Nb = Non-breaking
  63  * Sy = Symmetric (has left and right forms)
  64  * Hd = Hex digit
  65  * Qm = Quote marks
  66  * Mr = Mirroring
  67  * Ss = Space, other
  68  * Cp = Defined character
  69  */
  70 static _prop_t props[NUMPROPS] = {
  71     {"Mn", 2}, {"Mc", 2}, {"Me", 2}, {"Nd", 2}, {"Nl", 2}, {"No", 2},
  72     {"Zs", 2}, {"Zl", 2}, {"Zp", 2}, {"Cc", 2}, {"Cf", 2}, {"Cs", 2},
  73     {"Co", 2}, {"Cn", 2}, {"Lu", 2}, {"Ll", 2}, {"Lt", 2}, {"Lm", 2},
  74     {"Lo", 2}, {"Pc", 2}, {"Pd", 2}, {"Ps", 2}, {"Pe", 2}, {"Po", 2},
  75     {"Sm", 2}, {"Sc", 2}, {"Sk", 2}, {"So", 2}, {"L",  1}, {"R",  1},
  76     {"EN", 2}, {"ES", 2}, {"ET", 2}, {"AN", 2}, {"CS", 2}, {"B",  1},
  77     {"S",  1}, {"WS", 2}, {"ON", 2},
  78     {"Cm", 2}, {"Nb", 2}, {"Sy", 2}, {"Hd", 2}, {"Qm", 2}, {"Mr", 2},
  79     {"Ss", 2}, {"Cp", 2}, {"Pi", 2}, {"Pf", 2}
  80 };
  81
  82 typedef struct {
  83     unsigned long *ranges;
  84     unsigned short used;
  85     unsigned short size;
  86 } _ranges_t;
  87
  88 static _ranges_t proptbl[NUMPROPS];
  89
  90 /*
  91  * Make sure this array is sized to be on a 4-byte boundary at compile time.
  92  */
  93 static unsigned short propcnt[NEEDPROPS];
  94
  95 /*
  96  * Array used to collect a decomposition before adding it to the decomposition
  97  * table.
  98  */
  99 static unsigned long dectmp[64];
 100 static unsigned long dectmp_size;
 101
 102 typedef struct {
 103     unsigned long code;
 104     unsigned short size;
 105     unsigned short used;
 106     unsigned long *decomp;
 107 } _decomp_t;
 108
 109 /*
 110  * List of decomposition.  Created and expanded in order as the characters are
 111  * encountered.
 112  */
 113 static _decomp_t *decomps;
 114 static unsigned long decomps_used;
 115 static unsigned long decomps_size;
 116
 117 /*
 118  * Types and lists for handling lists of case mappings.
 119  */
 120 typedef struct {
 121     unsigned long key;
 122     unsigned long other1;
 123     unsigned long other2;
 124 } _case_t;
 125
 126 static _case_t *upper;
 127 static _case_t *lower;
 128 static _case_t *title;
 129 static unsigned long upper_used;
 130 static unsigned long upper_size;
 131 static unsigned long lower_used;
 132 static unsigned long lower_size;
 133 static unsigned long title_used;
 134 static unsigned long title_size;
 135
 136 /*
 137  * Array used to collect case mappings before adding them to a list.
 138  */
 139 static unsigned long cases[3];
 140
 141 /*
 142  * An array to hold ranges for combining classes.
 143  */
 144 static unsigned long *ccl;
 145 static unsigned long ccl_used;
 146 static unsigned long ccl_size;
 147
 148 /*
 149  * Structures for handling numbers.
 150  */
 151 typedef struct {
 152     unsigned long code;
 153     unsigned long idx;
 154 } _codeidx_t;
 155
 156 typedef struct {
 157     short numerator;
 158     short denominator;
 159 } _num_t;
 160
 161 /*
 162  * Arrays to hold the mapping of codes to numbers.
 163  */
 164 static _codeidx_t *ncodes;
 165 static unsigned long ncodes_used;
 166 static unsigned long ncodes_size;
 167
 168 static _num_t *nums;
 169 static unsigned long nums_used;
 170 static unsigned long nums_size;
 171
 172 /*
 173  * Array for holding numbers.
 174  */
 175 static _num_t *nums;
 176 static unsigned long nums_used;
 177 static unsigned long nums_size;
 178
 179 static void
 180 #ifdef __STDC__
 181 add_range(unsigned long start, unsigned long end, char *p1, char *p2)
 182 #else
 183 add_range(start, end, p1, p2)
 184 unsigned long start, end;
 185 char *p1, *p2;
 186 #endif
 187 {
 188     int i, j, k, len;
 189     _ranges_t *rlp;
 190     char *name;
 191
 192     for (k = 0; k < 2; k++) {
 193         if (k == 0) {
 194             name = p1;
 195             len = 2;
 196         } else {
 197             if (p2 == 0)
 198               break;
 199
 200             name = p2;
 201             len = 1;
 202         }
 203
 204         for (i = 0; i < NUMPROPS; i++) {
 205             if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
 206               break;
 207         }
 208
 209         if (i == NUMPROPS)
 210           continue;
 211
 212         rlp = &proptbl[i];
 213
 214         /*
 215          * Resize the range list if necessary.
 216          */
 217         if (rlp->used == rlp->size) {
 218             if (rlp->size == 0)
 219               rlp->ranges = (unsigned long *)
 220                   malloc(sizeof(unsigned long) << 3);
 221             else
 222               rlp->ranges = (unsigned long *)
 223                   realloc((char *) rlp->ranges,
 224                           sizeof(unsigned long) * (rlp->size + 8));
 225             rlp->size += 8;
 226         }
 227
 228         /*
 229          * If this is the first code for this property list, just add it
 230          * and return.
 231          */
 232         if (rlp->used == 0) {
 233             rlp->ranges[0] = start;
 234             rlp->ranges[1] = end;
 235             rlp->used += 2;
 236             continue;
 237         }
 238
 239         /*
 240          * Optimize the case of adding the range to the end.
 241          */
 242         j = rlp->used - 1;
 243         if (start > rlp->ranges[j]) {
 244             j = rlp->used;
 245             rlp->ranges[j++] = start;
 246             rlp->ranges[j++] = end;
 247             rlp->used = j;
 248             continue;
 249         }
 250
 251         /*
 252          * Need to locate the insertion point.
 253          */
 254         for (i = 0;
 255              i < rlp->used && start > rlp->ranges[i + 1] + 1; i += 2) ;
 256
 257         /*
 258          * If the start value lies in the current range, then simply set the
 259          * new end point of the range to the end value passed as a parameter.
 260          */
 261         if (rlp->ranges[i] <= start && start <= rlp->ranges[i + 1] + 1) {
 262             rlp->ranges[i + 1] = end;
 263             return;
 264         }
 265
 266         /*
 267          * Shift following values up by two.
 268          */
 269         for (j = rlp->used; j > i; j -= 2) {
 270             rlp->ranges[j] = rlp->ranges[j - 2];
 271             rlp->ranges[j + 1] = rlp->ranges[j - 1];
 272         }
 273
 274         /*
 275          * Add the new range at the insertion point.
 276          */
 277         rlp->ranges[i] = start;
 278         rlp->ranges[i + 1] = end;
 279         rlp->used += 2;
 280     }
 281 }
 282
 283 static void
 284 #ifdef __STDC__
 285 ordered_range_insert(unsigned long c, char *name, int len)
 286 #else
 287 ordered_range_insert(c, name, len)
 288 unsigned long c;
 289 char *name;
 290 int len;
 291 #endif
 292 {
 293     int i, j;
 294     unsigned long s, e;
 295     _ranges_t *rlp;
 296
 297     if (len == 0)
 298       return;
 299
 300     for (i = 0; i < NUMPROPS; i++) {
 301         if (props[i].len == len && memcmp(props[i].name, name, len) == 0)
 302           break;
 303     }
 304
 305     if (i == NUMPROPS)
 306       return;
 307
 308     /*
 309      * Have a match, so insert the code in order.
 310      */
 311     rlp = &proptbl[i];
 312
 313     /*
 314      * Resize the range list if necessary.
 315      */
 316     if (rlp->used == rlp->size) {
 317         if (rlp->size == 0)
 318           rlp->ranges = (unsigned long *)
 319               malloc(sizeof(unsigned long) << 3);
 320         else
 321           rlp->ranges = (unsigned long *)
 322               realloc((char *) rlp->ranges,
 323                       sizeof(unsigned long) * (rlp->size + 8));
 324         rlp->size += 8;
 325     }
 326
 327     /*
 328      * If this is the first code for this property list, just add it
 329      * and return.
 330      */
 331     if (rlp->used == 0) {
 332         rlp->ranges[0] = rlp->ranges[1] = c;
 333         rlp->used += 2;
 334         return;
 335     }
 336
 337     /*
 338      * Optimize the cases of extending the last range and adding new ranges to
 339      * the end.
 340      */
 341     j = rlp->used - 1;
 342     e = rlp->ranges[j];
 343     s = rlp->ranges[j - 1];
 344
 345     if (c == e + 1) {
 346         /*
 347          * Extend the last range.
 348          */
 349         rlp->ranges[j] = c;
 350         return;
 351     }
 352
 353     if (c > e + 1) {
 354         /*
 355          * Start another range on the end.
 356          */
 357         j = rlp->used;
 358         rlp->ranges[j] = rlp->ranges[j + 1] = c;
 359         rlp->used += 2;
 360         return;
 361     }
 362
 363     if (c >= s)
 364       /*
 365        * The code is a duplicate of a code in the last range, so just return.
 366        */
 367       return;
 368
 369     /*
 370      * The code should be inserted somewhere before the last range in the
 371      * list.  Locate the insertion point.
 372      */
 373     for (i = 0;
 374          i < rlp->used && c > rlp->ranges[i + 1] + 1; i += 2) ;
 375
 376     s = rlp->ranges[i];
 377     e = rlp->ranges[i + 1];
 378
 379     if (c == e + 1)
 380       /*
 381        * Simply extend the current range.
 382        */
 383       rlp->ranges[i + 1] = c;
 384     else if (c < s) {
 385         /*
 386          * Add a new entry before the current location.  Shift all entries
 387          * before the current one up by one to make room.
 388          */
 389         for (j = rlp->used; j > i; j -= 2) {
 390             rlp->ranges[j] = rlp->ranges[j - 2];
 391             rlp->ranges[j + 1] = rlp->ranges[j - 1];
 392         }
 393         rlp->ranges[i] = rlp->ranges[i + 1] = c;
 394
 395         rlp->used += 2;
 396     }
 397 }
 398
 399 static void
 400 #ifdef __STDC__
 401 add_decomp(unsigned long code)
 402 #else
 403 add_decomp(code)
 404 unsigned long code;
 405 #endif
 406 {
 407     unsigned long i, j, size;
 408
 409     /*
 410      * Add the code to the composite property.
 411      */
 412     ordered_range_insert(code, "Cm", 2);
 413
 414     /*
 415      * Locate the insertion point for the code.
 416      */
 417     for (i = 0; i < decomps_used && code > decomps[i].code; i++) ;
 418
 419     /*
 420      * Allocate space for a new decomposition.
 421      */
 422     if (decomps_used == decomps_size) {
 423         if (decomps_size == 0)
 424           decomps = (_decomp_t *) malloc(sizeof(_decomp_t) << 3);
 425         else
 426           decomps = (_decomp_t *)
 427               realloc((char *) decomps,
 428                       sizeof(_decomp_t) * (decomps_size + 8));
 429         (void) memset((char *) (decomps + decomps_size), 0,
 430                       sizeof(_decomp_t) << 3);
 431         decomps_size += 8;
 432     }
 433
 434     if (i < decomps_used && code != decomps[i].code) {
 435         /*
 436          * Shift the decomps up by one if the codes don't match.
 437          */
 438         for (j = decomps_used; j > i; j--)
 439           (void) memcpy((char *) &decomps[j], (char *) &decomps[j - 1],
 440                         sizeof(_decomp_t));
 441     }
 442
 443     /*
 444      * Insert or replace a decomposition.
 445      */
 446     size = dectmp_size + (4 - (dectmp_size & 3));
 447     if (decomps[i].size < size) {
 448         if (decomps[i].size == 0)
 449           decomps[i].decomp = (unsigned long *)
 450               malloc(sizeof(unsigned long) * size);
 451         else
 452           decomps[i].decomp = (unsigned long *)
 453               realloc((char *) decomps[i].decomp,
 454                       sizeof(unsigned long) * size);
 455         decomps[i].size = size;
 456     }
 457
 458     if (decomps[i].code != code)
 459       decomps_used++;
 460
 461     decomps[i].code = code;
 462     decomps[i].used = dectmp_size;
 463     (void) memcpy((char *) decomps[i].decomp, (char *) dectmp,
 464                   sizeof(unsigned long) * dectmp_size);
 465
 466 }
 467
 468 static void
 469 #ifdef __STDC__
 470 add_title(unsigned long code)
 471 #else
 472 add_title(code)
 473 unsigned long code;
 474 #endif
 475 {
 476     unsigned long i, j;
 477
 478     /*
 479      * Always map the code to itself.
 480      */
 481     cases[2] = code;
 482
 483     if (title_used == title_size) {
 484         if (title_size == 0)
 485           title = (_case_t *) malloc(sizeof(_case_t) << 3);
 486         else
 487           title = (_case_t *) realloc((char *) title,
 488                                       sizeof(_case_t) * (title_size + 8));
 489         title_size += 8;
 490     }
 491
 492     /*
 493      * Locate the insertion point.
 494      */
 495     for (i = 0; i < title_used && code > title[i].key; i++) ;
 496
 497     if (i < title_used) {
 498         /*
 499          * Shift the array up by one.
 500          */
 501         for (j = title_used; j > i; j--)
 502           (void) memcpy((char *) &title[j], (char *) &title[j - 1],
 503                         sizeof(_case_t));
 504     }
 505
 506     title[i].key = cases[2];    /* Title */
 507     title[i].other1 = cases[0]; /* Upper */
 508     title[i].other2 = cases[1]; /* Lower */
 509
 510     title_used++;
 511 }
 512
 513 static void
 514 #ifdef __STDC__
 515 add_upper(unsigned long code)
 516 #else
 517 add_upper(code)
 518 unsigned long code;
 519 #endif
 520 {
 521     unsigned long i, j;
 522
 523     /*
 524      * Always map the code to itself.
 525      */
 526     cases[0] = code;
 527
 528     /*
 529      * If the title case character is not present, then make it the same as
 530      * the upper case.
 531      */
 532     if (cases[2] == 0)
 533       cases[2] = code;
 534
 535     if (upper_used == upper_size) {
 536         if (upper_size == 0)
 537           upper = (_case_t *) malloc(sizeof(_case_t) << 3);
 538         else
 539           upper = (_case_t *) realloc((char *) upper,
 540                                       sizeof(_case_t) * (upper_size + 8));
 541         upper_size += 8;
 542     }
 543
 544     /*
 545      * Locate the insertion point.
 546      */
 547     for (i = 0; i < upper_used && code > upper[i].key; i++) ;
 548
 549     if (i < upper_used) {
 550         /*
 551          * Shift the array up by one.
 552          */
 553         for (j = upper_used; j > i; j--)
 554           (void) memcpy((char *) &upper[j], (char *) &upper[j - 1],
 555                         sizeof(_case_t));
 556     }
 557
 558     upper[i].key = cases[0];    /* Upper */
 559     upper[i].other1 = cases[1]; /* Lower */
 560     upper[i].other2 = cases[2]; /* Title */
 561
 562     upper_used++;
 563 }
 564
 565 static void
 566 #ifdef __STDC__
 567 add_lower(unsigned long code)
 568 #else
 569 add_lower(code)
 570 unsigned long code;
 571 #endif
 572 {
 573     unsigned long i, j;
 574
 575     /*
 576      * Always map the code to itself.
 577      */
 578     cases[1] = code;
 579
 580     /*
 581      * If the title case character is empty, then make it the same as the
 582      * upper case.
 583      */
 584     if (cases[2] == 0)
 585       cases[2] = cases[0];
 586
 587     if (lower_used == lower_size) {
 588         if (lower_size == 0)
 589           lower = (_case_t *) malloc(sizeof(_case_t) << 3);
 590         else
 591           lower = (_case_t *) realloc((char *) lower,
 592                                       sizeof(_case_t) * (lower_size + 8));
 593         lower_size += 8;
 594     }
 595
 596     /*
 597      * Locate the insertion point.
 598      */
 599     for (i = 0; i < lower_used && code > lower[i].key; i++) ;
 600
 601     if (i < lower_used) {
 602         /*
 603          * Shift the array up by one.
 604          */
 605         for (j = lower_used; j > i; j--)
 606           (void) memcpy((char *) &lower[j], (char *) &lower[j - 1],
 607                         sizeof(_case_t));
 608     }
 609
 610     lower[i].key = cases[1];    /* Lower */
 611     lower[i].other1 = cases[0]; /* Upper */
 612     lower[i].other2 = cases[2]; /* Title */
 613
 614     lower_used++;
 615 }
 616
 617 static void
 618 #ifdef __STDC__
 619 ordered_ccl_insert(unsigned long c, unsigned long ccl_code)
 620 #else
 621 ordered_ccl_insert(c, ccl_code)
 622 unsigned long c, ccl_code;
 623 #endif
 624 {
 625     unsigned long i, j;
 626
 627     if (ccl_used == ccl_size) {
 628         if (ccl_size == 0)
 629           ccl = (unsigned long *) malloc(sizeof(unsigned long) * 24);
 630         else
 631           ccl = (unsigned long *)
 632               realloc((char *) ccl, sizeof(unsigned long) * (ccl_size + 24));
 633         ccl_size += 24;
 634     }
 635
 636     /*
 637      * Optimize adding the first item.
 638      */
 639     if (ccl_used == 0) {
 640         ccl[0] = ccl[1] = c;
 641         ccl[2] = ccl_code;
 642         ccl_used += 3;
 643         return;
 644     }
 645
 646     /*
 647      * Handle the special case of extending the range on the end.  This
 648      * requires that the combining class codes are the same.
 649      */
 650     if (ccl_code == ccl[ccl_used - 1] && c == ccl[ccl_used - 2] + 1) {
 651         ccl[ccl_used - 2] = c;
 652         return;
 653     }
 654
 655     /*
 656      * Handle the special case of adding another range on the end.
 657      */
 658     if (c > ccl[ccl_used - 2] + 1 ||
 659         (c == ccl[ccl_used - 2] + 1 && ccl_code != ccl[ccl_used - 1])) {
 660         ccl[ccl_used++] = c;
 661         ccl[ccl_used++] = c;
 662         ccl[ccl_used++] = ccl_code;
 663         return;
 664     }
 665
 666     /*
 667      * Locate either the insertion point or range for the code.
 668      */
 669     for (i = 0; i < ccl_used && c > ccl[i + 1] + 1; i += 3) ;
 670
 671     if (ccl_code == ccl[i + 2] && c == ccl[i + 1] + 1) {
 672         /*
 673          * Extend an existing range.
 674          */
 675         ccl[i + 1] = c;
 676         return;
 677     } else if (c < ccl[i]) {
 678         /*
 679          * Start a new range before the current location.
 680          */
 681         for (j = ccl_used; j > i; j -= 3) {
 682             ccl[j] = ccl[j - 3];
 683             ccl[j - 1] = ccl[j - 4];
 684             ccl[j - 2] = ccl[j - 5];
 685         }
 686         ccl[i] = ccl[i + 1] = c;
 687         ccl[i + 2] = ccl_code;
 688     }
 689 }
 690
 691 /*
 692  * Adds a number if it does not already exist and returns an index value
 693  * multiplied by 2.
 694  */
 695 static unsigned long
 696 #ifdef __STDC__
 697 make_number(short num, short denom)
 698 #else
 699 make_number(num, denom)
 700 short num, denom;
 701 #endif
 702 {
 703     unsigned long n;
 704
 705     /*
 706      * Determine if the number already exists.
 707      */
 708     for (n = 0; n < nums_used; n++) {
 709         if (nums[n].numerator == num && nums[n].denominator == denom)
 710           return n << 1;
 711     }
 712
 713     if (nums_used == nums_size) {
 714         if (nums_size == 0)
 715           nums = (_num_t *) malloc(sizeof(_num_t) << 3);
 716         else
 717           nums = (_num_t *) realloc((char *) nums,
 718                                     sizeof(_num_t) * (nums_size + 8));
 719         nums_size += 8;
 720     }
 721
 722     n = nums_used++;
 723     nums[n].numerator = num;
 724     nums[n].denominator = denom;
 725
 726     return n << 1;
 727 }
 728
 729 static void
 730 #ifdef __STDC__
 731 add_number(unsigned long code, short num, short denom)
 732 #else
 733 add_number(code, num, denom)
 734 unsigned long code;
 735 short num, denom;
 736 #endif
 737 {
 738     unsigned long i, j;
 739
 740     /*
 741      * Insert the code in order.
 742      */
 743     for (i = 0; i < ncodes_used && code > ncodes[i].code; i++) ;
 744
 745     /*
 746      * Handle the case of the codes matching and simply replace the number
 747      * that was there before.
 748      */
 749     if (ncodes_used > 0 && code == ncodes[i].code) {
 750         ncodes[i].idx = make_number(num, denom);
 751         return;
 752     }
 753
 754     /*
 755      * Resize the array if necessary.
 756      */
 757     if (ncodes_used == ncodes_size) {
 758         if (ncodes_size == 0)
 759           ncodes = (_codeidx_t *) malloc(sizeof(_codeidx_t) << 3);
 760         else
 761           ncodes = (_codeidx_t *)
 762               realloc((char *) ncodes, sizeof(_codeidx_t) * (ncodes_size + 8));
 763
 764         ncodes_size += 8;
 765     }
 766
 767     /*
 768      * Shift things around to insert the code if necessary.
 769      */
 770     if (i < ncodes_used) {
 771         for (j = ncodes_used; j > i; j--) {
 772             ncodes[j].code = ncodes[j - 1].code;
 773             ncodes[j].idx = ncodes[j - 1].idx;
 774         }
 775     }
 776     ncodes[i].code = code;
 777     ncodes[i].idx = make_number(num, denom);
 778
 779     ncodes_used++;
 780 }
 781
 782 /*
 783  * This routine assumes that the line is a valid Unicode Character Database
 784  * entry.
 785  */
 786 static void
 787 #ifdef __STDC__
 788 read_cdata(FILE *in)
 789 #else
 790 read_cdata(in)
 791 FILE *in;
 792 #endif
 793 {
 794     unsigned long i, lineno, skip, code, ccl_code;
 795     short wnum, neg, number[2];
 796     char line[512], *s, *e;
 797
 798     lineno = skip = 0;
 799     while (fscanf(in, "%[^\n]\n", line) != EOF) {
 800         lineno++;
 801
 802         /*
 803          * Skip blank lines and lines that start with a '#'.
 804          */
 805         if (line[0] == 0 || line[0] == '#')
 806           continue;
 807
 808         /*
 809          * If lines need to be skipped, do it here.
 810          */
 811         if (skip) {
 812             skip--;
 813             continue;
 814         }
 815
 816         /*
 817          * Collect the code.  The code can be up to 6 hex digits in length to
 818          * allow surrogates to be specified.
 819          */
 820         for (s = line, i = code = 0; *s != ';' && i < 6; i++, s++) {
 821             code <<= 4;
 822             if (*s >= '0' && *s <= '9')
 823               code += *s - '0';
 824             else if (*s >= 'A' && *s <= 'F')
 825               code += (*s - 'A') + 10;
 826             else if (*s >= 'a' && *s <= 'f')
 827               code += (*s - 'a') + 10;
 828         }
 829
 830         /*
 831          * Handle the following special cases:
 832          * 1. 4E00-9FA5 CJK Ideographs.
 833          * 2. AC00-D7A3 Hangul Syllables.
 834          * 3. D800-DFFF Surrogates.
 835          * 4. E000-F8FF Private Use Area.
 836          * 5. F900-FA2D Han compatibility.
 837          */
 838         switch (code) {
 839           case 0x4e00:
 840             /*
 841              * The Han ideographs.
 842              */
 843             add_range(0x4e00, 0x9fff, "Lo", "L");
 844
 845             /*
 846              * Add the characters to the defined category.
 847              */
 848             add_range(0x4e00, 0x9fa5, "Cp", 0);
 849
 850             skip = 1;
 851             break;
 852           case 0xac00:
 853             /*
 854              * The Hangul syllables.
 855              */
 856             add_range(0xac00, 0xd7a3, "Lo", "L");
 857
 858             /*
 859              * Add the characters to the defined category.
 860              */
 861             add_range(0xac00, 0xd7a3, "Cp", 0);
 862
 863             skip = 1;
 864             break;
 865           case 0xd800:
 866             /*
 867              * Make a range of all surrogates and assume some default
 868              * properties.
 869              */
 870             add_range(0x010000, 0x10ffff, "Cs", "L");
 871             skip = 5;
 872             break;
 873           case 0xe000:
 874             /*
 875              * The Private Use area.  Add with a default set of properties.
 876              */
 877             add_range(0xe000, 0xf8ff, "Co", "L");
 878             skip = 1;
 879             break;
 880           case 0xf900:
 881             /*
 882              * The CJK compatibility area.
 883              */
 884             add_range(0xf900, 0xfaff, "Lo", "L");
 885
 886             /*
 887              * Add the characters to the defined category.
 888              */
 889             add_range(0xf900, 0xfaff, "Cp", 0);
 890
 891             skip = 1;
 892         }
 893
 894         if (skip)
 895           continue;
 896
 897         /*
 898          * Add the code to the defined category.
 899          */
 900         ordered_range_insert(code, "Cp", 2);
 901
 902         /*
 903          * Locate the first character property field.
 904          */
 905         for (i = 0; *s != 0 && i < 2; s++) {
 906             if (*s == ';')
 907               i++;
 908         }
 909         for (e = s; *e && *e != ';'; e++) ;
 910
 911         ordered_range_insert(code, s, e - s);
 912
 913         /*
 914          * Locate the combining class code.
 915          */
 916         for (s = e; *s != 0 && i < 3; s++) {
 917             if (*s == ';')
 918               i++;
 919         }
 920
 921         /*
 922          * Convert the combining class code from decimal.
 923          */
 924         for (ccl_code = 0, e = s; *e && *e != ';'; e++)
 925           ccl_code = (ccl_code * 10) + (*e - '0');
 926
 927         /*
 928          * Add the code if it not 0.
 929          */
 930         if (ccl_code != 0)
 931           ordered_ccl_insert(code, ccl_code);
 932
 933         /*
 934          * Locate the second character property field.
 935          */
 936         for (s = e; *s != 0 && i < 4; s++) {
 937             if (*s == ';')
 938               i++;
 939         }
 940         for (e = s; *e && *e != ';'; e++) ;
 941
 942         ordered_range_insert(code, s, e - s);
 943
 944         /*
 945          * Check for a decomposition.
 946          */
 947         s = ++e;
 948         if (*s != ';' && *s != '<') {
 949             /*
 950              * Collect the codes of the decomposition.
 951              */
 952             for (dectmp_size = 0; *s != ';'; ) {
 953                 /*
 954                  * Skip all leading non-hex digits.
 955                  */
 956                 while (!ishdigit(*s))
 957                   s++;
 958
 959                 for (dectmp[dectmp_size] = 0; ishdigit(*s); s++) {
 960                     dectmp[dectmp_size] <<= 4;
 961                     if (*s >= '0' && *s <= '9')
 962                       dectmp[dectmp_size] += *s - '0';
 963                     else if (*s >= 'A' && *s <= 'F')
 964                       dectmp[dectmp_size] += (*s - 'A') + 10;
 965                     else if (*s >= 'a' && *s <= 'f')
 966                       dectmp[dectmp_size] += (*s - 'a') + 10;
 967                 }
 968                 dectmp_size++;
 969             }
 970
 971             /*
 972              * If there is more than one code in the temporary decomposition
 973              * array, then add the character with its decomposition.
 974              */
 975             if (dectmp_size > 1)
 976               add_decomp(code);
 977         }
 978
 979         /*
 980          * Skip to the number field.
 981          */
 982         for (i = 0; i < 3 && *s; s++) {
 983             if (*s == ';')
 984               i++;
 985         }
 986
 987         /*
 988          * Scan the number in.
 989          */
 990         number[0] = number[1] = 0;
 991         for (e = s, neg = wnum = 0; *e && *e != ';'; e++) {
 992             if (*e == '-') {
 993                 neg = 1;
 994                 continue;
 995             }
 996
 997             if (*e == '/') {
 998                 /*
 999                  * Move the the denominator of the fraction.
1000                  */
1001                 if (neg)
1002                   number[wnum] *= -1;
1003                 neg = 0;
1004                 e++;
1005                 wnum++;
1006             }
1007             number[wnum] = (number[wnum] * 10) + (*e - '0');
1008         }
1009
1010         if (e > s) {
1011             /*
1012              * Adjust the denominator in case of integers and add the number.
1013              */
1014             if (wnum == 0)
1015               number[1] = number[0];
1016
1017             add_number(code, number[0], number[1]);
1018         }
1019
1020         /*
1021          * Skip to the start of the possible case mappings.
1022          */
1023         for (s = e, i = 0; i < 4 && *s; s++) {
1024             if (*s == ';')
1025               i++;
1026         }
1027
1028         /*
1029          * Collect the case mappings.
1030          */
1031         cases[0] = cases[1] = cases[2] = 0;
1032         for (i = 0; i < 3; i++) {
1033             while (ishdigit(*s)) {
1034                 cases[i] <<= 4;
1035                 if (*s >= '0' && *s <= '9')
1036                   cases[i] += *s - '0';
1037                 else if (*s >= 'A' && *s <= 'F')
1038                   cases[i] += (*s - 'A') + 10;
1039                 else if (*s >= 'a' && *s <= 'f')
1040                   cases[i] += (*s - 'a') + 10;
1041                 s++;
1042             }
1043             if (*s == ';')
1044               s++;
1045         }
1046         if (cases[0] && cases[1])
1047           /*
1048            * Add the upper and lower mappings for a title case character.
1049            */
1050           add_title(code);
1051         else if (cases[1])
1052           /*
1053            * Add the lower and title case mappings for the upper case
1054            * character.
1055            */
1056           add_upper(code);
1057         else if (cases[0])
1058           /*
1059            * Add the upper and title case mappings for the lower case
1060            * character.
1061            */
1062           add_lower(code);
1063     }
1064 }
1065
1066 static _decomp_t *
1067 #ifdef __STDC__
1068 find_decomp(unsigned long code)
1069 #else
1070 find_decomp(code)
1071 unsigned long code;
1072 #endif
1073 {
1074     long l, r, m;
1075
1076     l = 0;
1077     r = decomps_used - 1;
1078     while (l <= r) {
1079         m = (l + r) >> 1;
1080         if (code > decomps[m].code)
1081           l = m + 1;
1082         else if (code < decomps[m].code)
1083           r = m - 1;
1084         else
1085           return &decomps[m];
1086     }
1087     return 0;
1088 }
1089
1090 static void
1091 #ifdef __STDC__
1092 decomp_it(_decomp_t *d)
1093 #else
1094 decomp_it(d)
1095 _decomp_t *d;
1096 #endif
1097 {
1098     unsigned long i;
1099     _decomp_t *dp;
1100
1101     for (i = 0; i < d->used; i++) {
1102         if ((dp = find_decomp(d->decomp[i])) != 0)
1103           decomp_it(dp);
1104         else
1105           dectmp[dectmp_size++] = d->decomp[i];
1106     }
1107 }
1108
1109 /*
1110  * Expand all decompositions by recursively decomposing each character
1111  * in the decomposition.
1112  */
1113 static void
1114 #ifdef __STDC__
1115 expand_decomp(void)
1116 #else
1117 expand_decomp()
1118 #endif
1119 {
1120     unsigned long i;
1121
1122     for (i = 0; i < decomps_used; i++) {
1123         dectmp_size = 0;
1124         decomp_it(&decomps[i]);
1125         if (dectmp_size > 0)
1126           add_decomp(decomps[i].code);
1127     }
1128 }
1129
1130 static void
1131 #ifdef __STDC__
1132 write_cdata(char *opath)
1133 #else
1134 write_cdata(opath)
1135 char *opath;
1136 #endif
1137 {
1138     FILE *out;
1139     unsigned long i, idx, bytes, nprops;
1140     unsigned short casecnt[2];
1141     char path[BUFSIZ];
1142
1143     /*****************************************************************
1144      *
1145      * Generate the ctype data.
1146      *
1147      *****************************************************************/
1148
1149     /*
1150      * Open the ctype.dat file.
1151      */
1152     sprintf(path, "%s/ctype.dat", opath);
1153     if ((out = fopen(path, "wb")) == 0)
1154       return;
1155
1156     /*
1157      * Collect the offsets for the properties.  The offsets array is
1158      * on a 4-byte boundary to keep things efficient for architectures
1159      * that need such a thing.
1160      */
1161     for (i = idx = 0; i < NUMPROPS; i++) {
1162         propcnt[i] = (proptbl[i].used != 0) ? idx : 0xffff;
1163         idx += proptbl[i].used;
1164     }
1165
1166     /*
1167      * Add the sentinel index which is used by the binary search as the upper
1168      * bound for a search.
1169      */
1170     propcnt[i] = idx;
1171
1172     /*
1173      * Record the actual number of property lists.  This may be different than
1174      * the number of offsets actually written because of aligning on a 4-byte
1175      * boundary.
1176      */
1177     hdr[1] = NUMPROPS;
1178
1179     /*
1180      * Calculate the byte count needed and pad the property counts array to a
1181      * 4-byte boundary.
1182      */
1183     if ((bytes = sizeof(unsigned short) * (NUMPROPS + 1)) & 3)
1184       bytes += 4 - (bytes & 3);
1185     nprops = bytes / sizeof(unsigned short);
1186     bytes += sizeof(unsigned long) * idx;
1187
1188     /*
1189      * Write the header.
1190      */
1191     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1192
1193     /*
1194      * Write the byte count.
1195      */
1196     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1197
1198     /*
1199      * Write the property list counts.
1200      */
1201     fwrite((char *) propcnt, sizeof(unsigned short), nprops, out);
1202
1203     /*
1204      * Write the property lists.
1205      */
1206     for (i = 0; i < NUMPROPS; i++) {
1207         if (proptbl[i].used > 0)
1208           fwrite((char *) proptbl[i].ranges, sizeof(unsigned long),
1209                  proptbl[i].used, out);
1210     }
1211
1212     fclose(out);
1213
1214     /*****************************************************************
1215      *
1216      * Generate the case mapping data.
1217      *
1218      *****************************************************************/
1219
1220     /*
1221      * Open the case.dat file.
1222      */
1223     sprintf(path, "%s/case.dat", opath);
1224     if ((out = fopen(path, "wb")) == 0)
1225       return;
1226
1227     /*
1228      * Write the case mapping tables.
1229      */
1230     hdr[1] = upper_used + lower_used + title_used;
1231     casecnt[0] = upper_used;
1232     casecnt[1] = lower_used;
1233
1234     /*
1235      * Write the header.
1236      */
1237     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1238
1239     /*
1240      * Write the upper and lower case table sizes.
1241      */
1242     fwrite((char *) casecnt, sizeof(unsigned short), 2, out);
1243
1244     if (upper_used > 0)
1245       /*
1246        * Write the upper case table.
1247        */
1248       fwrite((char *) upper, sizeof(_case_t), upper_used, out);
1249
1250     if (lower_used > 0)
1251       /*
1252        * Write the lower case table.
1253        */
1254       fwrite((char *) lower, sizeof(_case_t), lower_used, out);
1255
1256     if (title_used > 0)
1257       /*
1258        * Write the title case table.
1259        */
1260       fwrite((char *) title, sizeof(_case_t), title_used, out);
1261
1262     fclose(out);
1263
1264     /*****************************************************************
1265      *
1266      * Generate the decomposition data.
1267      *
1268      *****************************************************************/
1269
1270     /*
1271      * Fully expand all decompositions before generating the output file.
1272      */
1273     expand_decomp();
1274
1275     /*
1276      * Open the decomp.dat file.
1277      */
1278     sprintf(path, "%s/decomp.dat", opath);
1279     if ((out = fopen(path, "wb")) == 0)
1280       return;
1281
1282     hdr[1] = decomps_used;
1283
1284     /*
1285      * Write the header.
1286      */
1287     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1288
1289     /*
1290      * Write a temporary byte count which will be calculated as the
1291      * decompositions are written out.
1292      */
1293     bytes = 0;
1294     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1295
1296     if (decomps_used) {
1297         /*
1298          * Write the list of decomp nodes.
1299          */
1300         for (i = idx = 0; i < decomps_used; i++) {
1301             fwrite((char *) &decomps[i].code, sizeof(unsigned long), 1, out);
1302             fwrite((char *) &idx, sizeof(unsigned long), 1, out);
1303             idx += decomps[i].used;
1304         }
1305
1306         /*
1307          * Write the sentinel index as the last decomp node.
1308          */
1309         fwrite((char *) &idx, sizeof(unsigned long), 1, out);
1310
1311         /*
1312          * Write the decompositions themselves.
1313          */
1314         for (i = 0; i < decomps_used; i++)
1315           fwrite((char *) decomps[i].decomp, sizeof(unsigned long),
1316                  decomps[i].used, out);
1317
1318         /*
1319          * Seek back to the beginning and write the byte count.
1320          */
1321         bytes = (sizeof(unsigned long) * idx) +
1322             (sizeof(unsigned long) * ((hdr[1] << 1) + 1));
1323         fseek(out, sizeof(unsigned short) << 1, 0L);
1324         fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1325
1326         fclose(out);
1327     }
1328
1329     /*****************************************************************
1330      *
1331      * Generate the combining class data.
1332      *
1333      *****************************************************************/
1334
1335     /*
1336      * Open the cmbcl.dat file.
1337      */
1338     sprintf(path, "%s/cmbcl.dat", opath);
1339     if ((out = fopen(path, "wb")) == 0)
1340       return;
1341
1342     /*
1343      * Set the number of ranges used.  Each range has a combining class which
1344      * means each entry is a 3-tuple.
1345      */
1346     hdr[1] = ccl_used / 3;
1347
1348     /*
1349      * Write the header.
1350      */
1351     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1352
1353     /*
1354      * Write out the byte count to maintain header size.
1355      */
1356     bytes = ccl_used * sizeof(unsigned long);
1357     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1358
1359     if (ccl_used > 0)
1360       /*
1361        * Write the combining class ranges out.
1362        */
1363       fwrite((char *) ccl, sizeof(unsigned long), ccl_used, out);
1364
1365     fclose(out);
1366
1367     /*****************************************************************
1368      *
1369      * Generate the number data.
1370      *
1371      *****************************************************************/
1372
1373     /*
1374      * Open the num.dat file.
1375      */
1376     sprintf(path, "%s/num.dat", opath);
1377     if ((out = fopen(path, "wb")) == 0)
1378       return;
1379
1380     /*
1381      * The count part of the header will be the total number of codes that
1382      * have numbers.
1383      */
1384     hdr[1] = (unsigned short) (ncodes_used << 1);
1385     bytes = (ncodes_used * sizeof(_codeidx_t)) + (nums_used * sizeof(_num_t));
1386
1387     /*
1388      * Write the header.
1389      */
1390     fwrite((char *) hdr, sizeof(unsigned short), 2, out);
1391
1392     /*
1393      * Write out the byte count to maintain header size.
1394      */
1395     fwrite((char *) &bytes, sizeof(unsigned long), 1, out);
1396
1397     /*
1398      * Now, if number mappings exist, write them out.
1399      */
1400     if (ncodes_used > 0) {
1401         fwrite((char *) ncodes, sizeof(_codeidx_t), ncodes_used, out);
1402         fwrite((char *) nums, sizeof(_num_t), nums_used, out);
1403     }
1404
1405     fclose(out);
1406 }
1407
/*
 * Program entry point.
 *
 * Each argument is either "-o" followed by the output directory, or the
 * name of an input file which is opened and handed to read_cdata().
 * Input files are processed in the order given.  Once all arguments are
 * consumed the collected tables are written with write_cdata() to the
 * output directory, which defaults to "." when no "-o" was given.
 *
 * An input file that cannot be opened is reported on stderr and skipped.
 * The program always finishes with exit status 0.
 */
int
#ifdef __STDC__
main(int argc, char *argv[])
#else
main(argc, argv)
int argc;
char *argv[];
#endif
{
    FILE *in;
    char *prog, *opath;

    /*
     * Use the last path component of argv[0] as the program name in
     * messages.
     */
    if ((prog = strrchr(argv[0], '/')) != 0)
      prog++;
    else
      prog = argv[0];

    opath = 0;

    argc--;
    argv++;

    while (argc > 0) {
        if (argv[0][0] == '-' && argv[0][1] == 'o') {
            /*
             * The next argument is the output directory.  If "-o" is the
             * last argument, argv[0] is the terminating null pointer and
             * the default directory is used below.
             */
            argc--;
            argv++;
            opath = argv[0];
        } else {
            /*
             * Every stream opened here is closed right after it is read,
             * so there is never a previous stream left to close.  (The
             * old code called fclose() on a null pointer when a second
             * file name was given.)
             */
            if ((in = fopen(argv[0], "rb")) == 0)
              fprintf(stderr, "%s: unable to open ctype file %s\n",
                      prog, argv[0]);
            else {
                read_cdata(in);
                fclose(in);
            }
        }
        argc--;
        argv++;
    }

    if (opath == 0)
      opath = ".";
    write_cdata(opath);

    return 0;
}