usr/src/cmd/localedef/collate.c

   1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11
  12 /*
  13  * Copyright 2017 Nexenta Systems, Inc.
  14  */
  15
  16 /*
  17  * LC_COLLATE database generation routines for localedef.
  18  */
  19
  20 #include <stdio.h>
  21 #include <stdlib.h>
  22 #include <errno.h>
  23 #include <string.h>
  24 #include <sys/types.h>
  25 #include <sys/avl.h>
  26 #include <string.h>
  27 #include <unistd.h>
  28 #include <wchar.h>
  29 #include <widec.h>
  30 #include <limits.h>
  31 #include "localedef.h"
  32 #include "parser.tab.h"
  33 #include "collatefile.h"
  34
  35 /*
  36  * Design notes.
  37  *
  38  * It will be extremely helpful to the reader if they have access to
  39  * the localedef and locale file format specifications available.
  40  * Latest versions of these are available from www.opengroup.org.
  41  *
  42  * The design for the collation code is a bit complex.  The goal is a
  43  * single collation database as described in collate.h (in
  44  * libc/port/locale).  However, there are some other tidbits:
  45  *
  46  * a) The substitution entries are now a directly indexable array.  A
  47  * priority elsewhere in the table is taken as an index into the
  48  * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
  49  * set.  (The bit is cleared and the result is the index into the
  50  * table.
  51  *
  52  * b) We eliminate duplicate entries into the substitution table.
  53  * This saves a lot of space.
  54  *
  55  * c) The priorities for each level are "compressed", so that each
  56  * sorting level has consecutively numbered priorities starting at 1.
  57  * (O is reserved for the ignore priority.)  This means sort levels
  58  * which only have a few distinct priorities can represent the
  59  * priority level in fewer bits, which makes the strxfrm output
  60  * smaller.
  61  *
  62  * d) We record the total number of priorities so that strxfrm can
  63  * figure out how many bytes to expand a numeric priority into.
  64  *
  65  * e) For the UNDEFINED pass (the last pass), we record the maximum
  66  * number of bits needed to uniquely prioritize these entries, so that
  67  * the last pass can also use smaller strxfrm output when possible.
  68  *
  69  * f) Priorities with the sign bit set are verboten.  This works out
  70  * because no active character set needs that bit to carry significant
  71  * information once the character is in wide form.
  72  *
  73  * To process the entire data to make the database, we actually run
  74  * multiple passes over the data.
  75  *
  76  * The first pass, which is done at parse time, identifies elements,
  77  * substitutions, and such, and records them in priority order.  As
  78  * some priorities can refer to other priorities, using forward
  79  * references, we use a table of references indicating whether the
  80  * priority's value has been resolved, or whether it is still a
  81  * reference.
  82  *
  83  * The second pass walks over all the items in priority order, noting
  84  * that they are used directly, and not just an indirect reference.
  85  * This is done by creating a "weight" structure for the item.  The
  86  * weights are stashed in an AVL tree sorted by relative "priority".
  87  *
  88  * The third pass walks over all the weight structures, in priority
  89  * order, and assigns a new monotonically increasing (per sort level)
  90  * weight value to them.  These are the values that will actually be
  91  * written to the file.
  92  *
  93  * The fourth pass just writes the data out.
  94  */
  95
  96 /*
  97  * In order to resolve the priorities, we create a table of priorities.
  98  * Entries in the table can be in one of three states.
  99  *
 100  * UNKNOWN is for newly allocated entries, and indicates that nothing
 101  * is known about the priority.  (For example, when new entries are created
 102  * for collating-symbols, this is the value assigned for them until the
 103  * collating symbol's order has been determined.
 104  *
 105  * RESOLVED is used for an entry where the priority indicates the final
 106  * numeric weight.
 107  *
 108  * REFER is used for entries that reference other entries.  Typically
 109  * this is used for forward references.  A collating-symbol can never
 110  * have this value.
 111  *
 112  * The "pass" field is used during final resolution to aid in detection
 113  * of referencing loops.  (For example <A> depends on <B>, but <B> has its
 114  * priority dependent on <A>.)
 115  */
 116 typedef enum {
 117         UNKNOWN,        /* priority is totally unknown */
 118         RESOLVED,       /* priority value fully resolved */
 119         REFER           /* priority is a reference (index) */
 120 } res_t;
 121
 122 typedef struct weight {
 123         int32_t         pri;
 124         int             opt;
 125         avl_node_t      avl;
 126 } weight_t;
 127
 128 typedef struct priority {
 129         res_t           res;
 130         int32_t         pri;
 131         int             pass;
 132         int             lineno;
 133 } collpri_t;
 134
 135 #define NUM_WT  collinfo.directive_count
 136
 137 /*
 138  * These are the abstract collating symbols, which are just a symbolic
 139  * way to reference a priority.
 140  */
 141 struct collsym {
 142         char            *name;
 143         int32_t         ref;
 144         avl_node_t      avl;
 145 };
 146
 147 /*
 148  * These are also abstract collating symbols, but we allow them to have
 149  * different priorities at different levels.
 150  */
 151 typedef struct collundef {
 152         char            *name;
 153         int32_t         ref[COLL_WEIGHTS_MAX];
 154         avl_node_t      avl;
 155 } collundef_t;
 156
 157 /*
 158  * These are called "chains" in libc.  This records the fact that two
 159  * more characters should be treated as a single collating entity when
 160  * they appear together.  For example, in Spanish <C><h> gets collated
 161  * as a character between <C> and <D>.
 162  */
 163 struct collelem {
 164         char            *symbol;
 165         wchar_t         *expand;
 166         int32_t         ref[COLL_WEIGHTS_MAX];
 167         avl_node_t      avl_bysymbol;
 168         avl_node_t      avl_byexpand;
 169 };
 170
 171 /*
 172  * Individual characters have a sequence of weights as well.
 173  */
 174 typedef struct collchar {
 175         wchar_t         wc;
 176         int32_t         ref[COLL_WEIGHTS_MAX];
 177         avl_node_t      avl;
 178 } collchar_t;
 179
 180 /*
 181  * Substitution entries.  The key is itself a priority.  Note that
 182  * when we create one of these, we *automatically* wind up with a
 183  * fully resolved priority for the key, because creation of
 184  * substitutions creates a resolved priority at the same time.
 185  */
 186 typedef struct {
 187         int32_t         key;
 188         int32_t         ref[COLLATE_STR_LEN];
 189         avl_node_t      avl;
 190         avl_node_t      avl_ref;
 191 } subst_t;
 192
 193 static avl_tree_t       collsyms;
 194 static avl_tree_t       collundefs;
 195 static avl_tree_t       elem_by_symbol;
 196 static avl_tree_t       elem_by_expand;
 197 static avl_tree_t       collchars;
 198 static avl_tree_t       substs[COLL_WEIGHTS_MAX];
 199 static avl_tree_t       substs_ref[COLL_WEIGHTS_MAX];
 200 static avl_tree_t       weights[COLL_WEIGHTS_MAX];
 201 static int32_t          nweight[COLL_WEIGHTS_MAX];
 202
 203 /*
 204  * This is state tracking for the ellipsis token.  Note that we start
 205  * the initial values so that the ellipsis logic will think we got a
 206  * magic starting value of NUL.  It starts at minus one because the
 207  * starting point is exclusive -- i.e. the starting point is not
 208  * itself handled by the ellipsis code.
 209  */
 210 static int currorder = EOF;
 211 static int lastorder = EOF;
 212 static collelem_t *currelem;
 213 static collchar_t *currchar;
 214 static collundef_t *currundef;
 215 static wchar_t ellipsis_start = 0;
 216 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX];
 217
 218 /*
 219  * We keep a running tally of weights.
 220  */
 221 static int nextpri = 1;
 222 static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
 223
 224 /*
 225  * This array collects up the weights for each level.
 226  */
 227 static int32_t order_weights[COLL_WEIGHTS_MAX];
 228 static int curr_weight = 0;
 229 static int32_t subst_weights[COLLATE_STR_LEN];
 230 static int curr_subst = 0;
 231
 232 /*
 233  * Some initial priority values.
 234  */
 235 static int32_t pri_undefined[COLL_WEIGHTS_MAX];
 236 static int32_t pri_ignore;
 237
 238 static collate_info_t collinfo;
 239
 240 static collpri_t        *prilist = NULL;
 241 static int              numpri = 0;
 242 static int              maxpri = 0;
 243
 244 static void start_order(int);
 245
 246 static int32_t
 247 new_pri(void)
 248 {
 249         int i;
 250
 251         if (numpri >= maxpri) {
 252                 maxpri = maxpri ? maxpri * 2 : 1024;
 253                 prilist = realloc(prilist, sizeof (collpri_t) * maxpri);
 254                 if (prilist == NULL) {
 255                         errf(_("out of memory"));
 256                         return (-1);
 257                 }
 258                 for (i = numpri; i < maxpri; i++) {
 259                         prilist[i].res = UNKNOWN;
 260                         prilist[i].pri = 0;
 261                         prilist[i].pass = 0;
 262                 }
 263         }
 264         return (numpri++);
 265 }
 266
 267 static collpri_t *
 268 get_pri(int32_t ref)
 269 {
 270         if ((ref < 0) || (ref > numpri)) {
 271                 INTERR;
 272                 return (NULL);
 273         }
 274         return (&prilist[ref]);
 275 }
 276
 277 static void
 278 set_pri(int32_t ref, int32_t v, res_t res)
 279 {
 280         collpri_t       *pri;
 281
 282         pri = get_pri(ref);
 283
 284         if ((res == REFER) && ((v < 0) || (v >= numpri))) {
 285                 INTERR;
 286         }
 287
 288         /* Resolve self references */
 289         if ((res == REFER) && (ref == v)) {
 290                 v = nextpri;
 291                 res = RESOLVED;
 292         }
 293
 294         if (pri->res != UNKNOWN) {
 295                 warn(_("repeated item in order list (first on %d)"),
 296                     pri->lineno);
 297                 return;
 298         }
 299         pri->lineno = lineno;
 300         pri->pri = v;
 301         pri->res = res;
 302 }
 303
 304 static int32_t
 305 resolve_pri(int32_t ref)
 306 {
 307         collpri_t       *pri;
 308         static int32_t  pass = 0;
 309
 310         pri = get_pri(ref);
 311         pass++;
 312         while (pri->res == REFER) {
 313                 if (pri->pass == pass) {
 314                         /* report a line with the circular symbol */
 315                         lineno = pri->lineno;
 316                         errf(_("circular reference in order list"));
 317                         return (-1);
 318                 }
 319                 if ((pri->pri < 0) || (pri->pri >= numpri)) {
 320                         INTERR;
 321                         return (-1);
 322                 }
 323                 pri->pass = pass;
 324                 pri = &prilist[pri->pri];
 325         }
 326
 327         if (pri->res == UNKNOWN) {
 328                 return (-1);
 329         }
 330         if (pri->res != RESOLVED)
 331                 INTERR;
 332
 333         return (pri->pri);
 334 }
 335
 336 static int
 337 weight_compare(const void *n1, const void *n2)
 338 {
 339         int32_t k1 = ((const weight_t *)n1)->pri;
 340         int32_t k2 = ((const weight_t *)n2)->pri;
 341
 342         return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
 343 }
 344
 345 static int
 346 collsym_compare(const void *n1, const void *n2)
 347 {
 348         const collsym_t *c1 = n1;
 349         const collsym_t *c2 = n2;
 350         int rv;
 351
 352         rv = strcmp(c1->name, c2->name);
 353         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 354 }
 355
 356 static int
 357 collundef_compare(const void *n1, const void *n2)
 358 {
 359         const collundef_t *c1 = n1;
 360         const collundef_t *c2 = n2;
 361         int rv;
 362
 363         rv = strcmp(c1->name, c2->name);
 364         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 365 }
 366
 367 static int
 368 element_compare_symbol(const void *n1, const void *n2)
 369 {
 370         const collelem_t *c1 = n1;
 371         const collelem_t *c2 = n2;
 372         int rv;
 373
 374         rv = strcmp(c1->symbol, c2->symbol);
 375         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 376 }
 377
 378 static int
 379 element_compare_expand(const void *n1, const void *n2)
 380 {
 381         const collelem_t *c1 = n1;
 382         const collelem_t *c2 = n2;
 383         int rv;
 384
 385         rv = wcscmp(c1->expand, c2->expand);
 386         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 387 }
 388
 389 static int
 390 collchar_compare(const void *n1, const void *n2)
 391 {
 392         wchar_t k1 = ((const collchar_t *)n1)->wc;
 393         wchar_t k2 = ((const collchar_t *)n2)->wc;
 394
 395         return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
 396 }
 397
 398 static int
 399 subst_compare(const void *n1, const void *n2)
 400 {
 401         int32_t k1 = ((const subst_t *)n1)->key;
 402         int32_t k2 = ((const subst_t *)n2)->key;
 403
 404         return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
 405 }
 406
 407 static int
 408 subst_compare_ref(const void *n1, const void *n2)
 409 {
 410         int32_t *c1 = ((subst_t *)n1)->ref;
 411         int32_t *c2 = ((subst_t *)n2)->ref;
 412         int rv;
 413
 414         rv = wcscmp((wchar_t *)c1, (wchar_t *)c2);
 415         return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
 416 }
 417
 418 void
 419 init_collate(void)
 420 {
 421         int i;
 422
 423         avl_create(&collsyms, collsym_compare, sizeof (collsym_t),
 424             offsetof(collsym_t, avl));
 425
 426         avl_create(&collundefs, collundef_compare, sizeof (collsym_t),
 427             offsetof(collundef_t, avl));
 428
 429         avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t),
 430             offsetof(collelem_t, avl_bysymbol));
 431         avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t),
 432             offsetof(collelem_t, avl_byexpand));
 433
 434         avl_create(&collchars, collchar_compare, sizeof (collchar_t),
 435             offsetof(collchar_t, avl));
 436
 437         for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
 438                 avl_create(&substs[i], subst_compare, sizeof (subst_t),
 439                     offsetof(subst_t, avl));
 440                 avl_create(&substs_ref[i], subst_compare_ref,
 441                     sizeof (subst_t), offsetof(subst_t, avl_ref));
 442                 avl_create(&weights[i], weight_compare, sizeof (weight_t),
 443                     offsetof(weight_t, avl));
 444                 nweight[i] = 1;
 445         }
 446
 447         (void) memset(&collinfo, 0, sizeof (collinfo));
 448
 449         /* allocate some initial priorities */
 450         pri_ignore = new_pri();
 451
 452         set_pri(pri_ignore, 0, RESOLVED);
 453
 454         for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
 455                 pri_undefined[i] = new_pri();
 456
 457                 /* we will override this later */
 458                 set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN);
 459         }
 460 }
 461
 462 void
 463 define_collsym(char *name)
 464 {
 465         collsym_t       *sym;
 466         avl_index_t     where;
 467
 468         if ((sym = calloc(1, sizeof (*sym))) == NULL) {
 469                 errf(_("out of memory"));
 470                 return;
 471         }
 472         sym->name = name;
 473         sym->ref = new_pri();
 474
 475         if (avl_find(&collsyms, sym, &where) != NULL) {
 476                 /*
 477                  * This should never happen because we are only called
 478                  * for undefined symbols.
 479                  */
 480                 free(sym);
 481                 INTERR;
 482                 return;
 483         }
 484         avl_insert(&collsyms, sym, where);
 485 }
 486
 487 collsym_t *
 488 lookup_collsym(char *name)
 489 {
 490         collsym_t       srch;
 491
 492         srch.name = name;
 493         return (avl_find(&collsyms, &srch, NULL));
 494 }
 495
 496 collelem_t *
 497 lookup_collelem(char *symbol)
 498 {
 499         collelem_t      srch;
 500
 501         srch.symbol = symbol;
 502         return (avl_find(&elem_by_symbol, &srch, NULL));
 503 }
 504
 505 static collundef_t *
 506 get_collundef(char *name)
 507 {
 508         collundef_t     srch;
 509         collundef_t     *ud;
 510         avl_index_t     where;
 511         int             i;
 512
 513         srch.name = name;
 514         if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) {
 515                 if (((ud = calloc(1, sizeof (*ud))) == NULL) ||
 516                     ((ud->name = strdup(name)) == NULL)) {
 517                         errf(_("out of memory"));
 518                         free(ud);
 519                         return (NULL);
 520                 }
 521                 for (i = 0; i < NUM_WT; i++) {
 522                         ud->ref[i] = new_pri();
 523                 }
 524                 avl_insert(&collundefs, ud, where);
 525         }
 526         add_charmap_undefined(name);
 527         return (ud);
 528 }
 529
 530 static collchar_t *
 531 get_collchar(wchar_t wc, int create)
 532 {
 533         collchar_t      srch;
 534         collchar_t      *cc;
 535         avl_index_t     where;
 536         int             i;
 537
 538         srch.wc = wc;
 539         cc = avl_find(&collchars, &srch, &where);
 540         if ((cc == NULL) && create) {
 541                 if ((cc = calloc(1, sizeof (*cc))) == NULL) {
 542                         errf(_("out of memory"));
 543                         return (NULL);
 544                 }
 545                 for (i = 0; i < NUM_WT; i++) {
 546                         cc->ref[i] = new_pri();
 547                 }
 548                 cc->wc = wc;
 549                 avl_insert(&collchars, cc, where);
 550         }
 551         return (cc);
 552 }
 553
 554 void
 555 end_order_collsym(collsym_t *sym)
 556 {
 557         start_order(T_COLLSYM);
 558         /* update the weight */
 559
 560         set_pri(sym->ref, nextpri, RESOLVED);
 561         nextpri++;
 562 }
 563
 564 void
 565 end_order(void)
 566 {
 567         int             i;
 568         int32_t         pri;
 569         int32_t         ref;
 570         collpri_t       *p;
 571
 572         /* advance the priority/weight */
 573         pri = nextpri;
 574
 575         switch (currorder) {
 576         case T_CHAR:
 577                 for (i = 0; i < NUM_WT; i++) {
 578                         if (((ref = order_weights[i]) < 0) ||
 579                             ((p = get_pri(ref)) == NULL) ||
 580                             (p->pri == -1)) {
 581                                 /* unspecified weight is a self reference */
 582                                 set_pri(currchar->ref[i], pri, RESOLVED);
 583                         } else {
 584                                 set_pri(currchar->ref[i], ref, REFER);
 585                         }
 586                         order_weights[i] = -1;
 587                 }
 588
 589                 /* leave a cookie trail in case next symbol is ellipsis */
 590                 ellipsis_start = currchar->wc + 1;
 591                 currchar = NULL;
 592                 break;
 593
 594         case T_ELLIPSIS:
 595                 /* save off the weights were we can find them */
 596                 for (i = 0; i < NUM_WT; i++) {
 597                         ellipsis_weights[i] = order_weights[i];
 598                         order_weights[i] = -1;
 599                 }
 600                 break;
 601
 602         case T_COLLELEM:
 603                 if (currelem == NULL) {
 604                         INTERR;
 605                 } else {
 606                         for (i = 0; i < NUM_WT; i++) {
 607
 608                                 if (((ref = order_weights[i]) < 0) ||
 609                                     ((p = get_pri(ref)) == NULL) ||
 610                                     (p->pri == -1)) {
 611                                         set_pri(currelem->ref[i], pri,
 612                                             RESOLVED);
 613                                 } else {
 614                                         set_pri(currelem->ref[i], ref, REFER);
 615                                 }
 616                                 order_weights[i] = -1;
 617                         }
 618                 }
 619                 break;
 620
 621         case T_UNDEFINED:
 622                 for (i = 0; i < NUM_WT; i++) {
 623                         if (((ref = order_weights[i]) < 0) ||
 624                             ((p = get_pri(ref)) == NULL) ||
 625                             (p->pri == -1)) {
 626                                 set_pri(pri_undefined[i], -1, RESOLVED);
 627                         } else {
 628                                 set_pri(pri_undefined[i], ref, REFER);
 629                         }
 630                         order_weights[i] = -1;
 631                 }
 632                 break;
 633
 634         case T_SYMBOL:
 635                 for (i = 0; i < NUM_WT; i++) {
 636                         if (((ref = order_weights[i]) < 0) ||
 637                             ((p = get_pri(ref)) == NULL) ||
 638                             (p->pri == -1)) {
 639                                 set_pri(currundef->ref[i], pri, RESOLVED);
 640                         } else {
 641                                 set_pri(currundef->ref[i], ref, REFER);
 642                         }
 643                         order_weights[i] = -1;
 644                 }
 645                 break;
 646
 647         default:
 648                 INTERR;
 649         }
 650
 651         nextpri++;
 652 }
 653
 654 static void
 655 start_order(int type)
 656 {
 657         int     i;
 658
 659         lastorder = currorder;
 660         currorder = type;
 661
 662         /* this is used to protect ELLIPSIS processing */
 663         if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) {
 664                 errf(_("character value expected"));
 665         }
 666
 667         for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
 668                 order_weights[i] = -1;
 669         }
 670         curr_weight = 0;
 671 }
 672
 673 void
 674 start_order_undefined(void)
 675 {
 676         start_order(T_UNDEFINED);
 677 }
 678
 679 void
 680 start_order_symbol(char *name)
 681 {
 682         currundef = get_collundef(name);
 683         start_order(T_SYMBOL);
 684 }
 685
 686 void
 687 start_order_char(wchar_t wc)
 688 {
 689         collchar_t      *cc;
 690         int32_t         ref;
 691
 692         start_order(T_CHAR);
 693
 694         /*
 695          * If we last saw an ellipsis, then we need to close the range.
 696          * Handle that here.  Note that we have to be careful because the
 697          * items *inside* the range are treated exclusiveley to the items
 698          * outside of the range.  The ends of the range can have quite
 699          * different weights than the range members.
 700          */
 701         if (lastorder == T_ELLIPSIS) {
 702                 int             i;
 703
 704                 if (wc < ellipsis_start) {
 705                         errf(_("malformed range!"));
 706                         return;
 707                 }
 708                 while (ellipsis_start < wc) {
 709                         /*
 710                          * pick all of the saved weights for the
 711                          * ellipsis.  note that -1 encodes for the
 712                          * ellipsis itself, which means to take the
 713                          * current relative priority.
 714                          */
 715                         if ((cc = get_collchar(ellipsis_start, 1)) == NULL) {
 716                                 INTERR;
 717                                 return;
 718                         }
 719                         for (i = 0; i < NUM_WT; i++) {
 720                                 collpri_t *p;
 721                                 if (((ref = ellipsis_weights[i]) == -1) ||
 722                                     ((p = get_pri(ref)) == NULL) ||
 723                                     (p->pri == -1)) {
 724                                         set_pri(cc->ref[i], nextpri, RESOLVED);
 725                                 } else {
 726                                         set_pri(cc->ref[i], ref, REFER);
 727                                 }
 728                                 ellipsis_weights[i] = 0;
 729                         }
 730                         ellipsis_start++;
 731                         nextpri++;
 732                 }
 733         }
 734
 735         currchar = get_collchar(wc, 1);
 736 }
 737
 738 void
 739 start_order_collelem(collelem_t *e)
 740 {
 741         start_order(T_COLLELEM);
 742         currelem = e;
 743 }
 744
 745 void
 746 start_order_ellipsis(void)
 747 {
 748         int     i;
 749
 750         start_order(T_ELLIPSIS);
 751
 752         if (lastorder != T_CHAR) {
 753                 errf(_("illegal starting point for range"));
 754                 return;
 755         }
 756
 757         for (i = 0; i < NUM_WT; i++) {
 758                 ellipsis_weights[i] = order_weights[i];
 759         }
 760 }
 761
 762 void
 763 define_collelem(char *name, wchar_t *wcs)
 764 {
 765         collelem_t      *e;
 766         avl_index_t     where1;
 767         avl_index_t     where2;
 768         int             i;
 769
 770         if (wcslen(wcs) >= COLLATE_STR_LEN) {
 771                 errf(_("expanded collation element too long"));
 772                 return;
 773         }
 774
 775         if ((e = calloc(1, sizeof (*e))) == NULL) {
 776                 errf(_("out of memory"));
 777                 return;
 778         }
 779         e->expand = wcs;
 780         e->symbol = name;
 781
 782         /*
 783          * This is executed before the order statement, so we don't
 784          * know how many priorities we *really* need.  We allocate one
 785          * for each possible weight.  Not a big deal, as collating-elements
 786          * prove to be quite rare.
 787          */
 788         for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
 789                 e->ref[i] = new_pri();
 790         }
 791
 792         /* A character sequence can only reduce to one element. */
 793         if ((avl_find(&elem_by_symbol, e, &where1) != NULL) ||
 794             (avl_find(&elem_by_expand, e, &where2) != NULL)) {
 795                 errf(_("duplicate collating element definition"));
 796                 free(e);
 797                 return;
 798         }
 799         avl_insert(&elem_by_symbol, e, where1);
 800         avl_insert(&elem_by_expand, e, where2);
 801 }
 802
 803 void
 804 add_order_bit(int kw)
 805 {
 806         uint8_t bit = DIRECTIVE_UNDEF;
 807
 808         switch (kw) {
 809         case T_FORWARD:
 810                 bit = DIRECTIVE_FORWARD;
 811                 break;
 812         case T_BACKWARD:
 813                 bit = DIRECTIVE_BACKWARD;
 814                 break;
 815         case T_POSITION:
 816                 bit = DIRECTIVE_POSITION;
 817                 break;
 818         default:
 819                 INTERR;
 820                 break;
 821         }
 822         collinfo.directive[collinfo.directive_count] |= bit;
 823 }
 824
 825 void
 826 add_order_directive(void)
 827 {
 828         if (collinfo.directive_count >= COLL_WEIGHTS_MAX) {
 829                 errf(_("too many directives (max %d)"), COLL_WEIGHTS_MAX);
 830         }
 831         collinfo.directive_count++;
 832 }
 833
 834 static void
 835 add_order_pri(int32_t ref)
 836 {
 837         if (curr_weight >= NUM_WT) {
 838                 errf(_("too many weights (max %d)"), NUM_WT);
 839                 return;
 840         }
 841         order_weights[curr_weight] = ref;
 842         curr_weight++;
 843 }
 844
 845 void
 846 add_order_collsym(collsym_t *s)
 847 {
 848         add_order_pri(s->ref);
 849 }
 850
 851 void
 852 add_order_char(wchar_t wc)
 853 {
 854         collchar_t *cc;
 855
 856         if ((cc = get_collchar(wc, 1)) == NULL) {
 857                 INTERR;
 858                 return;
 859         }
 860
 861         add_order_pri(cc->ref[curr_weight]);
 862 }
 863
 864 void
 865 add_order_collelem(collelem_t *e)
 866 {
 867         add_order_pri(e->ref[curr_weight]);
 868 }
 869
 870 void
 871 add_order_ignore(void)
 872 {
 873         add_order_pri(pri_ignore);
 874 }
 875
 876 void
 877 add_order_symbol(char *sym)
 878 {
 879         collundef_t *c;
 880         if ((c = get_collundef(sym)) == NULL) {
 881                 INTERR;
 882                 return;
 883         }
 884         add_order_pri(c->ref[curr_weight]);
 885 }
 886
 887 void
 888 add_order_ellipsis(void)
 889 {
 890         /* special 0 value indicates self reference */
 891         add_order_pri(0);
 892 }
 893
 894 void
 895 add_order_subst(void)
 896 {
 897         subst_t srch;
 898         subst_t *s;
 899         avl_index_t where;
 900         int i;
 901
 902         (void) memset(&srch, 0, sizeof (srch));
 903         for (i = 0; i < curr_subst; i++) {
 904                 srch.ref[i] = subst_weights[i];
 905                 subst_weights[i] = 0;
 906         }
 907         s = avl_find(&substs_ref[curr_weight], &srch, &where);
 908
 909         if (s == NULL) {
 910                 if ((s = calloc(1, sizeof (*s))) == NULL) {
 911                         errf(_("out of memory"));
 912                         return;
 913                 }
 914                 s->key = new_pri();
 915
 916                 /*
 917                  * We use a self reference for our key, but we set a
 918                  * high bit to indicate that this is a substitution
 919                  * reference.  This will expedite table lookups later,
 920                  * and prevent table lookups for situations that don't
 921                  * require it.  (In short, its a big win, because we
 922                  * can skip a lot of binary searching.)
 923                  */
 924                 set_pri(s->key,
 925                     (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
 926                     RESOLVED);
 927                 nextsubst[curr_weight] += 1;
 928
 929                 for (i = 0; i < curr_subst; i++) {
 930                         s->ref[i] = srch.ref[i];
 931                 }
 932
 933                 avl_insert(&substs_ref[curr_weight], s, where);
 934
 935                 if (avl_find(&substs[curr_weight], s, &where) != NULL) {
 936                         INTERR;
 937                         return;
 938                 }
 939                 avl_insert(&substs[curr_weight], s, where);
 940         }
 941         curr_subst = 0;
 942
 943
 944         /*
 945          * We are using the current (unique) priority as a search key
 946          * in the substitution table.
 947          */
 948         add_order_pri(s->key);
 949 }
 950
 951 static void
 952 add_subst_pri(int32_t ref)
 953 {
 954         if (curr_subst >= COLLATE_STR_LEN) {
 955                 errf(_("substitution string is too long"));
 956                 return;
 957         }
 958         subst_weights[curr_subst] = ref;
 959         curr_subst++;
 960 }
 961
 962 void
 963 add_subst_char(wchar_t wc)
 964 {
 965         collchar_t *cc;
 966
 967
 968         if (((cc = get_collchar(wc, 1)) == NULL) ||
 969             (cc->wc != wc)) {
 970                 INTERR;
 971                 return;
 972         }
 973         /* we take the weight for the character at that position */
 974         add_subst_pri(cc->ref[curr_weight]);
 975 }
 976
 977 void
 978 add_subst_collelem(collelem_t *e)
 979 {
 980         add_subst_pri(e->ref[curr_weight]);
 981 }
 982
 983 void
 984 add_subst_collsym(collsym_t *s)
 985 {
 986         add_subst_pri(s->ref);
 987 }
 988
 989 void
 990 add_subst_symbol(char *ptr)
 991 {
 992         collundef_t *cu;
 993
 994         if ((cu = get_collundef(ptr)) != NULL) {
 995                 add_subst_pri(cu->ref[curr_weight]);
 996         }
 997 }
 998
 999 void
1000 add_weight(int32_t ref, int pass)
1001 {
1002         weight_t srch;
1003         weight_t *w;
1004         avl_index_t where;
1005
1006         srch.pri = resolve_pri(ref);
1007
1008         /* No translation of ignores */
1009         if (srch.pri == 0)
1010                 return;
1011
1012         /* Substitution priorities are not weights */
1013         if (srch.pri & COLLATE_SUBST_PRIORITY)
1014                 return;
1015
1016         if (avl_find(&weights[pass], &srch, &where) != NULL)
1017                 return;
1018
1019         if ((w = calloc(1, sizeof (*w))) == NULL) {
1020                 errf(_("out of memory"));
1021                 return;
1022         }
1023         w->pri = srch.pri;
1024         avl_insert(&weights[pass], w, where);
1025 }
1026
1027 void
1028 add_weights(int32_t *refs)
1029 {
1030         int i;
1031         for (i = 0; i < NUM_WT; i++) {
1032                 add_weight(refs[i], i);
1033         }
1034 }
1035
1036 int32_t
1037 get_weight(int32_t ref, int pass)
1038 {
1039         weight_t        srch;
1040         weight_t        *w;
1041         int32_t         pri;
1042
1043         pri = resolve_pri(ref);
1044         if (pri & COLLATE_SUBST_PRIORITY) {
1045                 return (pri);
1046         }
1047         if (pri <= 0) {
1048                 return (pri);
1049         }
1050         srch.pri = pri;
1051         if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) {
1052                 INTERR;
1053                 return (-1);
1054         }
1055         return (w->opt);
1056 }
1057
1058 void
1059 dump_collate(void)
1060 {
1061         FILE                    *f;
1062         int                     i, j, n;
1063         size_t                  sz;
1064         int32_t                 pri;
1065         collelem_t              *ce;
1066         collchar_t              *cc;
1067         subst_t                 *sb;
1068         char                    vers[COLLATE_STR_LEN];
1069         collate_char_t          chars[UCHAR_MAX + 1];
1070         collate_large_t         *large;
1071         collate_subst_t         *subst[COLL_WEIGHTS_MAX];
1072         collate_chain_t         *chain;
1073
1074         /*
1075          * We have to run through a preliminary pass to identify all the
1076          * weights that we use for each sorting level.
1077          */
1078         for (i = 0; i < NUM_WT; i++) {
1079                 add_weight(pri_ignore, i);
1080         }
1081         for (i = 0; i < NUM_WT; i++) {
1082                 for (sb = avl_first(&substs[i]); sb;
1083                     sb = AVL_NEXT(&substs[i], sb)) {
1084                         for (j = 0; sb->ref[j]; j++) {
1085                                 add_weight(sb->ref[j], i);
1086                         }
1087                 }
1088         }
1089         for (ce = avl_first(&elem_by_expand);
1090             ce != NULL;
1091             ce = AVL_NEXT(&elem_by_expand, ce)) {
1092                 add_weights(ce->ref);
1093         }
1094         for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1095                 add_weights(cc->ref);
1096         }
1097
1098         /*
1099          * Now we walk the entire set of weights, removing the gaps
1100          * in the weights.  This gives us optimum usage.  The walk
1101          * occurs in priority.
1102          */
1103         for (i = 0; i < NUM_WT; i++) {
1104                 weight_t *w;
1105                 for (w = avl_first(&weights[i]); w;
1106                     w = AVL_NEXT(&weights[i], w)) {
1107                         w->opt = nweight[i];
1108                         nweight[i] += 1;
1109                 }
1110         }
1111
1112         (void) memset(&chars, 0, sizeof (chars));
1113         (void) memset(vers, 0, COLLATE_STR_LEN);
1114         (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
1115
1116         /*
1117          * We need to make sure we arrange for the UNDEFINED field
1118          * to show up.  Also, set the total weight counts.
1119          */
1120         for (i = 0; i < NUM_WT; i++) {
1121                 if (resolve_pri(pri_undefined[i]) == -1) {
1122                         set_pri(pri_undefined[i], -1, RESOLVED);
1123                         /* they collate at the end of everything else */
1124                         collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
1125                 }
1126                 collinfo.pri_count[i] = nweight[i];
1127         }
1128
1129         collinfo.pri_count[NUM_WT] = max_wide();
1130         collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
1131         collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
1132
1133         /*
1134          * Ordinary character priorities
1135          */
1136         for (i = 0; i <= UCHAR_MAX; i++) {
1137                 if ((cc = get_collchar(i, 0)) != NULL) {
1138                         for (j = 0; j < NUM_WT; j++) {
1139                                 chars[i].pri[j] = get_weight(cc->ref[j], j);
1140                         }
1141                 } else {
1142                         for (j = 0; j < NUM_WT; j++) {
1143                                 chars[i].pri[j] =
1144                                     get_weight(pri_undefined[j], j);
1145                         }
1146                         /*
1147                          * Per POSIX, for undefined characters, we
1148                          * also have to add a last item, which is the
1149                          * character code.
1150                          */
1151                         chars[i].pri[NUM_WT] = i;
1152                 }
1153         }
1154
1155         /*
1156          * Substitution tables
1157          */
1158         for (i = 0; i < NUM_WT; i++) {
1159                 collate_subst_t *st = NULL;
1160                 n = collinfo.subst_count[i] = avl_numnodes(&substs[i]);
1161                 if ((st = calloc(n, sizeof (collate_subst_t))) == NULL) {
1162                         errf(_("out of memory"));
1163                         return;
1164                 }
1165                 n = 0;
1166                 for (sb = avl_first(&substs[i]); sb;
1167                     sb = AVL_NEXT(&substs[i], sb)) {
1168                         if ((st[n].key = resolve_pri(sb->key)) < 0) {
1169                                 /* by definition these resolve! */
1170                                 INTERR;
1171                         }
1172                         if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
1173                                 INTERR;
1174                         }
1175                         for (j = 0; sb->ref[j]; j++) {
1176                                 st[n].pri[j] = get_weight(sb->ref[j], i);
1177                         }
1178                         n++;
1179                 }
1180                 if (n != collinfo.subst_count[i])
1181                         INTERR;
1182                 subst[i] = st;
1183         }
1184
1185
1186         /*
1187          * Chains, i.e. collating elements
1188          */
1189         collinfo.chain_count = avl_numnodes(&elem_by_expand);
1190         chain = calloc(collinfo.chain_count, sizeof (collate_chain_t));
1191         if (chain == NULL) {
1192                 errf(_("out of memory"));
1193                 return;
1194         }
1195         for (n = 0, ce = avl_first(&elem_by_expand);
1196             ce != NULL;
1197             ce = AVL_NEXT(&elem_by_expand, ce), n++) {
1198                 (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
1199                 for (i = 0; i < NUM_WT; i++) {
1200                         chain[n].pri[i] = get_weight(ce->ref[i], i);
1201                 }
1202         }
1203         if (n != collinfo.chain_count)
1204                 INTERR;
1205
1206         /*
1207          * Large (> UCHAR_MAX) character priorities
1208          */
1209         large = calloc(avl_numnodes(&collchars), sizeof (collate_large_t));
1210         if (large == NULL) {
1211                 errf(_("out of memory"));
1212                 return;
1213         }
1214
1215         i = 0;
1216         for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1217                 int     undef = 0;
1218                 /* we already gathered those */
1219                 if (cc->wc <= UCHAR_MAX)
1220                         continue;
1221                 for (j = 0; j < NUM_WT; j++) {
1222                         if ((pri = get_weight(cc->ref[j], j)) < 0) {
1223                                 undef = 1;
1224                         }
1225                         if (undef && (pri >= 0)) {
1226                                 /* if undefined, then all priorities are */
1227                                 INTERR;
1228                         } else {
1229                                 large[i].pri.pri[j] = pri;
1230                         }
1231                 }
1232                 if (!undef) {
1233                         large[i].val = cc->wc;
1234                         collinfo.large_count = i++;
1235                 }
1236         }
1237
1238         if ((f = open_category()) == NULL) {
1239                 return;
1240         }
1241
1242         /* Time to write the entire data set out */
1243
1244         if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
1245             (wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
1246             (wr_category(&chars, sizeof (chars), f) < 0)) {
1247                 delete_category(f);
1248                 return;
1249         }
1250
1251         for (i = 0; i < NUM_WT; i++) {
1252                 sz =  sizeof (collate_subst_t) * collinfo.subst_count[i];
1253                 if (wr_category(subst[i], sz, f) < 0) {
1254                         delete_category(f);
1255                         return;
1256                 }
1257         }
1258         sz = sizeof (collate_chain_t) * collinfo.chain_count;
1259         if (wr_category(chain, sz, f) < 0) {
1260                 delete_category(f);
1261                 return;
1262         }
1263         sz = sizeof (collate_large_t) * collinfo.large_count;
1264         if (wr_category(large, sz, f) < 0) {
1265                 delete_category(f);
1266                 return;
1267         }
1268
1269         close_category(f);
1270 }