2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
13 * Copyright 2017 Nexenta Systems, Inc.
17 * LC_COLLATE database generation routines for localedef.
24 #include <sys/types.h>
31 #include "localedef.h"
32 #include "parser.tab.h"
33 #include "collatefile.h"
38 * It will be extremely helpful to the reader if they have access to
39 * the localedef and locale file format specifications available.
40 * Latest versions of these are available from www.opengroup.org.
42 * The design for the collation code is a bit complex. The goal is a
43 * single collation database as described in collate.h (in
44 * libc/port/locale). However, there are some other tidbits:
46 * a) The substitution entries are now a directly indexable array. A
47 * priority elsewhere in the table is taken as an index into the
48 * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
49 * set. (The bit is cleared and the result is the index into the
52 * b) We eliminate duplicate entries into the substitution table.
53 * This saves a lot of space.
55 * c) The priorities for each level are "compressed", so that each
56 * sorting level has consecutively numbered priorities starting at 1.
57 * (O is reserved for the ignore priority.) This means sort levels
58 * which only have a few distinct priorities can represent the
59 * priority level in fewer bits, which makes the strxfrm output
62 * d) We record the total number of priorities so that strxfrm can
63 * figure out how many bytes to expand a numeric priority into.
65 * e) For the UNDEFINED pass (the last pass), we record the maximum
66 * number of bits needed to uniquely prioritize these entries, so that
67 * the last pass can also use smaller strxfrm output when possible.
69 * f) Priorities with the sign bit set are verboten. This works out
70 * because no active character set needs that bit to carry significant
71 * information once the character is in wide form.
73 * To process the entire data to make the database, we actually run
74 * multiple passes over the data.
76 * The first pass, which is done at parse time, identifies elements,
77 * substitutions, and such, and records them in priority order. As
78 * some priorities can refer to other priorities, using forward
79 * references, we use a table of references indicating whether the
80 * priority's value has been resolved, or whether it is still a
83 * The second pass walks over all the items in priority order, noting
84 * that they are used directly, and not just an indirect reference.
85 * This is done by creating a "weight" structure for the item. The
86 * weights are stashed in an AVL tree sorted by relative "priority".
88 * The third pass walks over all the weight structures, in priority
89 * order, and assigns a new monotonically increasing (per sort level)
90 * weight value to them. These are the values that will actually be
91 * written to the file.
93 * The fourth pass just writes the data out.
97 * In order to resolve the priorities, we create a table of priorities.
98 * Entries in the table can be in one of three states.
100 * UNKNOWN is for newly allocated entries, and indicates that nothing
101 * is known about the priority. (For example, when new entries are created
102 * for collating-symbols, this is the value assigned for them until the
103 * collating symbol's order has been determined.
105 * RESOLVED is used for an entry where the priority indicates the final
108 * REFER is used for entries that reference other entries. Typically
109 * this is used for forward references. A collating-symbol can never
112 * The "pass" field is used during final resolution to aid in detection
113 * of referencing loops. (For example <A> depends on <B>, but <B> has its
114 * priority dependent on <A>.)
117 UNKNOWN
, /* priority is totally unknown */
118 RESOLVED
, /* priority value fully resolved */
119 REFER
/* priority is a reference (index) */
122 typedef struct weight
{
128 typedef struct priority
{
135 #define NUM_WT collinfo.directive_count
138 * These are the abstract collating symbols, which are just a symbolic
139 * way to reference a priority.
148 * These are also abstract collating symbols, but we allow them to have
149 * different priorities at different levels.
151 typedef struct collundef
{
153 int32_t ref
[COLL_WEIGHTS_MAX
];
158 * These are called "chains" in libc. This records the fact that two
159 * more characters should be treated as a single collating entity when
160 * they appear together. For example, in Spanish <C><h> gets collated
161 * as a character between <C> and <D>.
166 int32_t ref
[COLL_WEIGHTS_MAX
];
167 avl_node_t avl_bysymbol
;
168 avl_node_t avl_byexpand
;
172 * Individual characters have a sequence of weights as well.
174 typedef struct collchar
{
176 int32_t ref
[COLL_WEIGHTS_MAX
];
181 * Substitution entries. The key is itself a priority. Note that
182 * when we create one of these, we *automatically* wind up with a
183 * fully resolved priority for the key, because creation of
184 * substitutions creates a resolved priority at the same time.
188 int32_t ref
[COLLATE_STR_LEN
];
193 static avl_tree_t collsyms
;
194 static avl_tree_t collundefs
;
195 static avl_tree_t elem_by_symbol
;
196 static avl_tree_t elem_by_expand
;
197 static avl_tree_t collchars
;
198 static avl_tree_t substs
[COLL_WEIGHTS_MAX
];
199 static avl_tree_t substs_ref
[COLL_WEIGHTS_MAX
];
200 static avl_tree_t weights
[COLL_WEIGHTS_MAX
];
201 static int32_t nweight
[COLL_WEIGHTS_MAX
];
204 * This is state tracking for the ellipsis token. Note that we start
205 * the initial values so that the ellipsis logic will think we got a
206 * magic starting value of NUL. It starts at minus one because the
207 * starting point is exclusive -- i.e. the starting point is not
208 * itself handled by the ellipsis code.
210 static int currorder
= EOF
;
211 static int lastorder
= EOF
;
212 static collelem_t
*currelem
;
213 static collchar_t
*currchar
;
214 static collundef_t
*currundef
;
215 static wchar_t ellipsis_start
= 0;
216 static int32_t ellipsis_weights
[COLL_WEIGHTS_MAX
];
219 * We keep a running tally of weights.
221 static int nextpri
= 1;
222 static int nextsubst
[COLL_WEIGHTS_MAX
] = { 0 };
225 * This array collects up the weights for each level.
227 static int32_t order_weights
[COLL_WEIGHTS_MAX
];
228 static int curr_weight
= 0;
229 static int32_t subst_weights
[COLLATE_STR_LEN
];
230 static int curr_subst
= 0;
233 * Some initial priority values.
235 static int32_t pri_undefined
[COLL_WEIGHTS_MAX
];
236 static int32_t pri_ignore
;
238 static collate_info_t collinfo
;
240 static collpri_t
*prilist
= NULL
;
241 static int numpri
= 0;
242 static int maxpri
= 0;
244 static void start_order(int);
251 if (numpri
>= maxpri
) {
252 maxpri
= maxpri
? maxpri
* 2 : 1024;
253 prilist
= realloc(prilist
, sizeof (collpri_t
) * maxpri
);
254 if (prilist
== NULL
) {
255 errf(_("out of memory"));
258 for (i
= numpri
; i
< maxpri
; i
++) {
259 prilist
[i
].res
= UNKNOWN
;
270 if ((ref
< 0) || (ref
> numpri
)) {
274 return (&prilist
[ref
]);
278 set_pri(int32_t ref
, int32_t v
, res_t res
)
284 if ((res
== REFER
) && ((v
< 0) || (v
>= numpri
))) {
288 /* Resolve self references */
289 if ((res
== REFER
) && (ref
== v
)) {
294 if (pri
->res
!= UNKNOWN
) {
295 warn(_("repeated item in order list (first on %d)"),
299 pri
->lineno
= lineno
;
305 resolve_pri(int32_t ref
)
308 static int32_t pass
= 0;
312 while (pri
->res
== REFER
) {
313 if (pri
->pass
== pass
) {
314 /* report a line with the circular symbol */
315 lineno
= pri
->lineno
;
316 errf(_("circular reference in order list"));
319 if ((pri
->pri
< 0) || (pri
->pri
>= numpri
)) {
324 pri
= &prilist
[pri
->pri
];
327 if (pri
->res
== UNKNOWN
) {
330 if (pri
->res
!= RESOLVED
)
337 weight_compare(const void *n1
, const void *n2
)
339 int32_t k1
= ((const weight_t
*)n1
)->pri
;
340 int32_t k2
= ((const weight_t
*)n2
)->pri
;
342 return (k1
< k2
? -1 : k1
> k2
? 1 : 0);
346 collsym_compare(const void *n1
, const void *n2
)
348 const collsym_t
*c1
= n1
;
349 const collsym_t
*c2
= n2
;
352 rv
= strcmp(c1
->name
, c2
->name
);
353 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
357 collundef_compare(const void *n1
, const void *n2
)
359 const collundef_t
*c1
= n1
;
360 const collundef_t
*c2
= n2
;
363 rv
= strcmp(c1
->name
, c2
->name
);
364 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
368 element_compare_symbol(const void *n1
, const void *n2
)
370 const collelem_t
*c1
= n1
;
371 const collelem_t
*c2
= n2
;
374 rv
= strcmp(c1
->symbol
, c2
->symbol
);
375 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
379 element_compare_expand(const void *n1
, const void *n2
)
381 const collelem_t
*c1
= n1
;
382 const collelem_t
*c2
= n2
;
385 rv
= wcscmp(c1
->expand
, c2
->expand
);
386 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
390 collchar_compare(const void *n1
, const void *n2
)
392 wchar_t k1
= ((const collchar_t
*)n1
)->wc
;
393 wchar_t k2
= ((const collchar_t
*)n2
)->wc
;
395 return (k1
< k2
? -1 : k1
> k2
? 1 : 0);
399 subst_compare(const void *n1
, const void *n2
)
401 int32_t k1
= ((const subst_t
*)n1
)->key
;
402 int32_t k2
= ((const subst_t
*)n2
)->key
;
404 return (k1
< k2
? -1 : k1
> k2
? 1 : 0);
408 subst_compare_ref(const void *n1
, const void *n2
)
410 int32_t *c1
= ((subst_t
*)n1
)->ref
;
411 int32_t *c2
= ((subst_t
*)n2
)->ref
;
414 rv
= wcscmp((wchar_t *)c1
, (wchar_t *)c2
);
415 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
423 avl_create(&collsyms
, collsym_compare
, sizeof (collsym_t
),
424 offsetof(collsym_t
, avl
));
426 avl_create(&collundefs
, collundef_compare
, sizeof (collsym_t
),
427 offsetof(collundef_t
, avl
));
429 avl_create(&elem_by_symbol
, element_compare_symbol
, sizeof (collelem_t
),
430 offsetof(collelem_t
, avl_bysymbol
));
431 avl_create(&elem_by_expand
, element_compare_expand
, sizeof (collelem_t
),
432 offsetof(collelem_t
, avl_byexpand
));
434 avl_create(&collchars
, collchar_compare
, sizeof (collchar_t
),
435 offsetof(collchar_t
, avl
));
437 for (i
= 0; i
< COLL_WEIGHTS_MAX
; i
++) {
438 avl_create(&substs
[i
], subst_compare
, sizeof (subst_t
),
439 offsetof(subst_t
, avl
));
440 avl_create(&substs_ref
[i
], subst_compare_ref
,
441 sizeof (subst_t
), offsetof(subst_t
, avl_ref
));
442 avl_create(&weights
[i
], weight_compare
, sizeof (weight_t
),
443 offsetof(weight_t
, avl
));
447 (void) memset(&collinfo
, 0, sizeof (collinfo
));
449 /* allocate some initial priorities */
450 pri_ignore
= new_pri();
452 set_pri(pri_ignore
, 0, RESOLVED
);
454 for (i
= 0; i
< COLL_WEIGHTS_MAX
; i
++) {
455 pri_undefined
[i
] = new_pri();
457 /* we will override this later */
458 set_pri(pri_undefined
[i
], COLLATE_MAX_PRIORITY
, UNKNOWN
);
463 define_collsym(char *name
)
468 if ((sym
= calloc(1, sizeof (*sym
))) == NULL
) {
469 errf(_("out of memory"));
473 sym
->ref
= new_pri();
475 if (avl_find(&collsyms
, sym
, &where
) != NULL
) {
477 * This should never happen because we are only called
478 * for undefined symbols.
484 avl_insert(&collsyms
, sym
, where
);
488 lookup_collsym(char *name
)
493 return (avl_find(&collsyms
, &srch
, NULL
));
497 lookup_collelem(char *symbol
)
501 srch
.symbol
= symbol
;
502 return (avl_find(&elem_by_symbol
, &srch
, NULL
));
506 get_collundef(char *name
)
514 if ((ud
= avl_find(&collundefs
, &srch
, &where
)) == NULL
) {
515 if (((ud
= calloc(1, sizeof (*ud
))) == NULL
) ||
516 ((ud
->name
= strdup(name
)) == NULL
)) {
517 errf(_("out of memory"));
521 for (i
= 0; i
< NUM_WT
; i
++) {
522 ud
->ref
[i
] = new_pri();
524 avl_insert(&collundefs
, ud
, where
);
526 add_charmap_undefined(name
);
531 get_collchar(wchar_t wc
, int create
)
539 cc
= avl_find(&collchars
, &srch
, &where
);
540 if ((cc
== NULL
) && create
) {
541 if ((cc
= calloc(1, sizeof (*cc
))) == NULL
) {
542 errf(_("out of memory"));
545 for (i
= 0; i
< NUM_WT
; i
++) {
546 cc
->ref
[i
] = new_pri();
549 avl_insert(&collchars
, cc
, where
);
555 end_order_collsym(collsym_t
*sym
)
557 start_order(T_COLLSYM
);
558 /* update the weight */
560 set_pri(sym
->ref
, nextpri
, RESOLVED
);
572 /* advance the priority/weight */
577 for (i
= 0; i
< NUM_WT
; i
++) {
578 if (((ref
= order_weights
[i
]) < 0) ||
579 ((p
= get_pri(ref
)) == NULL
) ||
581 /* unspecified weight is a self reference */
582 set_pri(currchar
->ref
[i
], pri
, RESOLVED
);
584 set_pri(currchar
->ref
[i
], ref
, REFER
);
586 order_weights
[i
] = -1;
589 /* leave a cookie trail in case next symbol is ellipsis */
590 ellipsis_start
= currchar
->wc
+ 1;
595 /* save off the weights were we can find them */
596 for (i
= 0; i
< NUM_WT
; i
++) {
597 ellipsis_weights
[i
] = order_weights
[i
];
598 order_weights
[i
] = -1;
603 if (currelem
== NULL
) {
606 for (i
= 0; i
< NUM_WT
; i
++) {
608 if (((ref
= order_weights
[i
]) < 0) ||
609 ((p
= get_pri(ref
)) == NULL
) ||
611 set_pri(currelem
->ref
[i
], pri
,
614 set_pri(currelem
->ref
[i
], ref
, REFER
);
616 order_weights
[i
] = -1;
622 for (i
= 0; i
< NUM_WT
; i
++) {
623 if (((ref
= order_weights
[i
]) < 0) ||
624 ((p
= get_pri(ref
)) == NULL
) ||
626 set_pri(pri_undefined
[i
], -1, RESOLVED
);
628 set_pri(pri_undefined
[i
], ref
, REFER
);
630 order_weights
[i
] = -1;
635 for (i
= 0; i
< NUM_WT
; i
++) {
636 if (((ref
= order_weights
[i
]) < 0) ||
637 ((p
= get_pri(ref
)) == NULL
) ||
639 set_pri(currundef
->ref
[i
], pri
, RESOLVED
);
641 set_pri(currundef
->ref
[i
], ref
, REFER
);
643 order_weights
[i
] = -1;
655 start_order(int type
)
659 lastorder
= currorder
;
662 /* this is used to protect ELLIPSIS processing */
663 if ((lastorder
== T_ELLIPSIS
) && (type
!= T_CHAR
)) {
664 errf(_("character value expected"));
667 for (i
= 0; i
< COLL_WEIGHTS_MAX
; i
++) {
668 order_weights
[i
] = -1;
674 start_order_undefined(void)
676 start_order(T_UNDEFINED
);
680 start_order_symbol(char *name
)
682 currundef
= get_collundef(name
);
683 start_order(T_SYMBOL
);
687 start_order_char(wchar_t wc
)
695 * If we last saw an ellipsis, then we need to close the range.
696 * Handle that here. Note that we have to be careful because the
697 * items *inside* the range are treated exclusiveley to the items
698 * outside of the range. The ends of the range can have quite
699 * different weights than the range members.
701 if (lastorder
== T_ELLIPSIS
) {
704 if (wc
< ellipsis_start
) {
705 errf(_("malformed range!"));
708 while (ellipsis_start
< wc
) {
710 * pick all of the saved weights for the
711 * ellipsis. note that -1 encodes for the
712 * ellipsis itself, which means to take the
713 * current relative priority.
715 if ((cc
= get_collchar(ellipsis_start
, 1)) == NULL
) {
719 for (i
= 0; i
< NUM_WT
; i
++) {
721 if (((ref
= ellipsis_weights
[i
]) == -1) ||
722 ((p
= get_pri(ref
)) == NULL
) ||
724 set_pri(cc
->ref
[i
], nextpri
, RESOLVED
);
726 set_pri(cc
->ref
[i
], ref
, REFER
);
728 ellipsis_weights
[i
] = 0;
735 currchar
= get_collchar(wc
, 1);
739 start_order_collelem(collelem_t
*e
)
741 start_order(T_COLLELEM
);
746 start_order_ellipsis(void)
750 start_order(T_ELLIPSIS
);
752 if (lastorder
!= T_CHAR
) {
753 errf(_("illegal starting point for range"));
757 for (i
= 0; i
< NUM_WT
; i
++) {
758 ellipsis_weights
[i
] = order_weights
[i
];
763 define_collelem(char *name
, wchar_t *wcs
)
770 if (wcslen(wcs
) >= COLLATE_STR_LEN
) {
771 errf(_("expanded collation element too long"));
775 if ((e
= calloc(1, sizeof (*e
))) == NULL
) {
776 errf(_("out of memory"));
783 * This is executed before the order statement, so we don't
784 * know how many priorities we *really* need. We allocate one
785 * for each possible weight. Not a big deal, as collating-elements
786 * prove to be quite rare.
788 for (i
= 0; i
< COLL_WEIGHTS_MAX
; i
++) {
789 e
->ref
[i
] = new_pri();
792 /* A character sequence can only reduce to one element. */
793 if ((avl_find(&elem_by_symbol
, e
, &where1
) != NULL
) ||
794 (avl_find(&elem_by_expand
, e
, &where2
) != NULL
)) {
795 errf(_("duplicate collating element definition"));
799 avl_insert(&elem_by_symbol
, e
, where1
);
800 avl_insert(&elem_by_expand
, e
, where2
);
804 add_order_bit(int kw
)
806 uint8_t bit
= DIRECTIVE_UNDEF
;
810 bit
= DIRECTIVE_FORWARD
;
813 bit
= DIRECTIVE_BACKWARD
;
816 bit
= DIRECTIVE_POSITION
;
822 collinfo
.directive
[collinfo
.directive_count
] |= bit
;
826 add_order_directive(void)
828 if (collinfo
.directive_count
>= COLL_WEIGHTS_MAX
) {
829 errf(_("too many directives (max %d)"), COLL_WEIGHTS_MAX
);
831 collinfo
.directive_count
++;
835 add_order_pri(int32_t ref
)
837 if (curr_weight
>= NUM_WT
) {
838 errf(_("too many weights (max %d)"), NUM_WT
);
841 order_weights
[curr_weight
] = ref
;
846 add_order_collsym(collsym_t
*s
)
848 add_order_pri(s
->ref
);
852 add_order_char(wchar_t wc
)
856 if ((cc
= get_collchar(wc
, 1)) == NULL
) {
861 add_order_pri(cc
->ref
[curr_weight
]);
865 add_order_collelem(collelem_t
*e
)
867 add_order_pri(e
->ref
[curr_weight
]);
871 add_order_ignore(void)
873 add_order_pri(pri_ignore
);
877 add_order_symbol(char *sym
)
880 if ((c
= get_collundef(sym
)) == NULL
) {
884 add_order_pri(c
->ref
[curr_weight
]);
888 add_order_ellipsis(void)
890 /* special 0 value indicates self reference */
895 add_order_subst(void)
902 (void) memset(&srch
, 0, sizeof (srch
));
903 for (i
= 0; i
< curr_subst
; i
++) {
904 srch
.ref
[i
] = subst_weights
[i
];
905 subst_weights
[i
] = 0;
907 s
= avl_find(&substs_ref
[curr_weight
], &srch
, &where
);
910 if ((s
= calloc(1, sizeof (*s
))) == NULL
) {
911 errf(_("out of memory"));
917 * We use a self reference for our key, but we set a
918 * high bit to indicate that this is a substitution
919 * reference. This will expedite table lookups later,
920 * and prevent table lookups for situations that don't
921 * require it. (In short, its a big win, because we
922 * can skip a lot of binary searching.)
925 (nextsubst
[curr_weight
] | COLLATE_SUBST_PRIORITY
),
927 nextsubst
[curr_weight
] += 1;
929 for (i
= 0; i
< curr_subst
; i
++) {
930 s
->ref
[i
] = srch
.ref
[i
];
933 avl_insert(&substs_ref
[curr_weight
], s
, where
);
935 if (avl_find(&substs
[curr_weight
], s
, &where
) != NULL
) {
939 avl_insert(&substs
[curr_weight
], s
, where
);
945 * We are using the current (unique) priority as a search key
946 * in the substitution table.
948 add_order_pri(s
->key
);
952 add_subst_pri(int32_t ref
)
954 if (curr_subst
>= COLLATE_STR_LEN
) {
955 errf(_("substitution string is too long"));
958 subst_weights
[curr_subst
] = ref
;
963 add_subst_char(wchar_t wc
)
968 if (((cc
= get_collchar(wc
, 1)) == NULL
) ||
973 /* we take the weight for the character at that position */
974 add_subst_pri(cc
->ref
[curr_weight
]);
978 add_subst_collelem(collelem_t
*e
)
980 add_subst_pri(e
->ref
[curr_weight
]);
984 add_subst_collsym(collsym_t
*s
)
986 add_subst_pri(s
->ref
);
990 add_subst_symbol(char *ptr
)
994 if ((cu
= get_collundef(ptr
)) != NULL
) {
995 add_subst_pri(cu
->ref
[curr_weight
]);
1000 add_weight(int32_t ref
, int pass
)
1006 srch
.pri
= resolve_pri(ref
);
1008 /* No translation of ignores */
1012 /* Substitution priorities are not weights */
1013 if (srch
.pri
& COLLATE_SUBST_PRIORITY
)
1016 if (avl_find(&weights
[pass
], &srch
, &where
) != NULL
)
1019 if ((w
= calloc(1, sizeof (*w
))) == NULL
) {
1020 errf(_("out of memory"));
1024 avl_insert(&weights
[pass
], w
, where
);
1028 add_weights(int32_t *refs
)
1031 for (i
= 0; i
< NUM_WT
; i
++) {
1032 add_weight(refs
[i
], i
);
1037 get_weight(int32_t ref
, int pass
)
1043 pri
= resolve_pri(ref
);
1044 if (pri
& COLLATE_SUBST_PRIORITY
) {
1051 if ((w
= avl_find(&weights
[pass
], &srch
, NULL
)) == NULL
) {
1068 char vers
[COLLATE_STR_LEN
];
1069 collate_char_t chars
[UCHAR_MAX
+ 1];
1070 collate_large_t
*large
;
1071 collate_subst_t
*subst
[COLL_WEIGHTS_MAX
];
1072 collate_chain_t
*chain
;
1075 * We have to run through a preliminary pass to identify all the
1076 * weights that we use for each sorting level.
1078 for (i
= 0; i
< NUM_WT
; i
++) {
1079 add_weight(pri_ignore
, i
);
1081 for (i
= 0; i
< NUM_WT
; i
++) {
1082 for (sb
= avl_first(&substs
[i
]); sb
;
1083 sb
= AVL_NEXT(&substs
[i
], sb
)) {
1084 for (j
= 0; sb
->ref
[j
]; j
++) {
1085 add_weight(sb
->ref
[j
], i
);
1089 for (ce
= avl_first(&elem_by_expand
);
1091 ce
= AVL_NEXT(&elem_by_expand
, ce
)) {
1092 add_weights(ce
->ref
);
1094 for (cc
= avl_first(&collchars
); cc
; cc
= AVL_NEXT(&collchars
, cc
)) {
1095 add_weights(cc
->ref
);
1099 * Now we walk the entire set of weights, removing the gaps
1100 * in the weights. This gives us optimum usage. The walk
1101 * occurs in priority.
1103 for (i
= 0; i
< NUM_WT
; i
++) {
1105 for (w
= avl_first(&weights
[i
]); w
;
1106 w
= AVL_NEXT(&weights
[i
], w
)) {
1107 w
->opt
= nweight
[i
];
1112 (void) memset(&chars
, 0, sizeof (chars
));
1113 (void) memset(vers
, 0, COLLATE_STR_LEN
);
1114 (void) strlcpy(vers
, COLLATE_VERSION
, sizeof (vers
));
1117 * We need to make sure we arrange for the UNDEFINED field
1118 * to show up. Also, set the total weight counts.
1120 for (i
= 0; i
< NUM_WT
; i
++) {
1121 if (resolve_pri(pri_undefined
[i
]) == -1) {
1122 set_pri(pri_undefined
[i
], -1, RESOLVED
);
1123 /* they collate at the end of everything else */
1124 collinfo
.undef_pri
[i
] = COLLATE_MAX_PRIORITY
;
1126 collinfo
.pri_count
[i
] = nweight
[i
];
1129 collinfo
.pri_count
[NUM_WT
] = max_wide();
1130 collinfo
.undef_pri
[NUM_WT
] = COLLATE_MAX_PRIORITY
;
1131 collinfo
.directive
[NUM_WT
] = DIRECTIVE_UNDEFINED
;
1134 * Ordinary character priorities
1136 for (i
= 0; i
<= UCHAR_MAX
; i
++) {
1137 if ((cc
= get_collchar(i
, 0)) != NULL
) {
1138 for (j
= 0; j
< NUM_WT
; j
++) {
1139 chars
[i
].pri
[j
] = get_weight(cc
->ref
[j
], j
);
1142 for (j
= 0; j
< NUM_WT
; j
++) {
1144 get_weight(pri_undefined
[j
], j
);
1147 * Per POSIX, for undefined characters, we
1148 * also have to add a last item, which is the
1151 chars
[i
].pri
[NUM_WT
] = i
;
1156 * Substitution tables
1158 for (i
= 0; i
< NUM_WT
; i
++) {
1159 collate_subst_t
*st
= NULL
;
1160 n
= collinfo
.subst_count
[i
] = avl_numnodes(&substs
[i
]);
1161 if ((st
= calloc(n
, sizeof (collate_subst_t
))) == NULL
) {
1162 errf(_("out of memory"));
1166 for (sb
= avl_first(&substs
[i
]); sb
;
1167 sb
= AVL_NEXT(&substs
[i
], sb
)) {
1168 if ((st
[n
].key
= resolve_pri(sb
->key
)) < 0) {
1169 /* by definition these resolve! */
1172 if (st
[n
].key
!= (n
| COLLATE_SUBST_PRIORITY
)) {
1175 for (j
= 0; sb
->ref
[j
]; j
++) {
1176 st
[n
].pri
[j
] = get_weight(sb
->ref
[j
], i
);
1180 if (n
!= collinfo
.subst_count
[i
])
1187 * Chains, i.e. collating elements
1189 collinfo
.chain_count
= avl_numnodes(&elem_by_expand
);
1190 chain
= calloc(collinfo
.chain_count
, sizeof (collate_chain_t
));
1191 if (chain
== NULL
) {
1192 errf(_("out of memory"));
1195 for (n
= 0, ce
= avl_first(&elem_by_expand
);
1197 ce
= AVL_NEXT(&elem_by_expand
, ce
), n
++) {
1198 (void) wsncpy(chain
[n
].str
, ce
->expand
, COLLATE_STR_LEN
);
1199 for (i
= 0; i
< NUM_WT
; i
++) {
1200 chain
[n
].pri
[i
] = get_weight(ce
->ref
[i
], i
);
1203 if (n
!= collinfo
.chain_count
)
1207 * Large (> UCHAR_MAX) character priorities
1209 large
= calloc(avl_numnodes(&collchars
), sizeof (collate_large_t
));
1210 if (large
== NULL
) {
1211 errf(_("out of memory"));
1216 for (cc
= avl_first(&collchars
); cc
; cc
= AVL_NEXT(&collchars
, cc
)) {
1218 /* we already gathered those */
1219 if (cc
->wc
<= UCHAR_MAX
)
1221 for (j
= 0; j
< NUM_WT
; j
++) {
1222 if ((pri
= get_weight(cc
->ref
[j
], j
)) < 0) {
1225 if (undef
&& (pri
>= 0)) {
1226 /* if undefined, then all priorities are */
1229 large
[i
].pri
.pri
[j
] = pri
;
1233 large
[i
].val
= cc
->wc
;
1234 collinfo
.large_count
= i
++;
1238 if ((f
= open_category()) == NULL
) {
1242 /* Time to write the entire data set out */
1244 if ((wr_category(vers
, COLLATE_STR_LEN
, f
) < 0) ||
1245 (wr_category(&collinfo
, sizeof (collinfo
), f
) < 0) ||
1246 (wr_category(&chars
, sizeof (chars
), f
) < 0)) {
1251 for (i
= 0; i
< NUM_WT
; i
++) {
1252 sz
= sizeof (collate_subst_t
) * collinfo
.subst_count
[i
];
1253 if (wr_category(subst
[i
], sz
, f
) < 0) {
1258 sz
= sizeof (collate_chain_t
) * collinfo
.chain_count
;
1259 if (wr_category(chain
, sz
, f
) < 0) {
1263 sz
= sizeof (collate_large_t
) * collinfo
.large_count
;
1264 if (wr_category(large
, sz
, f
) < 0) {