Update to 2.1.x development version
[glibc/history.git] / locale / programs / ld-collate.c
blob3a8c17a303078fe9c6e0e105d9d4450074344682
1 /* Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <endian.h>
25 #include <errno.h>
26 #include <limits.h>
27 #include <locale.h>
28 #include <obstack.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <wchar.h>
33 #include "localeinfo.h"
34 #include "locales.h"
35 #include "simple-hash.h"
36 #include "stringtrans.h"
37 #include "strlen-hash.h"
39 /* Uncomment the following line in the production version. */
40 /* #define NDEBUG 1 */
41 #include <assert.h>
44 #define MAX(a, b) ((a) > (b) ? (a) : (b))
46 #define SWAPU32(w) \
47 (((w) << 24) | (((w) & 0xff00) << 8) | (((w) >> 8) & 0xff00) | ((w) >> 24))
50 /* What kind of symbols get defined? */
51 enum coll_symbol
53 undefined,
54 ellipsis,
55 character,
56 element,
57 symbol
61 typedef struct patch_t
63 const char *fname;
64 size_t lineno;
65 const char *token;
66 union
68 unsigned int *pos;
69 size_t idx;
70 } where;
71 struct patch_t *next;
72 } patch_t;
75 typedef struct element_t
77 const wchar_t *name;
78 unsigned int this_weight;
80 struct element_t *next;
82 unsigned int *ordering;
83 size_t ordering_len;
84 } element_t;
87 /* The real definition of the struct for the LC_COLLATE locale. */
88 struct locale_collate_t
90 /* Collate symbol table. Simple mapping to number. */
91 hash_table symbols;
93 /* The collation elements. */
94 hash_table elements;
95 struct obstack element_mem;
97 /* The result table. */
98 hash_table result;
100 /* Sorting rules given in order_start line. */
101 u_int32_t nrules;
102 u_int32_t nrules_max;
103 enum coll_sort_rule *rules;
105 /* Used while recognizing symbol composed of multiple tokens
106 (collating-element). */
107 const char *combine_token;
108 size_t combine_token_len;
110 /* How many sorting order specifications so far. */
111 unsigned int order_cnt;
113 /* Was lastline ellipsis? */
114 int was_ellipsis;
115 /* Value of last entry if was character. */
116 wchar_t last_char;
117 /* Current element. */
118 element_t *current_element;
119 /* What kind of symbol is current element. */
120 enum coll_symbol kind;
122 /* While collecting the weights we need some temporary space. */
123 unsigned int current_order;
124 int *weight_cnt;
125 unsigned int weight_idx;
126 unsigned int *weight;
127 size_t nweight;
128 size_t nweight_max;
130 /* Patch lists. */
131 patch_t *current_patch;
132 patch_t *all_patches;
134 /* Room for the UNDEFINED information. */
135 element_t undefined;
136 unsigned int undefined_len;
140 /* Be verbose? Defined in localedef.c. */
141 extern int verbose;
144 void *xmalloc (size_t __n);
145 void *xrealloc (void *__p, size_t __n);
148 #define obstack_chunk_alloc malloc
149 #define obstack_chunk_free free
152 void
153 collate_startup (struct linereader *lr, struct localedef_t *locale,
154 struct charset_t *charset)
156 struct locale_collate_t *collate;
158 /* It is important that we always use UCS4 encoding for strings now. */
159 encoding_method = ENC_UCS4;
161 /* Allocate the needed room. */
162 locale->categories[LC_COLLATE].collate = collate =
163 (struct locale_collate_t *) xmalloc (sizeof (struct locale_collate_t));
165 /* Allocate hash table for collating elements. */
166 if (init_hash (&collate->elements, 512))
167 error (4, 0, _("memory exhausted"));
168 collate->combine_token = NULL;
169 obstack_init (&collate->element_mem);
171 /* Allocate hash table for collating elements. */
172 if (init_hash (&collate->symbols, 64))
173 error (4, 0, _("memory exhausted"));
175 /* Allocate hash table for result. */
176 if (init_hash (&collate->result, 512))
177 error (4, 0, _("memory exhausted"));
179 collate->nrules = 0;
180 collate->nrules_max = 10;
181 collate->rules
182 = (enum coll_sort_rule *) xmalloc (collate->nrules_max
183 * sizeof (enum coll_sort_rule));
185 collate->order_cnt = 1; /* The smallest weight is 2. */
187 collate->was_ellipsis = 0;
188 collate->last_char = L'\0'; /* 0 because leading ellipsis is allowed. */
190 collate->all_patches = NULL;
192 /* This tells us no UNDEFINED entry was found until now. */
193 collate->undefined.this_weight = 0;
195 lr->translate_strings = 0;
199 void
200 collate_finish (struct localedef_t *locale, struct charset_t *charset)
202 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
203 patch_t *patch;
204 size_t cnt;
206 /* Patch the constructed table so that forward references are
207 correctly filled. */
208 for (patch = collate->all_patches; patch != NULL; patch = patch->next)
210 wchar_t wch;
211 size_t toklen = strlen (patch->token);
212 void *ptmp;
213 unsigned int value = 0;
215 wch = charset_find_value (charset, patch->token, toklen);
216 if (wch != ILLEGAL_CHAR_VALUE)
218 element_t *runp;
220 if (find_entry (&collate->result, &wch, sizeof (wchar_t),
221 (void *) &runp) < 0)
222 runp = NULL;
223 for (; runp != NULL; runp = runp->next)
224 if (runp->name[0] == wch && runp->name[1] == L'\0')
225 break;
227 value = runp == NULL ? 0 : runp->this_weight;
229 else if (find_entry (&collate->elements, patch->token, toklen, &ptmp)
230 >= 0)
232 value = ((element_t *) ptmp)->this_weight;
234 else if (find_entry (&collate->symbols, patch->token, toklen, &ptmp)
235 >= 0)
237 value = (unsigned long int) ptmp;
239 else
240 value = 0;
242 if (value == 0 && !be_quiet)
243 error_at_line (0, 0, patch->fname, patch->lineno,
244 _("no weight defined for symbol `%s'"), patch->token);
245 else
246 *patch->where.pos = value;
249 /* If no definition for UNDEFINED is given, all characters in the
250 given charset must be specified. */
251 if (collate->undefined.ordering == NULL)
253 /**************************************************************\
254 |* XXX We should test whether really an unspecified character *|
255 |* exists before giving the message. *|
256 \**************************************************************/
257 u_int32_t weight;
259 if (!be_quiet)
260 error (0, 0, _("no definition of `UNDEFINED'"));
262 collate->undefined.ordering_len = collate->nrules;
263 weight = ++collate->order_cnt;
265 for (cnt = 0; cnt < collate->nrules; ++cnt)
267 u_int32_t one = 1;
268 obstack_grow (&collate->element_mem, &one, sizeof (one));
271 for (cnt = 0; cnt < collate->nrules; ++cnt)
272 obstack_grow (&collate->element_mem, &weight, sizeof (weight));
274 collate->undefined.ordering = obstack_finish (&collate->element_mem);
277 collate->undefined_len = 2; /* For the name: 1 x wchar_t + L'\0'. */
278 for (cnt = 0; cnt < collate->nrules; ++cnt)
279 collate->undefined_len += 1 + collate->undefined.ordering[cnt];
284 void
285 collate_output (struct localedef_t *locale, struct charset_t *charset,
286 const char *output_path)
288 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
289 u_int32_t table_size, table_best, level_best, sum_best;
290 void *last;
291 element_t *pelem;
292 wchar_t *name;
293 size_t len;
294 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
295 struct iovec iov[2 + nelems];
296 struct locale_file data;
297 u_int32_t idx[nelems];
298 struct obstack non_simple;
299 struct obstack string_pool;
300 size_t cnt, entry_size;
301 u_int32_t undefined_offset = UINT_MAX;
302 u_int32_t *table, *extra, *table2, *extra2;
303 size_t extra_len;
304 u_int32_t element_hash_tab_size;
305 u_int32_t *element_hash_tab;
306 u_int32_t *element_hash_tab_ob;
307 u_int32_t element_string_pool_size;
308 char *element_string_pool;
309 u_int32_t element_value_size;
310 wchar_t *element_value;
311 wchar_t *element_value_ob;
312 u_int32_t symbols_hash_tab_size;
313 u_int32_t *symbols_hash_tab;
314 u_int32_t *symbols_hash_tab_ob;
315 u_int32_t symbols_string_pool_size;
316 char *symbols_string_pool;
317 u_int32_t symbols_class_size;
318 u_int32_t *symbols_class;
319 u_int32_t *symbols_class_ob;
320 hash_table *hash_tab;
321 unsigned int dummy_weights[collate->nrules + 1];
323 sum_best = UINT_MAX;
324 table_best = 0xffff;
325 level_best = 0xffff;
327 /* Compute table size. */
328 if (!be_quiet)
329 fputs (_("\
330 Computing table size for collation information might take a while..."),
331 stderr);
332 for (table_size = 256; table_size < sum_best; ++table_size)
334 size_t hits[table_size];
335 unsigned int worst = 1;
336 size_t cnt;
338 last = NULL;
340 for (cnt = 0; cnt < 256; ++cnt)
341 hits[cnt] = 1;
342 memset (&hits[256], '\0', sizeof (hits) - 256 * sizeof (size_t));
344 while (iterate_table (&collate->result, &last, (const void **) &name,
345 &len, (void **) &pelem) >= 0)
346 if (pelem->ordering != NULL && pelem->name[0] > 0xff)
347 if (++hits[(unsigned int) pelem->name[0] % table_size] > worst)
349 worst = hits[(unsigned int) pelem->name[0] % table_size];
350 if (table_size * worst > sum_best)
351 break;
354 if (table_size * worst < sum_best)
356 sum_best = table_size * worst;
357 table_best = table_size;
358 level_best = worst;
361 assert (table_best != 0xffff || level_best != 0xffff);
362 if (!be_quiet)
363 fputs (_(" done\n"), stderr);
365 obstack_init (&non_simple);
366 obstack_init (&string_pool);
368 data.magic = LIMAGIC (LC_COLLATE);
369 data.n = nelems;
370 iov[0].iov_base = (void *) &data;
371 iov[0].iov_len = sizeof (data);
373 iov[1].iov_base = (void *) idx;
374 iov[1].iov_len = sizeof (idx);
376 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_NRULES)].iov_base = &collate->nrules;
377 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_NRULES)].iov_len = sizeof (u_int32_t);
379 table = (u_int32_t *) alloca (collate->nrules * sizeof (u_int32_t));
380 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_RULES)].iov_base = table;
381 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_RULES)].iov_len
382 = collate->nrules * sizeof (u_int32_t);
383 /* Another trick here. Describing the collation method needs only a
384 few bits (3, to be exact). But the binary file should be
385 accessible by machines with both endianesses and so we store both
386 forms in the same word. */
387 for (cnt = 0; cnt < collate->nrules; ++cnt)
388 table[cnt] = collate->rules[cnt] | SWAPU32 (collate->rules[cnt]);
390 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].iov_base = &table_best;
391 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].iov_len = sizeof (u_int32_t);
393 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].iov_base = &level_best;
394 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].iov_len
395 = sizeof (u_int32_t);
397 entry_size = 1 + MAX (collate->nrules, 2);
399 table = (u_int32_t *) alloca (table_best * level_best * entry_size
400 * sizeof (table[0]));
401 memset (table, '\0', table_best * level_best * entry_size
402 * sizeof (table[0]));
405 /* Macros for inserting in output table. */
406 #define ADD_VALUE(expr) \
407 do { \
408 u_int32_t to_write = (u_int32_t) expr; \
409 obstack_grow (&non_simple, &to_write, sizeof (to_write)); \
410 } while (0)
412 #define ADD_ELEMENT(pelem, len) \
413 do { \
414 size_t cnt, idx; \
416 ADD_VALUE (len); \
418 wlen = wcslen (pelem->name); \
419 obstack_grow (&non_simple, pelem->name, (wlen + 1) * sizeof (u_int32_t)); \
421 idx = collate->nrules; \
422 for (cnt = 0; cnt < collate->nrules; ++cnt) \
424 size_t disp; \
426 ADD_VALUE (pelem->ordering[cnt]); \
427 for (disp = 0; disp < pelem->ordering[cnt]; ++disp) \
428 ADD_VALUE (pelem->ordering[idx++]); \
430 } while (0)
432 #define ADD_FORWARD(pelem) \
433 do { \
434 /* We leave a reference in the main table and put all \
435 information in the table for the extended entries. */ \
436 element_t *runp; \
437 element_t *has_simple = NULL; \
438 size_t wlen; \
440 table[(level * table_best + slot) * entry_size + 1] \
441 = FORWARD_CHAR; \
442 table[(level * table_best + slot) * entry_size + 2] \
443 = obstack_object_size (&non_simple) / sizeof (u_int32_t); \
445 /* Here we have to construct the non-simple table entry. First \
446 compute the total length of this entry. */ \
447 for (runp = (pelem); runp != NULL; runp = runp->next) \
448 if (runp->ordering != NULL) \
450 u_int32_t value; \
451 size_t cnt; \
453 value = 1 + wcslen (runp->name) + 1; \
455 for (cnt = 0; cnt < collate->nrules; ++cnt) \
456 /* We have to take care for entries without ordering \
457 information. While reading them they get inserted in the \
458 table and later not removed when something goes wrong with \
459 reading its weights. */ \
461 value += 1 + runp->ordering[cnt]; \
463 if (runp->name[1] == L'\0') \
464 has_simple = runp; \
467 ADD_ELEMENT (runp, value); \
470 if (has_simple == NULL) \
472 size_t idx, cnt; \
474 ADD_VALUE (collate->undefined_len + 1); \
476 /* Add the name. */ \
477 ADD_VALUE ((pelem)->name[0]); \
478 ADD_VALUE (0); \
480 idx = collate->nrules; \
481 for (cnt = 0; cnt < collate->nrules; ++cnt) \
483 size_t disp; \
485 ADD_VALUE (collate->undefined.ordering[cnt]); \
486 for (disp = 0; disp < collate->undefined.ordering[cnt]; ++disp) \
488 if ((wchar_t) collate->undefined.ordering[idx] \
489 == ELLIPSIS_CHAR) \
490 ADD_VALUE ((pelem)->name[0]); \
491 else \
492 ADD_VALUE (collate->undefined.ordering[idx++]); \
493 ++idx; \
497 } while (0)
501 /* Fill the table now. First we look for all the characters which
502 fit into one single byte. This speeds up the 8-bit string
503 functions. */
504 last = NULL;
505 while (iterate_table (&collate->result, &last, (const void **) &name,
506 &len, (void **) &pelem) >= 0)
507 if (pelem->name[0] <= 0xff)
509 /* We have a single byte name. Now we must distinguish
510 between entries in simple form (i.e., only one value per
511 weight and no collation element starting with the same
512 character) and those which are not. */
513 size_t slot = ((size_t) pelem->name[0]);
514 const size_t level = 0;
516 table[slot * entry_size] = pelem->name[0];
518 if (pelem->name[1] == L'\0' && pelem->next == NULL
519 && pelem->ordering_len == collate->nrules)
521 /* Yes, we have a simple one. Lucky us. */
522 size_t cnt;
524 for (cnt = 0; cnt < collate->nrules; ++cnt)
525 table[slot * entry_size + 1 + cnt]
526 = pelem->ordering[collate->nrules + cnt];
528 else
529 ADD_FORWARD (pelem);
532 /* Now check for missing single byte entries. If one exist we fill
533 with the UNDEFINED entry. */
534 for (cnt = 0; cnt < 256; ++cnt)
535 /* The first weight is never 0 for existing entries. */
536 if (table[cnt * entry_size + 1] == 0)
538 /* We have to fill in the information from the UNDEFINED
539 entry. */
540 table[cnt * entry_size] = (u_int32_t) cnt;
542 if (collate->undefined.ordering_len == collate->nrules)
544 size_t inner;
546 for (inner = 0; inner < collate->nrules; ++inner)
547 if ((wchar_t)collate->undefined.ordering[collate->nrules + inner]
548 == ELLIPSIS_CHAR)
549 table[cnt * entry_size + 1 + inner] = cnt;
550 else
551 table[cnt * entry_size + 1 + inner]
552 = collate->undefined.ordering[collate->nrules + inner];
554 else
556 if (undefined_offset != UINT_MAX)
558 table[cnt * entry_size + 1] = FORWARD_CHAR;
559 table[cnt * entry_size + 2] = undefined_offset;
561 else
563 const size_t slot = cnt;
564 const size_t level = 0;
566 ADD_FORWARD (&collate->undefined);
567 undefined_offset = table[cnt * entry_size + 2];
572 /* Now we are ready for inserting the whole rest. */
573 last = NULL;
574 while (iterate_table (&collate->result, &last, (const void **) &name,
575 &len, (void **) &pelem) >= 0)
576 if (pelem->name[0] > 0xff)
578 /* Find the position. */
579 size_t slot = ((size_t) pelem->name[0]) % table_best;
580 size_t level = 0;
582 while (table[(level * table_best + slot) * entry_size + 1] != 0)
583 ++level;
584 assert (level < level_best);
586 if (pelem->name[1] == L'\0' && pelem->next == NULL
587 && pelem->ordering_len == collate->nrules)
589 /* Again a simple entry. */
590 size_t inner;
592 for (inner = 0; inner < collate->nrules; ++inner)
593 table[(level * table_best + slot) * entry_size + 1 + inner]
594 = pelem->ordering[collate->nrules + inner];
596 else
597 ADD_FORWARD (pelem);
600 /* Add the UNDEFINED entry. */
602 /* Here we have to construct the non-simple table entry. */
603 size_t idx, cnt;
605 undefined_offset = obstack_object_size (&non_simple);
607 idx = collate->nrules;
608 for (cnt = 0; cnt < collate->nrules; ++cnt)
610 size_t disp;
612 ADD_VALUE (collate->undefined.ordering[cnt]);
613 for (disp = 0; disp < collate->undefined.ordering[cnt]; ++disp)
614 ADD_VALUE (collate->undefined.ordering[idx++]);
618 /* Finish the extra block. */
619 extra_len = obstack_object_size (&non_simple);
620 extra = (u_int32_t *) obstack_finish (&non_simple);
621 assert ((extra_len % sizeof (u_int32_t)) == 0);
623 /* Now we have to build the two array for the other byte ordering. */
624 table2 = (u_int32_t *) alloca (table_best * level_best * entry_size
625 * sizeof (table[0]));
626 extra2 = (u_int32_t *) alloca (extra_len);
628 for (cnt = 0; cnt < table_best * level_best * entry_size; ++cnt)
629 table2[cnt] = SWAPU32 (table[cnt]);
631 for (cnt = 0; cnt < extra_len / sizeof (u_int32_t); ++cnt)
632 extra2[cnt] = SWAPU32 (extra2[cnt]);
634 /* We need a simple hashing table to get a collation-element->chars
635 mapping. We again use internal hasing using a secondary hashing
636 function.
638 Each string has an associate hashing value V, computed by a
639 fixed function. To locate the string we use open addressing with
640 double hashing. The first index will be V % M, where M is the
641 size of the hashing table. If no entry is found, iterating with
642 a second, independent hashing function takes place. This second
643 value will be 1 + V % (M - 2). The approximate number of probes
644 will be
646 for unsuccessful search: (1 - N / M) ^ -1
647 for successful search: - (N / M) ^ -1 * ln (1 - N / M)
649 where N is the number of keys.
651 If we now choose M to be the next prime bigger than 4 / 3 * N,
652 we get the values 4 and 1.85 resp. Because unsuccessful searches
653 are unlikely this is a good value. Formulas: [Knuth, The Art of
654 Computer Programming, Volume 3, Sorting and Searching, 1973,
655 Addison Wesley] */
656 if (collate->elements.filled == 0)
658 /* We don't need any element table since there are no collating
659 elements. */
660 element_hash_tab_size = 0;
661 element_hash_tab = NULL;
662 element_hash_tab_ob = NULL;
663 element_string_pool_size = 0;
664 element_string_pool = NULL;
665 element_value_size = 0;
666 element_value = NULL;
667 element_value_ob = NULL;
669 else
671 void *ptr; /* Running pointer. */
672 const char *key; /* Key for current bucket. */
673 size_t keylen; /* Length of key data. */
674 const element_t *data; /* Data, i.e., the character sequence. */
676 element_hash_tab_size = next_prime ((collate->elements.filled * 4) / 3);
677 if (element_hash_tab_size < 7)
678 /* We need a minimum to make the following code work. */
679 element_hash_tab_size = 7;
681 element_hash_tab = obstack_alloc (&non_simple, (2 * element_hash_tab_size
682 * sizeof (u_int32_t)));
683 memset (element_hash_tab, '\377', (2 * element_hash_tab_size
684 * sizeof (u_int32_t)));
686 ptr = NULL;
687 while (iterate_table (&collate->elements, &ptr, (const void **) &key,
688 &keylen, (void **) &data) == 0)
690 size_t hash_val = hash_string (key, keylen);
691 size_t idx = hash_val % element_hash_tab_size;
693 if (element_hash_tab[2 * idx] != (~((u_int32_t) 0)))
695 /* We need the second hashing function. */
696 size_t c = 1 + (hash_val % (element_hash_tab_size - 2));
699 if (idx >= element_hash_tab_size - c)
700 idx -= element_hash_tab_size - c;
701 else
702 idx += c;
703 while (element_hash_tab[2 * idx] != (~((u_int32_t) 0)));
706 element_hash_tab[2 * idx] = obstack_object_size (&non_simple);
707 element_hash_tab[2 * idx + 1] = (obstack_object_size (&string_pool)
708 / sizeof (wchar_t));
710 obstack_grow0 (&non_simple, key, keylen);
711 obstack_grow (&string_pool, data->name,
712 (wcslen (data->name) + 1) * sizeof (wchar_t));
715 if (obstack_object_size (&non_simple) % 4 != 0)
716 obstack_blank (&non_simple,
717 4 - (obstack_object_size (&non_simple) % 4));
718 element_string_pool_size = obstack_object_size (&non_simple);
719 element_string_pool = obstack_finish (&non_simple);
721 element_value_size = obstack_object_size (&string_pool);
722 element_value = obstack_finish (&string_pool);
724 /* Create the tables for the other byte order. */
725 element_hash_tab_ob = obstack_alloc (&non_simple,
726 (2 * element_hash_tab_size
727 * sizeof (u_int32_t)));
728 for (cnt = 0; cnt < 2 * element_hash_tab_size; ++cnt)
729 element_hash_tab_ob[cnt] = SWAPU32 (element_hash_tab[cnt]);
731 element_value_ob = obstack_alloc (&string_pool, element_value_size);
732 if (sizeof (wchar_t) != 4)
734 fputs ("sizeof (wchar_t) != 4 currently not handled", stderr);
735 abort ();
737 for (cnt = 0; cnt < element_value_size / 4; ++cnt)
738 element_value_ob[cnt] = SWAPU32 (element_value[cnt]);
741 /* Store collation elements as map to collation class. There are
742 three kinds of symbols:
743 - simple characters
744 - collation elements
745 - collation symbols
746 We need to make a table which lets the user to access the primary
747 weight based on the symbol string. */
748 symbols_hash_tab_size = next_prime ((4 * (charset->char_table.filled
749 + collate->elements.filled
750 + collate->symbols.filled)) / 3);
751 symbols_hash_tab = obstack_alloc (&non_simple, (2 * symbols_hash_tab_size
752 * sizeof (u_int32_t)));
753 memset (symbols_hash_tab, '\377', (2 * symbols_hash_tab_size
754 * sizeof (u_int32_t)));
756 /* Now fill the array. First the symbols from the character set,
757 then the collation elements and last the collation symbols. */
758 hash_tab = &charset->char_table;
759 while (1)
761 void *ptr; /* Running pointer. */
762 const char *key; /* Key for current bucket. */
763 size_t keylen; /* Length of key data. */
764 void *data; /* Data. */
766 ptr = NULL;
767 while (iterate_table (hash_tab, &ptr, (const void **) &key,
768 &keylen, (void **) &data) == 0)
770 size_t hash_val;
771 size_t idx;
772 u_int32_t word;
773 unsigned int *weights;
775 if (hash_tab == &charset->char_table
776 || hash_tab == &collate->elements)
778 element_t *lastp, *firstp;
779 wchar_t dummy_name[2];
780 const wchar_t *name;
781 size_t name_len;
783 if (hash_tab == &charset->char_table)
785 dummy_name[0] = (wchar_t) ((unsigned long int) data);
786 dummy_name[1] = L'\0';
787 name = dummy_name;
788 name_len = sizeof (wchar_t);
790 else
792 element_t *elemp = (element_t *) data;
793 name = elemp->name;
794 name_len = wcslen (name) * sizeof (wchar_t);
797 /* First check whether this character is used at all. */
798 if (find_entry (&collate->result, name, name_len,
799 (void *) &firstp) < 0)
800 /* The symbol is not directly mentioned in the collation.
801 I.e., we use the value for UNDEFINED. */
802 lastp = &collate->undefined;
803 else
805 /* The entry for the simple character is always found at
806 the end. */
807 lastp = firstp;
808 while (lastp->next != NULL && wcscmp (name, lastp->name))
809 lastp = lastp->next;
812 weights = lastp->ordering;
814 else
816 dummy_weights[0] = 1;
817 dummy_weights[collate->nrules]
818 = (unsigned int) ((unsigned long int) data);
820 weights = dummy_weights;
823 /* In LASTP->ordering we now have the collation class.
824 Determine the place in the hashing table next. */
825 hash_val = hash_string (key, keylen);
826 idx = hash_val % symbols_hash_tab_size;
828 if (symbols_hash_tab[2 * idx] != (~((u_int32_t) 0)))
830 /* We need the second hashing function. */
831 size_t c = 1 + (hash_val % (symbols_hash_tab_size - 2));
834 if (idx >= symbols_hash_tab_size - c)
835 idx -= symbols_hash_tab_size - c;
836 else
837 idx += c;
838 while (symbols_hash_tab[2 * idx] != (~((u_int32_t) 0)));
841 symbols_hash_tab[2 * idx] = obstack_object_size (&string_pool);
842 symbols_hash_tab[2 * idx + 1] = (obstack_object_size (&non_simple)
843 / sizeof (u_int32_t));
845 obstack_grow0 (&string_pool, key, keylen);
846 /* Adding the first weight looks complicated. We have to deal
847 with the kind it is stored and with the fact that original
848 form uses `unsigned int's while we need `u_int32_t' here. */
849 word = weights[0];
850 obstack_grow (&non_simple, &word, sizeof (u_int32_t));
851 for (cnt = 0; cnt < weights[0]; ++cnt)
853 word = weights[collate->nrules + cnt];
854 obstack_grow (&non_simple, &word, sizeof (u_int32_t));
858 if (hash_tab == &charset->char_table)
859 hash_tab = &collate->elements;
860 else if (hash_tab == &collate->elements)
861 hash_tab = &collate->symbols;
862 else
863 break;
866 /* Now we have the complete tables. */
867 if (obstack_object_size (&string_pool) % 4 != 0)
868 obstack_blank (&non_simple, 4 - (obstack_object_size (&string_pool) % 4));
869 symbols_string_pool_size = obstack_object_size (&string_pool);
870 symbols_string_pool = obstack_finish (&string_pool);
872 symbols_class_size = obstack_object_size (&non_simple);
873 symbols_class = obstack_finish (&non_simple);
875 /* Generate tables with other byte order. */
876 symbols_hash_tab_ob = obstack_alloc (&non_simple, (2 * symbols_hash_tab_size
877 * sizeof (u_int32_t)));
878 for (cnt = 0; cnt < 2 * symbols_hash_tab_size; ++cnt)
879 symbols_hash_tab_ob[cnt] = SWAPU32 (symbols_hash_tab[cnt]);
881 symbols_class_ob = obstack_alloc (&non_simple, symbols_class_size);
882 for (cnt = 0; cnt < symbols_class_size / 4; ++cnt)
883 symbols_class_ob[cnt] = SWAPU32 (symbols_class[cnt]);
886 /* Store table addresses and lengths. */
887 #if __BYTE_ORDER == __BIG_ENDIAN
888 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_base = table;
889 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_len
890 = table_best * level_best * entry_size * sizeof (table[0]);
892 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_base = table2;
893 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_len
894 = table_best * level_best * entry_size * sizeof (table[0]);
896 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_base = extra;
897 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_len = extra_len;
899 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_base = extra2;
900 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_len = extra_len;
901 #else
902 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_base = table2;
903 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_len
904 = table_best * level_best * entry_size * sizeof (table[0]);
906 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_base = table;
907 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_len
908 = table_best * level_best * entry_size * sizeof (table[0]);
910 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_base = extra2;
911 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_len = extra_len;
913 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_base = extra;
914 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_len = extra_len;
915 #endif
917 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED)].iov_base = &undefined_offset;
918 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED)].iov_len = sizeof (u_int32_t);
921 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_SIZE)].iov_base
922 = &element_hash_tab_size;
923 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_SIZE)].iov_len
924 = sizeof (u_int32_t);
926 #if __BYTE_ORDER == __BIG_ENDIAN
927 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_base
928 = element_hash_tab;
929 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_len
930 = 2 * element_hash_tab_size * sizeof (u_int32_t);
932 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_base
933 = element_hash_tab_ob;
934 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_len
935 = 2 * element_hash_tab_size * sizeof (u_int32_t);
936 #else
937 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_base
938 = element_hash_tab;
939 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_len
940 = 2 * element_hash_tab_size * sizeof (u_int32_t);
942 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_base
943 = element_hash_tab_ob;
944 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_len
945 = 2 * element_hash_tab_size * sizeof (u_int32_t);
946 #endif
948 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_STR_POOL)].iov_base
949 = element_string_pool;
950 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_STR_POOL)].iov_len
951 = element_string_pool_size;
953 #if __BYTE_ORDER == __BIG_ENDIAN
954 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_base
955 = element_value;
956 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_len
957 = element_value_size;
959 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_base
960 = element_value_ob;
961 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_len
962 = element_value_size;
963 #else
964 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_base
965 = element_value;
966 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_len
967 = element_value_size;
969 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_base
970 = element_value_ob;
971 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_len
972 = element_value_size;
973 #endif
975 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZE)].iov_base
976 = &symbols_hash_tab_size;
977 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZE)].iov_len
978 = sizeof (u_int32_t);
980 #if __BYTE_ORDER == __BIG_ENDIAN
981 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_base
982 = symbols_hash_tab;
983 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_len
984 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
986 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_base
987 = symbols_hash_tab_ob;
988 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_len
989 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
990 #else
991 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_base
992 = symbols_hash_tab;
993 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_len
994 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
996 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_base
997 = symbols_hash_tab_ob;
998 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_len
999 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
1000 #endif
1002 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_STR_POOL)].iov_base
1003 = symbols_string_pool;
1004 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_STR_POOL)].iov_len
1005 = symbols_string_pool_size;
1007 #if __BYTE_ORDER == __BIG_ENDIAN
1008 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_base
1009 = symbols_class;
1010 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_len
1011 = symbols_class_size;
1013 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_base
1014 = symbols_class_ob;
1015 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_len
1016 = symbols_class_size;
1017 #else
1018 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_base
1019 = symbols_class;
1020 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_len
1021 = symbols_class_size;
1023 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_base
1024 = symbols_class_ob;
1025 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_len
1026 = symbols_class_size;
1027 #endif
1029 /* Update idx array. */
1030 idx[0] = iov[0].iov_len + iov[1].iov_len;
1031 for (cnt = 1; cnt < nelems; ++cnt)
1032 idx[cnt] = idx[cnt - 1] + iov[1 + cnt].iov_len;
1034 write_locale_data (output_path, "LC_COLLATE", 2 + nelems, iov);
1036 obstack_free (&non_simple, NULL);
1037 obstack_free (&string_pool, NULL);
1041 void
1042 collate_element_to (struct linereader *lr, struct localedef_t *locale,
1043 struct token *code, struct charset_t *charset)
1045 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1046 unsigned int value;
1047 void *not_used;
1049 if (collate->combine_token != NULL)
1051 free ((void *) collate->combine_token);
1052 collate->combine_token = NULL;
1055 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
1056 if ((wchar_t) value != ILLEGAL_CHAR_VALUE)
1058 lr_error (lr, _("symbol for multicharacter collating element "
1059 "`%.*s' duplicates symbolic name in charset"),
1060 (int) code->val.str.len, code->val.str.start);
1061 return;
1064 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1065 &not_used) >= 0)
1067 lr_error (lr, _("symbol for multicharacter collating element "
1068 "`%.*s' duplicates other element definition"),
1069 (int) code->val.str.len, code->val.str.start);
1070 return;
1073 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1074 &not_used) >= 0)
1076 lr_error (lr, _("symbol for multicharacter collating element "
1077 "`%.*s' duplicates symbol definition"),
1078 (int) code->val.str.len, code->val.str.start);
1079 return;
1082 collate->combine_token = code->val.str.start;
1083 collate->combine_token_len = code->val.str.len;
1087 void
1088 collate_element_from (struct linereader *lr, struct localedef_t *locale,
1089 struct token *code, struct charset_t *charset)
1091 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1092 element_t *elemp, *runp;
1094 /* CODE is a string. */
1095 elemp = (element_t *) obstack_alloc (&collate->element_mem,
1096 sizeof (element_t));
1098 /* We have to translate the string. It may contain <...> character
1099 names. */
1100 elemp->name = (wchar_t *) translate_string (code->val.str.start, charset);
1101 elemp->this_weight = 0;
1102 elemp->ordering = NULL;
1103 elemp->ordering_len = 0;
1105 free (code->val.str.start);
1107 if (elemp->name == NULL)
1109 /* At least one character in the string is not defined. We simply
1110 do nothing. */
1111 if (verbose)
1112 lr_error (lr, _("\
1113 `from' string in collation element declaration contains unknown character"));
1114 return;
1117 if (elemp->name[0] == L'\0' || elemp->name[1] == L'\0')
1119 lr_error (lr, _("illegal collation element"));
1120 return;
1123 /* The entries in the linked lists of RESULT are sorting in
1124 descending order. The order is important for the `strcoll' and
1125 `wcscoll' functions. */
1126 if (find_entry (&collate->result, elemp->name, sizeof (wchar_t),
1127 (void *) &runp) >= 0)
1129 /* We already have an entry with this key. Check whether it is
1130 identical. */
1131 element_t *prevp = NULL;
1132 int cmpres;
1136 cmpres = wcscmp (elemp->name, runp->name);
1137 if (cmpres <= 0)
1138 break;
1139 prevp = runp;
1141 while ((runp = runp->next) != NULL);
1143 if (cmpres == 0)
1144 lr_error (lr, _("duplicate collating element definition"));
1145 else
1147 elemp->next = runp;
1148 if (prevp == NULL)
1150 if (set_entry (&collate->result, elemp->name, sizeof (wchar_t),
1151 elemp) < 0)
1152 error (EXIT_FAILURE, 0, _("\
1153 error while inserting collation element into hash table"));
1155 else
1156 prevp->next = elemp;
1159 else
1161 elemp->next = NULL;
1162 if (insert_entry (&collate->result, elemp->name, sizeof (wchar_t), elemp)
1163 < 0)
1164 error (EXIT_FAILURE, errno, _("error while inserting to hash table"));
1167 if (insert_entry (&collate->elements, collate->combine_token,
1168 collate->combine_token_len, (void *) elemp) < 0)
1169 lr_error (lr, _("cannot insert new collating symbol definition: %s"),
1170 strerror (errno));
1174 void
1175 collate_symbol (struct linereader *lr, struct localedef_t *locale,
1176 struct token *code, struct charset_t *charset)
1178 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1179 wchar_t value;
1180 void *not_used;
1182 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
1183 if (value != ILLEGAL_CHAR_VALUE)
1185 lr_error (lr, _("symbol for multicharacter collating element "
1186 "`%.*s' duplicates symbolic name in charset"),
1187 (int) code->val.str.len, code->val.str.start);
1188 return;
1191 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1192 &not_used) >= 0)
1194 lr_error (lr, _("symbol for multicharacter collating element "
1195 "`%.*s' duplicates element definition"),
1196 (int) code->val.str.len, code->val.str.start);
1197 return;
1200 if (find_entry (&collate->symbols, code->val.str.start, code->val.str.len,
1201 &not_used) >= 0)
1203 lr_error (lr, _("symbol for multicharacter collating element "
1204 "`%.*s' duplicates other symbol definition"),
1205 (int) code->val.str.len, code->val.str.start);
1206 return;
1209 if (insert_entry (&collate->symbols, code->val.str.start, code->val.str.len,
1210 (void *) 0) < 0)
1211 lr_error (lr, _("cannot insert new collating symbol definition: %s"),
1212 strerror (errno));
1216 void
1217 collate_new_order (struct linereader *lr, struct localedef_t *locale,
1218 enum coll_sort_rule sort_rule)
1220 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1222 if (collate->nrules >= collate->nrules_max)
1224 collate->nrules_max *= 2;
1225 collate->rules
1226 = (enum coll_sort_rule *) xrealloc (collate->rules,
1227 collate->nrules_max
1228 * sizeof (enum coll_sort_rule));
1231 collate->rules[collate->nrules++] = sort_rule;
1235 void
1236 collate_build_arrays (struct linereader *lr, struct localedef_t *locale)
1238 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1240 collate->rules
1241 = (enum coll_sort_rule *) xrealloc (collate->rules,
1242 collate->nrules
1243 * sizeof (enum coll_sort_rule));
1245 /* Allocate arrays for temporary weights. */
1246 collate->weight_cnt = (int *) xmalloc (collate->nrules * sizeof (int));
1248 /* Choose arbitrary start value for table size. */
1249 collate->nweight_max = 5 * collate->nrules;
1250 collate->weight = (int *) xmalloc (collate->nweight_max * sizeof (int));
1255 collate_order_elem (struct linereader *lr, struct localedef_t *locale,
1256 struct token *code, struct charset_t *charset)
1258 const wchar_t zero = L'\0';
1259 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1260 int result = 0;
1261 wchar_t value;
1262 void *tmp;
1263 unsigned int i;
1265 switch (code->tok)
1267 case tok_bsymbol:
1268 /* We have a string to find in one of the three hashing tables. */
1269 value = charset_find_value (charset, code->val.str.start,
1270 code->val.str.len);
1271 if (value != ILLEGAL_CHAR_VALUE)
1273 element_t *lastp, *firstp;
1275 collate->kind = character;
1277 if (find_entry (&collate->result, &value, sizeof (wchar_t),
1278 (void *) &firstp) < 0)
1279 firstp = lastp = NULL;
1280 else
1282 /* The entry for the simple character is always found at
1283 the end. */
1284 lastp = firstp;
1285 while (lastp->next != NULL)
1286 lastp = lastp->next;
1288 if (lastp->name[0] == value && lastp->name[1] == L'\0')
1290 lr_error (lr, _("duplicate definition for character `%.*s'"),
1291 (int) code->val.str.len, code->val.str.start);
1292 lr_ignore_rest (lr, 0);
1293 result = -1;
1294 break;
1298 collate->current_element
1299 = (element_t *) obstack_alloc (&collate->element_mem,
1300 sizeof (element_t));
1302 obstack_grow (&collate->element_mem, &value, sizeof (value));
1303 obstack_grow (&collate->element_mem, &zero, sizeof (zero));
1305 collate->current_element->name =
1306 (const wchar_t *) obstack_finish (&collate->element_mem);
1308 collate->current_element->this_weight = ++collate->order_cnt;
1310 collate->current_element->next = NULL;
1312 if (firstp == NULL)
1314 if (insert_entry (&collate->result, &value, sizeof (wchar_t),
1315 (void *) collate->current_element) < 0)
1317 lr_error (lr, _("cannot insert collation element `%.*s'"),
1318 (int) code->val.str.len, code->val.str.start);
1319 exit (4);
1322 else
1323 lastp->next = collate->current_element;
1325 else if (find_entry (&collate->elements, code->val.str.start,
1326 code->val.str.len, &tmp) >= 0)
1328 collate->current_element = (element_t *) tmp;
1330 if (collate->current_element->this_weight != 0)
1332 lr_error (lr, _("\
1333 collation element `%.*s' appears more than once: ignore line"),
1334 (int) code->val.str.len, code->val.str.start);
1335 lr_ignore_rest (lr, 0);
1336 result = -1;
1337 break;
1340 collate->kind = element;
1341 collate->current_element->this_weight = ++collate->order_cnt;
1343 else if (find_entry (&collate->symbols, code->val.str.start,
1344 code->val.str.len, &tmp) >= 0)
1346 unsigned int order = ++collate->order_cnt;
1348 if ((unsigned long int) tmp != 0ul)
1350 lr_error (lr, _("\
1351 collation symbol `%.*s' appears more than once: ignore line"),
1352 (int) code->val.str.len, code->val.str.start);
1353 lr_ignore_rest (lr, 0);
1354 result = -1;
1355 break;
1358 collate->kind = symbol;
1360 if (set_entry (&collate->symbols, code->val.str.start,
1361 code->val.str.len, (void *) order) < 0)
1363 lr_error (lr, _("cannot process order specification"));
1364 exit (4);
1367 else
1369 if (verbose)
1370 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1371 (int) code->val.str.len, code->val.str.start);
1372 lr_ignore_rest (lr, 0);
1374 result = -1;
1376 break;
1378 case tok_undefined:
1379 collate->kind = undefined;
1380 collate->current_element = &collate->undefined;
1381 break;
1383 case tok_ellipsis:
1384 if (collate->was_ellipsis)
1386 lr_error (lr, _("\
1387 two lines in a row containing `...' are not allowed"));
1388 result = -1;
1390 else if (collate->kind != character)
1392 /* An ellipsis requires the previous line to be an
1393 character definition. */
1394 lr_error (lr, _("\
1395 line before ellipsis does not contain definition for character constant"));
1396 lr_ignore_rest (lr, 0);
1397 result = -1;
1399 else
1400 collate->kind = ellipsis;
1401 break;
1403 default:
1404 assert (! "illegal token in `collate_order_elem'");
1407 /* Now it's time to handle the ellipsis in the previous line. We do
1408 this only when the last line contained an definition for a
1409 character, the current line also defines an character, the
1410 character code for the later is bigger than the former. */
1411 if (collate->was_ellipsis)
1413 if (collate->kind != character)
1415 lr_error (lr, _("\
1416 line after ellipsis must contain character definition"));
1417 lr_ignore_rest (lr, 0);
1418 result = -1;
1420 else if (collate->last_char > value)
1422 lr_error (lr, _("end point of ellipsis range is bigger then start"));
1423 lr_ignore_rest (lr, 0);
1424 result = -1;
1426 else
1428 /* We can fill the arrays with the information we need. */
1429 wchar_t name[2];
1430 unsigned int *data;
1431 size_t *ptr;
1432 size_t cnt;
1434 name[0] = collate->last_char + 1;
1435 name[1] = L'\0';
1437 data = (unsigned int *) alloca ((collate->nrules + collate->nweight)
1438 * sizeof (unsigned int));
1439 ptr = (size_t *) alloca (collate->nrules * sizeof (size_t));
1441 if (data == NULL || ptr == NULL)
1442 error (4, 0, _("memory exhausted"));
1444 /* Prepare data. Because the characters covered by an
1445 ellipsis all have equal values we prepare the data once
1446 and only change the variable number (if there are any).
1447 PTR[...] will point to the entries which will have to be
1448 fixed during the output loop. */
1449 for (cnt = 0; cnt < collate->nrules; ++cnt)
1451 data[cnt] = collate->weight_cnt[cnt];
1452 ptr[cnt] = (cnt == 0
1453 ? collate->nweight
1454 : ptr[cnt - 1] + collate->weight_cnt[cnt - 1]);
1457 for (cnt = 0; cnt < collate->nweight; ++cnt)
1458 data[collate->nrules + cnt] = collate->weight[cnt];
1460 for (cnt = 0; cnt < collate->nrules; ++cnt)
1461 if ((wchar_t) data[ptr[cnt]] != ELLIPSIS_CHAR)
1462 ptr[cnt] = 0;
1464 while (name[0] <= value)
1466 element_t *pelem;
1468 pelem = (element_t *) obstack_alloc (&collate->element_mem,
1469 sizeof (element_t));
1470 if (pelem == NULL)
1471 error (4, 0, _("memory exhausted"));
1473 pelem->name
1474 = (const wchar_t *) obstack_copy (&collate->element_mem,
1475 name, 2 * sizeof (wchar_t));
1476 pelem->this_weight = ++collate->order_cnt;
1478 pelem->ordering_len = collate->nweight;
1479 pelem->ordering
1480 = (unsigned int *) obstack_copy (&collate->element_mem, data,
1481 (collate->nrules
1482 * pelem->ordering_len)
1483 * sizeof (unsigned int));
1485 /* `...' weights need to be adjusted. */
1486 for (cnt = 0; cnt < collate->nrules; ++cnt)
1487 if (ptr[cnt] != 0)
1488 pelem->ordering[ptr[cnt]] = pelem->this_weight;
1490 /* Insert new entry into result table. */
1491 if (find_entry (&collate->result, name, sizeof (wchar_t),
1492 (void *) &pelem->next) >= 0)
1494 if (set_entry (&collate->result, name, sizeof (wchar_t),
1495 (void *) pelem->next) < 0)
1496 error (4, 0, _("cannot insert into result table"));
1498 else
1499 if (insert_entry (&collate->result, name, sizeof (wchar_t),
1500 (void *) pelem->next) < 0)
1501 error (4, 0, _("cannot insert into result table"));
1503 /* Increment counter. */
1504 ++name[0];
1509 /* Reset counters for weights. */
1510 collate->weight_idx = 0;
1511 collate->nweight = 0;
1512 for (i = 0; i < collate->nrules; ++i)
1513 collate->weight_cnt[i] = 0;
1514 collate->current_patch = NULL;
1516 return result;
1521 collate_weight_bsymbol (struct linereader *lr, struct localedef_t *locale,
1522 struct token *code, struct charset_t *charset)
1524 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1525 unsigned int here_weight;
1526 wchar_t value;
1527 void *tmp;
1529 assert (code->tok == tok_bsymbol);
1531 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
1532 if (value != ILLEGAL_CHAR_VALUE)
1534 element_t *runp;
1536 if (find_entry (&collate->result, &value, sizeof (wchar_t),
1537 (void *)&runp) < 0)
1538 runp = NULL;
1540 while (runp != NULL
1541 && (runp->name[0] != value || runp->name[1] != L'\0'))
1542 runp = runp->next;
1544 here_weight = runp == NULL ? 0 : runp->this_weight;
1546 else if (find_entry (&collate->elements, code->val.str.start,
1547 code->val.str.len, &tmp) >= 0)
1549 element_t *runp = (element_t *) tmp;
1551 here_weight = runp->this_weight;
1553 else if (find_entry (&collate->symbols, code->val.str.start,
1554 code->val.str.len, &tmp) >= 0)
1556 here_weight = (unsigned int) tmp;
1558 else
1560 if (verbose)
1561 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1562 (int) code->val.str.len, code->val.str.start);
1563 lr_ignore_rest (lr, 0);
1564 return -1;
1567 /* When we currently work on a collation symbol we do not expect any
1568 weight. */
1569 if (collate->kind == symbol)
1571 lr_error (lr, _("\
1572 specification of sorting weight for collation symbol does not make sense"));
1573 lr_ignore_rest (lr, 0);
1574 return -1;
1577 /* Add to the current collection of weights. */
1578 if (collate->nweight >= collate->nweight_max)
1580 collate->nweight_max *= 2;
1581 collate->weight = (unsigned int *) xrealloc (collate->weight,
1582 collate->nweight_max);
1585 /* If the weight is currently not known, we remember to patch the
1586 resulting tables. */
1587 if (here_weight == 0)
1589 patch_t *newp;
1591 newp = (patch_t *) obstack_alloc (&collate->element_mem,
1592 sizeof (patch_t));
1593 newp->fname = lr->fname;
1594 newp->lineno = lr->lineno;
1595 newp->token = (const char *) obstack_copy0 (&collate->element_mem,
1596 code->val.str.start,
1597 code->val.str.len);
1598 newp->where.idx = collate->nweight++;
1599 newp->next = collate->current_patch;
1600 collate->current_patch = newp;
1602 else
1603 collate->weight[collate->nweight++] = here_weight;
1604 ++collate->weight_cnt[collate->weight_idx];
1606 return 0;
1611 collate_next_weight (struct linereader *lr, struct localedef_t *locale)
1613 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1615 if (collate->kind == symbol)
1617 lr_error (lr, _("\
1618 specification of sorting weight for collation symbol does not make sense"));
1619 lr_ignore_rest (lr, 0);
1620 return -1;
1623 ++collate->weight_idx;
1624 if (collate->weight_idx >= collate->nrules)
1626 lr_error (lr, _("too many weights"));
1627 lr_ignore_rest (lr, 0);
1628 return -1;
1631 return 0;
1636 collate_simple_weight (struct linereader *lr, struct localedef_t *locale,
1637 struct token *code, struct charset_t *charset)
1639 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1640 unsigned int value = 0;
1642 /* There current tokens can be `IGNORE', `...', or a string. */
1643 switch (code->tok)
1645 case tok_ignore:
1646 /* This token is allowed in all situations. */
1647 value = IGNORE_CHAR;
1648 break;
1650 case tok_ellipsis:
1651 /* The ellipsis is only allowed for the `...' or `UNDEFINED'
1652 entry. */
1653 if (collate->kind != ellipsis && collate->kind != undefined)
1655 lr_error (lr, _("\
1656 `...' must only be used in `...' and `UNDEFINED' entries"));
1657 lr_ignore_rest (lr, 0);
1658 return -1;
1660 value = ELLIPSIS_CHAR;
1661 break;
1663 case tok_string:
1664 /* This can become difficult. We have to get the weights which
1665 correspond to the single wide chars in the string. But some
1666 of the `chars' might not be real characters, but collation
1667 elements or symbols. And so the string decoder might have
1668 signaled errors. The string at this point is not translated.
1669 I.e., all <...> sequences are still there. */
1671 char *runp = code->val.str.start;
1672 void *tmp;
1674 while (*runp != '\0')
1676 char *startp = (char *) runp;
1677 char *putp = (char *) runp;
1678 wchar_t wch;
1680 /* Lookup weight for char and store it. */
1681 if (*runp == '<')
1683 while (*++runp != '\0' && *runp != '>')
1685 if (*runp == lr->escape_char)
1686 if (*++runp == '\0')
1688 lr_error (lr, _("unterminated weight name"));
1689 lr_ignore_rest (lr, 0);
1690 return -1;
1692 *putp++ = *runp;
1694 if (*runp == '>')
1695 ++runp;
1697 if (putp == startp)
1699 lr_error (lr, _("empty weight name: line ignored"));
1700 lr_ignore_rest (lr, 0);
1701 return -1;
1704 wch = charset_find_value (charset, startp, putp - startp);
1705 if (wch != ILLEGAL_CHAR_VALUE)
1707 element_t *pelem;
1709 if (find_entry (&collate->result, &wch, sizeof (wchar_t),
1710 (void *)&pelem) < 0)
1711 pelem = NULL;
1713 while (pelem != NULL
1714 && (pelem->name[0] != wch
1715 || pelem->name[1] != L'\0'))
1716 pelem = pelem->next;
1718 value = pelem == NULL ? 0 : pelem->this_weight;
1720 else if (find_entry (&collate->elements, startp, putp - startp,
1721 &tmp) >= 0)
1723 element_t *pelem = (element_t *) tmp;
1725 value = pelem->this_weight;
1727 else if (find_entry (&collate->symbols, startp, putp - startp,
1728 &tmp) >= 0)
1730 value = (unsigned int) tmp;
1732 else
1734 if (verbose)
1735 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1736 (int) (putp - startp), startp);
1737 lr_ignore_rest (lr, 0);
1738 return -1;
1741 else
1743 element_t *wp;
1744 wchar_t wch;
1746 if (*runp == lr->escape_char)
1748 static const char digits[] = "0123456789abcdef";
1749 const char *dp;
1750 int base;
1752 ++runp;
1753 if (tolower (*runp) == 'x')
1755 ++runp;
1756 base = 16;
1758 else if (tolower (*runp) == 'd')
1760 ++runp;
1761 base = 10;
1763 else
1764 base = 8;
1766 dp = strchr (digits, tolower (*runp));
1767 if (dp == NULL || (dp - digits) >= base)
1769 illegal_char:
1770 lr_error (lr, _("\
1771 illegal character constant in string"));
1772 lr_ignore_rest (lr, 0);
1773 return -1;
1775 wch = dp - digits;
1776 ++runp;
1778 dp = strchr (digits, tolower (*runp));
1779 if (dp == NULL || (dp - digits) >= base)
1780 goto illegal_char;
1781 wch *= base;
1782 wch += dp - digits;
1783 ++runp;
1785 if (base != 16)
1787 dp = strchr (digits, tolower (*runp));
1788 if (dp != NULL && (dp - digits < base))
1790 wch *= base;
1791 wch += dp - digits;
1792 ++runp;
1796 else
1797 wch = (wchar_t) *runp++;
1799 /* Lookup the weight for WCH. */
1800 if (find_entry (&collate->result, &wch, sizeof (wch),
1801 (void *)&wp) < 0)
1802 wp = NULL;
1804 while (wp != NULL
1805 && (wp->name[0] != wch || wp->name[1] != L'\0'))
1806 wp = wp->next;
1808 value = wp == NULL ? 0 : wp->this_weight;
1810 /* To get the correct name for the error message. */
1811 putp = runp;
1813 /**************************************************\
1814 |* I know here is something wrong. Characters in *|
1815 |* the string which are not in the <...> form *|
1816 |* cannot be declared forward for now!!! *|
1817 \**************************************************/
1820 /* Store in weight array. */
1821 if (collate->nweight >= collate->nweight_max)
1823 collate->nweight_max *= 2;
1824 collate->weight
1825 = (unsigned int *) xrealloc (collate->weight,
1826 collate->nweight_max);
1829 if (value == 0)
1831 patch_t *newp;
1833 newp = (patch_t *) obstack_alloc (&collate->element_mem,
1834 sizeof (patch_t));
1835 newp->fname = lr->fname;
1836 newp->lineno = lr->lineno;
1837 newp->token
1838 = (const char *) obstack_copy0 (&collate->element_mem,
1839 startp, putp - startp);
1840 newp->where.idx = collate->nweight++;
1841 newp->next = collate->current_patch;
1842 collate->current_patch = newp;
1844 else
1845 collate->weight[collate->nweight++] = value;
1846 ++collate->weight_cnt[collate->weight_idx];
1849 return 0;
1851 default:
1852 assert (! "should not happen");
1856 if (collate->nweight >= collate->nweight_max)
1858 collate->nweight_max *= 2;
1859 collate->weight = (unsigned int *) xrealloc (collate->weight,
1860 collate->nweight_max);
1863 collate->weight[collate->nweight++] = value;
1864 ++collate->weight_cnt[collate->weight_idx];
1866 return 0;
1870 void
1871 collate_end_weight (struct linereader *lr, struct localedef_t *locale)
1873 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1874 element_t *pelem = collate->current_element;
1876 if (collate->kind == symbol)
1878 /* We don't have to do anything. */
1879 collate->was_ellipsis = 0;
1880 return;
1883 if (collate->kind == ellipsis)
1885 /* Before the next line is processed the ellipsis is handled. */
1886 collate->was_ellipsis = 1;
1887 return;
1890 assert (collate->kind == character || collate->kind == element
1891 || collate->kind == undefined);
1893 /* Fill in the missing weights. */
1894 while (++collate->weight_idx < collate->nrules)
1896 collate->weight[collate->nweight++] = pelem->this_weight;
1897 ++collate->weight_cnt[collate->weight_idx];
1900 /* Now we know how many ordering weights the current
1901 character/element has. Allocate room in the element structure
1902 and copy information. */
1903 pelem->ordering_len = collate->nweight;
1905 /* First we write an array with the number of values for each
1906 weight. */
1907 obstack_grow (&collate->element_mem, collate->weight_cnt,
1908 collate->nrules * sizeof (unsigned int));
1910 /* Now the weights itselves. */
1911 obstack_grow (&collate->element_mem, collate->weight,
1912 collate->nweight * sizeof (unsigned int));
1914 /* Get result. */
1915 pelem->ordering = obstack_finish (&collate->element_mem);
1917 /* Now we handle the "patches". */
1918 while (collate->current_patch != NULL)
1920 patch_t *this_patch;
1922 this_patch = collate->current_patch;
1924 this_patch->where.pos = &pelem->ordering[collate->nrules
1925 + this_patch->where.idx];
1927 collate->current_patch = this_patch->next;
1928 this_patch->next = collate->all_patches;
1929 collate->all_patches = this_patch;
1932 /* Set information for next round. */
1933 collate->was_ellipsis = 0;
1934 if (collate->kind != undefined)
1935 collate->last_char = pelem->name[0];