Thu May 9 01:24:00 1996 Ulrich Drepper <drepper@cygnus.com>
[glibc/history.git] / locale / programs / ld-collate.c
blob629df90ced1dec8fa17f16b810bc9c103e141d43
1 /* Copyright (C) 1995, 1996 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If
17 not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <endian.h>
25 #include <errno.h>
26 #include <limits.h>
27 #include <locale.h>
28 #include <obstack.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <wchar.h>
33 #include "localeinfo.h"
34 #include "locales.h"
35 #include "simple-hash.h"
36 #include "stringtrans.h"
38 /* Uncomment the following line in the production version. */
39 /* #define NDEBUG 1 */
40 #include <assert.h>
43 #define MAX(a, b) ((a) > (b) ? (a) : (b))
45 #define SWAPU32(w) \
46 (((w) << 24) | (((w) & 0xff00) << 8) | (((w) >> 8) & 0xff00) | ((w) >> 24))
49 /* What kind of symbols get defined? */
50 enum coll_symbol
52 undefined,
53 ellipsis,
54 character,
55 element,
56 symbol
60 typedef struct patch_t
62 const char *fname;
63 size_t lineno;
64 const char *token;
65 union
67 unsigned int *pos;
68 size_t idx;
69 } where;
70 struct patch_t *next;
71 } patch_t;
74 typedef struct element_t
76 const wchar_t *name;
77 unsigned int this_weight;
79 struct element_t *next;
81 unsigned int *ordering;
82 size_t ordering_len;
83 } element_t;
86 /* The real definition of the struct for the LC_CTYPE locale. */
87 struct locale_collate_t
89 /* Collate symbol table. Simple mapping to number. */
90 hash_table symbols;
92 /* The collation elements. */
93 hash_table elements;
94 struct obstack element_mem;
96 /* The result table. */
97 hash_table result;
99 /* Sorting rules given in order_start line. */
100 int nrules;
101 int nrules_max;
102 enum coll_sort_rule *rules;
104 /* Used while recognizing symbol composed of multiple tokens
105 (collating-element). */
106 const char *combine_token;
107 size_t combine_token_len;
109 /* How many sorting order specifications so far. */
110 unsigned int order_cnt;
112 /* Was lastline ellipsis? */
113 int was_ellipsis;
114 /* Value of last entry if was character. */
115 wchar_t last_char;
116 /* Current element. */
117 element_t *current_element;
118 /* What kind of symbol is current element. */
119 enum coll_symbol kind;
121 /* While collecting the weigths we need some temporary space. */
122 unsigned int current_order;
123 int *weight_cnt;
124 int weight_idx;
125 unsigned int *weight;
126 int nweight;
127 int nweight_max;
129 /* Patch lists. */
130 patch_t *current_patch;
131 patch_t *all_patches;
133 /* Room for the UNDEFINED information. */
134 element_t undefined;
135 unsigned int undefined_len;
139 /* Be verbose? Defined in localedef.c. */
140 extern int verbose;
143 void *xmalloc (size_t __n);
144 void *xrealloc (void *__p, size_t __n);
147 #define obstack_chunk_alloc xmalloc
148 #define obstack_chunk_free free
151 void
152 collate_startup (struct linereader *lr, struct localedef_t *locale,
153 struct charset_t *charset)
155 struct locale_collate_t *collate;
157 /* It is important that we always use UCS4 encoding for strings now. */
158 encoding_method = ENC_UCS4;
160 /* Allocate the needed room. */
161 locale->categories[LC_COLLATE].collate = collate =
162 (struct locale_collate_t *) xmalloc (sizeof (struct locale_collate_t));
164 /* Allocate hash table for collating elements. */
165 if (init_hash (&collate->elements, 512))
166 error (4, 0, _("memory exhausted"));
167 collate->combine_token = NULL;
168 obstack_init (&collate->element_mem);
170 /* Allocate hash table for collating elements. */
171 if (init_hash (&collate->symbols, 64))
172 error (4, 0, _("memory exhausted"));
174 /* Allocate hash table for result. */
175 if (init_hash (&collate->result, 512))
176 error (4, 0, _("memory exhausted"));
178 collate->nrules = 0;
179 collate->nrules_max = 10;
180 collate->rules
181 = (enum coll_sort_rule *) xmalloc (collate->nrules_max
182 * sizeof (enum coll_sort_rule));
184 collate->order_cnt = 1; /* The smallest weight is 2. */
186 collate->was_ellipsis = 0;
187 collate->last_char = L'\0'; /* 0 because leading ellipsis is allowed. */
189 collate->all_patches = NULL;
191 /* This tells us no UNDEFINED entry was found until now. */
192 collate->undefined.this_weight = 0;
194 lr->translate_strings = 0;
198 void
199 collate_finish (struct localedef_t *locale, struct charset_t *charset)
201 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
202 patch_t *patch;
203 size_t cnt;
205 /* Patch the constructed table so that forward references are
206 correctly filled. */
207 for (patch = collate->all_patches; patch != NULL; patch = patch->next)
209 wchar_t wch;
210 size_t toklen = strlen (patch->token);
211 void *ptmp;
212 unsigned int value = 0;
214 wch = charset_find_value (charset, patch->token, toklen);
215 if (wch != ILLEGAL_CHAR_VALUE)
217 element_t *runp;
219 if (find_entry (&collate->result, &wch, sizeof (wchar_t),
220 (void *) &runp) < 0)
221 runp = NULL;
222 for (; runp != NULL; runp = runp->next)
223 if (runp->name[0] == wch && runp->name[1] == L'\0')
224 break;
226 value = runp == NULL ? 0 : runp->this_weight;
228 else if (find_entry (&collate->elements, patch->token, toklen, &ptmp)
229 >= 0)
231 value = ((element_t *) ptmp)->this_weight;
233 else if (find_entry (&collate->symbols, patch->token, toklen, &ptmp)
234 >= 0)
236 value = (unsigned int) ptmp;
238 else
239 value = 0;
241 if (value == 0)
242 error_at_line (0, 0, patch->fname, patch->lineno,
243 _("no weight defined for symbol `%s'"), patch->token);
244 else
245 *patch->where.pos = value;
248 /* If no definition for UNDEFINED is given, all characters in the
249 given charset must be specified. */
250 if (collate->undefined.ordering == NULL)
252 /**************************************************************\
253 |* XXX We should test whether really an unspecified character *|
254 |* exists before giving the message. *|
255 \**************************************************************/
256 u_int32_t weight;
258 error (0, 0, _("no definition of `UNDEFINED'"));
260 collate->undefined.ordering_len = collate->nrules;
261 weight = ++collate->order_cnt;
263 for (cnt = 0; cnt < collate->nrules; ++cnt)
265 u_int32_t one = 1;
266 obstack_grow (&collate->element_mem, &one, sizeof (one));
269 for (cnt = 0; cnt < collate->nrules; ++cnt)
270 obstack_grow (&collate->element_mem, &weight, sizeof (weight));
272 collate->undefined.ordering = obstack_finish (&collate->element_mem);
275 collate->undefined_len = 2; /* For the name: 1 x wchar_t + L'\0'. */
276 for (cnt = 0; cnt < collate->nrules; ++cnt)
277 collate->undefined_len += 1 + collate->undefined.ordering[cnt];
279 /* Collating symbols are not used anymore. */
280 (void) delete_hash (&collate->symbols);
285 void
286 collate_output (struct localedef_t *locale, const char *output_path)
288 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
289 u_int32_t table_size, table_best, level_best, sum_best;
290 void *last;
291 element_t *pelem;
292 wchar_t *name;
293 size_t len;
294 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
295 struct iovec iov[2 + nelems];
296 struct locale_file data;
297 u_int32_t idx[nelems];
298 struct obstack non_simple;
299 size_t cnt, entry_size;
300 u_int32_t undefined_offset = UINT_MAX;
301 u_int32_t *table, *extra, *table2, *extra2;
302 size_t extra_len;
304 sum_best = UINT_MAX;
305 table_best = 0xffff;
306 level_best = 0xffff;
308 /* Compute table size. */
309 fputs (_("\
310 Computing table size for collation information might take a while..."),
311 stderr);
312 for (table_size = 256; table_size < sum_best; ++table_size)
314 size_t hits[table_size];
315 unsigned int worst = 1;
316 size_t cnt;
318 last = NULL;
320 for (cnt = 0; cnt < 256; ++cnt)
321 hits[cnt] = 1;
322 memset (&hits[256], '\0', sizeof (hits) - 256 * sizeof (size_t));
324 while (iterate_table (&collate->result, &last, (const void **) &name,
325 &len, (void **) &pelem) >= 0)
326 if (pelem->ordering != NULL && pelem->name[0] > 0xff)
327 if (++hits[(unsigned int) pelem->name[0] % table_size] > worst)
329 worst = hits[(unsigned int) pelem->name[0] % table_size];
330 if (table_size * worst > sum_best)
331 break;
334 if (table_size * worst < sum_best)
336 sum_best = table_size * worst;
337 table_best = table_size;
338 level_best = worst;
341 assert (table_best != 0xffff || level_best != 0xffff);
342 fputs (_(" done\n"), stderr);
344 obstack_init (&non_simple);
346 data.magic = LIMAGIC (LC_COLLATE);
347 data.n = nelems;
348 iov[0].iov_base = (void *) &data;
349 iov[0].iov_len = sizeof (data);
351 iov[1].iov_base = (void *) idx;
352 iov[1].iov_len = sizeof (idx);
354 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_NRULES)].iov_base = &collate->nrules;
355 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_NRULES)].iov_len = sizeof (u_int32_t);
357 table = (u_int32_t *) alloca (collate->nrules * sizeof (u_int32_t));
358 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_RULES)].iov_base = table;
359 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_RULES)].iov_len
360 = collate->nrules * sizeof (u_int32_t);
361 /* Another trick here. Describing the collation method needs only a
362 few bits (3, to be exact). But the binary file should be
363 accessible by maschines with both endianesses and so we store both
364 information in the same word. */
365 for (cnt = 0; cnt < collate->nrules; ++cnt)
366 table[cnt] = collate->rules[cnt] | SWAPU32 (collate->rules[cnt]);
368 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].iov_base = &table_best;
369 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].iov_len = sizeof (u_int32_t);
371 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].iov_base = &level_best;
372 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].iov_len
373 = sizeof (u_int32_t);
375 entry_size = 1 + MAX (collate->nrules, 2);
377 table = (u_int32_t *) alloca (table_best * level_best * entry_size
378 * sizeof (table[0]));
379 memset (table, '\0', table_best * level_best * entry_size
380 * sizeof (table[0]));
383 /* Macros for inserting in output table. */
384 #define ADD_VALUE(expr) \
385 do { \
386 u_int32_t to_write = (u_int32_t) expr; \
387 obstack_grow (&non_simple, &to_write, sizeof (to_write)); \
388 } while (0)
390 #define ADD_ELEMENT(pelem, len) \
391 do { \
392 size_t cnt, idx; \
394 ADD_VALUE (len); \
396 wlen = wcslen (pelem->name); \
397 obstack_grow (&non_simple, pelem->name, (wlen + 1) * sizeof (u_int32_t)); \
399 idx = collate->nrules; \
400 for (cnt = 0; cnt < collate->nrules; ++cnt) \
402 size_t disp; \
404 ADD_VALUE (pelem->ordering[cnt]); \
405 for (disp = 0; disp < pelem->ordering[cnt]; ++disp) \
406 ADD_VALUE (pelem->ordering[idx++]); \
408 } while (0)
410 #define ADD_FORWARD(pelem) \
411 do { \
412 /* We leave a reference in the main table and put all \
413 information in the table for the extended entries. */ \
414 element_t *runp; \
415 element_t *has_simple = NULL; \
416 size_t wlen; \
418 table[(level * table_best + slot) * entry_size + 1] \
419 = FORWARD_CHAR; \
420 table[(level * table_best + slot) * entry_size + 2] \
421 = obstack_object_size (&non_simple) / sizeof (u_int32_t); \
423 /* Here we have to construct the non-simple table entry. First \
424 compute the total length of this entry. */ \
425 for (runp = (pelem); runp != NULL; runp = runp->next) \
426 if (runp->ordering != NULL) \
428 u_int32_t value; \
429 size_t cnt; \
431 value = 1 + wcslen (runp->name) + 1; \
433 for (cnt = 0; cnt < collate->nrules; ++cnt) \
434 /* We have to take care for entries without ordering \
435 information. While reading them they get inserted in the \
436 table and later not removed when something goes wrong with \
437 reading its weights. */ \
439 value += 1 + runp->ordering[cnt]; \
441 if (runp->name[1] == L'\0') \
442 has_simple = runp; \
445 ADD_ELEMENT (runp, value); \
448 if (has_simple == NULL) \
450 size_t idx, cnt; \
452 ADD_VALUE (collate->undefined_len + 1); \
454 /* Add the name. */ \
455 ADD_VALUE ((pelem)->name[0]); \
456 ADD_VALUE (0); \
458 idx = collate->nrules; \
459 for (cnt = 0; cnt < collate->nrules; ++cnt) \
461 size_t disp; \
463 ADD_VALUE (collate->undefined.ordering[cnt]); \
464 for (disp = 0; disp < collate->undefined.ordering[cnt]; ++disp) \
466 if (collate->undefined.ordering[idx] == ELLIPSIS_CHAR) \
467 ADD_VALUE ((pelem)->name[0]); \
468 else \
469 ADD_VALUE (collate->undefined.ordering[idx++]); \
470 ++idx; \
474 } while (0)
478 /* Fill the table now. First we look for all the characters which
479 fit into one single byte. This speeds up the 8-bit string
480 functions. */
481 last = NULL;
482 while (iterate_table (&collate->result, &last, (const void **) &name,
483 &len, (void **) &pelem) >= 0)
484 if (pelem->name[0] <= 0xff)
486 /* We have a single byte name. Now we must distinguish
487 between entries in simple form (i.e., only one value per
488 weight and no collation element starting with the same
489 character) and those which are not. */
490 size_t slot = ((size_t) pelem->name[0]);
491 const size_t level = 0;
493 table[slot * entry_size] = pelem->name[0];
495 if (pelem->name[1] == L'\0' && pelem->next == NULL
496 && pelem->ordering_len == collate->nrules)
498 /* Yes, we have a simple one. Lucky us. */
499 size_t cnt;
501 for (cnt = 0; cnt < collate->nrules; ++cnt)
502 table[slot * entry_size + 1 + cnt]
503 = pelem->ordering[collate->nrules + cnt];
505 else
506 ADD_FORWARD (pelem);
509 /* Now check for missing single byte entries. If one exist we fill
510 with the UNDEFINED entry. */
511 for (cnt = 0; cnt < 256; ++cnt)
512 /* The first weight is never 0 for existing entries. */
513 if (table[cnt * entry_size + 1] == 0)
515 /* We have to fill in the information from the UNDEFINED
516 entry. */
517 table[cnt * entry_size] = (u_int32_t) cnt;
519 if (collate->undefined.ordering_len == collate->nrules)
521 size_t inner;
523 for (inner = 0; inner < collate->nrules; ++inner)
524 if (collate->undefined.ordering[collate->nrules + inner]
525 == ELLIPSIS_CHAR)
526 table[cnt * entry_size + 1 + inner] = cnt;
527 else
528 table[cnt * entry_size + 1 + inner]
529 = collate->undefined.ordering[collate->nrules + inner];
531 else
533 if (undefined_offset != UINT_MAX)
535 table[cnt * entry_size + 1] = FORWARD_CHAR;
536 table[cnt * entry_size + 2] = undefined_offset;
538 else
540 const size_t slot = cnt;
541 const size_t level = 0;
543 ADD_FORWARD (&collate->undefined);
544 undefined_offset = table[cnt * entry_size + 2];
549 /* Now we are ready for inserting the whole rest. */
550 last = NULL;
551 while (iterate_table (&collate->result, &last, (const void **) &name,
552 &len, (void **) &pelem) >= 0)
553 if (pelem->name[0] > 0xff)
555 /* Find the position. */
556 size_t slot = ((size_t) pelem->name[0]) % table_best;
557 size_t level = 0;
559 while (table[(level * table_best + slot) * entry_size + 1] != 0)
560 ++level;
561 assert (level < level_best);
563 if (pelem->name[1] == L'\0' && pelem->next == NULL
564 && pelem->ordering_len == collate->nrules)
566 /* Again a simple entry. */
567 size_t inner;
569 for (inner = 0; inner < collate->nrules; ++inner)
570 table[(level * table_best + slot) * entry_size + 1 + inner]
571 = pelem->ordering[collate->nrules + inner];
573 else
574 ADD_FORWARD (pelem);
577 /* Add the UNDEFINED entry. */
579 /* Here we have to construct the non-simple table entry. */
580 size_t idx, cnt;
582 undefined_offset = obstack_object_size (&non_simple);
584 idx = collate->nrules;
585 for (cnt = 0; cnt < collate->nrules; ++cnt)
587 size_t disp;
589 ADD_VALUE (collate->undefined.ordering[cnt]);
590 for (disp = 0; disp < collate->undefined.ordering[cnt]; ++disp)
591 ADD_VALUE (collate->undefined.ordering[idx++]);
595 /* Finish the extra block. */
596 extra_len = obstack_object_size (&non_simple);
597 extra = (u_int32_t *) obstack_finish (&non_simple);
598 assert ((extra_len % sizeof (u_int32_t)) == 0);
600 /* Now we have to build the two array for the other byte ordering. */
601 table2 = (u_int32_t *) alloca (table_best * level_best * entry_size
602 * sizeof (table[0]));
603 extra2 = (u_int32_t *) alloca (extra_len);
605 for (cnt = 0; cnt < table_best * level_best * entry_size; ++cnt)
606 table2[cnt] = SWAPU32 (table[cnt]);
608 for (cnt = 0; cnt < extra_len / sizeof (u_int32_t); ++cnt)
609 extra2[cnt] = SWAPU32 (extra2[cnt]);
611 /* Store table adresses and lengths. */
612 #if __BYTE_ORDER == __BIG_ENDIAN
613 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_base = table;
614 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_len
615 = table_best * level_best * entry_size * sizeof (table[0]);
617 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_base = table2;
618 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_len
619 = table_best * level_best * entry_size * sizeof (table[0]);
621 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_base = extra;
622 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_len = extra_len;
624 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_base = extra2;
625 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_len = extra_len;
626 #else
627 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_base = table2;
628 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_len
629 = table_best * level_best * entry_size * sizeof (table[0]);
631 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_base = table;
632 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_len
633 = table_best * level_best * entry_size * sizeof (table[0]);
635 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_base = extra2;
636 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_len = extra_len;
638 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_base = extra;
639 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_len = extra_len;
640 #endif
642 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED)].iov_base = &undefined_offset;
643 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED)].iov_len = sizeof (u_int32_t);
645 /* Update idx array. */
646 idx[0] = iov[0].iov_len + iov[1].iov_len;
647 for (cnt = 1; cnt < nelems; ++cnt)
648 idx[cnt] = idx[cnt - 1] + iov[1 + cnt].iov_len;
650 write_locale_data (output_path, "LC_COLLATE", 2 + nelems, iov);
654 void
655 collate_element_to (struct linereader *lr, struct localedef_t *locale,
656 struct token *code, struct charset_t *charset)
658 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
659 unsigned int value;
660 void *not_used;
662 if (collate->combine_token != NULL)
664 free ((void *) collate->combine_token);
665 collate->combine_token = NULL;
668 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
669 if (value != ILLEGAL_CHAR_VALUE)
671 lr_error (lr, _("symbol for multicharacter collating element "
672 "`%.*s' duplicates symbolic name in charset"),
673 code->val.str.len, code->val.str.start);
674 return;
677 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
678 &not_used) >= 0)
680 lr_error (lr, _("symbol for multicharacter collating element "
681 "`%.*s' duplicates other element definition"),
682 code->val.str.len, code->val.str.start);
683 return;
686 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
687 &not_used) >= 0)
689 lr_error (lr, _("symbol for multicharacter collating element "
690 "`%.*s' duplicates symbol definition"),
691 code->val.str.len, code->val.str.start);
692 return;
695 collate->combine_token = code->val.str.start;
696 collate->combine_token_len = code->val.str.len;
700 void
701 collate_element_from (struct linereader *lr, struct localedef_t *locale,
702 struct token *code, struct charset_t *charset)
704 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
705 element_t *elemp, *runp;
707 /* CODE is a string. */
708 elemp = (element_t *) obstack_alloc (&collate->element_mem,
709 sizeof (element_t));
711 /* We have to translate the string. It may contain <...> character
712 names. */
713 elemp->name = (wchar_t *) translate_string (code->val.str.start, charset);
714 elemp->this_weight = 0;
715 elemp->ordering = NULL;
716 elemp->ordering_len = 0;
718 free (code->val.str.start);
720 if (elemp->name == NULL)
722 /* At least one character in the string is not defined. We simply
723 do nothing. */
724 if (verbose)
725 lr_error (lr, _("\
726 `from' string in collation element declaration contains unknown character"));
727 return;
730 if (elemp->name[0] == L'\0' || elemp->name[1] == L'\0')
732 lr_error (lr, _("illegal colltion element"));
733 return;
736 /* The entries in the linked lists of RESULT are sorting in
737 descending order. The order is important for the `strcoll' and
738 `wcscoll' functions. */
739 if (find_entry (&collate->result, elemp->name, sizeof (wchar_t),
740 (void *) &runp) >= 0)
742 /* We already have an entry with this key. Check whether it is
743 identical. */
744 element_t *prevp = NULL;
745 int cmpres;
749 cmpres = wcscmp (elemp->name, runp->name);
750 if (cmpres <= 0)
751 break;
752 prevp = runp;
754 while ((runp = runp->next) != NULL);
756 if (cmpres == 0)
757 lr_error (lr, _("duplicate collating element definition"));
758 else
760 elemp->next = runp;
761 if (prevp == NULL)
763 if (set_entry (&collate->result, elemp->name, sizeof (wchar_t),
764 elemp) < 0)
765 error (EXIT_FAILURE, 0,
766 _("\
767 error while inserting collation element into hash table"));
769 else
770 prevp->next = elemp;
773 else
775 elemp->next = NULL;
776 if (insert_entry (&collate->result, elemp->name, sizeof (wchar_t), elemp)
777 < 0)
778 error (EXIT_FAILURE, errno, _("error while inserting to hash table"));
781 if (insert_entry (&collate->elements, collate->combine_token,
782 collate->combine_token_len, (void *) elemp) < 0)
783 lr_error (lr, _("cannot insert new collating symbol definition: %s"),
784 strerror (errno));
788 void
789 collate_symbol (struct linereader *lr, struct localedef_t *locale,
790 struct token *code, struct charset_t *charset)
792 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
793 wchar_t value;
794 void *not_used;
796 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
797 if (value != ILLEGAL_CHAR_VALUE)
799 lr_error (lr, _("symbol for multicharacter collating element "
800 "`%.*s' duplicates symbolic name in charset"),
801 code->val.str.len, code->val.str.start);
802 return;
805 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
806 &not_used) >= 0)
808 lr_error (lr, _("symbol for multicharacter collating element "
809 "`%.*s' duplicates element definition"),
810 code->val.str.len, code->val.str.start);
811 return;
814 if (find_entry (&collate->symbols, code->val.str.start, code->val.str.len,
815 &not_used) >= 0)
817 lr_error (lr, _("symbol for multicharacter collating element "
818 "`%.*s' duplicates other symbol definition"),
819 code->val.str.len, code->val.str.start);
820 return;
823 if (insert_entry (&collate->symbols, code->val.str.start, code->val.str.len,
824 (void *) 0) < 0)
825 lr_error (lr, _("cannot insert new collating symbol definition: %s"),
826 strerror (errno));
830 void
831 collate_new_order (struct linereader *lr, struct localedef_t *locale,
832 enum coll_sort_rule sort_rule)
834 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
836 if (collate->nrules >= collate->nrules_max)
838 collate->nrules_max *= 2;
839 collate->rules
840 = (enum coll_sort_rule *) xrealloc (collate->rules,
841 collate->nrules_max
842 * sizeof (enum coll_sort_rule));
845 collate->rules[collate->nrules++] = sort_rule;
849 void
850 collate_build_arrays (struct linereader *lr, struct localedef_t *locale)
852 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
854 collate->rules
855 = (enum coll_sort_rule *) xrealloc (collate->rules,
856 collate->nrules
857 * sizeof (enum coll_sort_rule));
859 /* Allocate arrays for temporary weights. */
860 collate->weight_cnt = (int *) xmalloc (collate->nrules * sizeof (int));
862 /* Choose arbitrary start value for table size. */
863 collate->nweight_max = 5 * collate->nrules;
864 collate->weight = (int *) xmalloc (collate->nweight_max * sizeof (int));
869 collate_order_elem (struct linereader *lr, struct localedef_t *locale,
870 struct token *code, struct charset_t *charset)
872 const wchar_t zero = L'\0';
873 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
874 int result = 0;
875 wchar_t value;
876 void *tmp;
877 int i;
879 switch (code->tok)
881 case tok_bsymbol:
882 /* We have a string to find in one of the three hashing tables. */
883 value = charset_find_value (charset, code->val.str.start,
884 code->val.str.len);
885 if (value != ILLEGAL_CHAR_VALUE)
887 element_t *lastp, *firstp;
889 collate->kind = character;
891 if (find_entry (&collate->result, &value, sizeof (wchar_t),
892 (void *) &firstp) < 0)
893 firstp = lastp = NULL;
894 else
896 /* The entry for the simple character is always found at
897 the end. */
898 lastp = firstp;
899 while (lastp->next != NULL)
900 lastp = lastp->next;
902 if (lastp->name[0] == value && lastp->name[1] == L'\0')
904 lr_error (lr, _("duplicate definition for character `%.*s'"),
905 code->val.str.len, code->val.str.start);
906 lr_ignore_rest (lr, 0);
907 result = -1;
908 break;
912 collate->current_element
913 = (element_t *) obstack_alloc (&collate->element_mem,
914 sizeof (element_t));
916 obstack_grow (&collate->element_mem, &value, sizeof (value));
917 obstack_grow (&collate->element_mem, &zero, sizeof (zero));
919 collate->current_element->name =
920 (const wchar_t *) obstack_finish (&collate->element_mem);
922 collate->current_element->this_weight = ++collate->order_cnt;
924 collate->current_element->next = NULL;
926 if (firstp == NULL)
928 if (insert_entry (&collate->result, &value, sizeof (wchar_t),
929 (void *) collate->current_element) < 0)
931 lr_error (lr, _("cannot insert collation element `%.*s'"),
932 code->val.str.len, code->val.str.start);
933 exit (4);
936 else
937 lastp->next = collate->current_element;
939 else if (find_entry (&collate->elements, code->val.str.start,
940 code->val.str.len, &tmp) >= 0)
942 collate->current_element = (element_t *) tmp;
944 if (collate->current_element->this_weight != 0)
946 lr_error (lr, _("\
947 collation element `%.*s' appears more than once: ignore line"),
948 code->val.str.len, code->val.str.start);
949 lr_ignore_rest (lr, 0);
950 result = -1;
951 break;
954 collate->kind = element;
955 collate->current_element->this_weight = ++collate->order_cnt;
957 else if (find_entry (&collate->symbols, code->val.str.start,
958 code->val.str.len, &tmp) >= 0)
960 unsigned int order = ++collate->order_cnt;
962 if ((unsigned int) tmp != 0)
964 lr_error (lr, _("\
965 collation symbol `.*s' appears more than once: ignore line"),
966 code->val.str.len, code->val.str.start);
967 lr_ignore_rest (lr, 0);
968 result = -1;
969 break;
972 collate->kind = symbol;
974 if (set_entry (&collate->symbols, code->val.str.start,
975 code->val.str.len, (void *) order) < 0)
977 lr_error (lr, _("cannot process order specification"));
978 exit (4);
981 else
983 if (verbose)
984 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
985 code->val.str.len, code->val.str.start);
986 lr_ignore_rest (lr, 0);
988 result = -1;
990 break;
992 case tok_undefined:
993 collate->kind = undefined;
994 collate->current_element = &collate->undefined;
995 break;
997 case tok_ellipsis:
998 if (collate->was_ellipsis)
1000 lr_error (lr, _("\
1001 two lines in a row containing `...' are not allowed"));
1002 result = -1;
1004 else if (collate->kind != character)
1006 /* An ellipsis requires the previous line to be an
1007 character definition. */
1008 lr_error (lr, _("\
1009 line before ellipsis does not contain definition for character constant"));
1010 lr_ignore_rest (lr, 0);
1011 result = -1;
1013 else
1014 collate->kind = ellipsis;
1015 break;
1017 default:
1018 assert (! "illegal token in `collate_order_elem'");
1021 /* Now it's time to handle the ellipsis in the previous line. We do
1022 this only when the last line contained an definition for an
1023 character, the current line also defines an character, the
1024 character code for the later is bigger than the former. */
1025 if (collate->was_ellipsis)
1027 if (collate->kind != character)
1029 lr_error (lr, _("\
1030 line after ellipsis must contain character definition"));
1031 lr_ignore_rest (lr, 0);
1032 result = -1;
1034 else if (collate->last_char > value)
1036 lr_error (lr, _("end point of ellipsis range is bigger then start"));
1037 lr_ignore_rest (lr, 0);
1038 result = -1;
1040 else
1042 /* We can fill the arrays with the information we need. */
1043 wchar_t name[2];
1044 unsigned int *data;
1045 size_t *ptr;
1046 size_t cnt;
1048 name[0] = collate->last_char + 1;
1049 name[1] = L'\0';
1051 data = (unsigned int *) alloca ((collate->nrules + collate->nweight)
1052 * sizeof (unsigned int));
1053 ptr = (size_t *) alloca (collate->nrules * sizeof (size_t));
1055 if (data == NULL || ptr == NULL)
1056 error (4, 0, _("memory exhausted"));
1058 /* Prepare data. Because the characters covered by an
1059 ellipsis all have equal values we prepare the data once
1060 and only change the variable number (if there are any).
1061 PTR[...] will point to the entries which will have to be
1062 fixed during the output loop. */
1063 for (cnt = 0; cnt < collate->nrules; ++cnt)
1065 data[cnt] = collate->weight_cnt[cnt];
1066 ptr[cnt] = (cnt == 0
1067 ? collate->nweight
1068 : ptr[cnt - 1] + collate->weight_cnt[cnt - 1]);
1071 for (cnt = 0; cnt < collate->nweight; ++cnt)
1072 data[collate->nrules + cnt] = collate->weight[cnt];
1074 for (cnt = 0; cnt < collate->nrules; ++cnt)
1075 if (data[ptr[cnt]] != ELLIPSIS_CHAR)
1076 ptr[cnt] = 0;
1078 while (name[0] <= value)
1080 element_t *pelem;
1082 pelem = (element_t *) obstack_alloc (&collate->element_mem,
1083 sizeof (element_t));
1084 if (pelem == NULL)
1085 error (4, 0, _("memory exhausted"));
1087 pelem->name
1088 = (const wchar_t *) obstack_copy (&collate->element_mem,
1089 name, 2 * sizeof (wchar_t));
1090 pelem->this_weight = ++collate->order_cnt;
1092 pelem->ordering_len = collate->nweight;
1093 pelem->ordering
1094 = (unsigned int *) obstack_copy (&collate->element_mem, data,
1095 (collate->nrules
1096 * pelem->ordering_len)
1097 * sizeof (unsigned int));
1099 /* `...' weights need to be adjusted. */
1100 for (cnt = 0; cnt < collate->nrules; ++cnt)
1101 if (ptr[cnt] != 0)
1102 pelem->ordering[ptr[cnt]] = pelem->this_weight;
1104 /* Insert new entry into result table. */
1105 if (find_entry (&collate->result, name, sizeof (wchar_t),
1106 (void *) &pelem->next) >= 0)
1108 if (set_entry (&collate->result, name, sizeof (wchar_t),
1109 (void *) pelem->next) < 0)
1110 error (4, 0, _("cannot insert into result table"));
1112 else
1113 if (insert_entry (&collate->result, name, sizeof (wchar_t),
1114 (void *) pelem->next) < 0)
1115 error (4, 0, _("cannot insert into result table"));
1117 /* Increment counter. */
1118 ++name[0];
1123 /* Reset counters for weights. */
1124 collate->weight_idx = 0;
1125 collate->nweight = 0;
1126 for (i = 0; i < collate->nrules; ++i)
1127 collate->weight_cnt[i] = 0;
1128 collate->current_patch = NULL;
1130 return result;
1135 collate_weight_bsymbol (struct linereader *lr, struct localedef_t *locale,
1136 struct token *code, struct charset_t *charset)
1138 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1139 unsigned int here_weight;
1140 wchar_t value;
1141 void *tmp;
1143 assert (code->tok == tok_bsymbol);
1145 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
1146 if (value != ILLEGAL_CHAR_VALUE)
1148 element_t *runp;
1150 if (find_entry (&collate->result, &value, sizeof (wchar_t),
1151 (void *)&runp) < 0)
1152 runp = NULL;
1154 while (runp != NULL
1155 && (runp->name[0] != value || runp->name[1] != L'\0'))
1156 runp = runp->next;
1158 here_weight = runp == NULL ? 0 : runp->this_weight;
1160 else if (find_entry (&collate->elements, code->val.str.start,
1161 code->val.str.len, &tmp) >= 0)
1163 element_t *runp = (element_t *) tmp;
1165 here_weight = runp->this_weight;
1167 else if (find_entry (&collate->symbols, code->val.str.start,
1168 code->val.str.len, &tmp) >= 0)
1170 here_weight = (unsigned int) tmp;
1172 else
1174 if (verbose)
1175 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1176 code->val.str.len, code->val.str.start);
1177 lr_ignore_rest (lr, 0);
1178 return -1;
1181 /* When we currently work on a collation symbol we do not expect any
1182 weight. */
1183 if (collate->kind == symbol)
1185 lr_error (lr, _("\
1186 specification of sorting weight for collation symbol does not make sense"));
1187 lr_ignore_rest (lr, 0);
1188 return -1;
1191 /* Add to the current collection of weights. */
1192 if (collate->nweight >= collate->nweight_max)
1194 collate->nweight_max *= 2;
1195 collate->weight = (unsigned int *) xrealloc (collate->weight,
1196 collate->nweight_max);
1199 /* If the weight is currently not known, we remember to patch the
1200 resulting tables. */
1201 if (here_weight == 0)
1203 patch_t *newp;
1205 newp = (patch_t *) obstack_alloc (&collate->element_mem,
1206 sizeof (patch_t));
1207 newp->fname = lr->fname;
1208 newp->lineno = lr->lineno;
1209 newp->token = (const char *) obstack_copy0 (&collate->element_mem,
1210 code->val.str.start,
1211 code->val.str.len);
1212 newp->where.idx = collate->nweight++;
1213 newp->next = collate->current_patch;
1214 collate->current_patch = newp;
1216 else
1217 collate->weight[collate->nweight++] = here_weight;
1218 ++collate->weight_cnt[collate->weight_idx];
1220 return 0;
1225 collate_next_weight (struct linereader *lr, struct localedef_t *locale)
1227 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1229 if (collate->kind == symbol)
1231 lr_error (lr, _("\
1232 specification of sorting weight for collation symbol does not make sense"));
1233 lr_ignore_rest (lr, 0);
1234 return -1;
1237 ++collate->weight_idx;
1238 if (collate->weight_idx >= collate->nrules)
1240 lr_error (lr, _("too many weights"));
1241 lr_ignore_rest (lr, 0);
1242 return -1;
1245 return 0;
1250 collate_simple_weight (struct linereader *lr, struct localedef_t *locale,
1251 struct token *code, struct charset_t *charset)
1253 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1254 unsigned int value = 0;
1256 /* There current tokens can be `IGNORE', `...', or a string. */
1257 switch (code->tok)
1259 case tok_ignore:
1260 /* This token is allowed in all situations. */
1261 value = IGNORE_CHAR;
1262 break;
1264 case tok_ellipsis:
1265 /* The ellipsis is only allowed for the `...' or `UNDEFINED'
1266 entry. */
1267 if (collate->kind != ellipsis && collate->kind != undefined)
1269 lr_error (lr, _("\
1270 `...' must only be used in `...' and `UNDEFINED' entries"));
1271 lr_ignore_rest (lr, 0);
1272 return -1;
1274 value = ELLIPSIS_CHAR;
1275 break;
1277 case tok_string:
1278 /* This can become difficult. We have to get the weights which
1279 correspind the the single wide chars in the string. But some
1280 of the `chars' might not be real characters, but collation
1281 elements or symbols. And so the string decoder might have
1282 signaled errors. The string at this point is not translated.
1283 I.e., all <...> sequences are still there. */
1285 char *runp = code->val.str.start;
1286 void *tmp;
1288 while (*runp != '\0')
1290 char *startp = (char *) runp;
1291 char *putp = (char *) runp;
1292 wchar_t wch;
1294 /* Lookup weight for char and store it. */
1295 if (*runp == '<')
1297 while (*++runp != '\0' && *runp != '>')
1299 if (*runp == lr->escape_char)
1300 if (*++runp == '\0')
1302 lr_error (lr, _("unterminated weight name"));
1303 lr_ignore_rest (lr, 0);
1304 return -1;
1306 *putp++ = *runp;
1308 if (*runp == '>')
1309 ++runp;
1311 if (putp == startp)
1313 lr_error (lr, _("empty weight name: line ignored"));
1314 lr_ignore_rest (lr, 0);
1315 return -1;
1318 wch = charset_find_value (charset, startp, putp - startp);
1319 if (wch != ILLEGAL_CHAR_VALUE)
1321 element_t *pelem;
1323 if (find_entry (&collate->result, &wch, sizeof (wchar_t),
1324 (void *)&pelem) < 0)
1325 pelem = NULL;
1327 while (pelem != NULL
1328 && (pelem->name[0] != wch
1329 || pelem->name[1] != L'\0'))
1330 pelem = pelem->next;
1332 value = pelem == NULL ? 0 : pelem->this_weight;
1334 else if (find_entry (&collate->elements, startp, putp - startp,
1335 &tmp) >= 0)
1337 element_t *pelem = (element_t *) tmp;
1339 value = pelem->this_weight;
1341 else if (find_entry (&collate->symbols, startp, putp - startp,
1342 &tmp) >= 0)
1344 value = (unsigned int) tmp;
1346 else
1348 if (verbose)
1349 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1350 putp - startp, startp);
1351 lr_ignore_rest (lr, 0);
1352 return -1;
1355 else
1357 element_t *wp;
1358 wchar_t wch;
1360 if (*runp == lr->escape_char)
1362 static char digits[] = "0123456789abcdef";
1363 char *dp;
1364 int base;
1366 ++runp;
1367 if (tolower (*runp) == 'x')
1369 ++runp;
1370 base = 16;
1372 else if (tolower (*runp) == 'd')
1374 ++runp;
1375 base = 10;
1377 else
1378 base = 8;
1380 dp = strchr (digits, tolower (*runp));
1381 if (dp == NULL || (dp - digits) >= base)
1383 illegal_char:
1384 lr_error (lr, _("\
1385 illegal character constant in string"));
1386 lr_ignore_rest (lr, 0);
1387 return -1;
1389 wch = dp - digits;
1390 ++runp;
1392 dp = strchr (digits, tolower (*runp));
1393 if (dp == NULL || (dp - digits) >= base)
1394 goto illegal_char;
1395 wch *= base;
1396 wch += dp - digits;
1397 ++runp;
1399 if (base != 16)
1401 dp = strchr (digits, tolower (*runp));
1402 if (dp != NULL && (dp - digits < base))
1404 wch *= base;
1405 wch += dp - digits;
1406 ++runp;
1410 else
1411 wch = (wchar_t) *runp++;
1413 /* Lookup the weight for WCH. */
1414 if (find_entry (&collate->result, &wch, sizeof (wch),
1415 (void *)&wp) < 0)
1416 wp = NULL;
1418 while (wp != NULL
1419 && (wp->name[0] != wch || wp->name[1] != L'\0'))
1420 wp = wp->next;
1422 value = wp == NULL ? 0 : wp->this_weight;
1424 /* To get the correct name for the error message. */
1425 putp = runp;
1427 /**************************************************\
1428 |* I know here is something wrong. Characters in *|
1429 |* the string which are not in the <...> form *|
1430 |* cannot be declared forward for now!!! *|
1431 \**************************************************/
1434 /* Store in weight array. */
1435 if (collate->nweight >= collate->nweight_max)
1437 collate->nweight_max *= 2;
1438 collate->weight
1439 = (unsigned int *) xrealloc (collate->weight,
1440 collate->nweight_max);
1443 if (value == 0)
1445 patch_t *newp;
1447 newp = (patch_t *) obstack_alloc (&collate->element_mem,
1448 sizeof (patch_t));
1449 newp->fname = lr->fname;
1450 newp->lineno = lr->lineno;
1451 newp->token
1452 = (const char *) obstack_copy0 (&collate->element_mem,
1453 startp, putp - startp);
1454 newp->where.idx = collate->nweight++;
1455 newp->next = collate->current_patch;
1456 collate->current_patch = newp;
1458 else
1459 collate->weight[collate->nweight++] = value;
1460 ++collate->weight_cnt[collate->weight_idx];
1463 return 0;
1465 default:
1466 assert (! "should not happen");
1470 if (collate->nweight >= collate->nweight_max)
1472 collate->nweight_max *= 2;
1473 collate->weight = (unsigned int *) xrealloc (collate->weight,
1474 collate->nweight_max);
1477 collate->weight[collate->nweight++] = value;
1478 ++collate->weight_cnt[collate->weight_idx];
1480 return 0;
1484 void
1485 collate_end_weight (struct linereader *lr, struct localedef_t *locale)
1487 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1488 element_t *pelem = collate->current_element;
1490 if (collate->kind == symbol)
1492 /* We don't have to do anything. */
1493 collate->was_ellipsis = 0;
1494 return;
1497 if (collate->kind == ellipsis)
1499 /* Before the next line is processed the ellipsis is handled. */
1500 collate->was_ellipsis = 1;
1501 return;
1504 assert (collate->kind == character || collate->kind == element
1505 || collate->kind == undefined);
1507 /* Fill in the missing weights. */
1508 while (++collate->weight_idx < collate->nrules)
1510 collate->weight[collate->nweight++] = pelem->this_weight;
1511 ++collate->weight_cnt[collate->weight_idx];
1514 /* Now we know how many ordering weights the current
1515 character/element has. Allocate room in the element structure
1516 and copy information. */
1517 pelem->ordering_len = collate->nweight;
1519 /* First we write an array with the number of values for each
1520 weight. */
1521 obstack_grow (&collate->element_mem, collate->weight_cnt,
1522 collate->nrules * sizeof (unsigned int));
1524 /* Now the weights itselves. */
1525 obstack_grow (&collate->element_mem, collate->weight,
1526 collate->nweight * sizeof (unsigned int));
1528 /* Get result. */
1529 pelem->ordering = obstack_finish (&collate->element_mem);
1531 /* Now we handle the "patches". */
1532 while (collate->current_patch != NULL)
1534 patch_t *this_patch;
1536 this_patch = collate->current_patch;
1538 this_patch->where.pos = &pelem->ordering[collate->nrules
1539 + this_patch->where.idx];
1541 collate->current_patch = this_patch->next;
1542 this_patch->next = collate->all_patches;
1543 collate->all_patches = this_patch;
1546 /* Set information for next round. */
1547 collate->was_ellipsis = 0;
1548 if (collate->kind != undefined)
1549 collate->last_char = pelem->name[0];