1 /* Copyright (C) 1995-2006, 2007 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
35 #include "localedef.h"
37 #include "localeinfo.h"
39 #include "linereader.h"
40 #include "locfile-token.h"
46 #ifdef PREDEFINED_CLASSES
47 /* These are the extra bits not in wctype.h since these are not preallocated. */
49 # define _ISwspecial1 (1 << 29)
50 # define _ISwspecial2 (1 << 30)
51 # define _ISwspecial3 (1 << 31)
55 /* The bit used for representing a special class. */
/* BITPOS turns a class token into a zero-based bit index counted from
   tok_upper.  BIT and BITw turn that index into a mask through _ISbit
   and _ISwbit respectively; in this file BIT masks are applied to
   class256_collection and BITw masks to class_collection.  */
56 #define BITPOS(class) ((class) - tok_upper)
57 #define BIT(class) (_ISbit (BITPOS (class)))
58 #define BITw(class) (_ISwbit (BITPOS (class)))
/* ELEM expands to the dereferenced result of find_idx for VALUE in the
   COLLECTION member of CTYPE, so the expansion can be read or assigned
   to.  IDX is pasted verbatim after the collection and after its _max
   and _act companions; it is left empty for a collection that is not
   an array of tables.  */
60 #define ELEM(ctype, collection, idx, value) \
61 *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx, \
62 &ctype->collection##_act idx, value)
65 /* To be compatible with former implementations we for now restrict
66 the number of bits for character classes to 16. When compatibility
67 is not necessary anymore increase the number to 32. */
/* char_class_t is the 16-bit type mentioned above, char_class32_t the
   32-bit one; the ctype_b and ctype32_b members of struct
   locale_ctype_t are declared with them.  */
68 #define char_class_t uint16_t
69 #define char_class32_t uint32_t
72 /* Type to describe a transliteration action. We have a possibly
73 multiple character from-string and a set of multiple character
74 to-strings. All are 32bit values since this is what is used in
75 the gconv functions. */
80 struct translit_to_t
*next
;
90 struct translit_to_t
*to
;
92 struct translit_t
*next
;
95 struct translit_ignore_t
104 struct translit_ignore_t
*next
;
108 /* Type to describe a transliteration include statement. */
109 struct translit_include_t
111 const char *copy_locale
;
112 const char *copy_repertoire
;
114 struct translit_include_t
*next
;
118 /* Sparse table of uint32_t. */
/* NOTE(review): these three macros look like the parameters of a generic
   table template whose include line is not visible in this excerpt; the
   names struct idx_table and idx_table_init are used further down.
   DEFAULT is the all-ones 32-bit value.  */
119 #define TABLE idx_table
120 #define ELEMENT uint32_t
121 #define DEFAULT ((uint32_t) ~0)
126 /* The real definition of the struct for the LC_CTYPE locale. */
127 struct locale_ctype_t
/* Allocated size and number of used entries of the character name
   array; ctype_startup begins with 256 used entries.  */
130 size_t charnames_max
;
131 size_t charnames_act
;
132 /* An index lookup table, to speedup find_idx. */
133 struct idx_table charnames_idx
;
/* Repertoire map; ctype_finish sets it through repertoire_read when a
   repertoire name is known.  */
135 struct repertoire_t
*repertoire
;
137 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
138 #define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
140 const char *classnames
[MAX_NR_CHARCLASS
];
141 uint32_t last_class_char
;
/* Class bits.  class256_collection is indexed by byte value;
   class_collection is a growable array (class_collection_max slots,
   class_collection_act in use) that ctype_finish walks in step with
   the character name array.  */
142 uint32_t class256_collection
[256];
143 uint32_t *class_collection
;
144 size_t class_collection_max
;
145 size_t class_collection_act
;
147 uint32_t class_offset
;
/* Input digits as charmap sequences; ctype_finish keeps only complete
   groups of ten.  */
149 struct charseq
**mbdigits
;
/* The ten output digits as charmap sequences and as wide characters.
   ctype_finish substitutes a question mark for entries still unset.  */
156 struct charseq
*mboutdigits
[10];
157 uint32_t wcoutdigits
[10];
158 size_t outdigits_act
;
160 /* If the following number ever turns out to be too small simply
161 increase it. But I doubt it will. --drepper@gnu */
162 #define MAX_NR_CHARMAP 16
/* Character maps.  ctype_startup registers toupper and tolower first
   and makes entries 0..255 of those maps the identity;
   map256_collection is the byte-indexed view of the first two maps.  */
163 const char *mapnames
[MAX_NR_CHARMAP
];
164 uint32_t *map_collection
[MAX_NR_CHARMAP
];
165 uint32_t map256_collection
[2][256];
166 size_t map_collection_max
[MAX_NR_CHARMAP
];
167 size_t map_collection_act
[MAX_NR_CHARMAP
];
168 size_t map_collection_nr
;
170 int tomap_done
[MAX_NR_CHARMAP
];
173 /* Transliteration information. */
174 struct translit_include_t
*translit_include
;
175 struct translit_t
*translit
;
/* Linked list of ignore ranges and its length; ctype_finish counts the
   entries into ntranslit_ignore while it orders the list.  */
176 struct translit_ignore_t
*translit_ignore
;
177 uint32_t ntranslit_ignore
;
179 uint32_t *default_missing
;
180 const char *default_missing_file
;
181 size_t default_missing_lineno
;
/* Set to 1 by ctype_startup when enc_not_ascii_compatible is set.  */
183 uint32_t to_nonascii
;
185 /* The arrays for the binary representation. */
186 char_class_t
*ctype_b
;
187 char_class32_t
*ctype32_b
;
191 struct iovec
*class_3level
;
192 struct iovec
*map_3level
;
193 uint32_t *class_name_ptr
;
194 uint32_t *map_name_ptr
;
/* Code set name taken from the charmap by ctype_finish, or a fixed
   placeholder string when the charmap does not provide one.  */
197 const char *codeset_name
;
198 uint32_t *translit_from_idx
;
199 uint32_t *translit_from_tbl
;
200 uint32_t *translit_to_idx
;
201 uint32_t *translit_to_tbl
;
202 uint32_t translit_idx_size
;
203 size_t translit_from_tbl_size
;
204 size_t translit_to_tbl_size
;
/* Obstack set up by ctype_startup with obstack_init.  */
206 struct obstack mempool
;
210 /* Marker for an empty slot. This has the value 0xFFFFFFFF, regardless
211 of whether 'int' is 16 bit, 32 bit, or 64 bit. */
212 #define EMPTY ((uint32_t) ~0)
/* The obstack macros obtain and release their chunks through these two
   hooks, so they have to be defined before obstack_init is used.  */
215 #define obstack_chunk_alloc xmalloc
216 #define obstack_chunk_free free
219 /* Prototypes for local functions. */
/* NOTE(review): the tail of the ctype_startup prototype (its last
   parameter and the closing parenthesis) is not visible in this
   excerpt; the definition further down takes an additional
   int ignore_content.  */
220 static void ctype_startup (struct linereader
*lr
, struct localedef_t
*locale
,
221 const struct charmap_t
*charmap
,
222 struct localedef_t
*copy_locale
,
/* Called by ctype_startup once per predefined class name.  */
224 static void ctype_class_new (struct linereader
*lr
,
225 struct locale_ctype_t
*ctype
, const char *name
);
/* Called by ctype_startup once per predefined map name.  */
226 static void ctype_map_new (struct linereader
*lr
,
227 struct locale_ctype_t
*ctype
,
228 const char *name
, const struct charmap_t
*charmap
);
/* Returns a pointer to the slot that belongs to character IDX.  The
   function is also called with NULL for TABLE, MAX and ACT purely for
   its side effect of entering IDX in the name array.  */
229 static uint32_t *find_idx (struct locale_ctype_t
*ctype
, uint32_t **table
,
230 size_t *max
, size_t *act
, unsigned int idx
);
/* Called by ctype_finish to set default values for classes that were
   not specified.  */
231 static void set_class_defaults (struct locale_ctype_t
*ctype
,
232 const struct charmap_t
*charmap
,
233 struct repertoire_t
*repertoire
);
/* Called by ctype_output before the output vectors are filled in.  */
234 static void allocate_arrays (struct locale_ctype_t
*ctype
,
235 const struct charmap_t
*charmap
,
236 struct repertoire_t
*repertoire
);
/* Spelled-out names of the ten digits.  ctype_finish tries them as
   charmap symbol names when the plain digit character is not found.  */
239 static const char *longnames
[] =
241 "zero", "one", "two", "three", "four",
242 "five", "six", "seven", "eight", "nine"
/* The ten digits in UXXXXXXXX notation (U+0030 to U+0039).  No use of
   this table is visible in this excerpt.  */
244 static const char *uninames
[] =
246 "U00000030", "U00000031", "U00000032", "U00000033", "U00000034",
247 "U00000035", "U00000036", "U00000037", "U00000038", "U00000039"
/* The ASCII digit characters.  ctype_finish uses them as the first
   charmap symbol to look up and as the byte value of last resort.  */
249 static const unsigned char digits
[] = "0123456789";
/* Set up the LC_CTYPE data of LOCALE before the category is processed.
   Nothing is done when IGNORE_CONTENT is nonzero or when the category
   already has data.  With COPY_LOCALE equal to NULL a zero-filled
   struct locale_ctype_t is allocated and seeded: the first 256
   character names map to themselves, the predefined classes and the
   toupper and tolower maps are registered, entries 0..255 of those
   maps become the identity, and the obstack is initialised.  The final
   assignment shares the ctype data of COPY_LOCALE instead; it is
   presumably the else branch of the COPY_LOCALE test, but that line is
   not visible in this excerpt.  LR is only handed on to
   ctype_class_new and ctype_map_new.  */
253 ctype_startup (struct linereader
*lr
, struct localedef_t
*locale
,
254 const struct charmap_t
*charmap
,
255 struct localedef_t
*copy_locale
, int ignore_content
)
258 struct locale_ctype_t
*ctype
;
260 if (!ignore_content
&& locale
->categories
[LC_CTYPE
].ctype
== NULL
)
262 if (copy_locale
== NULL
)
264 /* Allocate the needed room. */
265 locale
->categories
[LC_CTYPE
].ctype
= ctype
=
266 (struct locale_ctype_t
*) xcalloc (1,
267 sizeof (struct locale_ctype_t
));
269 /* We have seen no names yet. */
270 ctype
->charnames_max
= charmap
->mb_cur_max
== 1 ? 256 : 512;
/* NOTE(review): the left-hand side of the allocation below is on a line
   that is not visible in this excerpt; the loop that follows stores
   into ctype->charnames.  */
272 (unsigned int *) xmalloc (ctype
->charnames_max
273 * sizeof (unsigned int));
274 for (cnt
= 0; cnt
< 256; ++cnt
)
275 ctype
->charnames
[cnt
] = cnt
;
276 ctype
->charnames_act
= 256;
277 idx_table_init (&ctype
->charnames_idx
);
279 /* Fill character class information. */
280 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
281 /* The order of the following instructions determines the bit
283 ctype_class_new (lr
, ctype
, "upper");
284 ctype_class_new (lr
, ctype
, "lower");
285 ctype_class_new (lr
, ctype
, "alpha");
286 ctype_class_new (lr
, ctype
, "digit");
287 ctype_class_new (lr
, ctype
, "xdigit");
288 ctype_class_new (lr
, ctype
, "space");
289 ctype_class_new (lr
, ctype
, "print");
290 ctype_class_new (lr
, ctype
, "graph");
291 ctype_class_new (lr
, ctype
, "blank");
292 ctype_class_new (lr
, ctype
, "cntrl");
293 ctype_class_new (lr
, ctype
, "punct");
294 ctype_class_new (lr
, ctype
, "alnum");
295 #ifdef PREDEFINED_CLASSES
296 /* The following are extensions from ISO 14652. */
297 ctype_class_new (lr
, ctype
, "left_to_right");
298 ctype_class_new (lr
, ctype
, "right_to_left");
299 ctype_class_new (lr
, ctype
, "num_terminator");
300 ctype_class_new (lr
, ctype
, "num_separator");
301 ctype_class_new (lr
, ctype
, "segment_separator");
302 ctype_class_new (lr
, ctype
, "block_separator");
303 ctype_class_new (lr
, ctype
, "direction_control");
304 ctype_class_new (lr
, ctype
, "sym_swap_layout");
305 ctype_class_new (lr
, ctype
, "char_shape_selector");
306 ctype_class_new (lr
, ctype
, "num_shape_selector");
307 ctype_class_new (lr
, ctype
, "non_spacing");
308 ctype_class_new (lr
, ctype
, "non_spacing_level3");
309 ctype_class_new (lr
, ctype
, "normal_connect");
310 ctype_class_new (lr
, ctype
, "r_connect");
311 ctype_class_new (lr
, ctype
, "no_connect");
312 ctype_class_new (lr
, ctype
, "no_connect-space");
313 ctype_class_new (lr
, ctype
, "vowel_connect");
/* The growable class array starts with 256 entries in use.
   NOTE(review): the allocation sizes its elements as unsigned long int
   although class_collection is a uint32_t pointer; this only
   over-allocates where unsigned long int is wider than 32 bits.  */
316 ctype
->class_collection_max
= charmap
->mb_cur_max
== 1 ? 256 : 512;
317 ctype
->class_collection
318 = (uint32_t *) xcalloc (sizeof (unsigned long int),
319 ctype
->class_collection_max
);
320 ctype
->class_collection_act
= 256;
322 /* Fill character map information. */
323 ctype
->last_map_idx
= MAX_NR_CHARMAP
;
324 ctype_map_new (lr
, ctype
, "toupper", charmap
);
325 ctype_map_new (lr
, ctype
, "tolower", charmap
);
326 #ifdef PREDEFINED_CLASSES
327 ctype_map_new (lr
, ctype
, "tosymmetric", charmap
);
330 /* Fill first 256 entries in `toXXX' arrays. */
331 for (cnt
= 0; cnt
< 256; ++cnt
)
333 ctype
->map_collection
[0][cnt
] = cnt
;
334 ctype
->map_collection
[1][cnt
] = cnt
;
335 #ifdef PREDEFINED_CLASSES
336 ctype
->map_collection
[2][cnt
] = cnt
;
338 ctype
->map256_collection
[0][cnt
] = cnt
;
339 ctype
->map256_collection
[1][cnt
] = cnt
;
/* Remember in the locale data that the encoding is not ASCII
   compatible.  */
342 if (enc_not_ascii_compatible
)
343 ctype
->to_nonascii
= 1;
345 obstack_init (&ctype
->mempool
);
/* Share the ctype data of the locale being copied from.  */
348 ctype
= locale
->categories
[LC_CTYPE
].ctype
=
349 copy_locale
->categories
[LC_CTYPE
].ctype
;
/* Complete and sanity-check the LC_CTYPE data of LOCALE after it has
   been read.  In order, the function
     - resolves a copy of another locale, following chains of copies,
       or reports the missing category and calls ctype_startup;
     - reads the repertoire and records the code set name of CHARMAP;
     - fills in class defaults through set_class_defaults;
     - checks the class bits of every character against valid_table,
       once for class_collection and once for class256_collection;
     - checks the class membership of <SP> and sets its print bit (the
       exact conditions are partly on lines not visible here);
     - enters the characters covered by the WIDTH rules of CHARMAP and
       by its character table in the name array through find_idx;
     - trims the input digits to complete groups of ten, supplies
       defaults when none are left, and patches unset output digits;
     - orders the translit_ignore list and counts its entries.
   Problems are reported through error; only the two internal-error
   calls pass a nonzero status.  */
355 ctype_finish (struct localedef_t
*locale
, const struct charmap_t
*charmap
)
357 /* See POSIX.2, table 2-6 for the meaning of the following table. */
362 const char allow
[NCLASS
];
364 valid_table
[NCLASS
] =
366 /* The order is important. See token.h for more information.
367 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
368 { "upper", "--MX-XDDXXX-" },
369 { "lower", "--MX-XDDXXX-" },
370 { "alpha", "---X-XDDXXX-" },
371 { "digit", "XXX--XDDXXX-" },
372 { "xdigit", "-----XDDXXX-" },
373 { "space", "XXXXX------X" },
374 { "print", "---------X--" },
375 { "graph", "---------X--" },
376 { "blank", "XXXXXM-----X" },
377 { "cntrl", "XXXXX-XX--XX" },
378 { "punct", "XXXXX-DD-X-X" },
379 { "alnum", "-----XDDXXX-" }
383 uint32_t space_value
;
384 struct charseq
*space_seq
;
385 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
392 /* Now resolve copying and also handle completely missing definitions. */
395 const char *repertoire_name
;
397 /* First see whether we were supposed to copy. If yes, find the
398 actual definition. */
399 if (locale
->copy_name
[LC_CTYPE
] != NULL
)
401 /* Find the copying locale. This has to happen transitively since
402 the locale we are copying from might also be copying another one. */
403 struct localedef_t
*from
= locale
;
406 from
= find_locale (LC_CTYPE
, from
->copy_name
[LC_CTYPE
],
407 from
->repertoire_name
, charmap
);
408 while (from
->categories
[LC_CTYPE
].ctype
== NULL
409 && from
->copy_name
[LC_CTYPE
] != NULL
);
411 ctype
= locale
->categories
[LC_CTYPE
].ctype
412 = from
->categories
[LC_CTYPE
].ctype
;
415 /* If there is still no definition issue a warning and create an
420 WITH_CUR_LOCALE (error (0, 0, _("\
421 No definition for %s category found"), "LC_CTYPE"));
422 ctype_startup (NULL
, locale
, charmap
, NULL
, 0);
423 ctype
= locale
->categories
[LC_CTYPE
].ctype
;
426 /* Get the repertoire we have to use. */
427 repertoire_name
= locale
->repertoire_name
?: repertoire_global
;
428 if (repertoire_name
!= NULL
)
429 ctype
->repertoire
= repertoire_read (repertoire_name
);
432 /* We need the name of the currently used 8-bit character set to
433 make correct conversion between this 8-bit representation and the
434 ISO 10646 character set used internally for wide characters. */
435 ctype
->codeset_name
= charmap
->code_set_name
;
436 if (ctype
->codeset_name
== NULL
)
439 WITH_CUR_LOCALE (error (0, 0, _("\
440 No character set name specified in charmap")));
441 ctype
->codeset_name
= "//UNKNOWN//";
444 /* Set default value for classes not specified. */
445 set_class_defaults (ctype
, charmap
, ctype
->repertoire
);
447 /* Check according to table. */
448 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
450 uint32_t tmp
= ctype
->class_collection
[cnt
];
454 for (cls1
= 0; cls1
< NCLASS
; ++cls1
)
455 if ((tmp
& _ISwbit (cls1
)) != 0)
456 for (cls2
= 0; cls2
< NCLASS
; ++cls2
)
457 if (valid_table
[cls1
].allow
[cls2
] != '-')
459 int eq
= (tmp
& _ISwbit (cls2
)) != 0;
460 switch (valid_table
[cls1
].allow
[cls2
])
465 uint32_t value
= ctype
->charnames
[cnt
];
468 WITH_CUR_LOCALE (error (0, 0, _("\
469 character L'\\u%0*x' in class `%s' must be in class `%s'"),
470 value
> 0xffff ? 8 : 4,
472 valid_table
[cls1
].name
,
473 valid_table
[cls2
].name
));
480 uint32_t value
= ctype
->charnames
[cnt
];
483 WITH_CUR_LOCALE (error (0, 0, _("\
484 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
485 value
> 0xffff ? 8 : 4,
487 valid_table
[cls1
].name
,
488 valid_table
[cls2
].name
));
493 ctype
->class_collection
[cnt
] |= _ISwbit (cls2
);
497 WITH_CUR_LOCALE (error (5, 0, _("\
498 internal error in %s, line %u"), __FUNCTION__
, __LINE__
));
/* The same table check again, this time for the byte-indexed class
   array and with _ISbit masks.  */
504 for (cnt
= 0; cnt
< 256; ++cnt
)
506 uint32_t tmp
= ctype
->class256_collection
[cnt
];
510 for (cls1
= 0; cls1
< NCLASS
; ++cls1
)
511 if ((tmp
& _ISbit (cls1
)) != 0)
512 for (cls2
= 0; cls2
< NCLASS
; ++cls2
)
513 if (valid_table
[cls1
].allow
[cls2
] != '-')
515 int eq
= (tmp
& _ISbit (cls2
)) != 0;
516 switch (valid_table
[cls1
].allow
[cls2
])
523 snprintf (buf
, sizeof buf
, "\\%Zo", cnt
);
526 WITH_CUR_LOCALE (error (0, 0, _("\
527 character '%s' in class `%s' must be in class `%s'"),
529 valid_table
[cls1
].name
,
530 valid_table
[cls2
].name
));
539 snprintf (buf
, sizeof buf
, "\\%Zo", cnt
);
542 WITH_CUR_LOCALE (error (0, 0, _("\
543 character '%s' in class `%s' must not be in class `%s'"),
545 valid_table
[cls1
].name
,
546 valid_table
[cls2
].name
));
551 ctype
->class256_collection
[cnt
] |= _ISbit (cls2
);
555 WITH_CUR_LOCALE (error (5, 0, _("\
556 internal error in %s, line %u"), __FUNCTION__
, __LINE__
));
562 /* ... and now test <SP> as a special case. */
/* In the conditions below cnt is assigned through the comma operator
   before each test, so that the diagnostic can name the class whose
   test triggered via valid_table[cnt].name.  */
564 if (((cnt
= BITPOS (tok_space
),
565 (ELEM (ctype
, class_collection
, , space_value
)
566 & BITw (tok_space
)) == 0)
567 || (cnt
= BITPOS (tok_blank
),
568 (ELEM (ctype
, class_collection
, , space_value
)
569 & BITw (tok_blank
)) == 0)))
572 WITH_CUR_LOCALE (error (0, 0, _("<SP> character not in class `%s'"),
573 valid_table
[cnt
].name
));
575 else if (((cnt
= BITPOS (tok_punct
),
576 (ELEM (ctype
, class_collection
, , space_value
)
577 & BITw (tok_punct
)) != 0)
578 || (cnt
= BITPOS (tok_graph
),
579 (ELEM (ctype
, class_collection
, , space_value
)
584 WITH_CUR_LOCALE (error (0, 0, _("\
585 <SP> character must not be in class `%s'"),
586 valid_table
[cnt
].name
));
589 ELEM (ctype
, class_collection
, , space_value
) |= BITw (tok_print
);
/* Repeat the <SP> checks for the byte-indexed array.  The space
   character is looked up in the charmap under three names in turn and
   has to be a single byte.  */
591 space_seq
= charmap_find_value (charmap
, "SP", 2);
592 if (space_seq
== NULL
)
593 space_seq
= charmap_find_value (charmap
, "space", 5);
594 if (space_seq
== NULL
)
595 space_seq
= charmap_find_value (charmap
, "U00000020", 9);
596 if (space_seq
== NULL
|| space_seq
->nbytes
!= 1)
599 WITH_CUR_LOCALE (error (0, 0, _("\
600 character <SP> not defined in character map")));
602 else if (((cnt
= BITPOS (tok_space
),
603 (ctype
->class256_collection
[space_seq
->bytes
[0]]
604 & BIT (tok_space
)) == 0)
605 || (cnt
= BITPOS (tok_blank
),
606 (ctype
->class256_collection
[space_seq
->bytes
[0]]
607 & BIT (tok_blank
)) == 0)))
610 WITH_CUR_LOCALE (error (0, 0, _("<SP> character not in class `%s'"),
611 valid_table
[cnt
].name
));
613 else if (((cnt
= BITPOS (tok_punct
),
614 (ctype
->class256_collection
[space_seq
->bytes
[0]]
615 & BIT (tok_punct
)) != 0)
616 || (cnt
= BITPOS (tok_graph
),
617 (ctype
->class256_collection
[space_seq
->bytes
[0]]
618 & BIT (tok_graph
)) != 0)))
621 WITH_CUR_LOCALE (error (0, 0, _("\
622 <SP> character must not be in class `%s'"),
623 valid_table
[cnt
].name
));
626 ctype
->class256_collection
[space_seq
->bytes
[0]] |= BIT (tok_print
);
628 /* Now that the tests are done make sure the name array contains all
629 characters which are handled in the WIDTH section of the
630 character set definition file. */
631 if (charmap
->width_rules
!= NULL
)
632 for (cnt
= 0; cnt
< charmap
->nwidth_rules
; ++cnt
)
634 unsigned char bytes
[charmap
->mb_cur_max
];
635 int nbytes
= charmap
->width_rules
[cnt
].from
->nbytes
;
637 /* We have the range of character for which the width is
638 specified described using byte sequences of the multibyte
639 charset. We have to convert this to UCS4 now. And we
640 cannot simply convert the beginning and the end of the
641 sequence, we have to iterate over the byte sequence and
642 convert it for every single character. */
643 memcpy (bytes
, charmap
->width_rules
[cnt
].from
->bytes
, nbytes
);
645 while (nbytes
< charmap
->width_rules
[cnt
].to
->nbytes
646 || memcmp (bytes
, charmap
->width_rules
[cnt
].to
->bytes
,
649 /* Find the UCS value for `bytes'. */
653 = charmap_find_symbol (charmap
, (char *) bytes
, nbytes
);
656 wch
= ILLEGAL_CHAR_VALUE
;
657 else if (seq
->ucs4
!= UNINITIALIZED_CHAR_VALUE
)
660 wch
= repertoire_find_value (ctype
->repertoire
, seq
->name
,
663 if (wch
!= ILLEGAL_CHAR_VALUE
)
664 /* We are only interested in the side-effects of the
665 `find_idx' call. It will add appropriate entries in
666 the name array if this is necessary. */
667 (void) find_idx (ctype
, NULL
, NULL
, NULL
, wch
);
669 /* "Increment" the bytes sequence. */
671 while (inner
>= 0 && bytes
[inner
] == 0xff)
676 /* We have to extend the byte sequence. */
677 if (nbytes
>= charmap
->width_rules
[cnt
].to
->nbytes
)
681 memset (&bytes
[1], 0, nbytes
);
687 while (++inner
< nbytes
)
693 /* Now set all the other characters of the character set to the
696 while (iterate_table (&charmap
->char_table
, &curs
, &key
, &len
, &vdata
) == 0)
698 struct charseq
*data
= (struct charseq
*) vdata
;
700 if (data
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
701 data
->ucs4
= repertoire_find_value (ctype
->repertoire
,
704 if (data
->ucs4
!= ILLEGAL_CHAR_VALUE
)
705 (void) find_idx (ctype
, NULL
, NULL
, NULL
, data
->ucs4
);
708 /* There must be a multiple of 10 digits. */
709 if (ctype
->mbdigits_act
% 10 != 0)
711 assert (ctype
->mbdigits_act
== ctype
->wcdigits_act
);
712 ctype
->wcdigits_act
-= ctype
->mbdigits_act
% 10;
713 ctype
->mbdigits_act
-= ctype
->mbdigits_act
% 10;
714 WITH_CUR_LOCALE (error (0, 0, _("\
715 `digit' category has not entries in groups of ten")));
718 /* Check the input digits. There must be a multiple of ten available.
719 In each group it could be that one or the other character is missing.
720 In this case the whole group must be removed. */
722 while (cnt
< ctype
->mbdigits_act
)
725 for (inner
= 0; inner
< 10; ++inner
)
726 if (ctype
->mbdigits
[cnt
+ inner
] == NULL
)
733 /* Remove the group. */
/* NOTE(review): the length of this move is computed from wcdigits_act
   although the array being compacted is mbdigits and the counter
   reduced afterwards is mbdigits_act.  The two counters are only equal
   until the first group has been removed here; confirm whether
   mbdigits_act was intended.  */
734 memmove (&ctype
->mbdigits
[cnt
], &ctype
->mbdigits
[cnt
+ 10],
735 ((ctype
->wcdigits_act
- cnt
- 10)
736 * sizeof (ctype
->mbdigits
[0])));
737 ctype
->mbdigits_act
-= 10;
741 /* If no input digits are given use the default. */
742 if (ctype
->mbdigits_act
== 0)
744 if (ctype
->mbdigits_max
== 0)
746 ctype
->mbdigits
= obstack_alloc (&((struct charmap_t
*) charmap
)->mem_pool
,
747 10 * sizeof (struct charseq
*));
748 ctype
->mbdigits_max
= 10;
751 for (cnt
= 0; cnt
< 10; ++cnt
)
753 ctype
->mbdigits
[cnt
] = charmap_find_symbol (charmap
,
754 (char *) digits
+ cnt
, 1);
755 if (ctype
->mbdigits
[cnt
] == NULL
)
757 ctype
->mbdigits
[cnt
] = charmap_find_symbol (charmap
,
759 strlen (longnames
[cnt
]));
760 if (ctype
->mbdigits
[cnt
] == NULL
)
762 /* Hum, this ain't good. */
763 WITH_CUR_LOCALE (error (0, 0, _("\
764 no input digits defined and none of the standard names in the charmap")));
766 ctype
->mbdigits
[cnt
] = obstack_alloc (&((struct charmap_t
*) charmap
)->mem_pool
,
767 sizeof (struct charseq
) + 1);
769 /* This is better than nothing. */
770 ctype
->mbdigits
[cnt
]->bytes
[0] = digits
[cnt
];
771 ctype
->mbdigits
[cnt
]->nbytes
= 1;
776 ctype
->mbdigits_act
= 10;
779 /* Check the wide character input digits. There must be a multiple
780 of ten available. In each group it could be that one or the other
781 character is missing. In this case the whole group must be
784 while (cnt
< ctype
->wcdigits_act
)
787 for (inner
= 0; inner
< 10; ++inner
)
788 if (ctype
->wcdigits
[cnt
+ inner
] == ILLEGAL_CHAR_VALUE
)
795 /* Remove the group. */
796 memmove (&ctype
->wcdigits
[cnt
], &ctype
->wcdigits
[cnt
+ 10],
797 ((ctype
->wcdigits_act
- cnt
- 10)
798 * sizeof (ctype
->wcdigits
[0])));
799 ctype
->wcdigits_act
-= 10;
803 /* If no input digits are given use the default. */
804 if (ctype
->wcdigits_act
== 0)
806 if (ctype
->wcdigits_max
== 0)
808 ctype
->wcdigits
= obstack_alloc (&((struct charmap_t
*) charmap
)->mem_pool
,
809 10 * sizeof (uint32_t));
810 ctype
->wcdigits_max
= 10;
813 for (cnt
= 0; cnt
< 10; ++cnt
)
814 ctype
->wcdigits
[cnt
] = L
'0' + cnt
;
/* NOTE(review): this branch has just filled the ten default wide
   digits, yet the counter set below is mbdigits_act; wcdigits_act is
   left at 0, which ctype_output later divides by ten to produce the
   wide digit count.  This looks like a copy of the multibyte branch
   above; confirm whether wcdigits_act was intended.  */
816 ctype
->mbdigits_act
= 10;
819 /* Check the outdigits. */
821 for (cnt
= 0; cnt
< 10; ++cnt
)
822 if (ctype
->mboutdigits
[cnt
] == NULL
)
/* One shared replacement sequence holding a question mark.
   NOTE(review): the array has two elements although only element 0 is
   used, presumably so that the store to bytes[1] below has room;
   confirm against the definition of struct charseq.  */
824 static struct charseq replace
[2];
828 WITH_CUR_LOCALE (error (0, 0, _("\
829 not all characters used in `outdigit' are available in the charmap")));
833 replace
[0].nbytes
= 1;
834 replace
[0].bytes
[0] = '?';
835 replace
[0].bytes
[1] = '\0';
836 ctype
->mboutdigits
[cnt
] = &replace
[0];
840 for (cnt
= 0; cnt
< 10; ++cnt
)
841 if (ctype
->wcoutdigits
[cnt
] == 0)
845 WITH_CUR_LOCALE (error (0, 0, _("\
846 not all characters used in `outdigit' are available in the repertoire")));
850 ctype
->wcoutdigits
[cnt
] = L
'?';
853 /* Sort the entries in the translit_ignore list. */
/* ntranslit_ignore is computed along the way: it starts at 1 for the
   first entry and is incremented once for every further entry.  */
854 if (ctype
->translit_ignore
!= NULL
)
856 struct translit_ignore_t
*firstp
= ctype
->translit_ignore
;
857 struct translit_ignore_t
*runp
;
859 ctype
->ntranslit_ignore
= 1;
861 for (runp
= firstp
->next
; runp
!= NULL
; runp
= runp
->next
)
863 struct translit_ignore_t
*lastp
= NULL
;
864 struct translit_ignore_t
*cmpp
;
866 ++ctype
->ntranslit_ignore
;
868 for (cmpp
= firstp
; cmpp
!= NULL
; lastp
= cmpp
, cmpp
= cmpp
->next
)
869 if (runp
->from
< cmpp
->from
)
877 ctype
->translit_ignore
= firstp
;
883 ctype_output (struct localedef_t
*locale
, const struct charmap_t
*charmap
,
884 const char *output_path
)
886 static const char nulbytes
[4] = { 0, 0, 0, 0 };
887 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
888 const size_t nelems
= (_NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1
)
889 + ctype
->nr_charclass
+ ctype
->map_collection_nr
);
890 struct iovec
*iov
= alloca (sizeof *iov
891 * (2 + nelems
+ 2 * ctype
->nr_charclass
892 + ctype
->map_collection_nr
+ 4));
893 struct locale_file data
;
894 uint32_t *idx
= alloca (sizeof *idx
* (nelems
+ 1));
895 uint32_t default_missing_len
;
896 size_t elem
, cnt
, offset
, total
;
899 /* Now prepare the output: Find the sizes of the table we can use. */
900 allocate_arrays (ctype
, charmap
, ctype
->repertoire
);
902 data
.magic
= LIMAGIC (LC_CTYPE
);
904 iov
[0].iov_base
= (void *) &data
;
905 iov
[0].iov_len
= sizeof (data
);
907 iov
[1].iov_base
= (void *) idx
;
908 iov
[1].iov_len
= nelems
* sizeof (uint32_t);
910 idx
[0] = iov
[0].iov_len
+ iov
[1].iov_len
;
913 for (elem
= 0; elem
< nelems
; ++elem
)
915 if (elem
< _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1
))
918 #define CTYPE_EMPTY(name) \
920 iov[2 + elem + offset].iov_base = NULL; \
921 iov[2 + elem + offset].iov_len = 0; \
922 idx[elem + 1] = idx[elem]; \
925 CTYPE_EMPTY(_NL_CTYPE_GAP1
);
926 CTYPE_EMPTY(_NL_CTYPE_GAP2
);
927 CTYPE_EMPTY(_NL_CTYPE_GAP3
);
928 CTYPE_EMPTY(_NL_CTYPE_GAP4
);
929 CTYPE_EMPTY(_NL_CTYPE_GAP5
);
930 CTYPE_EMPTY(_NL_CTYPE_GAP6
);
932 #define CTYPE_DATA(name, base, len) \
933 case _NL_ITEM_INDEX (name): \
934 iov[2 + elem + offset].iov_base = (base); \
935 iov[2 + elem + offset].iov_len = (len); \
936 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len; \
939 CTYPE_DATA (_NL_CTYPE_CLASS
,
941 (256 + 128) * sizeof (char_class_t
));
943 CTYPE_DATA (_NL_CTYPE_TOUPPER
,
945 (256 + 128) * sizeof (uint32_t));
946 CTYPE_DATA (_NL_CTYPE_TOLOWER
,
948 (256 + 128) * sizeof (uint32_t));
950 CTYPE_DATA (_NL_CTYPE_TOUPPER32
,
952 256 * sizeof (uint32_t));
953 CTYPE_DATA (_NL_CTYPE_TOLOWER32
,
955 256 * sizeof (uint32_t));
957 CTYPE_DATA (_NL_CTYPE_CLASS32
,
959 256 * sizeof (char_class32_t
));
961 CTYPE_DATA (_NL_CTYPE_CLASS_OFFSET
,
962 &ctype
->class_offset
, sizeof (uint32_t));
964 CTYPE_DATA (_NL_CTYPE_MAP_OFFSET
,
965 &ctype
->map_offset
, sizeof (uint32_t));
967 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TAB_SIZE
,
968 &ctype
->translit_idx_size
, sizeof (uint32_t));
970 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_IDX
,
971 ctype
->translit_from_idx
,
972 ctype
->translit_idx_size
* sizeof (uint32_t));
974 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_TBL
,
975 ctype
->translit_from_tbl
,
976 ctype
->translit_from_tbl_size
);
978 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_IDX
,
979 ctype
->translit_to_idx
,
980 ctype
->translit_idx_size
* sizeof (uint32_t));
982 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_TBL
,
983 ctype
->translit_to_tbl
, ctype
->translit_to_tbl_size
);
985 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES
):
986 /* The class name array. */
988 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
, ++offset
)
990 iov
[2 + elem
+ offset
].iov_base
991 = (void *) ctype
->classnames
[cnt
];
992 iov
[2 + elem
+ offset
].iov_len
993 = strlen (ctype
->classnames
[cnt
]) + 1;
994 total
+= iov
[2 + elem
+ offset
].iov_len
;
996 iov
[2 + elem
+ offset
].iov_base
= (void *) nulbytes
;
997 iov
[2 + elem
+ offset
].iov_len
= 4 - (total
% 4);
998 total
+= 4 - (total
% 4);
1000 idx
[elem
+ 1] = idx
[elem
] + total
;
1003 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES
):
1004 /* The map name array. */
1006 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
, ++offset
)
1008 iov
[2 + elem
+ offset
].iov_base
1009 = (void *) ctype
->mapnames
[cnt
];
1010 iov
[2 + elem
+ offset
].iov_len
1011 = strlen (ctype
->mapnames
[cnt
]) + 1;
1012 total
+= iov
[2 + elem
+ offset
].iov_len
;
1014 iov
[2 + elem
+ offset
].iov_base
= (void *) nulbytes
;
1015 iov
[2 + elem
+ offset
].iov_len
= 4 - (total
% 4);
1016 total
+= 4 - (total
% 4);
1018 idx
[elem
+ 1] = idx
[elem
] + total
;
1021 CTYPE_DATA (_NL_CTYPE_WIDTH
,
1022 ctype
->width
.iov_base
,
1023 ctype
->width
.iov_len
);
1025 CTYPE_DATA (_NL_CTYPE_MB_CUR_MAX
,
1026 &ctype
->mb_cur_max
, sizeof (uint32_t));
1028 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME
):
1029 total
= strlen (ctype
->codeset_name
) + 1;
1031 iov
[2 + elem
+ offset
].iov_base
= (char *) ctype
->codeset_name
;
1034 iov
[2 + elem
+ offset
].iov_base
= alloca ((total
+ 3) & ~3);
1035 memset (mempcpy (iov
[2 + elem
+ offset
].iov_base
,
1036 ctype
->codeset_name
, total
),
1037 '\0', 4 - (total
& 3));
1038 total
= (total
+ 3) & ~3;
1040 iov
[2 + elem
+ offset
].iov_len
= total
;
1041 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1045 CTYPE_DATA (_NL_CTYPE_MAP_TO_NONASCII
,
1046 &ctype
->to_nonascii
, sizeof (uint32_t));
1048 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN
):
1049 iov
[2 + elem
+ offset
].iov_base
= alloca (sizeof (uint32_t));
1050 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
1051 *(uint32_t *) iov
[2 + elem
+ offset
].iov_base
=
1052 ctype
->mbdigits_act
/ 10;
1053 idx
[elem
+ 1] = idx
[elem
] + sizeof (uint32_t);
1056 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN
):
1057 /* Align entries. */
1058 iov
[2 + elem
+ offset
].iov_base
= (void *) nulbytes
;
1059 iov
[2 + elem
+ offset
].iov_len
= (4 - idx
[elem
] % 4) % 4;
1060 idx
[elem
] += iov
[2 + elem
+ offset
].iov_len
;
1063 iov
[2 + elem
+ offset
].iov_base
= alloca (sizeof (uint32_t));
1064 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
1065 *(uint32_t *) iov
[2 + elem
+ offset
].iov_base
=
1066 ctype
->wcdigits_act
/ 10;
1067 idx
[elem
+ 1] = idx
[elem
] + sizeof (uint32_t);
1070 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB
) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB
):
1071 /* Compute the length of all possible characters. For INDIGITS
1072 there might be more than one. We simply concatenate all of
1073 them with a NUL byte following. The NUL byte wouldn't be
1074 necessary but it makes it easier for the user. */
1077 for (cnt
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB
);
1078 cnt
< ctype
->mbdigits_act
; cnt
+= 10)
1079 total
+= ctype
->mbdigits
[cnt
]->nbytes
+ 1;
1080 iov
[2 + elem
+ offset
].iov_base
= (char *) alloca (total
);
1081 iov
[2 + elem
+ offset
].iov_len
= total
;
1083 cp
= iov
[2 + elem
+ offset
].iov_base
;
1084 for (cnt
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB
);
1085 cnt
< ctype
->mbdigits_act
; cnt
+= 10)
1087 cp
= mempcpy (cp
, ctype
->mbdigits
[cnt
]->bytes
,
1088 ctype
->mbdigits
[cnt
]->nbytes
);
1091 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1094 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB
) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB
):
1095 /* Copy the multibyte sequence of this output digit. Unlike
1096 INDIGITS there is exactly one sequence per digit. It is
1097 followed by a NUL byte; the NUL byte wouldn't be
1098 necessary but it makes it easier for the user. */
1099 cnt
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB
);
1100 total
= ctype
->mboutdigits
[cnt
]->nbytes
+ 1;
1101 iov
[2 + elem
+ offset
].iov_base
= (char *) alloca (total
);
1102 iov
[2 + elem
+ offset
].iov_len
= total
;
1104 *(char *) mempcpy (iov
[2 + elem
+ offset
].iov_base
,
1105 ctype
->mboutdigits
[cnt
]->bytes
,
1106 ctype
->mboutdigits
[cnt
]->nbytes
) = '\0';
1107 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1110 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC
) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC
):
1111 total
= ctype
->wcdigits_act
/ 10;
1113 iov
[2 + elem
+ offset
].iov_base
=
1114 (uint32_t *) alloca (total
* sizeof (uint32_t));
1115 iov
[2 + elem
+ offset
].iov_len
= total
* sizeof (uint32_t);
1117 for (cnt
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC
);
1118 cnt
< ctype
->wcdigits_act
; cnt
+= 10)
1119 ((uint32_t *) iov
[2 + elem
+ offset
].iov_base
)[cnt
/ 10]
1120 = ctype
->wcdigits
[cnt
];
1121 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1124 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC
):
1125 /* Align entries. */
1126 iov
[2 + elem
+ offset
].iov_base
= (void *) nulbytes
;
1127 iov
[2 + elem
+ offset
].iov_len
= (4 - idx
[elem
] % 4) % 4;
1128 idx
[elem
] += iov
[2 + elem
+ offset
].iov_len
;
1132 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT1_WC
) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC
):
1133 cnt
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC
);
1134 iov
[2 + elem
+ offset
].iov_base
= &ctype
->wcoutdigits
[cnt
];
1135 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
1136 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1139 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN
):
1140 /* Align entries. */
1141 iov
[2 + elem
+ offset
].iov_base
= (void *) nulbytes
;
1142 iov
[2 + elem
+ offset
].iov_len
= (4 - idx
[elem
] % 4) % 4;
1143 idx
[elem
] += iov
[2 + elem
+ offset
].iov_len
;
1146 default_missing_len
= (ctype
->default_missing
1147 ? wcslen ((wchar_t *)ctype
->default_missing
)
1149 iov
[2 + elem
+ offset
].iov_base
= &default_missing_len
;
1150 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
1151 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1154 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING
):
1155 iov
[2 + elem
+ offset
].iov_base
=
1156 ctype
->default_missing
?: (uint32_t *) L
"";
1157 iov
[2 + elem
+ offset
].iov_len
=
1158 wcslen (iov
[2 + elem
+ offset
].iov_base
) * sizeof (uint32_t);
1159 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1162 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE_LEN
):
1163 /* Align entries. */
1164 iov
[2 + elem
+ offset
].iov_base
= (void *) nulbytes
;
1165 iov
[2 + elem
+ offset
].iov_len
= (4 - idx
[elem
] % 4) % 4;
1166 idx
[elem
] += iov
[2 + elem
+ offset
].iov_len
;
1169 iov
[2 + elem
+ offset
].iov_base
= &ctype
->ntranslit_ignore
;
1170 iov
[2 + elem
+ offset
].iov_len
= sizeof (uint32_t);
1171 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1174 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE
):
1176 uint32_t *ranges
= (uint32_t *) alloca (ctype
->ntranslit_ignore
1177 * 3 * sizeof (uint32_t));
1178 struct translit_ignore_t
*runp
;
1180 iov
[2 + elem
+ offset
].iov_base
= ranges
;
1181 iov
[2 + elem
+ offset
].iov_len
= (ctype
->ntranslit_ignore
1182 * 3 * sizeof (uint32_t));
1184 for (runp
= ctype
->translit_ignore
; runp
!= NULL
;
1187 *ranges
++ = runp
->from
;
1188 *ranges
++ = runp
->to
;
1189 *ranges
++ = runp
->step
;
1192 /* Remove the following line in case a new entry is added
1193 after _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN. */
1195 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1199 assert (! "unknown CTYPE element");
1203 /* Handle extra maps. */
1204 size_t nr
= elem
- _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1
);
1205 if (nr
< ctype
->nr_charclass
)
1207 iov
[2 + elem
+ offset
].iov_base
= ctype
->class_b
[nr
];
1208 iov
[2 + elem
+ offset
].iov_len
= 256 / 32 * sizeof (uint32_t);
1209 idx
[elem
] += iov
[2 + elem
+ offset
].iov_len
;
1212 iov
[2 + elem
+ offset
] = ctype
->class_3level
[nr
];
1216 nr
-= ctype
->nr_charclass
;
1217 assert (nr
< ctype
->map_collection_nr
);
1218 iov
[2 + elem
+ offset
] = ctype
->map_3level
[nr
];
1220 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
1224 assert (2 + elem
+ offset
== (nelems
+ 2 * ctype
->nr_charclass
1225 + ctype
->map_collection_nr
+ 4 + 2));
1227 write_locale_data (output_path
, LC_CTYPE
, "LC_CTYPE", 2 + elem
+ offset
,
1232 /* Local functions. */
1234 ctype_class_new (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
1239 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
1240 if (strcmp (ctype
->classnames
[cnt
], name
) == 0)
1243 if (cnt
< ctype
->nr_charclass
)
1245 lr_error (lr
, _("character class `%s' already defined"), name
);
1249 if (ctype
->nr_charclass
== MAX_NR_CHARCLASS
)
1250 /* Exit code 2 is prescribed in P1003.2b. */
1251 WITH_CUR_LOCALE (error (2, 0, _("\
1252 implementation limit: no more than %Zd character classes allowed"),
1255 ctype
->classnames
[ctype
->nr_charclass
++] = name
;
1260 ctype_map_new (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
1261 const char *name
, const struct charmap_t
*charmap
)
1263 size_t max_chars
= 0;
1266 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
1268 if (strcmp (ctype
->mapnames
[cnt
], name
) == 0)
1271 if (max_chars
< ctype
->map_collection_max
[cnt
])
1272 max_chars
= ctype
->map_collection_max
[cnt
];
1275 if (cnt
< ctype
->map_collection_nr
)
1277 lr_error (lr
, _("character map `%s' already defined"), name
);
1281 if (ctype
->map_collection_nr
== MAX_NR_CHARMAP
)
1282 /* Exit code 2 is prescribed in P1003.2b. */
1283 WITH_CUR_LOCALE (error (2, 0, _("\
1284 implementation limit: no more than %d character maps allowed"),
1287 ctype
->mapnames
[cnt
] = name
;
1290 ctype
->map_collection_max
[cnt
] = charmap
->mb_cur_max
== 1 ? 256 : 512;
1292 ctype
->map_collection_max
[cnt
] = max_chars
;
1294 ctype
->map_collection
[cnt
] = (uint32_t *)
1295 xcalloc (sizeof (uint32_t), ctype
->map_collection_max
[cnt
]);
1296 ctype
->map_collection_act
[cnt
] = 256;
1298 ++ctype
->map_collection_nr
;
1302 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1303 is possible if we only want to extend the name array. */
1305 find_idx (struct locale_ctype_t
*ctype
, uint32_t **table
, size_t *max
,
1306 size_t *act
, uint32_t idx
)
1311 return table
== NULL
? NULL
: &(*table
)[idx
];
1313 /* Use the charnames_idx lookup table instead of the slow search loop. */
1315 cnt
= idx_table_get (&ctype
->charnames_idx
, idx
);
1318 cnt
= ctype
->charnames_act
;
1320 for (cnt
= 256; cnt
< ctype
->charnames_act
; ++cnt
)
1321 if (ctype
->charnames
[cnt
] == idx
)
1325 /* We have to distinguish two cases: the name is found or not. */
1326 if (cnt
== ctype
->charnames_act
)
1328 /* Extend the name array. */
1329 if (ctype
->charnames_act
== ctype
->charnames_max
)
1331 ctype
->charnames_max
*= 2;
1332 ctype
->charnames
= (uint32_t *)
1333 xrealloc (ctype
->charnames
,
1334 sizeof (uint32_t) * ctype
->charnames_max
);
1336 ctype
->charnames
[ctype
->charnames_act
++] = idx
;
1337 idx_table_add (&ctype
->charnames_idx
, idx
, cnt
);
1341 /* We have done everything we are asked to do. */
1345 /* The caller does not want to extend the table. */
1346 return (cnt
>= *act
? NULL
: &(*table
)[cnt
]);
1352 size_t old_max
= *max
;
1355 while (*max
<= cnt
);
1358 (uint32_t *) xrealloc (*table
, *max
* sizeof (uint32_t));
1359 memset (&(*table
)[old_max
], '\0',
1360 (*max
- old_max
) * sizeof (uint32_t));
1366 return &(*table
)[cnt
];
1371 get_character (struct token
*now
, const struct charmap_t
*charmap
,
1372 struct repertoire_t
*repertoire
,
1373 struct charseq
**seqp
, uint32_t *wchp
)
1375 if (now
->tok
== tok_bsymbol
)
1377 /* This will hopefully be the normal case. */
1378 *wchp
= repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
1379 now
->val
.str
.lenmb
);
1380 *seqp
= charmap_find_value (charmap
, now
->val
.str
.startmb
,
1381 now
->val
.str
.lenmb
);
1383 else if (now
->tok
== tok_ucs4
)
1387 snprintf (utmp
, sizeof (utmp
), "U%08X", now
->val
.ucs4
);
1388 *seqp
= charmap_find_value (charmap
, utmp
, 9);
1391 *seqp
= repertoire_find_seq (repertoire
, now
->val
.ucs4
);
1395 /* Compute the value in the charmap from the UCS value. */
1396 const char *symbol
= repertoire_find_symbol (repertoire
,
1402 *seqp
= charmap_find_value (charmap
, symbol
, strlen (symbol
));
1406 if (repertoire
!= NULL
)
1408 /* Insert a negative entry. */
1409 static const struct charseq negative
1410 = { .ucs4
= ILLEGAL_CHAR_VALUE
};
1411 uint32_t *newp
= obstack_alloc (&repertoire
->mem_pool
,
1413 *newp
= now
->val
.ucs4
;
1415 insert_entry (&repertoire
->seq_table
, newp
,
1416 sizeof (uint32_t), (void *) &negative
);
1420 (*seqp
)->ucs4
= now
->val
.ucs4
;
1422 else if ((*seqp
)->ucs4
!= now
->val
.ucs4
)
1425 *wchp
= now
->val
.ucs4
;
1427 else if (now
->tok
== tok_charcode
)
1429 /* We must map from the byte code to UCS4. */
1430 *seqp
= charmap_find_symbol (charmap
, now
->val
.str
.startmb
,
1431 now
->val
.str
.lenmb
);
1434 *wchp
= ILLEGAL_CHAR_VALUE
;
1437 if ((*seqp
)->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1438 (*seqp
)->ucs4
= repertoire_find_value (repertoire
, (*seqp
)->name
,
1439 strlen ((*seqp
)->name
));
1440 *wchp
= (*seqp
)->ucs4
;
1450 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>' and
1451 the .(2). counterparts. */
1453 charclass_symbolic_ellipsis (struct linereader
*ldfile
,
1454 struct locale_ctype_t
*ctype
,
1455 const struct charmap_t
*charmap
,
1456 struct repertoire_t
*repertoire
,
1458 const char *last_str
,
1459 unsigned long int class256_bit
,
1460 unsigned long int class_bit
, int base
,
1461 int ignore_content
, int handle_digits
, int step
)
1463 const char *nowstr
= now
->val
.str
.startmb
;
1464 char tmp
[now
->val
.str
.lenmb
+ 1];
1467 unsigned long int from
;
1468 unsigned long int to
;
1470 /* We have to compute the ellipsis values using the symbolic names. */
1471 assert (last_str
!= NULL
);
1473 if (strlen (last_str
) != now
->val
.str
.lenmb
)
1477 _("`%s' and `%.*s' are not valid names for symbolic range"),
1478 last_str
, (int) now
->val
.str
.lenmb
, nowstr
);
1482 if (memcmp (last_str
, nowstr
, now
->val
.str
.lenmb
) == 0)
1483 /* Nothing to do, the names are the same. */
1486 for (cp
= last_str
; *cp
== *(nowstr
+ (cp
- last_str
)); ++cp
)
1490 from
= strtoul (cp
, &endp
, base
);
1491 if ((from
== UINT_MAX
&& errno
== ERANGE
) || *endp
!= '\0')
1494 to
= strtoul (nowstr
+ (cp
- last_str
), &endp
, base
);
1495 if ((to
== UINT_MAX
&& errno
== ERANGE
)
1496 || (endp
- nowstr
) != now
->val
.str
.lenmb
|| from
>= to
)
1499 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1500 if (!ignore_content
)
1502 now
->val
.str
.startmb
= tmp
;
1503 while ((from
+= step
) <= to
)
1505 struct charseq
*seq
;
1508 sprintf (tmp
, (base
== 10 ? "%.*s%0*ld" : "%.*s%0*lX"),
1509 (int) (cp
- last_str
), last_str
,
1510 (int) (now
->val
.str
.lenmb
- (cp
- last_str
)),
1513 get_character (now
, charmap
, repertoire
, &seq
, &wch
);
1515 if (seq
!= NULL
&& seq
->nbytes
== 1)
1516 /* Yep, we can store information about this byte sequence. */
1517 ctype
->class256_collection
[seq
->bytes
[0]] |= class256_bit
;
1519 if (wch
!= ILLEGAL_CHAR_VALUE
&& class_bit
!= 0)
1520 /* We have the UCS4 position. */
1521 *find_idx (ctype
, &ctype
->class_collection
,
1522 &ctype
->class_collection_max
,
1523 &ctype
->class_collection_act
, wch
) |= class_bit
;
1525 if (handle_digits
== 1)
1527 /* We must store the digit values. */
1528 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1530 ctype
->mbdigits_max
*= 2;
1531 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1532 (ctype
->mbdigits_max
1533 * sizeof (char *)));
1534 ctype
->wcdigits_max
*= 2;
1535 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1536 (ctype
->wcdigits_max
1537 * sizeof (uint32_t)));
1540 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
1541 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
1543 else if (handle_digits
== 2)
1545 /* We must store the digit values. */
1546 if (ctype
->outdigits_act
>= 10)
1548 lr_error (ldfile
, _("\
1549 %s: field `%s' does not contain exactly ten entries"),
1550 "LC_CTYPE", "outdigit");
1554 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
1555 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
1556 ++ctype
->outdigits_act
;
1563 /* Ellipsis like in `<U1234>..<U2345>' or `<U1234>..(2)..<U2345>'. */
1565 charclass_ucs4_ellipsis (struct linereader
*ldfile
,
1566 struct locale_ctype_t
*ctype
,
1567 const struct charmap_t
*charmap
,
1568 struct repertoire_t
*repertoire
,
1569 struct token
*now
, uint32_t last_wch
,
1570 unsigned long int class256_bit
,
1571 unsigned long int class_bit
, int ignore_content
,
1572 int handle_digits
, int step
)
1574 if (last_wch
> now
->val
.ucs4
)
1576 lr_error (ldfile
, _("\
1577 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1578 (now
->val
.ucs4
| last_wch
) < 65536 ? 4 : 8, now
->val
.ucs4
,
1579 (now
->val
.ucs4
| last_wch
) < 65536 ? 4 : 8, last_wch
);
1583 if (!ignore_content
)
1584 while ((last_wch
+= step
) <= now
->val
.ucs4
)
1586 /* We have to find out whether there is a byte sequence corresponding
1587 to this UCS4 value. */
1588 struct charseq
*seq
;
1591 snprintf (utmp
, sizeof (utmp
), "U%08X", last_wch
);
1592 seq
= charmap_find_value (charmap
, utmp
, 9);
1595 snprintf (utmp
, sizeof (utmp
), "U%04X", last_wch
);
1596 seq
= charmap_find_value (charmap
, utmp
, 5);
1600 /* Try looking in the repertoire map. */
1601 seq
= repertoire_find_seq (repertoire
, last_wch
);
1603 /* If this is the first time we look for this sequence create a new
1607 static const struct charseq negative
1608 = { .ucs4
= ILLEGAL_CHAR_VALUE
};
1610 /* Find the symbolic name for this UCS4 value. */
1611 if (repertoire
!= NULL
)
1613 const char *symbol
= repertoire_find_symbol (repertoire
,
1615 uint32_t *newp
= obstack_alloc (&repertoire
->mem_pool
,
1620 /* We have a name, now search the multibyte value. */
1621 seq
= charmap_find_value (charmap
, symbol
, strlen (symbol
));
1624 /* We have to create a fake entry. */
1625 seq
= (struct charseq
*) &negative
;
1627 seq
->ucs4
= last_wch
;
1629 insert_entry (&repertoire
->seq_table
, newp
, sizeof (uint32_t),
1633 /* We have to create a fake entry. */
1634 seq
= (struct charseq
*) &negative
;
1637 /* We have a name, now search the multibyte value. */
1638 if (seq
->ucs4
== last_wch
&& seq
->nbytes
== 1)
1639 /* Yep, we can store information about this byte sequence. */
1640 ctype
->class256_collection
[(size_t) seq
->bytes
[0]]
1643 /* And of course we have the UCS4 position. */
1645 *find_idx (ctype
, &ctype
->class_collection
,
1646 &ctype
->class_collection_max
,
1647 &ctype
->class_collection_act
, last_wch
) |= class_bit
;
1649 if (handle_digits
== 1)
1651 /* We must store the digit values. */
1652 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1654 ctype
->mbdigits_max
*= 2;
1655 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1656 (ctype
->mbdigits_max
1657 * sizeof (char *)));
1658 ctype
->wcdigits_max
*= 2;
1659 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1660 (ctype
->wcdigits_max
1661 * sizeof (uint32_t)));
1664 ctype
->mbdigits
[ctype
->mbdigits_act
++] = (seq
->ucs4
== last_wch
1666 ctype
->wcdigits
[ctype
->wcdigits_act
++] = last_wch
;
1668 else if (handle_digits
== 2)
1670 /* We must store the digit values. */
1671 if (ctype
->outdigits_act
>= 10)
1673 lr_error (ldfile
, _("\
1674 %s: field `%s' does not contain exactly ten entries"),
1675 "LC_CTYPE", "outdigit");
1679 ctype
->mboutdigits
[ctype
->outdigits_act
] = (seq
->ucs4
== last_wch
1681 ctype
->wcoutdigits
[ctype
->outdigits_act
] = last_wch
;
1682 ++ctype
->outdigits_act
;
1688 /* Ellipsis as in `/xea/x12.../xea/x34'. */
1690 charclass_charcode_ellipsis (struct linereader
*ldfile
,
1691 struct locale_ctype_t
*ctype
,
1692 const struct charmap_t
*charmap
,
1693 struct repertoire_t
*repertoire
,
1694 struct token
*now
, char *last_charcode
,
1695 uint32_t last_charcode_len
,
1696 unsigned long int class256_bit
,
1697 unsigned long int class_bit
, int ignore_content
,
1700 /* First check whether the to-value is larger. */
1701 if (now
->val
.charcode
.nbytes
!= last_charcode_len
)
1703 lr_error (ldfile
, _("\
1704 start and end character sequence of range must have the same length"));
1708 if (memcmp (last_charcode
, now
->val
.charcode
.bytes
, last_charcode_len
) > 0)
1710 lr_error (ldfile
, _("\
1711 to-value character sequence is smaller than from-value sequence"));
1715 if (!ignore_content
)
1719 /* Increment the byte sequence value. */
1720 struct charseq
*seq
;
1724 for (i
= last_charcode_len
- 1; i
>= 0; --i
)
1725 if (++last_charcode
[i
] != 0)
1728 if (last_charcode_len
== 1)
1729 /* Of course we have the charcode value. */
1730 ctype
->class256_collection
[(size_t) last_charcode
[0]]
1733 /* Find the symbolic name. */
1734 seq
= charmap_find_symbol (charmap
, last_charcode
,
1738 if (seq
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1739 seq
->ucs4
= repertoire_find_value (repertoire
, seq
->name
,
1740 strlen (seq
->name
));
1741 wch
= seq
== NULL
? ILLEGAL_CHAR_VALUE
: seq
->ucs4
;
1743 if (wch
!= ILLEGAL_CHAR_VALUE
&& class_bit
!= 0)
1744 *find_idx (ctype
, &ctype
->class_collection
,
1745 &ctype
->class_collection_max
,
1746 &ctype
->class_collection_act
, wch
) |= class_bit
;
1749 wch
= ILLEGAL_CHAR_VALUE
;
1751 if (handle_digits
== 1)
1753 /* We must store the digit values. */
1754 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
1756 ctype
->mbdigits_max
*= 2;
1757 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
1758 (ctype
->mbdigits_max
1759 * sizeof (char *)));
1760 ctype
->wcdigits_max
*= 2;
1761 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
1762 (ctype
->wcdigits_max
1763 * sizeof (uint32_t)));
1766 seq
= xmalloc (sizeof (struct charseq
) + last_charcode_len
);
1767 memcpy ((char *) (seq
+ 1), last_charcode
, last_charcode_len
);
1768 seq
->nbytes
= last_charcode_len
;
1770 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
1771 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
1773 else if (handle_digits
== 2)
1775 struct charseq
*seq
;
1776 /* We must store the digit values. */
1777 if (ctype
->outdigits_act
>= 10)
1779 lr_error (ldfile
, _("\
1780 %s: field `%s' does not contain exactly ten entries"),
1781 "LC_CTYPE", "outdigit");
1785 seq
= xmalloc (sizeof (struct charseq
) + last_charcode_len
);
1786 memcpy ((char *) (seq
+ 1), last_charcode
, last_charcode_len
);
1787 seq
->nbytes
= last_charcode_len
;
1789 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
1790 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
1791 ++ctype
->outdigits_act
;
1794 while (memcmp (last_charcode
, now
->val
.charcode
.bytes
,
1795 last_charcode_len
) != 0);
1801 find_translit2 (struct locale_ctype_t
*ctype
, const struct charmap_t
*charmap
,
1804 struct translit_t
*trunp
= ctype
->translit
;
1805 struct translit_ignore_t
*tirunp
= ctype
->translit_ignore
;
1807 while (trunp
!= NULL
)
1809 /* XXX We simplify things here. The transliterations we look
1810 for are only allowed to have one character. */
1811 if (trunp
->from
[0] == wch
&& trunp
->from
[1] == 0)
1813 /* Found it. Now look for a transliteration which can be
1814 represented with the character set. */
1815 struct translit_to_t
*torunp
= trunp
->to
;
1817 while (torunp
!= NULL
)
1821 for (i
= 0; torunp
->str
[i
] != 0; ++i
)
1825 snprintf (utmp
, sizeof (utmp
), "U%08X", torunp
->str
[i
]);
1826 if (charmap_find_value (charmap
, utmp
, 9) == NULL
)
1827 /* This character cannot be represented. */
1831 if (torunp
->str
[i
] == 0)
1834 torunp
= torunp
->next
;
1840 trunp
= trunp
->next
;
1843 /* Check for ignored chars. */
1844 while (tirunp
!= NULL
)
1846 if (tirunp
->from
<= wch
&& tirunp
->to
>= wch
)
1850 for (wi
= tirunp
->from
; wi
<= wch
; wi
+= tirunp
->step
)
1852 return (uint32_t []) { 0 };
1856 /* Nothing found. */
1862 find_translit (struct localedef_t
*locale
, const struct charmap_t
*charmap
,
1865 struct locale_ctype_t
*ctype
;
1866 uint32_t *result
= NULL
;
1868 assert (locale
!= NULL
);
1869 ctype
= locale
->categories
[LC_CTYPE
].ctype
;
1874 if (ctype
->translit
!= NULL
)
1875 result
= find_translit2 (ctype
, charmap
, wch
);
1879 struct translit_include_t
*irunp
= ctype
->translit_include
;
1881 while (irunp
!= NULL
&& result
== NULL
)
1883 result
= find_translit (find_locale (CTYPE_LOCALE
,
1885 irunp
->copy_repertoire
,
1888 irunp
= irunp
->next
;
1896 /* Read one transliteration entry. */
1898 read_widestring (struct linereader
*ldfile
, struct token
*now
,
1899 const struct charmap_t
*charmap
,
1900 struct repertoire_t
*repertoire
)
1904 if (now
->tok
== tok_default_missing
)
1905 /* The special name "" will denote this case. */
1906 wstr
= ((uint32_t *) { 0 });
1907 else if (now
->tok
== tok_bsymbol
)
1909 /* Get the value from the repertoire. */
1910 wstr
= (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1911 wstr
[0] = repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
1912 now
->val
.str
.lenmb
);
1913 if (wstr
[0] == ILLEGAL_CHAR_VALUE
)
1915 /* We cannot proceed, we don't know the UCS4 value. */
1922 else if (now
->tok
== tok_ucs4
)
1924 wstr
= (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1925 wstr
[0] = now
->val
.ucs4
;
1928 else if (now
->tok
== tok_charcode
)
1930 /* Argh, we have to convert to the symbol name first and then to the
1932 struct charseq
*seq
= charmap_find_symbol (charmap
,
1933 now
->val
.str
.startmb
,
1934 now
->val
.str
.lenmb
);
1936 /* Cannot find the UCS4 value. */
1939 if (seq
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
1940 seq
->ucs4
= repertoire_find_value (repertoire
, seq
->name
,
1941 strlen (seq
->name
));
1942 if (seq
->ucs4
== ILLEGAL_CHAR_VALUE
)
1943 /* We cannot proceed, we don't know the UCS4 value. */
1946 wstr
= (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1947 wstr
[0] = seq
->ucs4
;
1950 else if (now
->tok
== tok_string
)
1952 wstr
= now
->val
.str
.startwc
;
1953 if (wstr
== NULL
|| wstr
[0] == 0)
1958 if (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
1959 lr_ignore_rest (ldfile
, 0);
1960 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1961 return (uint32_t *) -1l;
1969 read_translit_entry (struct linereader
*ldfile
, struct locale_ctype_t
*ctype
,
1970 struct token
*now
, const struct charmap_t
*charmap
,
1971 struct repertoire_t
*repertoire
)
1973 uint32_t *from_wstr
= read_widestring (ldfile
, now
, charmap
, repertoire
);
1974 struct translit_t
*result
;
1975 struct translit_to_t
**top
;
1976 struct obstack
*ob
= &ctype
->mempool
;
1980 if (from_wstr
== NULL
)
1981 /* There is no valid from string. */
1984 result
= (struct translit_t
*) obstack_alloc (ob
,
1985 sizeof (struct translit_t
));
1986 result
->from
= from_wstr
;
1987 result
->fname
= ldfile
->fname
;
1988 result
->lineno
= ldfile
->lineno
;
1989 result
->next
= NULL
;
1999 /* Next we have one or more transliterations. They are
2000 separated by semicolons. */
2001 now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
);
2003 if (!first
&& (now
->tok
== tok_semicolon
|| now
->tok
== tok_eol
))
2005 /* One string read. */
2006 const uint32_t zero
= 0;
2010 obstack_grow (ob
, &zero
, 4);
2011 to_wstr
= obstack_finish (ob
);
2013 *top
= obstack_alloc (ob
, sizeof (struct translit_to_t
));
2014 (*top
)->str
= to_wstr
;
2015 (*top
)->next
= NULL
;
2018 if (now
->tok
== tok_eol
)
2020 result
->next
= ctype
->translit
;
2021 ctype
->translit
= result
;
2026 top
= &(*top
)->next
;
2031 to_wstr
= read_widestring (ldfile
, now
, charmap
, repertoire
);
2032 if (to_wstr
== (uint32_t *) -1l)
2034 /* An error occurred. */
2035 obstack_free (ob
, result
);
2039 if (to_wstr
== NULL
)
2042 /* This value is usable. */
2043 obstack_grow (ob
, to_wstr
, wcslen ((wchar_t *) to_wstr
) * 4);
2052 read_translit_ignore_entry (struct linereader
*ldfile
,
2053 struct locale_ctype_t
*ctype
,
2054 const struct charmap_t
*charmap
,
2055 struct repertoire_t
*repertoire
)
2057 /* We expect a semicolon-separated list of characters we ignore. We are
2058 only interested in the wide character definitions. These must be
2059 single characters, possibly defining a range when an ellipsis is used. */
2062 struct token
*now
= lr_token (ldfile
, charmap
, NULL
, repertoire
,
2064 struct translit_ignore_t
*newp
;
2067 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2070 _("premature end of `translit_ignore' definition"));
2074 if (now
->tok
!= tok_bsymbol
&& now
->tok
!= tok_ucs4
)
2076 lr_error (ldfile
, _("syntax error"));
2077 lr_ignore_rest (ldfile
, 0);
2081 if (now
->tok
== tok_ucs4
)
2082 from
= now
->val
.ucs4
;
2084 /* Try to get the value. */
2085 from
= repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
2086 now
->val
.str
.lenmb
);
2088 if (from
== ILLEGAL_CHAR_VALUE
)
2090 lr_error (ldfile
, "invalid character name");
2095 newp
= (struct translit_ignore_t
*)
2096 obstack_alloc (&ctype
->mempool
, sizeof (struct translit_ignore_t
));
2101 newp
->next
= ctype
->translit_ignore
;
2102 ctype
->translit_ignore
= newp
;
2105 /* Now we expect either a semicolon, an ellipsis, or the end of the
2107 now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
);
2109 if (now
->tok
== tok_ellipsis2
|| now
->tok
== tok_ellipsis2_2
)
2111 /* XXX Should we bother implementing `....'? `...' certainly
2112 will not be implemented. */
2114 int step
= now
->tok
== tok_ellipsis2_2
? 2 : 1;
2116 now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
);
2118 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2121 _("premature end of `translit_ignore' definition"));
2125 if (now
->tok
!= tok_bsymbol
&& now
->tok
!= tok_ucs4
)
2127 lr_error (ldfile
, _("syntax error"));
2128 lr_ignore_rest (ldfile
, 0);
2132 if (now
->tok
== tok_ucs4
)
2135 /* Try to get the value. */
2136 to
= repertoire_find_value (repertoire
, now
->val
.str
.startmb
,
2137 now
->val
.str
.lenmb
);
2139 if (to
== ILLEGAL_CHAR_VALUE
)
2140 lr_error (ldfile
, "invalid character name");
2143 /* Make sure the `to'-value is larger. */
2150 lr_error (ldfile
, _("\
2151 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
2152 (to
| from
) < 65536 ? 4 : 8, to
,
2153 (to
| from
) < 65536 ? 4 : 8, from
);
2156 /* And the next token. */
2157 now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
);
2160 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2164 if (now
->tok
== tok_semicolon
)
2168 /* If we come here something is wrong. */
2169 lr_error (ldfile
, _("syntax error"));
2170 lr_ignore_rest (ldfile
, 0);
2176 /* The parser for the LC_CTYPE section of the locale definition. */
2178 ctype_read (struct linereader
*ldfile
, struct localedef_t
*result
,
2179 const struct charmap_t
*charmap
, const char *repertoire_name
,
2182 struct repertoire_t
*repertoire
= NULL
;
2183 struct locale_ctype_t
*ctype
;
2185 enum token_t nowtok
;
2187 struct charseq
*last_seq
;
2188 uint32_t last_wch
= 0;
2189 enum token_t last_token
;
2190 enum token_t ellipsis_token
;
2192 char last_charcode
[16];
2193 size_t last_charcode_len
= 0;
2194 const char *last_str
= NULL
;
2196 struct localedef_t
*copy_locale
= NULL
;
2198 /* Get the repertoire we have to use. */
2199 if (repertoire_name
!= NULL
)
2200 repertoire
= repertoire_read (repertoire_name
);
2202 /* The rest of the line containing `LC_CTYPE' must be free. */
2203 lr_ignore_rest (ldfile
, 1);
2208 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2211 while (nowtok
== tok_eol
);
2213 /* If we see `copy' now we are almost done. */
2214 if (nowtok
== tok_copy
)
2216 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2217 if (now
->tok
!= tok_string
)
2219 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2223 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2224 while (now
->tok
!= tok_eof
&& now
->tok
!= tok_end
);
2226 if (now
->tok
!= tok_eof
2227 || (now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
),
2228 now
->tok
== tok_eof
))
2229 lr_error (ldfile
, _("%s: premature end of file"), "LC_CTYPE");
2230 else if (now
->tok
!= tok_lc_ctype
)
2232 lr_error (ldfile
, _("\
2233 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2234 lr_ignore_rest (ldfile
, 0);
2237 lr_ignore_rest (ldfile
, 1);
2242 if (! ignore_content
)
2244 /* Get the locale definition. */
2245 copy_locale
= load_locale (LC_CTYPE
, now
->val
.str
.startmb
,
2246 repertoire_name
, charmap
, NULL
);
2247 if ((copy_locale
->avail
& CTYPE_LOCALE
) == 0)
2249 /* Not yet loaded. So do it now. */
2250 if (locfile_read (copy_locale
, charmap
) != 0)
2254 if (copy_locale
->categories
[LC_CTYPE
].ctype
== NULL
)
2258 lr_ignore_rest (ldfile
, 1);
2260 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2264 /* Prepare the data structures. */
2265 ctype_startup (ldfile
, result
, charmap
, copy_locale
, ignore_content
);
2266 ctype
= result
->categories
[LC_CTYPE
].ctype
;
2268 /* Remember the repertoire we use. */
2269 if (!ignore_content
)
2270 ctype
->repertoire
= repertoire
;
2274 unsigned long int class_bit
= 0;
2275 unsigned long int class256_bit
= 0;
2276 int handle_digits
= 0;
2278 /* Of course we don't proceed beyond the end of file. */
2279 if (nowtok
== tok_eof
)
2282 /* Ignore empty lines. */
2283 if (nowtok
== tok_eol
)
2285 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2293 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2294 while (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
2296 ctype_class_new (ldfile
, ctype
, now
->val
.str
.startmb
);
2297 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2298 if (now
->tok
!= tok_semicolon
)
2300 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2302 if (now
->tok
!= tok_eol
)
2304 %s: syntax error in definition of new character class"), "LC_CTYPE");
2308 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2309 while (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
2311 ctype_map_new (ldfile
, ctype
, now
->val
.str
.startmb
, charmap
);
2312 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2313 if (now
->tok
!= tok_semicolon
)
2315 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2317 if (now
->tok
!= tok_eol
)
2319 %s: syntax error in definition of new character map"), "LC_CTYPE");
2323 /* Ignore the rest of the line if we don't need the input of
2327 lr_ignore_rest (ldfile
, 0);
2331 /* We simply forget the `class' keyword and use the following
2332 operand to determine the bit. */
2333 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2334 if (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
2336 /* Must can be one of the predefined class names. */
2337 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
2338 if (strcmp (ctype
->classnames
[cnt
], now
->val
.str
.startmb
) == 0)
2340 if (cnt
>= ctype
->nr_charclass
)
2342 #ifdef PREDEFINED_CLASSES
2343 if (now
->val
.str
.lenmb
== 8
2344 && memcmp ("special1", now
->val
.str
.startmb
, 8) == 0)
2345 class_bit
= _ISwspecial1
;
2346 else if (now
->val
.str
.lenmb
== 8
2347 && memcmp ("special2", now
->val
.str
.startmb
, 8) == 0)
2348 class_bit
= _ISwspecial2
;
2349 else if (now
->val
.str
.lenmb
== 8
2350 && memcmp ("special3", now
->val
.str
.startmb
, 8) == 0)
2351 class_bit
= _ISwspecial3
;
2355 /* OK, it's a new class. */
2356 ctype_class_new (ldfile
, ctype
, now
->val
.str
.startmb
);
2358 class_bit
= _ISwbit (ctype
->nr_charclass
- 1);
2363 class_bit
= _ISwbit (cnt
);
2365 free (now
->val
.str
.startmb
);
2368 else if (now
->tok
== tok_digit
)
2369 goto handle_tok_digit
;
2370 else if (now
->tok
< tok_upper
|| now
->tok
> tok_blank
)
2374 class_bit
= BITw (now
->tok
);
2375 class256_bit
= BIT (now
->tok
);
2378 /* The next character must be a semicolon. */
2379 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2380 if (now
->tok
!= tok_semicolon
)
2382 goto read_charclass
;
2395 /* Ignore the rest of the line if we don't need the input of
2399 lr_ignore_rest (ldfile
, 0);
2403 class_bit
= BITw (now
->tok
);
2404 class256_bit
= BIT (now
->tok
);
2407 ctype
->class_done
|= class_bit
;
2408 last_token
= tok_none
;
2409 ellipsis_token
= tok_none
;
2411 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2412 while (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
2415 struct charseq
*seq
;
2417 if (ellipsis_token
== tok_none
)
2419 if (get_character (now
, charmap
, repertoire
, &seq
, &wch
))
2422 if (!ignore_content
&& seq
!= NULL
&& seq
->nbytes
== 1)
2423 /* Yep, we can store information about this byte
2425 ctype
->class256_collection
[seq
->bytes
[0]] |= class256_bit
;
2427 if (!ignore_content
&& wch
!= ILLEGAL_CHAR_VALUE
2429 /* We have the UCS4 position. */
2430 *find_idx (ctype
, &ctype
->class_collection
,
2431 &ctype
->class_collection_max
,
2432 &ctype
->class_collection_act
, wch
) |= class_bit
;
2434 last_token
= now
->tok
;
2435 /* Terminate the string. */
2436 if (last_token
== tok_bsymbol
)
2438 now
->val
.str
.startmb
[now
->val
.str
.lenmb
] = '\0';
2439 last_str
= now
->val
.str
.startmb
;
2445 memcpy (last_charcode
, now
->val
.charcode
.bytes
, 16);
2446 last_charcode_len
= now
->val
.charcode
.nbytes
;
2448 if (!ignore_content
&& handle_digits
== 1)
2450 /* We must store the digit values. */
2451 if (ctype
->mbdigits_act
== ctype
->mbdigits_max
)
2453 ctype
->mbdigits_max
+= 10;
2454 ctype
->mbdigits
= xrealloc (ctype
->mbdigits
,
2455 (ctype
->mbdigits_max
2456 * sizeof (char *)));
2457 ctype
->wcdigits_max
+= 10;
2458 ctype
->wcdigits
= xrealloc (ctype
->wcdigits
,
2459 (ctype
->wcdigits_max
2460 * sizeof (uint32_t)));
2463 ctype
->mbdigits
[ctype
->mbdigits_act
++] = seq
;
2464 ctype
->wcdigits
[ctype
->wcdigits_act
++] = wch
;
2466 else if (!ignore_content
&& handle_digits
== 2)
2468 /* We must store the digit values. */
2469 if (ctype
->outdigits_act
>= 10)
2471 lr_error (ldfile
, _("\
2472 %s: field `%s' does not contain exactly ten entries"),
2473 "LC_CTYPE", "outdigit");
2474 lr_ignore_rest (ldfile
, 0);
2478 ctype
->mboutdigits
[ctype
->outdigits_act
] = seq
;
2479 ctype
->wcoutdigits
[ctype
->outdigits_act
] = wch
;
2480 ++ctype
->outdigits_act
;
2485 /* Now it gets complicated. We have to resolve the
2486 ellipsis problem. First we must distinguish between
2487 the different kind of ellipsis and this must match the
2488 tokens we have seen. */
2489 assert (last_token
!= tok_none
);
2491 if (last_token
!= now
->tok
)
2493 lr_error (ldfile
, _("\
2494 ellipsis range must be marked by two operands of same type"));
2495 lr_ignore_rest (ldfile
, 0);
2499 if (last_token
== tok_bsymbol
)
2501 if (ellipsis_token
== tok_ellipsis3
)
2502 lr_error (ldfile
, _("with symbolic name range values \
2503 the absolute ellipsis `...' must not be used"));
2505 charclass_symbolic_ellipsis (ldfile
, ctype
, charmap
,
2506 repertoire
, now
, last_str
,
2507 class256_bit
, class_bit
,
2512 handle_digits
, step
);
2514 else if (last_token
== tok_ucs4
)
2516 if (ellipsis_token
!= tok_ellipsis2
)
2517 lr_error (ldfile
, _("\
2518 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
2520 charclass_ucs4_ellipsis (ldfile
, ctype
, charmap
,
2521 repertoire
, now
, last_wch
,
2522 class256_bit
, class_bit
,
2523 ignore_content
, handle_digits
,
2528 assert (last_token
== tok_charcode
);
2530 if (ellipsis_token
!= tok_ellipsis3
)
2531 lr_error (ldfile
, _("\
2532 with character code range values one must use the absolute ellipsis `...'"));
2534 charclass_charcode_ellipsis (ldfile
, ctype
, charmap
,
2538 class256_bit
, class_bit
,
2543 /* Now we have used the last value. */
2544 last_token
= tok_none
;
2547 /* Next we expect a semicolon or the end of the line. */
2548 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2549 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2552 if (last_token
!= tok_none
2553 && now
->tok
>= tok_ellipsis2
&& now
->tok
<= tok_ellipsis4_2
)
2555 if (now
->tok
== tok_ellipsis2_2
)
2557 now
->tok
= tok_ellipsis2
;
2560 else if (now
->tok
== tok_ellipsis4_2
)
2562 now
->tok
= tok_ellipsis4
;
2566 ellipsis_token
= now
->tok
;
2568 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2572 if (now
->tok
!= tok_semicolon
)
2575 /* And get the next character. */
2576 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2578 ellipsis_token
= tok_none
;
2584 /* Ignore the rest of the line if we don't need the input of
2588 lr_ignore_rest (ldfile
, 0);
2593 class_bit
= _ISwdigit
;
2594 class256_bit
= _ISdigit
;
2596 goto read_charclass
;
2599 /* Ignore the rest of the line if we don't need the input of
2603 lr_ignore_rest (ldfile
, 0);
2607 if (ctype
->outdigits_act
!= 0)
2608 lr_error (ldfile
, _("\
2609 %s: field `%s' declared more than once"),
2610 "LC_CTYPE", "outdigit");
2614 goto read_charclass
;
2617 /* Ignore the rest of the line if we don't need the input of
2621 lr_ignore_rest (ldfile
, 0);
2629 /* Ignore the rest of the line if we don't need the input of
2633 lr_ignore_rest (ldfile
, 0);
2641 /* Ignore the rest of the line if we don't need the input of
2645 lr_ignore_rest (ldfile
, 0);
2649 /* We simply forget the `map' keyword and use the following
2650 operand to determine the mapping. */
2651 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2652 if (now
->tok
== tok_ident
|| now
->tok
== tok_string
)
2656 for (cnt
= 2; cnt
< ctype
->map_collection_nr
; ++cnt
)
2657 if (strcmp (now
->val
.str
.startmb
, ctype
->mapnames
[cnt
]) == 0)
2660 if (cnt
< ctype
->map_collection_nr
)
2661 free (now
->val
.str
.startmb
);
2663 /* OK, it's a new map. */
2664 ctype_map_new (ldfile
, ctype
, now
->val
.str
.startmb
, charmap
);
2668 else if (now
->tok
< tok_toupper
|| now
->tok
> tok_tolower
)
2671 mapidx
= now
->tok
- tok_toupper
;
2673 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2674 /* This had better be a semicolon. */
2675 if (now
->tok
!= tok_semicolon
)
2679 /* Test whether this mapping was already defined. */
2680 if (ctype
->tomap_done
[mapidx
])
2682 lr_error (ldfile
, _("duplicated definition for mapping `%s'"),
2683 ctype
->mapnames
[mapidx
]);
2684 lr_ignore_rest (ldfile
, 0);
2687 ctype
->tomap_done
[mapidx
] = 1;
2689 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2690 while (now
->tok
!= tok_eol
&& now
->tok
!= tok_eof
)
2692 struct charseq
*from_seq
;
2694 struct charseq
*to_seq
;
2697 /* Every pair starts with an opening brace. */
2698 if (now
->tok
!= tok_open_brace
)
2701 /* Next comes the from-value. */
2702 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2703 if (get_character (now
, charmap
, repertoire
, &from_seq
,
2707 /* The next is a comma. */
2708 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2709 if (now
->tok
!= tok_comma
)
2712 /* And the other value. */
2713 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2714 if (get_character (now
, charmap
, repertoire
, &to_seq
,
2718 /* And the last thing is the closing brace. */
2719 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2720 if (now
->tok
!= tok_close_brace
)
2723 if (!ignore_content
)
2725 /* Check whether the mapping converts from an ASCII value
2726 to a non-ASCII value. */
2727 if (from_seq
!= NULL
&& from_seq
->nbytes
== 1
2728 && isascii (from_seq
->bytes
[0])
2729 && to_seq
!= NULL
&& (to_seq
->nbytes
!= 1
2730 || !isascii (to_seq
->bytes
[0])))
2731 ctype
->to_nonascii
= 1;
2733 if (mapidx
< 2 && from_seq
!= NULL
&& to_seq
!= NULL
2734 && from_seq
->nbytes
== 1 && to_seq
->nbytes
== 1)
2735 /* We can use this value. */
2736 ctype
->map256_collection
[mapidx
][from_seq
->bytes
[0]]
2739 if (from_wch
!= ILLEGAL_CHAR_VALUE
2740 && to_wch
!= ILLEGAL_CHAR_VALUE
)
2741 /* Both correct values. */
2742 *find_idx (ctype
, &ctype
->map_collection
[mapidx
],
2743 &ctype
->map_collection_max
[mapidx
],
2744 &ctype
->map_collection_act
[mapidx
],
2748 /* Now comes a semicolon or the end of the line/file. */
2749 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2750 if (now
->tok
== tok_semicolon
)
2751 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2755 case tok_translit_start
:
2756 /* Ignore the entire translit section with its peculiar syntax
2757 if we don't need the input. */
2762 lr_ignore_rest (ldfile
, 0);
2763 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2765 while (now
->tok
!= tok_translit_end
&& now
->tok
!= tok_eof
);
2767 if (now
->tok
== tok_eof
)
2768 lr_error (ldfile
, _(\
2769 "%s: `translit_start' section does not end with `translit_end'"),
2775 /* The rest of the line had better be empty. */
2776 lr_ignore_rest (ldfile
, 1);
2778 /* We count here the number of allocated entries in the `translit'
2782 ldfile
->translate_strings
= 1;
2783 ldfile
->return_widestr
= 1;
2785 /* We proceed until we see the `translit_end' token. */
2786 while (now
= lr_token (ldfile
, charmap
, NULL
, repertoire
, verbose
),
2787 now
->tok
!= tok_translit_end
&& now
->tok
!= tok_eof
)
2789 if (now
->tok
== tok_eol
)
2790 /* Ignore empty lines. */
2793 if (now
->tok
== tok_include
)
2795 /* We have to include locale. */
2796 const char *locale_name
;
2797 const char *repertoire_name
;
2798 struct translit_include_t
*include_stmt
, **include_ptr
;
2800 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2801 /* This should be a string or an identifier. In any
2802 case something to name a locale. */
2803 if (now
->tok
!= tok_string
&& now
->tok
!= tok_ident
)
2806 lr_error (ldfile
, _("%s: syntax error"), "LC_CTYPE");
2807 lr_ignore_rest (ldfile
, 0);
2810 locale_name
= now
->val
.str
.startmb
;
2812 /* Next should be a semicolon. */
2813 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2814 if (now
->tok
!= tok_semicolon
)
2815 goto translit_syntax
;
2817 /* Now the repertoire name. */
2818 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2819 if ((now
->tok
!= tok_string
&& now
->tok
!= tok_ident
)
2820 || now
->val
.str
.startmb
== NULL
)
2821 goto translit_syntax
;
2822 repertoire_name
= now
->val
.str
.startmb
;
2823 if (repertoire_name
[0] == '\0')
2824 /* Ignore the empty string. */
2825 repertoire_name
= NULL
;
2827 /* Save the include statement for later processing. */
2828 include_stmt
= (struct translit_include_t
*)
2829 xmalloc (sizeof (struct translit_include_t
));
2830 include_stmt
->copy_locale
= locale_name
;
2831 include_stmt
->copy_repertoire
= repertoire_name
;
2832 include_stmt
->next
= NULL
;
2834 include_ptr
= &ctype
->translit_include
;
2835 while (*include_ptr
!= NULL
)
2836 include_ptr
= &(*include_ptr
)->next
;
2837 *include_ptr
= include_stmt
;
2839 /* The rest of the line must be empty. */
2840 lr_ignore_rest (ldfile
, 1);
2842 /* Make sure the locale is read. */
2843 add_to_readlist (LC_CTYPE
, locale_name
, repertoire_name
,
2847 else if (now
->tok
== tok_default_missing
)
2853 /* We expect a single character or string as the
2855 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2856 wstr
= read_widestring (ldfile
, now
, charmap
,
2861 if (ctype
->default_missing
!= NULL
)
2863 lr_error (ldfile
, _("\
2864 %s: duplicate `default_missing' definition"), "LC_CTYPE");
2865 WITH_CUR_LOCALE (error_at_line (0, 0,
2866 ctype
->default_missing_file
,
2867 ctype
->default_missing_lineno
,
2869 previous definition was here")));
2873 ctype
->default_missing
= wstr
;
2874 ctype
->default_missing_file
= ldfile
->fname
;
2875 ctype
->default_missing_lineno
= ldfile
->lineno
;
2877 /* We can have more entries, ignore them. */
2878 lr_ignore_rest (ldfile
, 0);
2881 else if (wstr
== (uint32_t *) -1l)
2882 /* This was a syntax error. */
2885 /* Maybe there is another replacement we can use. */
2886 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2887 if (now
->tok
== tok_eol
|| now
->tok
== tok_eof
)
2889 /* Nothing found. We tell the user. */
2890 lr_error (ldfile
, _("\
2891 %s: no representable `default_missing' definition found"), "LC_CTYPE");
2894 if (now
->tok
!= tok_semicolon
)
2895 goto translit_syntax
;
2900 else if (now
->tok
== tok_translit_ignore
)
2902 read_translit_ignore_entry (ldfile
, ctype
, charmap
,
2907 read_translit_entry (ldfile
, ctype
, now
, charmap
, repertoire
);
2909 ldfile
->return_widestr
= 0;
2911 if (now
->tok
== tok_eof
)
2912 lr_error (ldfile
, _(\
2913 "%s: `translit_start' section does not end with `translit_end'"),
2919 /* Ignore the rest of the line if we don't need the input of
2923 lr_ignore_rest (ldfile
, 0);
2927 /* This could mean one of several things. First test whether
2928 it's a character class name. */
2929 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
2930 if (strcmp (now
->val
.str
.startmb
, ctype
->classnames
[cnt
]) == 0)
2932 if (cnt
< ctype
->nr_charclass
)
2934 class_bit
= _ISwbit (cnt
);
2935 class256_bit
= cnt
<= 11 ? _ISbit (cnt
) : 0;
2936 free (now
->val
.str
.startmb
);
2937 goto read_charclass
;
2939 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
2940 if (strcmp (now
->val
.str
.startmb
, ctype
->mapnames
[cnt
]) == 0)
2942 if (cnt
< ctype
->map_collection_nr
)
2945 free (now
->val
.str
.startmb
);
2948 #ifdef PREDEFINED_CLASSES
2949 if (strcmp (now
->val
.str
.startmb
, "special1") == 0)
2951 class_bit
= _ISwspecial1
;
2952 free (now
->val
.str
.startmb
);
2953 goto read_charclass
;
2955 if (strcmp (now
->val
.str
.startmb
, "special2") == 0)
2957 class_bit
= _ISwspecial2
;
2958 free (now
->val
.str
.startmb
);
2959 goto read_charclass
;
2961 if (strcmp (now
->val
.str
.startmb
, "special3") == 0)
2963 class_bit
= _ISwspecial3
;
2964 free (now
->val
.str
.startmb
);
2965 goto read_charclass
;
2967 if (strcmp (now
->val
.str
.startmb
, "tosymmetric") == 0)
2976 /* Next we assume `LC_CTYPE'. */
2977 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
2978 if (now
->tok
== tok_eof
)
2980 if (now
->tok
== tok_eol
)
2981 lr_error (ldfile
, _("%s: incomplete `END' line"),
2983 else if (now
->tok
!= tok_lc_ctype
)
2984 lr_error (ldfile
, _("\
2985 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2986 lr_ignore_rest (ldfile
, now
->tok
== tok_lc_ctype
);
2991 if (now
->tok
!= tok_eof
)
2992 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2995 /* Prepare for the next round. */
2996 now
= lr_token (ldfile
, charmap
, NULL
, NULL
, verbose
);
3000 /* When we come here we reached the end of the file. */
3001 lr_error (ldfile
, _("%s: premature end of file"), "LC_CTYPE");
3006 set_class_defaults (struct locale_ctype_t
*ctype
,
3007 const struct charmap_t
*charmap
,
3008 struct repertoire_t
*repertoire
)
3012 /* This function defines the default values for the classes and conversions
3013 according to POSIX.2 2.5.2.1.
3014 It may seem that the order of these if-blocks is arbitrary, but it is NOT.
3015 Don't move them unless you know what you are doing! */
3017 auto void set_default (int bitpos
, int from
, int to
);
3019 void set_default (int bitpos
, int from
, int to
)
3023 int bit
= _ISbit (bitpos
);
3024 int bitw
= _ISwbit (bitpos
);
3025 /* Define string. */
3028 for (ch
= from
; ch
<= to
; ++ch
)
3030 struct charseq
*seq
;
3033 seq
= charmap_find_value (charmap
, tmp
, 1);
3037 sprintf (buf
, "U%08X", ch
);
3038 seq
= charmap_find_value (charmap
, buf
, 9);
3043 WITH_CUR_LOCALE (error (0, 0, _("\
3044 %s: character `%s' not defined while needed as default value"),
3047 else if (seq
->nbytes
!= 1)
3048 WITH_CUR_LOCALE (error (0, 0, _("\
3049 %s: character `%s' in charmap not representable with one byte"),
3052 ctype
->class256_collection
[seq
->bytes
[0]] |= bit
;
3054 /* No need to search here, the ASCII value is also the Unicode
3056 ELEM (ctype
, class_collection
, , ch
) |= bitw
;
3060 /* Set default values if keyword was not present. */
3061 if ((ctype
->class_done
& BITw (tok_upper
)) == 0)
3062 /* "If this keyword [upper] is not specified, the uppercase letters
3063 `A' through `Z', ..., shall automatically belong to this class,
3064 with implementation defined character values." [P1003.2, 2.5.2.1] */
3065 set_default (BITPOS (tok_upper
), 'A', 'Z');
3067 if ((ctype
->class_done
& BITw (tok_lower
)) == 0)
3068 /* "If this keyword [lower] is not specified, the lowercase letters
3069 `a' through `z', ..., shall automatically belong to this class,
3070 with implementation defined character values." [P1003.2, 2.5.2.1] */
3071 set_default (BITPOS (tok_lower
), 'a', 'z');
3073 if ((ctype
->class_done
& BITw (tok_alpha
)) == 0)
3075 /* Table 2-6 in P1003.2 says that characters in class `upper' or
3076 class `lower' *must* be in class `alpha'. */
3077 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
);
3078 unsigned long int maskw
= BITw (tok_upper
) | BITw (tok_lower
);
3080 for (cnt
= 0; cnt
< 256; ++cnt
)
3081 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
3082 ctype
->class256_collection
[cnt
] |= BIT (tok_alpha
);
3084 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
3085 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
3086 ctype
->class_collection
[cnt
] |= BITw (tok_alpha
);
3089 if ((ctype
->class_done
& BITw (tok_digit
)) == 0)
3090 /* "If this keyword [digit] is not specified, the digits `0' through
3091 `9', ..., shall automatically belong to this class, with
3092 implementation-defined character values." [P1003.2, 2.5.2.1] */
3093 set_default (BITPOS (tok_digit
), '0', '9');
3095 /* "Only characters specified for the `alpha' and `digit' keywords
3096 shall be specified. Characters specified for the keywords `alpha'
3097 and `digit' are automatically included in this class." */
3099 unsigned long int mask
= BIT (tok_alpha
) | BIT (tok_digit
);
3100 unsigned long int maskw
= BITw (tok_alpha
) | BITw (tok_digit
);
3102 for (cnt
= 0; cnt
< 256; ++cnt
)
3103 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
3104 ctype
->class256_collection
[cnt
] |= BIT (tok_alnum
);
3106 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
3107 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
3108 ctype
->class_collection
[cnt
] |= BITw (tok_alnum
);
3111 if ((ctype
->class_done
& BITw (tok_space
)) == 0)
3112 /* "If this keyword [space] is not specified, the characters <space>,
3113 <form-feed>, <newline>, <carriage-return>, <tab>, and
3114 <vertical-tab>, ..., shall automatically belong to this class,
3115 with implementation-defined character values." [P1003.2, 2.5.2.1] */
3117 struct charseq
*seq
;
3119 seq
= charmap_find_value (charmap
, "space", 5);
3121 seq
= charmap_find_value (charmap
, "SP", 2);
3123 seq
= charmap_find_value (charmap
, "U00000020", 9);
3127 WITH_CUR_LOCALE (error (0, 0, _("\
3128 %s: character `%s' not defined while needed as default value"),
3129 "LC_CTYPE", "<space>"));
3131 else if (seq
->nbytes
!= 1)
3132 WITH_CUR_LOCALE (error (0, 0, _("\
3133 %s: character `%s' in charmap not representable with one byte"),
3134 "LC_CTYPE", "<space>"));
3136 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3138 /* No need to search. */
3139 ELEM (ctype
, class_collection
, , L
' ') |= BITw (tok_space
);
3141 seq
= charmap_find_value (charmap
, "form-feed", 9);
3143 seq
= charmap_find_value (charmap
, "U0000000C", 9);
3147 WITH_CUR_LOCALE (error (0, 0, _("\
3148 %s: character `%s' not defined while needed as default value"),
3149 "LC_CTYPE", "<form-feed>"));
3151 else if (seq
->nbytes
!= 1)
3152 WITH_CUR_LOCALE (error (0, 0, _("\
3153 %s: character `%s' in charmap not representable with one byte"),
3154 "LC_CTYPE", "<form-feed>"));
3156 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3158 /* No need to search. */
3159 ELEM (ctype
, class_collection
, , L
'\f') |= BITw (tok_space
);
3162 seq
= charmap_find_value (charmap
, "newline", 7);
3164 seq
= charmap_find_value (charmap
, "U0000000A", 9);
3168 WITH_CUR_LOCALE (error (0, 0, _("\
3169 %s: character `%s' not defined while needed as default value"),
3170 "LC_CTYPE", "<newline>"));
3172 else if (seq
->nbytes
!= 1)
3173 WITH_CUR_LOCALE (error (0, 0, _("\
3174 %s: character `%s' in charmap not representable with one byte"),
3175 "LC_CTYPE", "<newline>"));
3177 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3179 /* No need to search. */
3180 ELEM (ctype
, class_collection
, , L
'\n') |= BITw (tok_space
);
3183 seq
= charmap_find_value (charmap
, "carriage-return", 15);
3185 seq
= charmap_find_value (charmap
, "U0000000D", 9);
3189 WITH_CUR_LOCALE (error (0, 0, _("\
3190 %s: character `%s' not defined while needed as default value"),
3191 "LC_CTYPE", "<carriage-return>"));
3193 else if (seq
->nbytes
!= 1)
3194 WITH_CUR_LOCALE (error (0, 0, _("\
3195 %s: character `%s' in charmap not representable with one byte"),
3196 "LC_CTYPE", "<carriage-return>"));
3198 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3200 /* No need to search. */
3201 ELEM (ctype
, class_collection
, , L
'\r') |= BITw (tok_space
);
3204 seq
= charmap_find_value (charmap
, "tab", 3);
3206 seq
= charmap_find_value (charmap
, "U00000009", 9);
3210 WITH_CUR_LOCALE (error (0, 0, _("\
3211 %s: character `%s' not defined while needed as default value"),
3212 "LC_CTYPE", "<tab>"));
3214 else if (seq
->nbytes
!= 1)
3215 WITH_CUR_LOCALE (error (0, 0, _("\
3216 %s: character `%s' in charmap not representable with one byte"),
3217 "LC_CTYPE", "<tab>"));
3219 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3221 /* No need to search. */
3222 ELEM (ctype
, class_collection
, , L
'\t') |= BITw (tok_space
);
3225 seq
= charmap_find_value (charmap
, "vertical-tab", 12);
3227 seq
= charmap_find_value (charmap
, "U0000000B", 9);
3231 WITH_CUR_LOCALE (error (0, 0, _("\
3232 %s: character `%s' not defined while needed as default value"),
3233 "LC_CTYPE", "<vertical-tab>"));
3235 else if (seq
->nbytes
!= 1)
3236 WITH_CUR_LOCALE (error (0, 0, _("\
3237 %s: character `%s' in charmap not representable with one byte"),
3238 "LC_CTYPE", "<vertical-tab>"));
3240 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_space
);
3242 /* No need to search. */
3243 ELEM (ctype
, class_collection
, , L
'\v') |= BITw (tok_space
);
3246 if ((ctype
->class_done
& BITw (tok_xdigit
)) == 0)
3247 /* "If this keyword [xdigit] is not specified, the digits `0' to `9', the
3248 uppercase letters `A' through `F', and the lowercase letters `a'
3249 through `f', ..., shall automatically belong to this class, with
3250 implementation defined character values." [P1003.2, 2.5.2.1] */
3252 set_default (BITPOS (tok_xdigit
), '0', '9');
3253 set_default (BITPOS (tok_xdigit
), 'A', 'F');
3254 set_default (BITPOS (tok_xdigit
), 'a', 'f');
3257 if ((ctype
->class_done
& BITw (tok_blank
)) == 0)
3258 /* "If this keyword [blank] is unspecified, the characters <space> and
3259 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
3261 struct charseq
*seq
;
3263 seq
= charmap_find_value (charmap
, "space", 5);
3265 seq
= charmap_find_value (charmap
, "SP", 2);
3267 seq
= charmap_find_value (charmap
, "U00000020", 9);
3271 WITH_CUR_LOCALE (error (0, 0, _("\
3272 %s: character `%s' not defined while needed as default value"),
3273 "LC_CTYPE", "<space>"));
3275 else if (seq
->nbytes
!= 1)
3276 WITH_CUR_LOCALE (error (0, 0, _("\
3277 %s: character `%s' in charmap not representable with one byte"),
3278 "LC_CTYPE", "<space>"));
3280 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_blank
);
3282 /* No need to search. */
3283 ELEM (ctype
, class_collection
, , L
' ') |= BITw (tok_blank
);
3286 seq
= charmap_find_value (charmap
, "tab", 3);
3288 seq
= charmap_find_value (charmap
, "U00000009", 9);
3292 WITH_CUR_LOCALE (error (0, 0, _("\
3293 %s: character `%s' not defined while needed as default value"),
3294 "LC_CTYPE", "<tab>"));
3296 else if (seq
->nbytes
!= 1)
3297 WITH_CUR_LOCALE (error (0, 0, _("\
3298 %s: character `%s' in charmap not representable with one byte"),
3299 "LC_CTYPE", "<tab>"));
3301 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_blank
);
3303 /* No need to search. */
3304 ELEM (ctype
, class_collection
, , L
'\t') |= BITw (tok_blank
);
3307 if ((ctype
->class_done
& BITw (tok_graph
)) == 0)
3308 /* "If this keyword [graph] is not specified, characters specified for
3309 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
3310 shall belong to this character class." [P1003.2, 2.5.2.1] */
3312 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
3313 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
3314 unsigned long int maskw
= BITw (tok_upper
) | BITw (tok_lower
) |
3315 BITw (tok_alpha
) | BITw (tok_digit
) | BITw (tok_xdigit
) |
3319 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
3320 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
3321 ctype
->class_collection
[cnt
] |= BITw (tok_graph
);
3323 for (cnt
= 0; cnt
< 256; ++cnt
)
3324 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
3325 ctype
->class256_collection
[cnt
] |= BIT (tok_graph
);
3328 if ((ctype
->class_done
& BITw (tok_print
)) == 0)
3329 /* "If this keyword [print] is not provided, characters specified for
3330 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
3331 and the <space> character shall belong to this character class."
3332 [P1003.2, 2.5.2.1] */
3334 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
3335 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
3336 unsigned long int maskw
= BITw (tok_upper
) | BITw (tok_lower
) |
3337 BITw (tok_alpha
) | BITw (tok_digit
) | BITw (tok_xdigit
) |
3340 struct charseq
*seq
;
3342 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
3343 if ((ctype
->class_collection
[cnt
] & maskw
) != 0)
3344 ctype
->class_collection
[cnt
] |= BITw (tok_print
);
3346 for (cnt
= 0; cnt
< 256; ++cnt
)
3347 if ((ctype
->class256_collection
[cnt
] & mask
) != 0)
3348 ctype
->class256_collection
[cnt
] |= BIT (tok_print
);
3351 seq
= charmap_find_value (charmap
, "space", 5);
3353 seq
= charmap_find_value (charmap
, "SP", 2);
3355 seq
= charmap_find_value (charmap
, "U00000020", 9);
3359 WITH_CUR_LOCALE (error (0, 0, _("\
3360 %s: character `%s' not defined while needed as default value"),
3361 "LC_CTYPE", "<space>"));
3363 else if (seq
->nbytes
!= 1)
3364 WITH_CUR_LOCALE (error (0, 0, _("\
3365 %s: character `%s' in charmap not representable with one byte"),
3366 "LC_CTYPE", "<space>"));
3368 ctype
->class256_collection
[seq
->bytes
[0]] |= BIT (tok_print
);
3370 /* No need to search. */
3371 ELEM (ctype
, class_collection
, , L
' ') |= BITw (tok_print
);
3374 if (ctype
->tomap_done
[0] == 0)
3375 /* "If this keyword [toupper] is not specified, the lowercase letters
3376 `a' through `z', and their corresponding uppercase letters `A' to
3377 `Z', ..., shall automatically be included, with implementation-
3378 defined character values." [P1003.2, 2.5.2.1] */
3383 strcpy (tmp
, "<?>");
3385 for (ch
= 'a'; ch
<= 'z'; ++ch
)
3387 struct charseq
*seq_from
, *seq_to
;
3391 seq_from
= charmap_find_value (charmap
, &tmp
[1], 1);
3392 if (seq_from
== NULL
)
3395 sprintf (buf
, "U%08X", ch
);
3396 seq_from
= charmap_find_value (charmap
, buf
, 9);
3398 if (seq_from
== NULL
)
3401 WITH_CUR_LOCALE (error (0, 0, _("\
3402 %s: character `%s' not defined while needed as default value"),
3405 else if (seq_from
->nbytes
!= 1)
3408 WITH_CUR_LOCALE (error (0, 0, _("\
3409 %s: character `%s' needed as default value not representable with one byte"),
3414 /* This conversion is implementation defined. */
3415 tmp
[1] = (char) (ch
+ ('A' - 'a'));
3416 seq_to
= charmap_find_value (charmap
, &tmp
[1], 1);
3420 sprintf (buf
, "U%08X", ch
+ ('A' - 'a'));
3421 seq_to
= charmap_find_value (charmap
, buf
, 9);
3426 WITH_CUR_LOCALE (error (0, 0, _("\
3427 %s: character `%s' not defined while needed as default value"),
3430 else if (seq_to
->nbytes
!= 1)
3433 WITH_CUR_LOCALE (error (0, 0, _("\
3434 %s: character `%s' needed as default value not representable with one byte"),
3438 /* The index [0] is determined by the order of the
3439 `ctype_map_new' calls in `ctype_startup'. */
3440 ctype
->map256_collection
[0][seq_from
->bytes
[0]]
3444 /* No need to search. */
3445 ELEM (ctype
, map_collection
, [0], ch
) = ch
+ ('A' - 'a');
3449 if (ctype
->tomap_done
[1] == 0)
3450 /* "If this keyword [tolower] is not specified, the mapping shall be
3451 the reverse mapping of the one specified to `toupper'." [P1003.2] */
3453 for (cnt
= 0; cnt
< ctype
->map_collection_act
[0]; ++cnt
)
3454 if (ctype
->map_collection
[0][cnt
] != 0)
3455 ELEM (ctype
, map_collection
, [1],
3456 ctype
->map_collection
[0][cnt
])
3457 = ctype
->charnames
[cnt
];
3459 for (cnt
= 0; cnt
< 256; ++cnt
)
3460 if (ctype
->map256_collection
[0][cnt
] != 0)
3461 ctype
->map256_collection
[1][ctype
->map256_collection
[0][cnt
]] = cnt
;
3464 if (ctype
->outdigits_act
!= 10)
3466 if (ctype
->outdigits_act
!= 0)
3467 WITH_CUR_LOCALE (error (0, 0, _("\
3468 %s: field `%s' does not contain exactly ten entries"),
3469 "LC_CTYPE", "outdigit"));
3471 for (cnt
= ctype
->outdigits_act
; cnt
< 10; ++cnt
)
3473 ctype
->mboutdigits
[cnt
] = charmap_find_symbol (charmap
,
3474 (char *) digits
+ cnt
,
3477 if (ctype
->mboutdigits
[cnt
] == NULL
)
3478 ctype
->mboutdigits
[cnt
] = charmap_find_symbol (charmap
,
3480 strlen (longnames
[cnt
]));
3482 if (ctype
->mboutdigits
[cnt
] == NULL
)
3483 ctype
->mboutdigits
[cnt
] = charmap_find_symbol (charmap
,
3486 if (ctype
->mboutdigits
[cnt
] == NULL
)
3488 /* Provide a replacement. */
3489 WITH_CUR_LOCALE (error (0, 0, _("\
3490 no output digits defined and none of the standard names in the charmap")));
3492 ctype
->mboutdigits
[cnt
] = obstack_alloc (&((struct charmap_t
*) charmap
)->mem_pool
,
3493 sizeof (struct charseq
)
3496 /* This is better than nothing. */
3497 ctype
->mboutdigits
[cnt
]->bytes
[0] = digits
[cnt
];
3498 ctype
->mboutdigits
[cnt
]->nbytes
= 1;
3501 ctype
->wcoutdigits
[cnt
] = L
'0' + cnt
;
3504 ctype
->outdigits_act
= 10;
3509 /* Construction of sparse 3-level tables.
3510 See wchar-lookup.h for their structure and the meaning of p and q. */
3517 /* Working representation. */
3518 size_t level1_alloc
;
3521 size_t level2_alloc
;
3524 size_t level3_alloc
;
3527 /* Compressed representation. */
3532 /* Initialize. Assumes t->p and t->q have already been set. */
3534 wctype_table_init (struct wctype_table
*t
)
3537 t
->level1_alloc
= t
->level1_size
= 0;
3539 t
->level2_alloc
= t
->level2_size
= 0;
3541 t
->level3_alloc
= t
->level3_size
= 0;
3544 /* Retrieve an entry. */
3546 wctype_table_get (struct wctype_table
*t
, uint32_t wc
)
3548 uint32_t index1
= wc
>> (t
->q
+ t
->p
+ 5);
3549 if (index1
< t
->level1_size
)
3551 uint32_t lookup1
= t
->level1
[index1
];
3552 if (lookup1
!= EMPTY
)
3554 uint32_t index2
= ((wc
>> (t
->p
+ 5)) & ((1 << t
->q
) - 1))
3555 + (lookup1
<< t
->q
);
3556 uint32_t lookup2
= t
->level2
[index2
];
3557 if (lookup2
!= EMPTY
)
3559 uint32_t index3
= ((wc
>> 5) & ((1 << t
->p
) - 1))
3560 + (lookup2
<< t
->p
);
3561 uint32_t lookup3
= t
->level3
[index3
];
3562 uint32_t index4
= wc
& 0x1f;
3564 return (lookup3
>> index4
) & 1;
3571 /* Add one entry. */
3573 wctype_table_add (struct wctype_table
*t
, uint32_t wc
)
3575 uint32_t index1
= wc
>> (t
->q
+ t
->p
+ 5);
3576 uint32_t index2
= (wc
>> (t
->p
+ 5)) & ((1 << t
->q
) - 1);
3577 uint32_t index3
= (wc
>> 5) & ((1 << t
->p
) - 1);
3578 uint32_t index4
= wc
& 0x1f;
3581 if (index1
>= t
->level1_size
)
3583 if (index1
>= t
->level1_alloc
)
3585 size_t alloc
= 2 * t
->level1_alloc
;
3586 if (alloc
<= index1
)
3588 t
->level1
= (uint32_t *) xrealloc ((char *) t
->level1
,
3589 alloc
* sizeof (uint32_t));
3590 t
->level1_alloc
= alloc
;
3592 while (index1
>= t
->level1_size
)
3593 t
->level1
[t
->level1_size
++] = EMPTY
;
3596 if (t
->level1
[index1
] == EMPTY
)
3598 if (t
->level2_size
== t
->level2_alloc
)
3600 size_t alloc
= 2 * t
->level2_alloc
+ 1;
3601 t
->level2
= (uint32_t *) xrealloc ((char *) t
->level2
,
3602 (alloc
<< t
->q
) * sizeof (uint32_t));
3603 t
->level2_alloc
= alloc
;
3605 i1
= t
->level2_size
<< t
->q
;
3606 i2
= (t
->level2_size
+ 1) << t
->q
;
3607 for (i
= i1
; i
< i2
; i
++)
3608 t
->level2
[i
] = EMPTY
;
3609 t
->level1
[index1
] = t
->level2_size
++;
3612 index2
+= t
->level1
[index1
] << t
->q
;
3614 if (t
->level2
[index2
] == EMPTY
)
3616 if (t
->level3_size
== t
->level3_alloc
)
3618 size_t alloc
= 2 * t
->level3_alloc
+ 1;
3619 t
->level3
= (uint32_t *) xrealloc ((char *) t
->level3
,
3620 (alloc
<< t
->p
) * sizeof (uint32_t));
3621 t
->level3_alloc
= alloc
;
3623 i1
= t
->level3_size
<< t
->p
;
3624 i2
= (t
->level3_size
+ 1) << t
->p
;
3625 for (i
= i1
; i
< i2
; i
++)
3627 t
->level2
[index2
] = t
->level3_size
++;
3630 index3
+= t
->level2
[index2
] << t
->p
;
3632 t
->level3
[index3
] |= (uint32_t)1 << index4
;
3635 /* Finalize and shrink. */
3637 wctype_table_finalize (struct wctype_table
*t
)
3640 uint32_t reorder3
[t
->level3_size
];
3641 uint32_t reorder2
[t
->level2_size
];
3642 uint32_t level1_offset
, level2_offset
, level3_offset
;
3644 /* Uniquify level3 blocks. */
3646 for (j
= 0; j
< t
->level3_size
; j
++)
3648 for (i
= 0; i
< k
; i
++)
3649 if (memcmp (&t
->level3
[i
<< t
->p
], &t
->level3
[j
<< t
->p
],
3650 (1 << t
->p
) * sizeof (uint32_t)) == 0)
3652 /* Relocate block j to block i. */
3657 memcpy (&t
->level3
[i
<< t
->p
], &t
->level3
[j
<< t
->p
],
3658 (1 << t
->p
) * sizeof (uint32_t));
3664 for (i
= 0; i
< (t
->level2_size
<< t
->q
); i
++)
3665 if (t
->level2
[i
] != EMPTY
)
3666 t
->level2
[i
] = reorder3
[t
->level2
[i
]];
3668 /* Uniquify level2 blocks. */
3670 for (j
= 0; j
< t
->level2_size
; j
++)
3672 for (i
= 0; i
< k
; i
++)
3673 if (memcmp (&t
->level2
[i
<< t
->q
], &t
->level2
[j
<< t
->q
],
3674 (1 << t
->q
) * sizeof (uint32_t)) == 0)
3676 /* Relocate block j to block i. */
3681 memcpy (&t
->level2
[i
<< t
->q
], &t
->level2
[j
<< t
->q
],
3682 (1 << t
->q
) * sizeof (uint32_t));
3688 for (i
= 0; i
< t
->level1_size
; i
++)
3689 if (t
->level1
[i
] != EMPTY
)
3690 t
->level1
[i
] = reorder2
[t
->level1
[i
]];
3692 /* Create and fill the resulting compressed representation. */
3694 5 * sizeof (uint32_t)
3695 + t
->level1_size
* sizeof (uint32_t)
3696 + (t
->level2_size
<< t
->q
) * sizeof (uint32_t)
3697 + (t
->level3_size
<< t
->p
) * sizeof (uint32_t);
3698 t
->result
= (char *) xmalloc (t
->result_size
);
3701 5 * sizeof (uint32_t);
3703 5 * sizeof (uint32_t)
3704 + t
->level1_size
* sizeof (uint32_t);
3706 5 * sizeof (uint32_t)
3707 + t
->level1_size
* sizeof (uint32_t)
3708 + (t
->level2_size
<< t
->q
) * sizeof (uint32_t);
3710 ((uint32_t *) t
->result
)[0] = t
->q
+ t
->p
+ 5;
3711 ((uint32_t *) t
->result
)[1] = t
->level1_size
;
3712 ((uint32_t *) t
->result
)[2] = t
->p
+ 5;
3713 ((uint32_t *) t
->result
)[3] = (1 << t
->q
) - 1;
3714 ((uint32_t *) t
->result
)[4] = (1 << t
->p
) - 1;
3716 for (i
= 0; i
< t
->level1_size
; i
++)
3717 ((uint32_t *) (t
->result
+ level1_offset
))[i
] =
3718 (t
->level1
[i
] == EMPTY
3720 : (t
->level1
[i
] << t
->q
) * sizeof (uint32_t) + level2_offset
);
3722 for (i
= 0; i
< (t
->level2_size
<< t
->q
); i
++)
3723 ((uint32_t *) (t
->result
+ level2_offset
))[i
] =
3724 (t
->level2
[i
] == EMPTY
3726 : (t
->level2
[i
] << t
->p
) * sizeof (uint32_t) + level3_offset
);
3728 for (i
= 0; i
< (t
->level3_size
<< t
->p
); i
++)
3729 ((uint32_t *) (t
->result
+ level3_offset
))[i
] = t
->level3
[i
];
3731 if (t
->level1_alloc
> 0)
3733 if (t
->level2_alloc
> 0)
3735 if (t
->level3_alloc
> 0)
/* Presumably the parameter macros for a table template that is
   instantiated once per group: first for wcwidth_table (uint8_t
   elements, default value 0xff), then for wctrans_table (int32_t
   elements).
   NOTE(review): the embedded source numbering skips 3742, 3746 and
   3748, so directives are missing from this listing -- presumably the
   inclusion of the template after each group and a DEFAULT value for
   wctrans_table.  Restore them from the pristine file before building.  */
3739 #define TABLE wcwidth_table
3740 #define ELEMENT uint8_t
3741 #define DEFAULT 0xff
3744 #define TABLE wctrans_table
3745 #define ELEMENT int32_t
/* While this rename is in effect the name wctrans_table_add expands to
   wctrans_table_add_internal; the #undef below frees the plain name for
   the wrapper function that follows in this file.  */
3747 #define wctrans_table_add wctrans_table_add_internal
3749 #undef wctrans_table_add
/* The wctrans_table must actually store the difference between the
   desired result and the argument.

   Records in T that the character WC maps to MAPPED_WC by handing the
   delta MAPPED_WC - WC to wctrans_table_add_internal.  The subtraction
   is carried out on uint32_t values, so it wraps modulo 2^32 when
   MAPPED_WC is smaller than WC.  */
static inline void
wctrans_table_add (struct wctrans_table *t, uint32_t wc, uint32_t mapped_wc)
{
  wctrans_table_add_internal (t, wc, mapped_wc - wc);
}
3759 /* Flattens the included transliterations into a translit list.
3760 Inserts them in the list at `cursor', and returns the new cursor. */
3761 static struct translit_t
**
3762 translit_flatten (struct locale_ctype_t
*ctype
,
3763 const struct charmap_t
*charmap
,
3764 struct translit_t
**cursor
)
3766 while (ctype
->translit_include
!= NULL
)
3768 const char *copy_locale
= ctype
->translit_include
->copy_locale
;
3769 const char *copy_repertoire
= ctype
->translit_include
->copy_repertoire
;
3770 struct localedef_t
*other
;
3772 /* Unchain the include statement. During the depth-first traversal
3773 we don't want to visit any locale more than once. */
3774 ctype
->translit_include
= ctype
->translit_include
->next
;
3776 other
= find_locale (LC_CTYPE
, copy_locale
, copy_repertoire
, charmap
);
3778 if (other
== NULL
|| other
->categories
[LC_CTYPE
].ctype
== NULL
)
3780 WITH_CUR_LOCALE (error (0, 0, _("\
3781 %s: transliteration data from locale `%s' not available"),
3782 "LC_CTYPE", copy_locale
));
3786 struct locale_ctype_t
*other_ctype
=
3787 other
->categories
[LC_CTYPE
].ctype
;
3789 cursor
= translit_flatten (other_ctype
, charmap
, cursor
);
3790 assert (other_ctype
->translit_include
== NULL
);
3792 if (other_ctype
->translit
!= NULL
)
3794 /* Insert the other_ctype->translit list at *cursor. */
3795 struct translit_t
*endp
= other_ctype
->translit
;
3796 while (endp
->next
!= NULL
)
3799 endp
->next
= *cursor
;
3800 *cursor
= other_ctype
->translit
;
3802 /* Avoid any risk of circular lists. */
3803 other_ctype
->translit
= NULL
;
3805 cursor
= &endp
->next
;
3808 if (ctype
->default_missing
== NULL
)
3809 ctype
->default_missing
= other_ctype
->default_missing
;
3817 allocate_arrays (struct locale_ctype_t
*ctype
, const struct charmap_t
*charmap
,
3818 struct repertoire_t
*repertoire
)
3826 /* You wonder about this amount of memory? This is only because some
3827 users do not manage to address the array with unsigned values or
3828 data types with range >= 256. '\200' would result in the array
3829 index -128. To help these poor people we duplicate the entries for
3830 128 up to 255 below the entry for \0. */
3831 ctype
->ctype_b
= (char_class_t
*) xcalloc (256 + 128, sizeof (char_class_t
));
3832 ctype
->ctype32_b
= (char_class32_t
*) xcalloc (256, sizeof (char_class32_t
));
3833 ctype
->class_b
= (uint32_t **)
3834 xmalloc (ctype
->nr_charclass
* sizeof (uint32_t *));
3835 ctype
->class_3level
= (struct iovec
*)
3836 xmalloc (ctype
->nr_charclass
* sizeof (struct iovec
));
3838 /* This is the array accessed using the multibyte string elements. */
3839 for (idx
= 0; idx
< 256; ++idx
)
3840 ctype
->ctype_b
[128 + idx
] = ctype
->class256_collection
[idx
];
3842 /* Mirror first 127 entries. We must take care that entry -1 is not
3843 mirrored because EOF == -1. */
3844 for (idx
= 0; idx
< 127; ++idx
)
3845 ctype
->ctype_b
[idx
] = ctype
->ctype_b
[256 + idx
];
3847 /* The 32 bit array contains all characters < 0x100. */
3848 for (idx
= 0; idx
< ctype
->class_collection_act
; ++idx
)
3849 if (ctype
->charnames
[idx
] < 0x100)
3850 ctype
->ctype32_b
[ctype
->charnames
[idx
]] = ctype
->class_collection
[idx
];
3852 for (nr
= 0; nr
< ctype
->nr_charclass
; nr
++)
3854 ctype
->class_b
[nr
] = (uint32_t *) xcalloc (256 / 32, sizeof (uint32_t));
3856 /* We only set CLASS_B for the bits in the ISO C classes, not
3857 the user defined classes. The number should not change but
3859 #define LAST_ISO_C_BIT 11
3860 if (nr
<= LAST_ISO_C_BIT
)
3861 for (idx
= 0; idx
< 256; ++idx
)
3862 if (ctype
->class256_collection
[idx
] & _ISbit (nr
))
3863 ctype
->class_b
[nr
][idx
>> 5] |= (uint32_t) 1 << (idx
& 0x1f);
3866 for (nr
= 0; nr
< ctype
->nr_charclass
; nr
++)
3868 struct wctype_table t
;
3870 t
.p
= 4; /* or: 5 */
3871 t
.q
= 7; /* or: 6 */
3872 wctype_table_init (&t
);
3874 for (idx
= 0; idx
< ctype
->class_collection_act
; ++idx
)
3875 if (ctype
->class_collection
[idx
] & _ISwbit (nr
))
3876 wctype_table_add (&t
, ctype
->charnames
[idx
]);
3878 wctype_table_finalize (&t
);
3881 WITH_CUR_LOCALE (fprintf (stderr
, _("\
3882 %s: table for class \"%s\": %lu bytes\n"),
3883 "LC_CTYPE", ctype
->classnames
[nr
],
3884 (unsigned long int) t
.result_size
));
3886 ctype
->class_3level
[nr
].iov_base
= t
.result
;
3887 ctype
->class_3level
[nr
].iov_len
= t
.result_size
;
3890 /* Room for table of mappings. */
3891 ctype
->map_b
= (uint32_t **) xmalloc (2 * sizeof (uint32_t *));
3892 ctype
->map32_b
= (uint32_t **) xmalloc (ctype
->map_collection_nr
3893 * sizeof (uint32_t *));
3894 ctype
->map_3level
= (struct iovec
*)
3895 xmalloc (ctype
->map_collection_nr
* sizeof (struct iovec
));
3897 /* Fill in all mappings. */
3898 for (idx
= 0; idx
< 2; ++idx
)
3902 /* Allocate table. */
3903 ctype
->map_b
[idx
] = (uint32_t *)
3904 xmalloc ((256 + 128) * sizeof (uint32_t));
3906 /* Copy values from collection. */
3907 for (idx2
= 0; idx2
< 256; ++idx2
)
3908 ctype
->map_b
[idx
][128 + idx2
] = ctype
->map256_collection
[idx
][idx2
];
3910 /* Mirror first 127 entries. We must take care not to map entry
3911 -1 because EOF == -1. */
3912 for (idx2
= 0; idx2
< 127; ++idx2
)
3913 ctype
->map_b
[idx
][idx2
] = ctype
->map_b
[idx
][256 + idx2
];
3915 /* EOF must map to EOF. */
3916 ctype
->map_b
[idx
][127] = EOF
;
3919 for (idx
= 0; idx
< ctype
->map_collection_nr
; ++idx
)
3923 /* Allocate table. */
3924 ctype
->map32_b
[idx
] = (uint32_t *) xmalloc (256 * sizeof (uint32_t));
3926 /* Copy values from collection. Default is identity mapping. */
3927 for (idx2
= 0; idx2
< 256; ++idx2
)
3928 ctype
->map32_b
[idx
][idx2
] =
3929 (ctype
->map_collection
[idx
][idx2
] != 0
3930 ? ctype
->map_collection
[idx
][idx2
]
3934 for (nr
= 0; nr
< ctype
->map_collection_nr
; nr
++)
3936 struct wctrans_table t
;
3940 wctrans_table_init (&t
);
3942 for (idx
= 0; idx
< ctype
->map_collection_act
[nr
]; ++idx
)
3943 if (ctype
->map_collection
[nr
][idx
] != 0)
3944 wctrans_table_add (&t
, ctype
->charnames
[idx
],
3945 ctype
->map_collection
[nr
][idx
]);
3947 wctrans_table_finalize (&t
);
3950 WITH_CUR_LOCALE (fprintf (stderr
, _("\
3951 %s: table for map \"%s\": %lu bytes\n"),
3952 "LC_CTYPE", ctype
->mapnames
[nr
],
3953 (unsigned long int) t
.result_size
));
3955 ctype
->map_3level
[nr
].iov_base
= t
.result
;
3956 ctype
->map_3level
[nr
].iov_len
= t
.result_size
;
3959 /* Extra array for class and map names. */
3960 ctype
->class_name_ptr
= (uint32_t *) xmalloc (ctype
->nr_charclass
3961 * sizeof (uint32_t));
3962 ctype
->map_name_ptr
= (uint32_t *) xmalloc (ctype
->map_collection_nr
3963 * sizeof (uint32_t));
3965 ctype
->class_offset
= _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1
);
3966 ctype
->map_offset
= ctype
->class_offset
+ ctype
->nr_charclass
;
3968 /* Array for width information. Because the expected widths are very
3969 small (never larger than 2) we use only one single byte. This
3971 We put only printable characters in the table. wcwidth is specified
3972 to return -1 for non-printable characters. Doing the check here
3973 saves a run-time check.
3974 But we put L'\0' in the table. This again saves a run-time check. */
3976 struct wcwidth_table t
;
3980 wcwidth_table_init (&t
);
3982 /* First set all the printable characters of the character set to
3983 the default width. */
3985 while (iterate_table (&charmap
->char_table
, &curs
, &key
, &len
, &vdata
) == 0)
3987 struct charseq
*data
= (struct charseq
*) vdata
;
3989 if (data
->ucs4
== UNINITIALIZED_CHAR_VALUE
)
3990 data
->ucs4
= repertoire_find_value (ctype
->repertoire
,
3993 if (data
->ucs4
!= ILLEGAL_CHAR_VALUE
)
3995 uint32_t *class_bits
=
3996 find_idx (ctype
, &ctype
->class_collection
, NULL
,
3997 &ctype
->class_collection_act
, data
->ucs4
);
3999 if (class_bits
!= NULL
&& (*class_bits
& BITw (tok_print
)))
4000 wcwidth_table_add (&t
, data
->ucs4
, charmap
->width_default
);
4004 /* Now add the explicitly specified widths. */
4005 if (charmap
->width_rules
!= NULL
)
4009 for (cnt
= 0; cnt
< charmap
->nwidth_rules
; ++cnt
)
4011 unsigned char bytes
[charmap
->mb_cur_max
];
4012 int nbytes
= charmap
->width_rules
[cnt
].from
->nbytes
;
4014 /* We have the range of character for which the width is
4015 specified described using byte sequences of the multibyte
4016 charset. We have to convert this to UCS4 now. And we
4017 cannot simply convert the beginning and the end of the
4018 sequence, we have to iterate over the byte sequence and
4019 convert it for every single character. */
4020 memcpy (bytes
, charmap
->width_rules
[cnt
].from
->bytes
, nbytes
);
4022 while (nbytes
< charmap
->width_rules
[cnt
].to
->nbytes
4023 || memcmp (bytes
, charmap
->width_rules
[cnt
].to
->bytes
,
4026 /* Find the UCS value for `bytes'. */
4029 struct charseq
*seq
=
4030 charmap_find_symbol (charmap
, (char *) bytes
, nbytes
);
4033 wch
= ILLEGAL_CHAR_VALUE
;
4034 else if (seq
->ucs4
!= UNINITIALIZED_CHAR_VALUE
)
4037 wch
= repertoire_find_value (ctype
->repertoire
, seq
->name
,
4038 strlen (seq
->name
));
4040 if (wch
!= ILLEGAL_CHAR_VALUE
)
4042 /* Store the value. */
4043 uint32_t *class_bits
=
4044 find_idx (ctype
, &ctype
->class_collection
, NULL
,
4045 &ctype
->class_collection_act
, wch
);
4047 if (class_bits
!= NULL
&& (*class_bits
& BITw (tok_print
)))
4048 wcwidth_table_add (&t
, wch
,
4049 charmap
->width_rules
[cnt
].width
);
4052 /* "Increment" the bytes sequence. */
4054 while (inner
>= 0 && bytes
[inner
] == 0xff)
4059 /* We have to extend the byte sequence. */
4060 if (nbytes
>= charmap
->width_rules
[cnt
].to
->nbytes
)
4064 memset (&bytes
[1], 0, nbytes
);
4070 while (++inner
< nbytes
)
4077 /* Set the width of L'\0' to 0. */
4078 wcwidth_table_add (&t
, 0, 0);
4080 wcwidth_table_finalize (&t
);
4083 WITH_CUR_LOCALE (fprintf (stderr
, _("%s: table for width: %lu bytes\n"),
4084 "LC_CTYPE", (unsigned long int) t
.result_size
));
4086 ctype
->width
.iov_base
= t
.result
;
4087 ctype
->width
.iov_len
= t
.result_size
;
4090 /* Set MB_CUR_MAX. */
4091 ctype
->mb_cur_max
= charmap
->mb_cur_max
;
4093 /* Now determine the table for the transliteration information.
4095 XXX It is not yet clear to me whether it is worth implementing a
4096 complicated algorithm which uses a hash table to locate the entries.
4097 For now I'll use a simple array which can be searching using binary
4099 if (ctype
->translit_include
!= NULL
)
4100 /* Traverse the locales mentioned in the `include' statements in a
4101 depth-first way and fold in their transliteration information. */
4102 translit_flatten (ctype
, charmap
, &ctype
->translit
);
4104 if (ctype
->translit
!= NULL
)
4106 /* First count how many entries we have. This is the upper limit
4107 since some entries from the included files might be overwritten. */
4110 struct translit_t
*runp
= ctype
->translit
;
4111 struct translit_t
**sorted
;
4112 size_t from_len
, to_len
;
4114 while (runp
!= NULL
)
4120 /* Next we allocate an array large enough and fill in the values. */
4121 sorted
= (struct translit_t
**) alloca (number
4122 * sizeof (struct translit_t
**));
4123 runp
= ctype
->translit
;
4127 /* Search for the place where to insert this string.
4128 XXX Better use a real sorting algorithm later. */
4132 while (idx
< number
)
4134 int res
= wcscmp ((const wchar_t *) sorted
[idx
]->from
,
4135 (const wchar_t *) runp
->from
);
4150 memmove (&sorted
[idx
+ 1], &sorted
[idx
],
4151 (number
- idx
) * sizeof (struct translit_t
*));
4158 while (runp
!= NULL
);
4160 /* The next step is putting all the possible transliteration
4161 strings in one memory block so that we can write it out.
4162 We need several different blocks:
4163 - index to the from-string array
4165 - index to the to-string array
4168 from_len
= to_len
= 0;
4169 for (cnt
= 0; cnt
< number
; ++cnt
)
4171 struct translit_to_t
*srunp
;
4172 from_len
+= wcslen ((const wchar_t *) sorted
[cnt
]->from
) + 1;
4173 srunp
= sorted
[cnt
]->to
;
4174 while (srunp
!= NULL
)
4176 to_len
+= wcslen ((const wchar_t *) srunp
->str
) + 1;
4177 srunp
= srunp
->next
;
4179 /* Plus one for the extra NUL character marking the end of
4180 the list for the current entry. */
4184 /* We can allocate the arrays for the results. */
4185 ctype
->translit_from_idx
= xmalloc (number
* sizeof (uint32_t));
4186 ctype
->translit_from_tbl
= xmalloc (from_len
* sizeof (uint32_t));
4187 ctype
->translit_to_idx
= xmalloc (number
* sizeof (uint32_t));
4188 ctype
->translit_to_tbl
= xmalloc (to_len
* sizeof (uint32_t));
4192 for (cnt
= 0; cnt
< number
; ++cnt
)
4195 struct translit_to_t
*srunp
;
4197 ctype
->translit_from_idx
[cnt
] = from_len
;
4198 ctype
->translit_to_idx
[cnt
] = to_len
;
4200 len
= wcslen ((const wchar_t *) sorted
[cnt
]->from
) + 1;
4201 wmemcpy ((wchar_t *) &ctype
->translit_from_tbl
[from_len
],
4202 (const wchar_t *) sorted
[cnt
]->from
, len
);
4205 ctype
->translit_to_idx
[cnt
] = to_len
;
4206 srunp
= sorted
[cnt
]->to
;
4207 while (srunp
!= NULL
)
4209 len
= wcslen ((const wchar_t *) srunp
->str
) + 1;
4210 wmemcpy ((wchar_t *) &ctype
->translit_to_tbl
[to_len
],
4211 (const wchar_t *) srunp
->str
, len
);
4213 srunp
= srunp
->next
;
4215 ctype
->translit_to_tbl
[to_len
++] = L
'\0';
4218 /* Store the information about the length. */
4219 ctype
->translit_idx_size
= number
;
4220 ctype
->translit_from_tbl_size
= from_len
* sizeof (uint32_t);
4221 ctype
->translit_to_tbl_size
= to_len
* sizeof (uint32_t);
4225 /* Provide some dummy pointers since we have nothing to write out. */
4226 static uint32_t no_str
= { 0 };
4228 ctype
->translit_from_idx
= &no_str
;
4229 ctype
->translit_from_tbl
= &no_str
;
4230 ctype
->translit_to_tbl
= &no_str
;
4231 ctype
->translit_idx_size
= 0;
4232 ctype
->translit_from_tbl_size
= 0;
4233 ctype
->translit_to_tbl_size
= 0;