2 * Copyright 2018 Nexenta Systems, Inc.
3 * Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved.
4 * Copyright 2015 John Marino <draco@marino.st>
6 * This source code is derived from the illumos localedef command, and
7 * provided under BSD-style license terms by Nexenta Systems, Inc.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
33 * LC_CTYPE database generation routines for localedef.
35 #include <sys/cdefs.h>
42 #include <sys/types.h>
45 #include "localedef.h"
48 /* Always include the defines for the target: */
49 #define _DONT_USE_CTYPE_INLINE_ /* Avoid dependencies on runetype.h */
54 /* Needed for bootstrapping, _CTYPE_N */
56 #define _CTYPE_N 0x00400000L
59 #define _ISUPPER _CTYPE_U
60 #define _ISLOWER _CTYPE_L
61 #define _ISDIGIT _CTYPE_D
62 #define _ISXDIGIT _CTYPE_X
63 #define _ISSPACE _CTYPE_S
64 #define _ISBLANK _CTYPE_B
65 #define _ISALPHA _CTYPE_A
66 #define _ISPUNCT _CTYPE_P
67 #define _ISGRAPH _CTYPE_G
68 #define _ISPRINT _CTYPE_R
69 #define _ISCNTRL _CTYPE_C
76 static wchar_t last_ctype
;
77 static int ctype_compare(const void *n1
, const void *n2
);
79 typedef struct ctype_node
{
84 RB_ENTRY(ctype_node
) entry
;
87 static RB_HEAD(ctypes
, ctype_node
) ctypes
;
88 RB_GENERATE_STATIC(ctypes
, ctype_node
, entry
, ctype_compare
);
91 ctype_compare(const void *n1
, const void *n2
)
93 const ctype_node_t
*c1
= n1
;
94 const ctype_node_t
*c2
= n2
;
96 return (c1
->wc
< c2
->wc
? -1 : c1
->wc
> c2
->wc
? 1 : 0);
107 add_ctype_impl(ctype_node_t
*ctn
)
111 ctn
->ctype
|= (_ISUPPER
| _ISALPHA
| _ISGRAPH
| _ISPRINT
);
114 ctn
->ctype
|= (_ISLOWER
| _ISALPHA
| _ISGRAPH
| _ISPRINT
);
117 ctn
->ctype
|= (_ISALPHA
| _ISGRAPH
| _ISPRINT
);
120 ctn
->ctype
|= (_ISDIGIT
| _ISGRAPH
| _ISPRINT
| _ISXDIGIT
| _E4
);
124 * This can be troublesome as <form-feed>, <newline>,
125 * <carriage-return>, <tab>, and <vertical-tab> are defined both
126 * as space and cntrl, and POSIX doesn't allow cntrl/print
127 * combination. We will take care of this in dump_ctype().
129 ctn
->ctype
|= (_ISSPACE
| _ISPRINT
);
132 ctn
->ctype
|= _ISCNTRL
;
135 ctn
->ctype
|= (_ISGRAPH
| _ISPRINT
);
138 ctn
->ctype
|= _ISPRINT
;
141 ctn
->ctype
|= (_ISPUNCT
| _ISGRAPH
| _ISPRINT
);
144 ctn
->ctype
|= (_ISXDIGIT
| _ISPRINT
);
147 ctn
->ctype
|= (_ISBLANK
| _ISSPACE
);
150 ctn
->ctype
|= (_E1
| _ISPRINT
| _ISGRAPH
);
153 ctn
->ctype
|= (_E2
| _ISPRINT
| _ISGRAPH
);
156 ctn
->ctype
|= (_E3
| _ISPRINT
| _ISGRAPH
);
159 ctn
->ctype
|= (_E4
| _ISPRINT
| _ISGRAPH
);
162 ctn
->ctype
|= (_E5
| _ISPRINT
| _ISGRAPH
);
166 * We can't do anything with this. The character
167 * should already be specified as a digit or alpha.
171 errf("not a valid character class");
175 static ctype_node_t
*
176 get_ctype(wchar_t wc
)
182 if ((ctn
= RB_FIND(ctypes
, &ctypes
, &srch
)) == NULL
) {
183 if ((ctn
= calloc(1, sizeof (*ctn
))) == NULL
) {
184 errf("out of memory");
189 RB_INSERT(ctypes
, &ctypes
, ctn
);
199 if ((ctn
= get_ctype(val
)) == NULL
) {
204 last_ctype
= ctn
->wc
;
208 add_ctype_range(wchar_t end
)
213 if (end
< last_ctype
) {
214 errf("malformed character range (%u ... %u))",
218 for (cur
= last_ctype
+ 1; cur
<= end
; cur
++) {
219 if ((ctn
= get_ctype(cur
)) == NULL
) {
230 * A word about widths: if the width mask is specified, then libc
231 * unconditionally honors it. Otherwise, it assumes printable
232 * characters have width 1, and non-printable characters have width
233 * -1 (except for NUL which is special with width 0). Hence, we have
234 * no need to inject defaults here -- the "default" unset value of 0
235 * indicates that libc should use its own logic in wcwidth as described.
238 add_width(int wc
, int width
)
242 if ((ctn
= get_ctype(wc
)) == NULL
) {
246 ctn
->ctype
&= ~(_CTYPE_SWM
);
249 ctn
->ctype
|= _CTYPE_SW0
;
252 ctn
->ctype
|= _CTYPE_SW1
;
255 ctn
->ctype
|= _CTYPE_SW2
;
258 ctn
->ctype
|= _CTYPE_SW3
;
264 add_width_range(int start
, int end
, int width
)
266 for (; start
<= end
; start
++) {
267 add_width(start
, width
);
272 add_caseconv(int val
, int wc
)
276 ctn
= get_ctype(val
);
300 ctype_node_t
*ctn
, *last_ct
, *last_lo
, *last_up
;
301 _FileRuneEntry
*ct
= NULL
;
302 _FileRuneEntry
*lo
= NULL
;
303 _FileRuneEntry
*up
= NULL
;
305 uint32_t runetype_ext_nranges
;
306 uint32_t maplower_ext_nranges
;
307 uint32_t mapupper_ext_nranges
;
309 (void) memset(&rl
, 0, sizeof (rl
));
310 runetype_ext_nranges
= 0;
312 maplower_ext_nranges
= 0;
314 mapupper_ext_nranges
= 0;
317 if ((f
= open_category()) == NULL
)
320 (void) memcpy(rl
.magic
, _FILE_RUNE_MAGIC_1
, 8);
321 (void) strlcpy(rl
.encoding
, get_wide_encoding(), sizeof (rl
.encoding
));
324 * Initialize the identity map.
326 for (wc
= 0; (unsigned)wc
< _CACHED_RUNES
; wc
++) {
327 rl
.maplower
[wc
] = htote(wc
);
328 rl
.mapupper
[wc
] = htote(wc
);
331 RB_FOREACH(ctn
, ctypes
, &ctypes
) {
337 * POSIX requires certain portable characters have
338 * certain types. Add them if they are missing.
340 if ((wc
>= 1) && (wc
<= 127)) {
341 if ((wc
>= 'A') && (wc
<= 'Z'))
342 ctn
->ctype
|= _ISUPPER
;
343 if ((wc
>= 'a') && (wc
<= 'z'))
344 ctn
->ctype
|= _ISLOWER
;
345 if ((wc
>= '0') && (wc
<= '9'))
346 ctn
->ctype
|= _ISDIGIT
;
348 ctn
->ctype
|= _ISPRINT
;
349 if (strchr(" \f\n\r\t\v", (char)wc
) != NULL
)
350 ctn
->ctype
|= _ISSPACE
;
351 if (strchr("0123456789ABCDEFabcdef", (char)wc
) != NULL
)
352 ctn
->ctype
|= _ISXDIGIT
;
353 if (strchr(" \t", (char)wc
))
354 ctn
->ctype
|= _ISBLANK
;
357 * Technically these settings are only
358 * required for the C locale. However, it
359 * turns out that because of the historical
360 * version of isprint(), we need them for all
361 * locales as well. Note that these are not
362 * necessarily valid punctuation characters in
363 * the current language, but ispunct() needs
364 * to return TRUE for them.
366 if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
368 ctn
->ctype
|= _ISPUNCT
;
372 * POSIX also requires that certain types imply
373 * others. Add any inferred types here.
375 if (ctn
->ctype
& (_ISUPPER
|_ISLOWER
))
376 ctn
->ctype
|= _ISALPHA
;
377 if (ctn
->ctype
& _ISDIGIT
)
378 ctn
->ctype
|= _ISXDIGIT
;
379 if (ctn
->ctype
& _ISBLANK
)
380 ctn
->ctype
|= _ISSPACE
;
381 if (ctn
->ctype
& (_ISALPHA
|_ISDIGIT
|_ISXDIGIT
))
382 ctn
->ctype
|= _ISGRAPH
;
383 if (ctn
->ctype
& _ISGRAPH
)
384 ctn
->ctype
|= _ISPRINT
;
387 * POSIX requires that certain combinations are invalid.
388 * Try fixing the cases we know about (see add_ctype_impl()).
390 if ((ctn
->ctype
& (_ISSPACE
|_ISCNTRL
)) == (_ISSPACE
|_ISCNTRL
))
391 ctn
->ctype
&= ~_ISPRINT
;
394 * Finally, don't flag remaining cases as a fatal error,
395 * and just warn about them.
397 if ((ctn
->ctype
& _ISALPHA
) &&
398 (ctn
->ctype
& (_ISPUNCT
|_ISDIGIT
)))
400 if ((ctn
->ctype
& _ISPUNCT
) &&
401 (ctn
->ctype
& (_ISDIGIT
|_ISALPHA
|_ISXDIGIT
)))
403 if ((ctn
->ctype
& _ISSPACE
) && (ctn
->ctype
& _ISGRAPH
))
405 if ((ctn
->ctype
& _ISCNTRL
) && (ctn
->ctype
& _ISPRINT
))
407 if ((wc
== ' ') && (ctn
->ctype
& (_ISPUNCT
|_ISGRAPH
)))
411 warn("conflicting classes for character 0x%x (%x)",
415 * Handle the lower 256 characters using the simple
416 * optimization. Note that if we have not defined the
417 * upper/lower case, then we identity map it.
419 if ((unsigned)wc
< _CACHED_RUNES
) {
420 rl
.runetype
[wc
] = htote(ctn
->ctype
);
422 rl
.maplower
[wc
] = htote(ctn
->tolower
);
424 rl
.mapupper
[wc
] = htote(ctn
->toupper
);
428 if ((last_ct
!= NULL
) && (last_ct
->ctype
== ctn
->ctype
) &&
429 (last_ct
->wc
+ 1 == wc
)) {
430 ct
[runetype_ext_nranges
- 1].max
= htote(wc
);
432 runetype_ext_nranges
++;
433 ct
= realloc(ct
, sizeof (*ct
) * runetype_ext_nranges
);
434 ct
[runetype_ext_nranges
- 1].min
= htote(wc
);
435 ct
[runetype_ext_nranges
- 1].max
= htote(wc
);
436 ct
[runetype_ext_nranges
- 1].map
=
440 if (ctn
->tolower
== 0) {
442 } else if ((last_lo
!= NULL
) &&
443 (last_lo
->tolower
+ 1 == ctn
->tolower
)) {
444 lo
[maplower_ext_nranges
- 1].max
= htote(wc
);
447 maplower_ext_nranges
++;
448 lo
= realloc(lo
, sizeof (*lo
) * maplower_ext_nranges
);
449 lo
[maplower_ext_nranges
- 1].min
= htote(wc
);
450 lo
[maplower_ext_nranges
- 1].max
= htote(wc
);
451 lo
[maplower_ext_nranges
- 1].map
=
456 if (ctn
->toupper
== 0) {
458 } else if ((last_up
!= NULL
) &&
459 (last_up
->toupper
+ 1 == ctn
->toupper
)) {
460 up
[mapupper_ext_nranges
-1].max
= htote(wc
);
463 mapupper_ext_nranges
++;
464 up
= realloc(up
, sizeof (*up
) * mapupper_ext_nranges
);
465 up
[mapupper_ext_nranges
- 1].min
= htote(wc
);
466 up
[mapupper_ext_nranges
- 1].max
= htote(wc
);
467 up
[mapupper_ext_nranges
- 1].map
=
473 rl
.runetype_ext_nranges
= htote(runetype_ext_nranges
);
474 rl
.maplower_ext_nranges
= htote(maplower_ext_nranges
);
475 rl
.mapupper_ext_nranges
= htote(mapupper_ext_nranges
);
476 if ((wr_category(&rl
, sizeof (rl
), f
) < 0) ||
477 (wr_category(ct
, sizeof (*ct
) * runetype_ext_nranges
, f
) < 0) ||
478 (wr_category(lo
, sizeof (*lo
) * maplower_ext_nranges
, f
) < 0) ||
479 (wr_category(up
, sizeof (*up
) * mapupper_ext_nranges
, f
) < 0)) {