/* Copyright 2016-2024 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

/* Written by Paul Eggert.  */
22 #include <localeinfo.h>
29 /* Use ISO C 99 API. */
31 # define char32_t wchar_t
32 # define mbrtoc32 mbrtowc
33 # define c32tolower towlower
34 # define c32toupper towupper
36 /* Use ISO C 11 + gnulib API. */
/* The sbclen implementation relies on this: the longest multibyte
   character length must be representable as a signed char value.  */
static_assert (MB_LEN_MAX <= SCHAR_MAX);
/* Return true if the locale uses UTF-8.

   The probe decodes the two-byte sequence "\xc4\x80" with the current
   locale's multibyte decoder and reports success only if both bytes
   are consumed and the resulting character is 0x100.

   NOTE(review): the function header and the declarations of WC and
   MBS were missing from the text under review; they are reconstructed
   from the argument-less call in init_localeinfo and from the uses in
   the return statement.  Confirm against the pristine file.  */

static bool
is_using_utf8 (void)
{
  char32_t wc;
  mbstate_t mbs = {0};	/* Begin in the initial conversion state.  */
  return mbrtoc32 (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
}
/* Return true if the locale is compatible enough with the C locale so
   that the locale is single-byte, bytes are in collating-sequence
   order, and there are no multi-character collating elements.
   MULTIBYTE is true if the caller has determined that the locale is
   multibyte, in which case the answer is always false.

   NOTE(review): the return type, the braces, the end of the enum and
   the return statements were missing from the text under review; they
   are reconstructed from the visible conditions and from the contract
   stated above.  Confirm against the pristine file.  */

static bool
using_simple_locale (bool multibyte)
{
  /* The native character set is known to be compatible with
     the C locale.  The following test isn't perfect, but it's good
     enough in practice, as only ASCII and EBCDIC are in common use
     and this test correctly accepts ASCII and rejects EBCDIC.  */
  enum { native_c_charset =
    ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
     && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
     && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
     && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
     && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
     && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
     && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
     && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
     && '}' == 125 && '~' == 126)
  };

  if (!native_c_charset || multibyte)
    return false;

  /* As a heuristic, use strcoll to compare native character order.
     If this agrees with byte order the locale should be simple.
     This heuristic should work for all known practical locales,
     although it would be invalid for artificially-constructed locales
     where the native order is the collating-sequence order but there
     are multi-character collating elements.  */
  for (int i = 0; i < UCHAR_MAX; i++)
    /* Each one-byte string must collate strictly before its successor.  */
    if (0 <= strcoll (((char []) {i, 0}), ((char []) {i + 1, 0})))
      return false;

  return true;
}
92 /* Initialize *LOCALEINFO from the current locale. */
95 init_localeinfo (struct localeinfo
*localeinfo
)
97 localeinfo
->multibyte
= MB_CUR_MAX
> 1;
98 localeinfo
->simple
= using_simple_locale (localeinfo
->multibyte
);
99 localeinfo
->using_utf8
= is_using_utf8 ();
101 for (int i
= CHAR_MIN
; i
<= CHAR_MAX
; i
++)
104 unsigned char uc
= i
;
107 size_t len
= mbrtoc32 (&wc
, &c
, 1, &s
);
108 localeinfo
->sbclen
[uc
] = len
<= 1 ? 1 : - (int) - len
;
109 localeinfo
->sbctowc
[uc
] = len
<= 1 ? wc
: WEOF
;
/* The set of char32_t values C such that there's a useful locale
   somewhere where C != towupper (C) && C != towlower (towupper (C)).
   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).

   NOTE(review): the braces and the 0x03F2 entry were missing from the
   text under review.  The entry is restored because the comment
   inside the table describes U+03F2 at exactly that position.
   Confirm against the pristine file.  */
static unsigned short int const lonesome_lower[] =
  {
    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,

    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
    0x03F2,

    0x03F5, 0x1E9B, 0x1FBE,
  };
130 /* Verify that the worst case fits. This is 1 for towupper, 1 for
131 towlower, and 1 for each entry in LONESOME_LOWER. */
132 static_assert (1 + 1 + sizeof lonesome_lower
/ sizeof *lonesome_lower
133 <= CASE_FOLDED_BUFSIZE
);
135 /* Find the characters equal to C after case-folding, other than C
136 itself, and store them into FOLDED. Return the number of characters
137 stored; this is zero if C is WEOF. */
140 case_folded_counterparts (wint_t c
, char32_t folded
[CASE_FOLDED_BUFSIZE
])
144 wint_t uc
= c32toupper (c
);
145 wint_t lc
= c32tolower (uc
);
148 if (lc
!= uc
&& lc
!= c
&& c32toupper (lc
) == uc
)
150 for (i
= 0; i
< sizeof lonesome_lower
/ sizeof *lonesome_lower
; i
++)
152 wint_t li
= lonesome_lower
[i
];
153 if (li
!= lc
&& li
!= uc
&& li
!= c
&& c32toupper (li
) == uc
)