openat: don’t close (-1)
[gnulib.git] / lib / localeinfo.c
blobc90f1d76b52eb7bdc3b5cf523eefeee58a712e2e
/* locale information

   Copyright 2016-2024 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

/* Written by Paul Eggert.  */
20 #include <config.h>
22 #include <localeinfo.h>
24 #include <limits.h>
25 #include <locale.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #if GAWK
29 /* Use ISO C 99 API. */
30 # include <wctype.h>
31 # define char32_t wchar_t
32 # define mbrtoc32 mbrtowc
33 # define c32tolower towlower
34 # define c32toupper towupper
35 #else
36 /* Use ISO C 11 + gnulib API. */
37 # include <uchar.h>
38 #endif
/* The sbclen implementation relies on this: every possible multibyte
   character length must be representable in a signed char.  */
static_assert (MB_LEN_MAX <= SCHAR_MAX);
/* Return true if the locale uses UTF-8.

   The check decodes the two-byte sequence C4 80 in the current locale
   and tests whether it yields exactly one character with value 0x100
   (the UTF-8 decoding of those bytes).  */

static bool
is_using_utf8 (void)
{
  char32_t wc;
  mbstate_t mbs = {0};   /* Start from the initial conversion state.  */
  return mbrtoc32 (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
}
/* Return true if the locale is compatible enough with the C locale so
   that the locale is single-byte, bytes are in collating-sequence
   order, and there are no multi-character collating elements.

   MULTIBYTE should be true if the locale is already known to be
   multibyte; in that case the result is always false.  */

static bool
using_simple_locale (bool multibyte)
{
  /* The native character set is known to be compatible with
     the C locale.  The following test isn't perfect, but it's good
     enough in practice, as only ASCII and EBCDIC are in common use
     and this test correctly accepts ASCII and rejects EBCDIC.  */
  enum { native_c_charset =
    ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
     && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
     && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
     && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
     && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
     && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
     && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
     && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
     && '}' == 125 && '~' == 126)
  };

  if (!native_c_charset || multibyte)
    return false;

  /* As a heuristic, use strcoll to compare native character order.
     If this agrees with byte order the locale should be simple.
     This heuristic should work for all known practical locales,
     although it would be invalid for artificially-constructed locales
     where the native order is the collating-sequence order but there
     are multi-character collating elements.

     Each iteration compares the one-byte string for I against the
     one-byte string for I + 1; any pair that does not collate in
     strictly ascending order disqualifies the locale.  */
  for (int i = 0; i < UCHAR_MAX; i++)
    if (0 <= strcoll (((char []) {i, 0}), ((char []) {i + 1, 0})))
      return false;

  return true;
}
92 /* Initialize *LOCALEINFO from the current locale. */
94 void
95 init_localeinfo (struct localeinfo *localeinfo)
97 localeinfo->multibyte = MB_CUR_MAX > 1;
98 localeinfo->simple = using_simple_locale (localeinfo->multibyte);
99 localeinfo->using_utf8 = is_using_utf8 ();
101 for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
103 char c = i;
104 unsigned char uc = i;
105 mbstate_t s = {0};
106 char32_t wc;
107 size_t len = mbrtoc32 (&wc, &c, 1, &s);
108 localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
109 localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
/* The set of char32_t values C such that there's a useful locale
   somewhere where C != towupper (C) && C != towlower (towupper (C)).
   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
static unsigned short int const lonesome_lower[] =
  {
    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,

    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
    0x03F2,

    0x03F5, 0x1E9B, 0x1FBE,
  };
130 /* Verify that the worst case fits. This is 1 for towupper, 1 for
131 towlower, and 1 for each entry in LONESOME_LOWER. */
132 static_assert (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
133 <= CASE_FOLDED_BUFSIZE);
135 /* Find the characters equal to C after case-folding, other than C
136 itself, and store them into FOLDED. Return the number of characters
137 stored; this is zero if C is WEOF. */
140 case_folded_counterparts (wint_t c, char32_t folded[CASE_FOLDED_BUFSIZE])
142 int i;
143 int n = 0;
144 wint_t uc = c32toupper (c);
145 wint_t lc = c32tolower (uc);
146 if (uc != c)
147 folded[n++] = uc;
148 if (lc != uc && lc != c && c32toupper (lc) == uc)
149 folded[n++] = lc;
150 for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
152 wint_t li = lonesome_lower[i];
153 if (li != lc && li != uc && li != c && c32toupper (li) == uc)
154 folded[n++] = li;
156 return n;