lib/localeinfo.c

   1 /* locale information
   2
   3    Copyright 2016-2024 Free Software Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  17
  18 /* Written by Paul Eggert.  */
  19
  20 #include <config.h>
  21
  22 #include <localeinfo.h>
  23
  24 #include <limits.h>
  25 #include <locale.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #if GAWK
  29 /* Use ISO C 99 API.  */
  30 # include <wctype.h>
  31 # define char32_t wchar_t
  32 # define mbrtoc32 mbrtowc
  33 # define c32tolower towlower
  34 # define c32toupper towupper
  35 #else
  36 /* Use ISO C 11 + gnulib API.  */
  37 # include <uchar.h>
  38 #endif
  39
  40 /* The sbclen implementation relies on this.  */
  41 static_assert (MB_LEN_MAX <= SCHAR_MAX);
  42
  43 /* Return true if the locale uses UTF-8.  */
  44
  45 static bool
  46 is_using_utf8 (void)
  47 {
  48   char32_t wc;
  49   mbstate_t mbs = {0};
  50   return mbrtoc32 (&wc, "\xc4\x80", 2, &mbs) == 2 && wc == 0x100;
  51 }
  52
  53 /* Return true if the locale is compatible enough with the C locale so
  54    that the locale is single-byte, bytes are in collating-sequence
  55    order, and there are no multi-character collating elements.  */
  56
  57 static bool
  58 using_simple_locale (bool multibyte)
  59 {
  60   /* The native character set is known to be compatible with
  61      the C locale.  The following test isn't perfect, but it's good
  62      enough in practice, as only ASCII and EBCDIC are in common use
  63      and this test correctly accepts ASCII and rejects EBCDIC.  */
  64   enum { native_c_charset =
  65     ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
  66      && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
  67      && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
  68      && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
  69      && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
  70      && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
  71      && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
  72      && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
  73      && '}' == 125 && '~' == 126)
  74   };
  75
  76   if (!native_c_charset || multibyte)
  77     return false;
  78
  79   /* As a heuristic, use strcoll to compare native character order.
  80      If this agrees with byte order the locale should be simple.
  81      This heuristic should work for all known practical locales,
  82      although it would be invalid for artificially-constructed locales
  83      where the native order is the collating-sequence order but there
  84      are multi-character collating elements.  */
  85   for (int i = 0; i < UCHAR_MAX; i++)
  86     if (0 <= strcoll (((char []) {i, 0}), ((char []) {i + 1, 0})))
  87       return false;
  88
  89   return true;
  90 }
  91
  92 /* Initialize *LOCALEINFO from the current locale.  */
  93
  94 void
  95 init_localeinfo (struct localeinfo *localeinfo)
  96 {
  97   localeinfo->multibyte = MB_CUR_MAX > 1;
  98   localeinfo->simple = using_simple_locale (localeinfo->multibyte);
  99   localeinfo->using_utf8 = is_using_utf8 ();
 100
 101   for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
 102     {
 103       char c = i;
 104       unsigned char uc = i;
 105       mbstate_t s = {0};
 106       char32_t wc;
 107       size_t len = mbrtoc32 (&wc, &c, 1, &s);
 108       localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
 109       localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
 110     }
 111 }
 112
 113 /* The set of char32_t values C such that there's a useful locale
 114    somewhere where C != towupper (C) && C != towlower (towupper (C)).
 115    For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
 116    towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
 117    towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
 118 static unsigned short int const lonesome_lower[] =
 119   {
 120     0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
 121     0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
 122
 123     /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
 124        counterpart in locales predating Unicode 4.0.0 (April 2003).  */
 125     0x03F2,
 126
 127     0x03F5, 0x1E9B, 0x1FBE,
 128   };
 129
 130 /* Verify that the worst case fits.  This is 1 for towupper, 1 for
 131    towlower, and 1 for each entry in LONESOME_LOWER.  */
 132 static_assert (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
 133                <= CASE_FOLDED_BUFSIZE);
 134
 135 /* Find the characters equal to C after case-folding, other than C
 136    itself, and store them into FOLDED.  Return the number of characters
 137    stored; this is zero if C is WEOF.  */
 138
 139 int
 140 case_folded_counterparts (wint_t c, char32_t folded[CASE_FOLDED_BUFSIZE])
 141 {
 142   int i;
 143   int n = 0;
 144   wint_t uc = c32toupper (c);
 145   wint_t lc = c32tolower (uc);
 146   if (uc != c)
 147     folded[n++] = uc;
 148   if (lc != uc && lc != c && c32toupper (lc) == uc)
 149     folded[n++] = lc;
 150   for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
 151     {
 152       wint_t li = lonesome_lower[i];
 153       if (li != lc && li != uc && li != c && c32toupper (li) == uc)
 154         folded[n++] = li;
 155     }
 156   return n;
 157 }