components/python/python39/patches/26-locale-encoding.patch

   1 Python on Solaris doesn't handle non-UTF-8 locales because of the way they are
   2 encoded. The wchar_t encoding used for stored symbols is not standardized. While
   3 on Linux symbols from all encodings are mapped to their Unicode code points,
   4 this is not the case on Solaris, where only UTF-8 locales work like that; other
   5 encodings can use any arbitrary value. Since Python expects no value to be
   6 higher than the maximum valid code point in Unicode (which is U+10FFFF), it
   7 breaks on Solaris when a non-UTF-8 locale is used. See bug 31790476.
   8
   9 To fix this, we have to convert the given wchar_t string to UTF-32 whenever the
  10 locale is not UTF-8 encoded (or ASCII, which can safely be ignored).
  11
  12 --- Python-3.9.1/Include/unicodeobject.h
  13 +++ Python-3.9.1/Include/unicodeobject.h
  14 @@ -97,6 +97,11 @@ Copyright (c) Corporation for National R
  15  #  include <wchar.h>
  16  #endif
  17
  18 +#if defined(__sun) && defined(__SVR4)
  19 +#  include <uchar.h>
  20 +#  include <langinfo.h>
  21 +#endif
  22 +
  23  /* Py_UCS4 and Py_UCS2 are typedefs for the respective
  24     unicode representations. */
  25  typedef uint32_t Py_UCS4;
  26 --- Python-3.9.1/Objects/unicodeobject.c
  27 +++ Python-3.9.1/Objects/unicodeobject.c
  28 @@ -2187,6 +2187,15 @@ PyUnicode_FromUnicode(const Py_UNICODE *
  29      return PyUnicode_FromWideChar(u, size);
  30  }
  31
  32 +#if defined(__sun) && defined(__SVR4)
  33 +/* Return nonzero if the current locale's codeset is UTF-8 or "646" (ASCII). */
  34 +int codeset_is_utf8_compatible(void)
  35 +{
  36 +    const char *codeset = nl_langinfo(CODESET);
  37 +    return strcmp(codeset, "UTF-8") == 0 || strcmp(codeset, "646") == 0;
  38 +}
  39 +#endif
  40 +
  41  PyObject *
  42  PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
  43  {
  44 @@ -2210,6 +2219,58 @@ PyUnicode_FromWideChar(const wchar_t *u,
  45      if (size == 0)
  46          _Py_RETURN_UNICODE_EMPTY();
  47
  48 +#if defined(__sun) && defined(__SVR4)
  49 +    /* Check whether current locale uses UTF to encode symbols */
  50 +    if (!codeset_is_utf8_compatible()) {
  51 +        /* 'u' may be longer than 'size'; copy and terminate that part. */
  52 +        wchar_t *substr = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
  53 +        if (substr == NULL) {
  54 +            return PyErr_NoMemory();
  55 +        }
  56 +        memcpy(substr, u, size * sizeof(wchar_t));
  57 +        substr[size] = 0;
  58 +
  59 +        /* Convert to a multibyte string. Check for the (size_t)-1 error value
  60 +           before adding room for the terminator, or it wraps around to 0. */
  61 +        size_t nbytes = wcstombs(NULL, substr, 0);
  62 +        if (nbytes == (size_t)-1) {
  63 +            PyMem_RawFree(substr);
  64 +            PyErr_Format(PyExc_ValueError, "wcstombs() conversion failed");
  65 +            return NULL;
  66 +        }
  67 +        char *buffer = PyMem_RawMalloc(nbytes + 1);
  68 +        if (buffer == NULL) {
  69 +            PyMem_RawFree(substr);
  70 +            return PyErr_NoMemory();
  71 +        }
  72 +        size_t res = wcstombs(buffer, substr, nbytes + 1);
  73 +        assert(res == nbytes);
  74 +
  75 +        /* Convert to UTF-32, reusing 'substr' for the output: on Solaris
  76 +           wchar_t and char32_t have the same size and map one to one. */
  77 +        char32_t *c32 = (char32_t *)substr;
  78 +        mbstate_t state = {0};
  79 +        Py_ssize_t i = 0;
  80 +        const char *ptr = buffer;
  81 +        const char *end = buffer + nbytes + 1;
  82 +        while ((res = mbrtoc32(&c32[i], ptr, end - ptr, &state)) != 0) {
  83 +            if (res == (size_t)-1 || res == (size_t)-2 || res == (size_t)-3) {
  84 +                PyMem_RawFree(c32);
  85 +                PyMem_RawFree(buffer);
  86 +                PyErr_Format(PyExc_ValueError, "mbrtoc32() conversion failed "
  87 +                             "with error code: %zd", (Py_ssize_t)res);
  88 +                return NULL;
  89 +            }
  90 +            ptr += res;
  91 +            i++;
  92 +        }
  93 +        PyMem_RawFree(buffer);
  94 +        PyObject *unicode = _PyUnicode_FromUCS4(c32, size);
  95 +        PyMem_RawFree(c32);
  96 +        return unicode;
  97 +    }
  98 +#endif
  99 +
 100      /* Single character Unicode objects in the Latin-1 range are
 101         shared when using this constructor */
 102      if (size == 1 && (Py_UCS4)*u < 256)