1 Python on Solaris doesn't handle non-UTF-8 locales because of the way they are
2 encoded. The wchar_t encoding used for stored symbols is not standardized. While
3 on Linux symbols from all encodings are mapped to their Unicode code points,
4 this is not the case on Solaris, where only UTF-8 locales work like that; other
5 encodings can use any arbitrary value. Since Python expects no value to be
6 higher than the maximum valid code point in Unicode (which is U+10FFFF), it
7 breaks on Solaris when a non-UTF-8 locale is used. See bug 31790476.
9 To fix this, we have to convert the given wchar_t string to UTF-32 whenever the
10 locale encoding is neither UTF-8 nor ASCII (ASCII can safely be left as is).
12 --- Python-3.9.1/Include/unicodeobject.h
13 +++ Python-3.9.1/Include/unicodeobject.h
14 @@ -97,6 +97,11 @@ Copyright (c) Corporation for National R
18 +#if defined(__sun) && defined(__SVR4)
20 +# include <langinfo.h>
23 /* Py_UCS4 and Py_UCS2 are typedefs for the respective
24 unicode representations. */
25 typedef uint32_t Py_UCS4;
26 --- Python-3.9.1/Objects/unicodeobject.c
27 +++ Python-3.9.1/Objects/unicodeobject.c
28 @@ -2187,6 +2187,15 @@ PyUnicode_FromUnicode(const Py_UNICODE *
29 return PyUnicode_FromWideChar(u, size);
32 +#if defined(__sun) && defined(__SVR4)
33 +/* Detect whether the current locale uses a UTF-8 compatible encoding. */
34 +int codeset_is_utf8_compatible()
36 + char* res = nl_langinfo(CODESET);
37 + return !(strcmp(res, "UTF-8") && strcmp(res, "646"));
42 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
44 @@ -2210,6 +2219,58 @@ PyUnicode_FromWideChar(const wchar_t *u,
46 _Py_RETURN_UNICODE_EMPTY();
48 +#if defined(__sun) && defined(__SVR4)
49 + /* Check whether the current locale uses a UTF-8 compatible encoding */
50 + if (!codeset_is_utf8_compatible()) {
52 + /* The given 'u' might not be null-terminated (size smaller than its
53 + length); copy and terminate the part we are interested in. */
54 + wchar_t* substr = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
55 + memcpy(substr, u, size * sizeof(wchar_t));
58 + /* Convert given wide-character string to a character string */
59 + size_t buffsize = wcstombs(NULL, substr, 0) + 1;
60 + if (buffsize == (size_t)-1) {
61 + PyMem_RawFree(substr);
62 + PyErr_Format(PyExc_ValueError, "wcstombs() conversion failed");
66 + char* buffer = PyMem_RawMalloc(buffsize * sizeof(char));
67 + size_t res = wcstombs(buffer, substr, buffsize);
68 + assert(res == buffsize - 1);
70 + /* Convert the character string to a UTF-32 encoded char32_t string.
71 + Since wchar_t and char32_t have the same size on Solaris and one
72 + wchar_t symbol corresponds to one UTF-32 value, we can safely
73 + reuse the substr buffer and skip an additional allocation. */
74 + char32_t* c32 = (char32_t*) substr;
75 + mbstate_t state = {0};
79 + char* end = ptr + res + 1;
80 + while (res = mbrtoc32(&(c32[i]), ptr, end - ptr, &state)) {
81 + if (res == (size_t)-1 || res == (size_t)-2 || res == (size_t)-3) {
83 + PyMem_RawFree(buffer);
84 + PyErr_Format(PyExc_ValueError,
85 + "mbrtoc32() conversion failed with error code: %d",
92 + PyMem_RawFree(buffer);
94 + PyObject *unicode = _PyUnicode_FromUCS4(c32, size);
100 /* Single character Unicode objects in the Latin-1 range are
101 shared when using this constructor */
102 if (size == 1 && (Py_UCS4)*u < 256)