wsutil/unicode-utils.c

   1 /* unicode-utils.c
   2  * Unicode utility routines
   3  *
   4  * Wireshark - Network traffic analyzer
   5  * By Gerald Combs <gerald@wireshark.org>
   6  * Copyright 2006 Gerald Combs
   7  *
   8  * SPDX-License-Identifier: GPL-2.0-or-later
   9  */
  10
  11 #include "config.h"
  12
  13 #include "unicode-utils.h"
  14
  15 const int ws_utf8_seqlen[256] = {
  16     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x00...0x0f */
  17     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x10...0x1f */
  18     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x20...0x2f */
  19     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x30...0x3f */
  20     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x40...0x4f */
  21     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x50...0x5f */
  22     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x60...0x6f */
  23     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  /* 0x70...0x7f */
  24     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0x80...0x8f */
  25     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0x90...0x9f */
  26     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0xa0...0xaf */
  27     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  /* 0xb0...0xbf */
  28     0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  /* 0xc0...0xcf */
  29     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  /* 0xd0...0xdf */
  30     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,  /* 0xe0...0xef */
  31     4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,  /* 0xf0...0xff */
  32 };
  33
  34 /* Given a pointer and a length, validates a string of bytes as UTF-8.
  35  * Returns the number of valid bytes, and a pointer immediately past
  36  * the checked region.
  37  *
  38  * Differs from Glib's g_utf8_validate_len in that null bytes are
  39  * considered valid UTF-8, and that maximal subparts are replaced as
  40  * a unit. (I.e., given a sequence of 2 or 3 bytes which are a
  41  * truncated version of a 3 or 4 byte UTF-8 character, but the next
  42  * byte does not continue the character, the set of 2 or 3 bytes
  43  * are replaced with one REPLACMENT CHARACTER.)
  44  */
  45 static inline size_t
  46 utf_8_validate(const uint8_t *start, ssize_t length, const uint8_t **end)
  47 {
  48     const uint8_t *ptr = start;
  49     uint8_t ch;
  50     size_t unichar_len, valid_bytes = 0;
  51
  52     while (length > 0) {
  53
  54         ch = *ptr;
  55
  56         if (ch < 0x80) {
  57             valid_bytes++;
  58             ptr++;
  59             length--;
  60             continue;
  61         }
  62
  63         ch = *ptr;
  64
  65         if (ch < 0xc2 || ch > 0xf4) {
  66             ptr++;
  67             length--;
  68             *end = ptr;
  69             return valid_bytes;
  70         }
  71
  72         if (ch < 0xe0) { /* 110xxxxx, 2 byte char */
  73             unichar_len = 2;
  74         } else if (ch < 0xf0) { /* 1110xxxx, 3 byte char */
  75             unichar_len = 3;
  76             ptr++;
  77             length--;
  78             if (length < 1) {
  79                 *end = ptr;
  80                 return valid_bytes;
  81             }
  82             switch (ch) {
  83                 case 0xe0:
  84                     if (*ptr < 0xa0 || *ptr > 0xbf) {
  85                         *end = ptr;
  86                         return valid_bytes;
  87                     }
  88                     break;
  89                 case 0xed:
  90                     if (*ptr < 0x80 || *ptr > 0x9f) {
  91                         *end = ptr;
  92                         return valid_bytes;
  93                     }
  94                     break;
  95                 default:
  96                     if (*ptr < 0x80 || *ptr > 0xbf) {
  97                         *end = ptr;
  98                         return valid_bytes;
  99                     }
 100             }
 101         } else { /* 11110xxx, 4 byte char - > 0xf4 excluded above */
 102             unichar_len = 4;
 103             ptr++;
 104             length--;
 105             if (length < 1) {
 106                 *end = ptr;
 107                 return valid_bytes;
 108             }
 109             switch (ch) {
 110                 case 0xf0:
 111                     if (*ptr < 0x90 || *ptr > 0xbf) {
 112                         *end = ptr;
 113                         return valid_bytes;
 114                     }
 115                     break;
 116                 case 0xf4:
 117                     if (*ptr < 0x80 || *ptr > 0x8f) {
 118                         *end = ptr;
 119                         return valid_bytes;
 120                     }
 121                     break;
 122                 default:
 123                     if (*ptr < 0x80 || *ptr > 0xbf) {
 124                         *end = ptr;
 125                         return valid_bytes;
 126                     }
 127             }
 128             ptr++;
 129             length--;
 130             if (length < 1) {
 131                 *end = ptr;
 132                 return valid_bytes;
 133             }
 134             if (*ptr < 0x80 || *ptr > 0xbf) {
 135                 *end = ptr;
 136                 return valid_bytes;
 137             }
 138         }
 139
 140         ptr++;
 141         length--;
 142         if (length < 1) {
 143             *end = ptr;
 144             return valid_bytes;
 145         }
 146         if (*ptr < 0x80 || *ptr > 0xbf) {
 147             *end = ptr;
 148             return valid_bytes;
 149         } else {
 150             ptr++;
 151             length--;
 152             valid_bytes += unichar_len;
 153         }
 154
 155     }
 156     *end = ptr;
 157     return valid_bytes;
 158 }
 159
 160 /*
 161  * Given a wmem scope, a pointer, and a length, treat the string of bytes
 162  * referred to by the pointer and length as a UTF-8 string, and return a
 163  * pointer to a UTF-8 string, allocated using the wmem scope, with all
 164  * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
 165  * according to the recommended "best practices" given in the Unicode
 166  * Standard and specified by W3C/WHATWG.
 167  *
 168  * Note that in conformance with the Unicode Standard, this treats three
 169  * byte sequences corresponding to UTF-16 surrogate halves (paired or unpaired)
 170  * and two byte overlong encodings of 7-bit ASCII characters as invalid and
 171  * substitutes REPLACEMENT CHARACTER for them. Explicit support for nonstandard
 172  * derivative encoding formats (e.g. CESU-8, Java Modified UTF-8, WTF-8) could
 173  * be added later.
 174  *
 175  * Compared with g_utf8_make_valid(), this function does not consider
 176  * internal NUL bytes as invalid and replace them with replacment characters.
 177  * It also replaces maximal subparts as a unit; i.e., a sequence of 2 or 3
 178  * bytes which are a truncated version of a valid 3 or 4 byte character (but
 179  * the next byte does not continue the character) are replaced with a single
 180  * REPLACEMENT CHARACTER, whereas the Glib function replaces each byte of the
 181  * sequence with its own (3 octet) REPLACEMENT CHARACTER.
 182  *
 183  * XXX: length should probably be a size_t instead of a int in all
 184  * these encoding functions
 185  * XXX: the buffer returned can be of different length than the input,
 186  * and can have internal NULs as well (so that strlen doesn't give its
 187  * length). As with the other encoding functions, we should return the
 188  * length of the output buffer (or a wmem_strbuf_t directly) and an
 189  * indication of whether there was an invalid character (i.e.
 190  * REPLACEMENT CHARACTER was used.)
 191  */
 192 wmem_strbuf_t *
 193 ws_utf8_make_valid_strbuf(wmem_allocator_t *scope, const uint8_t *ptr, ssize_t length)
 194 {
 195     wmem_strbuf_t *str;
 196
 197     str = wmem_strbuf_new_sized(scope, length+1);
 198
 199     /* See the Unicode Standard conformance chapter at
 200      * https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf especially
 201      * Table 3-7 "Well-Formed UTF-8 Byte Sequences" and
 202      * U+FFFD Substitution of Maximal Subparts. */
 203
 204     while (length > 0) {
 205         const uint8_t *prev = ptr;
 206         size_t valid_bytes = utf_8_validate(prev, length, &ptr);
 207
 208         if (valid_bytes) {
 209             wmem_strbuf_append_len(str, prev, valid_bytes);
 210         }
 211         length -= ptr - prev;
 212         prev += valid_bytes;
 213         if (ptr - prev) {
 214             wmem_strbuf_append_unichar_repl(str);
 215         }
 216     }
 217
 218     return str;
 219 }
 220
 221 uint8_t *
 222 ws_utf8_make_valid(wmem_allocator_t *scope, const uint8_t *ptr, ssize_t length)
 223 {
 224     wmem_strbuf_t *str = ws_utf8_make_valid_strbuf(scope, ptr, length);
 225     return wmem_strbuf_finalize(str);
 226 }
 227
 228 #ifdef _WIN32
 229
 230 #include <strsafe.h>
 231
 232 /** @file
 233  * Unicode utilities (internal interface)
 234  *
 235  * We define UNICODE and _UNICODE under Windows.  This means that
 236  * Windows SDK routines expect UTF-16 strings, in contrast to newer
 237  * versions of Glib and GTK+ which expect UTF-8.  This module provides
 238  * convenience routines for converting between UTF-8 and UTF-16.
 239  */
 240
 241 #define INITIAL_UTFBUF_SIZE 128
 242
 243 /*
 244  * XXX - Should we use g_utf8_to_utf16() and g_utf16_to_utf8()
 245  * instead?  The goal of the functions below was to provide simple
 246  * wrappers for UTF-8 <-> UTF-16 conversion without making the
 247  * caller worry about freeing up memory afterward.
 248  */
 249
 250 /* Convert from UTF-8 to UTF-16. */
 251 const wchar_t *
 252 utf_8to16(const char *utf8str)
 253 {
 254     static wchar_t *utf16buf[3];
 255     static int utf16buf_len[3];
 256     static int idx;
 257
 258     if (utf8str == NULL)
 259         return NULL;
 260
 261     idx = (idx + 1) % 3;
 262
 263     /*
 264      * Allocate the buffer if it's not already allocated.
 265      */
 266     if (utf16buf[idx] == NULL) {
 267         utf16buf_len[idx] = INITIAL_UTFBUF_SIZE;
 268         utf16buf[idx] = g_malloc(utf16buf_len[idx] * sizeof(wchar_t));
 269     }
 270
 271     while (MultiByteToWideChar(CP_UTF8, 0, utf8str, -1, NULL, 0) >= utf16buf_len[idx]) {
 272         /*
 273          * Double the buffer's size if it's not big enough.
 274          * The size of the buffer starts at 128, so doubling its size
 275          * adds at least another 128 bytes, which is more than enough
 276          * for one more character plus a terminating '\0'.
 277          */
 278         utf16buf_len[idx] *= 2;
 279         utf16buf[idx] = g_realloc(utf16buf[idx], utf16buf_len[idx] * sizeof(wchar_t));
 280     }
 281
 282     if (MultiByteToWideChar(CP_UTF8, 0, utf8str, -1, utf16buf[idx], utf16buf_len[idx]) == 0)
 283         return NULL;
 284
 285     return utf16buf[idx];
 286 }
 287
 288 void
 289 utf_8to16_snprintf(TCHAR *utf16buf, int utf16buf_len, const char* fmt, ...)
 290 {
 291     va_list ap;
 292     char* dst;
 293
 294     va_start(ap,fmt);
 295     dst = ws_strdup_vprintf(fmt, ap);
 296     va_end(ap);
 297
 298     StringCchPrintf(utf16buf, utf16buf_len, _T("%s"), utf_8to16(dst));
 299
 300     g_free(dst);
 301 }
 302
 303 /* Convert from UTF-16 to UTF-8. */
 304 char *
 305 utf_16to8(const wchar_t *utf16str)
 306 {
 307     static char *utf8buf[3];
 308     static int utf8buf_len[3];
 309     static int idx;
 310
 311     if (utf16str == NULL)
 312         return NULL;
 313
 314     idx = (idx + 1) % 3;
 315
 316     /*
 317      * Allocate the buffer if it's not already allocated.
 318     */
 319     if (utf8buf[idx] == NULL) {
 320         utf8buf_len[idx] = INITIAL_UTFBUF_SIZE;
 321         utf8buf[idx] = g_malloc(utf8buf_len[idx]);
 322     }
 323
 324     while (WideCharToMultiByte(CP_UTF8, 0, utf16str, -1, NULL, 0, NULL, NULL) >= utf8buf_len[idx]) {
 325         /*
 326          * Double the buffer's size if it's not big enough.
 327          * The size of the buffer starts at 128, so doubling its size
 328          * adds at least another 128 bytes, which is more than enough
 329          * for one more character plus a terminating '\0'.
 330          */
 331         utf8buf_len[idx] *= 2;
 332         utf8buf[idx] = g_realloc(utf8buf[idx], utf8buf_len[idx]);
 333     }
 334
 335     if (WideCharToMultiByte(CP_UTF8, 0, utf16str, -1, utf8buf[idx], utf8buf_len[idx], NULL, NULL) == 0)
 336         return NULL;
 337
 338     return utf8buf[idx];
 339 }
 340
 341 /* Convert our argument list from UTF-16 to UTF-8. */
 342 char **
 343 arg_list_utf_16to8(int argc, wchar_t *wc_argv[]) {
 344     char **argv;
 345     int i;
 346
 347     argv = (char **)g_malloc((argc + 1) * sizeof(char *));
 348     for (i = 0; i < argc; i++) {
 349         argv[i] = g_utf16_to_utf8(wc_argv[i], -1, NULL, NULL, NULL);
 350     }
 351     argv[argc] = NULL;
 352     return argv;
 353 }
 354
 355 #endif
 356
 357 /*
 358  * Editor modelines  -  https://www.wireshark.org/tools/modelines.html
 359  *
 360  * Local variables:
 361  * c-basic-offset: 4
 362  * tab-width: 8
 363  * indent-tabs-mode: nil
 364  * End:
 365  *
 366  * vi: set shiftwidth=4 tabstop=8 expandtab:
 367  * :indentSize=4:tabSize=8:noTabs=true:
 368  */