wsutil/unicode-utils.h

   1 /* unicode-utils.h
   2  * Unicode utility definitions
   3  *
   4  * Wireshark - Network traffic analyzer
   5  * By Gerald Combs <gerald@wireshark.org>
   6  * Copyright 2006 Gerald Combs
   7  *
   8  * SPDX-License-Identifier: GPL-2.0-or-later
   9  */
  10
  11 #ifndef __UNICODEUTIL_H__
  12 #define __UNICODEUTIL_H__
  13
  14 #include <wireshark.h>
  15
  16 #ifdef _WIN32
  17 #include <windows.h>
  18 #include <tchar.h>
  19 #include <wchar.h>
  20 #endif
  21
  22 /**
  23  * @file
  24  * Unicode convenience routines.
  25  */
  26
  27 #ifdef  __cplusplus
  28 extern "C" {
  29 #endif
  30
  31 #ifdef WS_DEBUG_UTF_8
  32 #define DEBUG_UTF_8_ENABLED true
  33 #else
  34 #define DEBUG_UTF_8_ENABLED false
  35 #endif
  36
  37 #define _CHECK_UTF_8(level, str, len) \
  38     do {                                                                \
  39         const char *__uni_endptr;                                       \
  40         if (DEBUG_UTF_8_ENABLED && (str) != NULL &&                     \
  41                         !g_utf8_validate(str, len, &__uni_endptr)) {    \
  42             ws_log_utf8(str, len, __uni_endptr);                        \
  43         }                                                               \
  44     } while (0)
  45
  46 #define WS_UTF_8_CHECK(str, len) \
  47     _CHECK_UTF_8(LOG_LEVEL_DEBUG, str, len)
  48
  49 #define WS_UTF_8_DEBUG_HERE(str, len) \
  50     _CHECK_UTF_8(LOG_LEVEL_ECHO, str, len)
  51
  52 WSUTIL_EXPORT
  53 const int ws_utf8_seqlen[256];
  54
  55 /** Given the first byte in an UTF-8 encoded code point,
  56  * return the length of the multibyte sequence, or *ZERO*
  57  * if the byte is invalid as the first byte in a multibyte
  58  * sequence.
  59  */
  60 #define ws_utf8_char_len(ch)  (ws_utf8_seqlen[(ch)])
  61
  62 /*
  63  * Given a wmem scope, a pointer, and a length, treat the string of bytes
  64  * referred to by the pointer and length as a UTF-8 string, and return a
  65  * pointer to a UTF-8 string, allocated using the wmem scope, with all
  66  * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
  67  * according to the recommended "best practices" given in the Unicode
  68  * Standard and specified by W3C/WHATWG.
  69  */
  70 WS_DLL_PUBLIC uint8_t *
  71 ws_utf8_make_valid(wmem_allocator_t *scope, const uint8_t *ptr, ssize_t length);
  72
  73 /*
  74  * Same as ws_utf8_make_valid() but returns a wmem_strbuf_t.
  75  */
  76 WS_DLL_PUBLIC wmem_strbuf_t *
  77 ws_utf8_make_valid_strbuf(wmem_allocator_t *scope, const uint8_t *ptr, ssize_t length);
  78
  79 #ifdef _WIN32
  80
  81 /** Given a UTF-8 string, convert it to UTF-16.  This is meant to be used
  82  * to convert between GTK+ 2.x (UTF-8) to Windows (UTF-16).
  83  *
  84  * @param utf8str The string to convert.  May be NULL.
  85  * @return The string converted to UTF-16.  If utf8str is NULL, returns
  86  * NULL.  The return value should NOT be freed by the caller.
  87  */
  88 WS_DLL_PUBLIC
  89 const wchar_t * utf_8to16(const char *utf8str);
  90
  91 /** Create a UTF-16 string (in place) according to the format string.
  92  *
  93  * @param utf16buf The buffer to return the UTF-16 string in.
  94  * @param utf16buf_len The size of the 'utf16buf' parameter
  95  * @param fmt A standard printf() format string
  96  */
  97 WS_DLL_PUBLIC
  98 void utf_8to16_snprintf(TCHAR *utf16buf, int utf16buf_len, const char* fmt, ...)
  99 G_GNUC_PRINTF(3, 4);
 100
 101 /** Given a UTF-16 string, convert it to UTF-8.  This is meant to be used
 102  * to convert between GTK+ 2.x (UTF-8) to Windows (UTF-16).
 103  *
 104  * @param utf16str The string to convert.  May be NULL.
 105  * @return The string converted to UTF-8.  If utf16str is NULL, returns
 106  * NULL.  The return value should NOT be freed by the caller.
 107  */
 108 WS_DLL_PUBLIC
 109 char * utf_16to8(const wchar_t *utf16str);
 110
 111 /** Convert the supplied program argument list from UTF-16 to UTF-8
 112  * return a pointer to the array of UTF-8 arguments. This is intended
 113  * to be used to normalize command line arguments at program startup.
 114  *
 115  * @param argc The number of arguments.
 116  * @param argv The argument values (vector).
 117  */
 118 WS_DLL_PUBLIC
 119 char **arg_list_utf_16to8(int argc, wchar_t *wc_argv[]);
 120
 121 #endif /* _WIN32 */
 122
 123 /*
 124  * defines for helping with UTF-16 surrogate pairs
 125  */
 126
 127 #define IS_LEAD_SURROGATE(uchar2) \
 128     ((uchar2) >= 0xd800 && (uchar2) < 0xdc00)
 129 #define IS_TRAIL_SURROGATE(uchar2) \
 130     ((uchar2) >= 0xdc00 && (uchar2) < 0xe000)
 131 #define SURROGATE_VALUE(lead, trail) \
 132     (((((lead) - 0xd800) << 10) | ((trail) - 0xdc00)) + 0x10000)
 133
 134 #ifdef  __cplusplus
 135 }
 136 #endif
 137
 138 #endif /* __UNICODEUTIL_H__ */