2 * Unicode utility definitions
4 * Wireshark - Network traffic analyzer
5 * By Gerald Combs <gerald@wireshark.org>
6 * Copyright 2006 Gerald Combs
8 * SPDX-License-Identifier: GPL-2.0-or-later
11 #ifndef __UNICODEUTIL_H__
12 #define __UNICODEUTIL_H__
14 #include <wireshark.h>
24 * Unicode convenience routines.
/*
 * DEBUG_UTF_8_ENABLED is a compile-time boolean consulted by the UTF-8
 * checking macros in this header: when it is false their validation call is
 * short-circuited away.
 *
 * The two definitions are alternatives and must never both be active;
 * defining the macro twice with different values is an incompatible
 * redefinition.
 *
 * NOTE(review): the selecting symbol is assumed to be WS_DEBUG_UTF_8 --
 * confirm against the build configuration.
 */
#ifdef WS_DEBUG_UTF_8
#define DEBUG_UTF_8_ENABLED true
#else
#define DEBUG_UTF_8_ENABLED false
#endif
/*
 * _CHECK_UTF_8(level, str, len)
 *
 * When DEBUG_UTF_8_ENABLED is true and str is not NULL, validate the first
 * len bytes of str with g_utf8_validate(); if validation fails, hand the
 * string, its length and the end pointer reported by g_utf8_validate() to
 * ws_log_utf8().
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to
 * exactly one statement: it can be followed by a semicolon and used as the
 * sole body of an if/else without braces, and the temporary end pointer
 * stays local to the expansion.
 *
 * Notes for callers:
 *  - str is expanded more than once, so it must not have side effects.
 *  - level is accepted for the benefit of the wrappers below but is not
 *    referenced by the expansion.
 */
#define _CHECK_UTF_8(level, str, len) \
    do { \
        const char *__uni_endptr; \
        if (DEBUG_UTF_8_ENABLED && (str) != NULL && \
                !g_utf8_validate(str, len, &__uni_endptr)) { \
            ws_log_utf8(str, len, __uni_endptr); \
        } \
    } while (0)

/* Run the UTF-8 check, tagging it with LOG_LEVEL_DEBUG. */
#define WS_UTF_8_CHECK(str, len) \
    _CHECK_UTF_8(LOG_LEVEL_DEBUG, str, len)

/* Run the UTF-8 check, tagging it with LOG_LEVEL_ECHO. */
#define WS_UTF_8_DEBUG_HERE(str, len) \
    _CHECK_UTF_8(LOG_LEVEL_ECHO, str, len)
53 const int ws_utf8_seqlen
[256];
/** Given the first byte in an UTF-8 encoded code point,
 * return the length of the multibyte sequence, or *ZERO*
 * if the byte is invalid as the first byte in a multibyte
 * sequence.
 *
 * The argument is converted to unsigned char before it is used as an index.
 * Plain char may be a signed type, in which case bytes 0x80..0xFF would
 * otherwise become negative indices and read outside the 256-entry table.
 */
#define ws_utf8_char_len(ch)  (ws_utf8_seqlen[(unsigned char)(ch)])
63 * Given a wmem scope, a pointer, and a length, treat the string of bytes
64 * referred to by the pointer and length as a UTF-8 string, and return a
65 * pointer to a UTF-8 string, allocated using the wmem scope, with all
66 * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
67 * according to the recommended "best practices" given in the Unicode
68 * Standard and specified by W3C/WHATWG.
70 WS_DLL_PUBLIC
uint8_t *
71 ws_utf8_make_valid(wmem_allocator_t
*scope
, const uint8_t *ptr
, ssize_t length
);
74 * Same as ws_utf8_make_valid() but returns a wmem_strbuf_t.
76 WS_DLL_PUBLIC wmem_strbuf_t
*
77 ws_utf8_make_valid_strbuf(wmem_allocator_t
*scope
, const uint8_t *ptr
, ssize_t length
);
/**
 * Convert a UTF-8 string to UTF-16.
 *
 * Meant for handing text from GTK+ 2.x (which uses UTF-8) to Windows
 * (which uses UTF-16).
 *
 * @param utf8str The string to convert. May be NULL.
 * @return The string converted to UTF-16, or NULL if utf8str is NULL.
 *         The return value should NOT be freed by the caller.
 */
const wchar_t *utf_8to16(const char *utf8str);
91 /** Create a UTF-16 string (in place) according to the format string.
93 * @param utf16buf The buffer to return the UTF-16 string in.
94 * @param utf16buf_len The size of the 'utf16buf' parameter
95 * @param fmt A standard printf() format string
98 void utf_8to16_snprintf(TCHAR
*utf16buf
, int utf16buf_len
, const char* fmt
, ...)
/**
 * Convert a UTF-16 string to UTF-8.
 *
 * Meant for handing text from Windows (which uses UTF-16) to GTK+ 2.x
 * (which uses UTF-8).
 *
 * @param utf16str The string to convert. May be NULL.
 * @return The string converted to UTF-8, or NULL if utf16str is NULL.
 *         The return value should NOT be freed by the caller.
 */
char *utf_16to8(const wchar_t *utf16str);
/**
 * Convert the supplied program argument list from UTF-16 to UTF-8 and
 * return a pointer to the array of UTF-8 arguments.
 *
 * Intended to be used to normalize command line arguments at program
 * startup.
 *
 * @param argc    The number of arguments.
 * @param wc_argv The argument values (vector), encoded as UTF-16.
 * @return A pointer to the array of UTF-8 arguments.
 */
char **arg_list_utf_16to8(int argc, wchar_t *wc_argv[]);
/*
 * Defines for helping with UTF-16 surrogate pairs.
 *
 * Lead surrogates are the code units 0xD800..0xDBFF and trail surrogates
 * the code units 0xDC00..0xDFFF.  Both ranges are exactly 0x400 values wide
 * and start on a multiple of 0x400, so a code unit belongs to a range
 * precisely when discarding its low ten bits leaves the same value as
 * doing so to the start of that range.  Testing it that way expands the
 * argument only once, so an argument with side effects (for example
 * `*p++`) is evaluated a single time.
 *
 * IS_LEAD_SURROGATE / IS_TRAIL_SURROGATE expect a non-negative integer
 * code unit.
 *
 * SURROGATE_VALUE combines a lead/trail pair into the code point it
 * encodes: the ten payload bits of the lead become the high bits, the ten
 * payload bits of the trail the low bits, and 0x10000 is added.  The
 * caller must already have checked that lead and trail are surrogates of
 * the right kind.
 */
#define IS_LEAD_SURROGATE(uchar2) \
    (((uchar2) >> 10) == (0xd800 >> 10))
#define IS_TRAIL_SURROGATE(uchar2) \
    (((uchar2) >> 10) == (0xdc00 >> 10))
#define SURROGATE_VALUE(lead, trail) \
    (((((lead) - 0xd800) << 10) | ((trail) - 0xdc00)) + 0x10000)
138 #endif /* __UNICODEUTIL_H__ */