packet-ldap: fix regression for SASL handling
[wireshark-sm.git] / epan / charsets.h
blobd5099119d83f5230f07e8143f7666576f22de8b6
1 /* charsets.h
2 * Routines for handling character sets
4 * Wireshark - Network traffic analyzer
5 * By Gerald Combs <gerald@wireshark.org>
6 * Copyright 1998 Gerald Combs
8 * SPDX-License-Identifier: GPL-2.0-or-later
9 */
10 #ifndef __CHARSETS_H__
11 #define __CHARSETS_H__
13 #include "ws_symbol_export.h"
15 #ifdef __cplusplus
16 extern "C" {
17 #endif /* __cplusplus */
20 * Translation tables that map the upper 128 code points in single-byte
21 * "extended ASCII" character encodings to Unicode code points in the
22 * Basic Multilingual Plane.
25 /* Table for windows-1250 */
26 extern const gunichar2 charset_table_cp1250[0x80];
27 /* Table for windows-1251 */
28 extern const gunichar2 charset_table_cp1251[0x80];
29 /* Table for windows-1252 */
30 extern const gunichar2 charset_table_cp1252[0x80];
32 /* Tables for ISO-8859-X */
33 extern const gunichar2 charset_table_iso_8859_2[0x80];
34 extern const gunichar2 charset_table_iso_8859_3[0x80];
35 extern const gunichar2 charset_table_iso_8859_4[0x80];
36 extern const gunichar2 charset_table_iso_8859_5[0x80];
37 extern const gunichar2 charset_table_iso_8859_6[0x80];
38 extern const gunichar2 charset_table_iso_8859_7[0x80];
39 extern const gunichar2 charset_table_iso_8859_8[0x80];
40 extern const gunichar2 charset_table_iso_8859_9[0x80];
41 extern const gunichar2 charset_table_iso_8859_10[0x80];
42 extern const gunichar2 charset_table_iso_8859_11[0x80];
43 extern const gunichar2 charset_table_iso_8859_13[0x80];
44 extern const gunichar2 charset_table_iso_8859_14[0x80];
45 extern const gunichar2 charset_table_iso_8859_15[0x80];
46 extern const gunichar2 charset_table_iso_8859_16[0x80];
48 /* Tables for Mac character sets */
49 extern const gunichar2 charset_table_mac_roman[0x80];
51 /* Tables for DOS code pages */
52 extern const gunichar2 charset_table_cp437[0x80];
53 extern const gunichar2 charset_table_cp855[0x80];
54 extern const gunichar2 charset_table_cp866[0x80];
57 * Translation tables that map the lower 128 code points in single-byte
58 * ISO 646-based character encodings to Unicode code points in the
59 * Basic Multilingual Plane.
61 extern const gunichar2 charset_table_iso_646_basic[0x80];
63 /* Tables for EBCDIC code pages */
64 extern const gunichar2 charset_table_ebcdic[256];
65 extern const gunichar2 charset_table_ebcdic_cp037[256];
68 * Given a wmem scope, a pointer, and a length, treat the string of bytes
69 * referred to by the pointer and length as an ASCII string, with all bytes
70 * with the high-order bit set being invalid, and return a pointer to a
71 * UTF-8 string, allocated using the wmem scope.
73 * Octets with the highest bit set will be converted to the Unicode
74 * REPLACEMENT CHARACTER.
76 WS_DLL_PUBLIC guint8 *
77 get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
80 * Given a wmem scope, a pointer, and a length, treat the string of bytes
81 * referred to by the pointer and length as a UTF-8 string, and return a
82 * pointer to a UTF-8 string, allocated using the wmem scope, with all
83 * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
84 * according to the recommended "best practices" given in the Unicode
85 * Standard and specified by W3C/WHATWG.
87 WS_DLL_PUBLIC guint8 *
88 get_utf_8_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
91 * Given a wmem scope, a pointer, a length, and a translation table,
92 * treat the string of bytes referred to by the pointer and length as a
93 * string encoded using one octet per character, with octets with the
94 * high-order bit clear being mapped by the translation table to 2-byte
95 * Unicode Basic Multilingual Plane characters (including REPLACEMENT
96 * CHARACTER) and octets with the high-order bit set being mapped to
97 * REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string,
98 * allocated using the wmem scope.
100 WS_DLL_PUBLIC guint8 *
101 get_iso_646_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80]);
104 * Given a wmem scope, a pointer, and a length, treat the string of bytes
105 * referred to by the pointer and length as an ISO 8859/1 string, and
106 * return a pointer to a UTF-8 string, allocated using the wmem scope.
108 WS_DLL_PUBLIC guint8 *
109 get_8859_1_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
112 * Given a wmem scope, a pointer, a length, and a translation table with
113 * 128 entries, treat the string of bytes referred to by the pointer and
114 * length as a string encoded using one octet per character, with octets
115 * with the high-order bit clear being ASCII and octets with the high-order
116 * bit set being mapped by the translation table to 2-byte Unicode Basic
117 * Multilingual Plane characters (including REPLACEMENT CHARACTER), and
118 * return a pointer to a UTF-8 string, allocated using the wmem scope.
120 WS_DLL_PUBLIC guint8 *
121 get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80]);
124 * Given a wmem scope, a pointer, and a length, treat the string of bytes
125 * referred to by the pointer and length as a UCS-2 encoded string
126 * containing characters from the Basic Multilingual Plane (plane 0) of
127 * Unicode, and return a pointer to a UTF-8 string, allocated with the
128 * wmem scope.
130 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
132 * Specify length in bytes.
134 * XXX - should map lead and trail surrogate values to REPLACEMENT
135 * CHARACTERs (0xFFFD)?
136 * XXX - if there is an odd number of bytes, should put a
137 * REPLACEMENT CHARACTER at the end.
139 WS_DLL_PUBLIC guint8 *
140 get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
143 * Given a wmem scope, a pointer, and a length, treat the string of bytes
144 * referred to by the pointer and length as a UTF-16 encoded string, and
145 * return a pointer to a UTF-8 string, allocated with the wmem scope.
147 * See RFC 2781 section 2.2.
149 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
151 * Specify length in bytes.
153 * XXX - should map surrogate errors to REPLACEMENT CHARACTERs (0xFFFD).
154 * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
155 * XXX - if there is an odd number of bytes, should put a
156 * REPLACEMENT CHARACTER at the end.
158 WS_DLL_PUBLIC guint8 *
159 get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
162 * Given a wmem scope, a pointer, and a length, treat the string of bytes
163 * referred to by the pointer and length as a UCS-4 encoded string, and
164 * return a pointer to a UTF-8 string, allocated with the wmem scope.
166 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
168 * Specify length in bytes.
170 * XXX - should map lead and trail surrogate values to a "substitute"
171 * UTF-8 character?
172 * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
173 * XXX - if the number of bytes isn't a multiple of 4, should put a
174 * REPLACEMENT CHARACTER at the end.
176 WS_DLL_PUBLIC guint8 *
177 get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
/*
 * NOTE(review): this declaration has no description. Judging only from the
 * name and parameters, it presumably decodes no_of_chars characters of the
 * 3GPP TS 23.038 7-bit alphabet, packed 7 bits per character and starting
 * bit_offset bits into the data at ptr, and returns a string allocated
 * using the wmem scope like the other get_*_string routines declared
 * here - TODO confirm against the implementation.
 */
179 WS_DLL_PUBLIC guint8 *
180 get_ts_23_038_7bits_string_packed(wmem_allocator_t *scope, const guint8 *ptr,
181 const gint bit_offset, gint no_of_chars);
/*
 * NOTE(review): this declaration has no description. Judging only from the
 * name and parameters, it presumably decodes length bytes at ptr as 3GPP
 * TS 23.038 7-bit alphabet characters stored one per octet (not packed),
 * and returns a string allocated using the wmem scope - TODO confirm
 * against the implementation.
 */
183 WS_DLL_PUBLIC guint8 *
184 get_ts_23_038_7bits_string_unpacked(wmem_allocator_t *scope, const guint8 *ptr,
185 gint length);
/*
 * NOTE(review): this declaration has no description. Judging only from the
 * name, it presumably decodes length bytes at ptr using the string coding
 * given in ETSI TS 102 221 Annex A, and returns a string allocated using
 * the wmem scope - TODO confirm against the implementation.
 */
187 WS_DLL_PUBLIC guint8 *
188 get_etsi_ts_102_221_annex_a_string(wmem_allocator_t *scope, const guint8 *ptr,
189 gint length);
/*
 * NOTE(review): this declaration has no description. Judging only from the
 * name and parameters, it presumably decodes no_of_chars ASCII characters
 * packed 7 bits per character, starting bit_offset bits into the data at
 * ptr, and returns a string allocated using the wmem scope - TODO confirm
 * against the implementation.
 */
191 WS_DLL_PUBLIC guint8 *
192 get_ascii_7bits_string(wmem_allocator_t *scope, const guint8 *ptr,
193 const gint bit_offset, gint no_of_chars);
196 * Given a wmem scope, a pointer, a length, and a translation table with
197 * 256 entries, treat the string of bytes referred to by the pointer and
198 * length as a string encoded using one octet per character, with octets
199 * being mapped by the translation table to 2-byte Unicode Basic Multilingual
200 * Plane characters (including REPLACEMENT CHARACTER), and return a
201 * pointer to a UTF-8 string, allocated using the wmem scope.
203 WS_DLL_PUBLIC guint8 *
204 get_nonascii_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[256]);
207 * Given a wmem scope, a pointer, and a length, treat the bytes referred to
208 * by the pointer and length as a GB18030 encoded string, and return a pointer
209 * to a UTF-8 string, allocated using the wmem scope, with invalid byte
210 * sequences replaced by REPLACEMENT CHARACTER as described in the Unicode
211 * Standard, section 5.22 "U+FFFD Substitution in Conversion".
212 * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
214 * GB18030 is a superset of GBK and GB2312, so this also decodes those.
216 WS_DLL_PUBLIC guint8 *
217 get_gb18030_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
220 * Given a wmem scope, a pointer, and a length, treat the bytes referred to
221 * by the pointer and length as an EUC-KR encoded string, and return a pointer
222 * to a UTF-8 string, allocated using the wmem scope, with invalid byte
223 * sequences replaced by REPLACEMENT CHARACTER as described in the Unicode
224 * Standard, section 5.22 "U+FFFD Substitution in Conversion".
225 * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
227 WS_DLL_PUBLIC guint8 *
228 get_euc_kr_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
/*
 * NOTE(review): this declaration has no description. Judging only from the
 * name and parameters, it presumably treats the length bytes at ptr as a
 * T.61 encoded string and returns a string allocated using the wmem
 * scope - TODO confirm against the implementation.
 */
230 WS_DLL_PUBLIC guint8 *
231 get_t61_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
233 #if 0
234 void ASCII_to_EBCDIC(guint8 *buf, guint bytes);
235 guint8 ASCII_to_EBCDIC1(guint8 c);
236 #endif
/*
 * NOTE(review): these declarations have no description. EBCDIC_to_ASCII
 * takes a non-const buffer and a byte count and returns nothing, so it
 * presumably converts the buffer from EBCDIC to ASCII in place;
 * EBCDIC_to_ASCII1 presumably returns the ASCII equivalent of the single
 * EBCDIC octet c - TODO confirm against the implementation.
 */
237 WS_DLL_PUBLIC
238 void EBCDIC_to_ASCII(guint8 *buf, guint bytes);
239 WS_DLL_PUBLIC
240 guint8 EBCDIC_to_ASCII1(guint8 c);
242 #ifdef __cplusplus
244 #endif /* __cplusplus */
246 #endif /* __CHARSETS_H__ */
249 * Editor modelines - https://www.wireshark.org/tools/modelines.html
251 * Local variables:
252 * c-basic-offset: 4
253 * tab-width: 8
254 * indent-tabs-mode: nil
255 * End:
257 * vi: set shiftwidth=4 tabstop=8 expandtab:
258 * :indentSize=4:tabSize=8:noTabs=true: