regen pidl all: rm epan/dissectors/pidl/*-stamp; pushd epan/dissectors/pidl/ && make...
[wireshark-sm.git] / epan / charsets.c
blob79fc77ac7581a4110ad36e45a4f52f7c9d5669a8
1 /* charsets.c
2 * Routines for handling character sets
4 * Wireshark - Network traffic analyzer
5 * By Gerald Combs <gerald@wireshark.org>
6 * Copyright 1998 Gerald Combs
8 * SPDX-License-Identifier: GPL-2.0-or-later
9 */
11 #include "config.h"
13 #include <errno.h>
14 #include <glib.h>
16 #include <epan/proto.h>
17 #include <epan/wmem_scopes.h>
19 #include <wsutil/pint.h>
20 #include <wsutil/unicode-utils.h>
22 #include "charsets.h"
/*
 * UNREPL is a short spelling of UNICODE_REPLACEMENT_CHARACTER.  It is
 * six characters wide, the same width as a "0xhhhh" table entry, so
 * the columns of the mapping tables in this file stay aligned.
 */
#define UNREPL UNICODE_REPLACEMENT_CHARACTER

/* U+FEFF ZERO WIDTH NON-BREAKING SPACE, informally the byte order mark (BOM) */
#define BYTE_ORDER_MARK 0xFEFF
35 * Wikipedia's "Character encoding" template, giving a pile of character
36 * encodings and Wikipedia pages for them:
38 * http://en.wikipedia.org/wiki/Template:Character_encoding
40 * Unicode character encoding model:
42 * https://www.unicode.org/reports/tr17/
44 * International Components for Unicode character set mapping tables:
46 * http://site.icu-project.org/charts/charset
48 * MSDN information on code pages:
50 * https://docs.microsoft.com/en-us/windows/win32/intl/code-pages
52 * ASCII-based code pages, from IBM:
54 * http://www-01.ibm.com/software/globalization/cp/cp_cpgid.html
56 * EBCDIC code pages, from IBM:
58 * http://www-03.ibm.com/systems/i/software/globalization/codepages.html
60 * The IBM pages are no longer available; the versions archived on the
61 * Wayback Machine are, but the links to the PDF and text versions of
62 * the code pages don't all work (do *any* work?).
64 * Mappings to Unicode at the Unicode Consortium:
66 * https://www.unicode.org/Public/MAPPINGS/
68 * Of note, the VENDORS/MICSFT directory not only has various Windows
69 * and DOS code pages, but also several of the common MAC and EBCDIC
70 * code page mappings to Unicode.
74 * Given a wmem scope, a pointer, and a length, treat the string of bytes
75 * referred to by the pointer and length as an ASCII string, with all bytes
76 * with the high-order bit set being invalid, and return a pointer to a
77 * UTF-8 string, allocated using the wmem scope.
79 * Octets with the highest bit set will be converted to the Unicode
80 * REPLACEMENT CHARACTER.
82 uint8_t *
83 get_ascii_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
85 wmem_strbuf_t *str;
86 const uint8_t *prev = ptr;
87 size_t valid_bytes = 0;
89 str = wmem_strbuf_new_sized(scope, length+1);
91 while (length > 0) {
92 uint8_t ch = *ptr++;
94 if (ch < 0x80) {
95 valid_bytes++;
96 } else {
97 if (valid_bytes) {
98 wmem_strbuf_append_len(str, prev, valid_bytes);
99 valid_bytes = 0;
101 prev = ptr;
102 wmem_strbuf_append_unichar_repl(str);
104 length--;
106 if (valid_bytes) {
107 wmem_strbuf_append_len(str, prev, valid_bytes);
110 return (uint8_t *) wmem_strbuf_finalize(str);
113 uint8_t *
114 get_utf_8_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
116 return ws_utf8_make_valid(scope, ptr, length);
120 * ISO 646 "Basic code table".
122 const gunichar2 charset_table_iso_646_basic[0x80] = {
123 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, /* 0x00 - */
124 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* - 0x0F */
125 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, /* 0x10 - */
126 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* - 0x1F */
127 0x0020, 0x0021, 0x0022, UNREPL, UNREPL, 0x0025, 0x0026, 0x0027, /* 0x20 - */
128 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* - 0x2F */
129 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, /* 0x30 - */
130 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* - 0x3F */
131 UNREPL, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, /* 0x40 - */
132 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* - 0x4F */
133 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, /* 0x50 - */
134 0x0058, 0x0059, 0x005a, UNREPL, UNREPL, UNREPL, UNREPL, 0x005f, /* - 0x5F */
135 UNREPL, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, /* 0x60 - */
136 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* - 0x6F */
137 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, /* 0x70 - */
138 0x0078, 0x0079, 0x007a, UNREPL, UNREPL, UNREPL, UNREPL, 0x007f, /* - 0x7F */
142 * Given a wmem scope, a pointer, a length, and a translation table,
143 * treat the string of bytes referred to by the pointer and length as a
144 * string encoded using one octet per character, with octets with the
145 * high-order bit clear being mapped by the translation table to 2-byte
146 * Unicode Basic Multilingual Plane characters (including REPLACEMENT
147 * CHARACTER) and octets with the high-order bit set being mapped to
148 * REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string,
149 * allocated using the wmem scope.
151 uint8_t *
152 get_iso_646_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[0x80])
154 wmem_strbuf_t *str;
156 str = wmem_strbuf_new_sized(scope, length+1);
158 while (length > 0) {
159 uint8_t ch = *ptr;
161 if (ch < 0x80)
162 wmem_strbuf_append_unichar(str, table[ch]);
163 else
164 wmem_strbuf_append_unichar_repl(str);
165 ptr++;
166 length--;
169 return (uint8_t *) wmem_strbuf_finalize(str);
173 * Given a wmem scope, a pointer, and a length, treat the string of bytes
174 * referred to by the pointer and length as an ISO 8859/1 string, and
175 * return a pointer to a UTF-8 string, allocated using the wmem scope.
177 uint8_t *
178 get_8859_1_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
180 wmem_strbuf_t *str;
182 str = wmem_strbuf_new_sized(scope, length+1);
184 while (length > 0) {
185 uint8_t ch = *ptr;
187 if (ch < 0x80)
188 wmem_strbuf_append_c(str, ch);
189 else {
191 * Note: we assume here that the code points
192 * 0x80-0x9F are used for C1 control characters,
193 * and thus have the same value as the corresponding
194 * Unicode code points.
196 wmem_strbuf_append_unichar(str, ch);
198 ptr++;
199 length--;
202 return (uint8_t *) wmem_strbuf_finalize(str);
206 * Translation tables that map the upper 128 code points in single-byte
207 * "extended ASCII" character encodings to Unicode code points in the
208 * Basic Multilingual Plane.
211 /* ISO-8859-2 (https://en.wikipedia.org/wiki/ISO/IEC_8859-2#Code_page_layout) */
212 const gunichar2 charset_table_iso_8859_2[0x80] = {
213 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
214 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
215 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
216 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
217 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, /* 0xA0 - */
218 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b, /* - 0xAF */
219 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, /* 0xB0 - */
220 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c, /* - 0xBF */
221 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, /* 0xC0 - */
222 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, /* - 0xCF */
223 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, /* 0xD0 - */
224 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, /* - 0xDF */
225 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, /* 0xE0 - */
226 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, /* - 0xEF */
227 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, /* 0xF0 - */
228 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9 /* - 0xFF */
231 /* generated by ../tools/make_charset_ISO-8859-3 */
232 const gunichar2 charset_table_iso_8859_3[0x80] = {
233 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
234 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
235 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
236 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
237 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, UNREPL, 0x0124, 0x00a7, /* 0xA0 - */
238 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, UNREPL, 0x017b, /* - 0xAF */
239 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, /* 0xB0 - */
240 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, UNREPL, 0x017c, /* - 0xBF */
241 0x00c0, 0x00c1, 0x00c2, UNREPL, 0x00c4, 0x010a, 0x0108, 0x00c7, /* 0xC0 - */
242 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
243 UNREPL, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, /* 0xD0 - */
244 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df, /* - 0xDF */
245 0x00e0, 0x00e1, 0x00e2, UNREPL, 0x00e4, 0x010b, 0x0109, 0x00e7, /* 0xE0 - */
246 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
247 UNREPL, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, /* 0xF0 - */
248 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9, /* - 0xFF */
251 /* generated by ../tools/make_charset_ISO-8859-4 */
252 const gunichar2 charset_table_iso_8859_4[0x80] = {
253 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
254 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
255 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
256 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
257 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, /* 0xA0 - */
258 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af, /* - 0xAF */
259 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, /* 0xB0 - */
260 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b, /* - 0xBF */
261 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, /* 0xC0 - */
262 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a, /* - 0xCF */
263 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */
264 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df, /* - 0xDF */
265 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, /* 0xE0 - */
266 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b, /* - 0xEF */
267 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */
268 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9, /* - 0xFF */
271 /* ISO-8859-5 (https://en.wikipedia.org/wiki/ISO/IEC_8859-5#Code_page_layout) */
272 const gunichar2 charset_table_iso_8859_5[0x80] = {
273 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
274 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
275 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
276 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
277 0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, /* 0xA0 - */
278 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f, /* - 0xAF */
279 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, /* 0xB0 - */
280 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, /* - 0xBF */
281 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, /* 0xC0 - */
282 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, /* - 0xCF */
283 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, /* 0xD0 - */
284 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, /* - 0xDF */
285 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, /* 0xE0 - */
286 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, /* - 0xEF */
287 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, /* 0xF0 - */
288 0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f /* - 0xFF */
291 /* generated by ../tools/make_charset_ISO-8859-6 */
292 const gunichar2 charset_table_iso_8859_6[0x80] = {
293 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
294 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
295 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
296 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
297 0x00a0, UNREPL, UNREPL, UNREPL, 0x00a4, UNREPL, UNREPL, UNREPL, /* 0xA0 - */
298 UNREPL, UNREPL, UNREPL, UNREPL, 0x060c, 0x00ad, UNREPL, UNREPL, /* - 0xAF */
299 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xB0 - */
300 UNREPL, UNREPL, UNREPL, 0x061b, UNREPL, UNREPL, UNREPL, 0x061f, /* - 0xBF */
301 UNREPL, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627, /* 0xC0 - */
302 0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f, /* - 0xCF */
303 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637, /* 0xD0 - */
304 0x0638, 0x0639, 0x063a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xDF */
305 0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647, /* 0xE0 - */
306 0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, /* - 0xEF */
307 0x0650, 0x0651, 0x0652, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xF0 - */
308 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xFF */
311 /* generated by ../tools/make_charset_ISO-8859-7 */
312 const gunichar2 charset_table_iso_8859_7[0x80] = {
313 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
314 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
315 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
316 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
317 0x00a0, 0x2018, 0x2019, 0x00a3, 0x20ac, 0x20af, 0x00a6, 0x00a7, /* 0xA0 - */
318 0x00a8, 0x00a9, 0x037a, 0x00ab, 0x00ac, 0x00ad, UNREPL, 0x2015, /* - 0xAF */
319 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7, /* 0xB0 - */
320 0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f, /* - 0xBF */
321 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, /* 0xC0 - */
322 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, /* - 0xCF */
323 0x03a0, 0x03a1, UNREPL, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, /* 0xD0 - */
324 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af, /* - 0xDF */
325 0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, /* 0xE0 - */
326 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, /* - 0xEF */
327 0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7, /* 0xF0 - */
328 0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, UNREPL, /* - 0xFF */
331 /* generated by ../tools/make_charset_ISO-8859-8 */
332 const gunichar2 charset_table_iso_8859_8[0x80] = {
333 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
334 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
335 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
336 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
337 0x00a0, UNREPL, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, /* 0xA0 - */
338 0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */
339 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
340 0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be, UNREPL, /* - 0xBF */
341 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xC0 - */
342 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xCF */
343 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xD0 - */
344 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, 0x2017, /* - 0xDF */
345 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7, /* 0xE0 - */
346 0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df, /* - 0xEF */
347 0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7, /* 0xF0 - */
348 0x05e8, 0x05e9, 0x05ea, UNREPL, UNREPL, 0x200e, 0x200f, UNREPL, /* - 0xFF */
351 /* ISO-8859-9 (https://en.wikipedia.org/wiki/ISO/IEC_8859-9#Code_page_layout) */
352 const gunichar2 charset_table_iso_8859_9[0x80] = {
353 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
354 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
355 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
356 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
357 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, /* 0xA0 - */
358 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */
359 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
360 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, /* - 0xBF */
361 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */
362 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
363 0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */
364 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df, /* - 0xDF */
365 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */
366 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
367 0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */
368 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff /* - 0xFF */
371 /* generated by ../tools/make_charset_ISO-8859-10 */
372 const gunichar2 charset_table_iso_8859_10[0x80] = {
373 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
374 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
375 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
376 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
377 0x00a0, 0x0104, 0x0112, 0x0122, 0x012a, 0x0128, 0x0136, 0x00a7, /* 0xA0 - */
378 0x013b, 0x0110, 0x0160, 0x0166, 0x017d, 0x00ad, 0x016a, 0x014a, /* - 0xAF */
379 0x00b0, 0x0105, 0x0113, 0x0123, 0x012b, 0x0129, 0x0137, 0x00b7, /* 0xB0 - */
380 0x013c, 0x0111, 0x0161, 0x0167, 0x017e, 0x2015, 0x016b, 0x014b, /* - 0xBF */
381 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, /* 0xC0 - */
382 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
383 0x00d0, 0x0145, 0x014c, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0168, /* 0xD0 - */
384 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* - 0xDF */
385 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, /* 0xE0 - */
386 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
387 0x00f0, 0x0146, 0x014d, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x0169, /* 0xF0 - */
388 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x0138, /* - 0xFF */
391 /* generated by ../tools/make_charset_ISO-8859-11 */
392 const gunichar2 charset_table_iso_8859_11[0x80] = {
393 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
394 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
395 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
396 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
397 0x00a0, 0x0e01, 0x0e02, 0x0e03, 0x0e04, 0x0e05, 0x0e06, 0x0e07, /* 0xA0 - */
398 0x0e08, 0x0e09, 0x0e0a, 0x0e0b, 0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f, /* - 0xAF */
399 0x0e10, 0x0e11, 0x0e12, 0x0e13, 0x0e14, 0x0e15, 0x0e16, 0x0e17, /* 0xB0 - */
400 0x0e18, 0x0e19, 0x0e1a, 0x0e1b, 0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f, /* - 0xBF */
401 0x0e20, 0x0e21, 0x0e22, 0x0e23, 0x0e24, 0x0e25, 0x0e26, 0x0e27, /* 0xC0 - */
402 0x0e28, 0x0e29, 0x0e2a, 0x0e2b, 0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f, /* - 0xCF */
403 0x0e30, 0x0e31, 0x0e32, 0x0e33, 0x0e34, 0x0e35, 0x0e36, 0x0e37, /* 0xD0 - */
404 0x0e38, 0x0e39, 0x0e3a, UNREPL, UNREPL, UNREPL, UNREPL, 0x0e3f, /* - 0xDF */
405 0x0e40, 0x0e41, 0x0e42, 0x0e43, 0x0e44, 0x0e45, 0x0e46, 0x0e47, /* 0xE0 - */
406 0x0e48, 0x0e49, 0x0e4a, 0x0e4b, 0x0e4c, 0x0e4d, 0x0e4e, 0x0e4f, /* - 0xEF */
407 0x0e50, 0x0e51, 0x0e52, 0x0e53, 0x0e54, 0x0e55, 0x0e56, 0x0e57, /* 0xF0 - */
408 0x0e58, 0x0e59, 0x0e5a, 0x0e5b, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xFF */
411 /* generated by ../tools/make_charset_ISO-8859-13 */
412 const gunichar2 charset_table_iso_8859_13[0x80] = {
413 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
414 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
415 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
416 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
417 0x00a0, 0x201d, 0x00a2, 0x00a3, 0x00a4, 0x201e, 0x00a6, 0x00a7, /* 0xA0 - */
418 0x00d8, 0x00a9, 0x0156, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00c6, /* - 0xAF */
419 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x201c, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
420 0x00f8, 0x00b9, 0x0157, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00e6, /* - 0xBF */
421 0x0104, 0x012e, 0x0100, 0x0106, 0x00c4, 0x00c5, 0x0118, 0x0112, /* 0xC0 - */
422 0x010c, 0x00c9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012a, 0x013b, /* - 0xCF */
423 0x0160, 0x0143, 0x0145, 0x00d3, 0x014c, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */
424 0x0172, 0x0141, 0x015a, 0x016a, 0x00dc, 0x017b, 0x017d, 0x00df, /* - 0xDF */
425 0x0105, 0x012f, 0x0101, 0x0107, 0x00e4, 0x00e5, 0x0119, 0x0113, /* 0xE0 - */
426 0x010d, 0x00e9, 0x017a, 0x0117, 0x0123, 0x0137, 0x012b, 0x013c, /* - 0xEF */
427 0x0161, 0x0144, 0x0146, 0x00f3, 0x014d, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */
428 0x0173, 0x0142, 0x015b, 0x016b, 0x00fc, 0x017c, 0x017e, 0x2019, /* - 0xFF */
431 /* generated by ../tools/make_charset_ISO-8859-14 */
432 const gunichar2 charset_table_iso_8859_14[0x80] = {
433 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
434 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
435 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
436 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
437 0x00a0, 0x1e02, 0x1e03, 0x00a3, 0x010a, 0x010b, 0x1e0a, 0x00a7, /* 0xA0 - */
438 0x1e80, 0x00a9, 0x1e82, 0x1e0b, 0x1ef2, 0x00ad, 0x00ae, 0x0178, /* - 0xAF */
439 0x1e1e, 0x1e1f, 0x0120, 0x0121, 0x1e40, 0x1e41, 0x00b6, 0x1e56, /* 0xB0 - */
440 0x1e81, 0x1e57, 0x1e83, 0x1e60, 0x1ef3, 0x1e84, 0x1e85, 0x1e61, /* - 0xBF */
441 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */
442 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
443 0x0174, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x1e6a, /* 0xD0 - */
444 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x0176, 0x00df, /* - 0xDF */
445 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */
446 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
447 0x0175, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x1e6b, /* 0xF0 - */
448 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x0177, 0x00ff, /* - 0xFF */
451 /* generated by ../tools/make_charset_ISO-8859-15 */
452 const gunichar2 charset_table_iso_8859_15[0x80] = {
453 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
454 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
455 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
456 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
457 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x20ac, 0x00a5, 0x0160, 0x00a7, /* 0xA0 - */
458 0x0161, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */
459 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x017d, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
460 0x017e, 0x00b9, 0x00ba, 0x00bb, 0x0152, 0x0153, 0x0178, 0x00bf, /* - 0xBF */
461 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */
462 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
463 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */
464 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* - 0xDF */
465 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */
466 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
467 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */
468 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, /* - 0xFF */
471 /* generated by ../tools/make_charset_ISO-8859-16 */
472 const gunichar2 charset_table_iso_8859_16[0x80] = {
473 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
474 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
475 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
476 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
477 0x00a0, 0x0104, 0x0105, 0x0141, 0x20ac, 0x201e, 0x0160, 0x00a7, /* 0xA0 - */
478 0x0161, 0x00a9, 0x0218, 0x00ab, 0x0179, 0x00ad, 0x017a, 0x017b, /* - 0xAF */
479 0x00b0, 0x00b1, 0x010c, 0x0142, 0x017d, 0x201d, 0x00b6, 0x00b7, /* 0xB0 - */
480 0x017e, 0x010d, 0x0219, 0x00bb, 0x0152, 0x0153, 0x0178, 0x017c, /* - 0xBF */
481 0x00c0, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0106, 0x00c6, 0x00c7, /* 0xC0 - */
482 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
483 0x0110, 0x0143, 0x00d2, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x015a, /* 0xD0 - */
484 0x0170, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0118, 0x021a, 0x00df, /* - 0xDF */
485 0x00e0, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x0107, 0x00e6, 0x00e7, /* 0xE0 - */
486 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
487 0x0111, 0x0144, 0x00f2, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x015b, /* 0xF0 - */
488 0x0171, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0119, 0x021b, 0x00ff, /* - 0xFF */
492 * Windows-1250
494 * See:
495 * httpss://en.wikipedia.org/wiki/Windows-1250)
496 * https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT
498 const gunichar2 charset_table_cp1250[0x80] = {
499 0x20ac, UNREPL, 0x201a, UNREPL, 0x201e, 0x2026, 0x2020, 0x2021, /* 0x80 - */
500 UNREPL, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179, /* - 0x8F */
501 UNREPL, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, /* 0x90 - */
502 UNREPL, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a, /* - 0x9F */
503 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7, /* 0xA0 - */
504 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b, /* - 0xAF */
505 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
506 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c, /* - 0xBF */
507 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, /* 0xC0 - */
508 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, /* - 0xCF */
509 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, /* 0xD0 - */
510 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, /* - 0xDF */
511 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, /* 0xE0 - */
512 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, /* - 0xEF */
513 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, /* 0xF0 - */
514 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, /* - 0xFF */
518 * Windows-1251
520 * See:
521 * https://en.wikipedia.org/wiki/Windows-1251
522 * https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT
524 const gunichar2 charset_table_cp1251[0x80] = {
525 0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021, /* 0x80 - */
526 0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040B, 0x040f, /* - 0x8F */
527 0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, /* 0x90 - */
528 UNREPL, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f, /* - 0x9F */
529 0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7, /* 0xA0 - */
530 0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407, /* - 0xAF */
531 0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
532 0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457, /* - 0xBF */
533 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, /* 0xC0 - */
534 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, /* - 0xCF */
535 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, /* 0xD0 - */
536 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, /* - 0xDF */
537 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, /* 0xE0 - */
538 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, /* - 0xEF */
539 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, /* 0xF0 - */
540 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, /* - 0xFF */
544 * Windows-1252
546 * See:
547 * https://en.wikipedia.org/wiki/Windows-1252
548 * https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
550 const gunichar2 charset_table_cp1252[0x80] = {
551 0x20ac, UNREPL, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021, /* 0x80 - */
552 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, UNREPL, 0x0172, UNREPL, /* - 0x8F */
553 UNREPL, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, /* 0x90 - */
554 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, UNREPL, 0x0173, 0x0178, /* - 0x9F */
555 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, /* 0xA0 - */
556 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */
557 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
558 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, /* - 0xBF */
559 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */
560 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
561 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */
562 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* - 0xDF */
563 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */
564 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
565 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */
566 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, /* - 0xFF */
569 /* generated by ./make_charset_table MACROMAN */
570 /* That's "MacRoman", not "Macro Man" (faster than a speeding recursive expansion!) */
571 const gunichar2 charset_table_mac_roman[0x80] = {
572 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1, /* 0x80 - */
573 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8, /* - 0x8F */
574 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3, /* 0x90 - */
575 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc, /* - 0x9F */
576 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df, /* 0xA0 - */
577 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8, /* - 0xAF */
578 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211, /* 0xB0 - */
579 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x00e6, 0x00f8, /* - 0xBF */
580 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab, /* 0xC0 - */
581 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153, /* - 0xCF */
582 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca, /* 0xD0 - */
583 0x00ff, 0x0178, 0x2044, 0x20ac, 0x2039, 0x203a, 0xfb01, 0xfb02, /* - 0xDF */
584 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1, /* 0xE0 - */
585 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4, /* - 0xEF */
586 0xf8ff, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc, /* 0xF0 - */
587 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7, /* - 0xFF */
590 /* generated by ./make_charset_table CP437 */
591 const gunichar2 charset_table_cp437[0x80] = {
592 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, /* 0x80 - */
593 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, /* - 0x8F */
594 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, /* 0x90 - */
595 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192, /* - 0x9F */
596 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, /* 0xA0 - */
597 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, /* - 0xAF */
598 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, /* 0xB0 - */
599 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510, /* - 0xBF */
600 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, /* 0xC0 - */
601 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, /* - 0xCF */
602 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, /* 0xD0 - */
603 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, /* - 0xDF */
604 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, /* 0xE0 - */
605 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229, /* - 0xEF */
606 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, /* 0xF0 - */
607 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0, /* - 0xFF */
611 * CP855
613 * See
614 * https://en.wikipedia.org/wiki/CP855
615 * https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP855.TXT
617 * XXX - this doesn't have the graphics for 0x00 through 0x1F shown
618 * on the Wikipedia page, but not in the Microsoft mapping file;
619 * that would require a 256-code-point mapping table. (Are those
620 * positions used for the same graphics on all code pages - the PC
621 * graphics set, or whatever it's called?)
623 const gunichar2 charset_table_cp855[0x80] = {
624 0x0452, 0x0402, 0x0453, 0x0403, 0x0451, 0x0401, 0x0454, 0x0404, /* 0x80 - */
625 0x0455, 0x0405, 0x0456, 0x0406, 0x0457, 0x0407, 0x0458, 0x0408, /* - 0x8F */
626 0x0459, 0x0409, 0x045a, 0x040a, 0x045b, 0x040b, 0x045c, 0x040c, /* 0x90 - */
627 0x045e, 0x040e, 0x045f, 0x040f, 0x044e, 0x042e, 0x044a, 0x042a, /* - 0x9F */
628 0x0430, 0x0410, 0x0431, 0x0411, 0x0446, 0x0426, 0x0434, 0x0414, /* 0xA0 - */
629 0x0435, 0x0415, 0x0444, 0x0424, 0x0433, 0x0413, 0x00ab, 0x00bb, /* - 0xAF */
630 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x0445, 0x0425, 0x0438, /* 0xB0 - */
631 0x0418, 0x2563, 0x2551, 0x2557, 0x2550, 0x0439, 0x0419, 0x2510, /* - 0xBF */
632 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x043a, 0x041a, /* 0xC0 - */
633 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, /* - 0xCF */
634 0x043b, 0x041b, 0x043c, 0x041c, 0x043d, 0x041d, 0x043e, 0x041e, /* 0xD0 - */
635 0x043f, 0x2518, 0x250c, 0x2588, 0x2584, 0x041f, 0x044f, 0x2580, /* - 0xDF */
636 0x042f, 0x0440, 0x0420, 0x0441, 0x0421, 0x0442, 0x0422, 0x0443, /* 0xE0 - */
637 0x0423, 0x0436, 0x0416, 0x0432, 0x0412, 0x044c, 0x042c, 0x2116, /* - 0xEF */
638 0x00ad, 0x044b, 0x042b, 0x0437, 0x0417, 0x0448, 0x0428, 0x044d, /* 0xF0 - */
639 0x042d, 0x0449, 0x0429, 0x0447, 0x0427, 0x00a7, 0x25a0, 0x00a0, /* - 0xFF */
643 * CP866
645 * See:
646 * https://en.wikipedia.org/wiki/CP866
647 * https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT
649 const gunichar2 charset_table_cp866[0x80] = {
650 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, /* 0x80 - */
651 0x0418, 0x0419, 0x041A, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, /* - 0x8F */
652 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, /* 0x90 - */
653 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, /* - 0x9F */
654 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, /* 0xA0 - */
655 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, /* - 0xAF */
656 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, /* 0xB0 - */
657 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510, /* - 0xBF */
658 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, /* 0xC0 - */
659 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, /* - 0xCF */
660 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, /* 0xD0 - */
661 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, /* - 0xDF */
662 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, /* 0xE0 - */
663 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, /* - 0xEF */
664 0x0401, 0x0451, 0x0404, 0x0454, 0x0407, 0x0457, 0x040e, 0x045e, /* 0xF0 - */
665 0x00b0, 0x2219, 0x00b7, 0x221a, 0x2216, 0x00a4, 0x25a0, 0x00a0, /* - 0xFF */
669 * Given a wmem scope, a pointer, a length, and a translation table with
670 * 128 entries, treat the string of bytes referred to by the pointer and
671 * length as a string encoded using one octet per character, with octets
672 * with the high-order bit clear being ASCII and octets with the high-order
673 * bit set being mapped by the translation table to 2-byte Unicode Basic
674 * Multilingual Plane characters (including REPLACEMENT CHARACTER), and
675 * return a pointer to a UTF-8 string, allocated using the wmem scope.
677 uint8_t *
678 get_unichar2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[0x80])
680 wmem_strbuf_t *str;
682 str = wmem_strbuf_new_sized(scope, length+1);
684 while (length > 0) {
685 uint8_t ch = *ptr;
687 if (ch < 0x80)
688 wmem_strbuf_append_c(str, ch);
689 else
690 wmem_strbuf_append_unichar(str, table[ch-0x80]);
691 ptr++;
692 length--;
695 return (uint8_t *) wmem_strbuf_finalize(str);
699 * Given a wmem scope, a pointer, and a length, treat the string of bytes
700 * referred to by the pointer and length as a UCS-2 encoded string
701 * containing characters from the Basic Multilingual Plane (plane 0) of
702 * Unicode, and return a pointer to a UTF-8 string, allocated with the
703 * wmem scope.
705 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
706 * possibly ORed with ENC_BOM.
708 * Specify length in bytes.
710 uint8_t *
711 get_ucs_2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding)
713 gunichar2 uchar;
714 int i = 0; /* Byte counter for string */
715 wmem_strbuf_t *strbuf;
717 strbuf = wmem_strbuf_new_sized(scope, length+1);
719 if (encoding & ENC_BOM && length >= 2) {
720 if (pletoh16(ptr) == BYTE_ORDER_MARK) {
721 encoding = ENC_LITTLE_ENDIAN;
722 i += 2;
723 } else if (pntoh16(ptr) == BYTE_ORDER_MARK) {
724 encoding = ENC_BIG_ENDIAN;
725 i += 2;
729 encoding = encoding & ENC_LITTLE_ENDIAN;
731 for(; i + 1 < length; i += 2) {
732 if (encoding == ENC_BIG_ENDIAN) {
733 uchar = pntoh16(ptr + i);
734 } else {
735 uchar = pletoh16(ptr + i);
737 wmem_strbuf_append_unichar_validated(strbuf, uchar);
741 * If i < length, this means we were handed an odd number of bytes;
742 * insert a REPLACEMENT CHARACTER to mark the error.
744 if (i < length) {
745 wmem_strbuf_append_unichar_repl(strbuf);
747 return (uint8_t *) wmem_strbuf_finalize(strbuf);
751 * Given a wmem scope, a pointer, and a length, treat the string of bytes
752 * referred to by the pointer and length as a UTF-16 encoded string, and
753 * return a pointer to a UTF-8 string, allocated with the wmem scope.
755 * See RFC 2781 section 2.2.
757 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
758 * possibly ORed with ENC_BOM.
760 * Specify length in bytes.
762 uint8_t *
763 get_utf_16_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding)
765 wmem_strbuf_t *strbuf;
766 gunichar2 uchar2, lead_surrogate;
767 gunichar uchar;
768 int i = 0; /* Byte counter for string */
770 strbuf = wmem_strbuf_new_sized(scope, length+1);
772 if (encoding & ENC_BOM && length >= 2) {
773 if (pletoh16(ptr) == BYTE_ORDER_MARK) {
774 encoding = ENC_LITTLE_ENDIAN;
775 i += 2;
776 } else if (pntoh16(ptr) == BYTE_ORDER_MARK) {
777 encoding = ENC_BIG_ENDIAN;
778 i += 2;
782 encoding = encoding & ENC_LITTLE_ENDIAN;
784 for(; i + 1 < length; i += 2) {
785 if (encoding == ENC_BIG_ENDIAN)
786 uchar2 = pntoh16(ptr + i);
787 else
788 uchar2 = pletoh16(ptr + i);
790 if (IS_LEAD_SURROGATE(uchar2)) {
792 * Lead surrogate. Must be followed by
793 * a trail surrogate.
795 i += 2;
796 if (i + 1 >= length) {
798 * Oops, string ends with a lead surrogate.
800 * Insert a REPLACEMENT CHARACTER to mark the error,
801 * and quit.
803 wmem_strbuf_append_unichar(strbuf, UNREPL);
804 break;
806 lead_surrogate = uchar2;
807 if (encoding == ENC_BIG_ENDIAN)
808 uchar2 = pntoh16(ptr + i);
809 else
810 uchar2 = pletoh16(ptr + i);
811 if (IS_TRAIL_SURROGATE(uchar2)) {
812 /* Trail surrogate. */
813 uchar = SURROGATE_VALUE(lead_surrogate, uchar2);
814 wmem_strbuf_append_unichar(strbuf, uchar);
815 } else {
817 * Not a trail surrogate.
819 * Insert a REPLACEMENT CHARACTER to mark the error,
820 * and continue;
822 wmem_strbuf_append_unichar(strbuf, UNREPL);
824 } else {
825 if (IS_TRAIL_SURROGATE(uchar2)) {
827 * Trail surrogate without a preceding
828 * lead surrogate.
830 * Insert a REPLACEMENT CHARACTER to mark the error,
831 * and continue;
833 wmem_strbuf_append_unichar(strbuf, UNREPL);
834 } else {
836 * Non-surrogate; just append it.
838 wmem_strbuf_append_unichar(strbuf, uchar2);
844 * If i < length, this means we were handed an odd number of bytes,
845 * so we're not a valid UTF-16 string; insert a REPLACEMENT CHARACTER
846 * to mark the error.
848 if (i < length)
849 wmem_strbuf_append_unichar(strbuf, UNREPL);
850 return (uint8_t *) wmem_strbuf_finalize(strbuf);
854 * Given a wmem scope, a pointer, and a length, treat the string of bytes
855 * referred to by the pointer and length as a UCS-4 encoded string, and
856 * return a pointer to a UTF-8 string, allocated with the wmem scope.
858 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
860 * Specify length in bytes
862 uint8_t *
863 get_ucs_4_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding)
865 gunichar uchar;
866 int i = 0; /* Byte counter for string */
867 wmem_strbuf_t *strbuf;
869 strbuf = wmem_strbuf_new_sized(scope, length+1);
871 if (encoding & ENC_BOM && length >= 4) {
872 if (pletoh32(ptr) == BYTE_ORDER_MARK) {
873 encoding = ENC_LITTLE_ENDIAN;
874 i += 4;
875 } else if (pntoh32(ptr) == BYTE_ORDER_MARK) {
876 encoding = ENC_BIG_ENDIAN;
877 i += 4;
881 encoding = encoding & ENC_LITTLE_ENDIAN;
883 for(; i + 3 < length; i += 4) {
884 if (encoding == ENC_BIG_ENDIAN)
885 uchar = pntoh32(ptr + i);
886 else
887 uchar = pletoh32(ptr + i);
889 wmem_strbuf_append_unichar_validated(strbuf, uchar);
893 * if i < length, this means we were handed a number of bytes
894 * that's not a multiple of 4, so not a valid UCS-4 string.
895 * Insert a REPLACEMENT CHARACTER for the remaining bytes.
897 if (i < length) {
898 wmem_strbuf_append_unichar(strbuf, UNREPL);
900 return (uint8_t *)wmem_strbuf_finalize(strbuf);
904 * FROM GNOKII
905 * gsm-encoding.c
906 * gsm-sms.c
909 /* ETSI GSM 03.38, version 6.0.1, section 6.2.1; Default alphabet */
910 static const gunichar2 gsm_default_alphabet[0x80] = {
911 '@', 0xa3, '$', 0xa5, 0xe8, 0xe9, 0xf9, 0xec,
912 0xf2, 0xc7, '\n', 0xd8, 0xf8, '\r', 0xc5, 0xe5,
913 0x394, '_', 0x3a6, 0x393, 0x39b, 0x3a9, 0x3a0, 0x3a8,
914 0x3a3, 0x398, 0x39e, 0xa0, 0xc6, 0xe6, 0xdf, 0xc9,
915 ' ', '!', '\"', '#', 0xa4, '%', '&', '\'',
916 '(', ')', '*', '+', ',', '-', '.', '/',
917 '0', '1', '2', '3', '4', '5', '6', '7',
918 '8', '9', ':', ';', '<', '=', '>', '?',
919 0xa1, 'A', 'B', 'C', 'D', 'E', 'F', 'G',
920 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
921 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
922 'X', 'Y', 'Z', 0xc4, 0xd6, 0xd1, 0xdc, 0xa7,
923 0xbf, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
924 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
925 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
926 'x', 'y', 'z', 0xe4, 0xf6, 0xf1, 0xfc, 0xe0
929 static gunichar
930 GSM_to_UNICHAR(uint8_t c)
932 if (c < G_N_ELEMENTS(gsm_default_alphabet))
933 return gsm_default_alphabet[c];
935 return UNREPL;
938 static gunichar
939 GSMext_to_UNICHAR(uint8_t c)
941 switch (c)
943 case 0x0a: return 0x0c; /* form feed */
944 case 0x14: return '^';
945 case 0x28: return '{';
946 case 0x29: return '}';
947 case 0x2f: return '\\';
948 case 0x3c: return '[';
949 case 0x3d: return '~';
950 case 0x3e: return ']';
951 case 0x40: return '|';
952 case 0x65: return 0x20ac; /* euro */
955 return UNREPL; /* invalid character */
/*
 * Mask covering the low-order "bits" bits.  Note that this expands to
 * an expression using a variable named "bits", which must be in scope
 * wherever the macro is used.
 */
#define GN_BYTE_MASK ((1 << bits) - 1)

/* Code point that introduces an escaped character. */
#define GN_CHAR_ESCAPE 0x1b

/* Report whether "value" is the escape code point. */
static bool
char_is_escape(unsigned char value)
{
    return GN_CHAR_ESCAPE == value;
}
968 static bool
969 handle_ts_23_038_char(wmem_strbuf_t *strbuf, uint8_t code_point,
970 bool saw_escape)
972 gunichar uchar;
974 if (char_is_escape(code_point)) {
976 * XXX - if saw_escape is true here, then this is
977 * the case where we escape to "another extension table",
978 * but TS 128 038 V11.0 doesn't specify such an extension
979 * table.
981 saw_escape = true;
982 } else {
983 if (!(code_point & 0x80)) {
985 * Code point is valid (7-bit).
986 * Have we seen an escape?
988 if (saw_escape) {
989 saw_escape = false;
990 uchar = GSMext_to_UNICHAR(code_point);
991 } else {
992 uchar = GSM_to_UNICHAR(code_point);
994 wmem_strbuf_append_unichar(strbuf, uchar);
995 } else {
996 /* Invalid - put in a REPLACEMENT CHARACTER */
997 wmem_strbuf_append_unichar(strbuf, UNREPL);
1000 return saw_escape;
1003 uint8_t *
1004 get_ts_23_038_7bits_string_packed(wmem_allocator_t *scope, const uint8_t *ptr,
1005 const int bit_offset, int no_of_chars)
1007 wmem_strbuf_t *strbuf;
1008 int char_count; /* character counter for string */
1009 uint8_t in_byte, out_byte, rest = 0x00;
1010 const uint8_t *start_ptr = ptr;
1011 bool saw_escape = false;
1012 int bits;
1014 strbuf = wmem_strbuf_new_sized(scope, no_of_chars+1);
1016 bits = bit_offset & 0x07;
1017 if (!bits) {
1018 bits = 7;
1021 for(char_count = 0; char_count < no_of_chars; ptr++) {
1022 /* Get the next byte from the string. */
1023 in_byte = *ptr;
1026 * Combine the bits we've accumulated with bits from
1027 * that byte to make a 7-bit code point.
1029 out_byte = ((in_byte & GN_BYTE_MASK) << (7 - bits)) | rest;
1032 * Leftover bits used in that code point.
1034 rest = in_byte >> bits;
1037 * If we don't start from 0th bit, we shouldn't go to the
1038 * next char. Under *out_num we have now 0 and under Rest -
1039 * _first_ part of the char.
1041 if ((start_ptr != ptr) || (bits == 7)) {
1042 saw_escape = handle_ts_23_038_char(strbuf, out_byte,
1043 saw_escape);
1044 char_count++;
1048 * After reading 7 octets we have read 7 full characters
1049 * but we have 7 bits as well. This is the next character.
1051 if ((bits == 1) && (char_count < no_of_chars)) {
1052 saw_escape = handle_ts_23_038_char(strbuf, rest,
1053 saw_escape);
1054 char_count++;
1055 bits = 7;
1056 rest = 0x00;
1057 } else {
1058 bits--;
1062 if (saw_escape) {
1064 * Escape not followed by anything.
1066 * XXX - for now, show the escape as a REPLACEMENT
1067 * CHARACTER.
1069 wmem_strbuf_append_unichar(strbuf, UNREPL);
1072 return (uint8_t *)wmem_strbuf_finalize(strbuf);
1075 uint8_t *
1076 get_ts_23_038_7bits_string_unpacked(wmem_allocator_t *scope, const uint8_t *ptr,
1077 int length)
1079 wmem_strbuf_t *strbuf;
1080 int i; /* Byte counter for string */
1081 bool saw_escape = false;
1083 strbuf = wmem_strbuf_new_sized(scope, length+1);
1085 for (i = 0; i < length; i++)
1086 saw_escape = handle_ts_23_038_char(strbuf, *ptr++, saw_escape);
1088 return (uint8_t *)wmem_strbuf_finalize(strbuf);
1092 * ETSI TS 102 221 Annex A.
1094 uint8_t *
1095 get_etsi_ts_102_221_annex_a_string(wmem_allocator_t *scope, const uint8_t *ptr,
1096 int length)
1098 uint8_t string_type;
1099 uint8_t string_len;
1100 gunichar2 ucs2_base;
1101 wmem_strbuf_t *strbuf;
1102 unsigned i; /* Byte counter for string */
1103 bool saw_escape = false;
1106 * get the first octet.
1108 if (length == 0) {
1109 /* XXX - return error indication */
1110 strbuf = wmem_strbuf_new(scope, "");
1111 return (uint8_t *)wmem_strbuf_finalize(strbuf);
1113 string_type = *ptr;
1114 ptr++;
1115 length--;
1117 if (string_type == 0x80) {
1119 * Annex A, coding scheme 1) - big-endian UCS-2.
1121 return get_ucs_2_string(scope, ptr, length, ENC_BIG_ENDIAN);
1125 * Annex A, coding schemes 2) and 3):
1127 * the second byte is the number of characters (characters,
1128 * not octets) in the string;
1130 * for coding scheme 2), the third byte defines bits 15 to 8
1131 * of all UCS-2 characters in the string (all bit numbers are
1132 * 1-origin, so bit 1 is the low-order bit), with bit 16 being 0;
1134 * for coding scheme 3), the third byte and fourth bytes, treated
1135 * as a big-endian value, define the base value for all UCS-2
1136 * characters in the string;
1138 * for all subsequent bytes, if bit 8 is 0, it's a character
1139 * in the GSM Default Alphabet, otherwise, it is added to
1140 * the UCS-2 base value to give a UCS-2 character.
1142 * XXX - that doesn't seem to indicate that a byte of 0x1b is
1143 * treated as an escape character, it just says that a single octet
1144 * with the 8th bit not set is a GSM Default Alphabet character.
1148 * Get the string length, in characters.
1150 if (length == 0) {
1151 /* XXX - return error indication */
1152 strbuf = wmem_strbuf_new(scope, "");
1153 return (uint8_t *)wmem_strbuf_finalize(strbuf);
1155 string_len = *ptr;
1156 ptr++;
1157 length--;
1159 strbuf = wmem_strbuf_new_sized(scope, 2*string_len+1);
1162 * Get the UCS-2 base.
1164 if (string_type == 0x81) {
1165 if (length == 0) {
1166 /* XXX - return error indication */
1167 return (uint8_t *)wmem_strbuf_finalize(strbuf);
1169 ucs2_base = (*ptr) << 7;
1170 ptr++;
1171 length--;
1172 } else if (string_type == 0x82) {
1173 if (length == 0) {
1174 /* XXX - return error indication */
1175 return (uint8_t *)wmem_strbuf_finalize(strbuf);
1177 ucs2_base = (*ptr) << 8;
1178 ptr++;
1179 length--;
1181 if (length == 0) {
1182 /* XXX - return error indication */
1183 return (uint8_t *)wmem_strbuf_finalize(strbuf);
1185 ucs2_base |= *ptr;
1186 ptr++;
1187 length--;
1188 } else {
1189 /* Invalid string type. */
1190 /* XXX - return error indication */
1191 return (uint8_t *)wmem_strbuf_finalize(strbuf);
1194 for (i = 0; i < string_len; i++) {
1195 uint8_t byte;
1197 if (length == 0) {
1198 /* XXX - return error indication */
1199 return (uint8_t *)wmem_strbuf_finalize(strbuf);
1201 byte = *ptr;
1202 if ((byte & 0x80) == 0) {
1203 saw_escape = handle_ts_23_038_char(strbuf, byte, saw_escape);
1204 } else {
1205 gunichar2 uchar;
1208 * XXX - if saw_escape is true, this is bogus.
1210 * XXX - if there are an odd number of bytes, should put a
1211 * REPLACEMENT CHARACTER at the end.
1213 uchar = ucs2_base + (byte & 0x7f);
1214 wmem_strbuf_append_unichar_validated(strbuf, uchar);
1218 return (uint8_t *)wmem_strbuf_finalize(strbuf);
1221 uint8_t *
1222 get_ascii_7bits_string(wmem_allocator_t *scope, const uint8_t *ptr,
1223 const int bit_offset, int no_of_chars)
1225 wmem_strbuf_t *strbuf;
1226 int char_count; /* character counter for string */
1227 uint8_t in_byte, out_byte, rest = 0x00;
1228 const uint8_t *start_ptr = ptr;
1229 int bits;
1231 bits = bit_offset & 0x07;
1232 if (!bits) {
1233 bits = 7;
1236 strbuf = wmem_strbuf_new_sized(scope, no_of_chars+1);
1237 for(char_count = 0; char_count < no_of_chars; ptr++) {
1238 /* Get the next byte from the string. */
1239 in_byte = *ptr;
1242 * Combine the bits we've accumulated with bits from
1243 * that byte to make a 7-bit code point.
1245 out_byte = (in_byte >> (8 - bits)) | rest;
1248 * Leftover bits used in that code point.
1250 rest = (in_byte << (bits - 1)) & 0x7f;
1253 * If we don't start from 0th bit, we shouldn't go to the
1254 * next char. Under *out_num we have now 0 and under Rest -
1255 * _first_ part of the char.
1257 if ((start_ptr != ptr) || (bits == 7)) {
1258 wmem_strbuf_append_c(strbuf, out_byte);
1259 char_count++;
1263 * After reading 7 octets we have read 7 full characters
1264 * but we have 7 bits as well. This is the next character.
1266 if ((bits == 1) && (char_count < no_of_chars)) {
1267 wmem_strbuf_append_c(strbuf, rest);
1268 char_count++;
1269 bits = 7;
1270 rest = 0x00;
1271 } else {
1272 bits--;
1276 return (uint8_t *)wmem_strbuf_finalize(strbuf);
1279 /* Tables for EBCDIC code pages */
1281 /* EBCDIC common; based on the table in appendix H of ESA/370 Principles
1282 of Operation, but with some code points that don't correspond to
1283 the same characters in code pages 037 and 1158 mapped to REPLACEMENT
1284 CHARACTER - there may be more code points of that sort */
1286 /* There are a few EBCDIC control codes that, strictly speaking, do not
1287 * map to any control codes in ASCII or Unicode for that matter. The
1288 * customary treatment is to map them in a particular way to ASCII C1
1289 * control codes that have no exact equivalent in EBCDIC, as below. */
1290 const gunichar2 charset_table_ebcdic[256] = {
1291 0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f,
1292 0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
1293 0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087,
1294 0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f,
1295 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b,
1296 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007,
1297 UNREPL, UNREPL, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004,
1298 0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, UNREPL, 0x001a,
1299 0x0020, 0x00a0, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1300 UNREPL, UNREPL, UNREPL, 0x002e, 0x003c, 0x0028, 0x002b, UNREPL,
1301 0x0026, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1302 UNREPL, UNREPL, UNREPL, 0x0024, 0x002a, 0x0029, 0x003b, UNREPL,
1303 0x002d, 0x002f, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1304 UNREPL, UNREPL, UNREPL, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f,
1305 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1306 UNREPL, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022,
1307 UNREPL, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
1308 0x0068, 0x0069, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1309 UNREPL, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070,
1310 0x0071, 0x0072, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1311 UNREPL, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078,
1312 0x0079, 0x007a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1313 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1314 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1315 0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
1316 0x0048, 0x0049, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1317 0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
1318 0x0051, 0x0052, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1319 0x005c, UNREPL, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058,
1320 0x0059, 0x005a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1321 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
1322 0x0038, 0x0039, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
1325 /* EBCDIC code page 037 */
1326 const gunichar2 charset_table_ebcdic_cp037[256] = {
1327 0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f,
1328 0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
1329 0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087,
1330 0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f,
1331 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b,
1332 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007,
1333 0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004,
1334 0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, 0x009e, 0x001a,
1335 0x0020, 0x00a0, 0x00e2, 0x00e4, 0x00e0, 0x00e1, 0x00e3, 0x00e5,
1336 0x00e7, 0x00f1, 0x00a2, 0x002e, 0x003c, 0x0028, 0x002b, 0x007c,
1337 0x0026, 0x00e9, 0x00ea, 0x00eb, 0x00e8, 0x00ed, 0x00ee, 0x00ef,
1338 0x00ec, 0x00df, 0x0021, 0x0024, 0x002a, 0x0029, 0x003b, 0x00ac,
1339 0x002d, 0x002f, 0x00c2, 0x00c4, 0x00c0, 0x00c1, 0x00c3, 0x00c5,
1340 0x00c7, 0x00d1, 0x00a6, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f,
1341 0x00f8, 0x00c9, 0x00ca, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf,
1342 0x00cc, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022,
1343 0x00d8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
1344 0x0068, 0x0069, 0x00ab, 0x00bb, 0x00f0, 0x00fd, 0x00fe, 0x00b1,
1345 0x00b0, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070,
1346 0x0071, 0x0072, 0x00aa, 0x00ba, 0x00e6, 0x00b8, 0x00c6, 0x00a4,
1347 0x00b5, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078,
1348 0x0079, 0x007a, 0x00a1, 0x00bf, 0x00d0, 0x00dd, 0x00de, 0x00ae,
1349 0x005e, 0x00a3, 0x00a5, 0x00b7, 0x00a9, 0x00a7, 0x00b6, 0x00bc,
1350 0x00bd, 0x00be, 0x005b, 0x005d, 0x00af, 0x00a8, 0x00b4, 0x00d7,
1351 0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
1352 0x0048, 0x0049, 0x00ad, 0x00f4, 0x00f6, 0x00f2, 0x00f3, 0x00f5,
1353 0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
1354 0x0051, 0x0052, 0x00b9, 0x00fb, 0x00fc, 0x00f9, 0x00fa, 0x00ff,
1355 0x005c, 0x00f7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058,
1356 0x0059, 0x005a, 0x00b2, 0x00d4, 0x00d6, 0x00d2, 0x00d3, 0x00d5,
1357 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
1358 0x0038, 0x0039, 0x00b3, 0x00db, 0x00dc, 0x00d9, 0x00da, 0x009f,
1361 /* EBCDIC code page 500
1362 * https://www.ibm.com/support/pages/conversion-character-differences-between-ccsid-037-and-ccsid-500
1363 * CCSID 500 ("International Latin-1") has exactly the same repertoire as 37,
1364 * covering all of ISO-8559-1, but with seven code points permuted.
1365 * It is notable because it is the default code page for DRDA:
1366 * https://www.ibm.com/support/pages/drda-user-id-and-password-not-being-transmitted-correctly-when-containing-characters-%C2%AC-%C2%A2?lnk=hm
1368 const gunichar2 charset_table_ebcdic_cp500[256] = {
1369 0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f,
1370 0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
1371 0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087,
1372 0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f,
1373 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b,
1374 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007,
1375 0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004,
1376 0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, 0x009e, 0x001a,
1377 0x0020, 0x00a0, 0x00e2, 0x00e4, 0x00e0, 0x00e1, 0x00e3, 0x00e5,
1378 0x00e7, 0x00f1, 0x005b, 0x002e, 0x003c, 0x0028, 0x002b, 0x0021,
1379 0x0026, 0x00e9, 0x00ea, 0x00eb, 0x00e8, 0x00ed, 0x00ee, 0x00ef,
1380 0x00ec, 0x00df, 0x005d, 0x0024, 0x002a, 0x0029, 0x003b, 0x005e,
1381 0x002d, 0x002f, 0x00c2, 0x00c4, 0x00c0, 0x00c1, 0x00c3, 0x00c5,
1382 0x00c7, 0x00d1, 0x00a6, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f,
1383 0x00f8, 0x00c9, 0x00ca, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf,
1384 0x00cc, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022,
1385 0x00d8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
1386 0x0068, 0x0069, 0x00ab, 0x00bb, 0x00f0, 0x00fd, 0x00fe, 0x00b1,
1387 0x00b0, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070,
1388 0x0071, 0x0072, 0x00aa, 0x00ba, 0x00e6, 0x00b8, 0x00c6, 0x00a4,
1389 0x00b5, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078,
1390 0x0079, 0x007a, 0x00a1, 0x00bf, 0x00d0, 0x00dd, 0x00de, 0x00ae,
1391 0x00a2, 0x00a3, 0x00a5, 0x00b7, 0x00a9, 0x00a7, 0x00b6, 0x00bc,
1392 0x00bd, 0x00be, 0x00ac, 0x007c, 0x00af, 0x00a8, 0x00b4, 0x00d7,
1393 0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
1394 0x0048, 0x0049, 0x00ad, 0x00f4, 0x00f6, 0x00f2, 0x00f3, 0x00f5,
1395 0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
1396 0x0051, 0x0052, 0x00b9, 0x00fb, 0x00fc, 0x00f9, 0x00fa, 0x00ff,
1397 0x005c, 0x00f7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058,
1398 0x0059, 0x005a, 0x00b2, 0x00d4, 0x00d6, 0x00d2, 0x00d3, 0x00d5,
1399 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
1400 0x0038, 0x0039, 0x00b3, 0x00db, 0x00dc, 0x00d9, 0x00da, 0x009f,
1404 * Given a wmem scope, a pointer, a length, and a translation table with
1405 * 256 entries, treat the string of bytes referred to by the pointer and
1406 * length as a string encoded using one octet per character, with octets
1407 * being mapped by the translation table to 2-byte Unicode Basic Multilingual
1408 * Plane characters (including REPLACEMENT CHARACTER), and return a
1409 * pointer to a UTF-8 string, allocated using the wmem scope.
1411 uint8_t *
1412 get_nonascii_unichar2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[256])
1414 wmem_strbuf_t *str;
1416 str = wmem_strbuf_new_sized(scope, length+1);
1418 while (length > 0) {
1419 uint8_t ch = *ptr;
1421 wmem_strbuf_append_unichar(str, table[ch]);
1422 ptr++;
1423 length--;
1426 return (uint8_t *) wmem_strbuf_finalize(str);
1430 * Given a wmem scope, a pointer, a length, and a string referring to an
1431 * encoding (recognized by iconv), treat the bytes referred to by the pointer
1432 * and length as a string in that encoding, and return a pointer to a UTF-8
1433 * string, allocated using the wmem scope, converted from the original
1434 * encoding having substituted REPLACEMENT CHARACTER according to the
1435 * Unicode Standard 5.22 U+FFFD Substitution for Conversion
1436 * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1438 static uint8_t *
1439 get_string_enc_iconv(wmem_allocator_t *scope, const uint8_t *ptr, int length, const char *encoding)
1441 GIConv cd;
1442 size_t inbytes, outbytes;
1443 size_t tempstr_size, bytes_written;
1444 size_t err;
1445 size_t max_subpart, tempinbytes;
1446 char *outptr, *tempstr;
1448 wmem_strbuf_t *str;
1450 if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) {
1451 REPORT_DISSECTOR_BUG("Unable to allocate iconv() converter from %s to UTF-8", encoding);
1452 /* Most likely to be a programming error passing in a bad encoding
1453 * name. However, could be a issue with the iconv support on the
1454 * system running WS. GLib requires iconv/libiconv, but is it possible
1455 * that some versions don't support all common encodings? */
1458 inbytes = length;
1459 str = wmem_strbuf_new_sized(scope, length+1);
1460 /* XXX: If speed becomes an issue, the faster way to do this would
1461 * involve passing the wmem_strbuf_t's string buffer directly into
1462 * g_iconv to avoid a memcpy later, but that requires changes to the
1463 * wmem_strbuf interface to have non const access to the string buffer,
1464 * and to manipulate the used length directly. */
1465 outbytes = tempstr_size = MAX(8, length);
1466 outptr = tempstr = (char *)g_malloc(outbytes);
1467 while (inbytes > 0) {
1468 err = g_iconv(cd, (char **)&ptr, &inbytes, &outptr, &outbytes);
1469 bytes_written = outptr - tempstr;
1470 wmem_strbuf_append_len(str, tempstr, bytes_written);
1471 outptr = tempstr;
1472 outbytes = tempstr_size;
1474 if (err == (size_t) -1) {
1475 /* Errors */
1476 switch (errno) {
1477 case EINVAL:
1478 /* Incomplete sequence at the end, not an error */
1479 wmem_strbuf_append_unichar_repl(str);
1480 inbytes = 0;
1481 break;
1482 case E2BIG:
1483 /* Not enough room (UTF-8 longer than the initial buffer),
1484 * start back at the beginning of the buffer */
1485 break;
1486 case EILSEQ:
1487 /* Find the maximal subpart of the ill-formed sequence */
1488 errno = EINVAL;
1489 for (max_subpart = 1; err == (size_t)-1 && errno == EINVAL; max_subpart++) {
1490 tempinbytes = max_subpart;
1491 err = g_iconv(cd, (char **)&ptr, &tempinbytes,
1492 &outptr, &outbytes);
1494 max_subpart = MAX(1, max_subpart-1);
1495 ptr += max_subpart;
1496 inbytes -= max_subpart;
1497 wmem_strbuf_append_unichar_repl(str);
1498 outptr = tempstr;
1499 outbytes = tempstr_size;
1500 break;
1501 default:
1502 /* Unexpected conversion error, unrecoverable */
1503 g_free(tempstr);
1504 g_iconv_close(cd);
1505 REPORT_DISSECTOR_BUG("Unexpected iconv() error when converting from %s to UTF-8", encoding);
1506 break;
1508 } else {
1509 /* Otherwise err is the number of replacement characters used,
1510 * but we don't care about that. */
1511 /* If we were converting to ISO-2022-JP or some other stateful
1512 * decoder with shift sequences (e.g. EBCDIC mixed-byte), a
1513 * final call with NULL input in order to output the shift
1514 * sequence back to initial state might make sense, but not
1515 * needed for UTF-8. */
1519 g_free(tempstr);
1520 g_iconv_close(cd);
1521 return (uint8_t *) wmem_strbuf_finalize(str);
1525 * Given a wmem scope, a pointer, and a length, treat the bytes referred to
1526 * by the pointer and length as a GB18030 encoded string, and return a pointer
1527 * to a UTF-8 string, allocated using the wmem scope, converted having
1528 * substituted REPLACEMENT CHARACTER according to the Unicode Standard
1529 * 5.22 U+FFFD Substitution for Conversion.
1530 * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1532 * As expected, this will also decode GBK and GB2312 strings.
1534 uint8_t *
1535 get_gb18030_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
1537 /* iconv/libiconv support is guaranteed with GLib. Support this
1538 * via iconv, at least for now. */
1539 /* GNU libiconv has supported GB18030 (~ Windows Code page 54936) since
1540 * 2000-10-24 and version 1.4, is there is a system that compiles current
1541 * Wireshark yet its iconv only supports GBK (~ Windows Code page 936)? */
1542 const char *encoding = "GB18030";
1543 GIConv cd;
1544 if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) {
1545 encoding = "GBK";
1546 /* GB18030 is backwards compatible, at worst this will mean a few
1547 * extra REPLACEMENT CHARACTERs - GBK lacks the four byte encodings
1548 * from GB18030, which are all pairs of two byte sequences
1549 * 0x[81-FE] 0x[30-39]; that trailing byte is illegal in GBK
1550 * and thus the 4 byte characters will be replaced with two
1551 * REPLACEMENT CHARACTERs. */
1552 } else {
1553 g_iconv_close(cd);
1555 return get_string_enc_iconv(scope, ptr, length, encoding);
1559 * Given a wmem scope, a pointer, and a length, treat the bytes referred to
1560 * by the pointer and length as a EUC-KR encoded string, and return a pointer
1561 * to a UTF-8 string, allocated using the wmem scope, converted having
1562 * substituted REPLACEMENT CHARACTER according to the Unicode Standard
1563 * 5.22 U+FFFD Substitution for Conversion.
1564 * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1566 uint8_t *
1567 get_euc_kr_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
1569 /* iconv/libiconv support is guaranteed with GLib. Support this
1570 * via iconv, at least for now. */
1571 return get_string_enc_iconv(scope, ptr, length, "EUC-KR");
1574 /* T.61 to UTF-8 conversion table from OpenLDAP project
1575 * https://www.openldap.org/devel/gitweb.cgi?p=openldap.git;a=blob;f=libraries/libldap/t61.c;hb=HEAD
1577 static const gunichar2 t61_tab[] = {
1578 0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
1579 0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
1580 0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
1581 0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
1582 0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
1583 0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
1584 0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
1585 0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
1586 0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
1587 0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
1588 0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
1589 0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
1590 0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
1591 0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
1592 0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
1593 0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
1594 0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
1595 0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
1596 0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
1597 0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
1598 0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
1599 0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
1600 0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
1601 0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
1602 0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
1603 0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
1604 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
1605 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
1606 0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
1607 0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
1608 0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
1609 0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
1612 typedef gunichar2 wvec16[16];
1613 typedef gunichar2 wvec32[32];
1615 /* Substitutions when 0xc1-0xcf appears by itself or with space 0x20 */
1616 static const wvec16 accents = {
1617 0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
1618 0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
1620 /* In the following tables, base characters commented in (parentheses)
1621 * are not defined by T.61 but are mapped anyway since their Unicode
1622 * composite exists.
1625 /* Grave accented chars AEIOU (NWY) */
1626 static const wvec32 c1_vec1 = {
1627 /* Upper case */
1628 0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
1629 0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
1630 static const wvec32 c1_vec2 = {
1631 /* Lower case */
1632 0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
1633 0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};
1635 static const wvec32 *c1_grave[] = {
1636 NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
1639 /* Acute accented chars AEIOUYCLNRSZ (GKMPW) */
1640 static const wvec32 c2_vec1 = {
1641 /* Upper case */
1642 0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
1643 0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
1644 0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
1645 0, 0xdd, 0x179, 0, 0, 0, 0, 0};
1646 static const wvec32 c2_vec2 = {
1647 /* Lower case */
1648 0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
1649 0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
1650 0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
1651 0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
1652 static const wvec32 c2_vec3 = {
1653 /* (AE and ae) */
1654 0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1655 0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1657 static const wvec32 *c2_acute[] = {
1658 NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
1661 /* Circumflex AEIOUYCGHJSW (Z) */
1662 static const wvec32 c3_vec1 = {
1663 /* Upper case */
1664 0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
1665 0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
1666 0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
1667 0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
1668 static const wvec32 c3_vec2 = {
1669 /* Lower case */
1670 0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
1671 0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
1672 0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
1673 0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
1674 static const wvec32 *c3_circumflex[] = {
1675 NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
1678 /* Tilde AIOUN (EVY) */
1679 static const wvec32 c4_vec1 = {
1680 /* Upper case */
1681 0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
1682 0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
1683 static const wvec32 c4_vec2 = {
1684 /* Lower case */
1685 0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
1686 0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
1687 static const wvec32 *c4_tilde[] = {
1688 NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
1691 /* Macron AEIOU (YG) */
1692 static const wvec32 c5_vec1 = {
1693 /* Upper case */
1694 0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
1695 0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
1696 static const wvec32 c5_vec2 = {
1697 /* Lower case */
1698 0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
1699 0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
1700 static const wvec32 c5_vec3 = {
1701 /* (AE and ae) */
1702 0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1703 0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1704 static const wvec32 *c5_macron[] = {
1705 NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
1708 /* Breve AUG (EIO) */
1709 static const wvec32 c6_vec1 = {
1710 /* Upper case */
1711 0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
1712 0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1713 static const wvec32 c6_vec2 = {
1714 /* Lower case */
1715 0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
1716 0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1717 static const wvec32 *c6_breve[] = {
1718 NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
1721 /* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */
1722 static const wvec32 c7_vec1 = {
1723 /* Upper case */
1724 0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
1725 0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
1726 0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
1727 0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
1728 static const wvec32 c7_vec2 = {
1729 /* Lower case */
1730 0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
1731 0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
1732 0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
1733 0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
1734 static const wvec32 *c7_dotabove[] = {
1735 NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
1738 /* Diaeresis AEIOUY (HWXt) */
1739 static const wvec32 c8_vec1 = {
1740 /* Upper case */
1741 0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
1742 0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
1743 static const wvec32 c8_vec2 = {
1744 /* Lower case */
1745 0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
1746 0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
1747 static const wvec32 *c8_diaeresis[] = {
1748 NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
1751 /* Ring Above AU (wy) */
1752 static const wvec32 ca_vec1 = {
1753 /* Upper case */
1754 0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1755 0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1756 static const wvec32 ca_vec2 = {
1757 /* Lower case */
1758 0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1759 0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
1760 static const wvec32 *ca_ringabove[] = {
1761 NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
1764 /* Cedilla CGKLNRST (EDH) */
1765 static const wvec32 cb_vec1 = {
1766 /* Upper case */
1767 0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
1768 0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
1769 0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1770 static const wvec32 cb_vec2 = {
1771 /* Lower case */
1772 0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
1773 0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
1774 0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1775 static const wvec32 *cb_cedilla[] = {
1776 NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
1779 /* Double Acute Accent OU */
1780 static const wvec32 cd_vec1 = {
1781 /* Upper case */
1782 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
1783 0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1784 static const wvec32 cd_vec2 = {
1785 /* Lower case */
1786 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
1787 0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1788 static const wvec32 *cd_doubleacute[] = {
1789 NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
1792 /* Ogonek AEIU (O) */
1793 static const wvec32 ce_vec1 = {
1794 /* Upper case */
1795 0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
1796 0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1797 static const wvec32 ce_vec2 = {
1798 /* Lower case */
1799 0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
1800 0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1801 static const wvec32 *ce_ogonek[] = {
1802 NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
1805 /* Caron CDELNRSTZ (AIOUGKjH) */
1806 static const wvec32 cf_vec1 = {
1807 /* Upper case */
1808 0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
1809 0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
1810 0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
1811 0, 0, 0x17d, 0, 0, 0, 0, 0};
1812 static const wvec32 cf_vec2 = {
1813 /* Lower case */
1814 0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
1815 0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
1816 0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
1817 0, 0, 0x17e, 0, 0, 0, 0, 0};
1818 static const wvec32 *cf_caron[] = {
1819 NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
1822 static const wvec32 **cx_tab[] = {
1823 NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
1824 c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
1825 cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
1827 uint8_t *
1828 get_t61_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
1830 int i;
1831 const uint8_t *c;
1832 wmem_strbuf_t *strbuf;
1834 strbuf = wmem_strbuf_new_sized(scope, length+1);
1836 for (i = 0, c = ptr; i < length; c++, i++) {
1837 if (!t61_tab[*c]) {
1838 wmem_strbuf_append_unichar(strbuf, UNREPL);
1839 } else if (i < length - 1 && (*c & 0xf0) == 0xc0) {
1840 int j = *c & 0x0f;
1841 /* If this is the end of the string, or if the base
1842 * character is just a space, treat this as a regular
1843 * spacing character.
1845 if ((!c[1] || c[1] == 0x20) && accents[j]) {
1846 wmem_strbuf_append_unichar(strbuf, accents[j]);
1847 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
1848 /* We have a composite mapping for this pair */
1849 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
1850 wmem_strbuf_append_unichar(strbuf, (*cx_tab[j][c[1]>>5])[c[1]&0x1f]);
1851 } else {
1852 /* No mapping, just swap it around so the base
1853 * character comes first.
1855 wmem_strbuf_append_unichar(strbuf, c[1]);
1856 wmem_strbuf_append_unichar(strbuf, t61_tab[*c]);
1858 c++; i++;
1859 continue;
1860 } else {
1861 wmem_strbuf_append_unichar(strbuf, t61_tab[*c]);
1865 return (uint8_t *)wmem_strbuf_finalize(strbuf);
1868 /* The DECT standard charset from ETSI EN 300 175-5 Annex D
1870 static const gunichar2 dect_standard_8bits_code_table[] = {
1871 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1872 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1873 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
1874 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
1875 ' ', '!', '\"', '#', '$', '%', '&', '\'',
1876 '(', ')', '*', '+', ',', '-', '.', '/',
1877 '0', '1', '2', '3', '4', '5', '6', '7',
1878 '8', '9', ':', ';', '<', '=', '>', '?',
1879 '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
1880 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
1881 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
1882 'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
1883 '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
1884 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
1885 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
1886 'x', 'y', 'z', '{', '|', '}', '~', 0x7f,
1889 uint8_t *
1890 get_dect_standard_8bits_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
1892 int position;
1893 const uint8_t *current_byte_ptr;
1894 wmem_strbuf_t *strbuf;
1896 strbuf = wmem_strbuf_new_sized(scope, length+1);
1898 for (position = 0, current_byte_ptr = ptr; position < length; current_byte_ptr++, position++) {
1899 if (*current_byte_ptr & 0x80) {
1900 wmem_strbuf_append_unichar(strbuf, UNREPL);
1901 } else if (!dect_standard_8bits_code_table[*current_byte_ptr]) {
1902 wmem_strbuf_append_unichar(strbuf, UNREPL);
1903 } else {
1904 wmem_strbuf_append_unichar(strbuf, dect_standard_8bits_code_table[*current_byte_ptr]);
1908 return (uint8_t *)wmem_strbuf_finalize(strbuf);
1911 * Editor modelines - https://www.wireshark.org/tools/modelines.html
1913 * Local variables:
1914 * c-basic-offset: 4
1915 * tab-width: 8
1916 * indent-tabs-mode: nil
1917 * End:
1919 * vi: set shiftwidth=4 tabstop=8 expandtab:
1920 * :indentSize=4:tabSize=8:noTabs=true: