epan/charsets.c

   1 /* charsets.c
   2  * Routines for handling character sets
   3  *
   4  * Wireshark - Network traffic analyzer
   5  * By Gerald Combs <gerald@wireshark.org>
   6  * Copyright 1998 Gerald Combs
   7  *
   8  * SPDX-License-Identifier: GPL-2.0-or-later
   9  */
  10
  11 #include "config.h"
  12
  13 #include <errno.h>
  14 #include <glib.h>
  15
  16 #include <epan/proto.h>
  17 #include <epan/wmem_scopes.h>
  18
  19 #include <wsutil/pint.h>
  20 #include <wsutil/unicode-utils.h>
  21
  22 #include "charsets.h"
  23
/*
 * 6-character abbreviation for "Unicode REPLACEMENT CHARACTER", so it
 * takes up the same amount of space as the 6-character hex values for
 * Basic Multilingual Plane code points in the tables below.
 *
 * A table entry written as UNREPL therefore maps the corresponding
 * octet to UNICODE_REPLACEMENT_CHARACTER.
 */
#define UNREPL UNICODE_REPLACEMENT_CHARACTER

/* ZERO WIDTH NON-BREAKING SPACE, also known informally as BOM */
#define BYTE_ORDER_MARK 0xFEFF
  33
  34 /*
  35  * Wikipedia's "Character encoding" template, giving a pile of character
  36  * encodings and Wikipedia pages for them:
  37  *
  38  *    http://en.wikipedia.org/wiki/Template:Character_encoding
  39  *
  40  * Unicode character encoding model:
  41  *
  42  *    https://www.unicode.org/reports/tr17/
  43  *
  44  * International Components for Unicode character set mapping tables:
  45  *
  46  *    http://site.icu-project.org/charts/charset
  47  *
  48  * MSDN information on code pages:
  49  *
  50  *    https://docs.microsoft.com/en-us/windows/win32/intl/code-pages
  51  *
  52  * ASCII-based code pages, from IBM:
  53  *
  54  *    http://www-01.ibm.com/software/globalization/cp/cp_cpgid.html
  55  *
  56  * EBCDIC code pages, from IBM:
  57  *
  58  *    http://www-03.ibm.com/systems/i/software/globalization/codepages.html
  59  *
  60  * The IBM pages are no longer available; the versions archived on the
  61  * Wayback Machine are, but the links to the PDF and text versions of
  62  * the code pages don't all work (do *any* work?).
  63  *
  64  * Mappings to Unicode at the Unicode Consortium:
  65  *
  66  *    https://www.unicode.org/Public/MAPPINGS/
  67  *
  68  * Of note, the VENDORS/MICSFT directory not only has various Windows
  69  * and DOS code pages, but also several of the common MAC and EBCDIC
  70  * code page mappings to Unicode.
  71  */
  72
  73 /*
  74  * Given a wmem scope, a pointer, and a length, treat the string of bytes
  75  * referred to by the pointer and length as an ASCII string, with all bytes
  76  * with the high-order bit set being invalid, and return a pointer to a
  77  * UTF-8 string, allocated using the wmem scope.
  78  *
  79  * Octets with the highest bit set will be converted to the Unicode
  80  * REPLACEMENT CHARACTER.
  81  */
  82 uint8_t *
  83 get_ascii_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
  84 {
  85     wmem_strbuf_t *str;
  86     const uint8_t *prev = ptr;
  87     size_t valid_bytes = 0;
  88
  89     str = wmem_strbuf_new_sized(scope, length+1);
  90
  91     while (length > 0) {
  92         uint8_t ch = *ptr++;
  93
  94         if (ch < 0x80) {
  95             valid_bytes++;
  96         } else {
  97             if (valid_bytes) {
  98                 wmem_strbuf_append_len(str, prev, valid_bytes);
  99                 valid_bytes = 0;
 100             }
 101             prev = ptr;
 102             wmem_strbuf_append_unichar_repl(str);
 103         }
 104         length--;
 105     }
 106     if (valid_bytes) {
 107         wmem_strbuf_append_len(str, prev, valid_bytes);
 108     }
 109
 110     return (uint8_t *) wmem_strbuf_finalize(str);
 111 }
 112
 113 uint8_t *
 114 get_utf_8_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
 115 {
 116     return ws_utf8_make_valid(scope, ptr, length);
 117 }
 118
 119 /*
 120  * ISO 646 "Basic code table".
 121  */
 122 const gunichar2 charset_table_iso_646_basic[0x80] = {
 123     0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,        /* 0x00 -      */
 124     0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,        /*      - 0x0F */
 125     0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,        /* 0x10 -      */
 126     0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,        /*      - 0x1F */
 127     0x0020, 0x0021, 0x0022, UNREPL, UNREPL, 0x0025, 0x0026, 0x0027,        /* 0x20 -      */
 128     0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,        /*      - 0x2F */
 129     0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,        /* 0x30 -      */
 130     0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,        /*      - 0x3F */
 131     UNREPL, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,        /* 0x40 -      */
 132     0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,        /*      - 0x4F */
 133     0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,        /* 0x50 -      */
 134     0x0058, 0x0059, 0x005a, UNREPL, UNREPL, UNREPL, UNREPL, 0x005f,        /*      - 0x5F */
 135     UNREPL, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,        /* 0x60 -      */
 136     0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,        /*      - 0x6F */
 137     0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,        /* 0x70 -      */
 138     0x0078, 0x0079, 0x007a, UNREPL, UNREPL, UNREPL, UNREPL, 0x007f,        /*      - 0x7F */
 139 };
 140
 141 /*
 142  * Given a wmem scope, a pointer, a length, and a translation table,
 143  * treat the string of bytes referred to by the pointer and length as a
 144  * string encoded using one octet per character, with octets with the
 145  * high-order bit clear being mapped by the translation table to 2-byte
 146  * Unicode Basic Multilingual Plane characters (including REPLACEMENT
 147  * CHARACTER) and octets with the high-order bit set being mapped to
 148  * REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string,
 149  * allocated using the wmem scope.
 150  */
 151 uint8_t *
 152 get_iso_646_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[0x80])
 153 {
 154     wmem_strbuf_t *str;
 155
 156     str = wmem_strbuf_new_sized(scope, length+1);
 157
 158     while (length > 0) {
 159         uint8_t ch = *ptr;
 160
 161         if (ch < 0x80)
 162             wmem_strbuf_append_unichar(str, table[ch]);
 163         else
 164             wmem_strbuf_append_unichar_repl(str);
 165         ptr++;
 166         length--;
 167     }
 168
 169     return (uint8_t *) wmem_strbuf_finalize(str);
 170 }
 171
 172 /*
 173  * Given a wmem scope, a pointer, and a length, treat the string of bytes
 174  * referred to by the pointer and length as an ISO 8859/1 string, and
 175  * return a pointer to a UTF-8 string, allocated using the wmem scope.
 176  */
 177 uint8_t *
 178 get_8859_1_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
 179 {
 180     wmem_strbuf_t *str;
 181
 182     str = wmem_strbuf_new_sized(scope, length+1);
 183
 184     while (length > 0) {
 185         uint8_t ch = *ptr;
 186
 187         if (ch < 0x80)
 188             wmem_strbuf_append_c(str, ch);
 189         else {
 190             /*
 191              * Note: we assume here that the code points
 192              * 0x80-0x9F are used for C1 control characters,
 193              * and thus have the same value as the corresponding
 194              * Unicode code points.
 195              */
 196             wmem_strbuf_append_unichar(str, ch);
 197         }
 198         ptr++;
 199         length--;
 200     }
 201
 202     return (uint8_t *) wmem_strbuf_finalize(str);
 203 }
 204
 205 /*
 206  * Translation tables that map the upper 128 code points in single-byte
 207  * "extended ASCII" character encodings to Unicode code points in the
 208  * Basic Multilingual Plane.
 209  */
 210
 211 /* ISO-8859-2 (https://en.wikipedia.org/wiki/ISO/IEC_8859-2#Code_page_layout) */
 212 const gunichar2 charset_table_iso_8859_2[0x80] = {
 213     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
 214     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
 215     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
 216     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
 217     0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,        /* 0xA0 -      */
 218     0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,        /*      - 0xAF */
 219     0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,        /* 0xB0 -      */
 220     0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,        /*      - 0xBF */
 221     0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,        /* 0xC0 -      */
 222     0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,        /*      - 0xCF */
 223     0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,        /* 0xD0 -      */
 224     0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,        /*      - 0xDF */
 225     0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,        /* 0xE0 -      */
 226     0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,        /*      - 0xEF */
 227     0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,        /* 0xF0 -      */
 228     0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9         /*      - 0xFF */
 229 };
 230
 231 /* generated by ../tools/make_charset_ISO-8859-3 */
 232 const gunichar2 charset_table_iso_8859_3[0x80] = {
 233     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
 234     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
 235     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
 236     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
 237     0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, UNREPL, 0x0124, 0x00a7,        /* 0xA0 -      */
 238     0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, UNREPL, 0x017b,        /*      - 0xAF */
 239     0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7,        /* 0xB0 -      */
 240     0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, UNREPL, 0x017c,        /*      - 0xBF */
 241     0x00c0, 0x00c1, 0x00c2, UNREPL, 0x00c4, 0x010a, 0x0108, 0x00c7,        /* 0xC0 -      */
 242     0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
 243     UNREPL, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7,        /* 0xD0 -      */
 244     0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df,        /*      - 0xDF */
 245     0x00e0, 0x00e1, 0x00e2, UNREPL, 0x00e4, 0x010b, 0x0109, 0x00e7,        /* 0xE0 -      */
 246     0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
 247     UNREPL, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7,        /* 0xF0 -      */
 248     0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9,        /*      - 0xFF */
 249 };
 250
 251 /* generated by ../tools/make_charset_ISO-8859-4 */
 252 const gunichar2 charset_table_iso_8859_4[0x80] = {
 253     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
 254     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
 255     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
 256     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
 257     0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7,        /* 0xA0 -      */
 258     0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af,        /*      - 0xAF */
 259     0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7,        /* 0xB0 -      */
 260     0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b,        /*      - 0xBF */
 261     0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e,        /* 0xC0 -      */
 262     0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a,        /*      - 0xCF */
 263     0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
 264     0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df,        /*      - 0xDF */
 265     0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f,        /* 0xE0 -      */
 266     0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b,        /*      - 0xEF */
 267     0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
 268     0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9,        /*      - 0xFF */
 269 };
 270
 271 /* ISO-8859-5 (https://en.wikipedia.org/wiki/ISO/IEC_8859-5#Code_page_layout) */
 272 const gunichar2 charset_table_iso_8859_5[0x80] = {
 273     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
 274     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
 275     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
 276     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
 277     0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,        /* 0xA0 -      */
 278     0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f,        /*      - 0xAF */
 279     0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,        /* 0xB0 -      */
 280     0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,        /*      - 0xBF */
 281     0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,        /* 0xC0 -      */
 282     0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,        /*      - 0xCF */
 283     0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,        /* 0xD0 -      */
 284     0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,        /*      - 0xDF */
 285     0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,        /* 0xE0 -      */
 286     0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,        /*      - 0xEF */
 287     0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457,        /* 0xF0 -      */
 288     0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f         /*      - 0xFF */
 289 };
 290
 291 /* generated by ../tools/make_charset_ISO-8859-6 */
 292 const gunichar2 charset_table_iso_8859_6[0x80] = {
 293     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
 294     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
 295     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
 296     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
 297     0x00a0, UNREPL, UNREPL, UNREPL, 0x00a4, UNREPL, UNREPL, UNREPL,        /* 0xA0 -      */
 298     UNREPL, UNREPL, UNREPL, UNREPL, 0x060c, 0x00ad, UNREPL, UNREPL,        /*      - 0xAF */
 299     UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /* 0xB0 -      */
 300     UNREPL, UNREPL, UNREPL, 0x061b, UNREPL, UNREPL, UNREPL, 0x061f,        /*      - 0xBF */
 301     UNREPL, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627,        /* 0xC0 -      */
 302     0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f,        /*      - 0xCF */
 303     0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637,        /* 0xD0 -      */
 304     0x0638, 0x0639, 0x063a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /*      - 0xDF */
 305     0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647,        /* 0xE0 -      */
 306     0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f,        /*      - 0xEF */
 307     0x0650, 0x0651, 0x0652, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /* 0xF0 -      */
 308     UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /*      - 0xFF */
 309 };
 310
 311 /* generated by ../tools/make_charset_ISO-8859-7 */
 312 const gunichar2 charset_table_iso_8859_7[0x80] = {
 313     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
 314     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
 315     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
 316     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
 317     0x00a0, 0x2018, 0x2019, 0x00a3, 0x20ac, 0x20af, 0x00a6, 0x00a7,        /* 0xA0 -      */
 318     0x00a8, 0x00a9, 0x037a, 0x00ab, 0x00ac, 0x00ad, UNREPL, 0x2015,        /*      - 0xAF */
 319     0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7,        /* 0xB0 -      */
 320     0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f,        /*      - 0xBF */
 321     0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,        /* 0xC0 -      */
 322     0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,        /*      - 0xCF */
 323     0x03a0, 0x03a1, UNREPL, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,        /* 0xD0 -      */
 324     0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af,        /*      - 0xDF */
 325     0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7,        /* 0xE0 -      */
 326     0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf,        /*      - 0xEF */
 327     0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7,        /* 0xF0 -      */
 328     0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, UNREPL,        /*      - 0xFF */
 329 };
 330
 331 /* generated by ../tools/make_charset_ISO-8859-8 */
 332 const gunichar2 charset_table_iso_8859_8[0x80] = {
 333     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
 334     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
 335     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
 336     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
 337     0x00a0, UNREPL, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,        /* 0xA0 -      */
 338     0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,        /*      - 0xAF */
 339     0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
 340     0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be, UNREPL,        /*      - 0xBF */
 341     UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /* 0xC0 -      */
 342     UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /*      - 0xCF */
 343     UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /* 0xD0 -      */
 344     UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, 0x2017,        /*      - 0xDF */
 345     0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7,        /* 0xE0 -      */
 346     0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df,        /*      - 0xEF */
 347     0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7,        /* 0xF0 -      */
 348     0x05e8, 0x05e9, 0x05ea, UNREPL, UNREPL, 0x200e, 0x200f, UNREPL,        /*      - 0xFF */
 349 };
 350
 351 /* ISO-8859-9 (https://en.wikipedia.org/wiki/ISO/IEC_8859-9#Code_page_layout) */
 352 const gunichar2 charset_table_iso_8859_9[0x80] = {
 353     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
 354     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
 355     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
 356     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
 357     0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,        /* 0xA0 -      */
 358     0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,        /*      - 0xAF */
 359     0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
 360     0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,        /*      - 0xBF */
 361     0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,        /* 0xC0 -      */
 362     0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
 363     0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
 364     0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df,        /*      - 0xDF */
 365     0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,        /* 0xE0 -      */
 366     0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
 367     0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
 368     0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff         /*      - 0xFF */
 369 };
 370
 371 /* generated by ../tools/make_charset_ISO-8859-10 */
 372 const gunichar2 charset_table_iso_8859_10[0x80] = {
 373     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
 374     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
 375     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
 376     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
 377     0x00a0, 0x0104, 0x0112, 0x0122, 0x012a, 0x0128, 0x0136, 0x00a7,        /* 0xA0 -      */
 378     0x013b, 0x0110, 0x0160, 0x0166, 0x017d, 0x00ad, 0x016a, 0x014a,        /*      - 0xAF */
 379     0x00b0, 0x0105, 0x0113, 0x0123, 0x012b, 0x0129, 0x0137, 0x00b7,        /* 0xB0 -      */
 380     0x013c, 0x0111, 0x0161, 0x0167, 0x017e, 0x2015, 0x016b, 0x014b,        /*      - 0xBF */
 381     0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e,        /* 0xC0 -      */
 382     0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
 383     0x00d0, 0x0145, 0x014c, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0168,        /* 0xD0 -      */
 384     0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,        /*      - 0xDF */
 385     0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f,        /* 0xE0 -      */
 386     0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
 387     0x00f0, 0x0146, 0x014d, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x0169,        /* 0xF0 -      */
 388     0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x0138,        /*      - 0xFF */
 389 };
 390
 391 /* generated by ../tools/make_charset_ISO-8859-11 */
 392 const gunichar2 charset_table_iso_8859_11[0x80] = {
 393     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
 394     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
 395     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
 396     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
 397     0x00a0, 0x0e01, 0x0e02, 0x0e03, 0x0e04, 0x0e05, 0x0e06, 0x0e07,        /* 0xA0 -      */
 398     0x0e08, 0x0e09, 0x0e0a, 0x0e0b, 0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f,        /*      - 0xAF */
 399     0x0e10, 0x0e11, 0x0e12, 0x0e13, 0x0e14, 0x0e15, 0x0e16, 0x0e17,        /* 0xB0 -      */
 400     0x0e18, 0x0e19, 0x0e1a, 0x0e1b, 0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f,        /*      - 0xBF */
 401     0x0e20, 0x0e21, 0x0e22, 0x0e23, 0x0e24, 0x0e25, 0x0e26, 0x0e27,        /* 0xC0 -      */
 402     0x0e28, 0x0e29, 0x0e2a, 0x0e2b, 0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f,        /*      - 0xCF */
 403     0x0e30, 0x0e31, 0x0e32, 0x0e33, 0x0e34, 0x0e35, 0x0e36, 0x0e37,        /* 0xD0 -      */
 404     0x0e38, 0x0e39, 0x0e3a, UNREPL, UNREPL, UNREPL, UNREPL, 0x0e3f,        /*      - 0xDF */
 405     0x0e40, 0x0e41, 0x0e42, 0x0e43, 0x0e44, 0x0e45, 0x0e46, 0x0e47,        /* 0xE0 -      */
 406     0x0e48, 0x0e49, 0x0e4a, 0x0e4b, 0x0e4c, 0x0e4d, 0x0e4e, 0x0e4f,        /*      - 0xEF */
 407     0x0e50, 0x0e51, 0x0e52, 0x0e53, 0x0e54, 0x0e55, 0x0e56, 0x0e57,        /* 0xF0 -      */
 408     0x0e58, 0x0e59, 0x0e5a, 0x0e5b, UNREPL, UNREPL, UNREPL, UNREPL,        /*      - 0xFF */
 409 };
 410
 411 /* generated by ../tools/make_charset_ISO-8859-13 */
 412 const gunichar2 charset_table_iso_8859_13[0x80] = {
 413     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
 414     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
 415     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
 416     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
 417     0x00a0, 0x201d, 0x00a2, 0x00a3, 0x00a4, 0x201e, 0x00a6, 0x00a7,        /* 0xA0 -      */
 418     0x00d8, 0x00a9, 0x0156, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00c6,        /*      - 0xAF */
 419     0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x201c, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
 420     0x00f8, 0x00b9, 0x0157, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00e6,        /*      - 0xBF */
 421     0x0104, 0x012e, 0x0100, 0x0106, 0x00c4, 0x00c5, 0x0118, 0x0112,        /* 0xC0 -      */
 422     0x010c, 0x00c9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012a, 0x013b,        /*      - 0xCF */
 423     0x0160, 0x0143, 0x0145, 0x00d3, 0x014c, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
 424     0x0172, 0x0141, 0x015a, 0x016a, 0x00dc, 0x017b, 0x017d, 0x00df,        /*      - 0xDF */
 425     0x0105, 0x012f, 0x0101, 0x0107, 0x00e4, 0x00e5, 0x0119, 0x0113,        /* 0xE0 -      */
 426     0x010d, 0x00e9, 0x017a, 0x0117, 0x0123, 0x0137, 0x012b, 0x013c,        /*      - 0xEF */
 427     0x0161, 0x0144, 0x0146, 0x00f3, 0x014d, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
 428     0x0173, 0x0142, 0x015b, 0x016b, 0x00fc, 0x017c, 0x017e, 0x2019,        /*      - 0xFF */
 429 };
 430
 431 /* generated by ../tools/make_charset_ISO-8859-14 */
 432 const gunichar2 charset_table_iso_8859_14[0x80] = {
 433     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
 434     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
 435     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
 436     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
 437     0x00a0, 0x1e02, 0x1e03, 0x00a3, 0x010a, 0x010b, 0x1e0a, 0x00a7,        /* 0xA0 -      */
 438     0x1e80, 0x00a9, 0x1e82, 0x1e0b, 0x1ef2, 0x00ad, 0x00ae, 0x0178,        /*      - 0xAF */
 439     0x1e1e, 0x1e1f, 0x0120, 0x0121, 0x1e40, 0x1e41, 0x00b6, 0x1e56,        /* 0xB0 -      */
 440     0x1e81, 0x1e57, 0x1e83, 0x1e60, 0x1ef3, 0x1e84, 0x1e85, 0x1e61,        /*      - 0xBF */
 441     0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,        /* 0xC0 -      */
 442     0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
 443     0x0174, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x1e6a,        /* 0xD0 -      */
 444     0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x0176, 0x00df,        /*      - 0xDF */
 445     0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,        /* 0xE0 -      */
 446     0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
 447     0x0175, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x1e6b,        /* 0xF0 -      */
 448     0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x0177, 0x00ff,        /*      - 0xFF */
 449 };
 450
 451 /* generated by ../tools/make_charset_ISO-8859-15 */
 452 const gunichar2 charset_table_iso_8859_15[0x80] = {
 453     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
 454     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
 455     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
 456     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
 457     0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x20ac, 0x00a5, 0x0160, 0x00a7,        /* 0xA0 -      */
 458     0x0161, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,        /*      - 0xAF */
 459     0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x017d, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
 460     0x017e, 0x00b9, 0x00ba, 0x00bb, 0x0152, 0x0153, 0x0178, 0x00bf,        /*      - 0xBF */
 461     0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,        /* 0xC0 -      */
 462     0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
 463     0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
 464     0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,        /*      - 0xDF */
 465     0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,        /* 0xE0 -      */
 466     0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
 467     0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
 468     0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,        /*      - 0xFF */
 469 };
 470
 471 /* generated by ../tools/make_charset_ISO-8859-16 */
 472 const gunichar2 charset_table_iso_8859_16[0x80] = {
 473     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
 474     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
 475     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
 476     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
 477     0x00a0, 0x0104, 0x0105, 0x0141, 0x20ac, 0x201e, 0x0160, 0x00a7,        /* 0xA0 -      */
 478     0x0161, 0x00a9, 0x0218, 0x00ab, 0x0179, 0x00ad, 0x017a, 0x017b,        /*      - 0xAF */
 479     0x00b0, 0x00b1, 0x010c, 0x0142, 0x017d, 0x201d, 0x00b6, 0x00b7,        /* 0xB0 -      */
 480     0x017e, 0x010d, 0x0219, 0x00bb, 0x0152, 0x0153, 0x0178, 0x017c,        /*      - 0xBF */
 481     0x00c0, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0106, 0x00c6, 0x00c7,        /* 0xC0 -      */
 482     0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
 483     0x0110, 0x0143, 0x00d2, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x015a,        /* 0xD0 -      */
 484     0x0170, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0118, 0x021a, 0x00df,        /*      - 0xDF */
 485     0x00e0, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x0107, 0x00e6, 0x00e7,        /* 0xE0 -      */
 486     0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
 487     0x0111, 0x0144, 0x00f2, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x015b,        /* 0xF0 -      */
 488     0x0171, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0119, 0x021b, 0x00ff,        /*      - 0xFF */
 489 };
 490
/*
 * Windows-1250
 *
 * See:
 *     https://en.wikipedia.org/wiki/Windows-1250
 *     https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT
 *
 * Entry N is the UCS-2 code point for octet 0x80 + N.  UNREPL (the
 * Unicode REPLACEMENT CHARACTER) fills the positions for which this
 * table has no mapping.
 */
const gunichar2 charset_table_cp1250[0x80] = {
    0x20ac, UNREPL, 0x201a, UNREPL, 0x201e, 0x2026, 0x2020, 0x2021,        /* 0x80 -      */
    UNREPL, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179,        /*      - 0x8F */
    UNREPL, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,        /* 0x90 -      */
    UNREPL, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a,        /*      - 0x9F */
    0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7,        /* 0xA0 -      */
    0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b,        /*      - 0xAF */
    0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
    0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c,        /*      - 0xBF */
    0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,        /* 0xC0 -      */
    0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,        /*      - 0xCF */
    0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,        /* 0xD0 -      */
    0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,        /*      - 0xDF */
    0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,        /* 0xE0 -      */
    0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,        /*      - 0xEF */
    0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,        /* 0xF0 -      */
    0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,        /*      - 0xFF */
};
 516
/*
 * Windows-1251
 *
 * See:
 *     https://en.wikipedia.org/wiki/Windows-1251
 *     https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT
 *
 * Entry N is the UCS-2 code point for octet 0x80 + N.  UNREPL (the
 * Unicode REPLACEMENT CHARACTER) fills the one position, 0x98, for
 * which this table has no mapping.
 */
const gunichar2 charset_table_cp1251[0x80] = {
    0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,        /* 0x80 -      */
    0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040B, 0x040f,        /*      - 0x8F */
    0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,        /* 0x90 -      */
    UNREPL, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,        /*      - 0x9F */
    0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7,        /* 0xA0 -      */
    0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407,        /*      - 0xAF */
    0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
    0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457,        /*      - 0xBF */
    0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,        /* 0xC0 -      */
    0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,        /*      - 0xCF */
    0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,        /* 0xD0 -      */
    0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,        /*      - 0xDF */
    0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,        /* 0xE0 -      */
    0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,        /*      - 0xEF */
    0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,        /* 0xF0 -      */
    0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,        /*      - 0xFF */
};
 542
 543 /*
 544  * Windows-1252
 545  *
 546  * See:
 547  *     https://en.wikipedia.org/wiki/Windows-1252
 548  *     https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
 549  */
 550 const gunichar2 charset_table_cp1252[0x80] = {
 551     0x20ac, UNREPL, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,        /* 0x80 -      */
 552     0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, UNREPL, 0x0172, UNREPL,        /*      - 0x8F */
 553     UNREPL, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,        /* 0x90 -      */
 554     0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, UNREPL, 0x0173, 0x0178,        /*      - 0x9F */
 555     0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,        /* 0xA0 -      */
 556     0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,        /*      - 0xAF */
 557     0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
 558     0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,        /*      - 0xBF */
 559     0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,        /* 0xC0 -      */
 560     0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
 561     0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
 562     0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,        /*      - 0xDF */
 563     0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,        /* 0xE0 -      */
 564     0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
 565     0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
 566     0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,        /*      - 0xFF */
 567 };
 568
/* generated by ./make_charset_table MACROMAN */
/* That's "MacRoman", not "Macro Man" (faster than a speeding recursive expansion!) */
/*
 * Entry N is the UCS-2 code point for octet 0x80 + N; every position
 * in this table has a mapping, so UNREPL does not appear.
 */
const gunichar2 charset_table_mac_roman[0x80] = {
    0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1,        /* 0x80 -      */
    0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8,        /*      - 0x8F */
    0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3,        /* 0x90 -      */
    0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc,        /*      - 0x9F */
    0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df,        /* 0xA0 -      */
    0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8,        /*      - 0xAF */
    0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211,        /* 0xB0 -      */
    0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x00e6, 0x00f8,        /*      - 0xBF */
    0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab,        /* 0xC0 -      */
    0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153,        /*      - 0xCF */
    0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca,        /* 0xD0 -      */
    0x00ff, 0x0178, 0x2044, 0x20ac, 0x2039, 0x203a, 0xfb01, 0xfb02,        /*      - 0xDF */
    0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1,        /* 0xE0 -      */
    0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4,        /*      - 0xEF */
    0xf8ff, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc,        /* 0xF0 -      */
    0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7,        /*      - 0xFF */
};
 589
/* generated by ./make_charset_table CP437 */
/*
 * Entry N is the UCS-2 code point for octet 0x80 + N; every position
 * in this table has a mapping, so UNREPL does not appear.
 */
const gunichar2 charset_table_cp437[0x80] = {
    0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7,        /* 0x80 -      */
    0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,        /*      - 0x8F */
    0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9,        /* 0x90 -      */
    0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192,        /*      - 0x9F */
    0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba,        /* 0xA0 -      */
    0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,        /*      - 0xAF */
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,        /* 0xB0 -      */
    0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,        /*      - 0xBF */
    0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,        /* 0xC0 -      */
    0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,        /*      - 0xCF */
    0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b,        /* 0xD0 -      */
    0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,        /*      - 0xDF */
    0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4,        /* 0xE0 -      */
    0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229,        /*      - 0xEF */
    0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248,        /* 0xF0 -      */
    0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0,        /*      - 0xFF */
};
 609
 610 /*
 611  * CP855
 612  *
 613  * See
 614  *     https://en.wikipedia.org/wiki/CP855
 615  *     https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP855.TXT
 616  *
 617  * XXX - this doesn't have the graphics for 0x00 through 0x1F shown
 618  * on the Wikipedia page, but not in the Microsoft mapping file;
 619  * that would require a 256-code-point mapping table.  (Are those
 620  * positions used for the same graphics on all code pages - the PC
 621  * graphics set, or whatever it's called?)
 622  */
 623 const gunichar2 charset_table_cp855[0x80] = {
 624     0x0452, 0x0402, 0x0453, 0x0403, 0x0451, 0x0401, 0x0454, 0x0404,        /* 0x80 -      */
 625     0x0455, 0x0405, 0x0456, 0x0406, 0x0457, 0x0407, 0x0458, 0x0408,        /*      - 0x8F */
 626     0x0459, 0x0409, 0x045a, 0x040a, 0x045b, 0x040b, 0x045c, 0x040c,        /* 0x90 -      */
 627     0x045e, 0x040e, 0x045f, 0x040f, 0x044e, 0x042e, 0x044a, 0x042a,        /*      - 0x9F */
 628     0x0430, 0x0410, 0x0431, 0x0411, 0x0446, 0x0426, 0x0434, 0x0414,        /* 0xA0 -      */
 629     0x0435, 0x0415, 0x0444, 0x0424, 0x0433, 0x0413, 0x00ab, 0x00bb,        /*      - 0xAF */
 630     0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x0445, 0x0425, 0x0438,        /* 0xB0 -      */
 631     0x0418, 0x2563, 0x2551, 0x2557, 0x2550, 0x0439, 0x0419, 0x2510,        /*      - 0xBF */
 632     0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x043a, 0x041a,        /* 0xC0 -      */
 633     0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,        /*      - 0xCF */
 634     0x043b, 0x041b, 0x043c, 0x041c, 0x043d, 0x041d, 0x043e, 0x041e,        /* 0xD0 -      */
 635     0x043f, 0x2518, 0x250c, 0x2588, 0x2584, 0x041f, 0x044f, 0x2580,        /*      - 0xDF */
 636     0x042f, 0x0440, 0x0420, 0x0441, 0x0421, 0x0442, 0x0422, 0x0443,        /* 0xE0 -      */
 637     0x0423, 0x0436, 0x0416, 0x0432, 0x0412, 0x044c, 0x042c, 0x2116,        /*      - 0xEF */
 638     0x00ad, 0x044b, 0x042b, 0x0437, 0x0417, 0x0448, 0x0428, 0x044d,        /* 0xF0 -      */
 639     0x042d, 0x0449, 0x0429, 0x0447, 0x0427, 0x00a7, 0x25a0, 0x00a0,        /*      - 0xFF */
 640 };
 641
 642 /*
 643  * CP866
 644  *
 645  * See:
 646  *     https://en.wikipedia.org/wiki/CP866
 647  *     https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT
 648  */
 649 const gunichar2 charset_table_cp866[0x80] = {
 650     0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,        /* 0x80 -      */
 651     0x0418, 0x0419, 0x041A, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,        /*      - 0x8F */
 652     0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,        /* 0x90 -      */
 653     0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,        /*      - 0x9F */
 654     0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,        /* 0xA0 -      */
 655     0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,        /*      - 0xAF */
 656     0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,        /* 0xB0 -      */
 657     0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,        /*      - 0xBF */
 658     0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,        /* 0xC0 -      */
 659     0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,        /*      - 0xCF */
 660     0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b,        /* 0xD0 -      */
 661     0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,        /*      - 0xDF */
 662     0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,        /* 0xE0 -      */
 663     0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,        /*      - 0xEF */
 664     0x0401, 0x0451, 0x0404, 0x0454, 0x0407, 0x0457, 0x040e, 0x045e,        /* 0xF0 -      */
 665     0x00b0, 0x2219, 0x00b7, 0x221a, 0x2216, 0x00a4, 0x25a0, 0x00a0,        /*      - 0xFF */
 666 };
 667
 668 /*
 669  * Given a wmem scope, a pointer, a length, and a translation table with
 670  * 128 entries, treat the string of bytes referred to by the pointer and
 671  * length as a string encoded using one octet per character, with octets
 672  * with the high-order bit clear being ASCII and octets with the high-order
 673  * bit set being mapped by the translation table to 2-byte Unicode Basic
 674  * Multilingual Plane characters (including REPLACEMENT CHARACTER), and
 675  * return a pointer to a UTF-8 string, allocated using the wmem scope.
 676  */
 677 uint8_t *
 678 get_unichar2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[0x80])
 679 {
 680     wmem_strbuf_t *str;
 681
 682     str = wmem_strbuf_new_sized(scope, length+1);
 683
 684     while (length > 0) {
 685         uint8_t ch = *ptr;
 686
 687         if (ch < 0x80)
 688             wmem_strbuf_append_c(str, ch);
 689         else
 690             wmem_strbuf_append_unichar(str, table[ch-0x80]);
 691         ptr++;
 692         length--;
 693     }
 694
 695     return (uint8_t *) wmem_strbuf_finalize(str);
 696 }
 697
 698 /*
 699  * Given a wmem scope, a pointer, and a length, treat the string of bytes
 700  * referred to by the pointer and length as a UCS-2 encoded string
 701  * containing characters from the Basic Multilingual Plane (plane 0) of
 702  * Unicode, and return a pointer to a UTF-8 string, allocated with the
 703  * wmem scope.
 704  *
 705  * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
 706  * possibly ORed with ENC_BOM.
 707  *
 708  * Specify length in bytes.
 709  */
 710 uint8_t *
 711 get_ucs_2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding)
 712 {
 713     gunichar2      uchar;
 714     int            i = 0;       /* Byte counter for string */
 715     wmem_strbuf_t *strbuf;
 716
 717     strbuf = wmem_strbuf_new_sized(scope, length+1);
 718
 719     if (encoding & ENC_BOM && length >= 2) {
 720         if (pletoh16(ptr) == BYTE_ORDER_MARK) {
 721             encoding = ENC_LITTLE_ENDIAN;
 722             i += 2;
 723         } else if (pntoh16(ptr) == BYTE_ORDER_MARK) {
 724             encoding = ENC_BIG_ENDIAN;
 725             i += 2;
 726         }
 727     }
 728
 729     encoding = encoding & ENC_LITTLE_ENDIAN;
 730
 731     for(; i + 1 < length; i += 2) {
 732         if (encoding == ENC_BIG_ENDIAN) {
 733             uchar = pntoh16(ptr + i);
 734         } else {
 735             uchar = pletoh16(ptr + i);
 736         }
 737         wmem_strbuf_append_unichar_validated(strbuf, uchar);
 738     }
 739
 740     /*
 741      * If i < length, this means we were handed an odd number of bytes;
 742      * insert a REPLACEMENT CHARACTER to mark the error.
 743      */
 744     if (i < length) {
 745         wmem_strbuf_append_unichar_repl(strbuf);
 746     }
 747     return (uint8_t *) wmem_strbuf_finalize(strbuf);
 748 }
 749
 750 /*
 751  * Given a wmem scope, a pointer, and a length, treat the string of bytes
 752  * referred to by the pointer and length as a UTF-16 encoded string, and
 753  * return a pointer to a UTF-8 string, allocated with the wmem scope.
 754  *
 755  * See RFC 2781 section 2.2.
 756  *
 757  * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
 758  * possibly ORed with ENC_BOM.
 759  *
 760  * Specify length in bytes.
 761  */
 762 uint8_t *
 763 get_utf_16_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding)
 764 {
 765     wmem_strbuf_t *strbuf;
 766     gunichar2      uchar2, lead_surrogate;
 767     gunichar       uchar;
 768     int            i = 0;       /* Byte counter for string */
 769
 770     strbuf = wmem_strbuf_new_sized(scope, length+1);
 771
 772     if (encoding & ENC_BOM && length >= 2) {
 773         if (pletoh16(ptr) == BYTE_ORDER_MARK) {
 774             encoding = ENC_LITTLE_ENDIAN;
 775             i += 2;
 776         } else if (pntoh16(ptr) == BYTE_ORDER_MARK) {
 777             encoding = ENC_BIG_ENDIAN;
 778             i += 2;
 779         }
 780     }
 781
 782     encoding = encoding & ENC_LITTLE_ENDIAN;
 783
 784     for(; i + 1 < length; i += 2) {
 785         if (encoding == ENC_BIG_ENDIAN)
 786             uchar2 = pntoh16(ptr + i);
 787         else
 788             uchar2 = pletoh16(ptr + i);
 789
 790         if (IS_LEAD_SURROGATE(uchar2)) {
 791             /*
 792              * Lead surrogate.  Must be followed by
 793              * a trail surrogate.
 794              */
 795             i += 2;
 796             if (i + 1 >= length) {
 797                 /*
 798                  * Oops, string ends with a lead surrogate.
 799                  *
 800                  * Insert a REPLACEMENT CHARACTER to mark the error,
 801                  * and quit.
 802                  */
 803                 wmem_strbuf_append_unichar(strbuf, UNREPL);
 804                 break;
 805             }
 806             lead_surrogate = uchar2;
 807             if (encoding == ENC_BIG_ENDIAN)
 808                 uchar2 = pntoh16(ptr + i);
 809             else
 810                 uchar2 = pletoh16(ptr + i);
 811             if (IS_TRAIL_SURROGATE(uchar2)) {
 812                 /* Trail surrogate. */
 813                 uchar = SURROGATE_VALUE(lead_surrogate, uchar2);
 814                 wmem_strbuf_append_unichar(strbuf, uchar);
 815             } else {
 816                 /*
 817                  * Not a trail surrogate.
 818                  *
 819                  * Insert a REPLACEMENT CHARACTER to mark the error,
 820                  * and continue;
 821                  */
 822                 wmem_strbuf_append_unichar(strbuf, UNREPL);
 823             }
 824         } else {
 825             if (IS_TRAIL_SURROGATE(uchar2)) {
 826                 /*
 827                  * Trail surrogate without a preceding
 828                  * lead surrogate.
 829                  *
 830                  * Insert a REPLACEMENT CHARACTER to mark the error,
 831                  * and continue;
 832                  */
 833                 wmem_strbuf_append_unichar(strbuf, UNREPL);
 834             } else {
 835                 /*
 836                  * Non-surrogate; just append it.
 837                  */
 838                 wmem_strbuf_append_unichar(strbuf, uchar2);
 839             }
 840         }
 841     }
 842
 843     /*
 844      * If i < length, this means we were handed an odd number of bytes,
 845      * so we're not a valid UTF-16 string; insert a REPLACEMENT CHARACTER
 846      * to mark the error.
 847      */
 848     if (i < length)
 849         wmem_strbuf_append_unichar(strbuf, UNREPL);
 850     return (uint8_t *) wmem_strbuf_finalize(strbuf);
 851 }
 852
 853 /*
 854  * Given a wmem scope, a pointer, and a length, treat the string of bytes
 855  * referred to by the pointer and length as a UCS-4 encoded string, and
 856  * return a pointer to a UTF-8 string, allocated with the wmem scope.
 857  *
 858  * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
 859  *
 860  * Specify length in bytes
 861  */
 862 uint8_t *
 863 get_ucs_4_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding)
 864 {
 865     gunichar       uchar;
 866     int            i = 0;       /* Byte counter for string */
 867     wmem_strbuf_t *strbuf;
 868
 869     strbuf = wmem_strbuf_new_sized(scope, length+1);
 870
 871     if (encoding & ENC_BOM && length >= 4) {
 872         if (pletoh32(ptr) == BYTE_ORDER_MARK) {
 873             encoding = ENC_LITTLE_ENDIAN;
 874             i += 4;
 875         } else if (pntoh32(ptr) == BYTE_ORDER_MARK) {
 876             encoding = ENC_BIG_ENDIAN;
 877             i += 4;
 878         }
 879     }
 880
 881     encoding = encoding & ENC_LITTLE_ENDIAN;
 882
 883     for(; i + 3 < length; i += 4) {
 884         if (encoding == ENC_BIG_ENDIAN)
 885             uchar = pntoh32(ptr + i);
 886         else
 887             uchar = pletoh32(ptr + i);
 888
 889         wmem_strbuf_append_unichar_validated(strbuf, uchar);
 890     }
 891
 892     /*
 893      * if i < length, this means we were handed a number of bytes
 894      * that's not a multiple of 4, so not a valid UCS-4 string.
 895      * Insert a REPLACEMENT CHARACTER for the remaining bytes.
 896      */
 897     if (i < length) {
 898         wmem_strbuf_append_unichar(strbuf, UNREPL);
 899     }
 900     return (uint8_t *)wmem_strbuf_finalize(strbuf);
 901 }
 902
/*
 * FROM GNOKII
 * gsm-encoding.c
 * gsm-sms.c
 */

/*
 * ETSI GSM 03.38, version 6.0.1, section 6.2.1; Default alphabet
 *
 * Indexed by the 7-bit GSM code point; each entry is the corresponding
 * Unicode code point.  The entry at index 0x1b is 0xa0, but
 * handle_ts_23_038_char() recognizes 0x1b as the escape code before it
 * ever consults this table, so callers going through that function
 * never see that entry.
 */
static const gunichar2 gsm_default_alphabet[0x80] = {
    '@',   0xa3,  '$',   0xa5,  0xe8,  0xe9,  0xf9,  0xec,
    0xf2,  0xc7,  '\n',  0xd8,  0xf8,  '\r',  0xc5,  0xe5,
    0x394, '_',   0x3a6, 0x393, 0x39b, 0x3a9, 0x3a0, 0x3a8,
    0x3a3, 0x398, 0x39e, 0xa0,  0xc6,  0xe6,  0xdf,  0xc9,
    ' ',   '!',   '\"',  '#',   0xa4,  '%',   '&',   '\'',
    '(',   ')',   '*',   '+',   ',',   '-',   '.',   '/',
    '0',   '1',   '2',   '3',   '4',   '5',   '6',   '7',
    '8',   '9',   ':',   ';',   '<',   '=',   '>',   '?',
    0xa1,  'A',   'B',   'C',   'D',   'E',   'F',   'G',
    'H',   'I',   'J',   'K',   'L',   'M',   'N',   'O',
    'P',   'Q',   'R',   'S',   'T',   'U',   'V',   'W',
    'X',   'Y',   'Z',   0xc4,  0xd6,  0xd1,  0xdc,  0xa7,
    0xbf,  'a',   'b',   'c',   'd',   'e',   'f',   'g',
    'h',   'i',   'j',   'k',   'l',   'm',   'n',   'o',
    'p',   'q',   'r',   's',   't',   'u',   'v',   'w',
    'x',   'y',   'z',   0xe4,  0xf6,  0xf1,  0xfc,  0xe0
};
 928
 929 static gunichar
 930 GSM_to_UNICHAR(uint8_t c)
 931 {
 932     if (c < G_N_ELEMENTS(gsm_default_alphabet))
 933         return gsm_default_alphabet[c];
 934
 935     return UNREPL;
 936 }
 937
 938 static gunichar
 939 GSMext_to_UNICHAR(uint8_t c)
 940 {
 941     switch (c)
 942     {
 943         case 0x0a: return 0x0c; /* form feed */
 944         case 0x14: return '^';
 945         case 0x28: return '{';
 946         case 0x29: return '}';
 947         case 0x2f: return '\\';
 948         case 0x3c: return '[';
 949         case 0x3d: return '~';
 950         case 0x3e: return ']';
 951         case 0x40: return '|';
 952         case 0x65: return 0x20ac; /* euro */
 953     }
 954
 955     return UNREPL; /* invalid character */
 956 }
 957
 958 #define GN_BYTE_MASK ((1 << bits) - 1)
 959
 960 #define GN_CHAR_ESCAPE 0x1b
 961
 962 static bool
 963 char_is_escape(unsigned char value)
 964 {
 965     return (value == GN_CHAR_ESCAPE);
 966 }
 967
 968 static bool
 969 handle_ts_23_038_char(wmem_strbuf_t *strbuf, uint8_t code_point,
 970                       bool saw_escape)
 971 {
 972     gunichar       uchar;
 973
 974     if (char_is_escape(code_point)) {
 975         /*
 976          * XXX - if saw_escape is true here, then this is
 977          * the case where we escape to "another extension table",
 978          * but TS 128 038 V11.0 doesn't specify such an extension
 979          * table.
 980          */
 981         saw_escape = true;
 982     } else {
 983         if (!(code_point & 0x80)) {
 984             /*
 985              * Code point is valid (7-bit).
 986              * Have we seen an escape?
 987              */
 988             if (saw_escape) {
 989                 saw_escape = false;
 990                 uchar = GSMext_to_UNICHAR(code_point);
 991             } else {
 992                 uchar = GSM_to_UNICHAR(code_point);
 993             }
 994             wmem_strbuf_append_unichar(strbuf, uchar);
 995         } else {
 996             /* Invalid - put in a REPLACEMENT CHARACTER */
 997             wmem_strbuf_append_unichar(strbuf, UNREPL);
 998         }
 999     }
1000     return saw_escape;
1001 }
1002
/*
 * Decode "no_of_chars" packed 7-bit TS 23.038 code points, starting at
 * "ptr", into a UTF-8 string allocated in the wmem scope "scope".
 *
 * The low-order 3 bits of "bit_offset" select how many bits of the
 * first octet belong to the first code point; 0 is treated as 7 (a
 * code point starting on an octet boundary).
 *
 * Each code point is passed to handle_ts_23_038_char(), so escapes,
 * extension-table characters and invalid values are handled there.
 * An escape that is the last code point is shown as a REPLACEMENT
 * CHARACTER.
 *
 * There is no byte-length argument: the loop reads as many octets as
 * it needs to produce "no_of_chars" code points, so the caller must
 * make sure that many octets are available.
 */
uint8_t *
get_ts_23_038_7bits_string_packed(wmem_allocator_t *scope, const uint8_t *ptr,
                                  const int bit_offset, int no_of_chars)
{
    wmem_strbuf_t *strbuf;
    int            char_count;                  /* character counter for string */
    uint8_t        in_byte, out_byte, rest = 0x00;
    const uint8_t *start_ptr = ptr;
    bool           saw_escape = false;
    int            bits;                        /* also read by GN_BYTE_MASK */

    strbuf = wmem_strbuf_new_sized(scope, no_of_chars+1);

    bits = bit_offset & 0x07;
    if (!bits) {
        bits = 7;
    }

    /* One input octet per iteration; char_count advances inside the body. */
    for(char_count = 0; char_count < no_of_chars; ptr++) {
        /* Get the next byte from the string. */
        in_byte = *ptr;

        /*
         * Combine the bits we've accumulated with bits from
         * that byte to make a 7-bit code point.
         */
        out_byte = ((in_byte & GN_BYTE_MASK) << (7 - bits)) | rest;

        /*
         * Leftover bits used in that code point.
         */
        rest = in_byte >> bits;

        /*
         * On the very first octet with a non-zero bit offset, out_byte
         * is not emitted; only the bits saved in "rest" are carried
         * over into the code point completed by the next octet.
         * In every other case out_byte is a code point, so emit it.
         */
        if ((start_ptr != ptr) || (bits == 7)) {
            saw_escape = handle_ts_23_038_char(strbuf, out_byte,
                saw_escape);
            char_count++;
        }

        /*
         * After reading 7 octets we have read 7 full characters
         * but we have 7 bits as well. This is the next character.
         */
        if ((bits == 1) && (char_count < no_of_chars)) {
            saw_escape = handle_ts_23_038_char(strbuf, rest,
                saw_escape);
            char_count++;
            bits = 7;
            rest = 0x00;
        } else {
            bits--;
        }
    }

    if (saw_escape) {
        /*
         * Escape not followed by anything.
         *
         * XXX - for now, show the escape as a REPLACEMENT
         * CHARACTER.
         */
        wmem_strbuf_append_unichar(strbuf, UNREPL);
    }

    return (uint8_t *)wmem_strbuf_finalize(strbuf);
}
1074
1075 uint8_t *
1076 get_ts_23_038_7bits_string_unpacked(wmem_allocator_t *scope, const uint8_t *ptr,
1077                            int length)
1078 {
1079     wmem_strbuf_t *strbuf;
1080     int            i;       /* Byte counter for string */
1081     bool           saw_escape = false;
1082
1083     strbuf = wmem_strbuf_new_sized(scope, length+1);
1084
1085     for (i = 0; i < length; i++)
1086         saw_escape = handle_ts_23_038_char(strbuf, *ptr++, saw_escape);
1087
1088     return (uint8_t *)wmem_strbuf_finalize(strbuf);
1089 }
1090
1091 /*
1092  * ETSI TS 102 221 Annex A.
1093  */
1094 uint8_t *
1095 get_etsi_ts_102_221_annex_a_string(wmem_allocator_t *scope, const uint8_t *ptr,
1096                                    int length)
1097 {
1098     uint8_t        string_type;
1099     uint8_t        string_len;
1100     gunichar2      ucs2_base;
1101     wmem_strbuf_t *strbuf;
1102     unsigned       i;       /* Byte counter for string */
1103     bool           saw_escape = false;
1104
1105     /*
1106      * get the first octet.
1107      */
1108     if (length == 0) {
1109         /* XXX - return error indication */
1110         strbuf = wmem_strbuf_new(scope, "");
1111         return (uint8_t *)wmem_strbuf_finalize(strbuf);
1112     }
1113     string_type = *ptr;
1114     ptr++;
1115     length--;
1116
1117     if (string_type == 0x80) {
1118         /*
1119          * Annex A, coding scheme 1) - big-endian UCS-2.
1120          */
1121         return get_ucs_2_string(scope, ptr, length, ENC_BIG_ENDIAN);
1122     }
1123
1124     /*
1125      * Annex A, coding schemes 2) and 3):
1126      *
1127      *    the second byte is the number of characters (characters,
1128      *    not octets) in the string;
1129      *
1130      *    for coding scheme 2), the third byte defines bits 15 to 8
1131      *    of all UCS-2 characters in the string (all bit numbers are
1132      *    1-origin, so bit 1 is the low-order bit), with bit 16 being 0;
1133      *
1134      *    for coding scheme 3), the third byte and fourth bytes, treated
1135      *    as a big-endian value, define the base value for all UCS-2
1136      *    characters in the string;
1137      *
1138      *    for all subsequent bytes, if bit 8 is 0, it's a character
1139      *    in the GSM Default Alphabet, otherwise, it is added to
1140      *    the UCS-2 base value to give a UCS-2 character.
1141      *
1142      * XXX - that doesn't seem to indicate that a byte of 0x1b is
1143      * treated as an escape character, it just says that a single octet
1144      * with the 8th bit not set is a GSM Default Alphabet character.
1145      */
1146
1147     /*
1148      * Get the string length, in characters.
1149      */
1150     if (length == 0) {
1151         /* XXX - return error indication */
1152         strbuf = wmem_strbuf_new(scope, "");
1153         return (uint8_t *)wmem_strbuf_finalize(strbuf);
1154     }
1155     string_len = *ptr;
1156     ptr++;
1157     length--;
1158
1159     strbuf = wmem_strbuf_new_sized(scope, 2*string_len+1);
1160
1161     /*
1162      * Get the UCS-2 base.
1163      */
1164     if (string_type == 0x81) {
1165         if (length == 0) {
1166             /* XXX - return error indication */
1167             return (uint8_t *)wmem_strbuf_finalize(strbuf);
1168         }
1169         ucs2_base = (*ptr) << 7;
1170         ptr++;
1171         length--;
1172     } else if (string_type == 0x82) {
1173         if (length == 0) {
1174             /* XXX - return error indication */
1175             return (uint8_t *)wmem_strbuf_finalize(strbuf);
1176         }
1177         ucs2_base = (*ptr) << 8;
1178         ptr++;
1179         length--;
1180
1181         if (length == 0) {
1182             /* XXX - return error indication */
1183             return (uint8_t *)wmem_strbuf_finalize(strbuf);
1184         }
1185         ucs2_base |= *ptr;
1186         ptr++;
1187         length--;
1188     } else {
1189         /* Invalid string type. */
1190         /* XXX - return error indication */
1191         return (uint8_t *)wmem_strbuf_finalize(strbuf);
1192     }
1193
1194     for (i = 0; i < string_len; i++) {
1195         uint8_t byte;
1196
1197         if (length == 0) {
1198             /* XXX - return error indication */
1199             return (uint8_t *)wmem_strbuf_finalize(strbuf);
1200         }
1201         byte = *ptr;
1202         if ((byte & 0x80) == 0) {
1203             saw_escape = handle_ts_23_038_char(strbuf, byte, saw_escape);
1204         } else {
1205             gunichar2 uchar;
1206
1207             /*
1208              * XXX - if saw_escape is true, this is bogus.
1209              *
1210              * XXX - if there are an odd number of bytes, should put a
1211              * REPLACEMENT CHARACTER at the end.
1212              */
1213             uchar = ucs2_base + (byte & 0x7f);
1214             wmem_strbuf_append_unichar_validated(strbuf, uchar);
1215         }
1216     }
1217
1218     return (uint8_t *)wmem_strbuf_finalize(strbuf);
1219 }
1220
1221 uint8_t *
1222 get_ascii_7bits_string(wmem_allocator_t *scope, const uint8_t *ptr,
1223                        const int bit_offset, int no_of_chars)
1224 {
1225     wmem_strbuf_t *strbuf;
1226     int            char_count;                  /* character counter for string */
1227     uint8_t        in_byte, out_byte, rest = 0x00;
1228     const uint8_t *start_ptr = ptr;
1229     int            bits;
1230
1231     bits = bit_offset & 0x07;
1232     if (!bits) {
1233         bits = 7;
1234     }
1235
1236     strbuf = wmem_strbuf_new_sized(scope, no_of_chars+1);
1237     for(char_count = 0; char_count < no_of_chars; ptr++) {
1238         /* Get the next byte from the string. */
1239         in_byte = *ptr;
1240
1241         /*
1242          * Combine the bits we've accumulated with bits from
1243          * that byte to make a 7-bit code point.
1244          */
1245         out_byte = (in_byte >> (8 - bits)) | rest;
1246
1247         /*
1248          * Leftover bits used in that code point.
1249          */
1250         rest = (in_byte << (bits - 1)) & 0x7f;
1251
1252         /*
1253          * If we don't start from 0th bit, we shouldn't go to the
1254          * next char. Under *out_num we have now 0 and under Rest -
1255          * _first_ part of the char.
1256          */
1257         if ((start_ptr != ptr) || (bits == 7)) {
1258             wmem_strbuf_append_c(strbuf, out_byte);
1259             char_count++;
1260         }
1261
1262         /*
1263          * After reading 7 octets we have read 7 full characters
1264          * but we have 7 bits as well. This is the next character.
1265          */
1266         if ((bits == 1) && (char_count < no_of_chars)) {
1267             wmem_strbuf_append_c(strbuf, rest);
1268             char_count++;
1269             bits = 7;
1270             rest = 0x00;
1271         } else {
1272             bits--;
1273         }
1274     }
1275
1276     return (uint8_t *)wmem_strbuf_finalize(strbuf);
1277 }
1278
1279 /* Tables for EBCDIC code pages */
1280
/* EBCDIC common; based on the table in appendix H of ESA/370 Principles
   of Operation, but with some code points that don't correspond to
   the same characters in code pages 037 and 1158 mapped to REPLACEMENT
   CHARACTER - there may be more code points of that sort */

/* There are a few EBCDIC control codes that, strictly speaking, do not
 * map to any control codes in ASCII or Unicode for that matter. The
 * customary treatment is to map them in a particular way to ASCII C1
 * control codes that have no exact equivalent in EBCDIC, as below.
 *
 * The table has one entry per octet value (0x00-0xFF, eight entries per
 * row); entry N is the Unicode code point for EBCDIC octet N, and UNREPL
 * (REPLACEMENT CHARACTER) marks octets that are left unmapped here. */
const gunichar2 charset_table_ebcdic[256] = {
    0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f,
    0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087,
    0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f,
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b,
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007,
    UNREPL, UNREPL, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004,
    0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, UNREPL, 0x001a,
    0x0020, 0x00a0, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, UNREPL, UNREPL, 0x002e, 0x003c, 0x0028, 0x002b, UNREPL,
    0x0026, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, UNREPL, UNREPL, 0x0024, 0x002a, 0x0029, 0x003b, UNREPL,
    0x002d, 0x002f, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, UNREPL, UNREPL, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f,
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022,
    UNREPL, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070,
    0x0071, 0x0072, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078,
    0x0079, 0x007a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
    0x0051, 0x0052, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    0x005c, UNREPL, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058,
    0x0059, 0x005a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
};
1324
/* EBCDIC code page 037
 *
 * One entry per octet value (0x00-0xFF, eight entries per row); entry N
 * is the Unicode code point for octet N.  Every octet is mapped: this
 * table contains no REPLACEMENT CHARACTER entries. */
const gunichar2 charset_table_ebcdic_cp037[256] = {
    0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f,
    0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087,
    0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f,
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b,
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007,
    0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004,
    0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, 0x009e, 0x001a,
    0x0020, 0x00a0, 0x00e2, 0x00e4, 0x00e0, 0x00e1, 0x00e3, 0x00e5,
    0x00e7, 0x00f1, 0x00a2, 0x002e, 0x003c, 0x0028, 0x002b, 0x007c,
    0x0026, 0x00e9, 0x00ea, 0x00eb, 0x00e8, 0x00ed, 0x00ee, 0x00ef,
    0x00ec, 0x00df, 0x0021, 0x0024, 0x002a, 0x0029, 0x003b, 0x00ac,
    0x002d, 0x002f, 0x00c2, 0x00c4, 0x00c0, 0x00c1, 0x00c3, 0x00c5,
    0x00c7, 0x00d1, 0x00a6, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f,
    0x00f8, 0x00c9, 0x00ca, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf,
    0x00cc, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022,
    0x00d8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, 0x00ab, 0x00bb, 0x00f0, 0x00fd, 0x00fe, 0x00b1,
    0x00b0, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070,
    0x0071, 0x0072, 0x00aa, 0x00ba, 0x00e6, 0x00b8, 0x00c6, 0x00a4,
    0x00b5, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078,
    0x0079, 0x007a, 0x00a1, 0x00bf, 0x00d0, 0x00dd, 0x00de, 0x00ae,
    0x005e, 0x00a3, 0x00a5, 0x00b7, 0x00a9, 0x00a7, 0x00b6, 0x00bc,
    0x00bd, 0x00be, 0x005b, 0x005d, 0x00af, 0x00a8, 0x00b4, 0x00d7,
    0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x00ad, 0x00f4, 0x00f6, 0x00f2, 0x00f3, 0x00f5,
    0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
    0x0051, 0x0052, 0x00b9, 0x00fb, 0x00fc, 0x00f9, 0x00fa, 0x00ff,
    0x005c, 0x00f7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058,
    0x0059, 0x005a, 0x00b2, 0x00d4, 0x00d6, 0x00d2, 0x00d3, 0x00d5,
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x00b3, 0x00db, 0x00dc, 0x00d9, 0x00da, 0x009f,
};
1360
/* EBCDIC code page 500
 * https://www.ibm.com/support/pages/conversion-character-differences-between-ccsid-037-and-ccsid-500
 * CCSID 500 ("International Latin-1") has exactly the same repertoire as 37,
 * covering all of ISO-8859-1, but with seven code points permuted.
 * It is notable because it is the default code page for DRDA:
 * https://www.ibm.com/support/pages/drda-user-id-and-password-not-being-transmitted-correctly-when-containing-characters-%C2%AC-%C2%A2?lnk=hm
 *
 * One entry per octet value (0x00-0xFF, eight entries per row); entry N
 * is the Unicode code point for octet N.  Compared with the code page 037
 * table in this file, the entries that differ are those for octets
 * 0x4A, 0x4F, 0x5A, 0x5F, 0xB0, 0xBA and 0xBB.
 */
const gunichar2 charset_table_ebcdic_cp500[256] = {
    0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f,
    0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087,
    0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f,
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b,
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007,
    0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004,
    0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, 0x009e, 0x001a,
    0x0020, 0x00a0, 0x00e2, 0x00e4, 0x00e0, 0x00e1, 0x00e3, 0x00e5,
    0x00e7, 0x00f1, 0x005b, 0x002e, 0x003c, 0x0028, 0x002b, 0x0021,
    0x0026, 0x00e9, 0x00ea, 0x00eb, 0x00e8, 0x00ed, 0x00ee, 0x00ef,
    0x00ec, 0x00df, 0x005d, 0x0024, 0x002a, 0x0029, 0x003b, 0x005e,
    0x002d, 0x002f, 0x00c2, 0x00c4, 0x00c0, 0x00c1, 0x00c3, 0x00c5,
    0x00c7, 0x00d1, 0x00a6, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f,
    0x00f8, 0x00c9, 0x00ca, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf,
    0x00cc, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022,
    0x00d8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, 0x00ab, 0x00bb, 0x00f0, 0x00fd, 0x00fe, 0x00b1,
    0x00b0, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070,
    0x0071, 0x0072, 0x00aa, 0x00ba, 0x00e6, 0x00b8, 0x00c6, 0x00a4,
    0x00b5, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078,
    0x0079, 0x007a, 0x00a1, 0x00bf, 0x00d0, 0x00dd, 0x00de, 0x00ae,
    0x00a2, 0x00a3, 0x00a5, 0x00b7, 0x00a9, 0x00a7, 0x00b6, 0x00bc,
    0x00bd, 0x00be, 0x00ac, 0x007c, 0x00af, 0x00a8, 0x00b4, 0x00d7,
    0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x00ad, 0x00f4, 0x00f6, 0x00f2, 0x00f3, 0x00f5,
    0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
    0x0051, 0x0052, 0x00b9, 0x00fb, 0x00fc, 0x00f9, 0x00fa, 0x00ff,
    0x005c, 0x00f7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058,
    0x0059, 0x005a, 0x00b2, 0x00d4, 0x00d6, 0x00d2, 0x00d3, 0x00d5,
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x00b3, 0x00db, 0x00dc, 0x00d9, 0x00da, 0x009f,
};
1402
1403 /*
1404  * Given a wmem scope, a pointer, a length, and a translation table with
1405  * 256 entries, treat the string of bytes referred to by the pointer and
1406  * length as a string encoded using one octet per character, with octets
1407  * being mapped by the translation table to 2-byte Unicode Basic Multilingual
1408  * Plane characters (including REPLACEMENT CHARACTER), and return a
1409  * pointer to a UTF-8 string, allocated using the wmem scope.
1410  */
1411 uint8_t *
1412 get_nonascii_unichar2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[256])
1413 {
1414     wmem_strbuf_t *str;
1415
1416     str = wmem_strbuf_new_sized(scope, length+1);
1417
1418     while (length > 0) {
1419         uint8_t ch = *ptr;
1420
1421         wmem_strbuf_append_unichar(str, table[ch]);
1422         ptr++;
1423         length--;
1424     }
1425
1426     return (uint8_t *) wmem_strbuf_finalize(str);
1427 }
1428
1429 /*
1430  * Given a wmem scope, a pointer, a length, and a string referring to an
1431  * encoding (recognized by iconv), treat the bytes referred to by the pointer
1432  * and length as a string in that encoding, and return a pointer to a UTF-8
1433  * string, allocated using the wmem scope, converted from the original
1434  * encoding having substituted REPLACEMENT CHARACTER according to the
1435  * Unicode Standard 5.22 U+FFFD Substitution for Conversion
1436  * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1437  */
1438 static uint8_t *
1439 get_string_enc_iconv(wmem_allocator_t *scope, const uint8_t *ptr, int length, const char *encoding)
1440 {
1441     GIConv cd;
1442     size_t inbytes, outbytes;
1443     size_t tempstr_size, bytes_written;
1444     size_t err;
1445     size_t max_subpart, tempinbytes;
1446     char *outptr, *tempstr;
1447
1448     wmem_strbuf_t *str;
1449
1450     if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) {
1451         REPORT_DISSECTOR_BUG("Unable to allocate iconv() converter from %s to UTF-8", encoding);
1452         /* Most likely to be a programming error passing in a bad encoding
1453          * name. However, could be a issue with the iconv support on the
1454          * system running WS. GLib requires iconv/libiconv, but is it possible
1455          * that some versions don't support all common encodings? */
1456     }
1457
1458     inbytes = length;
1459     str = wmem_strbuf_new_sized(scope, length+1);
1460     /* XXX: If speed becomes an issue, the faster way to do this would
1461      * involve passing the wmem_strbuf_t's string buffer directly into
1462      * g_iconv to avoid a memcpy later, but that requires changes to the
1463      * wmem_strbuf interface to have non const access to the string buffer,
1464      * and to manipulate the used length directly. */
1465     outbytes = tempstr_size = MAX(8, length);
1466     outptr = tempstr = (char *)g_malloc(outbytes);
1467     while (inbytes > 0) {
1468         err = g_iconv(cd, (char **)&ptr, &inbytes, &outptr, &outbytes);
1469         bytes_written = outptr - tempstr;
1470         wmem_strbuf_append_len(str, tempstr, bytes_written);
1471         outptr = tempstr;
1472         outbytes = tempstr_size;
1473
1474         if (err == (size_t) -1) {
1475             /* Errors */
1476             switch (errno) {
1477                 case EINVAL:
1478                     /* Incomplete sequence at the end, not an error */
1479                     wmem_strbuf_append_unichar_repl(str);
1480                     inbytes = 0;
1481                     break;
1482                 case E2BIG:
1483                     /* Not enough room (UTF-8 longer than the initial buffer),
1484                      * start back at the beginning of the buffer */
1485                     break;
1486                 case EILSEQ:
1487                     /* Find the maximal subpart of the ill-formed sequence */
1488                     errno = EINVAL;
1489                     for (max_subpart = 1; err == (size_t)-1 && errno == EINVAL; max_subpart++) {
1490                         tempinbytes = max_subpart;
1491                         err = g_iconv(cd, (char **)&ptr, &tempinbytes,
1492                                 &outptr, &outbytes);
1493                     }
1494                     max_subpart = MAX(1, max_subpart-1);
1495                     ptr += max_subpart;
1496                     inbytes -= max_subpart;
1497                     wmem_strbuf_append_unichar_repl(str);
1498                     outptr = tempstr;
1499                     outbytes = tempstr_size;
1500                     break;
1501                 default:
1502                     /* Unexpected conversion error, unrecoverable */
1503                     g_free(tempstr);
1504                     g_iconv_close(cd);
1505                     REPORT_DISSECTOR_BUG("Unexpected iconv() error when converting from %s to UTF-8", encoding);
1506                     break;
1507             }
1508         } else {
1509             /* Otherwise err is the number of replacement characters used,
1510              * but we don't care about that. */
1511             /* If we were converting to ISO-2022-JP or some other stateful
1512              * decoder with shift sequences (e.g. EBCDIC mixed-byte), a
1513              * final call with NULL input in order to output the shift
1514              * sequence back to initial state might make sense, but not
1515              * needed for UTF-8. */
1516         }
1517     }
1518
1519     g_free(tempstr);
1520     g_iconv_close(cd);
1521     return (uint8_t *) wmem_strbuf_finalize(str);
1522 }
1523
1524 /*
1525  * Given a wmem scope, a pointer, and a length, treat the bytes referred to
1526  * by the pointer and length as a GB18030 encoded string, and return a pointer
1527  * to a UTF-8 string, allocated using the wmem scope, converted having
1528  * substituted REPLACEMENT CHARACTER according to the Unicode Standard
1529  * 5.22 U+FFFD Substitution for Conversion.
1530  * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1531  *
1532  * As expected, this will also decode GBK and GB2312 strings.
1533  */
1534 uint8_t *
1535 get_gb18030_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
1536 {
1537     /* iconv/libiconv support is guaranteed with GLib. Support this
1538      * via iconv, at least for now. */
1539     /* GNU libiconv has supported GB18030 (~ Windows Code page 54936) since
1540      * 2000-10-24 and version 1.4, is there is a system that compiles current
1541      * Wireshark yet its iconv only supports GBK (~ Windows Code page 936)? */
1542     const char *encoding = "GB18030";
1543     GIConv cd;
1544     if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) {
1545         encoding = "GBK";
1546         /* GB18030 is backwards compatible, at worst this will mean a few
1547          * extra REPLACEMENT CHARACTERs - GBK lacks the four byte encodings
1548          * from GB18030, which are all pairs of two byte sequences
1549          * 0x[81-FE] 0x[30-39]; that trailing byte is illegal in GBK
1550          * and thus the 4 byte characters will be replaced with two
1551          * REPLACEMENT CHARACTERs. */
1552     } else {
1553         g_iconv_close(cd);
1554     }
1555     return get_string_enc_iconv(scope, ptr, length, encoding);
1556 }
1557
1558 /*
1559  * Given a wmem scope, a pointer, and a length, treat the bytes referred to
1560  * by the pointer and length as a EUC-KR encoded string, and return a pointer
1561  * to a UTF-8 string, allocated using the wmem scope, converted having
1562  * substituted REPLACEMENT CHARACTER according to the Unicode Standard
1563  * 5.22 U+FFFD Substitution for Conversion.
1564  * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1565  */
1566 uint8_t *
1567 get_euc_kr_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
1568 {
1569     /* iconv/libiconv support is guaranteed with GLib. Support this
1570      * via iconv, at least for now. */
1571     return get_string_enc_iconv(scope, ptr, length, "EUC-KR");
1572 }
1573
/* T.61 to UTF-8 conversion table from OpenLDAP project
 * https://www.openldap.org/devel/gitweb.cgi?p=openldap.git;a=blob;f=libraries/libldap/t61.c;hb=HEAD
 *
 * 256 entries (eight per row), indexed by the T.61 octet.  A zero entry
 * means the octet has no mapping; get_t61_string() emits REPLACEMENT
 * CHARACTER for those.  Octets 0xc1-0xcf map to Unicode combining marks
 * (0x300 block) and are handled specially by get_t61_string() when
 * another octet follows.
 */
static const gunichar2 t61_tab[] = {
    0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
    0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
    0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
    0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
    0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
    0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
    0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
    0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
    0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
    0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
    0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
    0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
    0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
    0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
    0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
    0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
    0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
    0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
    0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
    0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
    0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
    0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
    0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
    0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
    0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
    0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
    0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
    0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
    0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
};
1611
/* Fixed-size rows of 2-byte code points used by the T.61 accent tables. */
typedef gunichar2 wvec16[16];
typedef gunichar2 wvec32[32];

/* Substitutions when 0xc1-0xcf appears by itself or with space 0x20.
 * Indexed by the low nibble of the diacritic octet; a zero entry means
 * no spacing substitute is defined for that diacritic. */
static const wvec16 accents = {
    0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
    0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
1619
1620 /* In the following tables, base characters commented in (parentheses)
1621  * are not defined by T.61 but are mapped anyway since their Unicode
1622  * composite exists.
1623  */
1624
/* Grave accented chars AEIOU (NWY) */
/* Each vector is indexed by the low five bits of the base octet; a zero
 * entry means no precomposed character exists for that base. */
static const wvec32 c1_vec1 = {
    /* Upper case */
    0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
    0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
static const wvec32 c1_vec2 = {
    /* Lower case */
    0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
    0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};

/* Indexed by the top three bits of the base octet (octet >> 5): slot 2 is
 * 0x40-0x5f, slot 3 is 0x60-0x7f; NULL means no composites in that range. */
static const wvec32 *c1_grave[] = {
    NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
};
1638
/* Acute accented chars AEIOUYCLNRSZ (GKMPW) */
/* Each vector is indexed by the low five bits of the base octet; a zero
 * entry means no precomposed character exists for that base. */
static const wvec32 c2_vec1 = {
    /* Upper case */
    0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
    0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
    0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
    0, 0xdd, 0x179, 0, 0, 0, 0, 0};
static const wvec32 c2_vec2 = {
    /* Lower case */
    0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
    0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
    0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
    0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
static const wvec32 c2_vec3 = {
    /* (AE and ae) */
    0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

/* Indexed by the top three bits of the base octet (octet >> 5): slot 2 is
 * 0x40-0x5f, slot 3 is 0x60-0x7f, slot 7 is 0xe0-0xff; NULL means no
 * composites in that range. */
static const wvec32 *c2_acute[] = {
    NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
};
1660
/* Circumflex AEIOUYCGHJSW (Z) */
/* Each vector is indexed by the low five bits of the base octet; a zero
 * entry means no precomposed character exists for that base. */
static const wvec32 c3_vec1 = {
    /* Upper case */
    0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
    0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
    0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
    0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
static const wvec32 c3_vec2 = {
    /* Lower case */
    0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
    0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
    0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
    0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
/* Indexed by the top three bits of the base octet (octet >> 5): slot 2 is
 * 0x40-0x5f, slot 3 is 0x60-0x7f; NULL means no composites in that range. */
static const wvec32 *c3_circumflex[] = {
    NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
};
1677
/* Tilde AIOUN (EVY) */
/* Each vector is indexed by the low five bits of the base octet; a zero
 * entry means no precomposed character exists for that base. */
static const wvec32 c4_vec1 = {
    /* Upper case */
    0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
    0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
static const wvec32 c4_vec2 = {
    /* Lower case */
    0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
    0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
/* Indexed by the top three bits of the base octet (octet >> 5): slot 2 is
 * 0x40-0x5f, slot 3 is 0x60-0x7f; NULL means no composites in that range. */
static const wvec32 *c4_tilde[] = {
    NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
};
1690
/* Macron AEIOU (YG) */
/* Each vector is indexed by the low five bits of the base octet; a zero
 * entry means no precomposed character exists for that base. */
static const wvec32 c5_vec1 = {
    /* Upper case */
    0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
    0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
static const wvec32 c5_vec2 = {
    /* Lower case */
    0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
    0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
static const wvec32 c5_vec3 = {
    /* (AE and ae) */
    0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Indexed by the top three bits of the base octet (octet >> 5): slot 2 is
 * 0x40-0x5f, slot 3 is 0x60-0x7f, slot 7 is 0xe0-0xff; NULL means no
 * composites in that range. */
static const wvec32 *c5_macron[] = {
    NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
};
1707
/* Breve AUG (EIO) */
/* Each vector is indexed by the low five bits of the base octet; a zero
 * entry means no precomposed character exists for that base. */
static const wvec32 c6_vec1 = {
    /* Upper case */
    0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
    0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 c6_vec2 = {
    /* Lower case */
    0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
    0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Indexed by the top three bits of the base octet (octet >> 5): slot 2 is
 * 0x40-0x5f, slot 3 is 0x60-0x7f; NULL means no composites in that range. */
static const wvec32 *c6_breve[] = {
    NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
};
1720
/* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */
/* Each vector is indexed by the low five bits of the base octet; a zero
 * entry means no precomposed character exists for that base. */
static const wvec32 c7_vec1 = {
    /* Upper case */
    0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
    0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
    0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
    0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
static const wvec32 c7_vec2 = {
    /* Lower case */
    0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
    0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
    0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
    0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
/* Indexed by the top three bits of the base octet (octet >> 5): slot 2 is
 * 0x40-0x5f, slot 3 is 0x60-0x7f; NULL means no composites in that range. */
static const wvec32 *c7_dotabove[] = {
    NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
};
1737
/* Diaeresis AEIOUY (HWXt) */
/* Each vector is indexed by the low five bits of the base octet; a zero
 * entry means no precomposed character exists for that base. */
static const wvec32 c8_vec1 = {
    /* Upper case */
    0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
    0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
static const wvec32 c8_vec2 = {
    /* Lower case */
    0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
    0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
/* Indexed by the top three bits of the base octet (octet >> 5): slot 2 is
 * 0x40-0x5f, slot 3 is 0x60-0x7f; NULL means no composites in that range. */
static const wvec32 *c8_diaeresis[] = {
    NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
};
1750
/* Ring Above AU (wy) */
/* Each vector is indexed by the low five bits of the base octet; a zero
 * entry means no precomposed character exists for that base. */
static const wvec32 ca_vec1 = {
    /* Upper case */
    0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 ca_vec2 = {
    /* Lower case */
    0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
/* Indexed by the top three bits of the base octet (octet >> 5): slot 2 is
 * 0x40-0x5f, slot 3 is 0x60-0x7f; NULL means no composites in that range. */
static const wvec32 *ca_ringabove[] = {
    NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
};
1763
/* Cedilla CGKLNRST (EDH) */
/* Each vector is indexed by the low five bits of the base octet; a zero
 * entry means no precomposed character exists for that base. */
static const wvec32 cb_vec1 = {
    /* Upper case */
    0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
    0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
    0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 cb_vec2 = {
    /* Lower case */
    0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
    0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
    0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Indexed by the top three bits of the base octet (octet >> 5): slot 2 is
 * 0x40-0x5f, slot 3 is 0x60-0x7f; NULL means no composites in that range. */
static const wvec32 *cb_cedilla[] = {
    NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
};
1778
/* Double Acute Accent OU */
/* Each vector is indexed by the low five bits of the base octet; a zero
 * entry means no precomposed character exists for that base. */
static const wvec32 cd_vec1 = {
    /* Upper case */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
    0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 cd_vec2 = {
    /* Lower case */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
    0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Indexed by the top three bits of the base octet (octet >> 5): slot 2 is
 * 0x40-0x5f, slot 3 is 0x60-0x7f; NULL means no composites in that range. */
static const wvec32 *cd_doubleacute[] = {
    NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
};
1791
/* Ogonek AEIU (O) */
/* Each vector is indexed by the low five bits of the base octet; a zero
 * entry means no precomposed character exists for that base. */
static const wvec32 ce_vec1 = {
    /* Upper case */
    0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
    0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 ce_vec2 = {
    /* Lower case */
    0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
    0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Indexed by the top three bits of the base octet (octet >> 5): slot 2 is
 * 0x40-0x5f, slot 3 is 0x60-0x7f; NULL means no composites in that range. */
static const wvec32 *ce_ogonek[] = {
    NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
};
1804
/* Caron CDELNRSTZ (AIOUGKjH) */
/* Each vector is indexed by the low five bits of the base octet; a zero
 * entry means no precomposed character exists for that base. */
static const wvec32 cf_vec1 = {
    /* Upper case */
    0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
    0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
    0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
    0, 0, 0x17d, 0, 0, 0, 0, 0};
static const wvec32 cf_vec2 = {
    /* Lower case */
    0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
    0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
    0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
    0, 0, 0x17e, 0, 0, 0, 0, 0};
/* Indexed by the top three bits of the base octet (octet >> 5): slot 2 is
 * 0x40-0x5f, slot 3 is 0x60-0x7f; NULL means no composites in that range. */
static const wvec32 *cf_caron[] = {
    NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
};
1821
/* Composite lookup tables, indexed by the low nibble of the diacritic
 * octet (0xc0-0xcf).  NULL entries (nibbles 0x0, 0x9 and 0xc) have no
 * composite tables. */
static const wvec32 **cx_tab[] = {
    NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
    c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
    cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
1826
1827 uint8_t *
1828 get_t61_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
1829 {
1830     int            i;
1831     const uint8_t *c;
1832     wmem_strbuf_t *strbuf;
1833
1834     strbuf = wmem_strbuf_new_sized(scope, length+1);
1835
1836     for (i = 0, c = ptr; i < length; c++, i++) {
1837         if (!t61_tab[*c]) {
1838             wmem_strbuf_append_unichar(strbuf, UNREPL);
1839         } else if (i < length - 1 && (*c & 0xf0) == 0xc0) {
1840             int j = *c & 0x0f;
1841             /* If this is the end of the string, or if the base
1842              * character is just a space, treat this as a regular
1843              * spacing character.
1844              */
1845             if ((!c[1] || c[1] == 0x20) && accents[j]) {
1846                 wmem_strbuf_append_unichar(strbuf, accents[j]);
1847             } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
1848                 /* We have a composite mapping for this pair */
1849                        (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
1850                 wmem_strbuf_append_unichar(strbuf, (*cx_tab[j][c[1]>>5])[c[1]&0x1f]);
1851             } else {
1852                 /* No mapping, just swap it around so the base
1853                  * character comes first.
1854                  */
1855                 wmem_strbuf_append_unichar(strbuf, c[1]);
1856                 wmem_strbuf_append_unichar(strbuf, t61_tab[*c]);
1857             }
1858             c++; i++;
1859             continue;
1860         } else {
1861             wmem_strbuf_append_unichar(strbuf, t61_tab[*c]);
1862         }
1863     }
1864
1865     return (uint8_t *)wmem_strbuf_finalize(strbuf);
1866 }
1867
/* The DECT standard charset from ETSI EN 300 175-5 Annex D
 *
 * Only octets 0x00-0x7f are covered (128 entries, eight per row).  Entry
 * 0x00 is zero, which get_dect_standard_8bits_string() treats as "no
 * mapping"; every other entry equals its own index.
 */
static const gunichar2 dect_standard_8bits_code_table[] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    ' ',  '!',  '\"', '#',  '$',  '%',  '&',  '\'',
    '(',  ')',  '*',  '+',  ',',  '-',  '.',  '/',
    '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',
    '8',  '9',  ':',  ';',  '<',  '=',  '>',  '?',
    '@',  'A',  'B',  'C',  'D',  'E',  'F',  'G',
    'H',  'I',  'J',  'K',  'L',  'M',  'N',  'O',
    'P',  'Q',  'R',  'S',  'T',  'U',  'V',  'W',
    'X',  'Y',  'Z',  '[', '\\',  ']',  '^',  '_',
    '`',  'a',  'b',  'c',  'd',  'e',  'f',  'g',
    'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
    'p',  'q',  'r',  's',  't',  'u',  'v',  'w',
    'x',  'y',  'z',  '{',  '|',  '}',  '~', 0x7f,
};
1888
1889 uint8_t *
1890 get_dect_standard_8bits_string(wmem_allocator_t *scope, const uint8_t *ptr, int length)
1891 {
1892     int            position;
1893     const uint8_t *current_byte_ptr;
1894     wmem_strbuf_t *strbuf;
1895
1896     strbuf = wmem_strbuf_new_sized(scope, length+1);
1897
1898     for (position = 0, current_byte_ptr = ptr; position < length; current_byte_ptr++, position++) {
1899         if (*current_byte_ptr & 0x80) {
1900             wmem_strbuf_append_unichar(strbuf, UNREPL);
1901         } else if (!dect_standard_8bits_code_table[*current_byte_ptr]) {
1902             wmem_strbuf_append_unichar(strbuf, UNREPL);
1903         } else {
1904             wmem_strbuf_append_unichar(strbuf, dect_standard_8bits_code_table[*current_byte_ptr]);
1905         }
1906     }
1907
1908     return (uint8_t *)wmem_strbuf_finalize(strbuf);
1909 }
1910 /*
1911  * Editor modelines  -  https://www.wireshark.org/tools/modelines.html
1912  *
1913  * Local variables:
1914  * c-basic-offset: 4
1915  * tab-width: 8
1916  * indent-tabs-mode: nil
1917  * End:
1918  *
1919  * vi: set shiftwidth=4 tabstop=8 expandtab:
1920  * :indentSize=4:tabSize=8:noTabs=true:
1921  */