epan/charsets.h

   1 /* charsets.h
   2  * Routines for handling character sets
   3  *
   4  * Wireshark - Network traffic analyzer
   5  * By Gerald Combs <gerald@wireshark.org>
   6  * Copyright 1998 Gerald Combs
   7  *
   8  * SPDX-License-Identifier: GPL-2.0-or-later
   9  */
  10 #ifndef __CHARSETS_H__
  11 #define __CHARSETS_H__
  12
  13 #include "ws_symbol_export.h"
  14
  15 #ifdef __cplusplus
  16 extern "C" {
  17 #endif /* __cplusplus */
  18
  19 /*
  20  * Translation tables that map the upper 128 code points in single-byte
  21  * "extended ASCII" character encodings to Unicode code points in the
  22  * Basic Multilingual Plane.
  23  */
  24
  25 /*
      * Table for windows-1250.
      *
      * Each of the three windows-125x tables declared here has 0x80 entries,
      * one for each of the upper 128 code points of the encoding.
      * NOTE(review): presumably indexed by (octet - 0x80) - confirm against
      * the table definitions.
      */
  26 extern const gunichar2 charset_table_cp1250[0x80];
  27 /* Table for windows-1251 (0x80 entries) */
  28 extern const gunichar2 charset_table_cp1251[0x80];
  29 /* Table for windows-1252 (0x80 entries) */
  30 extern const gunichar2 charset_table_cp1252[0x80];
  31
  32 /*
      * Tables for ISO-8859-X, one per part: 2 through 11 and 13 through 16,
      * each with 0x80 entries.
      *
      * No table is declared for ISO-8859-1; get_8859_1_string(), declared
      * later in this header, handles that encoding and takes no table.
      */
  33 extern const gunichar2 charset_table_iso_8859_2[0x80];
  34 extern const gunichar2 charset_table_iso_8859_3[0x80];
  35 extern const gunichar2 charset_table_iso_8859_4[0x80];
  36 extern const gunichar2 charset_table_iso_8859_5[0x80];
  37 extern const gunichar2 charset_table_iso_8859_6[0x80];
  38 extern const gunichar2 charset_table_iso_8859_7[0x80];
  39 extern const gunichar2 charset_table_iso_8859_8[0x80];
  40 extern const gunichar2 charset_table_iso_8859_9[0x80];
  41 extern const gunichar2 charset_table_iso_8859_10[0x80];
  42 extern const gunichar2 charset_table_iso_8859_11[0x80];
  43 extern const gunichar2 charset_table_iso_8859_13[0x80];
  44 extern const gunichar2 charset_table_iso_8859_14[0x80];
  45 extern const gunichar2 charset_table_iso_8859_15[0x80];
  46 extern const gunichar2 charset_table_iso_8859_16[0x80];
  47
  48 /*
      * Tables for Mac character sets.
      *
      * Mac Roman is the only Mac table declared in this header; it has
      * 0x80 entries.
      */
  49 extern const gunichar2 charset_table_mac_roman[0x80];
  50
  51 /*
      * Tables for DOS code pages 437, 855 and 866, each with 0x80 entries.
      */
  52 extern const gunichar2 charset_table_cp437[0x80];
  53 extern const gunichar2 charset_table_cp855[0x80];
  54 extern const gunichar2 charset_table_cp866[0x80];
  55
  56 /*
  57  * Translation tables that map the lower 128 code points in single-byte
  58  * ISO 646-based character encodings to Unicode code points in the
  59  * Basic Multilingual Plane.
      *
      * Only the basic table is declared here.  Its gunichar2[0x80] type
      * matches the table parameter of get_iso_646_string().
  60  */
  61 extern const gunichar2 charset_table_iso_646_basic[0x80];
  62
  63 /*
      * Tables for EBCDIC code pages.
      *
      * These tables have 256 entries rather than 0x80.  That is the table
      * size taken by get_nonascii_unichar2_string(), which maps every octet
      * through its table.
      */
  64 extern const gunichar2 charset_table_ebcdic[256];
  65 extern const gunichar2 charset_table_ebcdic_cp037[256];
  66
  67 /*
  68  * Given a wmem scope, a pointer, and a length, treat the string of bytes
  69  * referred to by the pointer and length as an ASCII string, with all bytes
  70  * with the high-order bit set being invalid, and return a pointer to a
  71  * UTF-8 string, allocated using the wmem scope.
  72  *
  73  * Octets with the highest bit set will be converted to the Unicode
  74  * REPLACEMENT CHARACTER.
      *
      * scope:  wmem allocator the returned string is allocated from
      * ptr:    start of the bytes to convert
      * length: number of bytes at ptr to convert
      *
      * NOTE(review): length is a signed gint; what happens for a negative
      * value cannot be told from this header - check the implementation.
  75  */
  76 WS_DLL_PUBLIC guint8 *
  77 get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
  78
  79 /*
  80  * Given a wmem scope, a pointer, and a length, treat the string of bytes
  81  * referred to by the pointer and length as a UTF-8 string, and return a
  82  * pointer to a UTF-8 string, allocated using the wmem scope, with all
  83  * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
  84  * according to the recommended "best practices" given in the Unicode
  85  * Standard and specified by W3C/WHATWG.
      *
      * scope:  wmem allocator the returned string is allocated from
      * ptr:    start of the bytes to convert
      * length: number of bytes at ptr to convert
      *
      * The result is a new string allocated from scope, not a pointer into
      * the input, so the input bytes are left untouched (ptr is const).
  86  */
  87 WS_DLL_PUBLIC guint8 *
  88 get_utf_8_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
  89
  90 /*
  91  * Given a wmem scope, a pointer, a length, and a translation table,
  92  * treat the string of bytes referred to by the pointer and length as a
  93  * string encoded using one octet per character, with octets with the
  94  * high-order bit clear being mapped by the translation table to 2-byte
  95  * Unicode Basic Multilingual Plane characters (including REPLACEMENT
  96  * CHARACTER) and octets with the high-order bit set being mapped to
  97  * REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string,
  98  * allocated using the wmem scope.
      *
      * scope:  wmem allocator the returned string is allocated from
      * ptr:    start of the bytes to convert
      * length: number of bytes at ptr to convert
      * table:  translation table for the octets with the high-order bit
      *         clear, e.g. charset_table_iso_646_basic
      *
      * The [0x80] in the parameter declaration documents the expected table
      * size; as an array parameter it decays to a pointer, so the compiler
      * does not enforce it.
  99  */
 100 WS_DLL_PUBLIC guint8 *
 101 get_iso_646_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80]);
 102
 103 /*
 104  * Given a wmem scope, a pointer, and a length, treat the string of bytes
 105  * referred to by the pointer and length as an ISO 8859/1 string, and
 106  * return a pointer to a UTF-8 string, allocated using the wmem scope.
      *
      * scope:  wmem allocator the returned string is allocated from
      * ptr:    start of the bytes to convert
      * length: number of bytes at ptr to convert
      *
      * Unlike get_unichar2_string(), this routine takes no translation
      * table.
 107  */
 108 WS_DLL_PUBLIC guint8 *
 109 get_8859_1_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
 110
 111 /*
 112  * Given a wmem scope, a pointer, a length, and a translation table with
 113  * 128 entries, treat the string of bytes referred to by the pointer and
 114  * length as a string encoded using one octet per character, with octets
 115  * with the high-order bit clear being ASCII and octets with the high-order
 116  * bit set being mapped by the translation table to 2-byte Unicode Basic
 117  * Multilingual Plane characters (including REPLACEMENT CHARACTER), and
 118  * return a pointer to a UTF-8 string, allocated using the wmem scope.
      *
      * scope:  wmem allocator the returned string is allocated from
      * ptr:    start of the bytes to convert
      * length: number of bytes at ptr to convert
      * table:  translation table for the octets with the high-order bit
      *         set; the 0x80-entry charset_table_* arrays declared in this
      *         header for the "extended ASCII" encodings have this type
      *
      * The [0x80] in the parameter declaration documents the expected table
      * size; as an array parameter it decays to a pointer, so the compiler
      * does not enforce it.
 119  */
 120 WS_DLL_PUBLIC guint8 *
 121 get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80]);
 122
 123 /*
 124  * Given a wmem scope, a pointer, and a length, treat the string of bytes
 125  * referred to by the pointer and length as a UCS-2 encoded string
 126  * containing characters from the Basic Multilingual Plane (plane 0) of
 127  * Unicode, and return a pointer to a UTF-8 string, allocated with the
 128  * wmem scope.
 129  *
 130  * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
 131  *
 132  * Specify length in bytes.
 133  *
      * scope:    wmem allocator the returned string is allocated from
      * ptr:      start of the bytes to convert
      * length:   number of bytes (not 2-byte code units) at ptr to convert
      * encoding: byte order of the input, as described above
      *
 134  * XXX - should map lead and trail surrogate values to REPLACEMENT
 135  * CHARACTERs (0xFFFD)?
 136  * XXX - if there are an odd number of bytes, should put a
 137  * REPLACEMENT CHARACTER at the end.
 138  */
 139 WS_DLL_PUBLIC guint8 *
 140 get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
 141
 142 /*
 143  * Given a wmem scope, a pointer, and a length, treat the string of bytes
 144  * referred to by the pointer and length as a UTF-16 encoded string, and
 145  * return a pointer to a UTF-8 string, allocated with the wmem scope.
 146  *
 147  * See RFC 2781 section 2.2.
 148  *
 149  * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
 150  *
 151  * Specify length in bytes.
 152  *
      * scope:    wmem allocator the returned string is allocated from
      * ptr:      start of the bytes to convert
      * length:   number of bytes (not 2-byte code units) at ptr to convert
      * encoding: byte order of the input, as described above
      *
 153  * XXX - should map surrogate errors to REPLACEMENT CHARACTERs (0xFFFD).
 154  * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
 155  * XXX - if there are an odd number of bytes, should put a
 156  * REPLACEMENT CHARACTER at the end.
 157  */
 158 WS_DLL_PUBLIC guint8 *
 159 get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
 160
 161 /*
 162  * Given a wmem scope, a pointer, and a length, treat the string of bytes
 163  * referred to by the pointer and length as a UCS-4 encoded string, and
 164  * return a pointer to a UTF-8 string, allocated with the wmem scope.
 165  *
 166  * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
 167  *
 168  * Specify length in bytes.
 169  *
      * scope:    wmem allocator the returned string is allocated from
      * ptr:      start of the bytes to convert
      * length:   number of bytes (not 4-byte code units) at ptr to convert
      * encoding: byte order of the input, as described above
      *
 170  * XXX - should map lead and trail surrogate values to a "substitute"
 171  * UTF-8 character?
 172  * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
 173  * XXX - if the number of bytes isn't a multiple of 4, should put a
 174  * REPLACEMENT CHARACTER at the end.
 175  */
 176 WS_DLL_PUBLIC guint8 *
 177 get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding);
 178
     /*
      * NOTE(review): this routine had no comment in this header; the
      * description below is inferred from its name and signature only and
      * must be confirmed against the implementation.
      *
      * Presumably decodes no_of_chars characters of the 3GPP TS 23.038
      * 7-bit alphabet, packed 7 bits per character, starting bit_offset
      * bits into the data at ptr, and returns a pointer to a UTF-8 string
      * allocated using the wmem scope, like the other get_*_string()
      * routines declared here.
      *
      * Note that the last parameter is a character count, not a byte
      * length as in most of the other routines.
      */
 179 WS_DLL_PUBLIC guint8 *
 180 get_ts_23_038_7bits_string_packed(wmem_allocator_t *scope, const guint8 *ptr,
 181         const gint bit_offset, gint no_of_chars);
 182
     /*
      * NOTE(review): this routine had no comment in this header; the
      * description below is inferred from its name and signature only and
      * must be confirmed against the implementation.
      *
      * Presumably treats the length bytes at ptr as characters of the
      * 3GPP TS 23.038 7-bit alphabet stored one per octet (i.e. not
      * packed), and returns a pointer to a UTF-8 string allocated using
      * the wmem scope.
      */
 183 WS_DLL_PUBLIC guint8 *
 184 get_ts_23_038_7bits_string_unpacked(wmem_allocator_t *scope, const guint8 *ptr,
 185         gint length);
 186
     /*
      * NOTE(review): this routine had no comment in this header; the
      * description below is inferred from its name and signature only and
      * must be confirmed against the implementation.
      *
      * Presumably treats the length bytes at ptr as a string coded as
      * described in Annex A of ETSI TS 102 221, and returns a pointer to a
      * UTF-8 string allocated using the wmem scope.
      */
 187 WS_DLL_PUBLIC guint8 *
 188 get_etsi_ts_102_221_annex_a_string(wmem_allocator_t *scope, const guint8 *ptr,
 189         gint length);
 190
     /*
      * NOTE(review): this routine had no comment in this header; the
      * description below is inferred from its name and signature only and
      * must be confirmed against the implementation.
      *
      * Presumably decodes no_of_chars ASCII characters packed 7 bits per
      * character, starting bit_offset bits into the data at ptr, and
      * returns a pointer to a UTF-8 string allocated using the wmem scope.
      *
      * The parameters are the same as those of
      * get_ts_23_038_7bits_string_packed(): the last one is a character
      * count, not a byte length.
      */
 191 WS_DLL_PUBLIC guint8 *
 192 get_ascii_7bits_string(wmem_allocator_t *scope, const guint8 *ptr,
 193         const gint bit_offset, gint no_of_chars);
 194
 195 /*
 196  * Given a wmem scope, a pointer, a length, and a translation table with
 197  * 256 entries, treat the string of bytes referred to by the pointer and
 198  * length as a string encoded using one octet per character, with octets
 199  * being mapped by the translation table to 2-byte Unicode Basic Multilingual
 200  * Plane characters (including REPLACEMENT CHARACTER), and return a
 201  * pointer to a UTF-8 string, allocated using the wmem scope.
      *
      * scope:  wmem allocator the returned string is allocated from
      * ptr:    start of the bytes to convert
      * length: number of bytes at ptr to convert
      * table:  256-entry translation table applied to every octet; the
      *         charset_table_ebcdic* arrays declared in this header have
      *         this type
      *
      * The [256] in the parameter declaration documents the expected table
      * size; as an array parameter it decays to a pointer, so the compiler
      * does not enforce it.
 202  */
 203 WS_DLL_PUBLIC guint8 *
 204 get_nonascii_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[256]);
 205
 206 /*
 207  * Given a wmem scope, a pointer, and a length, treat the bytes referred to
 208  * by the pointer and length as a GB18030 encoded string, and return a pointer
 209  * to a UTF-8 string, allocated using the wmem scope, converted having
 210  * substituted REPLACEMENT CHARACTER according to the Unicode Standard
 211  * 5.22 U+FFFD Substitution for Conversion.
 212  * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
 213  *
 214  * As expected, this will also decode GBK and GB2312 strings.
      *
      * scope:  wmem allocator the returned string is allocated from
      * ptr:    start of the bytes to convert
      * length: number of bytes at ptr to convert
 215  */
 216 WS_DLL_PUBLIC guint8 *
 217 get_gb18030_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
 218
 219 /*
 220  * Given a wmem scope, a pointer, and a length, treat the bytes referred to
 221  * by the pointer and length as an EUC-KR encoded string, and return a pointer
 222  * to a UTF-8 string, allocated using the wmem scope, converted having
 223  * substituted REPLACEMENT CHARACTER according to the Unicode Standard
 224  * 5.22 U+FFFD Substitution for Conversion.
 225  * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
      *
      * scope:  wmem allocator the returned string is allocated from
      * ptr:    start of the bytes to convert
      * length: number of bytes at ptr to convert
 226  */
 227 WS_DLL_PUBLIC guint8 *
 228 get_euc_kr_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
 229
     /*
      * NOTE(review): this routine had no comment in this header; the
      * description below is inferred from its name and signature only and
      * must be confirmed against the implementation.
      *
      * Presumably treats the length bytes at ptr as a T.61 encoded string
      * and returns a pointer to a UTF-8 string allocated using the wmem
      * scope, like the other get_*_string() routines declared here.
      */
 230 WS_DLL_PUBLIC guint8 *
 231 get_t61_string(wmem_allocator_t *scope, const guint8 *ptr, gint length);
 232
     /*
      * EBCDIC/ASCII conversion helpers.
      *
      * The two ASCII_to_EBCDIC declarations are compiled out by the #if 0
      * below, so only the EBCDIC-to-ASCII direction is declared and
      * exported from this header.
      *
      * NOTE(review): the behaviour below is inferred from the names and
      * signatures only - confirm against the implementation.
      * EBCDIC_to_ASCII() takes a non-const buffer and returns void, so it
      * presumably converts `bytes` octets of buf in place;
      * EBCDIC_to_ASCII1() presumably returns the ASCII equivalent of the
      * single EBCDIC octet c.
      */
 233 #if 0
 234 void ASCII_to_EBCDIC(guint8 *buf, guint bytes);
 235 guint8 ASCII_to_EBCDIC1(guint8 c);
 236 #endif
 237 WS_DLL_PUBLIC
 238 void EBCDIC_to_ASCII(guint8 *buf, guint bytes);
 239 WS_DLL_PUBLIC
 240 guint8 EBCDIC_to_ASCII1(guint8 c);
 241
 242 #ifdef __cplusplus
 243 }
 244 #endif /* __cplusplus */
 245
 246 #endif /* __CHARSETS_H__ */
 247
 248 /*
 249  * Editor modelines  -  https://www.wireshark.org/tools/modelines.html
 250  *
 251  * Local variables:
 252  * c-basic-offset: 4
 253  * tab-width: 8
 254  * indent-tabs-mode: nil
 255  * End:
 256  *
 257  * vi: set shiftwidth=4 tabstop=8 expandtab:
 258  * :indentSize=4:tabSize=8:noTabs=true:
 259  */