epan/charsets.h

   1 /** @file
   2  * Routines for handling character sets
   3  *
   4  * Wireshark - Network traffic analyzer
   5  * By Gerald Combs <gerald@wireshark.org>
   6  * Copyright 1998 Gerald Combs
   7  *
   8  * SPDX-License-Identifier: GPL-2.0-or-later
   9  */
  10 #ifndef __CHARSETS_H__
  11 #define __CHARSETS_H__
  12
  13 #include "ws_symbol_export.h"
  14
  15 #ifdef __cplusplus
  16 extern "C" {
  17 #endif /* __cplusplus */
  18
  19 /*
  20  * Translation tables that map the upper 128 code points in single-byte
  21  * "extended ASCII" character encodings to Unicode code points in the
  22  * Basic Multilingual Plane.
      *
      * Each table has 0x80 entries, which is the shape the table argument
      * of get_unichar2_string() is declared with.  Entry i presumably holds
      * the code point for octet 0x80 + i - confirm against the definitions.
      *
      * NOTE(review): gunichar2 is not declared in this header, which only
      * includes ws_symbol_export.h directly; presumably that header or the
      * including file provides it - confirm.
  23  */
  24
  25 /* Table for windows-1250 (code page 1250) */
  26 extern const gunichar2 charset_table_cp1250[0x80];
  27 /* Table for windows-1251 (code page 1251) */
  28 extern const gunichar2 charset_table_cp1251[0x80];
  29 /* Table for windows-1252 (code page 1252) */
  30 extern const gunichar2 charset_table_cp1252[0x80];
  31
  32 /* Tables for ISO-8859-X: parts 2 through 11 and 13 through 16.
      *
      * No table is declared here for part 1, which has its own routine,
      * get_8859_1_string(), or for part 12.
      */
  33 extern const gunichar2 charset_table_iso_8859_2[0x80];
  34 extern const gunichar2 charset_table_iso_8859_3[0x80];
  35 extern const gunichar2 charset_table_iso_8859_4[0x80];
  36 extern const gunichar2 charset_table_iso_8859_5[0x80];
  37 extern const gunichar2 charset_table_iso_8859_6[0x80];
  38 extern const gunichar2 charset_table_iso_8859_7[0x80];
  39 extern const gunichar2 charset_table_iso_8859_8[0x80];
  40 extern const gunichar2 charset_table_iso_8859_9[0x80];
  41 extern const gunichar2 charset_table_iso_8859_10[0x80];
  42 extern const gunichar2 charset_table_iso_8859_11[0x80];
  43 extern const gunichar2 charset_table_iso_8859_13[0x80];
  44 extern const gunichar2 charset_table_iso_8859_14[0x80];
  45 extern const gunichar2 charset_table_iso_8859_15[0x80];
  46 extern const gunichar2 charset_table_iso_8859_16[0x80];
  47
  48 /* Tables for Mac character sets (only Mac Roman is declared here) */
  49 extern const gunichar2 charset_table_mac_roman[0x80];
  50
  51 /* Tables for DOS code pages 437, 855 and 866 */
  52 extern const gunichar2 charset_table_cp437[0x80];
  53 extern const gunichar2 charset_table_cp855[0x80];
  54 extern const gunichar2 charset_table_cp866[0x80];
  55
  56 /*
  57  * Translation tables that map the lower 128 code points in single-byte
  58  * ISO 646-based character encodings to Unicode code points in the
  59  * Basic Multilingual Plane.
      *
      * The 0x80-entry shape matches the table argument that
      * get_iso_646_string() is declared with.
  60  */
  61 extern const gunichar2 charset_table_iso_646_basic[0x80];
  62
  63 /* Tables for EBCDIC code pages.
      *
      * These have 256 entries rather than 0x80, which is the shape the
      * table argument of get_nonascii_unichar2_string() is declared with.
      *
      * NOTE(review): which EBCDIC variant the unsuffixed
      * charset_table_ebcdic holds is not stated in this header - see its
      * definition.
      */
  64 extern const gunichar2 charset_table_ebcdic[256];
  65 extern const gunichar2 charset_table_ebcdic_cp037[256];
  66 extern const gunichar2 charset_table_ebcdic_cp500[256];
  67
  68 /*
  69  * Given a wmem scope, a pointer, and a length, treat the string of bytes
  70  * referred to by the pointer and length as an ASCII string, with all bytes
  71  * with the high-order bit set being invalid, and return a pointer to a
  72  * UTF-8 string, allocated using the wmem scope.
  73  *
  74  * Octets with the highest bit set will be converted to the Unicode
  75  * REPLACEMENT CHARACTER.
      *
      * scope:  wmem scope the returned string is allocated from.
      * ptr:    bytes to convert.
      * length: number of bytes at ptr.
      *
      * NOTE(review): the result is presumably NUL-terminated, and length is
      * a signed int whose negative values are not addressed here - confirm
      * both against the implementation.
  76  */
  77 WS_DLL_PUBLIC uint8_t *
  78 get_ascii_string(wmem_allocator_t *scope, const uint8_t *ptr, int length);
  79
  80 /*
  81  * Given a wmem scope, a pointer, and a length, treat the string of bytes
  82  * referred to by the pointer and length as a UTF-8 string, and return a
  83  * pointer to a UTF-8 string, allocated using the wmem scope, with all
  84  * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
  85  * according to the recommended "best practices" given in the Unicode
  86  * Standard and specified by W3C/WHATWG.
      *
      * scope:  wmem scope the returned string is allocated from.
      * ptr:    bytes to validate and copy.
      * length: number of bytes at ptr.
      *
      * The result is always a separate allocation from the scope, even
      * though input and output are both UTF-8.
  87  */
  88 WS_DLL_PUBLIC uint8_t *
  89 get_utf_8_string(wmem_allocator_t *scope, const uint8_t *ptr, int length);
  90
  91 /*
  92  * Given a wmem scope, a pointer, a length, and a translation table,
  93  * treat the string of bytes referred to by the pointer and length as a
  94  * string encoded using one octet per character, with octets with the
  95  * high-order bit clear being mapped by the translation table to 2-byte
  96  * Unicode Basic Multilingual Plane characters (including REPLACEMENT
  97  * CHARACTER) and octets with the high-order bit set being mapped to
  98  * REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string,
  99  * allocated using the wmem scope.
      *
      * scope:  wmem scope the returned string is allocated from.
      * ptr:    bytes to convert.
      * length: number of bytes at ptr.
      * table:  translation table for octets 0x00-0x7F, such as
      *         charset_table_iso_646_basic.  The [0x80] bound is not
      *         enforced by C, which passes the array as a pointer, so the
      *         caller is responsible for supplying a full-sized table.
 100  */
 101 WS_DLL_PUBLIC uint8_t *
 102 get_iso_646_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[0x80]);
 103
 104 /*
 105  * Given a wmem scope, a pointer, and a length, treat the string of bytes
 106  * referred to by the pointer and length as an ISO 8859/1 string, and
 107  * return a pointer to a UTF-8 string, allocated using the wmem scope.
      *
      * scope:  wmem scope the returned string is allocated from.
      * ptr:    bytes to convert.
      * length: number of bytes at ptr.
      *
      * Unlike the other ISO 8859 parts, this routine takes no translation
      * table argument.
 108  */
 109 WS_DLL_PUBLIC uint8_t *
 110 get_8859_1_string(wmem_allocator_t *scope, const uint8_t *ptr, int length);
 111
 112 /*
 113  * Given a wmem scope, a pointer, a length, and a translation table with
 114  * 128 entries, treat the string of bytes referred to by the pointer and
 115  * length as a string encoded using one octet per character, with octets
 116  * with the high-order bit clear being ASCII and octets with the high-order
 117  * bit set being mapped by the translation table to 2-byte Unicode Basic
 118  * Multilingual Plane characters (including REPLACEMENT CHARACTER), and
 119  * return a pointer to a UTF-8 string, allocated using the wmem scope.
      *
      * scope:  wmem scope the returned string is allocated from.
      * ptr:    bytes to convert.
      * length: number of bytes at ptr.
      * table:  translation table for octets with the high-order bit set,
      *         such as charset_table_cp1252 or one of the other 0x80-entry
      *         "extended ASCII" tables declared in this header.  The [0x80]
      *         bound is not enforced by C, which passes the array as a
      *         pointer.
 120  */
 121 WS_DLL_PUBLIC uint8_t *
 122 get_unichar2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[0x80]);
 123
 124 /*
 125  * Given a wmem scope, a pointer, and a length, treat the string of bytes
 126  * referred to by the pointer and length as a UCS-2 encoded string
 127  * containing characters from the Basic Multilingual Plane (plane 0) of
 128  * Unicode, and return a pointer to a UTF-8 string, allocated with the
 129  * wmem scope.
 130  *
 131  * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
 132  * possibly ORed with ENC_BOM.
 133  *
 134  * Specify length in bytes.
      *
      * The ENC_* values are not defined in this header.
      *
      * NOTE(review): what happens to a trailing odd byte when length is not
      * a multiple of 2 is not described here - see the implementation.
 135  */
 136 WS_DLL_PUBLIC uint8_t *
 137 get_ucs_2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding);
 138
 139 /*
 140  * Given a wmem scope, a pointer, and a length, treat the string of bytes
 141  * referred to by the pointer and length as a UTF-16 encoded string, and
 142  * return a pointer to a UTF-8 string, allocated with the wmem scope.
 143  *
 144  * See RFC 2781 section 2.2.
 145  *
 146  * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
 147  * possibly ORed with ENC_BOM.
 148  *
 149  * Specify length in bytes.
      *
      * The ENC_* values are not defined in this header.
      *
      * NOTE(review): what happens to a trailing odd byte, or to an unpaired
      * surrogate, is not described here - see the implementation.
 150  */
 151 WS_DLL_PUBLIC uint8_t *
 152 get_utf_16_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding);
 153
 154 /*
 155  * Given a wmem scope, a pointer, and a length, treat the string of bytes
 156  * referred to by the pointer and length as a UCS-4 encoded string, and
 157  * return a pointer to a UTF-8 string, allocated with the wmem scope.
 158  *
 159  * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN,
 160  * possibly ORed with ENC_BOM.
 161  *
 162  * Specify length in bytes.
      *
      * The ENC_* values are not defined in this header.
      *
      * NOTE(review): what happens to trailing bytes when length is not a
      * multiple of 4 is not described here - see the implementation.
 163  */
 164 WS_DLL_PUBLIC uint8_t *
 165 get_ucs_4_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, unsigned encoding);
 166
     /*
      * NOTE(review): this routine is not documented in this header.  Going
      * by its name and signature it presumably decodes no_of_chars
      * characters of the 3GPP TS 23.038 7-bit alphabet, packed 7 bits per
      * character and starting bit_offset bits into ptr, and returns a UTF-8
      * string allocated from the wmem scope like the other routines in
      * this header - confirm against the implementation.
      */
 167 WS_DLL_PUBLIC uint8_t *
 168 get_ts_23_038_7bits_string_packed(wmem_allocator_t *scope, const uint8_t *ptr,
 169         const int bit_offset, int no_of_chars);
 170
     /*
      * NOTE(review): this routine is not documented in this header.  Going
      * by its name and signature it presumably decodes length bytes at ptr
      * as 3GPP TS 23.038 7-bit alphabet characters stored one per octet
      * (not bit-packed), and returns a UTF-8 string allocated from the
      * wmem scope like the other routines in this header - confirm against
      * the implementation.
      */
 171 WS_DLL_PUBLIC uint8_t *
 172 get_ts_23_038_7bits_string_unpacked(wmem_allocator_t *scope, const uint8_t *ptr,
 173         int length);
 174
     /*
      * NOTE(review): this routine is not documented in this header.  Going
      * by its name and signature it presumably decodes length bytes at ptr
      * as a string coded according to Annex A of ETSI TS 102 221, and
      * returns a UTF-8 string allocated from the wmem scope like the other
      * routines in this header - confirm against the implementation.
      */
 175 WS_DLL_PUBLIC uint8_t *
 176 get_etsi_ts_102_221_annex_a_string(wmem_allocator_t *scope, const uint8_t *ptr,
 177         int length);
 178
     /*
      * NOTE(review): this routine is not documented in this header.  Going
      * by its name and signature it presumably decodes no_of_chars ASCII
      * characters packed 7 bits per character, starting bit_offset bits
      * into ptr, and returns a UTF-8 string allocated from the wmem scope
      * like the other routines in this header - confirm against the
      * implementation.
      */
 179 WS_DLL_PUBLIC uint8_t *
 180 get_ascii_7bits_string(wmem_allocator_t *scope, const uint8_t *ptr,
 181         const int bit_offset, int no_of_chars);
 182
 183 /*
 184  * Given a wmem scope, a pointer, a length, and a translation table with
 185  * 256 entries, treat the string of bytes referred to by the pointer and
 186  * length as a string encoded using one octet per character, with octets
 187  * being mapped by the translation table to 2-byte Unicode Basic Multilingual
 188  * Plane characters (including REPLACEMENT CHARACTER), and return a
 189  * pointer to a UTF-8 string, allocated using the wmem scope.
      *
      * scope:  wmem scope the returned string is allocated from.
      * ptr:    bytes to convert.
      * length: number of bytes at ptr.
      * table:  translation table covering every octet value, such as one
      *         of the 256-entry EBCDIC tables declared in this header.
      *         The [256] bound is not enforced by C, which passes the
      *         array as a pointer.
 190  */
 191 WS_DLL_PUBLIC uint8_t *
 192 get_nonascii_unichar2_string(wmem_allocator_t *scope, const uint8_t *ptr, int length, const gunichar2 table[256]);
 193
 194 /*
 195  * Given a wmem scope, a pointer, and a length, treat the bytes referred to
 196  * by the pointer and length as a GB18030 encoded string, and return a pointer
 197  * to a UTF-8 string, allocated using the wmem scope, with REPLACEMENT
 198  * CHARACTER substituted during conversion as described in the Unicode
 199  * Standard, section 5.22 U+FFFD Substitution for Conversion.
 200  * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
 201  *
 202  * As expected, this will also decode GBK and GB2312 strings.
      *
      * scope:  wmem scope the returned string is allocated from.
      * ptr:    bytes to convert.
      * length: number of bytes at ptr.
 203  */
 204 WS_DLL_PUBLIC uint8_t *
 205 get_gb18030_string(wmem_allocator_t *scope, const uint8_t *ptr, int length);
 206
 207 /*
 208  * Given a wmem scope, a pointer, and a length, treat the bytes referred to
 209  * by the pointer and length as an EUC-KR encoded string, and return a pointer
 210  * to a UTF-8 string, allocated using the wmem scope, with REPLACEMENT
 211  * CHARACTER substituted during conversion as described in the Unicode
 212  * Standard, section 5.22 U+FFFD Substitution for Conversion.
 213  * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
      *
      * scope:  wmem scope the returned string is allocated from.
      * ptr:    bytes to convert.
      * length: number of bytes at ptr.
 214  */
 215 WS_DLL_PUBLIC uint8_t *
 216 get_euc_kr_string(wmem_allocator_t *scope, const uint8_t *ptr, int length);
 217
     /*
      * NOTE(review): this routine is not documented in this header.  Going
      * by its name and signature it presumably decodes length bytes at ptr
      * as a T.61 string and returns a UTF-8 string allocated from the wmem
      * scope like the other routines in this header - confirm against the
      * implementation.
      */
 218 WS_DLL_PUBLIC uint8_t *
 219 get_t61_string(wmem_allocator_t *scope, const uint8_t *ptr, int length);
 220
     /*
      * NOTE(review): this routine is not documented in this header.  Going
      * by its name and signature it presumably decodes length bytes at ptr
      * as a string in the DECT standard 8-bit character set and returns a
      * UTF-8 string allocated from the wmem scope like the other routines
      * in this header - confirm against the implementation.
      */
 221 WS_DLL_PUBLIC uint8_t *
 222 get_dect_standard_8bits_string(wmem_allocator_t *scope, const uint8_t *ptr, int length);
 223 #ifdef __cplusplus
 224 }
 225 #endif /* __cplusplus */
 226
 227 #endif /* __CHARSETS_H__ */
 228
 229 /*
 230  * Editor modelines  -  https://www.wireshark.org/tools/modelines.html
 231  *
 232  * Local variables:
 233  * c-basic-offset: 4
 234  * tab-width: 8
 235  * indent-tabs-mode: nil
 236  * End:
 237  *
 238  * vi: set shiftwidth=4 tabstop=8 expandtab:
 239  * :indentSize=4:tabSize=8:noTabs=true:
 240  */