src/include/mb/pg_wchar.h

   1 /*-------------------------------------------------------------------------
   2  *
   3  * pg_wchar.h
   4  *        multibyte-character support
   5  *
   6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  * src/include/mb/pg_wchar.h
  10  *
  11  *      NOTES
  12  *              This is used both by the backend and by frontends, but should not be
  13  *              included by libpq client programs.  In particular, a libpq client
  14  *              should not assume that the encoding IDs used by the version of libpq
  15  *              it's linked to match up with the IDs declared here.
  16  *
  17  *-------------------------------------------------------------------------
  18  */
  19 #ifndef PG_WCHAR_H
  20 #define PG_WCHAR_H
  21
  22 /*
  23  * The pg_wchar type
  24  */
  25 typedef unsigned int pg_wchar;
  26
  27 /*
  28  * Maximum byte length of multibyte characters in any backend encoding
  29  */
  30 #define MAX_MULTIBYTE_CHAR_LEN  4
  31
  32 /*
  33  * various definitions for EUC
  34  */
  35 #define SS2 0x8e                                /* single shift 2 (JIS0201) */
  36 #define SS3 0x8f                                /* single shift 3 (JIS0212) */
  37
  38 /*
  39  * SJIS validation macros
  40  */
  41 #define ISSJISHEAD(c) (((c) >= 0x81 && (c) <= 0x9f) || ((c) >= 0xe0 && (c) <= 0xfc))
  42 #define ISSJISTAIL(c) (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xfc))
  43
  44 /*----------------------------------------------------
  45  * MULE Internal Encoding (MIC)
  46  *
  47  * This encoding follows the design used within XEmacs; it is meant to
  48  * subsume many externally-defined character sets.  Each character includes
  49  * identification of the character set it belongs to, so the encoding is
  50  * general but somewhat bulky.
  51  *
  52  * Currently PostgreSQL supports 5 types of MULE character sets:
  53  *
  54  * 1) 1-byte ASCII characters.  Each byte is below 0x80.
  55  *
  56  * 2) "Official" single byte charsets such as ISO-8859-1 (Latin1).
  57  *        Each MULE character consists of 2 bytes: LC1 + C1, where LC1 is
  58  *        an identifier for the charset (in the range 0x81 to 0x8d) and C1
  59  *        is the character code (in the range 0xa0 to 0xff).
  60  *
  61  * 3) "Private" single byte charsets such as SISHENG.  Each MULE
  62  *        character consists of 3 bytes: LCPRV1 + LC12 + C1, where LCPRV1
  63  *        is a private-charset flag, LC12 is an identifier for the charset,
  64  *        and C1 is the character code (in the range 0xa0 to 0xff).
  65  *        LCPRV1 is either 0x9a (if LC12 is in the range 0xa0 to 0xdf)
  66  *        or 0x9b (if LC12 is in the range 0xe0 to 0xef).
  67  *
  68  * 4) "Official" multibyte charsets such as JIS X0208.  Each MULE
  69  *        character consists of 3 bytes: LC2 + C1 + C2, where LC2 is
  70  *        an identifier for the charset (in the range 0x90 to 0x99) and C1
  71  *        and C2 form the character code (each in the range 0xa0 to 0xff).
  72  *
  73  * 5) "Private" multibyte charsets such as CNS 11643-1992 Plane 3.
  74  *        Each MULE character consists of 4 bytes: LCPRV2 + LC22 + C1 + C2,
  75  *        where LCPRV2 is a private-charset flag, LC22 is an identifier for
  76  *        the charset, and C1 and C2 form the character code (each in the range
  77  *        0xa0 to 0xff).  LCPRV2 is either 0x9c (if LC22 is in the range 0xf0
  78  *        to 0xf4) or 0x9d (if LC22 is in the range 0xf5 to 0xfe).
  79  *
  80  * "Official" encodings are those that have been assigned code numbers by
  81  * the XEmacs project; "private" encodings have Postgres-specific charset
  82  * identifiers.
  83  *
  84  * See the "XEmacs Internals Manual", available at http://www.xemacs.org,
  85  * for more details.  Note that for historical reasons, Postgres'
  86  * private-charset flag values do not match what XEmacs says they should be,
  87  * so this isn't really exactly MULE (not that private charsets would be
  88  * interoperable anyway).
  89  *
  90  * Note that XEmacs's implementation is different from what emacs does.
  91  * We follow emacs's implementation, rather than XEmacs's.
  92  *----------------------------------------------------
  93  */
  94
  95 /*
  96  * Charset identifiers (also called "leading bytes" in the MULE documentation)
  97  */
  98
  99 /*
 100  * Charset IDs for official single byte encodings (0x81-0x8e)
 101  */
 102 #define LC_ISO8859_1            0x81    /* ISO8859 Latin 1 */
 103 #define LC_ISO8859_2            0x82    /* ISO8859 Latin 2 */
 104 #define LC_ISO8859_3            0x83    /* ISO8859 Latin 3 */
 105 #define LC_ISO8859_4            0x84    /* ISO8859 Latin 4 */
 106 #define LC_TIS620                       0x85    /* Thai (not supported yet) */
 107 #define LC_ISO8859_7            0x86    /* Greek (not supported yet) */
 108 #define LC_ISO8859_6            0x87    /* Arabic (not supported yet) */
 109 #define LC_ISO8859_8            0x88    /* Hebrew (not supported yet) */
 110 #define LC_JISX0201K            0x89    /* Japanese 1 byte kana */
 111 #define LC_JISX0201R            0x8a    /* Japanese 1 byte Roman */
 112 /* Note that 0x8b seems to be unused as of Emacs 20.7.
 113  * However, there might be a chance that 0x8b could be used
 114  * in later versions of Emacs.
 115  */
 116 #define LC_KOI8_R                       0x8b    /* Cyrillic KOI8-R */
 117 #define LC_ISO8859_5            0x8c    /* ISO8859 Cyrillic */
 118 #define LC_ISO8859_9            0x8d    /* ISO8859 Latin 5 (not supported yet) */
 119 #define LC_ISO8859_15           0x8e    /* ISO8859 Latin 15 (not supported yet) */
 120 /* #define CONTROL_1            0x8f    control characters (unused) */
 121
 122 /* Is a leading byte for "official" single byte encodings? */
 123 #define IS_LC1(c)       ((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8d)
 124
 125 /*
 126  * Charset IDs for official multibyte encodings (0x90-0x99)
 127  * 0x9a-0x9d are free. 0x9e and 0x9f are reserved.
 128  */
 129 #define LC_JISX0208_1978        0x90    /* Japanese Kanji, old JIS (not supported) */
 130 #define LC_GB2312_80            0x91    /* Chinese */
 131 #define LC_JISX0208                     0x92    /* Japanese Kanji (JIS X 0208) */
 132 #define LC_KS5601                       0x93    /* Korean */
 133 #define LC_JISX0212                     0x94    /* Japanese Kanji (JIS X 0212) */
 134 #define LC_CNS11643_1           0x95    /* CNS 11643-1992 Plane 1 */
 135 #define LC_CNS11643_2           0x96    /* CNS 11643-1992 Plane 2 */
 136 #define LC_JISX0213_1           0x97    /* Japanese Kanji (JIS X 0213 Plane 1)
 137                                                                          * (not supported) */
 138 #define LC_BIG5_1                       0x98    /* Plane 1 Chinese traditional (not
 139                                                                          * supported) */
 140 #define LC_BIG5_2                       0x99    /* Plane 1 Chinese traditional (not
 141                                                                          * supported) */
 142
 143 /* Is a leading byte for "official" multibyte encodings? */
 144 #define IS_LC2(c)       ((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99)
 145
 146 /*
 147  * Postgres-specific prefix bytes for "private" single byte encodings
 148  * (According to the MULE docs, we should be using 0x9e for this)
 149  */
 150 #define LCPRV1_A                0x9a
 151 #define LCPRV1_B                0x9b
 152 #define IS_LCPRV1(c)    ((unsigned char)(c) == LCPRV1_A || (unsigned char)(c) == LCPRV1_B)
 153 #define IS_LCPRV1_A_RANGE(c)    \
 154         ((unsigned char)(c) >= 0xa0 && (unsigned char)(c) <= 0xdf)
 155 #define IS_LCPRV1_B_RANGE(c)    \
 156         ((unsigned char)(c) >= 0xe0 && (unsigned char)(c) <= 0xef)
 157
 158 /*
 159  * Postgres-specific prefix bytes for "private" multibyte encodings
 160  * (According to the MULE docs, we should be using 0x9f for this)
 161  */
 162 #define LCPRV2_A                0x9c
 163 #define LCPRV2_B                0x9d
 164 #define IS_LCPRV2(c)    ((unsigned char)(c) == LCPRV2_A || (unsigned char)(c) == LCPRV2_B)
 165 #define IS_LCPRV2_A_RANGE(c)    \
 166         ((unsigned char)(c) >= 0xf0 && (unsigned char)(c) <= 0xf4)
 167 #define IS_LCPRV2_B_RANGE(c)    \
 168         ((unsigned char)(c) >= 0xf5 && (unsigned char)(c) <= 0xfe)
 169
 170 /*
 171  * Charset IDs for private single byte encodings (0xa0-0xef)
 172  */
 173 #define LC_SISHENG                      0xa0    /* Chinese SiSheng characters for
 174                                                                          * PinYin/ZhuYin (not supported) */
 175 #define LC_IPA                          0xa1    /* IPA (International Phonetic
 176                                                                          * Association) (not supported) */
 177 #define LC_VISCII_LOWER         0xa2    /* Vietnamese VISCII1.1 lower-case (not
 178                                                                          * supported) */
 179 #define LC_VISCII_UPPER         0xa3    /* Vietnamese VISCII1.1 upper-case (not
 180                                                                          * supported) */
 181 #define LC_ARABIC_DIGIT         0xa4    /* Arabic digit (not supported) */
 182 #define LC_ARABIC_1_COLUMN      0xa5    /* Arabic 1-column (not supported) */
 183 #define LC_ASCII_RIGHT_TO_LEFT  0xa6    /* ASCII (left half of ISO8859-1) with
 184                                                                                  * right-to-left direction (not
 185                                                                                  * supported) */
 186 #define LC_LAO                          0xa7    /* Lao characters (ISO10646 0E80..0EDF)
 187                                                                          * (not supported) */
 188 #define LC_ARABIC_2_COLUMN      0xa8    /* Arabic 1-column (not supported) */
 189
 190 /*
 191  * Charset IDs for private multibyte encodings (0xf0-0xff)
 192  */
 193 #define LC_INDIAN_1_COLUMN      0xf0    /* Indian charset for 1-column width
 194                                                                          * glyphs (not supported) */
 195 #define LC_TIBETAN_1_COLUMN 0xf1        /* Tibetan 1-column width glyphs (not
 196                                                                          * supported) */
 197 #define LC_UNICODE_SUBSET_2 0xf2        /* Unicode characters of the range
 198                                                                          * U+2500..U+33FF. (not supported) */
 199 #define LC_UNICODE_SUBSET_3 0xf3        /* Unicode characters of the range
 200                                                                          * U+E000..U+FFFF. (not supported) */
 201 #define LC_UNICODE_SUBSET       0xf4    /* Unicode characters of the range
 202                                                                          * U+0100..U+24FF. (not supported) */
 203 #define LC_ETHIOPIC                     0xf5    /* Ethiopic characters (not supported) */
 204 #define LC_CNS11643_3           0xf6    /* CNS 11643-1992 Plane 3 */
 205 #define LC_CNS11643_4           0xf7    /* CNS 11643-1992 Plane 4 */
 206 #define LC_CNS11643_5           0xf8    /* CNS 11643-1992 Plane 5 */
 207 #define LC_CNS11643_6           0xf9    /* CNS 11643-1992 Plane 6 */
 208 #define LC_CNS11643_7           0xfa    /* CNS 11643-1992 Plane 7 */
 209 #define LC_INDIAN_2_COLUMN      0xfb    /* Indian charset for 2-column width
 210                                                                          * glyphs (not supported) */
 211 #define LC_TIBETAN                      0xfc    /* Tibetan (not supported) */
 212 /* #define FREE                         0xfd    free (unused) */
 213 /* #define FREE                         0xfe    free (unused) */
 214 /* #define FREE                         0xff    free (unused) */
 215
 216 /*----------------------------------------------------
 217  * end of MULE stuff
 218  *----------------------------------------------------
 219  */
 220
 221 /*
 222  * PostgreSQL encoding identifiers
 223  *
 224  * WARNING: the order of this enum must be same as order of entries
 225  *                      in the pg_enc2name_tbl[] array (in src/common/encnames.c), and
 226  *                      in the pg_wchar_table[] array (in src/common/wchar.c)!
 227  *
 228  *                      If you add some encoding don't forget to check
 229  *                      PG_ENCODING_BE_LAST macro.
 230  *
 231  * PG_SQL_ASCII is default encoding and must be = 0.
 232  *
 233  * XXX  We must avoid renumbering any backend encoding until libpq's major
 234  * version number is increased beyond 5; it turns out that the backend
 235  * encoding IDs are effectively part of libpq's ABI as far as 8.2 initdb and
 236  * psql are concerned.
 237  */
 238 typedef enum pg_enc
 239 {
 240         PG_SQL_ASCII = 0,                       /* SQL/ASCII */
 241         PG_EUC_JP,                                      /* EUC for Japanese */
 242         PG_EUC_CN,                                      /* EUC for Chinese */
 243         PG_EUC_KR,                                      /* EUC for Korean */
 244         PG_EUC_TW,                                      /* EUC for Taiwan */
 245         PG_EUC_JIS_2004,                        /* EUC-JIS-2004 */
 246         PG_UTF8,                                        /* Unicode UTF8 */
 247         PG_MULE_INTERNAL,                       /* Mule internal code */
 248         PG_LATIN1,                                      /* ISO-8859-1 Latin 1 */
 249         PG_LATIN2,                                      /* ISO-8859-2 Latin 2 */
 250         PG_LATIN3,                                      /* ISO-8859-3 Latin 3 */
 251         PG_LATIN4,                                      /* ISO-8859-4 Latin 4 */
 252         PG_LATIN5,                                      /* ISO-8859-9 Latin 5 */
 253         PG_LATIN6,                                      /* ISO-8859-10 Latin6 */
 254         PG_LATIN7,                                      /* ISO-8859-13 Latin7 */
 255         PG_LATIN8,                                      /* ISO-8859-14 Latin8 */
 256         PG_LATIN9,                                      /* ISO-8859-15 Latin9 */
 257         PG_LATIN10,                                     /* ISO-8859-16 Latin10 */
 258         PG_WIN1256,                                     /* windows-1256 */
 259         PG_WIN1258,                                     /* Windows-1258 */
 260         PG_WIN866,                                      /* (MS-DOS CP866) */
 261         PG_WIN874,                                      /* windows-874 */
 262         PG_KOI8R,                                       /* KOI8-R */
 263         PG_WIN1251,                                     /* windows-1251 */
 264         PG_WIN1252,                                     /* windows-1252 */
 265         PG_ISO_8859_5,                          /* ISO-8859-5 */
 266         PG_ISO_8859_6,                          /* ISO-8859-6 */
 267         PG_ISO_8859_7,                          /* ISO-8859-7 */
 268         PG_ISO_8859_8,                          /* ISO-8859-8 */
 269         PG_WIN1250,                                     /* windows-1250 */
 270         PG_WIN1253,                                     /* windows-1253 */
 271         PG_WIN1254,                                     /* windows-1254 */
 272         PG_WIN1255,                                     /* windows-1255 */
 273         PG_WIN1257,                                     /* windows-1257 */
 274         PG_KOI8U,                                       /* KOI8-U */
 275         /* PG_ENCODING_BE_LAST points to the above entry */
 276
 277         /* followings are for client encoding only */
 278         PG_SJIS,                                        /* Shift JIS (Windows-932) */
 279         PG_BIG5,                                        /* Big5 (Windows-950) */
 280         PG_GBK,                                         /* GBK (Windows-936) */
 281         PG_UHC,                                         /* UHC (Windows-949) */
 282         PG_GB18030,                                     /* GB18030 */
 283         PG_JOHAB,                                       /* EUC for Korean JOHAB */
 284         PG_SHIFT_JIS_2004,                      /* Shift-JIS-2004 */
 285         _PG_LAST_ENCODING_                      /* mark only */
 286
 287 } pg_enc;
 288
 289 #define PG_ENCODING_BE_LAST PG_KOI8U
 290
 291 /*
 292  * Please use these tests before access to pg_enc2name_tbl[]
 293  * or to other places...
 294  */
 295 #define PG_VALID_BE_ENCODING(_enc) \
 296                 ((_enc) >= 0 && (_enc) <= PG_ENCODING_BE_LAST)
 297
 298 #define PG_ENCODING_IS_CLIENT_ONLY(_enc) \
 299                 ((_enc) > PG_ENCODING_BE_LAST && (_enc) < _PG_LAST_ENCODING_)
 300
 301 #define PG_VALID_ENCODING(_enc) \
 302                 ((_enc) >= 0 && (_enc) < _PG_LAST_ENCODING_)
 303
 304 /* On FE are possible all encodings */
 305 #define PG_VALID_FE_ENCODING(_enc)      PG_VALID_ENCODING(_enc)
 306
 307 /*
 308  * When converting strings between different encodings, we assume that space
 309  * for converted result is 4-to-1 growth in the worst case.  The rate for
 310  * currently supported encoding pairs are within 3 (SJIS JIS X0201 half width
 311  * kana -> UTF8 is the worst case).  So "4" should be enough for the moment.
 312  *
 313  * Note that this is not the same as the maximum character width in any
 314  * particular encoding.
 315  */
 316 #define MAX_CONVERSION_GROWTH  4
 317
 318 /*
 319  * Maximum byte length of a string that's required in any encoding to convert
 320  * at least one character to any other encoding.  In other words, if you feed
 321  * MAX_CONVERSION_INPUT_LENGTH bytes to any encoding conversion function, it
 322  * is guaranteed to be able to convert something without needing more input
 323  * (assuming the input is valid).
 324  *
 325  * Currently, the maximum case is the conversion UTF8 -> SJIS JIS X0201 half
 326  * width kana, where a pair of UTF-8 characters is converted into a single
 327  * SHIFT_JIS_2004 character (the reverse of the worst case for
 328  * MAX_CONVERSION_GROWTH).  It needs 6 bytes of input.  In theory, a
 329  * user-defined conversion function might have more complicated cases, although
 330  * for the reverse mapping you would probably also need to bump up
 331  * MAX_CONVERSION_GROWTH.  But there is no need to be stingy here, so make it
 332  * generous.
 333  */
 334 #define MAX_CONVERSION_INPUT_LENGTH     16
 335
 336 /*
 337  * Maximum byte length of the string equivalent to any one Unicode code point,
 338  * in any backend encoding.  The current value assumes that a 4-byte UTF-8
 339  * character might expand by MAX_CONVERSION_GROWTH, which is a huge
 340  * overestimate.  But in current usage we don't allocate large multiples of
 341  * this, so there's little point in being stingy.
 342  */
 343 #define MAX_UNICODE_EQUIVALENT_STRING   16
 344
 345 /*
 346  * Table for mapping an encoding number to official encoding name and
 347  * possibly other subsidiary data.  Be careful to check encoding number
 348  * before accessing a table entry!
 349  *
 350  * if (PG_VALID_ENCODING(encoding))
 351  *              pg_enc2name_tbl[ encoding ];
 352  */
 353 typedef struct pg_enc2name
 354 {
 355         const char *name;
 356         pg_enc          encoding;
 357 #ifdef WIN32
 358         unsigned        codepage;               /* codepage for WIN32 */
 359 #endif
 360 } pg_enc2name;
 361
 362 extern const pg_enc2name pg_enc2name_tbl[];
 363
 364 /*
 365  * Encoding names for gettext
 366  */
 367 typedef struct pg_enc2gettext
 368 {
 369         pg_enc          encoding;
 370         const char *name;
 371 } pg_enc2gettext;
 372
 373 extern const pg_enc2gettext pg_enc2gettext_tbl[];
 374
 375 /*
 376  * pg_wchar stuff
 377  */
 378 typedef int (*mb2wchar_with_len_converter) (const unsigned char *from,
 379                                                                                         pg_wchar *to,
 380                                                                                         int len);
 381
 382 typedef int (*wchar2mb_with_len_converter) (const pg_wchar *from,
 383                                                                                         unsigned char *to,
 384                                                                                         int len);
 385
 386 typedef int (*mblen_converter) (const unsigned char *mbstr);
 387
 388 typedef int (*mbdisplaylen_converter) (const unsigned char *mbstr);
 389
 390 typedef bool (*mbcharacter_incrementer) (unsigned char *mbstr, int len);
 391
 392 typedef int (*mbchar_verifier) (const unsigned char *mbstr, int len);
 393
 394 typedef int (*mbstr_verifier) (const unsigned char *mbstr, int len);
 395
 396 typedef struct
 397 {
 398         mb2wchar_with_len_converter mb2wchar_with_len;  /* convert a multibyte
 399                                                                                                          * string to a wchar */
 400         wchar2mb_with_len_converter wchar2mb_with_len;  /* convert a wchar string
 401                                                                                                          * to a multibyte */
 402         mblen_converter mblen;          /* get byte length of a char */
 403         mbdisplaylen_converter dsplen;  /* get display width of a char */
 404         mbchar_verifier mbverifychar;   /* verify multibyte character */
 405         mbstr_verifier mbverifystr; /* verify multibyte string */
 406         int                     maxmblen;               /* max bytes for a char in this encoding */
 407 } pg_wchar_tbl;
 408
 409 extern const pg_wchar_tbl pg_wchar_table[];
 410
 411 /*
 412  * Data structures for conversions between UTF-8 and other encodings
 413  * (UtfToLocal() and LocalToUtf()).  In these data structures, characters of
 414  * either encoding are represented by uint32 words; hence we can only support
 415  * characters up to 4 bytes long.  For example, the byte sequence 0xC2 0x89
 416  * would be represented by 0x0000C289, and 0xE8 0xA2 0xB4 by 0x00E8A2B4.
 417  *
 418  * There are three possible ways a character can be mapped:
 419  *
 420  * 1. Using a radix tree, from source to destination code.
 421  * 2. Using a sorted array of source -> destination code pairs. This
 422  *        method is used for "combining" characters. There are so few of
 423  *        them that building a radix tree would be wasteful.
 424  * 3. Using a conversion function.
 425  */
 426
 427 /*
 428  * Radix tree for character conversion.
 429  *
 430  * Logically, this is actually four different radix trees, for 1-byte,
 431  * 2-byte, 3-byte and 4-byte inputs. The 1-byte tree is a simple lookup
 432  * table from source to target code. The 2-byte tree consists of two levels:
 433  * one lookup table for the first byte, where the value in the lookup table
 434  * points to a lookup table for the second byte. And so on.
 435  *
 436  * Physically, all the trees are stored in one big array, in 'chars16' or
 437  * 'chars32', depending on the maximum value that needs to be represented. For
 438  * each level in each tree, we also store lower and upper bound of allowed
 439  * values - values outside those bounds are considered invalid, and are left
 440  * out of the tables.
 441  *
 442  * In the intermediate levels of the trees, the values stored are offsets
 443  * into the chars[16|32] array.
 444  *
 445  * In the beginning of the chars[16|32] array, there is always a number of
 446  * zeros, so that you safely follow an index from an intermediate table
 447  * without explicitly checking for a zero. Following a zero any number of
 448  * times will always bring you to the dummy, all-zeros table in the
 449  * beginning. This helps to shave some cycles when looking up values.
 450  */
 451 typedef struct
 452 {
 453         /*
 454          * Array containing all the values. Only one of chars16 or chars32 is
 455          * used, depending on how wide the values we need to represent are.
 456          */
 457         const uint16 *chars16;
 458         const uint32 *chars32;
 459
 460         /* Radix tree for 1-byte inputs */
 461         uint32          b1root;                 /* offset of table in the chars[16|32] array */
 462         uint8           b1_lower;               /* min allowed value for a single byte input */
 463         uint8           b1_upper;               /* max allowed value for a single byte input */
 464
 465         /* Radix tree for 2-byte inputs */
 466         uint32          b2root;                 /* offset of 1st byte's table */
 467         uint8           b2_1_lower;             /* min/max allowed value for 1st input byte */
 468         uint8           b2_1_upper;
 469         uint8           b2_2_lower;             /* min/max allowed value for 2nd input byte */
 470         uint8           b2_2_upper;
 471
 472         /* Radix tree for 3-byte inputs */
 473         uint32          b3root;                 /* offset of 1st byte's table */
 474         uint8           b3_1_lower;             /* min/max allowed value for 1st input byte */
 475         uint8           b3_1_upper;
 476         uint8           b3_2_lower;             /* min/max allowed value for 2nd input byte */
 477         uint8           b3_2_upper;
 478         uint8           b3_3_lower;             /* min/max allowed value for 3rd input byte */
 479         uint8           b3_3_upper;
 480
 481         /* Radix tree for 4-byte inputs */
 482         uint32          b4root;                 /* offset of 1st byte's table */
 483         uint8           b4_1_lower;             /* min/max allowed value for 1st input byte */
 484         uint8           b4_1_upper;
 485         uint8           b4_2_lower;             /* min/max allowed value for 2nd input byte */
 486         uint8           b4_2_upper;
 487         uint8           b4_3_lower;             /* min/max allowed value for 3rd input byte */
 488         uint8           b4_3_upper;
 489         uint8           b4_4_lower;             /* min/max allowed value for 4th input byte */
 490         uint8           b4_4_upper;
 491
 492 } pg_mb_radix_tree;
 493
 494 /*
 495  * UTF-8 to local code conversion map (for combined characters)
 496  */
 497 typedef struct
 498 {
 499         uint32          utf1;                   /* UTF-8 code 1 */
 500         uint32          utf2;                   /* UTF-8 code 2 */
 501         uint32          code;                   /* local code */
 502 } pg_utf_to_local_combined;
 503
 504 /*
 505  * local code to UTF-8 conversion map (for combined characters)
 506  */
 507 typedef struct
 508 {
 509         uint32          code;                   /* local code */
 510         uint32          utf1;                   /* UTF-8 code 1 */
 511         uint32          utf2;                   /* UTF-8 code 2 */
 512 } pg_local_to_utf_combined;
 513
 514 /*
 515  * callback function for algorithmic encoding conversions (in either direction)
 516  *
 517  * if function returns zero, it does not know how to convert the code
 518  */
 519 typedef uint32 (*utf_local_conversion_func) (uint32 code);
 520
 521 /*
 522  * Support macro for encoding conversion functions to validate their
 523  * arguments.  (This could be made more compact if we included fmgr.h
 524  * here, but we don't want to do that because this header file is also
 525  * used by frontends.)
 526  */
 527 #define CHECK_ENCODING_CONVERSION_ARGS(srcencoding,destencoding) \
 528         check_encoding_conversion_args(PG_GETARG_INT32(0), \
 529                                                                    PG_GETARG_INT32(1), \
 530                                                                    PG_GETARG_INT32(4), \
 531                                                                    (srcencoding), \
 532                                                                    (destencoding))
 533
 534
 535 /*
 536  * Some handy functions for Unicode-specific tests.
 537  */
 538 static inline bool
 539 is_valid_unicode_codepoint(pg_wchar c)
 540 {
 541         return (c > 0 && c <= 0x10FFFF);
 542 }
 543
 544 static inline bool
 545 is_utf16_surrogate_first(pg_wchar c)
 546 {
 547         return (c >= 0xD800 && c <= 0xDBFF);
 548 }
 549
 550 static inline bool
 551 is_utf16_surrogate_second(pg_wchar c)
 552 {
 553         return (c >= 0xDC00 && c <= 0xDFFF);
 554 }
 555
 556 static inline pg_wchar
 557 surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
 558 {
 559         return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
 560 }
 561
 562
 563 /*
 564  * These functions are considered part of libpq's exported API and
 565  * are also declared in libpq-fe.h.
 566  */
 567 extern int      pg_char_to_encoding(const char *name);
 568 extern const char *pg_encoding_to_char(int encoding);
 569 extern int      pg_valid_server_encoding_id(int encoding);
 570
 571 /*
 572  * These functions are available to frontend code that links with libpgcommon
 573  * (in addition to the ones just above).  The constant tables declared
 574  * earlier in this file are also available from libpgcommon.
 575  */
 576 extern int      pg_encoding_mblen(int encoding, const char *mbstr);
 577 extern int      pg_encoding_mblen_bounded(int encoding, const char *mbstr);
 578 extern int      pg_encoding_dsplen(int encoding, const char *mbstr);
 579 extern int      pg_encoding_verifymbchar(int encoding, const char *mbstr, int len);
 580 extern int      pg_encoding_verifymbstr(int encoding, const char *mbstr, int len);
 581 extern int      pg_encoding_max_length(int encoding);
 582 extern int      pg_valid_client_encoding(const char *name);
 583 extern int      pg_valid_server_encoding(const char *name);
 584 extern bool is_encoding_supported_by_icu(int encoding);
 585 extern const char *get_encoding_name_for_icu(int encoding);
 586
 587 extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
 588 extern pg_wchar utf8_to_unicode(const unsigned char *c);
 589 extern bool pg_utf8_islegal(const unsigned char *source, int length);
 590 extern int      pg_utf_mblen(const unsigned char *s);
 591 extern int      pg_mule_mblen(const unsigned char *s);
 592
 593 /*
 594  * The remaining functions are backend-only.
 595  */
 /*
  * Multibyte (char *) <-> wide (pg_wchar *) string routines.
  *
  * Several entry points come in up to three flavours:
  *   - the plain name, taking only the string argument(s);
  *   - a "_with_len" form that adds an explicit int len;
  *   - a "pg_encoding_" form that adds an explicit int encoding.
  *
  * NOTE(review): the forms without an encoding argument presumably use
  * the current database encoding, and the forms without a length
  * presumably expect terminated input -- neither is visible in this
  * header chunk, so verify against the definitions.
  *
  * mbcharacter_incrementer is a type declared elsewhere in this header.
  */
 596 extern int      pg_mb2wchar(const char *from, pg_wchar *to);
 597 extern int      pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len);
 598 extern int      pg_encoding_mb2wchar_with_len(int encoding,
 599                                                                                   const char *from, pg_wchar *to, int len);
 600 extern int      pg_wchar2mb(const pg_wchar *from, char *to);
 601 extern int      pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len);
 602 extern int      pg_encoding_wchar2mb_with_len(int encoding,
 603                                                                                   const pg_wchar *from, char *to, int len);
 604 extern int      pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2);
 605 extern int      pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n);
 606 extern int      pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n);
 607 extern size_t pg_wchar_strlen(const pg_wchar *wstr);
 608 extern int      pg_mblen(const char *mbstr);
 609 extern int      pg_dsplen(const char *mbstr);
 610 extern int      pg_mbstrlen(const char *mbstr);
 611 extern int      pg_mbstrlen_with_len(const char *mbstr, int len);
 612 extern int      pg_mbcliplen(const char *mbstr, int len, int limit);
 613 extern int      pg_encoding_mbcliplen(int encoding, const char *mbstr,
 614                                                                   int len, int limit);
 615 extern int      pg_mbcharcliplen(const char *mbstr, int len, int limit);
 616 extern int      pg_database_encoding_max_length(void);
 617 extern mbcharacter_incrementer pg_database_encoding_character_incrementer(void);
 618
 /*
  * Client-encoding management.
  *
  * PrepareClientEncoding() and SetClientEncoding() both take an encoding
  * ID and return int.  NOTE(review): the success/failure convention of
  * that int and the required call order are not visible here -- confirm.
  * The two pg_get_* accessors expose the current setting as an ID and as
  * a name string respectively.
  */
 619 extern int      PrepareClientEncoding(int encoding);
 620 extern int      SetClientEncoding(int encoding);
 621 extern void InitializeClientEncoding(void);
 622 extern int      pg_get_client_encoding(void);
 623 extern const char *pg_get_client_encoding_name(void);
 624
 /*
  * Setters and getters for the database encoding and the message
  * encoding.  Setters take an encoding ID and return nothing; getters
  * return the ID (or, for GetDatabaseEncodingName, a name string).
  */
 625 extern void SetDatabaseEncoding(int encoding);
 626 extern int      GetDatabaseEncoding(void);
 627 extern const char *GetDatabaseEncodingName(void);
 628 extern void SetMessageEncoding(int encoding);
 629 extern int      GetMessageEncoding(void);
 630
 /* Declared only in builds where ENABLE_NLS is defined. */
 631 #ifdef ENABLE_NLS
 632 extern int      pg_bind_textdomain_codeset(const char *domainname);
 633 #endif
 634
 /*
  * General encoding conversion entry points.
  *
  * pg_do_encoding_conversion() takes a source buffer plus source and
  * destination encoding IDs and returns an unsigned char pointer.
  * NOTE(review): src is not const-qualified; whether the result may be
  * src itself or a new allocation is not visible here -- confirm before
  * freeing either pointer.
  *
  * pg_do_encoding_conversion_buf() works with caller-supplied source and
  * destination buffers (src/srclen, dst/dstlen), a conversion procedure
  * identified by Oid, and a noError flag; it returns int.
  * NOTE(review): meaning of the return value to be confirmed.
  */
 635 extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
 636                                                                                                 int src_encoding,
 637                                                                                                 int dest_encoding);
 638 extern int      pg_do_encoding_conversion_buf(Oid proc,
 639                                                                                   int src_encoding,
 640                                                                                   int dest_encoding,
 641                                                                                   unsigned char *src, int srclen,
 642                                                                                   unsigned char *dst, int dstlen,
 643                                                                                   bool noError);
 644
 /*
  * String conversion wrappers taking a const input string and its length.
  * The "any" variants additionally take the other side's encoding ID.
  *
  * NOTE(review): the return type is a non-const char * while the input is
  * const; whether the result can alias the input is not visible here --
  * confirm ownership rules with the definitions.
  */
 645 extern char *pg_client_to_server(const char *s, int len);
 646 extern char *pg_server_to_client(const char *s, int len);
 647 extern char *pg_any_to_server(const char *s, int len, int encoding);
 648 extern char *pg_server_to_any(const char *s, int len, int encoding);
 649
 /*
  * Takes a single pg_wchar value and an output byte buffer s.
  * NOTE(review): no buffer size is passed, so the caller must provide
  * enough room -- confirm the required size with the definition.
  */
 650 extern void pg_unicode_to_server(pg_wchar c, unsigned char *s);
 651
 /*
  * Conversions between 16-bit code values.  BIG5toCNS() receives lc as a
  * pointer (an output parameter, presumably a leading/charset byte --
  * confirm), whereas CNStoBIG5() receives lc by value.
  */
 652 extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
 653 extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
 654
 /*
  * Table-driven conversion in both directions, with parallel signatures:
  *   - input buffer and its length, output buffer;
  *   - a radix-tree map (pg_mb_radix_tree);
  *   - a "combined" map array with its element count (cmap, cmapsize);
  *   - a conversion callback (utf_local_conversion_func);
  *   - the encoding ID and a noError flag.
  *
  * The map, combined-map and callback types are declared elsewhere in
  * this header.  NOTE(review): which of map/cmap/conv_func may be NULL,
  * and what the int result denotes, is not visible here -- confirm.
  */
 655 extern int      UtfToLocal(const unsigned char *utf, int len,
 656                                            unsigned char *iso,
 657                                            const pg_mb_radix_tree *map,
 658                                            const pg_utf_to_local_combined *cmap, int cmapsize,
 659                                            utf_local_conversion_func conv_func,
 660                                            int encoding, bool noError);
 661 extern int      LocalToUtf(const unsigned char *iso, int len,
 662                                            unsigned char *utf,
 663                                            const pg_mb_radix_tree *map,
 664                                            const pg_local_to_utf_combined *cmap, int cmapsize,
 665                                            utf_local_conversion_func conv_func,
 666                                            int encoding, bool noError);
 667
 /*
  * Multibyte string verification.  pg_verifymbstr() takes no encoding
  * argument; the pg_verify_mbstr* forms take one explicitly.  All accept
  * a noError flag.  The first two return bool, the "_len" form returns
  * int.
  *
  * NOTE(review): presumably noError selects between reporting an error
  * and returning a failure indication -- confirm with the definitions.
  */
 668 extern bool pg_verifymbstr(const char *mbstr, int len, bool noError);
 669 extern bool pg_verify_mbstr(int encoding, const char *mbstr, int len,
 670                                                         bool noError);
 671 extern int      pg_verify_mbstr_len(int encoding, const char *mbstr, int len,
 672                                                                 bool noError);
 673
 /*
  * Takes the actual source/destination encoding IDs and length together
  * with the expected source/destination encoding IDs.  Returns nothing;
  * NOTE(review): how a mismatch is reported is not visible here.
  */
 674 extern void check_encoding_conversion_args(int src_encoding,
 675                                                                                    int dest_encoding,
 676                                                                                    int len,
 677                                                                                    int expected_src_encoding,
 678                                                                                    int expected_dest_encoding);
 679
 /*
  * Error-reporting routines.  Both are annotated with
  * pg_attribute_noreturn(), i.e. declared as never returning to the
  * caller.
  */
 680 extern void report_invalid_encoding(int encoding, const char *mbstr, int len) pg_attribute_noreturn();
 681 extern void report_untranslatable_char(int src_encoding, int dest_encoding,
 682                                                                            const char *mbstr, int len) pg_attribute_noreturn();
 683
 /*
  * Byte-oriented conversion helpers.  Each takes a const input buffer, an
  * output buffer p, the input length, encoding information and a noError
  * flag, and returns int.  local2local() and the "_with_table" variants
  * additionally take a const lookup table (tab); the latin/mic routines
  * take an int lc.
  *
  * NOTE(review): the output buffer size is not passed, so sizing is the
  * caller's responsibility -- confirm the required capacity and the
  * meaning of the int result with the definitions.
  */
 684 extern int      local2local(const unsigned char *l, unsigned char *p, int len,
 685                                                 int src_encoding, int dest_encoding,
 686                                                 const unsigned char *tab, bool noError);
 687 extern int      latin2mic(const unsigned char *l, unsigned char *p, int len,
 688                                           int lc, int encoding, bool noError);
 689 extern int      mic2latin(const unsigned char *mic, unsigned char *p, int len,
 690                                           int lc, int encoding, bool noError);
 691 extern int      latin2mic_with_table(const unsigned char *l, unsigned char *p,
 692                                                                  int len, int lc, int encoding,
 693                                                                  const unsigned char *tab, bool noError);
 694 extern int      mic2latin_with_table(const unsigned char *mic, unsigned char *p,
 695                                                                  int len, int lc, int encoding,
 696                                                                  const unsigned char *tab, bool noError);
 697
 /*
  * Declared only when WIN32 is defined.  Takes a string with its length
  * and an int pointer utf16len (presumably an output length -- confirm),
  * and returns a WCHAR pointer.  NOTE(review): ownership of the returned
  * buffer is not visible here.
  */
 698 #ifdef WIN32
 699 extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len);
 700 #endif
 701
 702 #endif                                                  /* PG_WCHAR_H */