1 /*-------------------------------------------------------------------------
4 * Encoding names and routines for working with them.
6 * Portions Copyright (c) 2001-2024, PostgreSQL Global Development Group
9 * src/common/encnames.c
11 *-------------------------------------------------------------------------
18 #include "mb/pg_wchar.h"
22 * All encoding names, sorted: *** A L P H A B E T I C ***
24 * All names must be stored without irrelevant chars; the search routines
25 * keep isalnum() chars only. This means ISO-8859-1, iso_8859-1 and Iso8859_1
26 * are always converted to 'iso88591'. All names must be lower case.
28 * The table doesn't contain 'cs' aliases (like csISOLatin1). Are they needed?
/*
 * Element type of pg_encname_tbl.  The lookup code in this file reads two
 * members through it: ->name (compared with strcmp) and ->encoding (the
 * value returned on a match).  The member declarations themselves are not
 * visible in this listing.
 */
33 typedef struct pg_encname
/*
 * Lookup table from cleaned-up encoding names to encoding identifiers.
 *
 * Every visible key is lower-case alphanumeric, which is the form produced
 * by clean_encoding_name().  pg_char_to_encoding() probes this table by
 * midpoint and compares with strcmp(), so entries have to stay in strcmp()
 * order.
 *
 * NOTE(review): a number of entries are not visible in this listing; check
 * the ordering against the complete file before adding a name.
 */
39 static const pg_encname pg_encname_tbl
[] =
43 }, /* alias for WIN1258 */
49 }, /* Big5; Chinese for Taiwan multibyte set */
52 }, /* EUC-CN; Extended Unix Code for simplified
55 "eucjis2004", PG_EUC_JIS_2004
56 }, /* EUC-JIS-2004; Extended UNIX Code fixed
57 * Width for Japanese, standard JIS X 0213 */
60 }, /* EUC-JP; Extended UNIX Code fixed Width for
61 * Japanese, standard OSF */
64 }, /* EUC-KR; Extended Unix Code for Korean , KS
68 }, /* EUC-TW; Extended Unix Code for
70 * traditional Chinese */
73 }, /* GB18030;GB18030 */
76 }, /* GBK; Chinese Windows CodePage 936
77 * simplified Chinese */
80 }, /* ISO-8859-1; RFC1345,KXS2 */
82 "iso885910", PG_LATIN6
83 }, /* ISO-8859-10; RFC1345,KXS2 */
85 "iso885913", PG_LATIN7
86 }, /* ISO-8859-13; RFC1345,KXS2 */
88 "iso885914", PG_LATIN8
89 }, /* ISO-8859-14; RFC1345,KXS2 */
91 "iso885915", PG_LATIN9
92 }, /* ISO-8859-15; RFC1345,KXS2 */
94 "iso885916", PG_LATIN10
95 }, /* ISO-8859-16; RFC1345,KXS2 */
98 }, /* ISO-8859-2; RFC1345,KXS2 */
100 "iso88593", PG_LATIN3
101 }, /* ISO-8859-3; RFC1345,KXS2 */
103 "iso88594", PG_LATIN4
104 }, /* ISO-8859-4; RFC1345,KXS2 */
106 "iso88595", PG_ISO_8859_5
107 }, /* ISO-8859-5; RFC1345,KXS2 */
109 "iso88596", PG_ISO_8859_6
110 }, /* ISO-8859-6; RFC1345,KXS2 */
112 "iso88597", PG_ISO_8859_7
113 }, /* ISO-8859-7; RFC1345,KXS2 */
115 "iso88598", PG_ISO_8859_8
116 }, /* ISO-8859-8; RFC1345,KXS2 */
118 "iso88599", PG_LATIN5
119 }, /* ISO-8859-9; RFC1345,KXS2 */
122 }, /* JOHAB; Extended Unix Code for simplified
126 }, /* _dirty_ alias for KOI8-R (backward
130 }, /* KOI8-R; RFC1489 */
133 }, /* KOI8-U; RFC2319 */
136 }, /* alias for ISO-8859-1 */
138 "latin10", PG_LATIN10
139 }, /* alias for ISO-8859-16 */
142 }, /* alias for ISO-8859-2 */
145 }, /* alias for ISO-8859-3 */
148 }, /* alias for ISO-8859-4 */
151 }, /* alias for ISO-8859-9 */
154 }, /* alias for ISO-8859-10 */
157 }, /* alias for ISO-8859-13 */
160 }, /* alias for ISO-8859-14 */
163 }, /* alias for ISO-8859-15 */
166 }, /* alias for Shift_JIS */
168 "muleinternal", PG_MULE_INTERNAL
172 }, /* Shift_JIS; JIS X 0202-1991 */
175 "shiftjis2004", PG_SHIFT_JIS_2004
176 }, /* SHIFT-JIS-2004; Shift JIS for Japanese,
177 * standard JIS X 0213 */
180 }, /* alias for Shift_JIS */
182 "sqlascii", PG_SQL_ASCII
186 }, /* alias for WIN1258 */
188 "tcvn5712", PG_WIN1258
189 }, /* alias for WIN1258 */
192 }, /* UHC; Korean Windows CodePage 949 */
195 }, /* alias for UTF8 */
198 }, /* alias for UTF8 */
201 }, /* alias for WIN1258 */
204 }, /* _dirty_ alias for windows-1251 (backward
207 "win1250", PG_WIN1250
208 }, /* alias for Windows-1250 */
210 "win1251", PG_WIN1251
211 }, /* alias for Windows-1251 */
213 "win1252", PG_WIN1252
214 }, /* alias for Windows-1252 */
216 "win1253", PG_WIN1253
217 }, /* alias for Windows-1253 */
219 "win1254", PG_WIN1254
220 }, /* alias for Windows-1254 */
222 "win1255", PG_WIN1255
223 }, /* alias for Windows-1255 */
225 "win1256", PG_WIN1256
226 }, /* alias for Windows-1256 */
228 "win1257", PG_WIN1257
229 }, /* alias for Windows-1257 */
231 "win1258", PG_WIN1258
232 }, /* alias for Windows-1258 */
238 }, /* alias for Windows-874 */
241 }, /* alias for Shift_JIS */
244 }, /* alias for GBK */
247 }, /* alias for UHC */
250 }, /* alias for BIG5 */
252 "windows1250", PG_WIN1250
253 }, /* Windows-1250; Microsoft */
255 "windows1251", PG_WIN1251
256 }, /* Windows-1251; Microsoft */
258 "windows1252", PG_WIN1252
259 }, /* Windows-1252; Microsoft */
261 "windows1253", PG_WIN1253
262 }, /* Windows-1253; Microsoft */
264 "windows1254", PG_WIN1254
265 }, /* Windows-1254; Microsoft */
267 "windows1255", PG_WIN1255
268 }, /* Windows-1255; Microsoft */
270 "windows1256", PG_WIN1256
271 }, /* Windows-1256; Microsoft */
273 "windows1257", PG_WIN1257
274 }, /* Windows-1257; Microsoft */
276 "windows1258", PG_WIN1258
277 }, /* Windows-1258; Microsoft */
279 "windows866", PG_WIN866
282 "windows874", PG_WIN874
283 }, /* Windows-874; Microsoft */
285 "windows932", PG_SJIS
286 }, /* alias for Shift_JIS */
289 }, /* alias for GBK */
292 }, /* alias for UHC */
294 "windows950", PG_BIG5
295 } /* alias for BIG5 */
299 * These are "official" encoding names.
/*
 * DEF_ENC2NAME builds one pg_enc2name_tbl initializer from an encoding's
 * bare name: #name supplies the string and PG_##name the identifier.
 * Two variants appear here; the first drops the codepage argument, the
 * second stores it as a third member.
 * NOTE(review): the preprocessor conditional that selects between the two
 * definitions is not visible in this listing.
 */
303 #define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
305 #define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
/*
 * Table of pg_enc2name entries, placed at the index of their encoding
 * identifier by designated initializers.  Each entry is generated by
 * DEF_ENC2NAME from the encoding's name and a numeric code page.
 * pg_encoding_to_char() indexes this table directly by encoding.
 *
 * Several entries pass 0 as the code page -- presumably "no matching code
 * page"; confirm against the consumer of that member.
 */
308 const pg_enc2name pg_enc2name_tbl
[] =
310 [PG_SQL_ASCII
] = DEF_ENC2NAME(SQL_ASCII
, 0),
311 [PG_EUC_JP
] = DEF_ENC2NAME(EUC_JP
, 20932),
312 [PG_EUC_CN
] = DEF_ENC2NAME(EUC_CN
, 20936),
313 [PG_EUC_KR
] = DEF_ENC2NAME(EUC_KR
, 51949),
314 [PG_EUC_TW
] = DEF_ENC2NAME(EUC_TW
, 0),
315 [PG_EUC_JIS_2004
] = DEF_ENC2NAME(EUC_JIS_2004
, 20932),
316 [PG_UTF8
] = DEF_ENC2NAME(UTF8
, 65001),
317 [PG_MULE_INTERNAL
] = DEF_ENC2NAME(MULE_INTERNAL
, 0),
318 [PG_LATIN1
] = DEF_ENC2NAME(LATIN1
, 28591),
319 [PG_LATIN2
] = DEF_ENC2NAME(LATIN2
, 28592),
320 [PG_LATIN3
] = DEF_ENC2NAME(LATIN3
, 28593),
321 [PG_LATIN4
] = DEF_ENC2NAME(LATIN4
, 28594),
322 [PG_LATIN5
] = DEF_ENC2NAME(LATIN5
, 28599),
323 [PG_LATIN6
] = DEF_ENC2NAME(LATIN6
, 0),
324 [PG_LATIN7
] = DEF_ENC2NAME(LATIN7
, 0),
325 [PG_LATIN8
] = DEF_ENC2NAME(LATIN8
, 0),
326 [PG_LATIN9
] = DEF_ENC2NAME(LATIN9
, 28605),
327 [PG_LATIN10
] = DEF_ENC2NAME(LATIN10
, 0),
328 [PG_WIN1256
] = DEF_ENC2NAME(WIN1256
, 1256),
329 [PG_WIN1258
] = DEF_ENC2NAME(WIN1258
, 1258),
330 [PG_WIN866
] = DEF_ENC2NAME(WIN866
, 866),
331 [PG_WIN874
] = DEF_ENC2NAME(WIN874
, 874),
332 [PG_KOI8R
] = DEF_ENC2NAME(KOI8R
, 20866),
333 [PG_WIN1251
] = DEF_ENC2NAME(WIN1251
, 1251),
334 [PG_WIN1252
] = DEF_ENC2NAME(WIN1252
, 1252),
335 [PG_ISO_8859_5
] = DEF_ENC2NAME(ISO_8859_5
, 28595),
336 [PG_ISO_8859_6
] = DEF_ENC2NAME(ISO_8859_6
, 28596),
337 [PG_ISO_8859_7
] = DEF_ENC2NAME(ISO_8859_7
, 28597),
338 [PG_ISO_8859_8
] = DEF_ENC2NAME(ISO_8859_8
, 28598),
339 [PG_WIN1250
] = DEF_ENC2NAME(WIN1250
, 1250),
340 [PG_WIN1253
] = DEF_ENC2NAME(WIN1253
, 1253),
341 [PG_WIN1254
] = DEF_ENC2NAME(WIN1254
, 1254),
342 [PG_WIN1255
] = DEF_ENC2NAME(WIN1255
, 1255),
343 [PG_WIN1257
] = DEF_ENC2NAME(WIN1257
, 1257),
344 [PG_KOI8U
] = DEF_ENC2NAME(KOI8U
, 21866),
345 [PG_SJIS
] = DEF_ENC2NAME(SJIS
, 932),
346 [PG_BIG5
] = DEF_ENC2NAME(BIG5
, 950),
347 [PG_GBK
] = DEF_ENC2NAME(GBK
, 936),
348 [PG_UHC
] = DEF_ENC2NAME(UHC
, 949),
349 [PG_GB18030
] = DEF_ENC2NAME(GB18030
, 54936),
350 [PG_JOHAB
] = DEF_ENC2NAME(JOHAB
, 0),
351 [PG_SHIFT_JIS_2004
] = DEF_ENC2NAME(SHIFT_JIS_2004
, 932),
355 * These are encoding names for gettext.
357 * This covers all encodings except MULE_INTERNAL, which is alien to gettext.
/*
 * Per-encoding names handed to gettext, placed at the index of their
 * encoding identifier by designated initializers.  PG_MULE_INTERNAL maps
 * to NULL, and PG_EUC_JIS_2004 shares the "EUC-JP" name with PG_EUC_JP.
 *
 * The spellings are not uniform (for example "ISO_8859-6" and "LATIN-9").
 * These are runtime strings: do not normalize them without checking what
 * the consumer of this table expects.
 */
360 const char *pg_enc2gettext_tbl
[] =
362 [PG_SQL_ASCII
] = "US-ASCII",
364 [PG_MULE_INTERNAL
] = NULL
,
365 [PG_LATIN1
] = "LATIN1",
366 [PG_LATIN2
] = "LATIN2",
367 [PG_LATIN3
] = "LATIN3",
368 [PG_LATIN4
] = "LATIN4",
369 [PG_ISO_8859_5
] = "ISO-8859-5",
370 [PG_ISO_8859_6
] = "ISO_8859-6",
371 [PG_ISO_8859_7
] = "ISO-8859-7",
372 [PG_ISO_8859_8
] = "ISO-8859-8",
373 [PG_LATIN5
] = "LATIN5",
374 [PG_LATIN6
] = "LATIN6",
375 [PG_LATIN7
] = "LATIN7",
376 [PG_LATIN8
] = "LATIN8",
377 [PG_LATIN9
] = "LATIN-9",
378 [PG_LATIN10
] = "LATIN10",
379 [PG_KOI8R
] = "KOI8-R",
380 [PG_KOI8U
] = "KOI8-U",
381 [PG_WIN1250
] = "CP1250",
382 [PG_WIN1251
] = "CP1251",
383 [PG_WIN1252
] = "CP1252",
384 [PG_WIN1253
] = "CP1253",
385 [PG_WIN1254
] = "CP1254",
386 [PG_WIN1255
] = "CP1255",
387 [PG_WIN1256
] = "CP1256",
388 [PG_WIN1257
] = "CP1257",
389 [PG_WIN1258
] = "CP1258",
390 [PG_WIN866
] = "CP866",
391 [PG_WIN874
] = "CP874",
392 [PG_EUC_CN
] = "EUC-CN",
393 [PG_EUC_JP
] = "EUC-JP",
394 [PG_EUC_KR
] = "EUC-KR",
395 [PG_EUC_TW
] = "EUC-TW",
396 [PG_EUC_JIS_2004
] = "EUC-JP",
397 [PG_SJIS
] = "SHIFT-JIS",
401 [PG_GB18030
] = "GB18030",
402 [PG_JOHAB
] = "JOHAB",
403 [PG_SHIFT_JIS_2004
] = "SHIFT_JISX0213",
408 * Table of encoding names for ICU (currently covers backend encodings only)
410 * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
412 * NULL entries are not supported by ICU, or their mapping is unclear.
/*
 * Per-encoding converter names for ICU, placed at the index of their
 * encoding identifier by designated initializers.  A NULL entry makes
 * is_encoding_supported_by_icu() report false and
 * get_encoding_name_for_icu() return NULL for that encoding.
 */
414 static const char *const pg_enc2icu_tbl
[] =
416 [PG_SQL_ASCII
] = NULL
,
417 [PG_EUC_JP
] = "EUC-JP",
418 [PG_EUC_CN
] = "EUC-CN",
419 [PG_EUC_KR
] = "EUC-KR",
420 [PG_EUC_TW
] = "EUC-TW",
421 [PG_EUC_JIS_2004
] = NULL
,
423 [PG_MULE_INTERNAL
] = NULL
,
424 [PG_LATIN1
] = "ISO-8859-1",
425 [PG_LATIN2
] = "ISO-8859-2",
426 [PG_LATIN3
] = "ISO-8859-3",
427 [PG_LATIN4
] = "ISO-8859-4",
428 [PG_LATIN5
] = "ISO-8859-9",
429 [PG_LATIN6
] = "ISO-8859-10",
430 [PG_LATIN7
] = "ISO-8859-13",
431 [PG_LATIN8
] = "ISO-8859-14",
432 [PG_LATIN9
] = "ISO-8859-15",
434 [PG_WIN1256
] = "CP1256",
435 [PG_WIN1258
] = "CP1258",
436 [PG_WIN866
] = "CP866",
438 [PG_KOI8R
] = "KOI8-R",
439 [PG_WIN1251
] = "CP1251",
440 [PG_WIN1252
] = "CP1252",
441 [PG_ISO_8859_5
] = "ISO-8859-5",
442 [PG_ISO_8859_6
] = "ISO-8859-6",
443 [PG_ISO_8859_7
] = "ISO-8859-7",
444 [PG_ISO_8859_8
] = "ISO-8859-8",
445 [PG_WIN1250
] = "CP1250",
446 [PG_WIN1253
] = "CP1253",
447 [PG_WIN1254
] = "CP1254",
448 [PG_WIN1255
] = "CP1255",
449 [PG_WIN1257
] = "CP1257",
450 [PG_KOI8U
] = "KOI8-U",
/*
 * Compile-time check that the table has exactly PG_ENCODING_BE_LAST + 1
 * elements, so adding an encoding without extending the table fails to build.
 */
453 StaticAssertDecl(lengthof(pg_enc2icu_tbl
) == PG_ENCODING_BE_LAST
+ 1,
454 "pg_enc2icu_tbl incomplete");
458 * Is this encoding supported by ICU?
/*
 * is_encoding_supported_by_icu
 *
 * Reports whether pg_enc2icu_tbl holds a non-NULL entry for "encoding".
 * The table access is guarded by PG_VALID_BE_ENCODING(); the statement
 * taken when that check fails is not visible in this listing.
 */
461 is_encoding_supported_by_icu(int encoding
)
463 if (!PG_VALID_BE_ENCODING(encoding
))
465 return (pg_enc2icu_tbl
[encoding
] != NULL
);
469 * Returns ICU's name for encoding, or NULL if not supported
/*
 * get_encoding_name_for_icu
 *
 * Returns the pg_enc2icu_tbl entry for "encoding"; that entry is NULL for
 * encodings the table marks as unsupported.  The table access is guarded
 * by PG_VALID_BE_ENCODING(); the statement taken when that check fails is
 * not visible in this listing.
 */
472 get_encoding_name_for_icu(int encoding
)
474 if (!PG_VALID_BE_ENCODING(encoding
))
476 return pg_enc2icu_tbl
[encoding
];
481 * Encoding checks: return -1 on error, otherwise the encoding ID
/*
 * pg_valid_client_encoding
 *
 * Resolves "name" with pg_char_to_encoding() and then applies
 * PG_VALID_FE_ENCODING() to the resulting identifier.  A negative lookup
 * result and a failed validity check are each tested separately; the
 * return statements themselves are not visible in this listing.
 */
485 pg_valid_client_encoding(const char *name
)
/* negative result: the name was not recognized */
489 if ((enc
= pg_char_to_encoding(name
)) < 0)
/* recognized, but must also pass the frontend-encoding check */
492 if (!PG_VALID_FE_ENCODING(enc
))
/*
 * pg_valid_server_encoding
 *
 * Resolves "name" with pg_char_to_encoding() and then applies
 * PG_VALID_BE_ENCODING() to the resulting identifier.  A negative lookup
 * result and a failed validity check are each tested separately; the
 * return statements themselves are not visible in this listing.
 */
499 pg_valid_server_encoding(const char *name
)
/* negative result: the name was not recognized */
503 if ((enc
= pg_char_to_encoding(name
)) < 0)
/* recognized, but must also pass the backend-encoding check */
506 if (!PG_VALID_BE_ENCODING(enc
))
/*
 * pg_valid_server_encoding_id
 *
 * Returns the result of PG_VALID_BE_ENCODING() for an encoding identifier
 * the caller already holds; no name lookup is involved.
 */
513 pg_valid_server_encoding_id(int encoding
)
515 return PG_VALID_BE_ENCODING(encoding
);
519 * Remove irrelevant chars from encoding name, store at *newkey
521 * (Caller's responsibility to provide a large enough buffer)
/*
 * clean_encoding_name
 *
 * Walks "key" up to its terminating NUL and, for characters accepted by
 * isalnum(), writes them to "newkey"; 'A'..'Z' are folded to lower case on
 * the way.  Characters rejected by isalnum() are not copied by the visible
 * code.  No bound on the output is checked here, so "newkey" has to be
 * large enough for the caller's input.
 *
 * NOTE(review): the branch for non-upper-case characters, the output
 * terminator and the return statement are not visible in this listing.
 */
524 clean_encoding_name(const char *key
, char *newkey
)
529 for (p
= key
, np
= newkey
; *p
!= '\0'; p
++)
/* cast keeps the argument in the value range <ctype.h> functions require */
531 if (isalnum((unsigned char) *p
))
533 if (*p
>= 'A' && *p
<= 'Z')
/* upper-case letter: shift it into the lower-case range */
534 *np
++ = *p
+ 'a' - 'A';
544 * Search encoding by encoding name
546 * Returns encoding ID, or -1 if not recognized
/*
 * pg_char_to_encoding
 *
 * Maps an encoding name to its identifier by searching pg_encname_tbl.
 *
 * NULL and empty names are tested for first.  Names of NAMEDATALEN or more
 * characters are rejected with -1; since clean_encoding_name() only copies
 * a subset of the input characters, that length check also keeps the
 * cleaned copy within buff[NAMEDATALEN].  The cleaned key is then compared
 * against the entry at the midpoint between "base" and "last", and a
 * matching entry's ->encoding is returned.
 *
 * NOTE(review): the loop header, the updates of base/last and the final
 * not-found return are not visible in this listing.
 */
549 pg_char_to_encoding(const char *name
)
551 unsigned int nel
= lengthof(pg_encname_tbl
);
552 const pg_encname
*base
= pg_encname_tbl
,
553 *last
= base
+ nel
- 1,
556 char buff
[NAMEDATALEN
],
559 if (name
== NULL
|| *name
== '\0')
562 if (strlen(name
) >= NAMEDATALEN
)
563 return -1; /* it's certainly not in the table */
/* normalize: keep alphanumerics only, lower-cased, in buff */
565 key
= clean_encoding_name(name
, buff
);
/* probe the entry halfway between base and last */
569 position
= base
+ ((last
- base
) >> 1);
/* first-character difference; presumably a cheap pre-check before strcmp() */
570 result
= key
[0] - position
->name
[0];
574 result
= strcmp(key
, position
->name
);
576 return position
->encoding
;
587 pg_encoding_to_char(int encoding
)
589 if (PG_VALID_ENCODING(encoding
))
591 const pg_enc2name
*p
= &pg_enc2name_tbl
[encoding
];
593 Assert(encoding
== p
->encoding
);