Fix a compiler warning in initStringInfo().
[pgsql.git] / src / common / wchar.c
blob20f8f3b000efda57b7368a1d02a33d666e97fc5a
/*-------------------------------------------------------------------------
 *
 * wchar.c
 *	  Functions for working with multibyte characters in various encodings.
 *
 * Portions Copyright (c) 1998-2025, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/wchar.c
 *
 *-------------------------------------------------------------------------
 */
13 #include "c.h"
15 #include "mb/pg_wchar.h"
16 #include "utils/ascii.h"
/*
 * Operations on multi-byte encodings are driven by a table of helper
 * functions.
 *
 * To add support for an encoding, define mblen(), dsplen(), verifychar() and
 * verifystr() for the encoding.  For server encodings, also define mb2wchar()
 * and wchar2mb() conversion functions.
 *
 * These functions generally assume that their input is validly formed.
 * The "verifier" functions, further down in the file, have to be more
 * paranoid.
 *
 * We expect that mblen() does not need to examine more than the first byte
 * of the character to discover the correct length.  GB18030 is an exception
 * to that rule, though, as it also looks at the second byte.  But even that
 * behaves in a predictable way, if you only pass the first byte: it will
 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
 * good enough for all current uses.
 *
 * Note: for the display output of psql to work properly, the return values
 * of the dsplen functions must conform to the Unicode standard.  In
 * particular the NUL character is zero width and control characters are
 * generally width -1.  It is recommended that non-ASCII encodings refer their
 * ASCII subset to the ASCII routines to ensure consistency.
 */
46 * SQL/ASCII
48 static int
49 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
51 int cnt = 0;
53 while (len > 0 && *from)
55 *to++ = *from++;
56 len--;
57 cnt++;
59 *to = 0;
60 return cnt;
/*
 * Byte length of the character at "s": always one byte for SQL/ASCII,
 * regardless of the byte's value.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	return 1;
}
/*
 * Display width of the single-byte character at "s":
 *	 0 for NUL, -1 for control characters (below 0x20, or DEL), else 1.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;
	return (ch < 0x20 || ch == 0x7f) ? -1 : 1;
}
81 * EUC
83 static int
84 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
86 int cnt = 0;
88 while (len > 0 && *from)
90 if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
91 * KANA") */
93 from++;
94 *to = (SS2 << 8) | *from++;
95 len -= 2;
97 else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
99 from++;
100 *to = (SS3 << 16) | (*from++ << 8);
101 *to |= *from++;
102 len -= 3;
104 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
106 *to = *from++ << 8;
107 *to |= *from++;
108 len -= 2;
110 else /* must be ASCII */
112 *to = *from++;
113 len--;
115 to++;
116 cnt++;
118 *to = 0;
119 return cnt;
122 static inline int
123 pg_euc_mblen(const unsigned char *s)
125 int len;
127 if (*s == SS2)
128 len = 2;
129 else if (*s == SS3)
130 len = 3;
131 else if (IS_HIGHBIT_SET(*s))
132 len = 2;
133 else
134 len = 1;
135 return len;
138 static inline int
139 pg_euc_dsplen(const unsigned char *s)
141 int len;
143 if (*s == SS2)
144 len = 2;
145 else if (*s == SS3)
146 len = 2;
147 else if (IS_HIGHBIT_SET(*s))
148 len = 2;
149 else
150 len = pg_ascii_dsplen(s);
151 return len;
155 * EUC_JP
157 static int
158 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
160 return pg_euc2wchar_with_len(from, to, len);
/* EUC_JP byte length: identical to the generic EUC rule. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
169 static int
170 pg_eucjp_dsplen(const unsigned char *s)
172 int len;
174 if (*s == SS2)
175 len = 1;
176 else if (*s == SS3)
177 len = 2;
178 else if (IS_HIGHBIT_SET(*s))
179 len = 2;
180 else
181 len = pg_ascii_dsplen(s);
182 return len;
186 * EUC_KR
188 static int
189 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
191 return pg_euc2wchar_with_len(from, to, len);
/* EUC_KR byte length: identical to the generic EUC rule. */
static int
pg_euckr_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
/* EUC_KR display width: identical to the generic EUC rule. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	return pg_euc_dsplen(s);
}
207 * EUC_CN
210 static int
211 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
213 int cnt = 0;
215 while (len > 0 && *from)
217 if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
219 from++;
220 *to = (SS2 << 16) | (*from++ << 8);
221 *to |= *from++;
222 len -= 3;
224 else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
226 from++;
227 *to = (SS3 << 16) | (*from++ << 8);
228 *to |= *from++;
229 len -= 3;
231 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
233 *to = *from++ << 8;
234 *to |= *from++;
235 len -= 2;
237 else
239 *to = *from++;
240 len--;
242 to++;
243 cnt++;
245 *to = 0;
246 return cnt;
/* EUC_CN byte length: two bytes when the lead byte has its high bit set. */
static int
pg_euccn_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
/*
 * EUC_CN display width: two columns for a high-bit lead byte, otherwise the
 * ASCII rules apply.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
274 * EUC_TW
277 static int
278 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
280 int cnt = 0;
282 while (len > 0 && *from)
284 if (*from == SS2 && len >= 4) /* code set 2 */
286 from++;
287 *to = (((uint32) SS2) << 24) | (*from++ << 16);
288 *to |= *from++ << 8;
289 *to |= *from++;
290 len -= 4;
292 else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
294 from++;
295 *to = (SS3 << 16) | (*from++ << 8);
296 *to |= *from++;
297 len -= 3;
299 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
301 *to = *from++ << 8;
302 *to |= *from++;
303 len -= 2;
305 else
307 *to = *from++;
308 len--;
310 to++;
311 cnt++;
313 *to = 0;
314 return cnt;
317 static int
318 pg_euctw_mblen(const unsigned char *s)
320 int len;
322 if (*s == SS2)
323 len = 4;
324 else if (*s == SS3)
325 len = 3;
326 else if (IS_HIGHBIT_SET(*s))
327 len = 2;
328 else
329 len = 1;
330 return len;
333 static int
334 pg_euctw_dsplen(const unsigned char *s)
336 int len;
338 if (*s == SS2)
339 len = 2;
340 else if (*s == SS3)
341 len = 2;
342 else if (IS_HIGHBIT_SET(*s))
343 len = 2;
344 else
345 len = pg_ascii_dsplen(s);
346 return len;
350 * Convert pg_wchar to EUC_* encoding.
351 * caller must allocate enough space for "to", including a trailing zero!
352 * len: length of from.
353 * "from" not necessarily null terminated.
355 static int
356 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
358 int cnt = 0;
360 while (len > 0 && *from)
362 unsigned char c;
364 if ((c = (*from >> 24)))
366 *to++ = c;
367 *to++ = (*from >> 16) & 0xff;
368 *to++ = (*from >> 8) & 0xff;
369 *to++ = *from & 0xff;
370 cnt += 4;
372 else if ((c = (*from >> 16)))
374 *to++ = c;
375 *to++ = (*from >> 8) & 0xff;
376 *to++ = *from & 0xff;
377 cnt += 3;
379 else if ((c = (*from >> 8)))
381 *to++ = c;
382 *to++ = *from & 0xff;
383 cnt += 2;
385 else
387 *to++ = *from;
388 cnt++;
390 from++;
391 len--;
393 *to = 0;
394 return cnt;
399 * JOHAB
/* JOHAB byte length: computed with the generic EUC rule. */
static int
pg_johab_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
/* JOHAB display width: computed with the generic EUC rule. */
static int
pg_johab_dsplen(const unsigned char *s)
{
	return pg_euc_dsplen(s);
}
414 * convert UTF8 string to pg_wchar (UCS-4)
415 * caller must allocate enough space for "to", including a trailing zero!
416 * len: length of from.
417 * "from" not necessarily null terminated.
419 static int
420 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
422 int cnt = 0;
423 uint32 c1,
428 while (len > 0 && *from)
430 if ((*from & 0x80) == 0)
432 *to = *from++;
433 len--;
435 else if ((*from & 0xe0) == 0xc0)
437 if (len < 2)
438 break; /* drop trailing incomplete char */
439 c1 = *from++ & 0x1f;
440 c2 = *from++ & 0x3f;
441 *to = (c1 << 6) | c2;
442 len -= 2;
444 else if ((*from & 0xf0) == 0xe0)
446 if (len < 3)
447 break; /* drop trailing incomplete char */
448 c1 = *from++ & 0x0f;
449 c2 = *from++ & 0x3f;
450 c3 = *from++ & 0x3f;
451 *to = (c1 << 12) | (c2 << 6) | c3;
452 len -= 3;
454 else if ((*from & 0xf8) == 0xf0)
456 if (len < 4)
457 break; /* drop trailing incomplete char */
458 c1 = *from++ & 0x07;
459 c2 = *from++ & 0x3f;
460 c3 = *from++ & 0x3f;
461 c4 = *from++ & 0x3f;
462 *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
463 len -= 4;
465 else
467 /* treat a bogus char as length 1; not ours to raise error */
468 *to = *from++;
469 len--;
471 to++;
472 cnt++;
474 *to = 0;
475 return cnt;
480 * Trivial conversion from pg_wchar to UTF-8.
481 * caller should allocate enough space for "to"
482 * len: length of from.
483 * "from" not necessarily null terminated.
485 static int
486 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
488 int cnt = 0;
490 while (len > 0 && *from)
492 int char_len;
494 unicode_to_utf8(*from, to);
495 char_len = pg_utf_mblen(to);
496 cnt += char_len;
497 to += char_len;
498 from++;
499 len--;
501 *to = 0;
502 return cnt;
/*
 * Return the byte length of a UTF8 character pointed to by s
 *
 * Note: in the current implementation we do not support UTF8 sequences
 * of more than 4 bytes; hence do NOT return a value larger than 4.
 * We return "1" for any leading byte that is either flat-out illegal or
 * indicates a length larger than we support.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if ((lead & 0x80) == 0)
		return 1;
	if ((lead & 0xe0) == 0xc0)
		return 2;
	if ((lead & 0xf0) == 0xe0)
		return 3;
	if ((lead & 0xf8) == 0xf0)
		return 4;
#ifdef NOT_USED
	if ((lead & 0xfc) == 0xf8)
		return 5;
	if ((lead & 0xfe) == 0xfc)
		return 6;
#endif
	return 1;
}
541 * This is an implementation of wcwidth() and wcswidth() as defined in
542 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
543 * <http://www.unix.org/online.html>
545 * Markus Kuhn -- 2001-09-08 -- public domain
547 * customised for PostgreSQL
549 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
/* One inclusive range of code points in a width lookup table. */
struct mbinterval
{
	unsigned int first;			/* lowest code point in the range */
	unsigned int last;			/* highest code point in the range */
};
558 /* auxiliary function for binary search in interval table */
559 static int
560 mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
562 int min = 0;
563 int mid;
565 if (ucs < table[0].first || ucs > table[max].last)
566 return 0;
567 while (max >= min)
569 mid = (min + max) / 2;
570 if (ucs > table[mid].last)
571 min = mid + 1;
572 else if (ucs < table[mid].first)
573 max = mid - 1;
574 else
575 return 1;
578 return 0;
582 /* The following functions define the column width of an ISO 10646
583 * character as follows:
585 * - The null character (U+0000) has a column width of 0.
587 * - Other C0/C1 control characters and DEL will lead to a return
588 * value of -1.
590 * - Non-spacing and enclosing combining characters (general
591 * category code Mn, Me or Cf in the Unicode database) have a
592 * column width of 0.
594 * - Spacing characters in the East Asian Wide (W) or East Asian
595 * FullWidth (F) category as defined in Unicode Technical
596 * Report #11 have a column width of 2.
598 * - All remaining characters (including all printable
599 * ISO 8859-1 and WGL4 characters, Unicode control characters,
600 * etc.) have a column width of 1.
602 * This implementation assumes that wchar_t characters are encoded
603 * in ISO 10646.
606 static int
607 ucs_wcwidth(pg_wchar ucs)
609 #include "common/unicode_nonspacing_table.h"
610 #include "common/unicode_east_asian_fw_table.h"
612 /* test for 8-bit control characters */
613 if (ucs == 0)
614 return 0;
616 if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
617 return -1;
620 * binary search in table of non-spacing characters
622 * XXX: In the official Unicode sources, it is possible for a character to
623 * be described as both non-spacing and wide at the same time. As of
624 * Unicode 13.0, treating the non-spacing property as the determining
625 * factor for display width leads to the correct behavior, so do that
626 * search first.
628 if (mbbisearch(ucs, nonspacing,
629 sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
630 return 0;
632 /* binary search in table of wide characters */
633 if (mbbisearch(ucs, east_asian_fw,
634 sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
635 return 2;
637 return 1;
/*
 * Display width of the UTF-8 character at "s": decode it to a code point
 * and look up that code point's column width.
 */
static int
pg_utf_dsplen(const unsigned char *s)
{
	return ucs_wcwidth(utf8_to_unicode(s));
}
647 * convert mule internal code to pg_wchar
648 * caller should allocate enough space for "to"
649 * len: length of from.
650 * "from" not necessarily null terminated.
652 static int
653 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
655 int cnt = 0;
657 while (len > 0 && *from)
659 if (IS_LC1(*from) && len >= 2)
661 *to = *from++ << 16;
662 *to |= *from++;
663 len -= 2;
665 else if (IS_LCPRV1(*from) && len >= 3)
667 from++;
668 *to = *from++ << 16;
669 *to |= *from++;
670 len -= 3;
672 else if (IS_LC2(*from) && len >= 3)
674 *to = *from++ << 16;
675 *to |= *from++ << 8;
676 *to |= *from++;
677 len -= 3;
679 else if (IS_LCPRV2(*from) && len >= 4)
681 from++;
682 *to = *from++ << 16;
683 *to |= *from++ << 8;
684 *to |= *from++;
685 len -= 4;
687 else
688 { /* assume ASCII */
689 *to = (unsigned char) *from++;
690 len--;
692 to++;
693 cnt++;
695 *to = 0;
696 return cnt;
700 * convert pg_wchar to mule internal code
701 * caller should allocate enough space for "to"
702 * len: length of from.
703 * "from" not necessarily null terminated.
705 static int
706 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
708 int cnt = 0;
710 while (len > 0 && *from)
712 unsigned char lb;
714 lb = (*from >> 16) & 0xff;
715 if (IS_LC1(lb))
717 *to++ = lb;
718 *to++ = *from & 0xff;
719 cnt += 2;
721 else if (IS_LC2(lb))
723 *to++ = lb;
724 *to++ = (*from >> 8) & 0xff;
725 *to++ = *from & 0xff;
726 cnt += 3;
728 else if (IS_LCPRV1_A_RANGE(lb))
730 *to++ = LCPRV1_A;
731 *to++ = lb;
732 *to++ = *from & 0xff;
733 cnt += 3;
735 else if (IS_LCPRV1_B_RANGE(lb))
737 *to++ = LCPRV1_B;
738 *to++ = lb;
739 *to++ = *from & 0xff;
740 cnt += 3;
742 else if (IS_LCPRV2_A_RANGE(lb))
744 *to++ = LCPRV2_A;
745 *to++ = lb;
746 *to++ = (*from >> 8) & 0xff;
747 *to++ = *from & 0xff;
748 cnt += 4;
750 else if (IS_LCPRV2_B_RANGE(lb))
752 *to++ = LCPRV2_B;
753 *to++ = lb;
754 *to++ = (*from >> 8) & 0xff;
755 *to++ = *from & 0xff;
756 cnt += 4;
758 else
760 *to++ = *from & 0xff;
761 cnt += 1;
763 from++;
764 len--;
766 *to = 0;
767 return cnt;
/*
 * Byte length of a mule-internal-code character, from its first byte.
 *
 * exported for direct use by conv.c
 */
int
pg_mule_mblen(const unsigned char *s)
{
	if (IS_LC1(*s))
		return 2;
	if (IS_LCPRV1(*s) || IS_LC2(*s))
		return 3;
	if (IS_LCPRV2(*s))
		return 4;
	return 1;					/* assume ASCII */
}
/*
 * Display width of a mule-internal-code character.
 *
 * Note: it's not really appropriate to assume that all multibyte charsets
 * are double-wide on screen.  But this seems an okay approximation for
 * the MULE charsets we currently support.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	if (IS_LC1(*s) || IS_LCPRV1(*s))
		return 1;
	if (IS_LC2(*s) || IS_LCPRV2(*s))
		return 2;
	return 1;					/* assume ASCII */
}
815 * ISO8859-1
817 static int
818 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
820 int cnt = 0;
822 while (len > 0 && *from)
824 *to++ = *from++;
825 len--;
826 cnt++;
828 *to = 0;
829 return cnt;
833 * Trivial conversion from pg_wchar to single byte encoding. Just ignores
834 * high bits.
835 * caller should allocate enough space for "to"
836 * len: length of from.
837 * "from" not necessarily null terminated.
839 static int
840 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
842 int cnt = 0;
844 while (len > 0 && *from)
846 *to++ = *from++;
847 len--;
848 cnt++;
850 *to = 0;
851 return cnt;
/* Byte length in a single-byte encoding: always one. */
static int
pg_latin1_mblen(const unsigned char *s)
{
	return 1;
}
/* Display width in a single-byte encoding: same rules as ASCII. */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	return pg_ascii_dsplen(s);
}
867 * SJIS
/*
 * Byte length of an SJIS character, from its first byte: bytes 0xa1..0xdf
 * (1 byte kana?) and ASCII are single bytes, any other high-bit byte starts
 * a two-byte character (kanji?).
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
/*
 * Display width of an SJIS character: one column for bytes 0xa1..0xdf
 * (1 byte kana?), two for any other high-bit lead byte (kanji?), and the
 * ASCII rules otherwise.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	if (*s >= 0xa1 && *s <= 0xdf)
		return 1;
	if (IS_HIGHBIT_SET(*s))
		return 2;
	return pg_ascii_dsplen(s);
}
898 * Big5
/* Big5 byte length: two bytes for a high-bit lead byte, else one (ASCII). */
static int
pg_big5_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
/* Big5 display width: two columns for a high-bit lead byte, else ASCII rules. */
static int
pg_big5_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
925 * GBK
/* GBK byte length: two bytes for a high-bit lead byte, else one (ASCII). */
static int
pg_gbk_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
/* GBK display width: two columns for a high-bit lead byte, else ASCII rules. */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
952 * UHC
/* UHC byte length: two bytes for a high-bit lead byte, else one (ASCII). */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
/* UHC display width: two columns for a high-bit lead byte, else ASCII rules. */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
979 * GB18030
980 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
/*
 * Byte length of a GB18030 character.
 *
 * Unlike all other mblen() functions, this also looks at the second byte of
 * the input.  However, if you only pass the first byte of a multi-byte
 * string, and \0 as the second byte, this still works in a predictable way:
 * a 4-byte character will be reported as two 2-byte characters.  That's
 * enough for all current uses, as a client-only encoding.  It works that
 * way, because in any valid 4-byte GB18030-encoded character, the third and
 * fourth byte look like a 2-byte encoded character, when looked at
 * separately.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(*s))
		return 1;				/* ASCII */
	if (s[1] >= 0x30 && s[1] <= 0x39)
		return 4;
	return 2;
}
/*
 * GB18030 display width: two columns for a high-bit lead byte, else the
 * ASCII rules.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : pg_ascii_dsplen(s);
}
/*
 *-------------------------------------------------------------------
 * multibyte sequence validators
 *
 * The verifychar functions accept "s", a pointer to the first byte of a
 * string, and "len", the remaining length of the string.  If there is a
 * validly encoded character beginning at *s, return its length in bytes;
 * else return -1.
 *
 * The verifystr functions also accept "s", a pointer to a string and "len",
 * the length of the string.  They verify the whole string, and return the
 * number of input bytes (<= len) that are valid.  In other words, if the
 * whole string is valid, verifystr returns "len", otherwise it returns the
 * byte offset of the first invalid character.  The verifystr functions must
 * test for and reject zeroes in the input.
 *
 * The verifychar functions can assume that len > 0 and that *s != '\0', but
 * they must test for and reject zeroes in any additional bytes of a
 * multibyte character.  Note that this definition allows the function for a
 * single-byte encoding to be just "return 1".
 *-------------------------------------------------------------------
 */
/*
 * Verify one SQL/ASCII character: every single byte is accepted, so the
 * answer is always a length of one.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	return 1;
}
/*
 * Verify an SQL/ASCII string: everything up to the first zero byte is
 * valid.  Returns "len" if there is no zero byte, else the offset of the
 * first one.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nul = memchr(s, 0, len);

	return (nul != NULL) ? (int) (nul - s) : len;
}
/* True if "c" lies in 0xa1..0xfe.  Evaluates its argument twice. */
#define IS_EUC_RANGE_VALID(c)	(0xa1 <= (c) && (c) <= 0xfe)
1060 static int
1061 pg_eucjp_verifychar(const unsigned char *s, int len)
1063 int l;
1064 unsigned char c1,
1067 c1 = *s++;
1069 switch (c1)
1071 case SS2: /* JIS X 0201 */
1072 l = 2;
1073 if (l > len)
1074 return -1;
1075 c2 = *s++;
1076 if (c2 < 0xa1 || c2 > 0xdf)
1077 return -1;
1078 break;
1080 case SS3: /* JIS X 0212 */
1081 l = 3;
1082 if (l > len)
1083 return -1;
1084 c2 = *s++;
1085 if (!IS_EUC_RANGE_VALID(c2))
1086 return -1;
1087 c2 = *s++;
1088 if (!IS_EUC_RANGE_VALID(c2))
1089 return -1;
1090 break;
1092 default:
1093 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1095 l = 2;
1096 if (l > len)
1097 return -1;
1098 if (!IS_EUC_RANGE_VALID(c1))
1099 return -1;
1100 c2 = *s++;
1101 if (!IS_EUC_RANGE_VALID(c2))
1102 return -1;
1104 else
1105 /* must be ASCII */
1107 l = 1;
1109 break;
1112 return l;
/*
 * Verify an EUC_JP string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if all of it is, else the offset of the first
 * zero byte or invalid character.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_eucjp_verifychar(p, len - valid);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */

		valid += l;
	}

	return valid;
}
/*
 * Verify one EUC_KR character starting at "s", with "len" bytes available.
 * A high-bit lead byte needs a second byte, and both must lie in the EUC
 * range.  Returns the byte length, or -1 if invalid.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	const unsigned char c1 = s[0];

	if (!IS_HIGHBIT_SET(c1))
		return 1;				/* must be ASCII */

	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(c1) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;
	return 2;
}
/*
 * Verify an EUC_KR string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if all of it is, else the offset of the first
 * zero byte or invalid character.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_euckr_verifychar(p, len - valid);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */

		valid += l;
	}

	return valid;
}
/*
 * EUC-CN byte sequences are exactly the same as EUC-KR, so the EUC-CN
 * verifiers are plain aliases for the EUC-KR ones.
 */
#define pg_euccn_verifychar	pg_euckr_verifychar
#define pg_euccn_verifystr	pg_euckr_verifystr
1206 static int
1207 pg_euctw_verifychar(const unsigned char *s, int len)
1209 int l;
1210 unsigned char c1,
1213 c1 = *s++;
1215 switch (c1)
1217 case SS2: /* CNS 11643 Plane 1-7 */
1218 l = 4;
1219 if (l > len)
1220 return -1;
1221 c2 = *s++;
1222 if (c2 < 0xa1 || c2 > 0xa7)
1223 return -1;
1224 c2 = *s++;
1225 if (!IS_EUC_RANGE_VALID(c2))
1226 return -1;
1227 c2 = *s++;
1228 if (!IS_EUC_RANGE_VALID(c2))
1229 return -1;
1230 break;
1232 case SS3: /* unused */
1233 return -1;
1235 default:
1236 if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1238 l = 2;
1239 if (l > len)
1240 return -1;
1241 /* no further range check on c1? */
1242 c2 = *s++;
1243 if (!IS_EUC_RANGE_VALID(c2))
1244 return -1;
1246 else
1247 /* must be ASCII */
1249 l = 1;
1251 break;
1253 return l;
/*
 * Verify an EUC_TW string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if all of it is, else the offset of the first
 * zero byte or invalid character.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_euctw_verifychar(p, len - valid);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */

		valid += l;
	}

	return valid;
}
/*
 * Verify one JOHAB character starting at "s", with "len" bytes available.
 * For a multibyte character every trailing byte must lie in the EUC range.
 * Returns the byte length, or -1 if truncated or invalid.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	if (!IS_HIGHBIT_SET(*s))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}
	return mbl;
}
/*
 * Verify a JOHAB string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if all of it is, else the offset of the first
 * zero byte or invalid character.
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_johab_verifychar(p, len - valid);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */

		valid += l;
	}

	return valid;
}
/*
 * Verify one mule-internal-code character starting at "s", with "len" bytes
 * available.  Every byte after the first must have its high bit set.
 * Returns the byte length, or -1 if truncated or invalid.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_HIGHBIT_SET(s[i]))
			return -1;
	}
	return mbl;
}
/*
 * Verify a mule-internal-code string of "len" bytes.  Returns the number of
 * leading bytes that are valid: "len" if all of it is, else the offset of
 * the first zero byte or invalid character.
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_mule_verifychar(p, len - valid);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */

		valid += l;
	}

	return valid;
}
/*
 * Verify one character of a single-byte encoding: every byte is accepted,
 * so the answer is always a length of one.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	return 1;
}
/*
 * Verify a string in a single-byte encoding: everything up to the first
 * zero byte is valid.  Returns "len" if there is no zero byte, else the
 * offset of the first one.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nul = memchr(s, 0, len);

	return (nul != NULL) ? (int) (nul - s) : len;
}
/*
 * Verify one SJIS character starting at "s", with "len" bytes available.
 * A two-byte character must pass the ISSJISHEAD/ISSJISTAIL checks on its
 * first and second byte.  Returns the byte length, or -1 if truncated or
 * invalid.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);

	if (len < mbl)
		return -1;

	if (mbl == 1)				/* pg_sjis_mblen already verified it */
		return mbl;

	if (ISSJISHEAD(s[0]) && ISSJISTAIL(s[1]))
		return mbl;
	return -1;
}
/*
 * Verify an SJIS string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if all of it is, else the offset of the first
 * zero byte or invalid character.
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_sjis_verifychar(p, len - valid);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */

		valid += l;
	}

	return valid;
}
/*
 * Verify one Big5 character starting at "s", with "len" bytes available.
 * The only check on trailing bytes is that they are not zero.  Returns the
 * byte length, or -1 if truncated or containing a zero byte.
 */
static int
pg_big5_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_big5_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
/*
 * Verify a Big5 string of "len" bytes.  Returns the number of leading bytes
 * that are valid: "len" if all of it is, else the offset of the first zero
 * byte or invalid character.
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_big5_verifychar(p, len - valid);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */

		valid += l;
	}

	return valid;
}
/*
 * Verify one GBK character starting at "s", with "len" bytes available.
 * The only check on trailing bytes is that they are not zero.  Returns the
 * byte length, or -1 if truncated or containing a zero byte.
 */
static int
pg_gbk_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_gbk_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
/*
 * Verify a GBK string of "len" bytes.  Returns the number of leading bytes
 * that are valid: "len" if all of it is, else the offset of the first zero
 * byte or invalid character.
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_gbk_verifychar(p, len - valid);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */

		valid += l;
	}

	return valid;
}
/*
 * Verify one UHC character starting at "s", with "len" bytes available.
 * The only check on trailing bytes is that they are not zero.  Returns the
 * byte length, or -1 if truncated or containing a zero byte.
 */
static int
pg_uhc_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_uhc_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
/*
 * Verify a UHC string of "len" bytes.  Returns the number of leading bytes
 * that are valid: "len" if all of it is, else the offset of the first zero
 * byte or invalid character.
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_uhc_verifychar(p, len - valid);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */

		valid += l;
	}

	return valid;
}
/*
 * Verify one GB18030 character starting at "s", with "len" bytes available.
 *
 * A second byte in 0x30..0x39 (with at least four bytes available) selects
 * the 4-byte form, whose bytes must be 0x81..0xfe, 0x30..0x39, 0x81..0xfe,
 * 0x30..0x39.  Otherwise a lead byte in 0x81..0xfe with at least two bytes
 * available selects the 2-byte form, whose second byte must be 0x40..0x7e
 * or 0x80..0xfe.  Returns the byte length, or -1 if invalid.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	if (!IS_HIGHBIT_SET(*s))
		return 1;				/* ASCII */

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (s[0] >= 0x81 && s[0] <= 0xfe &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;
		return -1;
	}

	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		/* Should be 2-byte, validate */
		if ((s[1] >= 0x40 && s[1] <= 0x7e) ||
			(s[1] >= 0x80 && s[1] <= 0xfe))
			return 2;
		return -1;
	}

	return -1;
}
/*
 * Verify a GB18030 string of "len" bytes.  Returns the number of leading
 * bytes that are valid: "len" if all of it is, else the offset of the first
 * zero byte or invalid character.
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	int			valid = 0;

	while (valid < len)
	{
		const unsigned char *p = s + valid;
		int			l;

		if (IS_HIGHBIT_SET(*p))
		{
			l = pg_gb18030_verifychar(p, len - valid);
			if (l == -1)
				break;
		}
		else if (*p == '\0')
			break;
		else
			l = 1;				/* fast path for ASCII-subset characters */

		valid += l;
	}

	return valid;
}
/*
 * Verify one UTF-8 character starting at "s", with "len" bytes available.
 *
 * A zero byte is rejected.  For non-ASCII lead bytes the expected length is
 * taken from the lead byte (bogus lead bytes count as length 1), and the
 * sequence is then checked with pg_utf8_islegal().  Returns the byte
 * length, or -1 if the character is truncated or not legal.
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			l;

	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	if ((lead & 0xe0) == 0xc0)
		l = 2;
	else if ((lead & 0xf0) == 0xe0)
		l = 3;
	else if ((lead & 0xf8) == 0xf0)
		l = 4;
	else
		l = 1;

	if (l > len || !pg_utf8_islegal(s, l))
		return -1;

	return l;
}
1694 * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1695 * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1696 * input byte and current state are used to compute an index into an array of
1697 * state transitions. Since the address of the next transition is dependent
1698 * on this computation, there is latency in executing the load instruction,
1699 * and the CPU is not kept busy.
1701 * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1703 * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1705 * In a shift-based DFA, the input byte is an index into array of integers
1706 * whose bit pattern encodes the state transitions. To compute the next
1707 * state, we simply right-shift the integer by the current state and apply a
1708 * mask. In this scheme, the address of the transition only depends on the
1709 * input byte, so there is better pipelining.
1711 * The naming convention for states and transitions was adopted from a UTF-8
1712 * to UTF-16/32 transcoder, whose table is reproduced below:
1714 * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1716 * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1717 * ==========================================================================
1718 * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1719 * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1721 * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1722 * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1723 * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1725 * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1726 * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1728 * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1729 * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1731 * In the most straightforward implementation, a shift-based DFA for UTF-8
1732 * requires 64-bit integers to encode the transitions, but with an SMT solver
1733 * it's possible to find state numbers such that the transitions fit within
1734 * 32-bit integers, as Dougall Johnson demonstrated:
1736 * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1738 * This packed representation is the reason for the seemingly odd choice of
1739 * state values below.
/*
 * DFA state numbers.
 *
 * A state's number is also the bit offset, inside a 32-bit transition word,
 * of the 5-bit field holding the state to move to when the DFA is currently
 * in that state.  That is why every transition macro below has the form
 * (next_state << current_state).
 */

/* Error */
#define ERR 0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2 1
#define CS3 5
/* Partial states, where the first continuation byte has a restricted range */
#define P3A 6					/* Lead was E0, check for 3-byte overlong */
#define P3B 20					/* Lead was ED, check for surrogate */
#define P4A 25					/* Lead was F0, check for 4-byte overlong */
#define P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define END BGN

/* the encoded state transitions for the lookup table */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/*
 * continuation byte
 *
 * Each expansion is fully parenthesized so that the macro stays a single
 * operand when it appears inside a larger expression (for example
 * "CR1 >> CS1"); without the outer parentheses only the last OR-ed term
 * would bind to the neighbouring operator.
 */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1779 static const uint32 Utf8Transition[256] =
1781 /* ASCII */
1783 ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1784 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1785 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1786 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1788 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1789 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1790 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1791 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1793 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1794 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1795 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1796 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1798 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1799 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1800 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1801 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1803 /* continuation bytes */
1805 /* 80..8F */
1806 CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1807 CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1809 /* 90..9F */
1810 CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1811 CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1813 /* A0..BF */
1814 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1815 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1816 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1817 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1819 /* leading bytes */
1821 /* C0..DF */
1822 ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1823 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1824 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1825 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1827 /* E0..EF */
1828 L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1829 L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1831 /* F0..FF */
1832 L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1833 ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1836 static void
1837 utf8_advance(const unsigned char *s, uint32 *state, int len)
1839 /* Note: We deliberately don't check the state's value here. */
1840 while (len > 0)
1843 * It's important that the mask value is 31: In most instruction sets,
1844 * a shift by a 32-bit operand is understood to be a shift by its mod
1845 * 32, so the compiler should elide the mask operation.
1847 *state = Utf8Transition[*s++] >> (*state & 31);
1848 len--;
1851 *state &= 31;
1854 static int
1855 pg_utf8_verifystr(const unsigned char *s, int len)
1857 const unsigned char *start = s;
1858 const int orig_len = len;
1859 uint32 state = BGN;
1862 * With a stride of two vector widths, gcc will unroll the loop. Even if
1863 * the compiler can unroll a longer loop, it's not worth it because we
1864 * must fall back to the byte-wise algorithm if we find any non-ASCII.
1866 #define STRIDE_LENGTH (2 * sizeof(Vector8))
1868 if (len >= STRIDE_LENGTH)
1870 while (len >= STRIDE_LENGTH)
1873 * If the chunk is all ASCII, we can skip the full UTF-8 check,
1874 * but we must first check for a non-END state, which means the
1875 * previous chunk ended in the middle of a multibyte sequence.
1877 if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1878 utf8_advance(s, &state, STRIDE_LENGTH);
1880 s += STRIDE_LENGTH;
1881 len -= STRIDE_LENGTH;
1884 /* The error state persists, so we only need to check for it here. */
1885 if (state == ERR)
1888 * Start over from the beginning with the slow path so we can
1889 * count the valid bytes.
1891 len = orig_len;
1892 s = start;
1894 else if (state != END)
1897 * The fast path exited in the middle of a multibyte sequence.
1898 * Walk backwards to find the leading byte so that the slow path
1899 * can resume checking from there. We must always backtrack at
1900 * least one byte, since the current byte could be e.g. an ASCII
1901 * byte after a 2-byte lead, which is invalid.
1905 Assert(s > start);
1906 s--;
1907 len++;
1908 Assert(IS_HIGHBIT_SET(*s));
1909 } while (pg_utf_mblen(s) <= 1);
1913 /* check remaining bytes */
1914 while (len > 0)
1916 int l;
1918 /* fast path for ASCII-subset characters */
1919 if (!IS_HIGHBIT_SET(*s))
1921 if (*s == '\0')
1922 break;
1923 l = 1;
1925 else
1927 l = pg_utf8_verifychar(s, len);
1928 if (l == -1)
1929 break;
1931 s += l;
1932 len -= l;
1935 return s - start;
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This directly implements the rules in RFC3629.  The bizarre-looking
 * restrictions on the second byte are meant to ensure that there isn't
 * more than one encoding of a given Unicode code point; that is,
 * you may not use a longer-than-necessary byte sequence with high order
 * zero bits to represent a character that would fit in fewer bytes.
 * To do otherwise is to create security hazards (eg, create an apparent
 * non-ASCII character that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 *
 * Returns true if the length bytes at source form one legal character,
 * false otherwise.  Any length outside 1..4 is rejected.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char a;

	/*
	 * The cases deliberately cascade from the last byte of the sequence
	 * down to the first, so a 4-byte character passes through every check.
	 */
	switch (length)
	{
		default:
			/* reject lengths 5 and 6 for now (and anything below 1) */
			return false;
		case 4:
			/* fourth byte: plain continuation byte */
			a = source[3];
			if (a < 0x80 || a > 0xBF)
				return false;
			/* FALL THRU */
		case 3:
			/* third byte: plain continuation byte */
			a = source[2];
			if (a < 0x80 || a > 0xBF)
				return false;
			/* FALL THRU */
		case 2:
			/* second byte: allowed range depends on the lead byte */
			a = source[1];
			switch (*source)
			{
				case 0xE0:
					if (a < 0xA0 || a > 0xBF)
						return false;
					break;
				case 0xED:
					if (a < 0x80 || a > 0x9F)
						return false;
					break;
				case 0xF0:
					if (a < 0x90 || a > 0xBF)
						return false;
					break;
				case 0xF4:
					if (a < 0x80 || a > 0x8F)
						return false;
					break;
				default:
					if (a < 0x80 || a > 0xBF)
						return false;
					break;
			}
			/* FALL THRU */
		case 1:
			/* first byte: not a continuation byte, not C0/C1, not above F4 */
			a = *source;
			if (a >= 0x80 && a < 0xC2)
				return false;
			if (a > 0xF4)
				return false;
			break;
	}
	return true;
}
2011 *-------------------------------------------------------------------
2012 * encoding info table
2013 *-------------------------------------------------------------------
2015 const pg_wchar_tbl pg_wchar_table[] = {
2016 [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
2017 [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
2018 [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
2019 [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
2020 [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
2021 [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
2022 [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
2023 [PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
2024 [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2025 [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2026 [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2027 [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2028 [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2029 [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2030 [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2031 [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2032 [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2033 [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2034 [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2035 [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2036 [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2037 [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2038 [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2039 [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2040 [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2041 [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2042 [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2043 [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2044 [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2045 [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2046 [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2047 [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2048 [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2049 [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2050 [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
2051 [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
2052 [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
2053 [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
2054 [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
2055 [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
2056 [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
2057 [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
2061 * Returns the byte length of a multibyte character.
2063 * Caution: when dealing with text that is not certainly valid in the
2064 * specified encoding, the result may exceed the actual remaining
2065 * string length. Callers that are not prepared to deal with that
2066 * should use pg_encoding_mblen_bounded() instead.
2069 pg_encoding_mblen(int encoding, const char *mbstr)
2071 return (PG_VALID_ENCODING(encoding) ?
2072 pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2073 pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
/*
 * Returns the byte length of a multibyte character; but not more than
 * the distance to end of string.
 *
 * mbstr must be NUL-terminated: the length reported by pg_encoding_mblen()
 * is used as the upper bound for a strnlen() scan of mbstr.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));
}
2087 * Returns the display length of a multibyte character.
2090 pg_encoding_dsplen(int encoding, const char *mbstr)
2092 return (PG_VALID_ENCODING(encoding) ?
2093 pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2094 pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2098 * Verify the first multibyte character of the given string.
2099 * Return its byte length if good, -1 if bad. (See comments above for
2100 * full details of the mbverifychar API.)
2103 pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
2105 return (PG_VALID_ENCODING(encoding) ?
2106 pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2107 pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2111 * Verify that a string is valid for the given encoding.
2112 * Returns the number of input bytes (<= len) that form a valid string.
2113 * (See comments above for full details of the mbverifystr API.)
2116 pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2118 return (PG_VALID_ENCODING(encoding) ?
2119 pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2120 pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2124 * fetch maximum length of a given encoding
2127 pg_encoding_max_length(int encoding)
2129 Assert(PG_VALID_ENCODING(encoding));
2131 return pg_wchar_table[encoding].maxmblen;