1 /*-------------------------------------------------------------------------
4 * Functions for working with multibyte characters in various encodings.
6 * Portions Copyright (c) 1998-2025, PostgreSQL Global Development Group
11 *-------------------------------------------------------------------------
15 #include "mb/pg_wchar.h"
16 #include "utils/ascii.h"
20 * Operations on multi-byte encodings are driven by a table of helper
23 * To add support for an encoding, define mblen(), dsplen(), verifychar() and
24 * verifystr() for the encoding. For server-encodings, also define mb2wchar()
25 * and wchar2mb() conversion functions.
27 * These functions generally assume that their input is validly formed.
28 * The "verifier" functions, further down in the file, have to be more
31 * We expect that mblen() does not need to examine more than the first byte
32 * of the character to discover the correct length. GB18030 is an exception
33 * to that rule, though, as it also looks at second byte. But even that
34 * behaves in a predictable way, if you only pass the first byte: it will
35 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
36 * good enough for all current uses.
38 * Note: for the display output of psql to work properly, the return values
39 * of the dsplen functions must conform to the Unicode standard. In particular
40 * the NUL character is zero width and control characters are generally
41 * width -1. It is recommended that non-ASCII encodings refer their ASCII
42 * subset to the ASCII routines to ensure consistency.
49 pg_ascii2wchar_with_len(const unsigned char *from
, pg_wchar
*to
, int len
)
53 while (len
> 0 && *from
)
64 pg_ascii_mblen(const unsigned char *s
)
70 pg_ascii_dsplen(const unsigned char *s
)
74 if (*s
< 0x20 || *s
== 0x7f)
84 pg_euc2wchar_with_len(const unsigned char *from
, pg_wchar
*to
, int len
)
88 while (len
> 0 && *from
)
90 if (*from
== SS2
&& len
>= 2) /* JIS X 0201 (so called "1 byte
94 *to
= (SS2
<< 8) | *from
++;
97 else if (*from
== SS3
&& len
>= 3) /* JIS X 0212 KANJI */
100 *to
= (SS3
<< 16) | (*from
++ << 8);
104 else if (IS_HIGHBIT_SET(*from
) && len
>= 2) /* JIS X 0208 KANJI */
110 else /* must be ASCII */
123 pg_euc_mblen(const unsigned char *s
)
131 else if (IS_HIGHBIT_SET(*s
))
139 pg_euc_dsplen(const unsigned char *s
)
147 else if (IS_HIGHBIT_SET(*s
))
150 len
= pg_ascii_dsplen(s
);
158 pg_eucjp2wchar_with_len(const unsigned char *from
, pg_wchar
*to
, int len
)
160 return pg_euc2wchar_with_len(from
, to
, len
);
164 pg_eucjp_mblen(const unsigned char *s
)
166 return pg_euc_mblen(s
);
170 pg_eucjp_dsplen(const unsigned char *s
)
178 else if (IS_HIGHBIT_SET(*s
))
181 len
= pg_ascii_dsplen(s
);
189 pg_euckr2wchar_with_len(const unsigned char *from
, pg_wchar
*to
, int len
)
191 return pg_euc2wchar_with_len(from
, to
, len
);
195 pg_euckr_mblen(const unsigned char *s
)
197 return pg_euc_mblen(s
);
201 pg_euckr_dsplen(const unsigned char *s
)
203 return pg_euc_dsplen(s
);
211 pg_euccn2wchar_with_len(const unsigned char *from
, pg_wchar
*to
, int len
)
215 while (len
> 0 && *from
)
217 if (*from
== SS2
&& len
>= 3) /* code set 2 (unused?) */
220 *to
= (SS2
<< 16) | (*from
++ << 8);
224 else if (*from
== SS3
&& len
>= 3) /* code set 3 (unused ?) */
227 *to
= (SS3
<< 16) | (*from
++ << 8);
231 else if (IS_HIGHBIT_SET(*from
) && len
>= 2) /* code set 1 */
250 pg_euccn_mblen(const unsigned char *s
)
254 if (IS_HIGHBIT_SET(*s
))
262 pg_euccn_dsplen(const unsigned char *s
)
266 if (IS_HIGHBIT_SET(*s
))
269 len
= pg_ascii_dsplen(s
);
/*
 * pg_euctw2wchar_with_len: fill "to" with pg_wchar values built from the
 * bytes at "from".  Only part of the definition is visible in this listing.
 *
 * The loop runs while len > 0 and the current input byte is nonzero.  The
 * visible branches key on the current input byte:
 *	- SS2 with at least 4 bytes left: SS2 is placed in bits 24-31 of *to
 *	- SS3 with at least 3 bytes left: SS3 is placed in bits 16-23 of *to
 *	- any other high-bit byte with at least 2 bytes left
 */
278 pg_euctw2wchar_with_len(const unsigned char *from
, pg_wchar
*to
, int len
)
282 while (len
> 0 && *from
)
284 if (*from
== SS2
&& len
>= 4) /* code set 2 */
287 *to
= (((uint32
) SS2
) << 24) | (*from
++ << 16);
292 else if (*from
== SS3
&& len
>= 3) /* code set 3 (unused?) */
295 *to
= (SS3
<< 16) | (*from
++ << 8);
299 else if (IS_HIGHBIT_SET(*from
) && len
>= 2) /* code set 1 */
318 pg_euctw_mblen(const unsigned char *s
)
326 else if (IS_HIGHBIT_SET(*s
))
334 pg_euctw_dsplen(const unsigned char *s
)
342 else if (IS_HIGHBIT_SET(*s
))
345 len
= pg_ascii_dsplen(s
);
350 * Convert pg_wchar to EUC_* encoding.
351 * caller must allocate enough space for "to", including a trailing zero!
352 * len: length of from.
353 * "from" not necessarily null terminated.
356 pg_wchar2euc_with_len(const pg_wchar
*from
, unsigned char *to
, int len
)
360 while (len
> 0 && *from
)
364 if ((c
= (*from
>> 24)))
367 *to
++ = (*from
>> 16) & 0xff;
368 *to
++ = (*from
>> 8) & 0xff;
369 *to
++ = *from
& 0xff;
372 else if ((c
= (*from
>> 16)))
375 *to
++ = (*from
>> 8) & 0xff;
376 *to
++ = *from
& 0xff;
379 else if ((c
= (*from
>> 8)))
382 *to
++ = *from
& 0xff;
402 pg_johab_mblen(const unsigned char *s
)
404 return pg_euc_mblen(s
);
408 pg_johab_dsplen(const unsigned char *s
)
410 return pg_euc_dsplen(s
);
414 * convert UTF8 string to pg_wchar (UCS-4)
415 * caller must allocate enough space for "to", including a trailing zero!
416 * len: length of from.
417 * "from" not necessarily null terminated.
420 pg_utf2wchar_with_len(const unsigned char *from
, pg_wchar
*to
, int len
)
428 while (len
> 0 && *from
)
430 if ((*from
& 0x80) == 0)
435 else if ((*from
& 0xe0) == 0xc0)
438 break; /* drop trailing incomplete char */
441 *to
= (c1
<< 6) | c2
;
444 else if ((*from
& 0xf0) == 0xe0)
447 break; /* drop trailing incomplete char */
451 *to
= (c1
<< 12) | (c2
<< 6) | c3
;
454 else if ((*from
& 0xf8) == 0xf0)
457 break; /* drop trailing incomplete char */
462 *to
= (c1
<< 18) | (c2
<< 12) | (c3
<< 6) | c4
;
467 /* treat a bogus char as length 1; not ours to raise error */
480 * Trivial conversion from pg_wchar to UTF-8.
481 * caller should allocate enough space for "to"
482 * len: length of from.
483 * "from" not necessarily null terminated.
486 pg_wchar2utf_with_len(const pg_wchar
*from
, unsigned char *to
, int len
)
490 while (len
> 0 && *from
)
494 unicode_to_utf8(*from
, to
);
495 char_len
= pg_utf_mblen(to
);
506 * Return the byte length of a UTF8 character pointed to by s
508 * Note: in the current implementation we do not support UTF8 sequences
509 * of more than 4 bytes; hence do NOT return a value larger than 4.
510 * We return "1" for any leading byte that is either flat-out illegal or
511 * indicates a length larger than we support.
513 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
514 * other places would need to be fixed to change this.
517 pg_utf_mblen(const unsigned char *s
)
521 if ((*s
& 0x80) == 0)
523 else if ((*s
& 0xe0) == 0xc0)
525 else if ((*s
& 0xf0) == 0xe0)
527 else if ((*s
& 0xf8) == 0xf0)
530 else if ((*s
& 0xfc) == 0xf8)
532 else if ((*s
& 0xfe) == 0xfc)
541 * This is an implementation of wcwidth() and wcswidth() as defined in
542 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
543 * <http://www.unix.org/online.html>
545 * Markus Kuhn -- 2001-09-08 -- public domain
547 * customised for PostgreSQL
549 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
558 /* auxiliary function for binary search in interval table */
560 mbbisearch(pg_wchar ucs
, const struct mbinterval
*table
, int max
)
565 if (ucs
< table
[0].first
|| ucs
> table
[max
].last
)
569 mid
= (min
+ max
) / 2;
570 if (ucs
> table
[mid
].last
)
572 else if (ucs
< table
[mid
].first
)
582 /* The following functions define the column width of an ISO 10646
583 * character as follows:
585 * - The null character (U+0000) has a column width of 0.
587 * - Other C0/C1 control characters and DEL will lead to a return
590 * - Non-spacing and enclosing combining characters (general
591 * category code Mn, Me or Cf in the Unicode database) have a
594 * - Spacing characters in the East Asian Wide (W) or East Asian
595 * FullWidth (F) category as defined in Unicode Technical
596 * Report #11 have a column width of 2.
598 * - All remaining characters (including all printable
599 * ISO 8859-1 and WGL4 characters, Unicode control characters,
600 * etc.) have a column width of 1.
602 * This implementation assumes that wchar_t characters are encoded
607 ucs_wcwidth(pg_wchar ucs
)
609 #include "common/unicode_nonspacing_table.h"
610 #include "common/unicode_east_asian_fw_table.h"
612 /* test for 8-bit control characters */
616 if (ucs
< 0x20 || (ucs
>= 0x7f && ucs
< 0xa0) || ucs
> 0x0010ffff)
620 * binary search in table of non-spacing characters
622 * XXX: In the official Unicode sources, it is possible for a character to
623 * be described as both non-spacing and wide at the same time. As of
624 * Unicode 13.0, treating the non-spacing property as the determining
625 * factor for display width leads to the correct behavior, so do that
628 if (mbbisearch(ucs
, nonspacing
,
629 sizeof(nonspacing
) / sizeof(struct mbinterval
) - 1))
632 /* binary search in table of wide characters */
633 if (mbbisearch(ucs
, east_asian_fw
,
634 sizeof(east_asian_fw
) / sizeof(struct mbinterval
) - 1))
641 pg_utf_dsplen(const unsigned char *s
)
643 return ucs_wcwidth(utf8_to_unicode(s
));
647 * convert mule internal code to pg_wchar
648 * caller should allocate enough space for "to"
649 * len: length of from.
650 * "from" not necessarily null terminated.
653 pg_mule2wchar_with_len(const unsigned char *from
, pg_wchar
*to
, int len
)
657 while (len
> 0 && *from
)
659 if (IS_LC1(*from
) && len
>= 2)
665 else if (IS_LCPRV1(*from
) && len
>= 3)
672 else if (IS_LC2(*from
) && len
>= 3)
679 else if (IS_LCPRV2(*from
) && len
>= 4)
689 *to
= (unsigned char) *from
++;
700 * convert pg_wchar to mule internal code
701 * caller should allocate enough space for "to"
702 * len: length of from.
703 * "from" not necessarily null terminated.
706 pg_wchar2mule_with_len(const pg_wchar
*from
, unsigned char *to
, int len
)
710 while (len
> 0 && *from
)
714 lb
= (*from
>> 16) & 0xff;
718 *to
++ = *from
& 0xff;
724 *to
++ = (*from
>> 8) & 0xff;
725 *to
++ = *from
& 0xff;
728 else if (IS_LCPRV1_A_RANGE(lb
))
732 *to
++ = *from
& 0xff;
735 else if (IS_LCPRV1_B_RANGE(lb
))
739 *to
++ = *from
& 0xff;
742 else if (IS_LCPRV2_A_RANGE(lb
))
746 *to
++ = (*from
>> 8) & 0xff;
747 *to
++ = *from
& 0xff;
750 else if (IS_LCPRV2_B_RANGE(lb
))
754 *to
++ = (*from
>> 8) & 0xff;
755 *to
++ = *from
& 0xff;
760 *to
++ = *from
& 0xff;
770 /* exported for direct use by conv.c */
772 pg_mule_mblen(const unsigned char *s
)
778 else if (IS_LCPRV1(*s
))
782 else if (IS_LCPRV2(*s
))
785 len
= 1; /* assume ASCII */
790 pg_mule_dsplen(const unsigned char *s
)
795 * Note: it's not really appropriate to assume that all multibyte charsets
796 * are double-wide on screen. But this seems an okay approximation for
797 * the MULE charsets we currently support.
802 else if (IS_LCPRV1(*s
))
806 else if (IS_LCPRV2(*s
))
809 len
= 1; /* assume ASCII */
818 pg_latin12wchar_with_len(const unsigned char *from
, pg_wchar
*to
, int len
)
822 while (len
> 0 && *from
)
833 * Trivial conversion from pg_wchar to single byte encoding. Just ignores
835 * caller should allocate enough space for "to"
836 * len: length of from.
837 * "from" not necessarily null terminated.
840 pg_wchar2single_with_len(const pg_wchar
*from
, unsigned char *to
, int len
)
844 while (len
> 0 && *from
)
855 pg_latin1_mblen(const unsigned char *s
)
861 pg_latin1_dsplen(const unsigned char *s
)
863 return pg_ascii_dsplen(s
);
870 pg_sjis_mblen(const unsigned char *s
)
874 if (*s
>= 0xa1 && *s
<= 0xdf)
875 len
= 1; /* 1 byte kana? */
876 else if (IS_HIGHBIT_SET(*s
))
877 len
= 2; /* kanji? */
879 len
= 1; /* should be ASCII */
884 pg_sjis_dsplen(const unsigned char *s
)
888 if (*s
>= 0xa1 && *s
<= 0xdf)
889 len
= 1; /* 1 byte kana? */
890 else if (IS_HIGHBIT_SET(*s
))
891 len
= 2; /* kanji? */
893 len
= pg_ascii_dsplen(s
); /* should be ASCII */
901 pg_big5_mblen(const unsigned char *s
)
905 if (IS_HIGHBIT_SET(*s
))
906 len
= 2; /* kanji? */
908 len
= 1; /* should be ASCII */
913 pg_big5_dsplen(const unsigned char *s
)
917 if (IS_HIGHBIT_SET(*s
))
918 len
= 2; /* kanji? */
920 len
= pg_ascii_dsplen(s
); /* should be ASCII */
928 pg_gbk_mblen(const unsigned char *s
)
932 if (IS_HIGHBIT_SET(*s
))
933 len
= 2; /* kanji? */
935 len
= 1; /* should be ASCII */
940 pg_gbk_dsplen(const unsigned char *s
)
944 if (IS_HIGHBIT_SET(*s
))
945 len
= 2; /* kanji? */
947 len
= pg_ascii_dsplen(s
); /* should be ASCII */
955 pg_uhc_mblen(const unsigned char *s
)
959 if (IS_HIGHBIT_SET(*s
))
960 len
= 2; /* 2byte? */
962 len
= 1; /* should be ASCII */
967 pg_uhc_dsplen(const unsigned char *s
)
971 if (IS_HIGHBIT_SET(*s
))
972 len
= 2; /* 2byte? */
974 len
= pg_ascii_dsplen(s
); /* should be ASCII */
980 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
984 * Unlike all other mblen() functions, this also looks at the second byte of
985 * the input. However, if you only pass the first byte of a multi-byte
986 * string, and \0 as the second byte, this still works in a predictable way:
987 * a 4-byte character will be reported as two 2-byte characters. That's
988 * enough for all current uses, as a client-only encoding. It works that
989 * way, because in any valid 4-byte GB18030-encoded character, the third and
990 * fourth byte look like a 2-byte encoded character, when looked at
994 pg_gb18030_mblen(const unsigned char *s
)
998 if (!IS_HIGHBIT_SET(*s
))
1000 else if (*(s
+ 1) >= 0x30 && *(s
+ 1) <= 0x39)
1008 pg_gb18030_dsplen(const unsigned char *s
)
1012 if (IS_HIGHBIT_SET(*s
))
1015 len
= pg_ascii_dsplen(s
); /* ASCII */
1020 *-------------------------------------------------------------------
1021 * multibyte sequence validators
1023 * The verifychar functions accept "s", a pointer to the first byte of a
1024 * string, and "len", the remaining length of the string. If there is a
1025 * validly encoded character beginning at *s, return its length in bytes;
1028 * The verifystr functions also accept "s", a pointer to a string and "len",
1029 * the length of the string. They verify the whole string, and return the
1030 * number of input bytes (<= len) that are valid. In other words, if the
1031 * whole string is valid, verifystr returns "len", otherwise it returns the
1032 * byte offset of the first invalid character. The verifystr functions must
1033 * test for and reject zeroes in the input.
1035 * The verifychar functions can assume that len > 0 and that *s != '\0', but
1036 * they must test for and reject zeroes in any additional bytes of a
1037 * multibyte character. Note that this definition allows the function for a
1038 * single-byte encoding to be just "return 1".
1039 *-------------------------------------------------------------------
1042 pg_ascii_verifychar(const unsigned char *s
, int len
)
1048 pg_ascii_verifystr(const unsigned char *s
, int len
)
1050 const unsigned char *nullpos
= memchr(s
, 0, len
);
1052 if (nullpos
== NULL
)
/*
 * True when byte value c lies in the inclusive range 0xa1..0xfe.  The EUC
 * verifychar routines apply it to the bytes of a multibyte character.
 * The argument is expanded twice, so pass only side-effect-free expressions.
 */
1058 #define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe)
1061 pg_eucjp_verifychar(const unsigned char *s
, int len
)
1071 case SS2
: /* JIS X 0201 */
1076 if (c2
< 0xa1 || c2
> 0xdf)
1080 case SS3
: /* JIS X 0212 */
1085 if (!IS_EUC_RANGE_VALID(c2
))
1088 if (!IS_EUC_RANGE_VALID(c2
))
1093 if (IS_HIGHBIT_SET(c1
)) /* JIS X 0208? */
1098 if (!IS_EUC_RANGE_VALID(c1
))
1101 if (!IS_EUC_RANGE_VALID(c2
))
1116 pg_eucjp_verifystr(const unsigned char *s
, int len
)
1118 const unsigned char *start
= s
;
1124 /* fast path for ASCII-subset characters */
1125 if (!IS_HIGHBIT_SET(*s
))
1133 l
= pg_eucjp_verifychar(s
, len
);
1145 pg_euckr_verifychar(const unsigned char *s
, int len
)
1153 if (IS_HIGHBIT_SET(c1
))
1158 if (!IS_EUC_RANGE_VALID(c1
))
1161 if (!IS_EUC_RANGE_VALID(c2
))
1174 pg_euckr_verifystr(const unsigned char *s
, int len
)
1176 const unsigned char *start
= s
;
1182 /* fast path for ASCII-subset characters */
1183 if (!IS_HIGHBIT_SET(*s
))
1191 l
= pg_euckr_verifychar(s
, len
);
1202 /* EUC-CN byte sequences are exactly same as EUC-KR */
1203 #define pg_euccn_verifychar pg_euckr_verifychar
1204 #define pg_euccn_verifystr pg_euckr_verifystr
1207 pg_euctw_verifychar(const unsigned char *s
, int len
)
1217 case SS2
: /* CNS 11643 Plane 1-7 */
1222 if (c2
< 0xa1 || c2
> 0xa7)
1225 if (!IS_EUC_RANGE_VALID(c2
))
1228 if (!IS_EUC_RANGE_VALID(c2
))
1232 case SS3
: /* unused */
1236 if (IS_HIGHBIT_SET(c1
)) /* CNS 11643 Plane 1 */
1241 /* no further range check on c1? */
1243 if (!IS_EUC_RANGE_VALID(c2
))
1257 pg_euctw_verifystr(const unsigned char *s
, int len
)
1259 const unsigned char *start
= s
;
1265 /* fast path for ASCII-subset characters */
1266 if (!IS_HIGHBIT_SET(*s
))
1274 l
= pg_euctw_verifychar(s
, len
);
1286 pg_johab_verifychar(const unsigned char *s
, int len
)
1292 l
= mbl
= pg_johab_mblen(s
);
1297 if (!IS_HIGHBIT_SET(*s
))
1303 if (!IS_EUC_RANGE_VALID(c
))
1310 pg_johab_verifystr(const unsigned char *s
, int len
)
1312 const unsigned char *start
= s
;
1318 /* fast path for ASCII-subset characters */
1319 if (!IS_HIGHBIT_SET(*s
))
1327 l
= pg_johab_verifychar(s
, len
);
1339 pg_mule_verifychar(const unsigned char *s
, int len
)
1345 l
= mbl
= pg_mule_mblen(s
);
1353 if (!IS_HIGHBIT_SET(c
))
1360 pg_mule_verifystr(const unsigned char *s
, int len
)
1362 const unsigned char *start
= s
;
1368 /* fast path for ASCII-subset characters */
1369 if (!IS_HIGHBIT_SET(*s
))
1377 l
= pg_mule_verifychar(s
, len
);
1389 pg_latin1_verifychar(const unsigned char *s
, int len
)
1395 pg_latin1_verifystr(const unsigned char *s
, int len
)
1397 const unsigned char *nullpos
= memchr(s
, 0, len
);
1399 if (nullpos
== NULL
)
1406 pg_sjis_verifychar(const unsigned char *s
, int len
)
1413 l
= mbl
= pg_sjis_mblen(s
);
1418 if (l
== 1) /* pg_sjis_mblen already verified it */
1423 if (!ISSJISHEAD(c1
) || !ISSJISTAIL(c2
))
1429 pg_sjis_verifystr(const unsigned char *s
, int len
)
1431 const unsigned char *start
= s
;
1437 /* fast path for ASCII-subset characters */
1438 if (!IS_HIGHBIT_SET(*s
))
1446 l
= pg_sjis_verifychar(s
, len
);
1458 pg_big5_verifychar(const unsigned char *s
, int len
)
1463 l
= mbl
= pg_big5_mblen(s
);
1478 pg_big5_verifystr(const unsigned char *s
, int len
)
1480 const unsigned char *start
= s
;
1486 /* fast path for ASCII-subset characters */
1487 if (!IS_HIGHBIT_SET(*s
))
1495 l
= pg_big5_verifychar(s
, len
);
1507 pg_gbk_verifychar(const unsigned char *s
, int len
)
1512 l
= mbl
= pg_gbk_mblen(s
);
1527 pg_gbk_verifystr(const unsigned char *s
, int len
)
1529 const unsigned char *start
= s
;
1535 /* fast path for ASCII-subset characters */
1536 if (!IS_HIGHBIT_SET(*s
))
1544 l
= pg_gbk_verifychar(s
, len
);
1556 pg_uhc_verifychar(const unsigned char *s
, int len
)
1561 l
= mbl
= pg_uhc_mblen(s
);
1576 pg_uhc_verifystr(const unsigned char *s
, int len
)
1578 const unsigned char *start
= s
;
1584 /* fast path for ASCII-subset characters */
1585 if (!IS_HIGHBIT_SET(*s
))
1593 l
= pg_uhc_verifychar(s
, len
);
1605 pg_gb18030_verifychar(const unsigned char *s
, int len
)
1609 if (!IS_HIGHBIT_SET(*s
))
1611 else if (len
>= 4 && *(s
+ 1) >= 0x30 && *(s
+ 1) <= 0x39)
1613 /* Should be 4-byte, validate remaining bytes */
1614 if (*s
>= 0x81 && *s
<= 0xfe &&
1615 *(s
+ 2) >= 0x81 && *(s
+ 2) <= 0xfe &&
1616 *(s
+ 3) >= 0x30 && *(s
+ 3) <= 0x39)
1621 else if (len
>= 2 && *s
>= 0x81 && *s
<= 0xfe)
1623 /* Should be 2-byte, validate */
1624 if ((*(s
+ 1) >= 0x40 && *(s
+ 1) <= 0x7e) ||
1625 (*(s
+ 1) >= 0x80 && *(s
+ 1) <= 0xfe))
1636 pg_gb18030_verifystr(const unsigned char *s
, int len
)
1638 const unsigned char *start
= s
;
1644 /* fast path for ASCII-subset characters */
1645 if (!IS_HIGHBIT_SET(*s
))
1653 l
= pg_gb18030_verifychar(s
, len
);
1665 pg_utf8_verifychar(const unsigned char *s
, int len
)
1669 if ((*s
& 0x80) == 0)
1675 else if ((*s
& 0xe0) == 0xc0)
1677 else if ((*s
& 0xf0) == 0xe0)
1679 else if ((*s
& 0xf8) == 0xf0)
1687 if (!pg_utf8_islegal(s
, l
))
1694 * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1695 * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1696 * input byte and current state are used to compute an index into an array of
1697 * state transitions. Since the address of the next transition is dependent
1698 * on this computation, there is latency in executing the load instruction,
1699 * and the CPU is not kept busy.
1701 * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1703 * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1705 * In a shift-based DFA, the input byte is an index into an array of integers
1706 * whose bit pattern encodes the state transitions. To compute the next
1707 * state, we simply right-shift the integer by the current state and apply a
1708 * mask. In this scheme, the address of the transition only depends on the
1709 * input byte, so there is better pipelining.
1711 * The naming convention for states and transitions was adopted from a UTF-8
1712 * to UTF-16/32 transcoder, whose table is reproduced below:
1714 * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1716 * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1717 * ==========================================================================
1718 * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1719 * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1721 * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1722 * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1723 * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1725 * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1726 * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1728 * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1729 * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1731 * In the most straightforward implementation, a shift-based DFA for UTF-8
1732 * requires 64-bit integers to encode the transitions, but with an SMT solver
1733 * it's possible to find state numbers such that the transitions fit within
1734 * 32-bit integers, as Dougall Johnson demonstrated:
1736 * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1738 * This packed representation is the reason for the seemingly odd choice of
1739 * state values below.
1746 /* Continuation states, expect 1/2/3 continuation bytes */
1750 /* Partial states, where the first continuation byte has a restricted range */
1751 #define P3A 6 /* Lead was E0, check for 3-byte overlong */
1752 #define P3B 20 /* Lead was ED, check for surrogate */
1753 #define P4A 25 /* Lead was F0, check for 4-byte overlong */
1754 #define P4B 30 /* Lead was F4, check for too-large */
1755 /* Begin and End are the same state */
1758 /* the encoded state transitions for the lookup table */
1761 #define ASC (END << BGN)
1763 #define L2A (CS1 << BGN)
1765 #define L3A (P3A << BGN)
1766 #define L3B (CS2 << BGN)
1767 #define L3C (P3B << BGN)
1769 #define L4A (P4A << BGN)
1770 #define L4B (CS3 << BGN)
1771 #define L4C (P4B << BGN)
1772 /* continuation byte */
/*
 * Encoded transitions for the three continuation-byte classes.
 *
 * Each term (NEXT << CUR) stores the next state NEXT in the bit field that
 * starts at bit offset CUR, so that shifting the table entry right by the
 * current state and masking yields the next state.  All three classes
 * complete CS1 and step CS2 -> CS1 and CS3 -> CS2; they differ only in which
 * partial states accept them:
 *	CR1: P3B and P4B
 *	CR2: P3B and P4A
 *	CR3: P3A and P4A
 * Fields that are not set stay zero.
 *
 * The replacement lists are fully parenthesized so the macros remain single
 * operands wherever they are expanded, not only as array initializers.
 */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
1779 static const uint32 Utf8Transition
[256] =
1783 ILL
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1784 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1785 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1786 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1788 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1789 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1790 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1791 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1793 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1794 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1795 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1796 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1798 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1799 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1800 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1801 ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
, ASC
,
1803 /* continuation bytes */
1806 CR1
, CR1
, CR1
, CR1
, CR1
, CR1
, CR1
, CR1
,
1807 CR1
, CR1
, CR1
, CR1
, CR1
, CR1
, CR1
, CR1
,
1810 CR2
, CR2
, CR2
, CR2
, CR2
, CR2
, CR2
, CR2
,
1811 CR2
, CR2
, CR2
, CR2
, CR2
, CR2
, CR2
, CR2
,
1814 CR3
, CR3
, CR3
, CR3
, CR3
, CR3
, CR3
, CR3
,
1815 CR3
, CR3
, CR3
, CR3
, CR3
, CR3
, CR3
, CR3
,
1816 CR3
, CR3
, CR3
, CR3
, CR3
, CR3
, CR3
, CR3
,
1817 CR3
, CR3
, CR3
, CR3
, CR3
, CR3
, CR3
, CR3
,
1822 ILL
, ILL
, L2A
, L2A
, L2A
, L2A
, L2A
, L2A
,
1823 L2A
, L2A
, L2A
, L2A
, L2A
, L2A
, L2A
, L2A
,
1824 L2A
, L2A
, L2A
, L2A
, L2A
, L2A
, L2A
, L2A
,
1825 L2A
, L2A
, L2A
, L2A
, L2A
, L2A
, L2A
, L2A
,
1828 L3A
, L3B
, L3B
, L3B
, L3B
, L3B
, L3B
, L3B
,
1829 L3B
, L3B
, L3B
, L3B
, L3B
, L3C
, L3B
, L3B
,
1832 L4A
, L4B
, L4B
, L4B
, L4C
, ILL
, ILL
, ILL
,
1833 ILL
, ILL
, ILL
, ILL
, ILL
, ILL
, ILL
, ILL
1837 utf8_advance(const unsigned char *s
, uint32
*state
, int len
)
1839 /* Note: We deliberately don't check the state's value here. */
1843 * It's important that the mask value is 31: In most instruction sets,
1844 * a shift by a 32-bit operand is understood to be a shift by its mod
1845 * 32, so the compiler should elide the mask operation.
1847 *state
= Utf8Transition
[*s
++] >> (*state
& 31);
1855 pg_utf8_verifystr(const unsigned char *s
, int len
)
1857 const unsigned char *start
= s
;
1858 const int orig_len
= len
;
1862 * With a stride of two vector widths, gcc will unroll the loop. Even if
1863 * the compiler can unroll a longer loop, it's not worth it because we
1864 * must fall back to the byte-wise algorithm if we find any non-ASCII.
1866 #define STRIDE_LENGTH (2 * sizeof(Vector8))
1868 if (len
>= STRIDE_LENGTH
)
1870 while (len
>= STRIDE_LENGTH
)
1873 * If the chunk is all ASCII, we can skip the full UTF-8 check,
1874 * but we must first check for a non-END state, which means the
1875 * previous chunk ended in the middle of a multibyte sequence.
1877 if (state
!= END
|| !is_valid_ascii(s
, STRIDE_LENGTH
))
1878 utf8_advance(s
, &state
, STRIDE_LENGTH
);
1881 len
-= STRIDE_LENGTH
;
1884 /* The error state persists, so we only need to check for it here. */
1888 * Start over from the beginning with the slow path so we can
1889 * count the valid bytes.
1894 else if (state
!= END
)
1897 * The fast path exited in the middle of a multibyte sequence.
1898 * Walk backwards to find the leading byte so that the slow path
1899 * can resume checking from there. We must always backtrack at
1900 * least one byte, since the current byte could be e.g. an ASCII
1901 * byte after a 2-byte lead, which is invalid.
1908 Assert(IS_HIGHBIT_SET(*s
));
1909 } while (pg_utf_mblen(s
) <= 1);
1913 /* check remaining bytes */
1918 /* fast path for ASCII-subset characters */
1919 if (!IS_HIGHBIT_SET(*s
))
1927 l
= pg_utf8_verifychar(s
, len
);
1939 * Check for validity of a single UTF-8 encoded character
1941 * This directly implements the rules in RFC3629. The bizarre-looking
1942 * restrictions on the second byte are meant to ensure that there isn't
1943 * more than one encoding of a given Unicode character point; that is,
1944 * you may not use a longer-than-necessary byte sequence with high order
1945 * zero bits to represent a character that would fit in fewer bytes.
1946 * To do otherwise is to create security hazards (eg, create an apparent
1947 * non-ASCII character that decodes to plain ASCII).
1949 * length is assumed to have been obtained by pg_utf_mblen(), and the
1950 * caller must have checked that that many bytes are present in the buffer.
1953 pg_utf8_islegal(const unsigned char *source
, int length
)
1960 /* reject lengths 5 and 6 for now */
1964 if (a
< 0x80 || a
> 0xBF)
1969 if (a
< 0x80 || a
> 0xBF)
1977 if (a
< 0xA0 || a
> 0xBF)
1981 if (a
< 0x80 || a
> 0x9F)
1985 if (a
< 0x90 || a
> 0xBF)
1989 if (a
< 0x80 || a
> 0x8F)
1993 if (a
< 0x80 || a
> 0xBF)
2000 if (a
>= 0x80 && a
< 0xC2)
2011 *-------------------------------------------------------------------
2012 * encoding info table
2013 *-------------------------------------------------------------------
2015 const pg_wchar_tbl pg_wchar_table
[] = {
2016 [PG_SQL_ASCII
] = {pg_ascii2wchar_with_len
, pg_wchar2single_with_len
, pg_ascii_mblen
, pg_ascii_dsplen
, pg_ascii_verifychar
, pg_ascii_verifystr
, 1},
2017 [PG_EUC_JP
] = {pg_eucjp2wchar_with_len
, pg_wchar2euc_with_len
, pg_eucjp_mblen
, pg_eucjp_dsplen
, pg_eucjp_verifychar
, pg_eucjp_verifystr
, 3},
2018 [PG_EUC_CN
] = {pg_euccn2wchar_with_len
, pg_wchar2euc_with_len
, pg_euccn_mblen
, pg_euccn_dsplen
, pg_euccn_verifychar
, pg_euccn_verifystr
, 2},
2019 [PG_EUC_KR
] = {pg_euckr2wchar_with_len
, pg_wchar2euc_with_len
, pg_euckr_mblen
, pg_euckr_dsplen
, pg_euckr_verifychar
, pg_euckr_verifystr
, 3},
2020 [PG_EUC_TW
] = {pg_euctw2wchar_with_len
, pg_wchar2euc_with_len
, pg_euctw_mblen
, pg_euctw_dsplen
, pg_euctw_verifychar
, pg_euctw_verifystr
, 4},
2021 [PG_EUC_JIS_2004
] = {pg_eucjp2wchar_with_len
, pg_wchar2euc_with_len
, pg_eucjp_mblen
, pg_eucjp_dsplen
, pg_eucjp_verifychar
, pg_eucjp_verifystr
, 3},
2022 [PG_UTF8
] = {pg_utf2wchar_with_len
, pg_wchar2utf_with_len
, pg_utf_mblen
, pg_utf_dsplen
, pg_utf8_verifychar
, pg_utf8_verifystr
, 4},
2023 [PG_MULE_INTERNAL
] = {pg_mule2wchar_with_len
, pg_wchar2mule_with_len
, pg_mule_mblen
, pg_mule_dsplen
, pg_mule_verifychar
, pg_mule_verifystr
, 4},
2024 [PG_LATIN1
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2025 [PG_LATIN2
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2026 [PG_LATIN3
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2027 [PG_LATIN4
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2028 [PG_LATIN5
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2029 [PG_LATIN6
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2030 [PG_LATIN7
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2031 [PG_LATIN8
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2032 [PG_LATIN9
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2033 [PG_LATIN10
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2034 [PG_WIN1256
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2035 [PG_WIN1258
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2036 [PG_WIN866
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2037 [PG_WIN874
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2038 [PG_KOI8R
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2039 [PG_WIN1251
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2040 [PG_WIN1252
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2041 [PG_ISO_8859_5
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2042 [PG_ISO_8859_6
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2043 [PG_ISO_8859_7
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2044 [PG_ISO_8859_8
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2045 [PG_WIN1250
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2046 [PG_WIN1253
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2047 [PG_WIN1254
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2048 [PG_WIN1255
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2049 [PG_WIN1257
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2050 [PG_KOI8U
] = {pg_latin12wchar_with_len
, pg_wchar2single_with_len
, pg_latin1_mblen
, pg_latin1_dsplen
, pg_latin1_verifychar
, pg_latin1_verifystr
, 1},
2051 [PG_SJIS
] = {0, 0, pg_sjis_mblen
, pg_sjis_dsplen
, pg_sjis_verifychar
, pg_sjis_verifystr
, 2},
2052 [PG_BIG5
] = {0, 0, pg_big5_mblen
, pg_big5_dsplen
, pg_big5_verifychar
, pg_big5_verifystr
, 2},
2053 [PG_GBK
] = {0, 0, pg_gbk_mblen
, pg_gbk_dsplen
, pg_gbk_verifychar
, pg_gbk_verifystr
, 2},
2054 [PG_UHC
] = {0, 0, pg_uhc_mblen
, pg_uhc_dsplen
, pg_uhc_verifychar
, pg_uhc_verifystr
, 2},
2055 [PG_GB18030
] = {0, 0, pg_gb18030_mblen
, pg_gb18030_dsplen
, pg_gb18030_verifychar
, pg_gb18030_verifystr
, 4},
2056 [PG_JOHAB
] = {0, 0, pg_johab_mblen
, pg_johab_dsplen
, pg_johab_verifychar
, pg_johab_verifystr
, 3},
2057 [PG_SHIFT_JIS_2004
] = {0, 0, pg_sjis_mblen
, pg_sjis_dsplen
, pg_sjis_verifychar
, pg_sjis_verifystr
, 2},
2061 * Returns the byte length of a multibyte character.
2063 * Caution: when dealing with text that is not certainly valid in the
2064 * specified encoding, the result may exceed the actual remaining
2065 * string length. Callers that are not prepared to deal with that
2066 * should use pg_encoding_mblen_bounded() instead.
2069 pg_encoding_mblen(int encoding
, const char *mbstr
)
2071 return (PG_VALID_ENCODING(encoding
) ?
2072 pg_wchar_table
[encoding
].mblen((const unsigned char *) mbstr
) :
2073 pg_wchar_table
[PG_SQL_ASCII
].mblen((const unsigned char *) mbstr
));
/*
 * pg_encoding_mblen_bounded
 *		Like pg_encoding_mblen(), but never report more bytes than remain
 *		before the string's terminating NUL.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	/* length the encoding claims, judged from the leading byte(s) */
	int			claimed = pg_encoding_mblen(encoding, mbstr);

	/* strnlen() stops at the terminator, which clamps the claimed length */
	return strnlen(mbstr, claimed);
}
2087 * Returns the display length of a multibyte character.
2090 pg_encoding_dsplen(int encoding
, const char *mbstr
)
2092 return (PG_VALID_ENCODING(encoding
) ?
2093 pg_wchar_table
[encoding
].dsplen((const unsigned char *) mbstr
) :
2094 pg_wchar_table
[PG_SQL_ASCII
].dsplen((const unsigned char *) mbstr
));
2098 * Verify the first multibyte character of the given string.
2099 * Return its byte length if good, -1 if bad. (See comments above for
2100 * full details of the mbverifychar API.)
2103 pg_encoding_verifymbchar(int encoding
, const char *mbstr
, int len
)
2105 return (PG_VALID_ENCODING(encoding
) ?
2106 pg_wchar_table
[encoding
].mbverifychar((const unsigned char *) mbstr
, len
) :
2107 pg_wchar_table
[PG_SQL_ASCII
].mbverifychar((const unsigned char *) mbstr
, len
));
2111 * Verify that a string is valid for the given encoding.
2112 * Returns the number of input bytes (<= len) that form a valid string.
2113 * (See comments above for full details of the mbverifystr API.)
2116 pg_encoding_verifymbstr(int encoding
, const char *mbstr
, int len
)
2118 return (PG_VALID_ENCODING(encoding
) ?
2119 pg_wchar_table
[encoding
].mbverifystr((const unsigned char *) mbstr
, len
) :
2120 pg_wchar_table
[PG_SQL_ASCII
].mbverifystr((const unsigned char *) mbstr
, len
));
2124 * fetch maximum length of a given encoding
2127 pg_encoding_max_length(int encoding
)
2129 Assert(PG_VALID_ENCODING(encoding
));
2131 return pg_wchar_table
[encoding
].maxmblen
;