vacuumlazy.c: Tweak local variable name.
[pgsql.git] / src / common / wchar.c
blobfb9d9f5c85f8111f2010dad28a051cac44aeafe6
1 /*-------------------------------------------------------------------------
3 * wchar.c
4 * Functions for working with multibyte characters in various encodings.
6 * Portions Copyright (c) 1998-2023, PostgreSQL Global Development Group
8 * IDENTIFICATION
9 * src/common/wchar.c
11 *-------------------------------------------------------------------------
13 #include "c.h"
15 #include "mb/pg_wchar.h"
19 * Operations on multi-byte encodings are driven by a table of helper
20 * functions.
22 * To add support for an encoding, define mblen(), dsplen(), verifychar() and
23 * verifystr() for the encoding. For server-encodings, also define mb2wchar()
24 * and wchar2mb() conversion functions.
26 * These functions generally assume that their input is validly formed.
27 * The "verifier" functions, further down in the file, have to be more
28 * paranoid.
30 * We expect that mblen() does not need to examine more than the first byte
31 * of the character to discover the correct length. GB18030 is an exception
32 * to that rule, though, as it also looks at second byte. But even that
33 * behaves in a predictable way, if you only pass the first byte: it will
34 * treat 4-byte encoded characters as two 2-byte encoded characters, which is
35 * good enough for all current uses.
37 * Note: for the display output of psql to work properly, the return values
38 * of the dsplen functions must conform to the Unicode standard. In particular
39 * the NUL character is zero width and control characters are generally
40 * width -1. It is recommended that non-ASCII encodings refer their ASCII
41 * subset to the ASCII routines to ensure consistency.
45 * SQL/ASCII
47 static int
48 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
50 int cnt = 0;
52 while (len > 0 && *from)
54 *to++ = *from++;
55 len--;
56 cnt++;
58 *to = 0;
59 return cnt;
/*
 * Byte length of the character at "s": always 1 for a single-byte encoding.
 */
static int
pg_ascii_mblen(const unsigned char *s)
{
	return 1;
}
/*
 * Display width of the ASCII character at "s".
 *
 * Returns 0 for NUL, -1 for control characters (below 0x20, or DEL 0x7f),
 * and 1 for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	if (*s == '\0')
		return 0;
	if (*s < 0x20 || *s == 0x7f)
		return -1;

	return 1;
}
80 * EUC
82 static int
83 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
85 int cnt = 0;
87 while (len > 0 && *from)
89 if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
90 * KANA") */
92 from++;
93 *to = (SS2 << 8) | *from++;
94 len -= 2;
96 else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
98 from++;
99 *to = (SS3 << 16) | (*from++ << 8);
100 *to |= *from++;
101 len -= 3;
103 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
105 *to = *from++ << 8;
106 *to |= *from++;
107 len -= 2;
109 else /* must be ASCII */
111 *to = *from++;
112 len--;
114 to++;
115 cnt++;
117 *to = 0;
118 return cnt;
121 static inline int
122 pg_euc_mblen(const unsigned char *s)
124 int len;
126 if (*s == SS2)
127 len = 2;
128 else if (*s == SS3)
129 len = 3;
130 else if (IS_HIGHBIT_SET(*s))
131 len = 2;
132 else
133 len = 1;
134 return len;
137 static inline int
138 pg_euc_dsplen(const unsigned char *s)
140 int len;
142 if (*s == SS2)
143 len = 2;
144 else if (*s == SS3)
145 len = 2;
146 else if (IS_HIGHBIT_SET(*s))
147 len = 2;
148 else
149 len = pg_ascii_dsplen(s);
150 return len;
154 * EUC_JP
156 static int
157 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
159 return pg_euc2wchar_with_len(from, to, len);
/*
 * Byte length of an EUC_JP character: same rule as generic EUC.
 */
static int
pg_eucjp_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
168 static int
169 pg_eucjp_dsplen(const unsigned char *s)
171 int len;
173 if (*s == SS2)
174 len = 1;
175 else if (*s == SS3)
176 len = 2;
177 else if (IS_HIGHBIT_SET(*s))
178 len = 2;
179 else
180 len = pg_ascii_dsplen(s);
181 return len;
185 * EUC_KR
187 static int
188 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
190 return pg_euc2wchar_with_len(from, to, len);
/*
 * Byte length of an EUC_KR character: same rule as generic EUC.
 */
static int
pg_euckr_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
/*
 * Display width of an EUC_KR character: same rule as generic EUC.
 */
static int
pg_euckr_dsplen(const unsigned char *s)
{
	return pg_euc_dsplen(s);
}
206 * EUC_CN
209 static int
210 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
212 int cnt = 0;
214 while (len > 0 && *from)
216 if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
218 from++;
219 *to = (SS2 << 16) | (*from++ << 8);
220 *to |= *from++;
221 len -= 3;
223 else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
225 from++;
226 *to = (SS3 << 16) | (*from++ << 8);
227 *to |= *from++;
228 len -= 3;
230 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
232 *to = *from++ << 8;
233 *to |= *from++;
234 len -= 2;
236 else
238 *to = *from++;
239 len--;
241 to++;
242 cnt++;
244 *to = 0;
245 return cnt;
/*
 * Byte length of the EUC_CN character starting at "s": 2 if the first byte
 * has its high bit set, else 1.
 */
static int
pg_euccn_mblen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;
	else
		len = 1;
	return len;
}
/*
 * Display width of the EUC_CN character starting at "s": 2 for a high-bit
 * lead byte, otherwise the ASCII width.
 */
static int
pg_euccn_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;
	else
		len = pg_ascii_dsplen(s);
	return len;
}
273 * EUC_TW
276 static int
277 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
279 int cnt = 0;
281 while (len > 0 && *from)
283 if (*from == SS2 && len >= 4) /* code set 2 */
285 from++;
286 *to = (((uint32) SS2) << 24) | (*from++ << 16);
287 *to |= *from++ << 8;
288 *to |= *from++;
289 len -= 4;
291 else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
293 from++;
294 *to = (SS3 << 16) | (*from++ << 8);
295 *to |= *from++;
296 len -= 3;
298 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
300 *to = *from++ << 8;
301 *to |= *from++;
302 len -= 2;
304 else
306 *to = *from++;
307 len--;
309 to++;
310 cnt++;
312 *to = 0;
313 return cnt;
316 static int
317 pg_euctw_mblen(const unsigned char *s)
319 int len;
321 if (*s == SS2)
322 len = 4;
323 else if (*s == SS3)
324 len = 3;
325 else if (IS_HIGHBIT_SET(*s))
326 len = 2;
327 else
328 len = 1;
329 return len;
332 static int
333 pg_euctw_dsplen(const unsigned char *s)
335 int len;
337 if (*s == SS2)
338 len = 2;
339 else if (*s == SS3)
340 len = 2;
341 else if (IS_HIGHBIT_SET(*s))
342 len = 2;
343 else
344 len = pg_ascii_dsplen(s);
345 return len;
349 * Convert pg_wchar to EUC_* encoding.
350 * caller must allocate enough space for "to", including a trailing zero!
351 * len: length of from.
352 * "from" not necessarily null terminated.
354 static int
355 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
357 int cnt = 0;
359 while (len > 0 && *from)
361 unsigned char c;
363 if ((c = (*from >> 24)))
365 *to++ = c;
366 *to++ = (*from >> 16) & 0xff;
367 *to++ = (*from >> 8) & 0xff;
368 *to++ = *from & 0xff;
369 cnt += 4;
371 else if ((c = (*from >> 16)))
373 *to++ = c;
374 *to++ = (*from >> 8) & 0xff;
375 *to++ = *from & 0xff;
376 cnt += 3;
378 else if ((c = (*from >> 8)))
380 *to++ = c;
381 *to++ = *from & 0xff;
382 cnt += 2;
384 else
386 *to++ = *from;
387 cnt++;
389 from++;
390 len--;
392 *to = 0;
393 return cnt;
398 * JOHAB
/*
 * Byte length of a JOHAB character: uses the generic EUC rule.
 */
static int
pg_johab_mblen(const unsigned char *s)
{
	return pg_euc_mblen(s);
}
/*
 * Display width of a JOHAB character: uses the generic EUC rule.
 */
static int
pg_johab_dsplen(const unsigned char *s)
{
	return pg_euc_dsplen(s);
}
413 * convert UTF8 string to pg_wchar (UCS-4)
414 * caller must allocate enough space for "to", including a trailing zero!
415 * len: length of from.
416 * "from" not necessarily null terminated.
418 static int
419 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
421 int cnt = 0;
422 uint32 c1,
427 while (len > 0 && *from)
429 if ((*from & 0x80) == 0)
431 *to = *from++;
432 len--;
434 else if ((*from & 0xe0) == 0xc0)
436 if (len < 2)
437 break; /* drop trailing incomplete char */
438 c1 = *from++ & 0x1f;
439 c2 = *from++ & 0x3f;
440 *to = (c1 << 6) | c2;
441 len -= 2;
443 else if ((*from & 0xf0) == 0xe0)
445 if (len < 3)
446 break; /* drop trailing incomplete char */
447 c1 = *from++ & 0x0f;
448 c2 = *from++ & 0x3f;
449 c3 = *from++ & 0x3f;
450 *to = (c1 << 12) | (c2 << 6) | c3;
451 len -= 3;
453 else if ((*from & 0xf8) == 0xf0)
455 if (len < 4)
456 break; /* drop trailing incomplete char */
457 c1 = *from++ & 0x07;
458 c2 = *from++ & 0x3f;
459 c3 = *from++ & 0x3f;
460 c4 = *from++ & 0x3f;
461 *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
462 len -= 4;
464 else
466 /* treat a bogus char as length 1; not ours to raise error */
467 *to = *from++;
468 len--;
470 to++;
471 cnt++;
473 *to = 0;
474 return cnt;
479 * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of
480 * space allocated.
482 unsigned char *
483 unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
485 if (c <= 0x7F)
487 utf8string[0] = c;
489 else if (c <= 0x7FF)
491 utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
492 utf8string[1] = 0x80 | (c & 0x3F);
494 else if (c <= 0xFFFF)
496 utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
497 utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
498 utf8string[2] = 0x80 | (c & 0x3F);
500 else
502 utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
503 utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
504 utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
505 utf8string[3] = 0x80 | (c & 0x3F);
508 return utf8string;
512 * Trivial conversion from pg_wchar to UTF-8.
513 * caller should allocate enough space for "to"
514 * len: length of from.
515 * "from" not necessarily null terminated.
517 static int
518 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
520 int cnt = 0;
522 while (len > 0 && *from)
524 int char_len;
526 unicode_to_utf8(*from, to);
527 char_len = pg_utf_mblen(to);
528 cnt += char_len;
529 to += char_len;
530 from++;
531 len--;
533 *to = 0;
534 return cnt;
/*
 * Return the byte length of a UTF8 character pointed to by s
 *
 * Note: in the current implementation we do not support UTF8 sequences
 * of more than 4 bytes; hence do NOT return a value larger than 4.
 * We return "1" for any leading byte that is either flat-out illegal or
 * indicates a length larger than we support.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	int			len;

	if ((*s & 0x80) == 0)
		len = 1;
	else if ((*s & 0xe0) == 0xc0)
		len = 2;
	else if ((*s & 0xf0) == 0xe0)
		len = 3;
	else if ((*s & 0xf8) == 0xf0)
		len = 4;
#ifdef NOT_USED
	else if ((*s & 0xfc) == 0xf8)
		len = 5;
	else if ((*s & 0xfe) == 0xfc)
		len = 6;
#endif
	else
		len = 1;
	return len;
}
573 * This is an implementation of wcwidth() and wcswidth() as defined in
574 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
575 * <http://www.unix.org/online.html>
577 * Markus Kuhn -- 2001-09-08 -- public domain
579 * customised for PostgreSQL
581 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
/* Inclusive range [first, last] of code points, as used in the width tables */
struct mbinterval
{
	unsigned int first;
	unsigned int last;
};
590 /* auxiliary function for binary search in interval table */
591 static int
592 mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
594 int min = 0;
595 int mid;
597 if (ucs < table[0].first || ucs > table[max].last)
598 return 0;
599 while (max >= min)
601 mid = (min + max) / 2;
602 if (ucs > table[mid].last)
603 min = mid + 1;
604 else if (ucs < table[mid].first)
605 max = mid - 1;
606 else
607 return 1;
610 return 0;
614 /* The following functions define the column width of an ISO 10646
615 * character as follows:
617 * - The null character (U+0000) has a column width of 0.
619 * - Other C0/C1 control characters and DEL will lead to a return
620 * value of -1.
622 * - Non-spacing and enclosing combining characters (general
623 * category code Mn, Me or Cf in the Unicode database) have a
624 * column width of 0.
626 * - Spacing characters in the East Asian Wide (W) or East Asian
627 * FullWidth (F) category as defined in Unicode Technical
628 * Report #11 have a column width of 2.
630 * - All remaining characters (including all printable
631 * ISO 8859-1 and WGL4 characters, Unicode control characters,
632 * etc.) have a column width of 1.
634 * This implementation assumes that wchar_t characters are encoded
635 * in ISO 10646.
638 static int
639 ucs_wcwidth(pg_wchar ucs)
641 #include "common/unicode_nonspacing_table.h"
642 #include "common/unicode_east_asian_fw_table.h"
644 /* test for 8-bit control characters */
645 if (ucs == 0)
646 return 0;
648 if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
649 return -1;
652 * binary search in table of non-spacing characters
654 * XXX: In the official Unicode sources, it is possible for a character to
655 * be described as both non-spacing and wide at the same time. As of
656 * Unicode 13.0, treating the non-spacing property as the determining
657 * factor for display width leads to the correct behavior, so do that
658 * search first.
660 if (mbbisearch(ucs, nonspacing,
661 sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
662 return 0;
664 /* binary search in table of wide characters */
665 if (mbbisearch(ucs, east_asian_fw,
666 sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
667 return 2;
669 return 1;
673 * Convert a UTF-8 character to a Unicode code point.
674 * This is a one-character version of pg_utf2wchar_with_len.
676 * No error checks here, c must point to a long-enough string.
678 pg_wchar
679 utf8_to_unicode(const unsigned char *c)
681 if ((*c & 0x80) == 0)
682 return (pg_wchar) c[0];
683 else if ((*c & 0xe0) == 0xc0)
684 return (pg_wchar) (((c[0] & 0x1f) << 6) |
685 (c[1] & 0x3f));
686 else if ((*c & 0xf0) == 0xe0)
687 return (pg_wchar) (((c[0] & 0x0f) << 12) |
688 ((c[1] & 0x3f) << 6) |
689 (c[2] & 0x3f));
690 else if ((*c & 0xf8) == 0xf0)
691 return (pg_wchar) (((c[0] & 0x07) << 18) |
692 ((c[1] & 0x3f) << 12) |
693 ((c[2] & 0x3f) << 6) |
694 (c[3] & 0x3f));
695 else
696 /* that is an invalid code on purpose */
697 return 0xffffffff;
/*
 * Display width of the UTF-8 character at "s": decode it, then look up the
 * width of the resulting code point.
 */
static int
pg_utf_dsplen(const unsigned char *s)
{
	return ucs_wcwidth(utf8_to_unicode(s));
}
707 * convert mule internal code to pg_wchar
708 * caller should allocate enough space for "to"
709 * len: length of from.
710 * "from" not necessarily null terminated.
712 static int
713 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
715 int cnt = 0;
717 while (len > 0 && *from)
719 if (IS_LC1(*from) && len >= 2)
721 *to = *from++ << 16;
722 *to |= *from++;
723 len -= 2;
725 else if (IS_LCPRV1(*from) && len >= 3)
727 from++;
728 *to = *from++ << 16;
729 *to |= *from++;
730 len -= 3;
732 else if (IS_LC2(*from) && len >= 3)
734 *to = *from++ << 16;
735 *to |= *from++ << 8;
736 *to |= *from++;
737 len -= 3;
739 else if (IS_LCPRV2(*from) && len >= 4)
741 from++;
742 *to = *from++ << 16;
743 *to |= *from++ << 8;
744 *to |= *from++;
745 len -= 4;
747 else
748 { /* assume ASCII */
749 *to = (unsigned char) *from++;
750 len--;
752 to++;
753 cnt++;
755 *to = 0;
756 return cnt;
760 * convert pg_wchar to mule internal code
761 * caller should allocate enough space for "to"
762 * len: length of from.
763 * "from" not necessarily null terminated.
765 static int
766 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
768 int cnt = 0;
770 while (len > 0 && *from)
772 unsigned char lb;
774 lb = (*from >> 16) & 0xff;
775 if (IS_LC1(lb))
777 *to++ = lb;
778 *to++ = *from & 0xff;
779 cnt += 2;
781 else if (IS_LC2(lb))
783 *to++ = lb;
784 *to++ = (*from >> 8) & 0xff;
785 *to++ = *from & 0xff;
786 cnt += 3;
788 else if (IS_LCPRV1_A_RANGE(lb))
790 *to++ = LCPRV1_A;
791 *to++ = lb;
792 *to++ = *from & 0xff;
793 cnt += 3;
795 else if (IS_LCPRV1_B_RANGE(lb))
797 *to++ = LCPRV1_B;
798 *to++ = lb;
799 *to++ = *from & 0xff;
800 cnt += 3;
802 else if (IS_LCPRV2_A_RANGE(lb))
804 *to++ = LCPRV2_A;
805 *to++ = lb;
806 *to++ = (*from >> 8) & 0xff;
807 *to++ = *from & 0xff;
808 cnt += 4;
810 else if (IS_LCPRV2_B_RANGE(lb))
812 *to++ = LCPRV2_B;
813 *to++ = lb;
814 *to++ = (*from >> 8) & 0xff;
815 *to++ = *from & 0xff;
816 cnt += 4;
818 else
820 *to++ = *from & 0xff;
821 cnt += 1;
823 from++;
824 len--;
826 *to = 0;
827 return cnt;
/*
 * Byte length of the MULE internal code character starting at "s", decided
 * from the first byte only.
 *
 * exported for direct use by conv.c
 */
int
pg_mule_mblen(const unsigned char *s)
{
	int			len;

	if (IS_LC1(*s))
		len = 2;
	else if (IS_LCPRV1(*s))
		len = 3;
	else if (IS_LC2(*s))
		len = 3;
	else if (IS_LCPRV2(*s))
		len = 4;
	else
		len = 1;				/* assume ASCII */
	return len;
}
/*
 * Display width of the MULE internal code character starting at "s":
 * 1 for LC1/LCPRV1 forms, 2 for LC2/LCPRV2 forms, 1 otherwise.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	int			len;

	/*
	 * Note: it's not really appropriate to assume that all multibyte charsets
	 * are double-wide on screen.  But this seems an okay approximation for
	 * the MULE charsets we currently support.
	 */

	if (IS_LC1(*s))
		len = 1;
	else if (IS_LCPRV1(*s))
		len = 1;
	else if (IS_LC2(*s))
		len = 2;
	else if (IS_LCPRV2(*s))
		len = 2;
	else
		len = 1;				/* assume ASCII */

	return len;
}
875 * ISO8859-1
877 static int
878 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
880 int cnt = 0;
882 while (len > 0 && *from)
884 *to++ = *from++;
885 len--;
886 cnt++;
888 *to = 0;
889 return cnt;
893 * Trivial conversion from pg_wchar to single byte encoding. Just ignores
894 * high bits.
895 * caller should allocate enough space for "to"
896 * len: length of from.
897 * "from" not necessarily null terminated.
899 static int
900 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
902 int cnt = 0;
904 while (len > 0 && *from)
906 *to++ = *from++;
907 len--;
908 cnt++;
910 *to = 0;
911 return cnt;
/*
 * Byte length of the character at "s": always 1 for a single-byte encoding.
 */
static int
pg_latin1_mblen(const unsigned char *s)
{
	return 1;
}
/*
 * Display width of a single-byte character: same rule as ASCII.
 */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	return pg_ascii_dsplen(s);
}
927 * SJIS
/*
 * Byte length of the SJIS character starting at "s": 1 for bytes in
 * 0xa1..0xdf, 2 for any other high-bit byte, else 1.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
	int			len;

	if (*s >= 0xa1 && *s <= 0xdf)
		len = 1;				/* 1 byte kana? */
	else if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = 1;				/* should be ASCII */
	return len;
}
/*
 * Display width of the SJIS character starting at "s": 1 for bytes in
 * 0xa1..0xdf, 2 for any other high-bit byte, otherwise the ASCII width.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
	int			len;

	if (*s >= 0xa1 && *s <= 0xdf)
		len = 1;				/* 1 byte kana? */
	else if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = pg_ascii_dsplen(s);	/* should be ASCII */
	return len;
}
958 * Big5
/*
 * Byte length of the Big5 character starting at "s": 2 if the first byte
 * has its high bit set, else 1.
 */
static int
pg_big5_mblen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = 1;				/* should be ASCII */
	return len;
}
/*
 * Display width of the Big5 character starting at "s": 2 for a high-bit
 * lead byte, otherwise the ASCII width.
 */
static int
pg_big5_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = pg_ascii_dsplen(s);	/* should be ASCII */
	return len;
}
985 * GBK
/*
 * Byte length of the GBK character starting at "s": 2 if the first byte has
 * its high bit set, else 1.
 */
static int
pg_gbk_mblen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = 1;				/* should be ASCII */
	return len;
}
/*
 * Display width of the GBK character starting at "s": 2 for a high-bit lead
 * byte, otherwise the ASCII width.
 */
static int
pg_gbk_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* kanji? */
	else
		len = pg_ascii_dsplen(s);	/* should be ASCII */
	return len;
}
1012 * UHC
/*
 * Byte length of the UHC character starting at "s": 2 if the first byte has
 * its high bit set, else 1.
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* 2byte? */
	else
		len = 1;				/* should be ASCII */
	return len;
}
/*
 * Display width of the UHC character starting at "s": 2 for a high-bit lead
 * byte, otherwise the ASCII width.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;				/* 2byte? */
	else
		len = pg_ascii_dsplen(s);	/* should be ASCII */
	return len;
}
1039 * GB18030
1040 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
/*
 * Unlike all other mblen() functions, this also looks at the second byte of
 * the input.  However, if you only pass the first byte of a multi-byte
 * string, and \0 as the second byte, this still works in a predictable way:
 * a 4-byte character will be reported as two 2-byte characters.  That's
 * enough for all current uses, as a client-only encoding.  It works that
 * way, because in any valid 4-byte GB18030-encoded character, the third and
 * fourth byte look like a 2-byte encoded character, when looked at
 * separately.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	int			len;

	if (!IS_HIGHBIT_SET(*s))
		len = 1;				/* ASCII */
	else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
		len = 4;
	else
		len = 2;
	return len;
}
/*
 * Display width of the GB18030 character starting at "s": 2 for a high-bit
 * lead byte, otherwise the ASCII width.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	int			len;

	if (IS_HIGHBIT_SET(*s))
		len = 2;
	else
		len = pg_ascii_dsplen(s);	/* ASCII */
	return len;
}
1080 *-------------------------------------------------------------------
1081 * multibyte sequence validators
1083 * The verifychar functions accept "s", a pointer to the first byte of a
1084 * string, and "len", the remaining length of the string. If there is a
1085 * validly encoded character beginning at *s, return its length in bytes;
1086 * else return -1.
1088 * The verifystr functions also accept "s", a pointer to a string and "len",
1089 * the length of the string. They verify the whole string, and return the
1090 * number of input bytes (<= len) that are valid. In other words, if the
1091 * whole string is valid, verifystr returns "len", otherwise it returns the
1092 * byte offset of the first invalid character. The verifystr functions must
1093 * test for and reject zeroes in the input.
1095 * The verifychar functions can assume that len > 0 and that *s != '\0', but
1096 * they must test for and reject zeroes in any additional bytes of a
1097 * multibyte character. Note that this definition allows the function for a
1098 * single-byte encoding to be just "return 1".
1099 *-------------------------------------------------------------------
/*
 * Verify one character of a single-byte encoding: every byte is accepted,
 * so the length is always 1.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	return 1;
}
/*
 * Verify a single-byte-encoded string of "len" bytes.  Only a zero byte is
 * invalid: returns "len" if there is none, else the offset of the first one.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nullpos = memchr(s, 0, len);

	if (nullpos == NULL)
		return len;
	else
		return nullpos - s;
}
/* True if "c" lies in 0xa1..0xfe; note the argument is evaluated twice */
#define IS_EUC_RANGE_VALID(c)	((c) >= 0xa1 && (c) <= 0xfe)
1120 static int
1121 pg_eucjp_verifychar(const unsigned char *s, int len)
1123 int l;
1124 unsigned char c1,
1127 c1 = *s++;
1129 switch (c1)
1131 case SS2: /* JIS X 0201 */
1132 l = 2;
1133 if (l > len)
1134 return -1;
1135 c2 = *s++;
1136 if (c2 < 0xa1 || c2 > 0xdf)
1137 return -1;
1138 break;
1140 case SS3: /* JIS X 0212 */
1141 l = 3;
1142 if (l > len)
1143 return -1;
1144 c2 = *s++;
1145 if (!IS_EUC_RANGE_VALID(c2))
1146 return -1;
1147 c2 = *s++;
1148 if (!IS_EUC_RANGE_VALID(c2))
1149 return -1;
1150 break;
1152 default:
1153 if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1155 l = 2;
1156 if (l > len)
1157 return -1;
1158 if (!IS_EUC_RANGE_VALID(c1))
1159 return -1;
1160 c2 = *s++;
1161 if (!IS_EUC_RANGE_VALID(c2))
1162 return -1;
1164 else
1165 /* must be ASCII */
1167 l = 1;
1169 break;
1172 return l;
/*
 * Verify an EUC_JP string of "len" bytes.  Returns the number of leading
 * bytes that are valid, stopping at the first zero byte or invalid character.
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_eucjp_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
/*
 * Verify one EUC_KR character starting at "s" with "len" bytes available.
 *
 * Returns its byte length (1 or 2), or -1 if a two-byte sequence is
 * truncated or either byte is outside 0xa1..0xfe.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	int			l;
	unsigned char c1,
				c2;

	c1 = *s++;

	if (IS_HIGHBIT_SET(c1))
	{
		l = 2;
		if (l > len)
			return -1;
		if (!IS_EUC_RANGE_VALID(c1))
			return -1;
		c2 = *s++;
		if (!IS_EUC_RANGE_VALID(c2))
			return -1;
	}
	else
		/* must be ASCII */
	{
		l = 1;
	}

	return l;
}
/*
 * Verify an EUC_KR string of "len" bytes.  Returns the number of leading
 * bytes that are valid, stopping at the first zero byte or invalid character.
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_euckr_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
/* EUC-CN byte sequences are exactly same as EUC-KR */
#define pg_euccn_verifychar	pg_euckr_verifychar
#define pg_euccn_verifystr	pg_euckr_verifystr
1266 static int
1267 pg_euctw_verifychar(const unsigned char *s, int len)
1269 int l;
1270 unsigned char c1,
1273 c1 = *s++;
1275 switch (c1)
1277 case SS2: /* CNS 11643 Plane 1-7 */
1278 l = 4;
1279 if (l > len)
1280 return -1;
1281 c2 = *s++;
1282 if (c2 < 0xa1 || c2 > 0xa7)
1283 return -1;
1284 c2 = *s++;
1285 if (!IS_EUC_RANGE_VALID(c2))
1286 return -1;
1287 c2 = *s++;
1288 if (!IS_EUC_RANGE_VALID(c2))
1289 return -1;
1290 break;
1292 case SS3: /* unused */
1293 return -1;
1295 default:
1296 if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1298 l = 2;
1299 if (l > len)
1300 return -1;
1301 /* no further range check on c1? */
1302 c2 = *s++;
1303 if (!IS_EUC_RANGE_VALID(c2))
1304 return -1;
1306 else
1307 /* must be ASCII */
1309 l = 1;
1311 break;
1313 return l;
/*
 * Verify an EUC_TW string of "len" bytes.  Returns the number of leading
 * bytes that are valid, stopping at the first zero byte or invalid character.
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_euctw_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
/*
 * Verify one JOHAB character starting at "s" with "len" bytes available.
 *
 * Returns the length reported by pg_johab_mblen(), or -1 if the input is
 * shorter than that or any trailing byte is outside 0xa1..0xfe.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	int			l,
				mbl;
	unsigned char c;

	l = mbl = pg_johab_mblen(s);

	if (len < l)
		return -1;

	if (!IS_HIGHBIT_SET(*s))
		return mbl;

	/* check every byte after the lead byte */
	while (--l > 0)
	{
		c = *++s;
		if (!IS_EUC_RANGE_VALID(c))
			return -1;
	}
	return mbl;
}
/*
 * Verify a JOHAB string of "len" bytes.  Returns the number of leading
 * bytes that are valid, stopping at the first zero byte or invalid character.
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_johab_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
/*
 * Verify one MULE internal code character starting at "s" with "len" bytes
 * available.
 *
 * Returns the length reported by pg_mule_mblen(), or -1 if the input is
 * shorter than that or any trailing byte lacks the high bit.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
	int			l,
				mbl;
	unsigned char c;

	l = mbl = pg_mule_mblen(s);

	if (len < l)
		return -1;

	/* check every byte after the lead byte */
	while (--l > 0)
	{
		c = *++s;
		if (!IS_HIGHBIT_SET(c))
			return -1;
	}
	return mbl;
}
/*
 * Verify a MULE internal code string of "len" bytes.  Returns the number of
 * leading bytes that are valid, stopping at the first zero byte or invalid
 * character.
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_mule_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
/*
 * Verify one character of a single-byte encoding: every byte is accepted,
 * so the length is always 1.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	return 1;
}
/*
 * Verify a single-byte-encoded string of "len" bytes.  Only a zero byte is
 * invalid: returns "len" if there is none, else the offset of the first one.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *nullpos = memchr(s, 0, len);

	if (nullpos == NULL)
		return len;
	else
		return nullpos - s;
}
/*
 * Verify one SJIS character starting at "s" with "len" bytes available.
 *
 * Returns the length reported by pg_sjis_mblen(), or -1 if the input is
 * shorter than that or a two-byte character fails the ISSJISHEAD /
 * ISSJISTAIL checks.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	int			l,
				mbl;
	unsigned char c1,
				c2;

	l = mbl = pg_sjis_mblen(s);

	if (len < l)
		return -1;

	if (l == 1)					/* pg_sjis_mblen already verified it */
		return mbl;

	c1 = *s++;
	c2 = *s;
	if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
		return -1;
	return mbl;
}
/*
 * Verify an SJIS string of "len" bytes.  Returns the number of leading
 * bytes that are valid, stopping at the first zero byte or invalid character.
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_sjis_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
/*
 * Verify one Big5 character starting at "s" with "len" bytes available.
 *
 * Returns the length reported by pg_big5_mblen(), or -1 if the input is
 * shorter than that or a trailing byte is zero.
 */
static int
pg_big5_verifychar(const unsigned char *s, int len)
{
	int			l,
				mbl;

	l = mbl = pg_big5_mblen(s);

	if (len < l)
		return -1;

	/* only embedded zero bytes are rejected in the trailing bytes */
	while (--l > 0)
	{
		if (*++s == '\0')
			return -1;
	}

	return mbl;
}
/*
 * Verify a Big5 string of "len" bytes.  Returns the number of leading bytes
 * that are valid, stopping at the first zero byte or invalid character.
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_big5_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
/*
 * Verify one GBK character starting at "s" with "len" bytes available.
 *
 * Returns the length reported by pg_gbk_mblen(), or -1 if the input is
 * shorter than that or a trailing byte is zero.
 */
static int
pg_gbk_verifychar(const unsigned char *s, int len)
{
	int			l,
				mbl;

	l = mbl = pg_gbk_mblen(s);

	if (len < l)
		return -1;

	/* only embedded zero bytes are rejected in the trailing bytes */
	while (--l > 0)
	{
		if (*++s == '\0')
			return -1;
	}

	return mbl;
}
/*
 * Verify a GBK string of "len" bytes.  Returns the number of leading bytes
 * that are valid, stopping at the first zero byte or invalid character.
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_gbk_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
/*
 * Verify one UHC character starting at "s" with "len" bytes available.
 *
 * Returns the length reported by pg_uhc_mblen(), or -1 if the input is
 * shorter than that or a trailing byte is zero.
 */
static int
pg_uhc_verifychar(const unsigned char *s, int len)
{
	int			l,
				mbl;

	l = mbl = pg_uhc_mblen(s);

	if (len < l)
		return -1;

	/* only embedded zero bytes are rejected in the trailing bytes */
	while (--l > 0)
	{
		if (*++s == '\0')
			return -1;
	}

	return mbl;
}
/*
 * Verify a UHC string of "len" bytes.  Returns the number of leading bytes
 * that are valid, stopping at the first zero byte or invalid character.
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_uhc_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
/*
 * Verify one GB18030 character starting at "s" with "len" bytes available.
 *
 * Returns 1, 2 or 4 for a valid character, or -1 otherwise.  The "len"
 * tests come first in each condition so that no byte beyond the available
 * input is examined.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	int			l;

	if (!IS_HIGHBIT_SET(*s))
		l = 1;					/* ASCII */
	else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (*s >= 0x81 && *s <= 0xfe &&
			*(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
			*(s + 3) >= 0x30 && *(s + 3) <= 0x39)
			l = 4;
		else
			l = -1;
	}
	else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
	{
		/* Should be 2-byte, validate */
		if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
			(*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
			l = 2;
		else
			l = -1;
	}
	else
		l = -1;
	return l;
}
/*
 * Verify a GB18030 string of "len" bytes.  Returns the number of leading
 * bytes that are valid, stopping at the first zero byte or invalid character.
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *start = s;

	while (len > 0)
	{
		int			l;

		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(*s))
		{
			if (*s == '\0')
				break;
			l = 1;
		}
		else
		{
			l = pg_gb18030_verifychar(s, len);
			if (l == -1)
				break;
		}
		s += l;
		len -= l;
	}

	return s - start;
}
/*
 * Verify one UTF-8 character starting at "s" with "len" bytes available.
 *
 * Returns its byte length (1 to 4), or -1 if the first byte is zero, the
 * sequence is longer than "len", or pg_utf8_islegal() rejects it.
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	int			l;

	if ((*s & 0x80) == 0)
	{
		if (*s == '\0')
			return -1;
		return 1;
	}
	else if ((*s & 0xe0) == 0xc0)
		l = 2;
	else if ((*s & 0xf0) == 0xe0)
		l = 3;
	else if ((*s & 0xf8) == 0xf0)
		l = 4;
	else
		l = 1;					/* unrecognized lead byte; legality check decides */

	if (l > len)
		return -1;

	if (!pg_utf8_islegal(s, l))
		return -1;

	return l;
}
1754 * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1755 * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1756 * input byte and current state are used to compute an index into an array of
1757 * state transitions. Since the address of the next transition is dependent
1758 * on this computation, there is latency in executing the load instruction,
1759 * and the CPU is not kept busy.
1761 * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1763 * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1765 * In a shift-based DFA, the input byte is an index into array of integers
1766 * whose bit pattern encodes the state transitions. To compute the next
1767 * state, we simply right-shift the integer by the current state and apply a
1768 * mask. In this scheme, the address of the transition only depends on the
1769 * input byte, so there is better pipelining.
1771 * The naming convention for states and transitions was adopted from a UTF-8
1772 * to UTF-16/32 transcoder, whose table is reproduced below:
1774 * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1776 * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1777 * ==========================================================================
1778 * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1779 * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1781 * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1782 * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1783 * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1785 * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1786 * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1788 * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1789 * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1791 * In the most straightforward implementation, a shift-based DFA for UTF-8
1792 * requires 64-bit integers to encode the transitions, but with an SMT solver
1793 * it's possible to find state numbers such that the transitions fit within
1794 * 32-bit integers, as Dougall Johnson demonstrated:
1796 * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1798 * This packed representation is the reason for the seemingly odd choice of
1799 * state values below.
/*
 * State numbers for the shift-based UTF-8 DFA.
 *
 * A state's number doubles as the bit offset at which its outgoing
 * transition is stored inside each transition word (see utf8_advance(),
 * which shifts the word right by the current state and masks with 31).
 */

/* Error */
#define ERR 0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2 1
#define CS3 5
/* Partial states, where the first continuation byte has a restricted range */
#define P3A 6                   /* Lead was E0, check for 3-byte overlong */
#define P3B 20                  /* Lead was ED, check for surrogate */
#define P4A 25                  /* Lead was F0, check for 4-byte overlong */
#define P4B 30                  /* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define END BGN

/*
 * The encoded state transitions for the lookup table.
 *
 * Each term has the form (next_state << current_state).  Every replacement
 * list is wrapped in parentheses so that the macro expands to a single
 * operand no matter which operators surround it; the values themselves are
 * unchanged, and all of them fit in a 32-bit int.
 */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/* continuation byte */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1839 static const uint32 Utf8Transition[256] =
1841 /* ASCII */
1843 ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1844 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1845 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1846 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1848 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1849 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1850 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1851 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1853 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1854 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1855 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1856 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1858 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1859 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1860 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1861 ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1863 /* continuation bytes */
1865 /* 80..8F */
1866 CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1867 CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1869 /* 90..9F */
1870 CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1871 CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1873 /* A0..BF */
1874 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1875 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1876 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1877 CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1879 /* leading bytes */
1881 /* C0..DF */
1882 ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1883 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1884 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1885 L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1887 /* E0..EF */
1888 L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1889 L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1891 /* F0..FF */
1892 L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1893 ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1896 static void
1897 utf8_advance(const unsigned char *s, uint32 *state, int len)
1899 /* Note: We deliberately don't check the state's value here. */
1900 while (len > 0)
1903 * It's important that the mask value is 31: In most instruction sets,
1904 * a shift by a 32-bit operand is understood to be a shift by its mod
1905 * 32, so the compiler should elide the mask operation.
1907 *state = Utf8Transition[*s++] >> (*state & 31);
1908 len--;
1911 *state &= 31;
1914 static int
1915 pg_utf8_verifystr(const unsigned char *s, int len)
1917 const unsigned char *start = s;
1918 const int orig_len = len;
1919 uint32 state = BGN;
1922 * With a stride of two vector widths, gcc will unroll the loop. Even if
1923 * the compiler can unroll a longer loop, it's not worth it because we
1924 * must fall back to the byte-wise algorithm if we find any non-ASCII.
1926 #define STRIDE_LENGTH (2 * sizeof(Vector8))
1928 if (len >= STRIDE_LENGTH)
1930 while (len >= STRIDE_LENGTH)
1933 * If the chunk is all ASCII, we can skip the full UTF-8 check,
1934 * but we must first check for a non-END state, which means the
1935 * previous chunk ended in the middle of a multibyte sequence.
1937 if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1938 utf8_advance(s, &state, STRIDE_LENGTH);
1940 s += STRIDE_LENGTH;
1941 len -= STRIDE_LENGTH;
1944 /* The error state persists, so we only need to check for it here. */
1945 if (state == ERR)
1948 * Start over from the beginning with the slow path so we can
1949 * count the valid bytes.
1951 len = orig_len;
1952 s = start;
1954 else if (state != END)
1957 * The fast path exited in the middle of a multibyte sequence.
1958 * Walk backwards to find the leading byte so that the slow path
1959 * can resume checking from there. We must always backtrack at
1960 * least one byte, since the current byte could be e.g. an ASCII
1961 * byte after a 2-byte lead, which is invalid.
1965 Assert(s > start);
1966 s--;
1967 len++;
1968 Assert(IS_HIGHBIT_SET(*s));
1969 } while (pg_utf_mblen(s) <= 1);
1973 /* check remaining bytes */
1974 while (len > 0)
1976 int l;
1978 /* fast path for ASCII-subset characters */
1979 if (!IS_HIGHBIT_SET(*s))
1981 if (*s == '\0')
1982 break;
1983 l = 1;
1985 else
1987 l = pg_utf8_verifychar(s, len);
1988 if (l == -1)
1989 break;
1991 s += l;
1992 len -= l;
1995 return s - start;
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This follows the rules in RFC3629.  The narrowed ranges for the second
 * byte after certain lead bytes ensure that a Unicode code point has only
 * one encoding: a longer-than-necessary sequence with high-order zero bits
 * must not be accepted for a character that fits in fewer bytes.  Allowing
 * that would create security hazards (eg, an apparent non-ASCII character
 * that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
    unsigned char lead;
    unsigned char second_min = 0x80;
    unsigned char second_max = 0xBF;
    int         i;

    /* only lengths 1..4 are accepted; lengths 5 and 6 are rejected for now */
    if (length < 1 || length > 4)
        return false;

    /* third and fourth bytes, when present, must lie in 80..BF */
    for (i = length - 1; i >= 2; i--)
    {
        if (source[i] < 0x80 || source[i] > 0xBF)
            return false;
    }

    lead = source[0];

    if (length >= 2)
    {
        /* some lead bytes narrow the range allowed for the second byte */
        switch (lead)
        {
            case 0xE0:
                second_min = 0xA0;
                break;
            case 0xED:
                second_max = 0x9F;
                break;
            case 0xF0:
                second_min = 0x90;
                break;
            case 0xF4:
                second_max = 0x8F;
                break;
            default:
                break;
        }

        if (source[1] < second_min || source[1] > second_max)
            return false;
    }

    /* finally the lead byte itself: 80..C1 and anything above F4 are out */
    if (lead >= 0x80 && lead < 0xC2)
        return false;
    if (lead > 0xF4)
        return false;

    return true;
}
2071 *-------------------------------------------------------------------
2072 * encoding info table
2073 * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
2074 *-------------------------------------------------------------------
2076 const pg_wchar_tbl pg_wchar_table[] = {
2077 {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1}, /* PG_SQL_ASCII */
2078 {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3}, /* PG_EUC_JP */
2079 {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2}, /* PG_EUC_CN */
2080 {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3}, /* PG_EUC_KR */
2081 {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4}, /* PG_EUC_TW */
2082 {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3}, /* PG_EUC_JIS_2004 */
2083 {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4}, /* PG_UTF8 */
2084 {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4}, /* PG_MULE_INTERNAL */
2085 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN1 */
2086 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN2 */
2087 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN3 */
2088 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN4 */
2089 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN5 */
2090 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN6 */
2091 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN7 */
2092 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN8 */
2093 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN9 */
2094 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_LATIN10 */
2095 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1256 */
2096 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1258 */
2097 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN866 */
2098 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN874 */
2099 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_KOI8R */
2100 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1251 */
2101 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1252 */
2102 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* ISO-8859-5 */
2103 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* ISO-8859-6 */
2104 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* ISO-8859-7 */
2105 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* ISO-8859-8 */
2106 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1250 */
2107 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1253 */
2108 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1254 */
2109 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1255 */
2110 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_WIN1257 */
2111 {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1}, /* PG_KOI8U */
2112 {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2}, /* PG_SJIS */
2113 {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2}, /* PG_BIG5 */
2114 {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2}, /* PG_GBK */
2115 {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2}, /* PG_UHC */
2116 {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4}, /* PG_GB18030 */
2117 {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3}, /* PG_JOHAB */
2118 {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2} /* PG_SHIFT_JIS_2004 */
2122 * Returns the byte length of a multibyte character.
2124 * Caution: when dealing with text that is not certainly valid in the
2125 * specified encoding, the result may exceed the actual remaining
2126 * string length. Callers that are not prepared to deal with that
2127 * should use pg_encoding_mblen_bounded() instead.
2130 pg_encoding_mblen(int encoding, const char *mbstr)
2132 return (PG_VALID_ENCODING(encoding) ?
2133 pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2134 pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
/*
 * Returns the byte length of a multibyte character; but not more than
 * the distance to end of string.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
    /* length the encoding claims for the character, possibly past the NUL */
    const int   claimed = pg_encoding_mblen(encoding, mbstr);

    /* strnlen() stops at the terminator if it comes first */
    return strnlen(mbstr, claimed);
}
2148 * Returns the display length of a multibyte character.
2151 pg_encoding_dsplen(int encoding, const char *mbstr)
2153 return (PG_VALID_ENCODING(encoding) ?
2154 pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2155 pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2159 * Verify the first multibyte character of the given string.
2160 * Return its byte length if good, -1 if bad. (See comments above for
2161 * full details of the mbverifychar API.)
2164 pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
2166 return (PG_VALID_ENCODING(encoding) ?
2167 pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2168 pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2172 * Verify that a string is valid for the given encoding.
2173 * Returns the number of input bytes (<= len) that form a valid string.
2174 * (See comments above for full details of the mbverifystr API.)
2177 pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2179 return (PG_VALID_ENCODING(encoding) ?
2180 pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2181 pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2185 * fetch maximum length of a given encoding
2188 pg_encoding_max_length(int encoding)
2190 Assert(PG_VALID_ENCODING(encoding));
2192 return pg_wchar_table[encoding].maxmblen;