src/common/wchar.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * wchar.c
   4  *        Functions for working with multibyte characters in various encodings.
   5  *
   6  * Portions Copyright (c) 1998-2025, PostgreSQL Global Development Group
   7  *
   8  * IDENTIFICATION
   9  *        src/common/wchar.c
  10  *
  11  *-------------------------------------------------------------------------
  12  */
  13 #include "c.h"
  14
  15 #include "mb/pg_wchar.h"
  16 #include "utils/ascii.h"
  17
  18
  19 /*
  20  * Operations on multi-byte encodings are driven by a table of helper
  21  * functions.
  22  *
   23  * To add support for an encoding, define mblen(), dsplen(), verifychar()
   24  * and verifystr() for the encoding.  For server encodings, also define the
   25  * mb2wchar() and wchar2mb() conversion functions.
  26  *
  27  * These functions generally assume that their input is validly formed.
  28  * The "verifier" functions, further down in the file, have to be more
  29  * paranoid.
  30  *
  31  * We expect that mblen() does not need to examine more than the first byte
  32  * of the character to discover the correct length.  GB18030 is an exception
  33  * to that rule, though, as it also looks at second byte.  But even that
  34  * behaves in a predictable way, if you only pass the first byte: it will
  35  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
  36  * good enough for all current uses.
  37  *
  38  * Note: for the display output of psql to work properly, the return values
  39  * of the dsplen functions must conform to the Unicode standard. In particular
  40  * the NUL character is zero width and control characters are generally
  41  * width -1. It is recommended that non-ASCII encodings refer their ASCII
  42  * subset to the ASCII routines to ensure consistency.
  43  */
  44
  45 /*
  46  * SQL/ASCII
  47  */
  48 static int
  49 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  50 {
  51         int                     cnt = 0;
  52
  53         while (len > 0 && *from)
  54         {
  55                 *to++ = *from++;
  56                 len--;
  57                 cnt++;
  58         }
  59         *to = 0;
  60         return cnt;
  61 }
  62
  63 static int
  64 pg_ascii_mblen(const unsigned char *s)
  65 {
  66         return 1;
  67 }
  68
  69 static int
  70 pg_ascii_dsplen(const unsigned char *s)
  71 {
  72         if (*s == '\0')
  73                 return 0;
  74         if (*s < 0x20 || *s == 0x7f)
  75                 return -1;
  76
  77         return 1;
  78 }
  79
  80 /*
  81  * EUC
  82  */
  83 static int
  84 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  85 {
  86         int                     cnt = 0;
  87
  88         while (len > 0 && *from)
  89         {
  90                 if (*from == SS2 && len >= 2)   /* JIS X 0201 (so called "1 byte
  91                                                                                  * KANA") */
  92                 {
  93                         from++;
  94                         *to = (SS2 << 8) | *from++;
  95                         len -= 2;
  96                 }
  97                 else if (*from == SS3 && len >= 3)      /* JIS X 0212 KANJI */
  98                 {
  99                         from++;
 100                         *to = (SS3 << 16) | (*from++ << 8);
 101                         *to |= *from++;
 102                         len -= 3;
 103                 }
 104                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
 105                 {
 106                         *to = *from++ << 8;
 107                         *to |= *from++;
 108                         len -= 2;
 109                 }
 110                 else                                    /* must be ASCII */
 111                 {
 112                         *to = *from++;
 113                         len--;
 114                 }
 115                 to++;
 116                 cnt++;
 117         }
 118         *to = 0;
 119         return cnt;
 120 }
 121
 122 static inline int
 123 pg_euc_mblen(const unsigned char *s)
 124 {
 125         int                     len;
 126
 127         if (*s == SS2)
 128                 len = 2;
 129         else if (*s == SS3)
 130                 len = 3;
 131         else if (IS_HIGHBIT_SET(*s))
 132                 len = 2;
 133         else
 134                 len = 1;
 135         return len;
 136 }
 137
 138 static inline int
 139 pg_euc_dsplen(const unsigned char *s)
 140 {
 141         int                     len;
 142
 143         if (*s == SS2)
 144                 len = 2;
 145         else if (*s == SS3)
 146                 len = 2;
 147         else if (IS_HIGHBIT_SET(*s))
 148                 len = 2;
 149         else
 150                 len = pg_ascii_dsplen(s);
 151         return len;
 152 }
 153
 154 /*
 155  * EUC_JP
 156  */
 157 static int
 158 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 159 {
 160         return pg_euc2wchar_with_len(from, to, len);
 161 }
 162
 163 static int
 164 pg_eucjp_mblen(const unsigned char *s)
 165 {
 166         return pg_euc_mblen(s);
 167 }
 168
 169 static int
 170 pg_eucjp_dsplen(const unsigned char *s)
 171 {
 172         int                     len;
 173
 174         if (*s == SS2)
 175                 len = 1;
 176         else if (*s == SS3)
 177                 len = 2;
 178         else if (IS_HIGHBIT_SET(*s))
 179                 len = 2;
 180         else
 181                 len = pg_ascii_dsplen(s);
 182         return len;
 183 }
 184
 185 /*
 186  * EUC_KR
 187  */
 188 static int
 189 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 190 {
 191         return pg_euc2wchar_with_len(from, to, len);
 192 }
 193
 194 static int
 195 pg_euckr_mblen(const unsigned char *s)
 196 {
 197         return pg_euc_mblen(s);
 198 }
 199
 200 static int
 201 pg_euckr_dsplen(const unsigned char *s)
 202 {
 203         return pg_euc_dsplen(s);
 204 }
 205
 206 /*
 207  * EUC_CN
 208  *
 209  */
 210 static int
 211 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 212 {
 213         int                     cnt = 0;
 214
 215         while (len > 0 && *from)
 216         {
 217                 if (*from == SS2 && len >= 3)   /* code set 2 (unused?) */
 218                 {
 219                         from++;
 220                         *to = (SS2 << 16) | (*from++ << 8);
 221                         *to |= *from++;
 222                         len -= 3;
 223                 }
 224                 else if (*from == SS3 && len >= 3)      /* code set 3 (unused ?) */
 225                 {
 226                         from++;
 227                         *to = (SS3 << 16) | (*from++ << 8);
 228                         *to |= *from++;
 229                         len -= 3;
 230                 }
 231                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
 232                 {
 233                         *to = *from++ << 8;
 234                         *to |= *from++;
 235                         len -= 2;
 236                 }
 237                 else
 238                 {
 239                         *to = *from++;
 240                         len--;
 241                 }
 242                 to++;
 243                 cnt++;
 244         }
 245         *to = 0;
 246         return cnt;
 247 }
 248
 249 static int
 250 pg_euccn_mblen(const unsigned char *s)
 251 {
 252         int                     len;
 253
 254         if (IS_HIGHBIT_SET(*s))
 255                 len = 2;
 256         else
 257                 len = 1;
 258         return len;
 259 }
 260
 261 static int
 262 pg_euccn_dsplen(const unsigned char *s)
 263 {
 264         int                     len;
 265
 266         if (IS_HIGHBIT_SET(*s))
 267                 len = 2;
 268         else
 269                 len = pg_ascii_dsplen(s);
 270         return len;
 271 }
 272
 273 /*
 274  * EUC_TW
 275  *
 276  */
 277 static int
 278 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 279 {
 280         int                     cnt = 0;
 281
 282         while (len > 0 && *from)
 283         {
 284                 if (*from == SS2 && len >= 4)   /* code set 2 */
 285                 {
 286                         from++;
 287                         *to = (((uint32) SS2) << 24) | (*from++ << 16);
 288                         *to |= *from++ << 8;
 289                         *to |= *from++;
 290                         len -= 4;
 291                 }
 292                 else if (*from == SS3 && len >= 3)      /* code set 3 (unused?) */
 293                 {
 294                         from++;
 295                         *to = (SS3 << 16) | (*from++ << 8);
 296                         *to |= *from++;
 297                         len -= 3;
 298                 }
 299                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
 300                 {
 301                         *to = *from++ << 8;
 302                         *to |= *from++;
 303                         len -= 2;
 304                 }
 305                 else
 306                 {
 307                         *to = *from++;
 308                         len--;
 309                 }
 310                 to++;
 311                 cnt++;
 312         }
 313         *to = 0;
 314         return cnt;
 315 }
 316
 317 static int
 318 pg_euctw_mblen(const unsigned char *s)
 319 {
 320         int                     len;
 321
 322         if (*s == SS2)
 323                 len = 4;
 324         else if (*s == SS3)
 325                 len = 3;
 326         else if (IS_HIGHBIT_SET(*s))
 327                 len = 2;
 328         else
 329                 len = 1;
 330         return len;
 331 }
 332
 333 static int
 334 pg_euctw_dsplen(const unsigned char *s)
 335 {
 336         int                     len;
 337
 338         if (*s == SS2)
 339                 len = 2;
 340         else if (*s == SS3)
 341                 len = 2;
 342         else if (IS_HIGHBIT_SET(*s))
 343                 len = 2;
 344         else
 345                 len = pg_ascii_dsplen(s);
 346         return len;
 347 }
 348
 349 /*
 350  * Convert pg_wchar to EUC_* encoding.
 351  * caller must allocate enough space for "to", including a trailing zero!
 352  * len: length of from.
 353  * "from" not necessarily null terminated.
 354  */
 355 static int
 356 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
 357 {
 358         int                     cnt = 0;
 359
 360         while (len > 0 && *from)
 361         {
 362                 unsigned char c;
 363
 364                 if ((c = (*from >> 24)))
 365                 {
 366                         *to++ = c;
 367                         *to++ = (*from >> 16) & 0xff;
 368                         *to++ = (*from >> 8) & 0xff;
 369                         *to++ = *from & 0xff;
 370                         cnt += 4;
 371                 }
 372                 else if ((c = (*from >> 16)))
 373                 {
 374                         *to++ = c;
 375                         *to++ = (*from >> 8) & 0xff;
 376                         *to++ = *from & 0xff;
 377                         cnt += 3;
 378                 }
 379                 else if ((c = (*from >> 8)))
 380                 {
 381                         *to++ = c;
 382                         *to++ = *from & 0xff;
 383                         cnt += 2;
 384                 }
 385                 else
 386                 {
 387                         *to++ = *from;
 388                         cnt++;
 389                 }
 390                 from++;
 391                 len--;
 392         }
 393         *to = 0;
 394         return cnt;
 395 }
 396
 397
 398 /*
 399  * JOHAB
 400  */
 401 static int
 402 pg_johab_mblen(const unsigned char *s)
 403 {
 404         return pg_euc_mblen(s);
 405 }
 406
 407 static int
 408 pg_johab_dsplen(const unsigned char *s)
 409 {
 410         return pg_euc_dsplen(s);
 411 }
 412
 413 /*
 414  * convert UTF8 string to pg_wchar (UCS-4)
 415  * caller must allocate enough space for "to", including a trailing zero!
 416  * len: length of from.
 417  * "from" not necessarily null terminated.
 418  */
 419 static int
 420 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 421 {
 422         int                     cnt = 0;
 423         uint32          c1,
 424                                 c2,
 425                                 c3,
 426                                 c4;
 427
 428         while (len > 0 && *from)
 429         {
 430                 if ((*from & 0x80) == 0)
 431                 {
 432                         *to = *from++;
 433                         len--;
 434                 }
 435                 else if ((*from & 0xe0) == 0xc0)
 436                 {
 437                         if (len < 2)
 438                                 break;                  /* drop trailing incomplete char */
 439                         c1 = *from++ & 0x1f;
 440                         c2 = *from++ & 0x3f;
 441                         *to = (c1 << 6) | c2;
 442                         len -= 2;
 443                 }
 444                 else if ((*from & 0xf0) == 0xe0)
 445                 {
 446                         if (len < 3)
 447                                 break;                  /* drop trailing incomplete char */
 448                         c1 = *from++ & 0x0f;
 449                         c2 = *from++ & 0x3f;
 450                         c3 = *from++ & 0x3f;
 451                         *to = (c1 << 12) | (c2 << 6) | c3;
 452                         len -= 3;
 453                 }
 454                 else if ((*from & 0xf8) == 0xf0)
 455                 {
 456                         if (len < 4)
 457                                 break;                  /* drop trailing incomplete char */
 458                         c1 = *from++ & 0x07;
 459                         c2 = *from++ & 0x3f;
 460                         c3 = *from++ & 0x3f;
 461                         c4 = *from++ & 0x3f;
 462                         *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
 463                         len -= 4;
 464                 }
 465                 else
 466                 {
 467                         /* treat a bogus char as length 1; not ours to raise error */
 468                         *to = *from++;
 469                         len--;
 470                 }
 471                 to++;
 472                 cnt++;
 473         }
 474         *to = 0;
 475         return cnt;
 476 }
 477
 478
 479 /*
 480  * Trivial conversion from pg_wchar to UTF-8.
 481  * caller should allocate enough space for "to"
 482  * len: length of from.
 483  * "from" not necessarily null terminated.
 484  */
 485 static int
 486 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
 487 {
 488         int                     cnt = 0;
 489
 490         while (len > 0 && *from)
 491         {
 492                 int                     char_len;
 493
 494                 unicode_to_utf8(*from, to);
 495                 char_len = pg_utf_mblen(to);
 496                 cnt += char_len;
 497                 to += char_len;
 498                 from++;
 499                 len--;
 500         }
 501         *to = 0;
 502         return cnt;
 503 }
 504
 505 /*
 506  * Return the byte length of a UTF8 character pointed to by s
 507  *
 508  * Note: in the current implementation we do not support UTF8 sequences
 509  * of more than 4 bytes; hence do NOT return a value larger than 4.
 510  * We return "1" for any leading byte that is either flat-out illegal or
 511  * indicates a length larger than we support.
 512  *
 513  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 514  * other places would need to be fixed to change this.
 515  */
 516 int
 517 pg_utf_mblen(const unsigned char *s)
 518 {
 519         int                     len;
 520
 521         if ((*s & 0x80) == 0)
 522                 len = 1;
 523         else if ((*s & 0xe0) == 0xc0)
 524                 len = 2;
 525         else if ((*s & 0xf0) == 0xe0)
 526                 len = 3;
 527         else if ((*s & 0xf8) == 0xf0)
 528                 len = 4;
 529 #ifdef NOT_USED
 530         else if ((*s & 0xfc) == 0xf8)
 531                 len = 5;
 532         else if ((*s & 0xfe) == 0xfc)
 533                 len = 6;
 534 #endif
 535         else
 536                 len = 1;
 537         return len;
 538 }
 539
 540 /*
 541  * This is an implementation of wcwidth() and wcswidth() as defined in
 542  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
 543  * <http://www.unix.org/online.html>
 544  *
 545  * Markus Kuhn -- 2001-09-08 -- public domain
 546  *
 547  * customised for PostgreSQL
 548  *
 549  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 550  */
 551
 552 struct mbinterval
 553 {
 554         unsigned int first;
 555         unsigned int last;
 556 };
 557
 558 /* auxiliary function for binary search in interval table */
 559 static int
 560 mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
 561 {
 562         int                     min = 0;
 563         int                     mid;
 564
 565         if (ucs < table[0].first || ucs > table[max].last)
 566                 return 0;
 567         while (max >= min)
 568         {
 569                 mid = (min + max) / 2;
 570                 if (ucs > table[mid].last)
 571                         min = mid + 1;
 572                 else if (ucs < table[mid].first)
 573                         max = mid - 1;
 574                 else
 575                         return 1;
 576         }
 577
 578         return 0;
 579 }
 580
 581
 582 /* The following functions define the column width of an ISO 10646
 583  * character as follows:
 584  *
 585  *        - The null character (U+0000) has a column width of 0.
 586  *
 587  *        - Other C0/C1 control characters and DEL will lead to a return
 588  *              value of -1.
 589  *
 590  *        - Non-spacing and enclosing combining characters (general
 591  *              category code Mn, Me or Cf in the Unicode database) have a
 592  *              column width of 0.
 593  *
 594  *        - Spacing characters in the East Asian Wide (W) or East Asian
 595  *              FullWidth (F) category as defined in Unicode Technical
 596  *              Report #11 have a column width of 2.
 597  *
 598  *        - All remaining characters (including all printable
 599  *              ISO 8859-1 and WGL4 characters, Unicode control characters,
 600  *              etc.) have a column width of 1.
 601  *
 602  * This implementation assumes that wchar_t characters are encoded
 603  * in ISO 10646.
 604  */
 605
 606 static int
 607 ucs_wcwidth(pg_wchar ucs)
 608 {
 609 #include "common/unicode_nonspacing_table.h"
 610 #include "common/unicode_east_asian_fw_table.h"
 611
 612         /* test for 8-bit control characters */
 613         if (ucs == 0)
 614                 return 0;
 615
 616         if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
 617                 return -1;
 618
 619         /*
 620          * binary search in table of non-spacing characters
 621          *
 622          * XXX: In the official Unicode sources, it is possible for a character to
 623          * be described as both non-spacing and wide at the same time. As of
 624          * Unicode 13.0, treating the non-spacing property as the determining
 625          * factor for display width leads to the correct behavior, so do that
 626          * search first.
 627          */
 628         if (mbbisearch(ucs, nonspacing,
 629                                    sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
 630                 return 0;
 631
 632         /* binary search in table of wide characters */
 633         if (mbbisearch(ucs, east_asian_fw,
 634                                    sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
 635                 return 2;
 636
 637         return 1;
 638 }
 639
 640 static int
 641 pg_utf_dsplen(const unsigned char *s)
 642 {
 643         return ucs_wcwidth(utf8_to_unicode(s));
 644 }
 645
 646 /*
 647  * convert mule internal code to pg_wchar
 648  * caller should allocate enough space for "to"
 649  * len: length of from.
 650  * "from" not necessarily null terminated.
 651  */
 652 static int
 653 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 654 {
 655         int                     cnt = 0;
 656
 657         while (len > 0 && *from)
 658         {
 659                 if (IS_LC1(*from) && len >= 2)
 660                 {
 661                         *to = *from++ << 16;
 662                         *to |= *from++;
 663                         len -= 2;
 664                 }
 665                 else if (IS_LCPRV1(*from) && len >= 3)
 666                 {
 667                         from++;
 668                         *to = *from++ << 16;
 669                         *to |= *from++;
 670                         len -= 3;
 671                 }
 672                 else if (IS_LC2(*from) && len >= 3)
 673                 {
 674                         *to = *from++ << 16;
 675                         *to |= *from++ << 8;
 676                         *to |= *from++;
 677                         len -= 3;
 678                 }
 679                 else if (IS_LCPRV2(*from) && len >= 4)
 680                 {
 681                         from++;
 682                         *to = *from++ << 16;
 683                         *to |= *from++ << 8;
 684                         *to |= *from++;
 685                         len -= 4;
 686                 }
 687                 else
 688                 {                                               /* assume ASCII */
 689                         *to = (unsigned char) *from++;
 690                         len--;
 691                 }
 692                 to++;
 693                 cnt++;
 694         }
 695         *to = 0;
 696         return cnt;
 697 }
 698
 699 /*
 700  * convert pg_wchar to mule internal code
 701  * caller should allocate enough space for "to"
 702  * len: length of from.
 703  * "from" not necessarily null terminated.
 704  */
 705 static int
 706 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
 707 {
 708         int                     cnt = 0;
 709
 710         while (len > 0 && *from)
 711         {
 712                 unsigned char lb;
 713
 714                 lb = (*from >> 16) & 0xff;
 715                 if (IS_LC1(lb))
 716                 {
 717                         *to++ = lb;
 718                         *to++ = *from & 0xff;
 719                         cnt += 2;
 720                 }
 721                 else if (IS_LC2(lb))
 722                 {
 723                         *to++ = lb;
 724                         *to++ = (*from >> 8) & 0xff;
 725                         *to++ = *from & 0xff;
 726                         cnt += 3;
 727                 }
 728                 else if (IS_LCPRV1_A_RANGE(lb))
 729                 {
 730                         *to++ = LCPRV1_A;
 731                         *to++ = lb;
 732                         *to++ = *from & 0xff;
 733                         cnt += 3;
 734                 }
 735                 else if (IS_LCPRV1_B_RANGE(lb))
 736                 {
 737                         *to++ = LCPRV1_B;
 738                         *to++ = lb;
 739                         *to++ = *from & 0xff;
 740                         cnt += 3;
 741                 }
 742                 else if (IS_LCPRV2_A_RANGE(lb))
 743                 {
 744                         *to++ = LCPRV2_A;
 745                         *to++ = lb;
 746                         *to++ = (*from >> 8) & 0xff;
 747                         *to++ = *from & 0xff;
 748                         cnt += 4;
 749                 }
 750                 else if (IS_LCPRV2_B_RANGE(lb))
 751                 {
 752                         *to++ = LCPRV2_B;
 753                         *to++ = lb;
 754                         *to++ = (*from >> 8) & 0xff;
 755                         *to++ = *from & 0xff;
 756                         cnt += 4;
 757                 }
 758                 else
 759                 {
 760                         *to++ = *from & 0xff;
 761                         cnt += 1;
 762                 }
 763                 from++;
 764                 len--;
 765         }
 766         *to = 0;
 767         return cnt;
 768 }
 769
 770 /* exported for direct use by conv.c */
 771 int
 772 pg_mule_mblen(const unsigned char *s)
 773 {
 774         int                     len;
 775
 776         if (IS_LC1(*s))
 777                 len = 2;
 778         else if (IS_LCPRV1(*s))
 779                 len = 3;
 780         else if (IS_LC2(*s))
 781                 len = 3;
 782         else if (IS_LCPRV2(*s))
 783                 len = 4;
 784         else
 785                 len = 1;                                /* assume ASCII */
 786         return len;
 787 }
 788
 789 static int
 790 pg_mule_dsplen(const unsigned char *s)
 791 {
 792         int                     len;
 793
 794         /*
 795          * Note: it's not really appropriate to assume that all multibyte charsets
 796          * are double-wide on screen.  But this seems an okay approximation for
 797          * the MULE charsets we currently support.
 798          */
 799
 800         if (IS_LC1(*s))
 801                 len = 1;
 802         else if (IS_LCPRV1(*s))
 803                 len = 1;
 804         else if (IS_LC2(*s))
 805                 len = 2;
 806         else if (IS_LCPRV2(*s))
 807                 len = 2;
 808         else
 809                 len = 1;                                /* assume ASCII */
 810
 811         return len;
 812 }
 813
 814 /*
 815  * ISO8859-1
 816  */
 817 static int
 818 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 819 {
 820         int                     cnt = 0;
 821
 822         while (len > 0 && *from)
 823         {
 824                 *to++ = *from++;
 825                 len--;
 826                 cnt++;
 827         }
 828         *to = 0;
 829         return cnt;
 830 }
 831
 832 /*
 833  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
 834  * high bits.
 835  * caller should allocate enough space for "to"
 836  * len: length of from.
 837  * "from" not necessarily null terminated.
 838  */
 839 static int
 840 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
 841 {
 842         int                     cnt = 0;
 843
 844         while (len > 0 && *from)
 845         {
 846                 *to++ = *from++;
 847                 len--;
 848                 cnt++;
 849         }
 850         *to = 0;
 851         return cnt;
 852 }
 853
 854 static int
 855 pg_latin1_mblen(const unsigned char *s)
 856 {
 857         return 1;
 858 }
 859
 860 static int
 861 pg_latin1_dsplen(const unsigned char *s)
 862 {
 863         return pg_ascii_dsplen(s);
 864 }
 865
 866 /*
 867  * SJIS
 868  */
 869 static int
 870 pg_sjis_mblen(const unsigned char *s)
 871 {
 872         int                     len;
 873
 874         if (*s >= 0xa1 && *s <= 0xdf)
 875                 len = 1;                                /* 1 byte kana? */
 876         else if (IS_HIGHBIT_SET(*s))
 877                 len = 2;                                /* kanji? */
 878         else
 879                 len = 1;                                /* should be ASCII */
 880         return len;
 881 }
 882
 883 static int
 884 pg_sjis_dsplen(const unsigned char *s)
 885 {
 886         int                     len;
 887
 888         if (*s >= 0xa1 && *s <= 0xdf)
 889                 len = 1;                                /* 1 byte kana? */
 890         else if (IS_HIGHBIT_SET(*s))
 891                 len = 2;                                /* kanji? */
 892         else
 893                 len = pg_ascii_dsplen(s);       /* should be ASCII */
 894         return len;
 895 }
 896
 897 /*
 898  * Big5
 899  */
 900 static int
 901 pg_big5_mblen(const unsigned char *s)
 902 {
 903         int                     len;
 904
 905         if (IS_HIGHBIT_SET(*s))
 906                 len = 2;                                /* kanji? */
 907         else
 908                 len = 1;                                /* should be ASCII */
 909         return len;
 910 }
 911
 912 static int
 913 pg_big5_dsplen(const unsigned char *s)
 914 {
 915         int                     len;
 916
 917         if (IS_HIGHBIT_SET(*s))
 918                 len = 2;                                /* kanji? */
 919         else
 920                 len = pg_ascii_dsplen(s);       /* should be ASCII */
 921         return len;
 922 }
 923
 924 /*
 925  * GBK
 926  */
 927 static int
 928 pg_gbk_mblen(const unsigned char *s)
 929 {
 930         int                     len;
 931
 932         if (IS_HIGHBIT_SET(*s))
 933                 len = 2;                                /* kanji? */
 934         else
 935                 len = 1;                                /* should be ASCII */
 936         return len;
 937 }
 938
 939 static int
 940 pg_gbk_dsplen(const unsigned char *s)
 941 {
 942         int                     len;
 943
 944         if (IS_HIGHBIT_SET(*s))
 945                 len = 2;                                /* kanji? */
 946         else
 947                 len = pg_ascii_dsplen(s);       /* should be ASCII */
 948         return len;
 949 }
 950
 951 /*
 952  * UHC
 953  */
 954 static int
 955 pg_uhc_mblen(const unsigned char *s)
 956 {
 957         int                     len;
 958
 959         if (IS_HIGHBIT_SET(*s))
 960                 len = 2;                                /* 2byte? */
 961         else
 962                 len = 1;                                /* should be ASCII */
 963         return len;
 964 }
 965
 966 static int
 967 pg_uhc_dsplen(const unsigned char *s)
 968 {
 969         int                     len;
 970
 971         if (IS_HIGHBIT_SET(*s))
 972                 len = 2;                                /* 2byte? */
 973         else
 974                 len = pg_ascii_dsplen(s);       /* should be ASCII */
 975         return len;
 976 }
 977
 978 /*
 979  * GB18030
 980  *      Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 981  */
 982
 983 /*
 984  * Unlike all other mblen() functions, this also looks at the second byte of
 985  * the input.  However, if you only pass the first byte of a multi-byte
 986  * string, and \0 as the second byte, this still works in a predictable way:
 987  * a 4-byte character will be reported as two 2-byte characters.  That's
 988  * enough for all current uses, as a client-only encoding.  It works that
 989  * way, because in any valid 4-byte GB18030-encoded character, the third and
 990  * fourth byte look like a 2-byte encoded character, when looked at
 991  * separately.
 992  */
 993 static int
 994 pg_gb18030_mblen(const unsigned char *s)
 995 {
 996         int                     len;
 997
 998         if (!IS_HIGHBIT_SET(*s))
 999                 len = 1;                                /* ASCII */
1000         else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
1001                 len = 4;
1002         else
1003                 len = 2;
1004         return len;
1005 }
1006
/*
 * Display width of a GB18030 character: 2 columns when the high bit of the
 * first byte is set; otherwise defer to the ASCII rules.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (IS_HIGHBIT_SET(*s))
		return 2;

	return pg_ascii_dsplen(s);	/* ASCII */
}
1018
1019 /*
1020  *-------------------------------------------------------------------
1021  * multibyte sequence validators
1022  *
1023  * The verifychar functions accept "s", a pointer to the first byte of a
1024  * string, and "len", the remaining length of the string.  If there is a
1025  * validly encoded character beginning at *s, return its length in bytes;
1026  * else return -1.
1027  *
1028  * The verifystr functions also accept "s", a pointer to a string and "len",
1029  * the length of the string.  They verify the whole string, and return the
1030  * number of input bytes (<= len) that are valid.  In other words, if the
1031  * whole string is valid, verifystr returns "len", otherwise it returns the
1032  * byte offset of the first invalid character.  The verifystr functions must
1033  * test for and reject zeroes in the input.
1034  *
1035  * The verifychar functions can assume that len > 0 and that *s != '\0', but
1036  * they must test for and reject zeroes in any additional bytes of a
1037  * multibyte character.  Note that this definition allows the function for a
1038  * single-byte encoding to be just "return 1".
1039  *-------------------------------------------------------------------
1040  */
/*
 * SQL_ASCII per-character verifier: every character is one byte long, and
 * no further checking is done here.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
        /* neither argument needs to be examined */
        return 1;
}
1046
/*
 * SQL_ASCII string verifier: the only invalid byte is a zero byte.  Returns
 * the offset of the first zero byte within the first "len" bytes of "s", or
 * "len" if there is none.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
        const unsigned char *zero = memchr(s, 0, len);

        return (zero != NULL) ? (int) (zero - s) : len;
}
1057
1058 #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
1059
1060 static int
1061 pg_eucjp_verifychar(const unsigned char *s, int len)
1062 {
1063         int                     l;
1064         unsigned char c1,
1065                                 c2;
1066
1067         c1 = *s++;
1068
1069         switch (c1)
1070         {
1071                 case SS2:                               /* JIS X 0201 */
1072                         l = 2;
1073                         if (l > len)
1074                                 return -1;
1075                         c2 = *s++;
1076                         if (c2 < 0xa1 || c2 > 0xdf)
1077                                 return -1;
1078                         break;
1079
1080                 case SS3:                               /* JIS X 0212 */
1081                         l = 3;
1082                         if (l > len)
1083                                 return -1;
1084                         c2 = *s++;
1085                         if (!IS_EUC_RANGE_VALID(c2))
1086                                 return -1;
1087                         c2 = *s++;
1088                         if (!IS_EUC_RANGE_VALID(c2))
1089                                 return -1;
1090                         break;
1091
1092                 default:
1093                         if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1094                         {
1095                                 l = 2;
1096                                 if (l > len)
1097                                         return -1;
1098                                 if (!IS_EUC_RANGE_VALID(c1))
1099                                         return -1;
1100                                 c2 = *s++;
1101                                 if (!IS_EUC_RANGE_VALID(c2))
1102                                         return -1;
1103                         }
1104                         else
1105                                 /* must be ASCII */
1106                         {
1107                                 l = 1;
1108                         }
1109                         break;
1110         }
1111
1112         return l;
1113 }
1114
/*
 * Verify an EUC_JP string of "len" bytes.  Returns the number of leading
 * bytes that are valid; scanning stops at the first zero byte or at the
 * first character rejected by pg_eucjp_verifychar().
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
        const unsigned char *p = s;
        int                     remaining = len;

        while (remaining > 0)
        {
                int                     chlen = 1;

                if (IS_HIGHBIT_SET(*p))
                {
                        /* non-ASCII: let the per-character verifier decide */
                        chlen = pg_eucjp_verifychar(p, remaining);
                        if (chlen == -1)
                                break;
                }
                else if (*p == '\0')
                        break;                  /* zero bytes are never accepted */

                p += chlen;
                remaining -= chlen;
        }

        return p - s;
}
1143
/*
 * Verify one EUC_KR character starting at *s, with "len" bytes available.
 * Returns 1 for an ASCII byte, 2 for a two-byte character whose bytes are
 * both in the EUC range, and -1 otherwise.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
        if (!IS_HIGHBIT_SET(s[0]))
                return 1;                       /* must be ASCII */

        if (len < 2)
                return -1;
        if (!IS_EUC_RANGE_VALID(s[0]) || !IS_EUC_RANGE_VALID(s[1]))
                return -1;
        return 2;
}
1172
/*
 * Verify an EUC_KR string of "len" bytes.  Returns the number of leading
 * bytes that are valid; scanning stops at the first zero byte or at the
 * first character rejected by pg_euckr_verifychar().
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
        const unsigned char *p = s;
        int                     remaining = len;

        while (remaining > 0)
        {
                int                     chlen = 1;

                if (IS_HIGHBIT_SET(*p))
                {
                        /* non-ASCII: let the per-character verifier decide */
                        chlen = pg_euckr_verifychar(p, remaining);
                        if (chlen == -1)
                                break;
                }
                else if (*p == '\0')
                        break;                  /* zero bytes are never accepted */

                p += chlen;
                remaining -= chlen;
        }

        return p - s;
}
1201
/*
 * EUC-CN byte sequences are exactly the same as EUC-KR, so the EUC-CN
 * verifiers are simply aliases for the EUC-KR ones.
 */
#define pg_euccn_verifychar     pg_euckr_verifychar
#define pg_euccn_verifystr      pg_euckr_verifystr
1205
1206 static int
1207 pg_euctw_verifychar(const unsigned char *s, int len)
1208 {
1209         int                     l;
1210         unsigned char c1,
1211                                 c2;
1212
1213         c1 = *s++;
1214
1215         switch (c1)
1216         {
1217                 case SS2:                               /* CNS 11643 Plane 1-7 */
1218                         l = 4;
1219                         if (l > len)
1220                                 return -1;
1221                         c2 = *s++;
1222                         if (c2 < 0xa1 || c2 > 0xa7)
1223                                 return -1;
1224                         c2 = *s++;
1225                         if (!IS_EUC_RANGE_VALID(c2))
1226                                 return -1;
1227                         c2 = *s++;
1228                         if (!IS_EUC_RANGE_VALID(c2))
1229                                 return -1;
1230                         break;
1231
1232                 case SS3:                               /* unused */
1233                         return -1;
1234
1235                 default:
1236                         if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1237                         {
1238                                 l = 2;
1239                                 if (l > len)
1240                                         return -1;
1241                                 /* no further range check on c1? */
1242                                 c2 = *s++;
1243                                 if (!IS_EUC_RANGE_VALID(c2))
1244                                         return -1;
1245                         }
1246                         else
1247                                 /* must be ASCII */
1248                         {
1249                                 l = 1;
1250                         }
1251                         break;
1252         }
1253         return l;
1254 }
1255
/*
 * Verify an EUC_TW string of "len" bytes.  Returns the number of leading
 * bytes that are valid; scanning stops at the first zero byte or at the
 * first character rejected by pg_euctw_verifychar().
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
        const unsigned char *p = s;
        int                     remaining = len;

        while (remaining > 0)
        {
                int                     chlen = 1;

                if (IS_HIGHBIT_SET(*p))
                {
                        /* non-ASCII: let the per-character verifier decide */
                        chlen = pg_euctw_verifychar(p, remaining);
                        if (chlen == -1)
                                break;
                }
                else if (*p == '\0')
                        break;                  /* zero bytes are never accepted */

                p += chlen;
                remaining -= chlen;
        }

        return p - s;
}
1284
/*
 * Verify one JOHAB character starting at *s, with "len" bytes available.
 * The length comes from pg_johab_mblen(); for a high-bit-set lead byte,
 * every trailing byte must be in the EUC range.  Returns the length, or -1
 * if the character is truncated or a trailing byte is out of range.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
        const int       mbl = pg_johab_mblen(s);
        int                     i;

        if (mbl > len)
                return -1;

        /* a lead byte without the high bit is accepted without further checks */
        if (!IS_HIGHBIT_SET(s[0]))
                return mbl;

        for (i = 1; i < mbl; i++)
        {
                if (!IS_EUC_RANGE_VALID(s[i]))
                        return -1;
        }

        return mbl;
}
1308
/*
 * Verify a JOHAB string of "len" bytes.  Returns the number of leading
 * bytes that are valid; scanning stops at the first zero byte or at the
 * first character rejected by pg_johab_verifychar().
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
        const unsigned char *p = s;
        int                     remaining = len;

        while (remaining > 0)
        {
                int                     chlen = 1;

                if (IS_HIGHBIT_SET(*p))
                {
                        /* non-ASCII: let the per-character verifier decide */
                        chlen = pg_johab_verifychar(p, remaining);
                        if (chlen == -1)
                                break;
                }
                else if (*p == '\0')
                        break;                  /* zero bytes are never accepted */

                p += chlen;
                remaining -= chlen;
        }

        return p - s;
}
1337
/*
 * Verify one MULE_INTERNAL character starting at *s, with "len" bytes
 * available.  The length comes from pg_mule_mblen(); every trailing byte
 * must have its high bit set.  Returns the length, or -1 on failure.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
        const int       mbl = pg_mule_mblen(s);
        int                     i;

        if (mbl > len)
                return -1;

        for (i = 1; i < mbl; i++)
        {
                if (!IS_HIGHBIT_SET(s[i]))
                        return -1;
        }

        return mbl;
}
1358
/*
 * Verify a MULE_INTERNAL string of "len" bytes.  Returns the number of
 * leading bytes that are valid; scanning stops at the first zero byte or at
 * the first character rejected by pg_mule_verifychar().
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
        const unsigned char *p = s;
        int                     remaining = len;

        while (remaining > 0)
        {
                int                     chlen = 1;

                if (IS_HIGHBIT_SET(*p))
                {
                        /* non-ASCII: let the per-character verifier decide */
                        chlen = pg_mule_verifychar(p, remaining);
                        if (chlen == -1)
                                break;
                }
                else if (*p == '\0')
                        break;                  /* zero bytes are never accepted */

                p += chlen;
                remaining -= chlen;
        }

        return p - s;
}
1387
/*
 * Single-byte (LATIN1-style) per-character verifier: every character is one
 * byte long, and no further checking is done here.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
        /* neither argument needs to be examined */
        return 1;
}
1393
/*
 * Single-byte (LATIN1-style) string verifier: the only invalid byte is a
 * zero byte.  Returns the offset of the first zero byte within the first
 * "len" bytes of "s", or "len" if there is none.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
        const unsigned char *zero = memchr(s, 0, len);

        return (zero != NULL) ? (int) (zero - s) : len;
}
1404
/*
 * Verify one SJIS character starting at *s, with "len" bytes available.
 * The length comes from pg_sjis_mblen().  A multibyte character must pass
 * ISSJISHEAD() on its first byte and ISSJISTAIL() on its second.  Returns
 * the length, or -1 on failure.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
        const int       mbl = pg_sjis_mblen(s);
        unsigned char head,
                                tail;

        if (mbl > len)
                return -1;

        if (mbl == 1)
                return mbl;                     /* pg_sjis_mblen already verified it */

        head = s[0];
        tail = s[1];
        if (ISSJISHEAD(head) && ISSJISTAIL(tail))
                return mbl;

        return -1;
}
1427
/*
 * Verify an SJIS string of "len" bytes.  Returns the number of leading
 * bytes that are valid; scanning stops at the first zero byte or at the
 * first character rejected by pg_sjis_verifychar().
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
        const unsigned char *p = s;
        int                     remaining = len;

        while (remaining > 0)
        {
                int                     chlen = 1;

                if (IS_HIGHBIT_SET(*p))
                {
                        /* non-ASCII: let the per-character verifier decide */
                        chlen = pg_sjis_verifychar(p, remaining);
                        if (chlen == -1)
                                break;
                }
                else if (*p == '\0')
                        break;                  /* zero bytes are never accepted */

                p += chlen;
                remaining -= chlen;
        }

        return p - s;
}
1456
/*
 * Verify one BIG5 character starting at *s, with "len" bytes available.
 * The length comes from pg_big5_mblen(); the only check on trailing bytes
 * is that none of them is zero.  Returns the length, or -1 on failure.
 */
static int
pg_big5_verifychar(const unsigned char *s, int len)
{
        const int       mbl = pg_big5_mblen(s);
        int                     i;

        if (mbl > len)
                return -1;

        for (i = 1; i < mbl; i++)
        {
                if (s[i] == '\0')
                        return -1;
        }

        return mbl;
}
1476
/*
 * Verify a BIG5 string of "len" bytes.  Returns the number of leading bytes
 * that are valid; scanning stops at the first zero byte or at the first
 * character rejected by pg_big5_verifychar().
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
        const unsigned char *p = s;
        int                     remaining = len;

        while (remaining > 0)
        {
                int                     chlen = 1;

                if (IS_HIGHBIT_SET(*p))
                {
                        /* non-ASCII: let the per-character verifier decide */
                        chlen = pg_big5_verifychar(p, remaining);
                        if (chlen == -1)
                                break;
                }
                else if (*p == '\0')
                        break;                  /* zero bytes are never accepted */

                p += chlen;
                remaining -= chlen;
        }

        return p - s;
}
1505
/*
 * Verify one GBK character starting at *s, with "len" bytes available.
 * The length comes from pg_gbk_mblen(); the only check on trailing bytes is
 * that none of them is zero.  Returns the length, or -1 on failure.
 */
static int
pg_gbk_verifychar(const unsigned char *s, int len)
{
        const int       mbl = pg_gbk_mblen(s);
        int                     i;

        if (mbl > len)
                return -1;

        for (i = 1; i < mbl; i++)
        {
                if (s[i] == '\0')
                        return -1;
        }

        return mbl;
}
1525
/*
 * Verify a GBK string of "len" bytes.  Returns the number of leading bytes
 * that are valid; scanning stops at the first zero byte or at the first
 * character rejected by pg_gbk_verifychar().
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
        const unsigned char *p = s;
        int                     remaining = len;

        while (remaining > 0)
        {
                int                     chlen = 1;

                if (IS_HIGHBIT_SET(*p))
                {
                        /* non-ASCII: let the per-character verifier decide */
                        chlen = pg_gbk_verifychar(p, remaining);
                        if (chlen == -1)
                                break;
                }
                else if (*p == '\0')
                        break;                  /* zero bytes are never accepted */

                p += chlen;
                remaining -= chlen;
        }

        return p - s;
}
1554
/*
 * Verify one UHC character starting at *s, with "len" bytes available.
 * The length comes from pg_uhc_mblen(); the only check on trailing bytes is
 * that none of them is zero.  Returns the length, or -1 on failure.
 */
static int
pg_uhc_verifychar(const unsigned char *s, int len)
{
        const int       mbl = pg_uhc_mblen(s);
        int                     i;

        if (mbl > len)
                return -1;

        for (i = 1; i < mbl; i++)
        {
                if (s[i] == '\0')
                        return -1;
        }

        return mbl;
}
1574
/*
 * Verify a UHC string of "len" bytes.  Returns the number of leading bytes
 * that are valid; scanning stops at the first zero byte or at the first
 * character rejected by pg_uhc_verifychar().
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
        const unsigned char *p = s;
        int                     remaining = len;

        while (remaining > 0)
        {
                int                     chlen = 1;

                if (IS_HIGHBIT_SET(*p))
                {
                        /* non-ASCII: let the per-character verifier decide */
                        chlen = pg_uhc_verifychar(p, remaining);
                        if (chlen == -1)
                                break;
                }
                else if (*p == '\0')
                        break;                  /* zero bytes are never accepted */

                p += chlen;
                remaining -= chlen;
        }

        return p - s;
}
1603
/*
 * Verify one GB18030 character starting at *s, with "len" bytes available.
 * Returns 1 for ASCII, 4 or 2 for a well-formed multibyte character, and -1
 * otherwise.  The 4-byte form is tried first: it is selected when at least
 * four bytes are available and the second byte is in 0x30..0x39.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
        const unsigned char b0 = s[0];

        if (!IS_HIGHBIT_SET(b0))
                return 1;                       /* ASCII */

        if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
        {
                /* Should be 4-byte, validate remaining bytes */
                if (b0 < 0x81 || b0 > 0xfe)
                        return -1;
                if (s[2] < 0x81 || s[2] > 0xfe)
                        return -1;
                if (s[3] < 0x30 || s[3] > 0x39)
                        return -1;
                return 4;
        }

        if (len >= 2 && b0 >= 0x81 && b0 <= 0xfe)
        {
                /* Should be 2-byte, validate */
                const unsigned char b1 = s[1];

                if ((b1 >= 0x40 && b1 <= 0x7e) || (b1 >= 0x80 && b1 <= 0xfe))
                        return 2;
                return -1;
        }

        return -1;
}
1634
/*
 * Verify a GB18030 string of "len" bytes.  Returns the number of leading
 * bytes that are valid; scanning stops at the first zero byte or at the
 * first character rejected by pg_gb18030_verifychar().
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
        const unsigned char *p = s;
        int                     remaining = len;

        while (remaining > 0)
        {
                int                     chlen = 1;

                if (IS_HIGHBIT_SET(*p))
                {
                        /* non-ASCII: let the per-character verifier decide */
                        chlen = pg_gb18030_verifychar(p, remaining);
                        if (chlen == -1)
                                break;
                }
                else if (*p == '\0')
                        break;                  /* zero bytes are never accepted */

                p += chlen;
                remaining -= chlen;
        }

        return p - s;
}
1663
/*
 * Verify one UTF-8 character starting at *s, with "len" bytes available.
 * Returns the character's length in bytes, or -1 if the first byte is zero,
 * the sequence is truncated, or pg_utf8_islegal() rejects it.
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
        const unsigned char lead = *s;
        int                     l;

        /* single-byte case: anything except a zero byte is accepted */
        if ((lead & 0x80) == 0)
                return (lead == '\0') ? -1 : 1;

        /* derive the sequence length from the lead byte's high-order bits */
        if ((lead & 0xe0) == 0xc0)
                l = 2;
        else if ((lead & 0xf0) == 0xe0)
                l = 3;
        else if ((lead & 0xf8) == 0xf0)
                l = 4;
        else
                l = 1;                          /* no recognized lead pattern */

        if (l > len || !pg_utf8_islegal(s, l))
                return -1;

        return l;
}
1692
1693 /*
1694  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1695  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1696  * input byte and current state are used to compute an index into an array of
1697  * state transitions. Since the address of the next transition is dependent
1698  * on this computation, there is latency in executing the load instruction,
1699  * and the CPU is not kept busy.
1700  *
1701  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1702  *
1703  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1704  *
1705  * In a shift-based DFA, the input byte is an index into array of integers
1706  * whose bit pattern encodes the state transitions. To compute the next
1707  * state, we simply right-shift the integer by the current state and apply a
1708  * mask. In this scheme, the address of the transition only depends on the
1709  * input byte, so there is better pipelining.
1710  *
1711  * The naming convention for states and transitions was adopted from a UTF-8
1712  * to UTF-16/32 transcoder, whose table is reproduced below:
1713  *
1714  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1715  *
1716  * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
1717  * ==========================================================================
1718  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
1719  * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
1720  *                                                                  |
1721  * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
1722  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
1723  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
1724  *                                                                  |
1725  * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
1726  * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
1727  *                                                                  |
1728  * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
1729  * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
1730  *
1731  * In the most straightforward implementation, a shift-based DFA for UTF-8
1732  * requires 64-bit integers to encode the transitions, but with an SMT solver
1733  * it's possible to find state numbers such that the transitions fit within
1734  * 32-bit integers, as Dougall Johnson demonstrated:
1735  *
1736  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1737  *
1738  * This packed representation is the reason for the seemingly odd choice of
1739  * state values below.
1740  */
1741
/*
 * DFA states.  Each state value doubles as a bit offset: the next state is
 * obtained by right-shifting a transition word by the current state and
 * masking to five bits.
 */

/* Error */
#define ERR  0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2  1
#define CS3  5
/* Partial states, where the first continuation byte has a restricted range */
#define P3A  6                                  /* Lead was E0, check for 3-byte overlong */
#define P3B 20                                  /* Lead was ED, check for surrogate */
#define P4A 25                                  /* Lead was F0, check for 4-byte overlong */
#define P4B 30                                  /* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define END BGN

/*
 * The encoded state transitions for the lookup table.  Each macro is written
 * as (next_state << current_state), OR-ed together when a byte class is
 * valid in several states.  Every replacement list is fully parenthesized so
 * that it expands as a single operand wherever it is used.
 */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/* continuation byte */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1778
/*
 * Transition table for the shift-based UTF-8 DFA, indexed by input byte.
 * Each entry packs the next state for every current state; utf8_advance()
 * extracts it by right-shifting the entry by the current state.
 */
static const uint32 Utf8Transition[256] =
{
        /* ASCII */

        /* 00..1F; the zero byte itself is marked illegal */
        ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,

        /* 20..3F */
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,

        /* 40..5F */
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,

        /* 60..7F */
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
        ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,

        /* continuation bytes */

        /* 80..8F */
        CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
        CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,

        /* 90..9F */
        CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
        CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,

        /* A0..BF */
        CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
        CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
        CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
        CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,

        /* leading bytes */

        /* C0..DF; C0 and C1 are marked illegal */
        ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
        L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
        L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
        L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,

        /* E0..EF; E0 and ED get their own partial states */
        L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
        L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,

        /* F0..FF; F0 and F4 get partial states, F5 and above are illegal */
        L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
        ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
};
1835
1836 static void
1837 utf8_advance(const unsigned char *s, uint32 *state, int len)
1838 {
1839         /* Note: We deliberately don't check the state's value here. */
1840         while (len > 0)
1841         {
1842                 /*
1843                  * It's important that the mask value is 31: In most instruction sets,
1844                  * a shift by a 32-bit operand is understood to be a shift by its mod
1845                  * 32, so the compiler should elide the mask operation.
1846                  */
1847                 *state = Utf8Transition[*s++] >> (*state & 31);
1848                 len--;
1849         }
1850
1851         *state &= 31;
1852 }
1853
/*
 * Verify a UTF-8 string of "len" bytes and return the number of leading
 * bytes that are valid.
 *
 * Input of at least STRIDE_LENGTH bytes is first processed in
 * STRIDE_LENGTH-sized chunks: a chunk is skipped when the DFA is in the END
 * state and is_valid_ascii() accepts it, otherwise it is fed through
 * utf8_advance().  Whatever remains (or the whole string again, if the
 * chunked pass hit the error state) is then checked one character at a time
 * with pg_utf8_verifychar(), which is what determines the returned offset.
 */
static int
pg_utf8_verifystr(const unsigned char *s, int len)
{
        const unsigned char *start = s;
        const int       orig_len = len;
        uint32          state = BGN;

/*
 * With a stride of two vector widths, gcc will unroll the loop. Even if
 * the compiler can unroll a longer loop, it's not worth it because we
 * must fall back to the byte-wise algorithm if we find any non-ASCII.
 */
#define STRIDE_LENGTH (2 * sizeof(Vector8))

        if (len >= STRIDE_LENGTH)
        {
                while (len >= STRIDE_LENGTH)
                {
                        /*
                         * If the chunk is all ASCII, we can skip the full UTF-8 check,
                         * but we must first check for a non-END state, which means the
                         * previous chunk ended in the middle of a multibyte sequence.
                         */
                        if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
                                utf8_advance(s, &state, STRIDE_LENGTH);

                        s += STRIDE_LENGTH;
                        len -= STRIDE_LENGTH;
                }

                /* The error state persists, so we only need to check for it here. */
                if (state == ERR)
                {
                        /*
                         * Start over from the beginning with the slow path so we can
                         * count the valid bytes.
                         */
                        len = orig_len;
                        s = start;
                }
                else if (state != END)
                {
                        /*
                         * The fast path exited in the middle of a multibyte sequence.
                         * Walk backwards to find the leading byte so that the slow path
                         * can resume checking from there. We must always backtrack at
                         * least one byte, since the current byte could be e.g. an ASCII
                         * byte after a 2-byte lead, which is invalid.
                         */
                        do
                        {
                                Assert(s > start);
                                s--;
                                len++;
                                Assert(IS_HIGHBIT_SET(*s));
                        } while (pg_utf_mblen(s) <= 1);
                }
        }

        /*
         * check remaining bytes; at this point s/len describe either the
         * unprocessed tail, the tail plus a backtracked partial sequence, or
         * the whole input
         */
        while (len > 0)
        {
                int                     l;

                /* fast path for ASCII-subset characters */
                if (!IS_HIGHBIT_SET(*s))
                {
                        /* a zero byte ends the valid prefix */
                        if (*s == '\0')
                                break;
                        l = 1;
                }
                else
                {
                        l = pg_utf8_verifychar(s, len);
                        if (l == -1)
                                break;
                }
                s += l;
                len -= l;
        }

        /* s now points just past the last valid character */
        return s - start;
}
1937
/*
 * Decide whether a single UTF-8 encoded character is well-formed.
 *
 * The checks follow RFC3629.  Besides requiring ordinary continuation
 * bytes, the second byte is held to a tighter range for certain lead
 * bytes, so that every Unicode code point has exactly one accepted
 * encoding: a sequence that is longer than necessary (leading zero bits
 * padded out into extra bytes) is refused.  Accepting such sequences
 * would be a security hazard, since a seemingly non-ASCII character could
 * decode to plain ASCII.
 *
 * "length" is expected to come from pg_utf_mblen(), and the caller must
 * already have made sure that that many bytes are available at "source".
 *
 * Returns true if the character is acceptable, false otherwise.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
        unsigned char lead;
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xBF;
        int           pos;

        /* only lengths 1..4 are accepted; lengths 5 and 6 are rejected for now */
        if (length < 1 || length > 4)
                return false;

        /* third and fourth bytes, when present, are plain continuation bytes */
        for (pos = length - 1; pos >= 2; pos--)
        {
                unsigned char trail = source[pos];

                if (trail < 0x80 || trail > 0xBF)
                        return false;
        }

        lead = source[0];

        if (length >= 2)
        {
                unsigned char second = source[1];

                /*
                 * A few lead bytes narrow the permitted range of the second
                 * byte: E0 and F0 raise the lower bound (no overlong forms),
                 * ED lowers the upper bound (no UTF-16 surrogates), and F4
                 * lowers it too (nothing above U+10FFFF).
                 */
                if (lead == 0xE0)
                        second_min = 0xA0;
                else if (lead == 0xED)
                        second_max = 0x9F;
                else if (lead == 0xF0)
                        second_min = 0x90;
                else if (lead == 0xF4)
                        second_max = 0x8F;

                if (second < second_min || second > second_max)
                        return false;
        }

        /*
         * Finally the lead byte itself: 0x80..0xC1 can never start a
         * character, and nothing above 0xF4 is allowed.
         */
        if (lead >= 0x80 && lead < 0xC2)
                return false;

        return lead <= 0xF4;
}
2008
2009
/*
 *-------------------------------------------------------------------
 * encoding info table
 *
 * One entry per encoding, placed by designated initializer at the index
 * given by the encoding's PG_xxx identifier.  The accessor functions
 * below index this array directly with an encoding ID.
 *
 * Judging by the helper names, each entry lists, in order: the
 * multibyte-to-wchar and wchar-to-multibyte conversion functions, then
 * the mblen, dsplen, mbverifychar and mbverifystr helpers, and finally
 * maxmblen (the value reported by pg_encoding_max_length()).  See the
 * declaration of pg_wchar_tbl for the authoritative field list.
 *-------------------------------------------------------------------
 */
const pg_wchar_tbl pg_wchar_table[] = {
        [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
        [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
        [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
        [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
        [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
        [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
        [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
        [PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
        /* The entries from here through PG_KOI8U all reuse the same latin1 helper set. */
        [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
        [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},

        /*
         * The remaining entries supply 0 for both wchar conversion functions.
         * Per the notes at the top of this file, those conversions only need
         * to be defined for server encodings.
         */
        [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
        [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
        [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
        [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
        [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
        [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
        [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
};
2059
2060 /*
2061  * Returns the byte length of a multibyte character.
2062  *
2063  * Caution: when dealing with text that is not certainly valid in the
2064  * specified encoding, the result may exceed the actual remaining
2065  * string length.  Callers that are not prepared to deal with that
2066  * should use pg_encoding_mblen_bounded() instead.
2067  */
2068 int
2069 pg_encoding_mblen(int encoding, const char *mbstr)
2070 {
2071         return (PG_VALID_ENCODING(encoding) ?
2072                         pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2073                         pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
2074 }
2075
/*
 * Like pg_encoding_mblen(), except that the result never reaches past
 * the string's terminating NUL: it is capped at the distance to the end
 * of the string.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
        int           nominal_len = pg_encoding_mblen(encoding, mbstr);

        /* strnlen() stops at a NUL found within the first nominal_len bytes */
        return strnlen(mbstr, nominal_len);
}
2085
2086 /*
2087  * Returns the display length of a multibyte character.
2088  */
2089 int
2090 pg_encoding_dsplen(int encoding, const char *mbstr)
2091 {
2092         return (PG_VALID_ENCODING(encoding) ?
2093                         pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2094                         pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2095 }
2096
2097 /*
2098  * Verify the first multibyte character of the given string.
2099  * Return its byte length if good, -1 if bad.  (See comments above for
2100  * full details of the mbverifychar API.)
2101  */
2102 int
2103 pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
2104 {
2105         return (PG_VALID_ENCODING(encoding) ?
2106                         pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2107                         pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2108 }
2109
2110 /*
2111  * Verify that a string is valid for the given encoding.
2112  * Returns the number of input bytes (<= len) that form a valid string.
2113  * (See comments above for full details of the mbverifystr API.)
2114  */
2115 int
2116 pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2117 {
2118         return (PG_VALID_ENCODING(encoding) ?
2119                         pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2120                         pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2121 }
2122
2123 /*
2124  * fetch maximum length of a given encoding
2125  */
2126 int
2127 pg_encoding_max_length(int encoding)
2128 {
2129         Assert(PG_VALID_ENCODING(encoding));
2130
2131         return pg_wchar_table[encoding].maxmblen;
2132 }