src/common/wchar.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * wchar.c
   4  *        Functions for working with multibyte characters in various encodings.
   5  *
   6  * Portions Copyright (c) 1998-2023, PostgreSQL Global Development Group
   7  *
   8  * IDENTIFICATION
   9  *        src/common/wchar.c
  10  *
  11  *-------------------------------------------------------------------------
  12  */
  13 #include "c.h"
  14
  15 #include "mb/pg_wchar.h"
  16
  17
  18 /*
  19  * Operations on multi-byte encodings are driven by a table of helper
  20  * functions.
  21  *
   22  * To add support for an encoding, define mblen(), dsplen(), verifychar() and
  23  * verifystr() for the encoding.  For server-encodings, also define mb2wchar()
  24  * and wchar2mb() conversion functions.
  25  *
  26  * These functions generally assume that their input is validly formed.
  27  * The "verifier" functions, further down in the file, have to be more
  28  * paranoid.
  29  *
  30  * We expect that mblen() does not need to examine more than the first byte
  31  * of the character to discover the correct length.  GB18030 is an exception
  32  * to that rule, though, as it also looks at second byte.  But even that
  33  * behaves in a predictable way, if you only pass the first byte: it will
  34  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
  35  * good enough for all current uses.
  36  *
  37  * Note: for the display output of psql to work properly, the return values
  38  * of the dsplen functions must conform to the Unicode standard. In particular
  39  * the NUL character is zero width and control characters are generally
  40  * width -1. It is recommended that non-ASCII encodings refer their ASCII
  41  * subset to the ASCII routines to ensure consistency.
  42  */
  43
  44 /*
  45  * SQL/ASCII
  46  */
  47 static int
  48 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  49 {
  50         int                     cnt = 0;
  51
  52         while (len > 0 && *from)
  53         {
  54                 *to++ = *from++;
  55                 len--;
  56                 cnt++;
  57         }
  58         *to = 0;
  59         return cnt;
  60 }
  61
  62 static int
  63 pg_ascii_mblen(const unsigned char *s)
  64 {
  65         return 1;
  66 }
  67
  68 static int
  69 pg_ascii_dsplen(const unsigned char *s)
  70 {
  71         if (*s == '\0')
  72                 return 0;
  73         if (*s < 0x20 || *s == 0x7f)
  74                 return -1;
  75
  76         return 1;
  77 }
  78
  79 /*
  80  * EUC
  81  */
  82 static int
  83 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
  84 {
  85         int                     cnt = 0;
  86
  87         while (len > 0 && *from)
  88         {
  89                 if (*from == SS2 && len >= 2)   /* JIS X 0201 (so called "1 byte
  90                                                                                  * KANA") */
  91                 {
  92                         from++;
  93                         *to = (SS2 << 8) | *from++;
  94                         len -= 2;
  95                 }
  96                 else if (*from == SS3 && len >= 3)      /* JIS X 0212 KANJI */
  97                 {
  98                         from++;
  99                         *to = (SS3 << 16) | (*from++ << 8);
 100                         *to |= *from++;
 101                         len -= 3;
 102                 }
 103                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
 104                 {
 105                         *to = *from++ << 8;
 106                         *to |= *from++;
 107                         len -= 2;
 108                 }
 109                 else                                    /* must be ASCII */
 110                 {
 111                         *to = *from++;
 112                         len--;
 113                 }
 114                 to++;
 115                 cnt++;
 116         }
 117         *to = 0;
 118         return cnt;
 119 }
 120
 121 static inline int
 122 pg_euc_mblen(const unsigned char *s)
 123 {
 124         int                     len;
 125
 126         if (*s == SS2)
 127                 len = 2;
 128         else if (*s == SS3)
 129                 len = 3;
 130         else if (IS_HIGHBIT_SET(*s))
 131                 len = 2;
 132         else
 133                 len = 1;
 134         return len;
 135 }
 136
 137 static inline int
 138 pg_euc_dsplen(const unsigned char *s)
 139 {
 140         int                     len;
 141
 142         if (*s == SS2)
 143                 len = 2;
 144         else if (*s == SS3)
 145                 len = 2;
 146         else if (IS_HIGHBIT_SET(*s))
 147                 len = 2;
 148         else
 149                 len = pg_ascii_dsplen(s);
 150         return len;
 151 }
 152
 153 /*
 154  * EUC_JP
 155  */
 156 static int
 157 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 158 {
 159         return pg_euc2wchar_with_len(from, to, len);
 160 }
 161
 162 static int
 163 pg_eucjp_mblen(const unsigned char *s)
 164 {
 165         return pg_euc_mblen(s);
 166 }
 167
 168 static int
 169 pg_eucjp_dsplen(const unsigned char *s)
 170 {
 171         int                     len;
 172
 173         if (*s == SS2)
 174                 len = 1;
 175         else if (*s == SS3)
 176                 len = 2;
 177         else if (IS_HIGHBIT_SET(*s))
 178                 len = 2;
 179         else
 180                 len = pg_ascii_dsplen(s);
 181         return len;
 182 }
 183
 184 /*
 185  * EUC_KR
 186  */
 187 static int
 188 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 189 {
 190         return pg_euc2wchar_with_len(from, to, len);
 191 }
 192
 193 static int
 194 pg_euckr_mblen(const unsigned char *s)
 195 {
 196         return pg_euc_mblen(s);
 197 }
 198
 199 static int
 200 pg_euckr_dsplen(const unsigned char *s)
 201 {
 202         return pg_euc_dsplen(s);
 203 }
 204
 205 /*
 206  * EUC_CN
 207  *
 208  */
 209 static int
 210 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 211 {
 212         int                     cnt = 0;
 213
 214         while (len > 0 && *from)
 215         {
 216                 if (*from == SS2 && len >= 3)   /* code set 2 (unused?) */
 217                 {
 218                         from++;
 219                         *to = (SS2 << 16) | (*from++ << 8);
 220                         *to |= *from++;
 221                         len -= 3;
 222                 }
 223                 else if (*from == SS3 && len >= 3)      /* code set 3 (unused ?) */
 224                 {
 225                         from++;
 226                         *to = (SS3 << 16) | (*from++ << 8);
 227                         *to |= *from++;
 228                         len -= 3;
 229                 }
 230                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
 231                 {
 232                         *to = *from++ << 8;
 233                         *to |= *from++;
 234                         len -= 2;
 235                 }
 236                 else
 237                 {
 238                         *to = *from++;
 239                         len--;
 240                 }
 241                 to++;
 242                 cnt++;
 243         }
 244         *to = 0;
 245         return cnt;
 246 }
 247
 248 static int
 249 pg_euccn_mblen(const unsigned char *s)
 250 {
 251         int                     len;
 252
 253         if (IS_HIGHBIT_SET(*s))
 254                 len = 2;
 255         else
 256                 len = 1;
 257         return len;
 258 }
 259
 260 static int
 261 pg_euccn_dsplen(const unsigned char *s)
 262 {
 263         int                     len;
 264
 265         if (IS_HIGHBIT_SET(*s))
 266                 len = 2;
 267         else
 268                 len = pg_ascii_dsplen(s);
 269         return len;
 270 }
 271
 272 /*
 273  * EUC_TW
 274  *
 275  */
 276 static int
 277 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 278 {
 279         int                     cnt = 0;
 280
 281         while (len > 0 && *from)
 282         {
 283                 if (*from == SS2 && len >= 4)   /* code set 2 */
 284                 {
 285                         from++;
 286                         *to = (((uint32) SS2) << 24) | (*from++ << 16);
 287                         *to |= *from++ << 8;
 288                         *to |= *from++;
 289                         len -= 4;
 290                 }
 291                 else if (*from == SS3 && len >= 3)      /* code set 3 (unused?) */
 292                 {
 293                         from++;
 294                         *to = (SS3 << 16) | (*from++ << 8);
 295                         *to |= *from++;
 296                         len -= 3;
 297                 }
 298                 else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
 299                 {
 300                         *to = *from++ << 8;
 301                         *to |= *from++;
 302                         len -= 2;
 303                 }
 304                 else
 305                 {
 306                         *to = *from++;
 307                         len--;
 308                 }
 309                 to++;
 310                 cnt++;
 311         }
 312         *to = 0;
 313         return cnt;
 314 }
 315
 316 static int
 317 pg_euctw_mblen(const unsigned char *s)
 318 {
 319         int                     len;
 320
 321         if (*s == SS2)
 322                 len = 4;
 323         else if (*s == SS3)
 324                 len = 3;
 325         else if (IS_HIGHBIT_SET(*s))
 326                 len = 2;
 327         else
 328                 len = 1;
 329         return len;
 330 }
 331
 332 static int
 333 pg_euctw_dsplen(const unsigned char *s)
 334 {
 335         int                     len;
 336
 337         if (*s == SS2)
 338                 len = 2;
 339         else if (*s == SS3)
 340                 len = 2;
 341         else if (IS_HIGHBIT_SET(*s))
 342                 len = 2;
 343         else
 344                 len = pg_ascii_dsplen(s);
 345         return len;
 346 }
 347
 348 /*
 349  * Convert pg_wchar to EUC_* encoding.
 350  * caller must allocate enough space for "to", including a trailing zero!
 351  * len: length of from.
 352  * "from" not necessarily null terminated.
 353  */
 354 static int
 355 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
 356 {
 357         int                     cnt = 0;
 358
 359         while (len > 0 && *from)
 360         {
 361                 unsigned char c;
 362
 363                 if ((c = (*from >> 24)))
 364                 {
 365                         *to++ = c;
 366                         *to++ = (*from >> 16) & 0xff;
 367                         *to++ = (*from >> 8) & 0xff;
 368                         *to++ = *from & 0xff;
 369                         cnt += 4;
 370                 }
 371                 else if ((c = (*from >> 16)))
 372                 {
 373                         *to++ = c;
 374                         *to++ = (*from >> 8) & 0xff;
 375                         *to++ = *from & 0xff;
 376                         cnt += 3;
 377                 }
 378                 else if ((c = (*from >> 8)))
 379                 {
 380                         *to++ = c;
 381                         *to++ = *from & 0xff;
 382                         cnt += 2;
 383                 }
 384                 else
 385                 {
 386                         *to++ = *from;
 387                         cnt++;
 388                 }
 389                 from++;
 390                 len--;
 391         }
 392         *to = 0;
 393         return cnt;
 394 }
 395
 396
 397 /*
 398  * JOHAB
 399  */
 400 static int
 401 pg_johab_mblen(const unsigned char *s)
 402 {
 403         return pg_euc_mblen(s);
 404 }
 405
 406 static int
 407 pg_johab_dsplen(const unsigned char *s)
 408 {
 409         return pg_euc_dsplen(s);
 410 }
 411
 412 /*
 413  * convert UTF8 string to pg_wchar (UCS-4)
 414  * caller must allocate enough space for "to", including a trailing zero!
 415  * len: length of from.
 416  * "from" not necessarily null terminated.
 417  */
 418 static int
 419 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 420 {
 421         int                     cnt = 0;
 422         uint32          c1,
 423                                 c2,
 424                                 c3,
 425                                 c4;
 426
 427         while (len > 0 && *from)
 428         {
 429                 if ((*from & 0x80) == 0)
 430                 {
 431                         *to = *from++;
 432                         len--;
 433                 }
 434                 else if ((*from & 0xe0) == 0xc0)
 435                 {
 436                         if (len < 2)
 437                                 break;                  /* drop trailing incomplete char */
 438                         c1 = *from++ & 0x1f;
 439                         c2 = *from++ & 0x3f;
 440                         *to = (c1 << 6) | c2;
 441                         len -= 2;
 442                 }
 443                 else if ((*from & 0xf0) == 0xe0)
 444                 {
 445                         if (len < 3)
 446                                 break;                  /* drop trailing incomplete char */
 447                         c1 = *from++ & 0x0f;
 448                         c2 = *from++ & 0x3f;
 449                         c3 = *from++ & 0x3f;
 450                         *to = (c1 << 12) | (c2 << 6) | c3;
 451                         len -= 3;
 452                 }
 453                 else if ((*from & 0xf8) == 0xf0)
 454                 {
 455                         if (len < 4)
 456                                 break;                  /* drop trailing incomplete char */
 457                         c1 = *from++ & 0x07;
 458                         c2 = *from++ & 0x3f;
 459                         c3 = *from++ & 0x3f;
 460                         c4 = *from++ & 0x3f;
 461                         *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
 462                         len -= 4;
 463                 }
 464                 else
 465                 {
 466                         /* treat a bogus char as length 1; not ours to raise error */
 467                         *to = *from++;
 468                         len--;
 469                 }
 470                 to++;
 471                 cnt++;
 472         }
 473         *to = 0;
 474         return cnt;
 475 }
 476
 477
 478 /*
 479  * Map a Unicode code point to UTF-8.  utf8string must have 4 bytes of
 480  * space allocated.
 481  */
 482 unsigned char *
 483 unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
 484 {
 485         if (c <= 0x7F)
 486         {
 487                 utf8string[0] = c;
 488         }
 489         else if (c <= 0x7FF)
 490         {
 491                 utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
 492                 utf8string[1] = 0x80 | (c & 0x3F);
 493         }
 494         else if (c <= 0xFFFF)
 495         {
 496                 utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
 497                 utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
 498                 utf8string[2] = 0x80 | (c & 0x3F);
 499         }
 500         else
 501         {
 502                 utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
 503                 utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
 504                 utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
 505                 utf8string[3] = 0x80 | (c & 0x3F);
 506         }
 507
 508         return utf8string;
 509 }
 510
 511 /*
 512  * Trivial conversion from pg_wchar to UTF-8.
 513  * caller should allocate enough space for "to"
 514  * len: length of from.
 515  * "from" not necessarily null terminated.
 516  */
 517 static int
 518 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
 519 {
 520         int                     cnt = 0;
 521
 522         while (len > 0 && *from)
 523         {
 524                 int                     char_len;
 525
 526                 unicode_to_utf8(*from, to);
 527                 char_len = pg_utf_mblen(to);
 528                 cnt += char_len;
 529                 to += char_len;
 530                 from++;
 531                 len--;
 532         }
 533         *to = 0;
 534         return cnt;
 535 }
 536
 537 /*
 538  * Return the byte length of a UTF8 character pointed to by s
 539  *
 540  * Note: in the current implementation we do not support UTF8 sequences
 541  * of more than 4 bytes; hence do NOT return a value larger than 4.
 542  * We return "1" for any leading byte that is either flat-out illegal or
 543  * indicates a length larger than we support.
 544  *
 545  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 546  * other places would need to be fixed to change this.
 547  */
 548 int
 549 pg_utf_mblen(const unsigned char *s)
 550 {
 551         int                     len;
 552
 553         if ((*s & 0x80) == 0)
 554                 len = 1;
 555         else if ((*s & 0xe0) == 0xc0)
 556                 len = 2;
 557         else if ((*s & 0xf0) == 0xe0)
 558                 len = 3;
 559         else if ((*s & 0xf8) == 0xf0)
 560                 len = 4;
 561 #ifdef NOT_USED
 562         else if ((*s & 0xfc) == 0xf8)
 563                 len = 5;
 564         else if ((*s & 0xfe) == 0xfc)
 565                 len = 6;
 566 #endif
 567         else
 568                 len = 1;
 569         return len;
 570 }
 571
 572 /*
 573  * This is an implementation of wcwidth() and wcswidth() as defined in
 574  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
 575  * <http://www.unix.org/online.html>
 576  *
 577  * Markus Kuhn -- 2001-09-08 -- public domain
 578  *
 579  * customised for PostgreSQL
 580  *
 581  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 582  */
 583
 584 struct mbinterval
 585 {
 586         unsigned int first;
 587         unsigned int last;
 588 };
 589
 590 /* auxiliary function for binary search in interval table */
 591 static int
 592 mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
 593 {
 594         int                     min = 0;
 595         int                     mid;
 596
 597         if (ucs < table[0].first || ucs > table[max].last)
 598                 return 0;
 599         while (max >= min)
 600         {
 601                 mid = (min + max) / 2;
 602                 if (ucs > table[mid].last)
 603                         min = mid + 1;
 604                 else if (ucs < table[mid].first)
 605                         max = mid - 1;
 606                 else
 607                         return 1;
 608         }
 609
 610         return 0;
 611 }
 612
 613
 614 /* The following functions define the column width of an ISO 10646
 615  * character as follows:
 616  *
 617  *        - The null character (U+0000) has a column width of 0.
 618  *
 619  *        - Other C0/C1 control characters and DEL will lead to a return
 620  *              value of -1.
 621  *
 622  *        - Non-spacing and enclosing combining characters (general
 623  *              category code Mn, Me or Cf in the Unicode database) have a
 624  *              column width of 0.
 625  *
 626  *        - Spacing characters in the East Asian Wide (W) or East Asian
 627  *              FullWidth (F) category as defined in Unicode Technical
 628  *              Report #11 have a column width of 2.
 629  *
 630  *        - All remaining characters (including all printable
 631  *              ISO 8859-1 and WGL4 characters, Unicode control characters,
 632  *              etc.) have a column width of 1.
 633  *
 634  * This implementation assumes that wchar_t characters are encoded
 635  * in ISO 10646.
 636  */
 637
 638 static int
 639 ucs_wcwidth(pg_wchar ucs)
 640 {
 641 #include "common/unicode_nonspacing_table.h"
 642 #include "common/unicode_east_asian_fw_table.h"
 643
 644         /* test for 8-bit control characters */
 645         if (ucs == 0)
 646                 return 0;
 647
 648         if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
 649                 return -1;
 650
 651         /*
 652          * binary search in table of non-spacing characters
 653          *
 654          * XXX: In the official Unicode sources, it is possible for a character to
 655          * be described as both non-spacing and wide at the same time. As of
 656          * Unicode 13.0, treating the non-spacing property as the determining
 657          * factor for display width leads to the correct behavior, so do that
 658          * search first.
 659          */
 660         if (mbbisearch(ucs, nonspacing,
 661                                    sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
 662                 return 0;
 663
 664         /* binary search in table of wide characters */
 665         if (mbbisearch(ucs, east_asian_fw,
 666                                    sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
 667                 return 2;
 668
 669         return 1;
 670 }
 671
 672 /*
 673  * Convert a UTF-8 character to a Unicode code point.
 674  * This is a one-character version of pg_utf2wchar_with_len.
 675  *
 676  * No error checks here, c must point to a long-enough string.
 677  */
 678 pg_wchar
 679 utf8_to_unicode(const unsigned char *c)
 680 {
 681         if ((*c & 0x80) == 0)
 682                 return (pg_wchar) c[0];
 683         else if ((*c & 0xe0) == 0xc0)
 684                 return (pg_wchar) (((c[0] & 0x1f) << 6) |
 685                                                    (c[1] & 0x3f));
 686         else if ((*c & 0xf0) == 0xe0)
 687                 return (pg_wchar) (((c[0] & 0x0f) << 12) |
 688                                                    ((c[1] & 0x3f) << 6) |
 689                                                    (c[2] & 0x3f));
 690         else if ((*c & 0xf8) == 0xf0)
 691                 return (pg_wchar) (((c[0] & 0x07) << 18) |
 692                                                    ((c[1] & 0x3f) << 12) |
 693                                                    ((c[2] & 0x3f) << 6) |
 694                                                    (c[3] & 0x3f));
 695         else
 696                 /* that is an invalid code on purpose */
 697                 return 0xffffffff;
 698 }
 699
 700 static int
 701 pg_utf_dsplen(const unsigned char *s)
 702 {
 703         return ucs_wcwidth(utf8_to_unicode(s));
 704 }
 705
 706 /*
 707  * convert mule internal code to pg_wchar
 708  * caller should allocate enough space for "to"
 709  * len: length of from.
 710  * "from" not necessarily null terminated.
 711  */
 712 static int
 713 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 714 {
 715         int                     cnt = 0;
 716
 717         while (len > 0 && *from)
 718         {
 719                 if (IS_LC1(*from) && len >= 2)
 720                 {
 721                         *to = *from++ << 16;
 722                         *to |= *from++;
 723                         len -= 2;
 724                 }
 725                 else if (IS_LCPRV1(*from) && len >= 3)
 726                 {
 727                         from++;
 728                         *to = *from++ << 16;
 729                         *to |= *from++;
 730                         len -= 3;
 731                 }
 732                 else if (IS_LC2(*from) && len >= 3)
 733                 {
 734                         *to = *from++ << 16;
 735                         *to |= *from++ << 8;
 736                         *to |= *from++;
 737                         len -= 3;
 738                 }
 739                 else if (IS_LCPRV2(*from) && len >= 4)
 740                 {
 741                         from++;
 742                         *to = *from++ << 16;
 743                         *to |= *from++ << 8;
 744                         *to |= *from++;
 745                         len -= 4;
 746                 }
 747                 else
 748                 {                                               /* assume ASCII */
 749                         *to = (unsigned char) *from++;
 750                         len--;
 751                 }
 752                 to++;
 753                 cnt++;
 754         }
 755         *to = 0;
 756         return cnt;
 757 }
 758
 759 /*
 760  * convert pg_wchar to mule internal code
 761  * caller should allocate enough space for "to"
 762  * len: length of from.
 763  * "from" not necessarily null terminated.
 764  */
 765 static int
 766 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
 767 {
 768         int                     cnt = 0;
 769
 770         while (len > 0 && *from)
 771         {
 772                 unsigned char lb;
 773
 774                 lb = (*from >> 16) & 0xff;
 775                 if (IS_LC1(lb))
 776                 {
 777                         *to++ = lb;
 778                         *to++ = *from & 0xff;
 779                         cnt += 2;
 780                 }
 781                 else if (IS_LC2(lb))
 782                 {
 783                         *to++ = lb;
 784                         *to++ = (*from >> 8) & 0xff;
 785                         *to++ = *from & 0xff;
 786                         cnt += 3;
 787                 }
 788                 else if (IS_LCPRV1_A_RANGE(lb))
 789                 {
 790                         *to++ = LCPRV1_A;
 791                         *to++ = lb;
 792                         *to++ = *from & 0xff;
 793                         cnt += 3;
 794                 }
 795                 else if (IS_LCPRV1_B_RANGE(lb))
 796                 {
 797                         *to++ = LCPRV1_B;
 798                         *to++ = lb;
 799                         *to++ = *from & 0xff;
 800                         cnt += 3;
 801                 }
 802                 else if (IS_LCPRV2_A_RANGE(lb))
 803                 {
 804                         *to++ = LCPRV2_A;
 805                         *to++ = lb;
 806                         *to++ = (*from >> 8) & 0xff;
 807                         *to++ = *from & 0xff;
 808                         cnt += 4;
 809                 }
 810                 else if (IS_LCPRV2_B_RANGE(lb))
 811                 {
 812                         *to++ = LCPRV2_B;
 813                         *to++ = lb;
 814                         *to++ = (*from >> 8) & 0xff;
 815                         *to++ = *from & 0xff;
 816                         cnt += 4;
 817                 }
 818                 else
 819                 {
 820                         *to++ = *from & 0xff;
 821                         cnt += 1;
 822                 }
 823                 from++;
 824                 len--;
 825         }
 826         *to = 0;
 827         return cnt;
 828 }
 829
 830 /* exported for direct use by conv.c */
 831 int
 832 pg_mule_mblen(const unsigned char *s)
 833 {
 834         int                     len;
 835
 836         if (IS_LC1(*s))
 837                 len = 2;
 838         else if (IS_LCPRV1(*s))
 839                 len = 3;
 840         else if (IS_LC2(*s))
 841                 len = 3;
 842         else if (IS_LCPRV2(*s))
 843                 len = 4;
 844         else
 845                 len = 1;                                /* assume ASCII */
 846         return len;
 847 }
 848
 849 static int
 850 pg_mule_dsplen(const unsigned char *s)
 851 {
 852         int                     len;
 853
 854         /*
 855          * Note: it's not really appropriate to assume that all multibyte charsets
 856          * are double-wide on screen.  But this seems an okay approximation for
 857          * the MULE charsets we currently support.
 858          */
 859
 860         if (IS_LC1(*s))
 861                 len = 1;
 862         else if (IS_LCPRV1(*s))
 863                 len = 1;
 864         else if (IS_LC2(*s))
 865                 len = 2;
 866         else if (IS_LCPRV2(*s))
 867                 len = 2;
 868         else
 869                 len = 1;                                /* assume ASCII */
 870
 871         return len;
 872 }
 873
 874 /*
 875  * ISO8859-1
 876  */
 877 static int
 878 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 879 {
 880         int                     cnt = 0;
 881
 882         while (len > 0 && *from)
 883         {
 884                 *to++ = *from++;
 885                 len--;
 886                 cnt++;
 887         }
 888         *to = 0;
 889         return cnt;
 890 }
 891
 892 /*
 893  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
 894  * high bits.
 895  * caller should allocate enough space for "to"
 896  * len: length of from.
 897  * "from" not necessarily null terminated.
 898  */
 899 static int
 900 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
 901 {
 902         int                     cnt = 0;
 903
 904         while (len > 0 && *from)
 905         {
 906                 *to++ = *from++;
 907                 len--;
 908                 cnt++;
 909         }
 910         *to = 0;
 911         return cnt;
 912 }
 913
 914 static int
 915 pg_latin1_mblen(const unsigned char *s)
 916 {
 917         return 1;
 918 }
 919
 920 static int
 921 pg_latin1_dsplen(const unsigned char *s)
 922 {
 923         return pg_ascii_dsplen(s);
 924 }
 925
 926 /*
 927  * SJIS
 928  */
 929 static int
 930 pg_sjis_mblen(const unsigned char *s)
 931 {
 932         int                     len;
 933
 934         if (*s >= 0xa1 && *s <= 0xdf)
 935                 len = 1;                                /* 1 byte kana? */
 936         else if (IS_HIGHBIT_SET(*s))
 937                 len = 2;                                /* kanji? */
 938         else
 939                 len = 1;                                /* should be ASCII */
 940         return len;
 941 }
 942
 943 static int
 944 pg_sjis_dsplen(const unsigned char *s)
 945 {
 946         int                     len;
 947
 948         if (*s >= 0xa1 && *s <= 0xdf)
 949                 len = 1;                                /* 1 byte kana? */
 950         else if (IS_HIGHBIT_SET(*s))
 951                 len = 2;                                /* kanji? */
 952         else
 953                 len = pg_ascii_dsplen(s);       /* should be ASCII */
 954         return len;
 955 }
 956
 957 /*
 958  * Big5
 959  */
 960 static int
 961 pg_big5_mblen(const unsigned char *s)
 962 {
 963         int                     len;
 964
 965         if (IS_HIGHBIT_SET(*s))
 966                 len = 2;                                /* kanji? */
 967         else
 968                 len = 1;                                /* should be ASCII */
 969         return len;
 970 }
 971
 972 static int
 973 pg_big5_dsplen(const unsigned char *s)
 974 {
 975         int                     len;
 976
 977         if (IS_HIGHBIT_SET(*s))
 978                 len = 2;                                /* kanji? */
 979         else
 980                 len = pg_ascii_dsplen(s);       /* should be ASCII */
 981         return len;
 982 }
 983
 984 /*
 985  * GBK
 986  */
 987 static int
 988 pg_gbk_mblen(const unsigned char *s)
 989 {
 990         int                     len;
 991
 992         if (IS_HIGHBIT_SET(*s))
 993                 len = 2;                                /* kanji? */
 994         else
 995                 len = 1;                                /* should be ASCII */
 996         return len;
 997 }
 998
 999 static int
1000 pg_gbk_dsplen(const unsigned char *s)
1001 {
1002         int                     len;
1003
1004         if (IS_HIGHBIT_SET(*s))
1005                 len = 2;                                /* kanji? */
1006         else
1007                 len = pg_ascii_dsplen(s);       /* should be ASCII */
1008         return len;
1009 }
1010
1011 /*
1012  * UHC
1013  */
/*
 * Byte length of the UHC character starting at *s, decided from the lead
 * byte alone: two bytes if its high bit is set, otherwise one (ASCII).
 */
static int
pg_uhc_mblen(const unsigned char *s)
{
	return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1025
/*
 * Display width of the UHC character starting at *s: two columns for a
 * high-bit lead byte, otherwise whatever the ASCII rules say.
 */
static int
pg_uhc_dsplen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(*s))
		return pg_ascii_dsplen(s);	/* ASCII subset */
	return 2;					/* 2-byte character */
}
1037
1038 /*
1039  * GB18030
1040  *      Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1041  */
1042
/*
 * Byte length of the GB18030 character starting at *s.
 *
 * Unlike all other mblen() functions, this one also inspects the second
 * byte: a high-bit lead byte followed by a byte in 0x30..0x39 starts a
 * 4-byte character, any other high-bit lead byte starts a 2-byte character.
 *
 * If the caller passes only the first byte of a multi-byte character,
 * followed by \0, the result is still predictable: a 4-byte character is
 * reported as two 2-byte characters.  That works because the third and
 * fourth bytes of any valid 4-byte GB18030 character look like a 2-byte
 * character when examined on their own, and it is enough for all current
 * uses, as a client-only encoding.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */
	if (s[1] >= 0x30 && s[1] <= 0x39)
		return 4;
	return 2;
}
1066
/*
 * Display width of the GB18030 character starting at *s: two columns for
 * any high-bit lead byte, otherwise whatever the ASCII rules say.
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
	if (!IS_HIGHBIT_SET(*s))
		return pg_ascii_dsplen(s);	/* ASCII */
	return 2;
}
1078
1079 /*
1080  *-------------------------------------------------------------------
1081  * multibyte sequence validators
1082  *
1083  * The verifychar functions accept "s", a pointer to the first byte of a
1084  * string, and "len", the remaining length of the string.  If there is a
1085  * validly encoded character beginning at *s, return its length in bytes;
1086  * else return -1.
1087  *
1088  * The verifystr functions also accept "s", a pointer to a string and "len",
1089  * the length of the string.  They verify the whole string, and return the
1090  * number of input bytes (<= len) that are valid.  In other words, if the
1091  * whole string is valid, verifystr returns "len", otherwise it returns the
1092  * byte offset of the first invalid character.  The verifystr functions must
1093  * test for and reject zeroes in the input.
1094  *
1095  * The verifychar functions can assume that len > 0 and that *s != '\0', but
1096  * they must test for and reject zeroes in any additional bytes of a
1097  * multibyte character.  Note that this definition allows the function for a
1098  * single-byte encoding to be just "return 1".
1099  *-------------------------------------------------------------------
1100  */
/*
 * Single-byte encoding: whatever byte the caller points at is accepted as a
 * one-byte character.  Neither argument needs to be examined, since
 * verifychar callers already guarantee len > 0 and *s != '\0'.
 */
static int
pg_ascii_verifychar(const unsigned char *s, int len)
{
	/* s and len are intentionally unused */
	return 1;
}
1106
/*
 * Return the number of leading bytes of s (at most len) that are valid.
 * Only a zero byte is invalid, so this is the offset of the first zero byte,
 * or len if there is none.
 */
static int
pg_ascii_verifystr(const unsigned char *s, int len)
{
	const unsigned char *zero = memchr(s, 0, len);

	return (zero != NULL) ? (int) (zero - s) : len;
}
1117
/* True if byte value "c" lies in 0xa1..0xfe.  Note: "c" may be evaluated twice. */
#define IS_EUC_RANGE_VALID(c)	(0xa1 <= (c) && (c) <= 0xfe)
1119
1120 static int
1121 pg_eucjp_verifychar(const unsigned char *s, int len)
1122 {
1123         int                     l;
1124         unsigned char c1,
1125                                 c2;
1126
1127         c1 = *s++;
1128
1129         switch (c1)
1130         {
1131                 case SS2:                               /* JIS X 0201 */
1132                         l = 2;
1133                         if (l > len)
1134                                 return -1;
1135                         c2 = *s++;
1136                         if (c2 < 0xa1 || c2 > 0xdf)
1137                                 return -1;
1138                         break;
1139
1140                 case SS3:                               /* JIS X 0212 */
1141                         l = 3;
1142                         if (l > len)
1143                                 return -1;
1144                         c2 = *s++;
1145                         if (!IS_EUC_RANGE_VALID(c2))
1146                                 return -1;
1147                         c2 = *s++;
1148                         if (!IS_EUC_RANGE_VALID(c2))
1149                                 return -1;
1150                         break;
1151
1152                 default:
1153                         if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1154                         {
1155                                 l = 2;
1156                                 if (l > len)
1157                                         return -1;
1158                                 if (!IS_EUC_RANGE_VALID(c1))
1159                                         return -1;
1160                                 c2 = *s++;
1161                                 if (!IS_EUC_RANGE_VALID(c2))
1162                                         return -1;
1163                         }
1164                         else
1165                                 /* must be ASCII */
1166                         {
1167                                 l = 1;
1168                         }
1169                         break;
1170         }
1171
1172         return l;
1173 }
1174
/*
 * Return the number of leading bytes of s (at most len) that form valid
 * EUC_JP text.  Scanning stops at the first zero byte or at the first
 * character rejected by pg_eucjp_verifychar().
 */
static int
pg_eucjp_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* non-ASCII: let the per-character verifier decide */
			chlen = pg_eucjp_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* zero bytes are never valid */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1203
/*
 * Verify the EUC_KR character starting at s, with len bytes available.
 * Returns 1 for an ASCII byte, 2 for a pair of bytes that are both in the
 * EUC range, and -1 for anything truncated or out of range.
 */
static int
pg_euckr_verifychar(const unsigned char *s, int len)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* must be ASCII */

	/* check the length before touching the second byte */
	if (len < 2)
		return -1;
	if (!IS_EUC_RANGE_VALID(s[0]) || !IS_EUC_RANGE_VALID(s[1]))
		return -1;
	return 2;
}
1232
/*
 * Return the number of leading bytes of s (at most len) that form valid
 * EUC_KR text.  Scanning stops at the first zero byte or at the first
 * character rejected by pg_euckr_verifychar().
 */
static int
pg_euckr_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* non-ASCII: let the per-character verifier decide */
			chlen = pg_euckr_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* zero bytes are never valid */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1261
1262 /* EUC-CN byte sequences are exactly the same as EUC-KR, so the EUC-KR verifiers are reused */
1263 #define pg_euccn_verifychar     pg_euckr_verifychar
1264 #define pg_euccn_verifystr      pg_euckr_verifystr
1265
1266 static int
1267 pg_euctw_verifychar(const unsigned char *s, int len)
1268 {
1269         int                     l;
1270         unsigned char c1,
1271                                 c2;
1272
1273         c1 = *s++;
1274
1275         switch (c1)
1276         {
1277                 case SS2:                               /* CNS 11643 Plane 1-7 */
1278                         l = 4;
1279                         if (l > len)
1280                                 return -1;
1281                         c2 = *s++;
1282                         if (c2 < 0xa1 || c2 > 0xa7)
1283                                 return -1;
1284                         c2 = *s++;
1285                         if (!IS_EUC_RANGE_VALID(c2))
1286                                 return -1;
1287                         c2 = *s++;
1288                         if (!IS_EUC_RANGE_VALID(c2))
1289                                 return -1;
1290                         break;
1291
1292                 case SS3:                               /* unused */
1293                         return -1;
1294
1295                 default:
1296                         if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1297                         {
1298                                 l = 2;
1299                                 if (l > len)
1300                                         return -1;
1301                                 /* no further range check on c1? */
1302                                 c2 = *s++;
1303                                 if (!IS_EUC_RANGE_VALID(c2))
1304                                         return -1;
1305                         }
1306                         else
1307                                 /* must be ASCII */
1308                         {
1309                                 l = 1;
1310                         }
1311                         break;
1312         }
1313         return l;
1314 }
1315
/*
 * Return the number of leading bytes of s (at most len) that form valid
 * EUC_TW text.  Scanning stops at the first zero byte or at the first
 * character rejected by pg_euctw_verifychar().
 */
static int
pg_euctw_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* non-ASCII: let the per-character verifier decide */
			chlen = pg_euctw_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* zero bytes are never valid */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1344
/*
 * Verify the JOHAB character starting at s, with len bytes available.
 * The expected length comes from pg_johab_mblen().  Returns that length, or
 * -1 if fewer bytes are available or, for a high-bit lead byte, any trailing
 * byte falls outside the EUC range.
 */
static int
pg_johab_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	/* no trailing-byte checks when the lead byte has no high bit */
	if (!IS_HIGHBIT_SET(s[0]))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}
	return mbl;
}
1368
/*
 * Return the number of leading bytes of s (at most len) that form valid
 * JOHAB text.  Scanning stops at the first zero byte or at the first
 * character rejected by pg_johab_verifychar().
 */
static int
pg_johab_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* non-ASCII: let the per-character verifier decide */
			chlen = pg_johab_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* zero bytes are never valid */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1397
/*
 * Verify the MULE_INTERNAL character starting at s, with len bytes
 * available.  The expected length comes from pg_mule_mblen().  Returns that
 * length, or -1 if fewer bytes are available or any trailing byte lacks the
 * high bit.
 */
static int
pg_mule_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_HIGHBIT_SET(s[i]))
			return -1;
	}
	return mbl;
}
1418
/*
 * Return the number of leading bytes of s (at most len) that form valid
 * MULE_INTERNAL text.  Scanning stops at the first zero byte or at the first
 * character rejected by pg_mule_verifychar().
 */
static int
pg_mule_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* non-ASCII: let the per-character verifier decide */
			chlen = pg_mule_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* zero bytes are never valid */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1447
/*
 * Single-byte encoding: whatever byte the caller points at is accepted as a
 * one-byte character.  Neither argument needs to be examined, since
 * verifychar callers already guarantee len > 0 and *s != '\0'.
 */
static int
pg_latin1_verifychar(const unsigned char *s, int len)
{
	/* s and len are intentionally unused */
	return 1;
}
1453
/*
 * Return the number of leading bytes of s (at most len) that are valid.
 * Only a zero byte is invalid, so this is the offset of the first zero byte,
 * or len if there is none.
 */
static int
pg_latin1_verifystr(const unsigned char *s, int len)
{
	const unsigned char *zero = memchr(s, 0, len);

	return (zero != NULL) ? (int) (zero - s) : len;
}
1464
/*
 * Verify the SJIS character starting at s, with len bytes available.
 * The expected length comes from pg_sjis_mblen().  One-byte characters need
 * no further checking; a two-byte character must consist of a valid SJIS
 * head byte followed by a valid SJIS tail byte.  Returns the length, or -1.
 */
static int
pg_sjis_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);

	if (len < mbl)
		return -1;

	if (mbl == 1)
		return mbl;				/* pg_sjis_mblen already verified it */

	if (ISSJISHEAD(s[0]) && ISSJISTAIL(s[1]))
		return mbl;
	return -1;
}
1487
/*
 * Return the number of leading bytes of s (at most len) that form valid
 * SJIS text.  Scanning stops at the first zero byte or at the first
 * character rejected by pg_sjis_verifychar().
 */
static int
pg_sjis_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* non-ASCII: let the per-character verifier decide */
			chlen = pg_sjis_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* zero bytes are never valid */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1516
/*
 * Verify the Big5 character starting at s, with len bytes available.
 * The expected length comes from pg_big5_mblen().  Returns that length, or
 * -1 if fewer bytes are available or a trailing byte is zero.  Trailing
 * bytes are not checked in any other way.
 */
static int
pg_big5_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_big5_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1536
/*
 * Return the number of leading bytes of s (at most len) that form valid
 * Big5 text.  Scanning stops at the first zero byte or at the first
 * character rejected by pg_big5_verifychar().
 */
static int
pg_big5_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* non-ASCII: let the per-character verifier decide */
			chlen = pg_big5_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* zero bytes are never valid */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1565
/*
 * Verify the GBK character starting at s, with len bytes available.
 * The expected length comes from pg_gbk_mblen().  Returns that length, or
 * -1 if fewer bytes are available or a trailing byte is zero.  Trailing
 * bytes are not checked in any other way.
 */
static int
pg_gbk_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_gbk_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1585
/*
 * Return the number of leading bytes of s (at most len) that form valid
 * GBK text.  Scanning stops at the first zero byte or at the first
 * character rejected by pg_gbk_verifychar().
 */
static int
pg_gbk_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* non-ASCII: let the per-character verifier decide */
			chlen = pg_gbk_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* zero bytes are never valid */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1614
/*
 * Verify the UHC character starting at s, with len bytes available.
 * The expected length comes from pg_uhc_mblen().  Returns that length, or
 * -1 if fewer bytes are available or a trailing byte is zero.  Trailing
 * bytes are not checked in any other way.
 */
static int
pg_uhc_verifychar(const unsigned char *s, int len)
{
	const int	mbl = pg_uhc_mblen(s);
	int			i;

	if (len < mbl)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1634
/*
 * Return the number of leading bytes of s (at most len) that form valid
 * UHC text.  Scanning stops at the first zero byte or at the first
 * character rejected by pg_uhc_verifychar().
 */
static int
pg_uhc_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* non-ASCII: let the per-character verifier decide */
			chlen = pg_uhc_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* zero bytes are never valid */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1663
/*
 * Verify the GB18030 character starting at s, with len bytes available.
 * Returns its length in bytes (1, 2 or 4), or -1 if it is invalid.
 *
 * Accepted forms:
 *	one byte without the high bit
 *	four bytes: 0x81..0xfe, 0x30..0x39, 0x81..0xfe, 0x30..0x39
 *	two bytes:  0x81..0xfe, then 0x40..0x7e or 0x80..0xfe
 *
 * The second byte is only read once len is known to cover it, so a
 * truncated 4-byte character falls through to the 2-byte test and fails it.
 */
static int
pg_gb18030_verifychar(const unsigned char *s, int len)
{
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;				/* ASCII */

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* should be 4-byte, validate the remaining bytes */
		if (s[0] >= 0x81 && s[0] <= 0xfe &&
			s[2] >= 0x81 && s[2] <= 0xfe &&
			s[3] >= 0x30 && s[3] <= 0x39)
			return 4;
		return -1;
	}

	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		/* should be 2-byte, validate the second byte */
		if ((s[1] >= 0x40 && s[1] <= 0x7e) ||
			(s[1] >= 0x80 && s[1] <= 0xfe))
			return 2;
		return -1;
	}

	return -1;
}
1694
/*
 * Return the number of leading bytes of s (at most len) that form valid
 * GB18030 text.  Scanning stops at the first zero byte or at the first
 * character rejected by pg_gb18030_verifychar().
 */
static int
pg_gb18030_verifystr(const unsigned char *s, int len)
{
	const unsigned char *p = s;
	int			remaining = len;

	while (remaining > 0)
	{
		int			chlen = 1;

		if (IS_HIGHBIT_SET(*p))
		{
			/* non-ASCII: let the per-character verifier decide */
			chlen = pg_gb18030_verifychar(p, remaining);
			if (chlen == -1)
				break;
		}
		else if (*p == '\0')
			break;				/* zero bytes are never valid */

		p += chlen;
		remaining -= chlen;
	}

	return p - s;
}
1723
/*
 * Verify the UTF-8 character starting at s, with len bytes available.
 * Returns its length in bytes, or -1 if it is invalid.
 *
 * The expected length is derived from the lead byte.  A lead byte that
 * matches none of the multi-byte patterns is given length 1 and left for
 * pg_utf8_islegal() to judge.  A zero byte is rejected outright.
 */
static int
pg_utf8_verifychar(const unsigned char *s, int len)
{
	const unsigned char lead = *s;
	int			expected;

	/* single-byte (ASCII) case needs no further validation */
	if ((lead & 0x80) == 0)
		return (lead == '\0') ? -1 : 1;

	if ((lead & 0xe0) == 0xc0)
		expected = 2;
	else if ((lead & 0xf0) == 0xe0)
		expected = 3;
	else if ((lead & 0xf8) == 0xf0)
		expected = 4;
	else
		expected = 1;

	/* reject truncated input before examining the trailing bytes */
	if (expected > len || !pg_utf8_islegal(s, expected))
		return -1;

	return expected;
}
1752
1753 /*
1754  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1755  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1756  * input byte and current state are used to compute an index into an array of
1757  * state transitions. Since the address of the next transition is dependent
1758  * on this computation, there is latency in executing the load instruction,
1759  * and the CPU is not kept busy.
1760  *
1761  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1762  *
1763  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1764  *
1765  * In a shift-based DFA, the input byte is an index into an array of integers
1766  * whose bit patterns encode the state transitions. To compute the next
1767  * state, we simply right-shift the integer by the current state and apply a
1768  * mask. In this scheme, the address of the transition only depends on the
1769  * input byte, so there is better pipelining.
1770  *
1771  * The naming convention for states and transitions was adopted from a UTF-8
1772  * to UTF-16/32 transcoder, whose table is reproduced below:
1773  *
1774  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1775  *
1776  * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
1777  * ==========================================================================
1778  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
1779  * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
1780  *                                                                  |
1781  * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
1782  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
1783  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
1784  *                                                                  |
1785  * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
1786  * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
1787  *                                                                  |
1788  * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
1789  * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
1790  *
1791  * In the most straightforward implementation, a shift-based DFA for UTF-8
1792  * requires 64-bit integers to encode the transitions, but with an SMT solver
1793  * it's possible to find state numbers such that the transitions fit within
1794  * 32-bit integers, as Dougall Johnson demonstrated:
1795  *
1796  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1797  *
1798  * This packed representation is the reason for the seemingly odd choice of
1799  * state values below.
1800  */
1801
/*
 * DFA states.  Each value is also the right-shift distance applied to a
 * transition word while in that state (see utf8_advance()), which is why the
 * values are not consecutive.
 */

/* Error */
#define ERR  0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2  1
#define CS3  5
/* Partial states, where the first continuation byte has a restricted range */
#define P3A  6					/* Lead was E0, check for 3-byte overlong */
#define P3B 20					/* Lead was ED, check for surrogate */
#define P4A 25					/* Lead was F0, check for 4-byte overlong */
#define P4B 30					/* Lead was F4, check for too-large */
/* Begin and End are the same state */
#define END BGN

/*
 * The encoded state transitions for the lookup table.  "X << S" stores next
 * state X at the bit offset that is read while in state S; offsets left
 * unset read back as ERR.
 */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)

/*
 * continuation byte
 *
 * Each expansion is wrapped in parentheses so that it stays a single operand
 * in any expression, e.g. "CR1 >> CS1" shifts the whole word rather than
 * only its last term.
 */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1838
1839 static const uint32 Utf8Transition[256] =
1840 {
1841         /* ASCII */
1842
1843         ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1844         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1845         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1846         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1847
1848         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1849         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1850         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1851         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1852
1853         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1854         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1855         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1856         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1857
1858         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1859         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1860         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1861         ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
1862
1863         /* continuation bytes */
1864
1865         /* 80..8F */
1866         CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1867         CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
1868
1869         /* 90..9F */
1870         CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1871         CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
1872
1873         /* A0..BF */
1874         CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1875         CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1876         CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1877         CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
1878
1879         /* leading bytes */
1880
1881         /* C0..DF */
1882         ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
1883         L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1884         L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1885         L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
1886
1887         /* E0..EF */
1888         L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
1889         L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
1890
1891         /* F0..FF */
1892         L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
1893         ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
1894 };
1895
1896 static void
1897 utf8_advance(const unsigned char *s, uint32 *state, int len)
1898 {
1899         /* Note: We deliberately don't check the state's value here. */
1900         while (len > 0)
1901         {
1902                 /*
1903                  * It's important that the mask value is 31: In most instruction sets,
1904                  * a shift by a 32-bit operand is understood to be a shift by its mod
1905                  * 32, so the compiler should elide the mask operation.
1906                  */
1907                 *state = Utf8Transition[*s++] >> (*state & 31);
1908                 len--;
1909         }
1910
1911         *state &= 31;
1912 }
1913
1914 static int
1915 pg_utf8_verifystr(const unsigned char *s, int len)
1916 {
1917         const unsigned char *start = s;
1918         const int       orig_len = len;
1919         uint32          state = BGN;
1920
1921 /*
1922  * With a stride of two vector widths, gcc will unroll the loop. Even if
1923  * the compiler can unroll a longer loop, it's not worth it because we
1924  * must fall back to the byte-wise algorithm if we find any non-ASCII.
1925  */
1926 #define STRIDE_LENGTH (2 * sizeof(Vector8))
1927
1928         if (len >= STRIDE_LENGTH)
1929         {
1930                 while (len >= STRIDE_LENGTH)
1931                 {
1932                         /*
1933                          * If the chunk is all ASCII, we can skip the full UTF-8 check,
1934                          * but we must first check for a non-END state, which means the
1935                          * previous chunk ended in the middle of a multibyte sequence.
1936                          */
1937                         if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
1938                                 utf8_advance(s, &state, STRIDE_LENGTH);
1939
1940                         s += STRIDE_LENGTH;
1941                         len -= STRIDE_LENGTH;
1942                 }
1943
1944                 /* The error state persists, so we only need to check for it here. */
1945                 if (state == ERR)
1946                 {
1947                         /*
1948                          * Start over from the beginning with the slow path so we can
1949                          * count the valid bytes.
1950                          */
1951                         len = orig_len;
1952                         s = start;
1953                 }
1954                 else if (state != END)
1955                 {
1956                         /*
1957                          * The fast path exited in the middle of a multibyte sequence.
1958                          * Walk backwards to find the leading byte so that the slow path
1959                          * can resume checking from there. We must always backtrack at
1960                          * least one byte, since the current byte could be e.g. an ASCII
1961                          * byte after a 2-byte lead, which is invalid.
1962                          */
1963                         do
1964                         {
1965                                 Assert(s > start);
1966                                 s--;
1967                                 len++;
1968                                 Assert(IS_HIGHBIT_SET(*s));
1969                         } while (pg_utf_mblen(s) <= 1);
1970                 }
1971         }
1972
1973         /* check remaining bytes */
1974         while (len > 0)
1975         {
1976                 int                     l;
1977
1978                 /* fast path for ASCII-subset characters */
1979                 if (!IS_HIGHBIT_SET(*s))
1980                 {
1981                         if (*s == '\0')
1982                                 break;
1983                         l = 1;
1984                 }
1985                 else
1986                 {
1987                         l = pg_utf8_verifychar(s, len);
1988                         if (l == -1)
1989                                 break;
1990                 }
1991                 s += l;
1992                 len -= l;
1993         }
1994
1995         return s - start;
1996 }
1997
/*
 * Decide whether "length" bytes at "source" form one legal UTF-8 character.
 *
 * The rules are those of RFC3629.  The lead-byte-specific limits on the
 * second byte exist so that every Unicode code point has exactly one
 * encoding: a longer-than-necessary sequence padded with high-order zero
 * bits is rejected, since accepting it would be a security hazard (e.g. a
 * seemingly non-ASCII sequence that decodes to plain ASCII).  The same
 * limits also exclude the surrogate range and anything above U+10FFFF.
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 * Lengths outside 1..4 are always rejected (5- and 6-byte forms are not
 * accepted for now).
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	int			i;

	if (length < 1 || length > 4)
		return false;

	/* Third and fourth bytes, if any, must be ordinary continuation bytes. */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	lead = source[0];

	if (length >= 2)
	{
		/* Permitted window for the second byte; narrowed for some leads. */
		unsigned char lo = 0x80;
		unsigned char hi = 0xBF;

		if (lead == 0xE0)
			lo = 0xA0;
		else if (lead == 0xED)
			hi = 0x9F;
		else if (lead == 0xF0)
			lo = 0x90;
		else if (lead == 0xF4)
			hi = 0x8F;

		if (source[1] < lo || source[1] > hi)
			return false;
	}

	/*
	 * Finally vet the first byte itself: 0x80..0xC1 can never start a
	 * character, and neither can anything above 0xF4.
	 */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
2068
2069
2070 /*
2071  *-------------------------------------------------------------------
2072  * encoding info table
2073  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
2074  *-------------------------------------------------------------------
2075  */
2076 const pg_wchar_tbl pg_wchar_table[] = {
2077         {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},       /* PG_SQL_ASCII */
2078         {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},  /* PG_EUC_JP */
2079         {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},  /* PG_EUC_CN */
2080         {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},  /* PG_EUC_KR */
2081         {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},  /* PG_EUC_TW */
2082         {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},  /* PG_EUC_JIS_2004 */
2083         {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},  /* PG_UTF8 */
2084         {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},      /* PG_MULE_INTERNAL */
2085         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN1 */
2086         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN2 */
2087         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN3 */
2088         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN4 */
2089         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN5 */
2090         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN6 */
2091         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN7 */
2092         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN8 */
2093         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN9 */
2094         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN10 */
2095         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1256 */
2096         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1258 */
2097         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN866 */
2098         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN874 */
2099         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_KOI8R */
2100         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1251 */
2101         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1252 */
2102         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-5 */
2103         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-6 */
2104         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-7 */
2105         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-8 */
2106         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1250 */
2107         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1253 */
2108         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1254 */
2109         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1255 */
2110         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1257 */
2111         {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_KOI8U */
2112         {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},        /* PG_SJIS */
2113         {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},        /* PG_BIG5 */
2114         {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},    /* PG_GBK */
2115         {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},    /* PG_UHC */
2116         {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},    /* PG_GB18030 */
2117         {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},    /* PG_JOHAB */
2118         {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2} /* PG_SHIFT_JIS_2004 */
2119 };
2120
2121 /*
2122  * Returns the byte length of a multibyte character.
2123  *
2124  * Caution: when dealing with text that is not certainly valid in the
2125  * specified encoding, the result may exceed the actual remaining
2126  * string length.  Callers that are not prepared to deal with that
2127  * should use pg_encoding_mblen_bounded() instead.
2128  */
2129 int
2130 pg_encoding_mblen(int encoding, const char *mbstr)
2131 {
2132         return (PG_VALID_ENCODING(encoding) ?
2133                         pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
2134                         pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
2135 }
2136
/*
 * Like pg_encoding_mblen(), except that the result never reaches past the
 * string's terminating NUL: it is capped at the distance to end of string.
 */
int
pg_encoding_mblen_bounded(int encoding, const char *mbstr)
{
	/* length the encoding claims for this character */
	const int	claimed = pg_encoding_mblen(encoding, mbstr);

	/* strnlen() stops early if a NUL shows up within that many bytes */
	return strnlen(mbstr, claimed);
}
2146
2147 /*
2148  * Returns the display length of a multibyte character.
2149  */
2150 int
2151 pg_encoding_dsplen(int encoding, const char *mbstr)
2152 {
2153         return (PG_VALID_ENCODING(encoding) ?
2154                         pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
2155                         pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
2156 }
2157
2158 /*
2159  * Verify the first multibyte character of the given string.
2160  * Return its byte length if good, -1 if bad.  (See comments above for
2161  * full details of the mbverifychar API.)
2162  */
2163 int
2164 pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
2165 {
2166         return (PG_VALID_ENCODING(encoding) ?
2167                         pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
2168                         pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
2169 }
2170
2171 /*
2172  * Verify that a string is valid for the given encoding.
2173  * Returns the number of input bytes (<= len) that form a valid string.
2174  * (See comments above for full details of the mbverifystr API.)
2175  */
2176 int
2177 pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
2178 {
2179         return (PG_VALID_ENCODING(encoding) ?
2180                         pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
2181                         pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
2182 }
2183
2184 /*
2185  * fetch maximum length of a given encoding
2186  */
2187 int
2188 pg_encoding_max_length(int encoding)
2189 {
2190         Assert(PG_VALID_ENCODING(encoding));
2191
2192         return pg_wchar_table[encoding].maxmblen;
2193 }