src/backend/utils/mb/mbutils.c

   1 /*
   2  * This file contains public functions for conversion between
   3  * client encoding and server internal encoding.
   4  * (currently mule internal code (mic) is used)
   5  * Tatsuo Ishii
   6  *
   7  * $PostgreSQL$
   8  */
   9 #include "postgres.h"
  10
  11 #include "access/xact.h"
  12 #include "catalog/namespace.h"
  13 #include "mb/pg_wchar.h"
  14 #include "utils/builtins.h"
  15 #include "utils/memutils.h"
  16 #include "utils/pg_locale.h"
  17 #include "utils/syscache.h"
  18
/*
 * When converting strings between different encodings, we assume that the
 * converted result grows by at most a factor of 4 in the worst case.  The
 * growth rate for the currently supported encoding pairs is within 3 (SJIS
 * JIS X0201 half-width kana -> UTF8 is the worst case), so "4" should be
 * enough for the moment.
 *
 * Note that this is not the same as the maximum character width in any
 * particular encoding.
 */
#define MAX_CONVERSION_GROWTH  4
  29
/*
 * For the current frontend (client) and backend (database) encodings we
 * keep a pointer to the matching pg_enc2name_tbl entry, which gives access
 * to both the encoding identifier and the encoding name.  This avoids
 * having to search for the name of an encoding in getdatabaseencoding()
 * and other routines.  Both pointers start out at the SQL_ASCII entry.
 */
static pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
static pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
  37
/*
 * Caches for conversion function info.  These values are allocated in
 * MbProcContext.  That context is a child of TopMemoryContext, which
 * allows these values to survive across transactions.  See
 * SetClientEncoding() for more details.
 *
 * ToServerConvProc is used for client -> server conversion and
 * ToClientConvProc for server -> client conversion; both are NULL when no
 * conversion function has been loaded.
 */
static MemoryContext MbProcContext = NULL;
static FmgrInfo *ToServerConvProc = NULL;
static FmgrInfo *ToClientConvProc = NULL;
  47
/*
 * During backend startup we can't set the client encoding because we (a)
 * can't look up the conversion functions, and (b) may not know the database
 * encoding yet either.  So SetClientEncoding() just accepts any valid
 * frontend encoding and remembers it in pending_client_encoding, for
 * InitializeClientEncoding() to apply later.
 */
static bool backend_startup_complete = false;
static int	pending_client_encoding = PG_SQL_ASCII;
  56
  57
/* Internal functions */

/* Convert between client and server encoding using the cached FmgrInfo */
static char *perform_default_encoding_conversion(const char *src,
									int len, bool is_client_to_server);
/* Byte-oriented length clipping, used for single-byte encodings */
static int	cliplen(const char *str, int len, int limit);
  62
  63
  64 /*
  65  * Set the client encoding and save fmgrinfo for the conversion
  66  * function if necessary.  Returns 0 if okay, -1 if not (bad encoding
  67  * or can't support conversion)
  68  */
  69 int
  70 SetClientEncoding(int encoding, bool doit)
  71 {
  72         int                     current_server_encoding;
  73         Oid                     to_server_proc,
  74                                 to_client_proc;
  75         FmgrInfo   *to_server;
  76         FmgrInfo   *to_client;
  77         MemoryContext oldcontext;
  78
  79         if (!PG_VALID_FE_ENCODING(encoding))
  80                 return -1;
  81
  82         /* Can't do anything during startup, per notes above */
  83         if (!backend_startup_complete)
  84         {
  85                 if (doit)
  86                         pending_client_encoding = encoding;
  87                 return 0;
  88         }
  89
  90         current_server_encoding = GetDatabaseEncoding();
  91
  92         /*
  93          * Check for cases that require no conversion function.
  94          */
  95         if (current_server_encoding == encoding ||
  96                 current_server_encoding == PG_SQL_ASCII ||
  97                 encoding == PG_SQL_ASCII)
  98         {
  99                 if (doit)
 100                 {
 101                         ClientEncoding = &pg_enc2name_tbl[encoding];
 102                         ToServerConvProc = NULL;
 103                         ToClientConvProc = NULL;
 104                         if (MbProcContext)
 105                                 MemoryContextReset(MbProcContext);
 106                 }
 107                 return 0;
 108         }
 109
 110         /*
 111          * If we're not inside a transaction then we can't do catalog lookups, so
 112          * fail.  After backend startup, this could only happen if we are
 113          * re-reading postgresql.conf due to SIGHUP --- so basically this just
 114          * constrains the ability to change client_encoding on the fly from
 115          * postgresql.conf.  Which would probably be a stupid thing to do anyway.
 116          */
 117         if (!IsTransactionState())
 118                 return -1;
 119
 120         /*
 121          * Look up the conversion functions.
 122          */
 123         to_server_proc = FindDefaultConversionProc(encoding,
 124                                                                                            current_server_encoding);
 125         if (!OidIsValid(to_server_proc))
 126                 return -1;
 127         to_client_proc = FindDefaultConversionProc(current_server_encoding,
 128                                                                                            encoding);
 129         if (!OidIsValid(to_client_proc))
 130                 return -1;
 131
 132         /*
 133          * Done if not wanting to actually apply setting.
 134          */
 135         if (!doit)
 136                 return 0;
 137
 138         /* Before loading the new fmgr info, remove the old info, if any */
 139         ToServerConvProc = NULL;
 140         ToClientConvProc = NULL;
 141         if (MbProcContext != NULL)
 142         {
 143                 MemoryContextReset(MbProcContext);
 144         }
 145         else
 146         {
 147                 /*
 148                  * This is the first time through, so create the context. Make it a
 149                  * child of TopMemoryContext so that these values survive across
 150                  * transactions.
 151                  */
 152                 MbProcContext = AllocSetContextCreate(TopMemoryContext,
 153                                                                                           "MbProcContext",
 154                                                                                           ALLOCSET_SMALL_MINSIZE,
 155                                                                                           ALLOCSET_SMALL_INITSIZE,
 156                                                                                           ALLOCSET_SMALL_MAXSIZE);
 157         }
 158
 159         /* Load the fmgr info into MbProcContext */
 160         oldcontext = MemoryContextSwitchTo(MbProcContext);
 161         to_server = palloc(sizeof(FmgrInfo));
 162         to_client = palloc(sizeof(FmgrInfo));
 163         fmgr_info(to_server_proc, to_server);
 164         fmgr_info(to_client_proc, to_client);
 165         MemoryContextSwitchTo(oldcontext);
 166
 167         ClientEncoding = &pg_enc2name_tbl[encoding];
 168         ToServerConvProc = to_server;
 169         ToClientConvProc = to_client;
 170
 171         return 0;
 172 }
 173
 174 /*
 175  * Initialize client encoding if necessary.
 176  *              called from InitPostgres() once during backend starting up.
 177  */
 178 void
 179 InitializeClientEncoding(void)
 180 {
 181         Assert(!backend_startup_complete);
 182         backend_startup_complete = true;
 183
 184         if (SetClientEncoding(pending_client_encoding, true) < 0)
 185         {
 186                 /*
 187                  * Oops, the requested conversion is not available. We couldn't fail
 188                  * before, but we can now.
 189                  */
 190                 ereport(FATAL,
 191                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 192                                  errmsg("conversion between %s and %s is not supported",
 193                                                 pg_enc2name_tbl[pending_client_encoding].name,
 194                                                 GetDatabaseEncodingName())));
 195         }
 196 }
 197
 198 /*
 199  * returns the current client encoding */
 200 int
 201 pg_get_client_encoding(void)
 202 {
 203         Assert(ClientEncoding);
 204         return ClientEncoding->encoding;
 205 }
 206
 207 /*
 208  * returns the current client encoding name
 209  */
 210 const char *
 211 pg_get_client_encoding_name(void)
 212 {
 213         Assert(ClientEncoding);
 214         return ClientEncoding->name;
 215 }
 216
 217 /*
 218  * Apply encoding conversion on src and return it. The encoding
 219  * conversion function is chosen from the pg_conversion system catalog
 220  * marked as "default". If it is not found in the schema search path,
 221  * it's taken from pg_catalog schema. If it even is not in the schema,
 222  * warn and return src.
 223  *
 224  * In the case of no conversion, src is returned.
 225  *
 226  * Note: we try to avoid raising error, since that could get us into
 227  * infinite recursion when this function is invoked during error message
 228  * sending.  It should be OK to raise error for overlength strings though,
 229  * since the recursion will come with a shorter message.
 230  */
 231 unsigned char *
 232 pg_do_encoding_conversion(unsigned char *src, int len,
 233                                                   int src_encoding, int dest_encoding)
 234 {
 235         unsigned char *result;
 236         Oid                     proc;
 237
 238         if (!IsTransactionState())
 239                 return src;
 240
 241         if (src_encoding == dest_encoding)
 242                 return src;
 243
 244         if (src_encoding == PG_SQL_ASCII || dest_encoding == PG_SQL_ASCII)
 245                 return src;
 246
 247         if (len <= 0)
 248                 return src;
 249
 250         proc = FindDefaultConversionProc(src_encoding, dest_encoding);
 251         if (!OidIsValid(proc))
 252         {
 253                 ereport(LOG,
 254                                 (errcode(ERRCODE_UNDEFINED_FUNCTION),
 255                                  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
 256                                                 pg_encoding_to_char(src_encoding),
 257                                                 pg_encoding_to_char(dest_encoding))));
 258                 return src;
 259         }
 260
 261         /*
 262          * XXX we should avoid throwing errors in OidFunctionCall. Otherwise we
 263          * are going into infinite loop!  So we have to make sure that the
 264          * function exists before calling OidFunctionCall.
 265          */
 266         if (!SearchSysCacheExists(PROCOID,
 267                                                           ObjectIdGetDatum(proc),
 268                                                           0, 0, 0))
 269         {
 270                 elog(LOG, "cache lookup failed for function %u", proc);
 271                 return src;
 272         }
 273
 274         /*
 275          * Allocate space for conversion result, being wary of integer overflow
 276          */
 277         if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
 278                 ereport(ERROR,
 279                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 280                                  errmsg("out of memory"),
 281                  errdetail("String of %d bytes is too long for encoding conversion.",
 282                                    len)));
 283
 284         result = palloc(len * MAX_CONVERSION_GROWTH + 1);
 285
 286         OidFunctionCall5(proc,
 287                                          Int32GetDatum(src_encoding),
 288                                          Int32GetDatum(dest_encoding),
 289                                          CStringGetDatum(src),
 290                                          CStringGetDatum(result),
 291                                          Int32GetDatum(len));
 292         return result;
 293 }
 294
 295 /*
 296  * Convert string using encoding_name. The source
 297  * encoding is the DB encoding.
 298  *
 299  * BYTEA convert_to(TEXT string, NAME encoding_name) */
 300 Datum
 301 pg_convert_to(PG_FUNCTION_ARGS)
 302 {
 303         Datum           string = PG_GETARG_DATUM(0);
 304         Datum           dest_encoding_name = PG_GETARG_DATUM(1);
 305         Datum           src_encoding_name = DirectFunctionCall1(namein,
 306                                                                         CStringGetDatum(DatabaseEncoding->name));
 307         Datum           result;
 308
 309         /*
 310          * pg_convert expects a bytea as its first argument. We're passing it a
 311          * text argument here, relying on the fact that they are both in fact
 312          * varlena types, and thus structurally identical.
 313          */
 314         result = DirectFunctionCall3(pg_convert, string,
 315                                                                  src_encoding_name, dest_encoding_name);
 316
 317         PG_RETURN_DATUM(result);
 318 }
 319
 320 /*
 321  * Convert string using encoding_name. The destination
 322  * encoding is the DB encoding.
 323  *
 324  * TEXT convert_from(BYTEA string, NAME encoding_name) */
 325 Datum
 326 pg_convert_from(PG_FUNCTION_ARGS)
 327 {
 328         Datum           string = PG_GETARG_DATUM(0);
 329         Datum           src_encoding_name = PG_GETARG_DATUM(1);
 330         Datum           dest_encoding_name = DirectFunctionCall1(namein,
 331                                                                         CStringGetDatum(DatabaseEncoding->name));
 332         Datum           result;
 333
 334         result = DirectFunctionCall3(pg_convert, string,
 335                                                                  src_encoding_name, dest_encoding_name);
 336
 337         /*
 338          * pg_convert returns a bytea, which we in turn return as text, relying on
 339          * the fact that they are both in fact varlena types, and thus
 340          * structurally identical. Although not all bytea values are valid text,
 341          * in this case it will be because we've told pg_convert to return one
 342          * that is valid as text in the current database encoding.
 343          */
 344         PG_RETURN_DATUM(result);
 345 }
 346
 347 /*
 348  * Convert string using encoding_names.
 349  *
 350  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
 351  */
 352 Datum
 353 pg_convert(PG_FUNCTION_ARGS)
 354 {
 355         bytea      *string = PG_GETARG_BYTEA_P(0);
 356         char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
 357         int                     src_encoding = pg_char_to_encoding(src_encoding_name);
 358         char       *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
 359         int                     dest_encoding = pg_char_to_encoding(dest_encoding_name);
 360         unsigned char *result;
 361         bytea      *retval;
 362         unsigned char *str;
 363         int                     len;
 364
 365         if (src_encoding < 0)
 366                 ereport(ERROR,
 367                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 368                                  errmsg("invalid source encoding name \"%s\"",
 369                                                 src_encoding_name)));
 370         if (dest_encoding < 0)
 371                 ereport(ERROR,
 372                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 373                                  errmsg("invalid destination encoding name \"%s\"",
 374                                                 dest_encoding_name)));
 375
 376         /* make sure that source string is valid and null terminated */
 377         len = VARSIZE(string) - VARHDRSZ;
 378         pg_verify_mbstr(src_encoding, VARDATA(string), len, false);
 379         str = palloc(len + 1);
 380         memcpy(str, VARDATA(string), len);
 381         *(str + len) = '\0';
 382
 383         result = pg_do_encoding_conversion(str, len, src_encoding, dest_encoding);
 384         if (result == NULL)
 385                 elog(ERROR, "encoding conversion failed");
 386
 387         /*
 388          * build bytea data type structure.
 389          */
 390         len = strlen((char *) result) + VARHDRSZ;
 391         retval = palloc(len);
 392         SET_VARSIZE(retval, len);
 393         memcpy(VARDATA(retval), result, len - VARHDRSZ);
 394
 395         if (result != str)
 396                 pfree(result);
 397         pfree(str);
 398
 399         /* free memory if allocated by the toaster */
 400         PG_FREE_IF_COPY(string, 0);
 401
 402         PG_RETURN_BYTEA_P(retval);
 403 }
 404
 405 /*
 406  * get the length of the string considered as text in the specified
 407  * encoding. Raises an error if the data is not valid in that
 408  * encoding.
 409  *
 410  * INT4 length (BYTEA string, NAME src_encoding_name)
 411  */
 412 Datum
 413 length_in_encoding(PG_FUNCTION_ARGS)
 414 {
 415         bytea      *string = PG_GETARG_BYTEA_P(0);
 416         char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
 417         int                     src_encoding = pg_char_to_encoding(src_encoding_name);
 418         int                     len = VARSIZE(string) - VARHDRSZ;
 419         int                     retval;
 420
 421         if (src_encoding < 0)
 422                 ereport(ERROR,
 423                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 424                                  errmsg("invalid encoding name \"%s\"",
 425                                                 src_encoding_name)));
 426
 427         retval = pg_verify_mbstr_len(src_encoding, VARDATA(string), len, false);
 428         PG_RETURN_INT32(retval);
 429
 430 }
 431
 432 /*
 433  * convert client encoding to server encoding.
 434  */
 435 char *
 436 pg_client_to_server(const char *s, int len)
 437 {
 438         Assert(DatabaseEncoding);
 439         Assert(ClientEncoding);
 440
 441         if (len <= 0)
 442                 return (char *) s;
 443
 444         if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
 445                 ClientEncoding->encoding == PG_SQL_ASCII)
 446         {
 447                 /*
 448                  * No conversion is needed, but we must still validate the data.
 449                  */
 450                 (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
 451                 return (char *) s;
 452         }
 453
 454         if (DatabaseEncoding->encoding == PG_SQL_ASCII)
 455         {
 456                 /*
 457                  * No conversion is possible, but we must still validate the data,
 458                  * because the client-side code might have done string escaping using
 459                  * the selected client_encoding.  If the client encoding is ASCII-safe
 460                  * then we just do a straight validation under that encoding.  For an
 461                  * ASCII-unsafe encoding we have a problem: we dare not pass such data
 462                  * to the parser but we have no way to convert it.      We compromise by
 463                  * rejecting the data if it contains any non-ASCII characters.
 464                  */
 465                 if (PG_VALID_BE_ENCODING(ClientEncoding->encoding))
 466                         (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false);
 467                 else
 468                 {
 469                         int                     i;
 470
 471                         for (i = 0; i < len; i++)
 472                         {
 473                                 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
 474                                         ereport(ERROR,
 475                                                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 476                                          errmsg("invalid byte value for encoding \"%s\": 0x%02x",
 477                                                         pg_enc2name_tbl[PG_SQL_ASCII].name,
 478                                                         (unsigned char) s[i])));
 479                         }
 480                 }
 481                 return (char *) s;
 482         }
 483
 484         return perform_default_encoding_conversion(s, len, true);
 485 }
 486
 487 /*
 488  * convert server encoding to client encoding.
 489  */
 490 char *
 491 pg_server_to_client(const char *s, int len)
 492 {
 493         Assert(DatabaseEncoding);
 494         Assert(ClientEncoding);
 495
 496         if (len <= 0)
 497                 return (char *) s;
 498
 499         if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
 500                 ClientEncoding->encoding == PG_SQL_ASCII ||
 501                 DatabaseEncoding->encoding == PG_SQL_ASCII)
 502                 return (char *) s;              /* assume data is valid */
 503
 504         return perform_default_encoding_conversion(s, len, false);
 505 }
 506
 507 /*
 508  *      Perform default encoding conversion using cached FmgrInfo. Since
 509  *      this function does not access database at all, it is safe to call
 510  *      outside transactions. Explicit setting client encoding required
 511  *      before calling this function. Otherwise no conversion is
 512  *      performed.
 513 */
 514 static char *
 515 perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server)
 516 {
 517         char       *result;
 518         int                     src_encoding,
 519                                 dest_encoding;
 520         FmgrInfo   *flinfo;
 521
 522         if (is_client_to_server)
 523         {
 524                 src_encoding = ClientEncoding->encoding;
 525                 dest_encoding = DatabaseEncoding->encoding;
 526                 flinfo = ToServerConvProc;
 527         }
 528         else
 529         {
 530                 src_encoding = DatabaseEncoding->encoding;
 531                 dest_encoding = ClientEncoding->encoding;
 532                 flinfo = ToClientConvProc;
 533         }
 534
 535         if (flinfo == NULL)
 536                 return (char *) src;
 537
 538         /*
 539          * Allocate space for conversion result, being wary of integer overflow
 540          */
 541         if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
 542                 ereport(ERROR,
 543                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 544                                  errmsg("out of memory"),
 545                  errdetail("String of %d bytes is too long for encoding conversion.",
 546                                    len)));
 547
 548         result = palloc(len * MAX_CONVERSION_GROWTH + 1);
 549
 550         FunctionCall5(flinfo,
 551                                   Int32GetDatum(src_encoding),
 552                                   Int32GetDatum(dest_encoding),
 553                                   CStringGetDatum(src),
 554                                   CStringGetDatum(result),
 555                                   Int32GetDatum(len));
 556         return result;
 557 }
 558
 559
 560
 561 #ifdef USE_WIDE_UPPER_LOWER
 562
 563 /*
 564  * wchar2char --- convert wide characters to multibyte format
 565  *
 566  * This has the same API as the standard wcstombs() function; in particular,
 567  * tolen is the maximum number of bytes to store at *to, and *from must be
 568  * zero-terminated.  The output will be zero-terminated iff there is room.
 569  */
 570 size_t
 571 wchar2char(char *to, const wchar_t *from, size_t tolen)
 572 {
 573         size_t result;
 574
 575         if (tolen == 0)
 576                 return 0;
 577
 578 #ifdef WIN32
 579         /*
 580          * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding,
 581          * and for some reason mbstowcs and wcstombs won't do this for us,
 582          * so we use MultiByteToWideChar().
 583          */
 584         if (GetDatabaseEncoding() == PG_UTF8)
 585         {
 586                 result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
 587                                                                 NULL, NULL);
 588                 /* A zero return is failure */
 589                 if (result <= 0)
 590                         result = -1;
 591                 else
 592                 {
 593                         Assert(result <= tolen);
 594                         /* Microsoft counts the zero terminator in the result */
 595                         result--;
 596                 }
 597         }
 598         else
 599 #endif   /* WIN32 */
 600                 result = wcstombs(to, from, tolen);
 601         return result;
 602 }
 603
 604 /*
 605  * char2wchar --- convert multibyte characters to wide characters
 606  *
 607  * This has almost the API of mbstowcs(), except that *from need not be
 608  * null-terminated; instead, the number of input bytes is specified as
 609  * fromlen.  Also, we ereport() rather than returning -1 for invalid
 610  * input encoding.      tolen is the maximum number of wchar_t's to store at *to.
 611  * The output will be zero-terminated iff there is room.
 612  */
 613 size_t
 614 char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen)
 615 {
 616         size_t          result;
 617
 618         if (tolen == 0)
 619                 return 0;
 620
 621 #ifdef WIN32
 622         /* See WIN32 "Unicode" comment above */
 623         if (GetDatabaseEncoding() == PG_UTF8)
 624         {
 625                 /* Win32 API does not work for zero-length input */
 626                 if (fromlen == 0)
 627                         result = 0;
 628                 else
 629                 {
 630                         result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
 631                         /* A zero return is failure */
 632                         if (result == 0)
 633                                 result = -1;
 634                 }
 635
 636                 if (result != -1)
 637                 {
 638                         Assert(result < tolen);
 639                         /* Append trailing null wchar (MultiByteToWideChar() does not) */
 640                         to[result] = 0;
 641                 }
 642         }
 643         else
 644 #endif   /* WIN32 */
 645         {
 646                 if (lc_ctype_is_c())
 647                 {
 648                         /*
 649                          * pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be
 650                          * allocated with sufficient space
 651                          */
 652                         result = pg_mb2wchar_with_len(from, (pg_wchar *) to, fromlen);
 653                 }
 654                 else
 655                 {
 656                         /* mbstowcs requires ending '\0' */
 657                         char       *str = pnstrdup(from, fromlen);
 658
 659                         result = mbstowcs(to, str, tolen);
 660                         pfree(str);
 661                 }
 662         }
 663
 664         if (result == -1)
 665         {
 666                 /*
 667                  * Invalid multibyte character encountered.  We try to give a useful
 668                  * error message by letting pg_verifymbstr check the string.  But it's
 669                  * possible that the string is OK to us, and not OK to mbstowcs ---
 670                  * this suggests that the LC_CTYPE locale is different from the
 671                  * database encoding.  Give a generic error message if verifymbstr
 672                  * can't find anything wrong.
 673                  */
 674                 pg_verifymbstr(from, fromlen, false);   /* might not return */
 675                 /* but if it does ... */
 676                 ereport(ERROR,
 677                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 678                                  errmsg("invalid multibyte character for locale"),
 679                                  errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
 680         }
 681
 682         return result;
 683 }
 684
 685 #endif
 686
 687 /* convert a multibyte string to a wchar */
 688 int
 689 pg_mb2wchar(const char *from, pg_wchar *to)
 690 {
 691         return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, strlen(from));
 692 }
 693
 694 /* convert a multibyte string to a wchar with a limited length */
 695 int
 696 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
 697 {
 698         return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
 699 }
 700
 701 /* same, with any encoding */
 702 int
 703 pg_encoding_mb2wchar_with_len(int encoding,
 704                                                           const char *from, pg_wchar *to, int len)
 705 {
 706         return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
 707 }
 708
 709 /* returns the byte length of a multibyte word */
 710 int
 711 pg_mblen(const char *mbstr)
 712 {
 713         return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) ((const unsigned char *) mbstr));
 714 }
 715
 716 /* returns the display length of a multibyte word */
 717 int
 718 pg_dsplen(const char *mbstr)
 719 {
 720         return ((*pg_wchar_table[DatabaseEncoding->encoding].dsplen) ((const unsigned char *) mbstr));
 721 }
 722
 723 /* returns the length (counted in wchars) of a multibyte string */
 724 int
 725 pg_mbstrlen(const char *mbstr)
 726 {
 727         int                     len = 0;
 728
 729         /* optimization for single byte encoding */
 730         if (pg_database_encoding_max_length() == 1)
 731                 return strlen(mbstr);
 732
 733         while (*mbstr)
 734         {
 735                 mbstr += pg_mblen(mbstr);
 736                 len++;
 737         }
 738         return len;
 739 }
 740
 741 /* returns the length (counted in wchars) of a multibyte string
 742  * (not necessarily NULL terminated)
 743  */
 744 int
 745 pg_mbstrlen_with_len(const char *mbstr, int limit)
 746 {
 747         int                     len = 0;
 748
 749         /* optimization for single byte encoding */
 750         if (pg_database_encoding_max_length() == 1)
 751                 return limit;
 752
 753         while (limit > 0 && *mbstr)
 754         {
 755                 int                     l = pg_mblen(mbstr);
 756
 757                 limit -= l;
 758                 mbstr += l;
 759                 len++;
 760         }
 761         return len;
 762 }
 763
 764 /*
 765  * returns the byte length of a multibyte string
 766  * (not necessarily  NULL terminated)
 767  * that is no longer than limit.
 768  * this function does not break multibyte word boundary.
 769  */
 770 int
 771 pg_mbcliplen(const char *mbstr, int len, int limit)
 772 {
 773         int                     clen = 0;
 774         int                     l;
 775
 776         /* optimization for single byte encoding */
 777         if (pg_database_encoding_max_length() == 1)
 778                 return cliplen(mbstr, len, limit);
 779
 780         while (len > 0 && *mbstr)
 781         {
 782                 l = pg_mblen(mbstr);
 783                 if ((clen + l) > limit)
 784                         break;
 785                 clen += l;
 786                 if (clen == limit)
 787                         break;
 788                 len -= l;
 789                 mbstr += l;
 790         }
 791         return clen;
 792 }
 793
 794 /*
 795  * Similar to pg_mbcliplen except the limit parameter specifies the
 796  * character length, not the byte length.  */
 797 int
 798 pg_mbcharcliplen(const char *mbstr, int len, int limit)
 799 {
 800         int                     clen = 0;
 801         int                     nch = 0;
 802         int                     l;
 803
 804         /* optimization for single byte encoding */
 805         if (pg_database_encoding_max_length() == 1)
 806                 return cliplen(mbstr, len, limit);
 807
 808         while (len > 0 && *mbstr)
 809         {
 810                 l = pg_mblen(mbstr);
 811                 nch++;
 812                 if (nch > limit)
 813                         break;
 814                 clen += l;
 815                 len -= l;
 816                 mbstr += l;
 817         }
 818         return clen;
 819 }
 820
 821 void
 822 SetDatabaseEncoding(int encoding)
 823 {
 824         if (!PG_VALID_BE_ENCODING(encoding))
 825                 elog(ERROR, "invalid database encoding: %d", encoding);
 826
 827         DatabaseEncoding = &pg_enc2name_tbl[encoding];
 828         Assert(DatabaseEncoding->encoding == encoding);
 829
 830         /*
 831          * On Windows, we allow UTF-8 database encoding to be used with any
 832          * locale setting, because UTF-8 requires special handling anyway.
 833          * But this means that gettext() might be misled about what output
 834          * encoding it should use, so we have to tell it explicitly.
 835          *
 836          * In future we might want to call bind_textdomain_codeset
 837          * unconditionally, but that requires knowing how to spell the codeset
 838          * name properly for all encodings on all platforms, which might be
 839          * problematic.
 840          *
 841          * This is presently unnecessary, but harmless, on non-Windows platforms.
 842          */
 843 #ifdef ENABLE_NLS
 844         if (encoding == PG_UTF8)
 845                 if (bind_textdomain_codeset("postgres", "UTF-8") == NULL)
 846                         elog(LOG, "bind_textdomain_codeset failed");
 847 #endif
 848 }
 849
 850 void
 851 SetDefaultClientEncoding(void)
 852 {
 853         ClientEncoding = &pg_enc2name_tbl[GetDatabaseEncoding()];
 854 }
 855
 856 int
 857 GetDatabaseEncoding(void)
 858 {
 859         Assert(DatabaseEncoding);
 860         return DatabaseEncoding->encoding;
 861 }
 862
 863 const char *
 864 GetDatabaseEncodingName(void)
 865 {
 866         Assert(DatabaseEncoding);
 867         return DatabaseEncoding->name;
 868 }
 869
 870 Datum
 871 getdatabaseencoding(PG_FUNCTION_ARGS)
 872 {
 873         Assert(DatabaseEncoding);
 874         return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
 875 }
 876
 877 Datum
 878 pg_client_encoding(PG_FUNCTION_ARGS)
 879 {
 880         Assert(ClientEncoding);
 881         return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
 882 }
 883
 884 static int
 885 cliplen(const char *str, int len, int limit)
 886 {
 887         int                     l = 0;
 888         const char *s;
 889
 890         for (s = str; *s; s++, l++)
 891         {
 892                 if (l >= len || l >= limit)
 893                         return l;
 894         }
 895         return (s - str);
 896 }