src/backend/utils/mb/mbutils.c

   1 /*
   2  * This file contains public functions for conversion between
   3  * client encoding and server (database) encoding.
   4  *
   5  * Tatsuo Ishii
   6  *
   7  * $PostgreSQL$
   8  */
   9 #include "postgres.h"
  10
  11 #include "access/xact.h"
  12 #include "catalog/namespace.h"
  13 #include "mb/pg_wchar.h"
  14 #include "utils/builtins.h"
  15 #include "utils/memutils.h"
  16 #include "utils/pg_locale.h"
  17 #include "utils/syscache.h"
  18
  19 /*
  20  * When converting strings between different encodings, we assume that space
  21  * for converted result is 4-to-1 growth in the worst case. The rate for
  22  * currently supported encoding pairs is within 3 (SJIS JIS X0201 half width
  23  * kana -> UTF8 is the worst case).  So "4" should be enough for the moment.
  24  *
  25  * Note that this is not the same as the maximum character width in any
  26  * particular encoding.
  27  */
  28 #define MAX_CONVERSION_GROWTH  4
  29
  30 /*
  31  * We maintain a simple linked list caching the fmgr lookup info for the
  32  * currently selected conversion functions, as well as any that have been
  33  * selected previously in the current session.  (We remember previous
  34  * settings because we must be able to restore a previous setting during
  35  * transaction rollback, without doing any fresh catalog accesses.)
  36  *
  37  * Since we'll never release this data, we just keep it in TopMemoryContext.
  38  */
  39 typedef struct ConvProcInfo
  40 {
  41         int                     s_encoding;             /* server and client encoding IDs */
  42         int                     c_encoding;
  43         FmgrInfo        to_server_info; /* lookup info for conversion procs */
  44         FmgrInfo        to_client_info;
  45 } ConvProcInfo;
  46
  47 static List *ConvProcList = NIL;        /* List of ConvProcInfo */
  48
  49 /*
  50  * These variables point to the currently active conversion functions,
  51  * or are NULL when no conversion is needed.
  52  */
  53 static FmgrInfo *ToServerConvProc = NULL;
  54 static FmgrInfo *ToClientConvProc = NULL;
  55
  56 /*
  57  * These variables track the currently selected FE and BE encodings.
  58  */
  59 static pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
  60 static pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
  61
  62 /*
  63  * During backend startup we can't set client encoding because we (a)
  64  * can't look up the conversion functions, and (b) may not know the database
  65  * encoding yet either.  So SetClientEncoding() just accepts anything and
  66  * remembers it for InitializeClientEncoding() to apply later.
  67  */
  68 static bool backend_startup_complete = false;
  69 static int      pending_client_encoding = PG_SQL_ASCII;
  70
  71
  72 /* Internal functions */
  73 static char *perform_default_encoding_conversion(const char *src,
  74                                                                         int len, bool is_client_to_server);
  75 static int      cliplen(const char *str, int len, int limit);
  76
  77
  78 /*
  79  * Set the client encoding and save fmgrinfo for the conversion
  80  * function if necessary.  Returns 0 if okay, -1 if not (bad encoding
  81  * or can't support conversion)
  82  */
  83 int
  84 SetClientEncoding(int encoding, bool doit)
  85 {
  86         int                     current_server_encoding;
  87         ListCell   *lc;
  88
  89         if (!PG_VALID_FE_ENCODING(encoding))
  90                 return -1;
  91
  92         /* Can't do anything during startup, per notes above */
  93         if (!backend_startup_complete)
  94         {
  95                 if (doit)
  96                         pending_client_encoding = encoding;
  97                 return 0;
  98         }
  99
 100         current_server_encoding = GetDatabaseEncoding();
 101
 102         /*
 103          * Check for cases that require no conversion function.
 104          */
 105         if (current_server_encoding == encoding ||
 106                 current_server_encoding == PG_SQL_ASCII ||
 107                 encoding == PG_SQL_ASCII)
 108         {
 109                 if (doit)
 110                 {
 111                         ClientEncoding = &pg_enc2name_tbl[encoding];
 112                         ToServerConvProc = NULL;
 113                         ToClientConvProc = NULL;
 114                 }
 115                 return 0;
 116         }
 117
 118         if (IsTransactionState())
 119         {
 120                 /*
 121                  * If we're in a live transaction, it's safe to access the catalogs,
 122                  * so look up the functions.  We repeat the lookup even if the info is
 123                  * already cached, so that we can react to changes in the contents of
 124                  * pg_conversion.
 125                  */
 126                 Oid                     to_server_proc,
 127                                         to_client_proc;
 128                 ConvProcInfo *convinfo;
 129                 MemoryContext oldcontext;
 130
 131                 to_server_proc = FindDefaultConversionProc(encoding,
 132                                                                                                    current_server_encoding);
 133                 if (!OidIsValid(to_server_proc))
 134                         return -1;
 135                 to_client_proc = FindDefaultConversionProc(current_server_encoding,
 136                                                                                                    encoding);
 137                 if (!OidIsValid(to_client_proc))
 138                         return -1;
 139
 140                 /*
 141                  * Done if not wanting to actually apply setting.
 142                  */
 143                 if (!doit)
 144                         return 0;
 145
 146                 /*
 147                  * Load the fmgr info into TopMemoryContext (could still fail here)
 148                  */
 149                 convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
 150                                                                                                            sizeof(ConvProcInfo));
 151                 convinfo->s_encoding = current_server_encoding;
 152                 convinfo->c_encoding = encoding;
 153                 fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
 154                                           TopMemoryContext);
 155                 fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
 156                                           TopMemoryContext);
 157
 158                 /* Attach new info to head of list */
 159                 oldcontext = MemoryContextSwitchTo(TopMemoryContext);
 160                 ConvProcList = lcons(convinfo, ConvProcList);
 161                 MemoryContextSwitchTo(oldcontext);
 162
 163                 /*
 164                  * Everything is okay, so apply the setting.
 165                  */
 166                 ClientEncoding = &pg_enc2name_tbl[encoding];
 167                 ToServerConvProc = &convinfo->to_server_info;
 168                 ToClientConvProc = &convinfo->to_client_info;
 169
 170                 /*
 171                  * Remove any older entry for the same encoding pair (this is just to
 172                  * avoid memory leakage).
 173                  */
 174                 foreach(lc, ConvProcList)
 175                 {
 176                         ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
 177
 178                         if (oldinfo == convinfo)
 179                                 continue;
 180                         if (oldinfo->s_encoding == convinfo->s_encoding &&
 181                                 oldinfo->c_encoding == convinfo->c_encoding)
 182                         {
 183                                 ConvProcList = list_delete_ptr(ConvProcList, oldinfo);
 184                                 pfree(oldinfo);
 185                                 break;                  /* need not look further */
 186                         }
 187                 }
 188
 189                 return 0;                               /* success */
 190         }
 191         else
 192         {
 193                 /*
 194                  * If we're not in a live transaction, the only thing we can do is
 195                  * restore a previous setting using the cache.  This covers all
 196                  * transaction-rollback cases.  The only case it might not work for is
 197                  * trying to change client_encoding on the fly by editing
 198                  * postgresql.conf and SIGHUP'ing.  Which would probably be a stupid
 199                  * thing to do anyway.
 200                  */
 201                 foreach(lc, ConvProcList)
 202                 {
 203                         ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
 204
 205                         if (oldinfo->s_encoding == current_server_encoding &&
 206                                 oldinfo->c_encoding == encoding)
 207                         {
 208                                 if (doit)
 209                                 {
 210                                         ClientEncoding = &pg_enc2name_tbl[encoding];
 211                                         ToServerConvProc = &oldinfo->to_server_info;
 212                                         ToClientConvProc = &oldinfo->to_client_info;
 213                                 }
 214                                 return 0;
 215                         }
 216                 }
 217
 218                 return -1;                              /* it's not cached, so fail */
 219         }
 220 }
 221
 222 /*
 223  * Initialize client encoding if necessary.
 224  *              called from InitPostgres() once during backend startup.
 225  */
 226 void
 227 InitializeClientEncoding(void)
 228 {
 229         Assert(!backend_startup_complete);
 230         backend_startup_complete = true;
 231
 232         if (SetClientEncoding(pending_client_encoding, true) < 0)
 233         {
 234                 /*
 235                  * Oops, the requested conversion is not available. We couldn't fail
 236                  * before, but we can now.
 237                  */
 238                 ereport(FATAL,
 239                                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 240                                  errmsg("conversion between %s and %s is not supported",
 241                                                 pg_enc2name_tbl[pending_client_encoding].name,
 242                                                 GetDatabaseEncodingName())));
 243         }
 244 }
 245
 246 /*
 247  * returns the current client encoding
 248  */
 249 int
 250 pg_get_client_encoding(void)
 251 {
 252         Assert(ClientEncoding);
 253         return ClientEncoding->encoding;
 254 }
 255
 256 /*
 257  * returns the current client encoding name
 258  */
 259 const char *
 260 pg_get_client_encoding_name(void)
 261 {
 262         Assert(ClientEncoding);
 263         return ClientEncoding->name;
 264 }
 265
 266 /*
 267  * Apply encoding conversion on src and return it. The encoding
 268  * conversion function is chosen from the pg_conversion system catalog
 269  * marked as "default". If it is not found in the schema search path,
 270  * it's taken from pg_catalog schema. If it even is not in the schema,
 271  * warn and return src.
 272  *
 273  * If conversion occurs, a palloc'd null-terminated string is returned.
 274  * In the case of no conversion, src is returned.
 275  *
 276  * CAUTION: although the presence of a length argument means that callers
 277  * can pass non-null-terminated strings, care is required because the same
 278  * string will be passed back if no conversion occurs.  Such callers *must*
 279  * check whether result == src and handle that case differently.
 280  *
 281  * Note: we try to avoid raising error, since that could get us into
 282  * infinite recursion when this function is invoked during error message
 283  * sending.  It should be OK to raise error for overlength strings though,
 284  * since the recursion will come with a shorter message.
 285  */
 286 unsigned char *
 287 pg_do_encoding_conversion(unsigned char *src, int len,
 288                                                   int src_encoding, int dest_encoding)
 289 {
 290         unsigned char *result;
 291         Oid                     proc;
 292
 293         if (!IsTransactionState())
 294                 return src;
 295
 296         if (src_encoding == dest_encoding)
 297                 return src;
 298
 299         if (src_encoding == PG_SQL_ASCII || dest_encoding == PG_SQL_ASCII)
 300                 return src;
 301
 302         if (len <= 0)
 303                 return src;
 304
 305         proc = FindDefaultConversionProc(src_encoding, dest_encoding);
 306         if (!OidIsValid(proc))
 307         {
 308                 ereport(LOG,
 309                                 (errcode(ERRCODE_UNDEFINED_FUNCTION),
 310                                  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
 311                                                 pg_encoding_to_char(src_encoding),
 312                                                 pg_encoding_to_char(dest_encoding))));
 313                 return src;
 314         }
 315
 316         /*
 317          * XXX we should avoid throwing errors in OidFunctionCall. Otherwise we
 318          * are going into infinite loop!  So we have to make sure that the
 319          * function exists before calling OidFunctionCall.
 320          */
 321         if (!SearchSysCacheExists(PROCOID,
 322                                                           ObjectIdGetDatum(proc),
 323                                                           0, 0, 0))
 324         {
 325                 elog(LOG, "cache lookup failed for function %u", proc);
 326                 return src;
 327         }
 328
 329         /*
 330          * Allocate space for conversion result, being wary of integer overflow
 331          */
 332         if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
 333                 ereport(ERROR,
 334                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 335                                  errmsg("out of memory"),
 336                  errdetail("String of %d bytes is too long for encoding conversion.",
 337                                    len)));
 338
 339         result = palloc(len * MAX_CONVERSION_GROWTH + 1);
 340
 341         OidFunctionCall5(proc,
 342                                          Int32GetDatum(src_encoding),
 343                                          Int32GetDatum(dest_encoding),
 344                                          CStringGetDatum(src),
 345                                          CStringGetDatum(result),
 346                                          Int32GetDatum(len));
 347         return result;
 348 }
 349
 350 /*
 351  * Convert string using encoding_name. The source
 352  * encoding is the DB encoding.
 353  *
 354  * BYTEA convert_to(TEXT string, NAME encoding_name) */
 355 Datum
 356 pg_convert_to(PG_FUNCTION_ARGS)
 357 {
 358         Datum           string = PG_GETARG_DATUM(0);
 359         Datum           dest_encoding_name = PG_GETARG_DATUM(1);
 360         Datum           src_encoding_name = DirectFunctionCall1(namein,
 361                                                                         CStringGetDatum(DatabaseEncoding->name));
 362         Datum           result;
 363
 364         /*
 365          * pg_convert expects a bytea as its first argument. We're passing it a
 366          * text argument here, relying on the fact that they are both in fact
 367          * varlena types, and thus structurally identical.
 368          */
 369         result = DirectFunctionCall3(pg_convert, string,
 370                                                                  src_encoding_name, dest_encoding_name);
 371
 372         PG_RETURN_DATUM(result);
 373 }
 374
 375 /*
 376  * Convert string using encoding_name. The destination
 377  * encoding is the DB encoding.
 378  *
 379  * TEXT convert_from(BYTEA string, NAME encoding_name) */
 380 Datum
 381 pg_convert_from(PG_FUNCTION_ARGS)
 382 {
 383         Datum           string = PG_GETARG_DATUM(0);
 384         Datum           src_encoding_name = PG_GETARG_DATUM(1);
 385         Datum           dest_encoding_name = DirectFunctionCall1(namein,
 386                                                                         CStringGetDatum(DatabaseEncoding->name));
 387         Datum           result;
 388
 389         result = DirectFunctionCall3(pg_convert, string,
 390                                                                  src_encoding_name, dest_encoding_name);
 391
 392         /*
 393          * pg_convert returns a bytea, which we in turn return as text, relying on
 394          * the fact that they are both in fact varlena types, and thus
 395          * structurally identical. Although not all bytea values are valid text,
 396          * in this case it will be because we've told pg_convert to return one
 397          * that is valid as text in the current database encoding.
 398          */
 399         PG_RETURN_DATUM(result);
 400 }
 401
 402 /*
 403  * Convert string using encoding_names.
 404  *
 405  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
 406  */
 407 Datum
 408 pg_convert(PG_FUNCTION_ARGS)
 409 {
 410         bytea      *string = PG_GETARG_BYTEA_P(0);
 411         char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
 412         int                     src_encoding = pg_char_to_encoding(src_encoding_name);
 413         char       *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
 414         int                     dest_encoding = pg_char_to_encoding(dest_encoding_name);
 415         unsigned char *result;
 416         bytea      *retval;
 417         unsigned char *str;
 418         int                     len;
 419
 420         if (src_encoding < 0)
 421                 ereport(ERROR,
 422                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 423                                  errmsg("invalid source encoding name \"%s\"",
 424                                                 src_encoding_name)));
 425         if (dest_encoding < 0)
 426                 ereport(ERROR,
 427                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 428                                  errmsg("invalid destination encoding name \"%s\"",
 429                                                 dest_encoding_name)));
 430
 431         /* make sure that source string is valid and null terminated */
 432         len = VARSIZE(string) - VARHDRSZ;
 433         pg_verify_mbstr(src_encoding, VARDATA(string), len, false);
 434         str = palloc(len + 1);
 435         memcpy(str, VARDATA(string), len);
 436         *(str + len) = '\0';
 437
 438         result = pg_do_encoding_conversion(str, len, src_encoding, dest_encoding);
 439
 440         /*
 441          * build bytea data type structure.
 442          */
 443         len = strlen((char *) result) + VARHDRSZ;
 444         retval = palloc(len);
 445         SET_VARSIZE(retval, len);
 446         memcpy(VARDATA(retval), result, len - VARHDRSZ);
 447
 448         if (result != str)
 449                 pfree(result);
 450         pfree(str);
 451
 452         /* free memory if allocated by the toaster */
 453         PG_FREE_IF_COPY(string, 0);
 454
 455         PG_RETURN_BYTEA_P(retval);
 456 }
 457
 458 /*
 459  * get the length of the string considered as text in the specified
 460  * encoding. Raises an error if the data is not valid in that
 461  * encoding.
 462  *
 463  * INT4 length (BYTEA string, NAME src_encoding_name)
 464  */
 465 Datum
 466 length_in_encoding(PG_FUNCTION_ARGS)
 467 {
 468         bytea      *string = PG_GETARG_BYTEA_P(0);
 469         char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
 470         int                     src_encoding = pg_char_to_encoding(src_encoding_name);
 471         int                     len = VARSIZE(string) - VARHDRSZ;
 472         int                     retval;
 473
 474         if (src_encoding < 0)
 475                 ereport(ERROR,
 476                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 477                                  errmsg("invalid encoding name \"%s\"",
 478                                                 src_encoding_name)));
 479
 480         retval = pg_verify_mbstr_len(src_encoding, VARDATA(string), len, false);
 481         PG_RETURN_INT32(retval);
 482
 483 }
 484
 485 Datum
 486 pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
 487 {
 488         int encoding = PG_GETARG_INT32(0);
 489
 490         if (PG_VALID_ENCODING(encoding))
 491                 PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
 492         else
 493                 PG_RETURN_NULL();
 494 }
 495
 496 /*
 497  * convert client encoding to server encoding.
 498  */
 499 char *
 500 pg_client_to_server(const char *s, int len)
 501 {
 502         Assert(DatabaseEncoding);
 503         Assert(ClientEncoding);
 504
 505         if (len <= 0)
 506                 return (char *) s;
 507
 508         if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
 509                 ClientEncoding->encoding == PG_SQL_ASCII)
 510         {
 511                 /*
 512                  * No conversion is needed, but we must still validate the data.
 513                  */
 514                 (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
 515                 return (char *) s;
 516         }
 517
 518         if (DatabaseEncoding->encoding == PG_SQL_ASCII)
 519         {
 520                 /*
 521                  * No conversion is possible, but we must still validate the data,
 522                  * because the client-side code might have done string escaping using
 523                  * the selected client_encoding.  If the client encoding is ASCII-safe
 524                  * then we just do a straight validation under that encoding.  For an
 525                  * ASCII-unsafe encoding we have a problem: we dare not pass such data
 526                  * to the parser but we have no way to convert it.      We compromise by
 527                  * rejecting the data if it contains any non-ASCII characters.
 528                  */
 529                 if (PG_VALID_BE_ENCODING(ClientEncoding->encoding))
 530                         (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false);
 531                 else
 532                 {
 533                         int                     i;
 534
 535                         for (i = 0; i < len; i++)
 536                         {
 537                                 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
 538                                         ereport(ERROR,
 539                                                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 540                                          errmsg("invalid byte value for encoding \"%s\": 0x%02x",
 541                                                         pg_enc2name_tbl[PG_SQL_ASCII].name,
 542                                                         (unsigned char) s[i])));
 543                         }
 544                 }
 545                 return (char *) s;
 546         }
 547
 548         return perform_default_encoding_conversion(s, len, true);
 549 }
 550
 551 /*
 552  * convert server encoding to client encoding.
 553  */
 554 char *
 555 pg_server_to_client(const char *s, int len)
 556 {
 557         Assert(DatabaseEncoding);
 558         Assert(ClientEncoding);
 559
 560         if (len <= 0)
 561                 return (char *) s;
 562
 563         if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
 564                 ClientEncoding->encoding == PG_SQL_ASCII ||
 565                 DatabaseEncoding->encoding == PG_SQL_ASCII)
 566                 return (char *) s;              /* assume data is valid */
 567
 568         return perform_default_encoding_conversion(s, len, false);
 569 }
 570
 571 /*
 572  *      Perform default encoding conversion using cached FmgrInfo. Since
 573  *      this function does not access database at all, it is safe to call
 574  *      outside transactions.  If the conversion has not been set up by
 575  *      SetClientEncoding(), no conversion is performed.
 576  */
 577 static char *
 578 perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server)
 579 {
 580         char       *result;
 581         int                     src_encoding,
 582                                 dest_encoding;
 583         FmgrInfo   *flinfo;
 584
 585         if (is_client_to_server)
 586         {
 587                 src_encoding = ClientEncoding->encoding;
 588                 dest_encoding = DatabaseEncoding->encoding;
 589                 flinfo = ToServerConvProc;
 590         }
 591         else
 592         {
 593                 src_encoding = DatabaseEncoding->encoding;
 594                 dest_encoding = ClientEncoding->encoding;
 595                 flinfo = ToClientConvProc;
 596         }
 597
 598         if (flinfo == NULL)
 599                 return (char *) src;
 600
 601         /*
 602          * Allocate space for conversion result, being wary of integer overflow
 603          */
 604         if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
 605                 ereport(ERROR,
 606                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 607                                  errmsg("out of memory"),
 608                  errdetail("String of %d bytes is too long for encoding conversion.",
 609                                    len)));
 610
 611         result = palloc(len * MAX_CONVERSION_GROWTH + 1);
 612
 613         FunctionCall5(flinfo,
 614                                   Int32GetDatum(src_encoding),
 615                                   Int32GetDatum(dest_encoding),
 616                                   CStringGetDatum(src),
 617                                   CStringGetDatum(result),
 618                                   Int32GetDatum(len));
 619         return result;
 620 }
 621
 622
 623
 624 #ifdef USE_WIDE_UPPER_LOWER
 625
 /*
  * wchar2char --- convert wide characters to multibyte format
  *
  * The API matches that of the standard wcstombs() function: tolen is the
  * maximum number of bytes to store at *to, and *from must be
  * zero-terminated.  The output is zero-terminated iff there is room.
  * Returns 0 immediately when tolen is 0.
  */
 size_t
 wchar2char(char *to, const wchar_t *from, size_t tolen)
 {
     if (tolen == 0)
         return 0;

 #ifdef WIN32

     /*
      * On Windows, the "Unicode" locales assume UTF16 not UTF8 encoding, and
      * for some reason mbstowcs and wcstombs won't do this for us, so
      * WideCharToMultiByte() is used for a UTF8 database.
      */
     if (GetDatabaseEncoding() == PG_UTF8)
     {
         size_t      result;

         result = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
                                      NULL, NULL);

         /* A zero return is failure */
         if (result <= 0)
             return -1;

         Assert(result <= tolen);

         /* Microsoft counts the zero terminator in the result */
         return result - 1;
     }
 #endif   /* WIN32 */

     Assert(!lc_ctype_is_c());
     return wcstombs(to, from, tolen);
 }
 670
 671 /*
 672  * char2wchar --- convert multibyte characters to wide characters
 673  *
 674  * This has almost the API of mbstowcs(), except that *from need not be
 675  * null-terminated; instead, the number of input bytes is specified as
 676  * fromlen.  Also, we ereport() rather than returning -1 for invalid
 677  * input encoding.      tolen is the maximum number of wchar_t's to store at *to.
 678  * The output will be zero-terminated iff there is room.
 679  */
 680 size_t
 681 char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen)
 682 {
 683         size_t          result;
 684
 685         if (tolen == 0)
 686                 return 0;
 687
 688 #ifdef WIN32
 689         /* See WIN32 "Unicode" comment above */
 690         if (GetDatabaseEncoding() == PG_UTF8)
 691         {
 692                 /* Win32 API does not work for zero-length input */
 693                 if (fromlen == 0)
 694                         result = 0;
 695                 else
 696                 {
 697                         result = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen - 1);
 698                         /* A zero return is failure */
 699                         if (result == 0)
 700                                 result = -1;
 701                 }
 702
 703                 if (result != -1)
 704                 {
 705                         Assert(result < tolen);
 706                         /* Append trailing null wchar (MultiByteToWideChar() does not) */
 707                         to[result] = 0;
 708                 }
 709         }
 710         else
 711 #endif   /* WIN32 */
 712         {
 713                 /* mbstowcs requires ending '\0' */
 714                 char       *str = pnstrdup(from, fromlen);
 715
 716                 Assert(!lc_ctype_is_c());
 717                 result = mbstowcs(to, str, tolen);
 718                 pfree(str);
 719         }
 720
 721         if (result == -1)
 722         {
 723                 /*
 724                  * Invalid multibyte character encountered.  We try to give a useful
 725                  * error message by letting pg_verifymbstr check the string.  But it's
 726                  * possible that the string is OK to us, and not OK to mbstowcs ---
 727                  * this suggests that the LC_CTYPE locale is different from the
 728                  * database encoding.  Give a generic error message if verifymbstr
 729                  * can't find anything wrong.
 730                  */
 731                 pg_verifymbstr(from, fromlen, false);   /* might not return */
 732                 /* but if it does ... */
 733                 ereport(ERROR,
 734                                 (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 735                                  errmsg("invalid multibyte character for locale"),
 736                                  errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
 737         }
 738
 739         return result;
 740 }
 741 #endif
 742
 743 /* convert a multibyte string to a wchar */
 744 int
 745 pg_mb2wchar(const char *from, pg_wchar *to)
 746 {
 747         return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, strlen(from));
 748 }
 749
 750 /* convert a multibyte string to a wchar with a limited length */
 751 int
 752 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
 753 {
 754         return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
 755 }
 756
 757 /* same, with any encoding */
 758 int
 759 pg_encoding_mb2wchar_with_len(int encoding,
 760                                                           const char *from, pg_wchar *to, int len)
 761 {
 762         return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
 763 }
 764
 765 /* returns the byte length of a multibyte character */
 766 int
 767 pg_mblen(const char *mbstr)
 768 {
 769         return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) ((const unsigned char *) mbstr));
 770 }
 771
 772 /* returns the display length of a multibyte character */
 773 int
 774 pg_dsplen(const char *mbstr)
 775 {
 776         return ((*pg_wchar_table[DatabaseEncoding->encoding].dsplen) ((const unsigned char *) mbstr));
 777 }
 778
 779 /* returns the length (counted in wchars) of a multibyte string */
 780 int
 781 pg_mbstrlen(const char *mbstr)
 782 {
 783         int                     len = 0;
 784
 785         /* optimization for single byte encoding */
 786         if (pg_database_encoding_max_length() == 1)
 787                 return strlen(mbstr);
 788
 789         while (*mbstr)
 790         {
 791                 mbstr += pg_mblen(mbstr);
 792                 len++;
 793         }
 794         return len;
 795 }
 796
 797 /* returns the length (counted in wchars) of a multibyte string
 798  * (not necessarily NULL terminated)
 799  */
 800 int
 801 pg_mbstrlen_with_len(const char *mbstr, int limit)
 802 {
 803         int                     len = 0;
 804
 805         /* optimization for single byte encoding */
 806         if (pg_database_encoding_max_length() == 1)
 807                 return limit;
 808
 809         while (limit > 0 && *mbstr)
 810         {
 811                 int                     l = pg_mblen(mbstr);
 812
 813                 limit -= l;
 814                 mbstr += l;
 815                 len++;
 816         }
 817         return len;
 818 }
 819
 820 /*
 821  * returns the byte length of a multibyte string
 822  * (not necessarily NULL terminated)
 823  * that is no longer than limit.
 824  * this function does not break multibyte character boundary.
 825  */
 826 int
 827 pg_mbcliplen(const char *mbstr, int len, int limit)
 828 {
 829         return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
 830                                                                  len, limit);
 831 }
 832
 833 /*
 834  * pg_mbcliplen with specified encoding
 835  */
 836 int
 837 pg_encoding_mbcliplen(int encoding, const char *mbstr,
 838                                           int len, int limit)
 839 {
 840         mblen_converter mblen_fn;
 841         int                     clen = 0;
 842         int                     l;
 843
 844         /* optimization for single byte encoding */
 845         if (pg_encoding_max_length(encoding) == 1)
 846                 return cliplen(mbstr, len, limit);
 847
 848         mblen_fn = pg_wchar_table[encoding].mblen;
 849
 850         while (len > 0 && *mbstr)
 851         {
 852                 l = (*mblen_fn) ((const unsigned char *) mbstr);
 853                 if ((clen + l) > limit)
 854                         break;
 855                 clen += l;
 856                 if (clen == limit)
 857                         break;
 858                 len -= l;
 859                 mbstr += l;
 860         }
 861         return clen;
 862 }
 863
 864 /*
 865  * Similar to pg_mbcliplen except the limit parameter specifies the
 866  * character length, not the byte length.
 867  */
 868 int
 869 pg_mbcharcliplen(const char *mbstr, int len, int limit)
 870 {
 871         int                     clen = 0;
 872         int                     nch = 0;
 873         int                     l;
 874
 875         /* optimization for single byte encoding */
 876         if (pg_database_encoding_max_length() == 1)
 877                 return cliplen(mbstr, len, limit);
 878
 879         while (len > 0 && *mbstr)
 880         {
 881                 l = pg_mblen(mbstr);
 882                 nch++;
 883                 if (nch > limit)
 884                         break;
 885                 clen += l;
 886                 len -= l;
 887                 mbstr += l;
 888         }
 889         return clen;
 890 }
 891
 892 /* mbcliplen for any single-byte encoding */
 893 static int
 894 cliplen(const char *str, int len, int limit)
 895 {
 896         int                     l = 0;
 897
 898         len = Min(len, limit);
 899         while (l < len && str[l])
 900                 l++;
 901         return l;
 902 }
 903
 904 void
 905 SetDatabaseEncoding(int encoding)
 906 {
 907         if (!PG_VALID_BE_ENCODING(encoding))
 908                 elog(ERROR, "invalid database encoding: %d", encoding);
 909
 910         DatabaseEncoding = &pg_enc2name_tbl[encoding];
 911         Assert(DatabaseEncoding->encoding == encoding);
 912 }
 913
 914 /*
 915  * Bind gettext to the codeset equivalent with the database encoding.
 916  */
 917 void
 918 pg_bind_textdomain_codeset(const char *domainname)
 919 {
 920 #if defined(ENABLE_NLS)
 921         int                     encoding = GetDatabaseEncoding();
 922         int                     i;
 923
 924         /*
 925          * gettext() uses the codeset specified by LC_CTYPE by default, so if that
 926          * matches the database encoding we don't need to do anything. In CREATE
 927          * DATABASE, we enforce or trust that the locale's codeset matches
 928          * database encoding, except for the C locale. In C locale, we bind
 929          * gettext() explicitly to the right codeset.
 930          *
 931          * On Windows, though, gettext() tends to get confused so we always bind
 932          * it.
 933          */
 934 #ifndef WIN32
 935         const char *ctype = setlocale(LC_CTYPE, NULL);
 936
 937         if (pg_strcasecmp(ctype, "C") != 0 && pg_strcasecmp(ctype, "POSIX") != 0)
 938                 return;
 939 #endif
 940
 941         for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
 942         {
 943                 if (pg_enc2gettext_tbl[i].encoding == encoding)
 944                 {
 945                         if (bind_textdomain_codeset(domainname,
 946                                                                                 pg_enc2gettext_tbl[i].name) == NULL)
 947                                 elog(LOG, "bind_textdomain_codeset failed");
 948                         break;
 949                 }
 950         }
 951 #endif
 952 }
 953
 954 int
 955 GetDatabaseEncoding(void)
 956 {
 957         Assert(DatabaseEncoding);
 958         return DatabaseEncoding->encoding;
 959 }
 960
 961 const char *
 962 GetDatabaseEncodingName(void)
 963 {
 964         Assert(DatabaseEncoding);
 965         return DatabaseEncoding->name;
 966 }
 967
 968 Datum
 969 getdatabaseencoding(PG_FUNCTION_ARGS)
 970 {
 971         Assert(DatabaseEncoding);
 972         return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
 973 }
 974
 975 Datum
 976 pg_client_encoding(PG_FUNCTION_ARGS)
 977 {
 978         Assert(ClientEncoding);
 979         return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
 980 }