src/backend/utils/mb/conv.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  *        Utility functions for conversion procs.
   4  *
   5  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
   6  * Portions Copyright (c) 1994, Regents of the University of California
   7  *
   8  * IDENTIFICATION
   9  *        $PostgreSQL$
  10  *
  11  *-------------------------------------------------------------------------
  12  */
  13 #include "postgres.h"
  14 #include "mb/pg_wchar.h"
  15
  16
  17 /*
  18  * LATINn ---> MIC when the charset's local codes map directly to MIC
  19  *
  20  * l points to the source string of length len
  21  * p is the output area (must be large enough!)
  22  * lc is the mule character set id for the local encoding
  23  * encoding is the PG identifier for the local encoding
  24  */
  25 void
  26 latin2mic(const unsigned char *l, unsigned char *p, int len,
  27                   int lc, int encoding)
  28 {
  29         int                     c1;
  30
  31         while (len > 0)
  32         {
  33                 c1 = *l;
  34                 if (c1 == 0)
  35                         report_invalid_encoding(encoding, (const char *) l, len);
  36                 if (IS_HIGHBIT_SET(c1))
  37                         *p++ = lc;
  38                 *p++ = c1;
  39                 l++;
  40                 len--;
  41         }
  42         *p = '\0';
  43 }
  44
  45 /*
  46  * MIC ---> LATINn when the charset's local codes map directly to MIC
  47  *
  48  * mic points to the source string of length len
  49  * p is the output area (must be large enough!)
  50  * lc is the mule character set id for the local encoding
  51  * encoding is the PG identifier for the local encoding
  52  */
  53 void
  54 mic2latin(const unsigned char *mic, unsigned char *p, int len,
  55                   int lc, int encoding)
  56 {
  57         int                     c1;
  58
  59         while (len > 0)
  60         {
  61                 c1 = *mic;
  62                 if (c1 == 0)
  63                         report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
  64                 if (!IS_HIGHBIT_SET(c1))
  65                 {
  66                         /* easy for ASCII */
  67                         *p++ = c1;
  68                         mic++;
  69                         len--;
  70                 }
  71                 else
  72                 {
  73                         int                     l = pg_mic_mblen(mic);
  74
  75                         if (len < l)
  76                                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
  77                                                                                 len);
  78                         if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
  79                                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
  80                                                                                    (const char *) mic, len);
  81                         *p++ = mic[1];
  82                         mic += 2;
  83                         len -= 2;
  84                 }
  85         }
  86         *p = '\0';
  87 }
  88
  89
  90 /*
  91  * ASCII ---> MIC
  92  *
  93  * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
  94  * characters, here we must take a hard line because we don't know
  95  * the appropriate MIC equivalent.
  96  */
  97 void
  98 pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
  99 {
 100         int                     c1;
 101
 102         while (len > 0)
 103         {
 104                 c1 = *l;
 105                 if (c1 == 0 || IS_HIGHBIT_SET(c1))
 106                         report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
 107                 *p++ = c1;
 108                 l++;
 109                 len--;
 110         }
 111         *p = '\0';
 112 }
 113
 114 /*
 115  * MIC ---> ASCII
 116  */
 117 void
 118 pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
 119 {
 120         int                     c1;
 121
 122         while (len > 0)
 123         {
 124                 c1 = *mic;
 125                 if (c1 == 0 || IS_HIGHBIT_SET(c1))
 126                         report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
 127                                                                            (const char *) mic, len);
 128                 *p++ = c1;
 129                 mic++;
 130                 len--;
 131         }
 132         *p = '\0';
 133 }
 134
 135 /*
 136  * latin2mic_with_table: a generic single byte charset encoding
 137  * conversion from a local charset to the mule internal code.
 138  *
 139  * l points to the source string of length len
 140  * p is the output area (must be large enough!)
 141  * lc is the mule character set id for the local encoding
 142  * encoding is the PG identifier for the local encoding
 143  * tab holds conversion entries for the local charset
 144  * starting from 128 (0x80). each entry in the table
 145  * holds the corresponding code point for the mule internal code.
 146  */
 147 void
 148 latin2mic_with_table(const unsigned char *l,
 149                                          unsigned char *p,
 150                                          int len,
 151                                          int lc,
 152                                          int encoding,
 153                                          const unsigned char *tab)
 154 {
 155         unsigned char c1,
 156                                 c2;
 157
 158         while (len > 0)
 159         {
 160                 c1 = *l;
 161                 if (c1 == 0)
 162                         report_invalid_encoding(encoding, (const char *) l, len);
 163                 if (!IS_HIGHBIT_SET(c1))
 164                         *p++ = c1;
 165                 else
 166                 {
 167                         c2 = tab[c1 - HIGHBIT];
 168                         if (c2)
 169                         {
 170                                 *p++ = lc;
 171                                 *p++ = c2;
 172                         }
 173                         else
 174                                 report_untranslatable_char(encoding, PG_MULE_INTERNAL,
 175                                                                                    (const char *) l, len);
 176                 }
 177                 l++;
 178                 len--;
 179         }
 180         *p = '\0';
 181 }
 182
 183 /*
 184  * mic2latin_with_table: a generic single byte charset encoding
 185  * conversion from the mule internal code to a local charset.
 186  *
 187  * mic points to the source string of length len
 188  * p is the output area (must be large enough!)
 189  * lc is the mule character set id for the local encoding
 190  * encoding is the PG identifier for the local encoding
 191  * tab holds conversion entries for the mule internal code's
 192  * second byte, starting from 128 (0x80). each entry in the table
 193  * holds the corresponding code point for the local charset.
 194  */
 195 void
 196 mic2latin_with_table(const unsigned char *mic,
 197                                          unsigned char *p,
 198                                          int len,
 199                                          int lc,
 200                                          int encoding,
 201                                          const unsigned char *tab)
 202 {
 203         unsigned char c1,
 204                                 c2;
 205
 206         while (len > 0)
 207         {
 208                 c1 = *mic;
 209                 if (c1 == 0)
 210                         report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
 211                 if (!IS_HIGHBIT_SET(c1))
 212                 {
 213                         /* easy for ASCII */
 214                         *p++ = c1;
 215                         mic++;
 216                         len--;
 217                 }
 218                 else
 219                 {
 220                         int                     l = pg_mic_mblen(mic);
 221
 222                         if (len < l)
 223                                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
 224                                                                                 len);
 225                         if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
 226                                 (c2 = tab[mic[1] - HIGHBIT]) == 0)
 227                         {
 228                                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
 229                                                                                    (const char *) mic, len);
 230                                 break;                  /* keep compiler quiet */
 231                         }
 232                         *p++ = c2;
 233                         mic += 2;
 234                         len -= 2;
 235                 }
 236         }
 237         *p = '\0';
 238 }
 239
 240 /*
 241  * comparison routine for bsearch()
 242  * this routine is intended for UTF8 -> local code
 243  */
 244 static int
 245 compare1(const void *p1, const void *p2)
 246 {
 247         uint32          v1,
 248                                 v2;
 249
 250         v1 = *(uint32 *) p1;
 251         v2 = ((pg_utf_to_local *) p2)->utf;
 252         return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
 253 }
 254
 255 /*
 256  * comparison routine for bsearch()
 257  * this routine is intended for local code -> UTF8
 258  */
 259 static int
 260 compare2(const void *p1, const void *p2)
 261 {
 262         uint32          v1,
 263                                 v2;
 264
 265         v1 = *(uint32 *) p1;
 266         v2 = ((pg_local_to_utf *) p2)->code;
 267         return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
 268 }
 269
 270 /*
 271  * comparison routine for bsearch()
 272  * this routine is intended for combined UTF8 -> local code
 273  */
 274 static int
 275 compare3(const void *p1, const void *p2)
 276 {
 277         uint32          s1,
 278                                 s2,
 279                                 d1,
 280                                 d2;
 281
 282         s1 = *(uint32 *) p1;
 283         s2 = *((uint32 *) p1 + 1);
 284         d1 = ((pg_utf_to_local_combined *) p2)->utf1;
 285         d2 = ((pg_utf_to_local_combined *) p2)->utf2;
 286         return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
 287 }
 288
 289 /*
 290  * comparison routine for bsearch()
 291  * this routine is intended for local code -> combined UTF8
 292  */
 293 static int
 294 compare4(const void *p1, const void *p2)
 295 {
 296         uint32          v1,
 297                                 v2;
 298
 299         v1 = *(uint32 *) p1;
 300         v2 = ((pg_local_to_utf_combined *) p2)->code;
 301         return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
 302 }
 303
 304 /*
 305  * convert 32bit wide character to mutibye stream pointed to by iso
 306  */
 307 static unsigned char *
 308 set_iso_code(unsigned char *iso, uint32 code)
 309 {
 310         if (code & 0xff000000)
 311                 *iso++ = code >> 24;
 312         if (code & 0x00ff0000)
 313                 *iso++ = (code & 0x00ff0000) >> 16;
 314         if (code & 0x0000ff00)
 315                 *iso++ = (code & 0x0000ff00) >> 8;
 316         if (code & 0x000000ff)
 317                 *iso++ = code & 0x000000ff;
 318         return iso;
 319 }
 320
 321 /*
 322  * UTF8 ---> local code
 323  *
 324  * utf: input UTF8 string (need not be null-terminated).
 325  * iso: pointer to the output area (must be large enough!)
 326  * map: the conversion map.
 327  * cmap: the conversion map for combined characters.
 328  *                (optional)
 329  * size1: the size of the conversion map.
 330  * size2: the size of the conversion map for combined characters
 331  *                (optional)
 332  * encoding: the PG identifier for the local encoding.
 333  * len: length of input string.
 334  */
 335 void
 336 UtfToLocal(const unsigned char *utf, unsigned char *iso,
 337                    const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap,
 338                    int size1, int size2, int encoding, int len)
 339 {
 340         uint32          iutf;
 341         uint32          cutf[2];
 342         uint32          code;
 343         pg_utf_to_local *p;
 344         pg_utf_to_local_combined *cp;
 345         int                     l;
 346
 347         for (; len > 0; len -= l)
 348         {
 349                 /* "break" cases all represent errors */
 350                 if (*utf == '\0')
 351                         break;
 352
 353                 l = pg_utf_mblen(utf);
 354
 355                 if (len < l)
 356                         break;
 357
 358                 if (!pg_utf8_islegal(utf, l))
 359                         break;
 360
 361                 if (l == 1)
 362                 {
 363                         /* ASCII case is easy */
 364                         *iso++ = *utf++;
 365                         continue;
 366                 }
 367                 else if (l == 2)
 368                 {
 369                         iutf = *utf++ << 8;
 370                         iutf |= *utf++;
 371                 }
 372                 else if (l == 3)
 373                 {
 374                         iutf = *utf++ << 16;
 375                         iutf |= *utf++ << 8;
 376                         iutf |= *utf++;
 377                 }
 378                 else if (l == 4)
 379                 {
 380                         iutf = *utf++ << 24;
 381                         iutf |= *utf++ << 16;
 382                         iutf |= *utf++ << 8;
 383                         iutf |= *utf++;
 384                 }
 385
 386                 /*
 387                  * first, try with combined map if possible
 388                  */
 389                 if (cmap && len > l)
 390                 {
 391                         const unsigned char *utf_save = utf;
 392                         int                     len_save = len;
 393                         int                     l_save = l;
 394
 395                         len -= l;
 396
 397                         l = pg_utf_mblen(utf);
 398                         if (len < l)
 399                                 break;
 400
 401                         if (!pg_utf8_islegal(utf, l))
 402                                 break;
 403
 404                         cutf[0] = iutf;
 405
 406                         if (l == 1)
 407                         {
 408                                 if (len_save > 1)
 409                                 {
 410                                         p = bsearch(&cutf[0], map, size1,
 411                                                                 sizeof(pg_utf_to_local), compare1);
 412                                         if (p == NULL)
 413                                                 report_untranslatable_char(PG_UTF8, encoding,
 414                                                            (const char *) (utf_save - l_save), len_save);
 415                                         iso = set_iso_code(iso, p->code);
 416                                 }
 417
 418                                 /* ASCII case is easy */
 419                                 *iso++ = *utf++;
 420                                 continue;
 421                         }
 422                         else if (l == 2)
 423                         {
 424                                 iutf = *utf++ << 8;
 425                                 iutf |= *utf++;
 426                         }
 427                         else if (l == 3)
 428                         {
 429                                 iutf = *utf++ << 16;
 430                                 iutf |= *utf++ << 8;
 431                                 iutf |= *utf++;
 432                         }
 433                         else if (l == 4)
 434                         {
 435                                 iutf = *utf++ << 24;
 436                                 iutf |= *utf++ << 16;
 437                                 iutf |= *utf++ << 8;
 438                                 iutf |= *utf++;
 439                         }
 440
 441                         cutf[1] = iutf;
 442                         cp = bsearch(cutf, cmap, size2,
 443                                                  sizeof(pg_utf_to_local_combined), compare3);
 444                         if (cp)
 445                                 code = cp->code;
 446                         else
 447                         {
 448                                 /* not found in combined map. try with ordinary map */
 449                                 p = bsearch(&cutf[0], map, size1,
 450                                                         sizeof(pg_utf_to_local), compare1);
 451                                 if (p == NULL)
 452                                         report_untranslatable_char(PG_UTF8, encoding,
 453                                                            (const char *) (utf_save - l_save), len_save);
 454                                 iso = set_iso_code(iso, p->code);
 455
 456                                 p = bsearch(&cutf[1], map, size1,
 457                                                         sizeof(pg_utf_to_local), compare1);
 458                                 if (p == NULL)
 459                                         report_untranslatable_char(PG_UTF8, encoding,
 460                                                                                            (const char *) (utf - l), len);
 461                                 code = p->code;
 462                         }
 463                 }
 464                 else    /* no cmap or no remaining data */
 465                 {
 466                         p = bsearch(&iutf, map, size1,
 467                                                 sizeof(pg_utf_to_local), compare1);
 468                         if (p == NULL)
 469                                 report_untranslatable_char(PG_UTF8, encoding,
 470                                                                                    (const char *) (utf - l), len);
 471                         code = p->code;
 472                 }
 473                 iso = set_iso_code(iso, code);
 474         }
 475
 476         if (len > 0)
 477                 report_invalid_encoding(PG_UTF8, (const char *) utf, len);
 478
 479         *iso = '\0';
 480 }
 481
 482 /*
 483  * local code ---> UTF8
 484  *
 485  * iso: input local string (need not be null-terminated).
 486  * utf: pointer to the output area (must be large enough!)
 487  * map: the conversion map.
 488  * cmap: the conversion map for combined characters.
 489  *                (optional)
 490  * size1: the size of the conversion map.
 491  * size2: the size of the conversion map for combined characters
 492  *                (optional)
 493  * encoding: the PG identifier for the local encoding.
 494  * len: length of input string.
 495  */
 496 void
 497 LocalToUtf(const unsigned char *iso, unsigned char *utf,
 498                    const pg_local_to_utf *map, const pg_local_to_utf_combined *cmap,
 499                    int size1, int size2, int encoding, int len)
 500 {
 501         unsigned int iiso;
 502         int                     l;
 503         pg_local_to_utf *p;
 504         pg_local_to_utf_combined *cp;
 505
 506         if (!PG_VALID_ENCODING(encoding))
 507                 ereport(ERROR,
 508                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 509                                  errmsg("invalid encoding number: %d", encoding)));
 510
 511         for (; len > 0; len -= l)
 512         {
 513                 /* "break" cases all represent errors */
 514                 if (*iso == '\0')
 515                         break;
 516
 517                 if (!IS_HIGHBIT_SET(*iso))
 518                 {
 519                         /* ASCII case is easy */
 520                         *utf++ = *iso++;
 521                         l = 1;
 522                         continue;
 523                 }
 524
 525                 l = pg_encoding_verifymb(encoding, (const char *) iso, len);
 526                 if (l < 0)
 527                         break;
 528
 529                 if (l == 1)
 530                         iiso = *iso++;
 531                 else if (l == 2)
 532                 {
 533                         iiso = *iso++ << 8;
 534                         iiso |= *iso++;
 535                 }
 536                 else if (l == 3)
 537                 {
 538                         iiso = *iso++ << 16;
 539                         iiso |= *iso++ << 8;
 540                         iiso |= *iso++;
 541                 }
 542                 else if (l == 4)
 543                 {
 544                         iiso = *iso++ << 24;
 545                         iiso |= *iso++ << 16;
 546                         iiso |= *iso++ << 8;
 547                         iiso |= *iso++;
 548                 }
 549
 550                 p = bsearch(&iiso, map, size1,
 551                                         sizeof(pg_local_to_utf), compare2);
 552
 553                 if (p == NULL)
 554                 {
 555                         /*
 556                          * not found in the ordinary map. if there's a combined character
 557                          * map, try with it
 558                          */
 559                         if (cmap)
 560                         {
 561                                 cp = bsearch(&iiso, cmap, size2,
 562                                                          sizeof(pg_local_to_utf_combined), compare4);
 563
 564                                 if (cp)
 565                                 {
 566                                         if (cp->utf1 & 0xff000000)
 567                                                 *utf++ = cp->utf1 >> 24;
 568                                         if (cp->utf1 & 0x00ff0000)
 569                                                 *utf++ = (cp->utf1 & 0x00ff0000) >> 16;
 570                                         if (cp->utf1 & 0x0000ff00)
 571                                                 *utf++ = (cp->utf1 & 0x0000ff00) >> 8;
 572                                         if (cp->utf1 & 0x000000ff)
 573                                                 *utf++ = cp->utf1 & 0x000000ff;
 574
 575                                         if (cp->utf2 & 0xff000000)
 576                                                 *utf++ = cp->utf2 >> 24;
 577                                         if (cp->utf2 & 0x00ff0000)
 578                                                 *utf++ = (cp->utf2 & 0x00ff0000) >> 16;
 579                                         if (cp->utf2 & 0x0000ff00)
 580                                                 *utf++ = (cp->utf2 & 0x0000ff00) >> 8;
 581                                         if (cp->utf2 & 0x000000ff)
 582                                                 *utf++ = cp->utf2 & 0x000000ff;
 583
 584                                         continue;
 585                                 }
 586                         }
 587
 588                         report_untranslatable_char(encoding, PG_UTF8,
 589                                                                            (const char *) (iso - l), len);
 590
 591                 }
 592                 else
 593                 {
 594                         if (p->utf & 0xff000000)
 595                                 *utf++ = p->utf >> 24;
 596                         if (p->utf & 0x00ff0000)
 597                                 *utf++ = (p->utf & 0x00ff0000) >> 16;
 598                         if (p->utf & 0x0000ff00)
 599                                 *utf++ = (p->utf & 0x0000ff00) >> 8;
 600                         if (p->utf & 0x000000ff)
 601                                 *utf++ = p->utf & 0x000000ff;
 602                 }
 603         }
 604
 605         if (len > 0)
 606                 report_invalid_encoding(encoding, (const char *) iso, len);
 607
 608         *utf = '\0';
 609 }