lib/striconveh.c

   1 /* Character set conversion with error handling.
   2    Copyright (C) 2001-2024 Free Software Foundation, Inc.
   3    Written by Bruno Haible and Simon Josefsson.
   4
   5    This file is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU Lesser General Public License as
   7    published by the Free Software Foundation; either version 2.1 of the
   8    License, or (at your option) any later version.
   9
  10    This file is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  17
  18 #include <config.h>
  19
  20 /* Specification.  */
  21 #include "striconveh.h"
  22
  23 #include <errno.h>
  24 #include <stdlib.h>
  25 #include <string.h>
  26
  27 #if HAVE_ICONV
  28 # include <iconv.h>
  29 # include "unistr.h"
  30 #endif
  31
  32 #include "c-strcase.h"
  33 #include "c-strcaseeq.h"
  34
  35 #ifndef SIZE_MAX
  36 # define SIZE_MAX ((size_t) -1)
  37 #endif
  38
  39
  40 #if HAVE_ICONV
  41
  42 /* The caller must provide an iconveh_t, not just an iconv_t, because when a
  43    conversion error occurs, we may have to determine the Unicode representation
  44    of the inconvertible character.  */
     /* Open the conversion descriptors needed for converting from FROM_CODESET
        to TO_CODESET and store them in *CDP:
          - cd  converts from FROM_CODESET directly to TO_CODESET.  It may be
                (iconv_t)(-1): a failure to open it is not reported as an
                error by this function.
          - cd1 converts from FROM_CODESET to UTF-8, or is (iconv_t)(-1) if
                FROM_CODESET matches "UTF-8" according to STRCASEEQ.
          - cd2 converts from UTF-8 to TO_CODESET, or is (iconv_t)(-1) if
                TO_CODESET matches "UTF-8" according to STRCASEEQ or, on the
                platforms selected by the preprocessor condition below,
                matches "UTF-8//TRANSLIT" according to c_strcasecmp.
        Returns 0 on success.  If cd1 or cd2 is needed but cannot be opened,
        closes the descriptors opened so far, returns -1 with errno as set by
        the failing iconv_open, and leaves *CDP unmodified.  */
  45
  46 int
  47 iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
  48 {
  49   iconv_t cd;
  50   iconv_t cd1;
  51   iconv_t cd2;
  52
       /* Direct conversion.  The result is not checked here; a value of
          (iconv_t)(-1) is simply stored in *CDP at the end.  */
  53   cd = iconv_open (to_codeset, from_codeset);
  54
       /* First half of the indirect conversion: FROM_CODESET -> UTF-8.  */
  55   if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
  56     cd1 = (iconv_t)(-1);
  57   else
  58     {
  59       cd1 = iconv_open ("UTF-8", from_codeset);
  60       if (cd1 == (iconv_t)(-1))
  61         {
               /* Release cd, but report the errno of the failed iconv_open,
                  not whatever iconv_close may set.  */
  62           int saved_errno = errno;
  63           if (cd != (iconv_t)(-1))
  64             iconv_close (cd);
  65           errno = saved_errno;
  66           return -1;
  67         }
  68     }
  69
       /* Second half of the indirect conversion: UTF-8 -> TO_CODESET.  */
  70   if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
  71 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
  72       && !defined __UCLIBC__) \
  73      || _LIBICONV_VERSION >= 0x0105 \
  74      || defined ICONV_SET_TRANSLITERATE
  75       || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
  76 # endif
  77      )
  78     cd2 = (iconv_t)(-1);
  79   else
  80     {
  81       cd2 = iconv_open (to_codeset, "UTF-8");
  82       if (cd2 == (iconv_t)(-1))
  83         {
               /* Release cd1 and cd, again keeping the errno of the failed
                  iconv_open.  */
  84           int saved_errno = errno;
  85           if (cd1 != (iconv_t)(-1))
  86             iconv_close (cd1);
  87           if (cd != (iconv_t)(-1))
  88             iconv_close (cd);
  89           errno = saved_errno;
  90           return -1;
  91         }
  92     }
  93
       /* Only now, after every needed descriptor was opened, write to *CDP.  */
  94   cdp->cd = cd;
  95   cdp->cd1 = cd1;
  96   cdp->cd2 = cd2;
  97   return 0;
  98 }
  99
  100 int
  101 iconveh_close (const iconveh_t *cd)
  102 {
        /* Close the descriptors stored in *CD in the order cd2, cd1, cd,
           skipping every member that is (iconv_t)(-1).
           Returns 0 if no iconv_close call fails.  As soon as one call fails,
           the members that were not yet handled are still closed (the results
           of those calls are ignored), errno is restored to the value set by
           the failing call, and -1 is returned.  */
  103   if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
  104     {
  105       /* Return -1, but preserve the errno from iconv_close.  */
  106       int saved_errno = errno;
  107       if (cd->cd1 != (iconv_t)(-1))
  108         iconv_close (cd->cd1);
  109       if (cd->cd != (iconv_t)(-1))
  110         iconv_close (cd->cd);
  111       errno = saved_errno;
  112       return -1;
  113     }
  114   if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
  115     {
  116       /* Return -1, but preserve the errno from iconv_close.  */
  117       int saved_errno = errno;
  118       if (cd->cd != (iconv_t)(-1))
  119         iconv_close (cd->cd);
  120       errno = saved_errno;
  121       return -1;
  122     }
        /* Last descriptor: nothing is left to clean up, so errno from
           iconv_close is passed through unchanged.  */
  123   if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
  124     return -1;
  125   return 0;
  126 }
 127
  128 /* iconv_carefully is like iconv, except that it stops as soon as it encounters
  129    a conversion error, and it returns in *INCREMENTED a boolean telling whether
  130    it has incremented the input pointers past the error location.  */
      /* Two variants exist.  When the preprocessor condition below holds, the
         function is compiled; otherwise iconv_carefully is a macro that sets
         *INCREMENTED to false and calls iconv on the whole input.

         Contract of the function variant:
           - *INBUF and *INBYTESLEFT are always updated to the position
             reached in the input.
           - *OUTBUF and *OUTBYTESLEFT are updated only after a step whose
             result is 0; the output of a failing or lossy step is not
             committed to the caller.
           - Returns 0 on success.  Returns (size_t)(-1) with errno set by
             iconv on failure, or with errno = EILSEQ if iconv reported a
             positive count (res > 0).
           - If *INBYTESLEFT is 0 on entry, iconv is never called and
             (size_t)(-1) is returned without errno being assigned here.
             NOTE(review): the call sites visible in this file only invoke it
             with a nonzero input size; confirm for other callers.  */
  131 # if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
  132      && !(defined __GLIBC__ && !defined __UCLIBC__)
  133 /* Irix iconv() inserts a NUL byte if it cannot convert.
  134    NetBSD iconv() inserts a question mark if it cannot convert.
  135    Only GNU libiconv (excluding the bastard Apple iconv) and GNU libc are
  136    known to prefer to fail rather than doing a lossy conversion.  */
  137 static size_t
  138 iconv_carefully (iconv_t cd,
  139                  const char **inbuf, size_t *inbytesleft,
  140                  char **outbuf, size_t *outbytesleft,
  141                  bool *incremented)
  142 {
  143   const char *inptr = *inbuf;
  144   const char *inptr_end = inptr + *inbytesleft;
  145   char *outptr = *outbuf;
  146   size_t outsize = *outbytesleft;
  147   const char *inptr_before;
  148   size_t res;
  149
        /* Each iteration of the outer loop performs one conversion step,
           starting at INPTR.  */
  150   do
  151     {
  152       size_t insize;
  153
  154       inptr_before = inptr;
  155       res = (size_t)(-1);
  156
            /* Offer iconv a slice of INSIZE bytes, starting with 1 byte.  As
               long as iconv fails with EINVAL without consuming anything, the
               slice is retried one byte longer, until the end of the input.
               (Assumes iconv leaves INSIZE untouched when it consumes
               nothing.)  */
  157       for (insize = 1; inptr + insize <= inptr_end; insize++)
  158         {
  159           res = iconv (cd,
  160                        (ICONV_CONST char **) &inptr, &insize,
  161                        &outptr, &outsize);
  162           if (!(res == (size_t)(-1) && errno == EINVAL))
  163             break;
  164           /* iconv can eat up a shift sequence but give EINVAL while attempting
  165              to convert the first character.  E.g. libiconv does this.  */
  166           if (inptr > inptr_before)
  167             {
  168               res = 0;
  169               break;
  170             }
  171         }
  172
            /* Commit the output only for a clean step.  */
  173       if (res == 0)
  174         {
  175           *outbuf = outptr;
  176           *outbytesleft = outsize;
  177         }
  178     }
  179   while (res == 0 && inptr < inptr_end);
  180
  181   *inbuf = inptr;
  182   *inbytesleft = inptr_end - inptr;
        /* A positive result is turned into an EILSEQ failure here.  */
  183   if (res != (size_t)(-1) && res > 0)
  184     {
  185       /* iconv() has already incremented INPTR.  We cannot go back to a
  186          previous INPTR, otherwise the state inside CD would become invalid,
  187          if FROM_CODESET is a stateful encoding.  So, tell the caller that
  188          *INBUF has already been incremented.  */
  189       *incremented = (inptr > inptr_before);
  190       errno = EILSEQ;
  191       return (size_t)(-1);
  192     }
  193   else
  194     {
            /* Either success (0) or a failure reported by iconv itself, with
               errno as iconv left it.  */
  195       *incremented = false;
  196       return res;
  197     }
  198 }
  199 # else
      /* Macro variant: a single plain iconv call; *INCREMENTED is always set
         to false.  */
  200 #  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
  201      (*(incremented) = false, \
  202       iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
  203 # endif
 204
  205 /* iconv_carefully_1 is like iconv_carefully, except that it stops after
  206    converting one character or one shift sequence.  */
      /* Contract:
           - *INBUF and *INBYTESLEFT are always updated to the position
             reached in the input.
           - *OUTBUF and *OUTBYTESLEFT are updated whenever the final result
             is not (size_t)(-1).
           - On the platforms selected by the preprocessor condition inside
             the function, a positive iconv result is turned into a failure
             with errno = EILSEQ, *INCREMENTED tells whether the input pointer
             moved, and the output is not committed.  On the other platforms
             a positive result is returned unchanged.
           - In all other cases *INCREMENTED is set to false and the result
             of the last step is returned: 0, or (size_t)(-1) with errno as
             set by iconv.
           - If *INBYTESLEFT is 0 on entry, iconv is never called and
             (size_t)(-1) is returned without errno being assigned here.
             NOTE(review): the call sites visible in this file only invoke it
             with a nonzero input size; confirm for other callers.  */
  207 static size_t
  208 iconv_carefully_1 (iconv_t cd,
  209                    const char **inbuf, size_t *inbytesleft,
  210                    char **outbuf, size_t *outbytesleft,
  211                    bool *incremented)
  212 {
  213   const char *inptr_before = *inbuf;
  214   const char *inptr = inptr_before;
  215   const char *inptr_end = inptr_before + *inbytesleft;
  216   char *outptr = *outbuf;
  217   size_t outsize = *outbytesleft;
  218   size_t res = (size_t)(-1);
  219   size_t insize;
  220
        /* Offer iconv a slice of INSIZE bytes from the start position,
           beginning with 1 byte.  As long as iconv fails with EINVAL without
           consuming anything, retry with a slice one byte longer, until the
           end of the input.  (Assumes iconv leaves INSIZE untouched when it
           consumes nothing.)  */
  221   for (insize = 1; inptr_before + insize <= inptr_end; insize++)
  222     {
  223       inptr = inptr_before;
  224       res = iconv (cd,
  225                    (ICONV_CONST char **) &inptr, &insize,
  226                    &outptr, &outsize);
  227       if (!(res == (size_t)(-1) && errno == EINVAL))
  228         break;
  229       /* iconv can eat up a shift sequence but give EINVAL while attempting
  230          to convert the first character.  E.g. libiconv does this.  */
  231       if (inptr > inptr_before)
  232         {
  233           res = 0;
  234           break;
  235         }
  236     }
  237
  238   *inbuf = inptr;
  239   *inbytesleft = inptr_end - inptr;
  240 # if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
  241      && !(defined __GLIBC__ && !defined __UCLIBC__)
  242   /* Irix iconv() inserts a NUL byte if it cannot convert.
  243      NetBSD iconv() inserts a question mark if it cannot convert.
  244      Only GNU libiconv (excluding the bastard Apple iconv) and GNU libc are
  245      known to prefer to fail rather than doing a lossy conversion.  */
  246   if (res != (size_t)(-1) && res > 0)
  247     {
  248       /* iconv() has already incremented INPTR.  We cannot go back to a
  249          previous INPTR, otherwise the state inside CD would become invalid,
  250          if FROM_CODESET is a stateful encoding.  So, tell the caller that
  251          *INBUF has already been incremented.  */
  252       *incremented = (inptr > inptr_before);
  253       errno = EILSEQ;
  254       return (size_t)(-1);
  255     }
  256 # endif
  257
        /* Commit the output unless the step failed.  */
  258   if (res != (size_t)(-1))
  259     {
  260       *outbuf = outptr;
  261       *outbytesleft = outsize;
  262     }
  263   *incremented = false;
  264   return res;
  265 }
 266
  267 /* utf8conv_carefully is like iconv, except that
  268      - it converts from UTF-8 to UTF-8,
  269      - it stops as soon as it encounters a conversion error, and it returns
  270        in *INCREMENTED a boolean telling whether it has incremented the input
  271        pointers past the error location,
  272      - if one_character_only is true, it stops after converting one
  273        character.  */
      /* Further details:
           - Returns 0 on success, or (size_t)(-1) with errno set to
             EINVAL or EILSEQ (input cannot be decoded), E2BIG (no room in
             the output) or EILSEQ (u8_uctomb rejected the character).
           - *INBUF, *INBYTESLEFT, *OUTBUF and *OUTBYTESLEFT are written back
             on every path, including the error paths.
           - *INCREMENTED is assigned only on the error paths.
           - The loop body runs at least once, even if *INBYTESLEFT is 0.
             NOTE(review): the call sites visible in this file only invoke it
             with a nonzero input size; confirm for other callers.  */
  274 static size_t
  275 utf8conv_carefully (bool one_character_only,
  276                     const char **inbuf, size_t *inbytesleft,
  277                     char **outbuf, size_t *outbytesleft,
  278                     bool *incremented)
  279 {
  280   const char *inptr = *inbuf;
  281   size_t insize = *inbytesleft;
  282   char *outptr = *outbuf;
  283   size_t outsize = *outbytesleft;
  284   size_t res;
  285
  286   res = 0;
  287   do
  288     {
  289       ucs4_t uc;
  290       int n;
  291       int m;
  292
            /* Decode one character from the input.  */
  293       n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
  294       if (n < 0)
  295         {
                /* -2 is reported as EINVAL, any other negative value as
                   EILSEQ.  u8_mbtouc is then called to obtain the number of
                   bytes to skip, so that the input pointer ends up past the
                   offending bytes.  (Presumably u8_mbtouc does not modify
                   errno; confirm.)  */
  296           errno = (n == -2 ? EINVAL : EILSEQ);
  297           n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
  298           inptr += n;
  299           insize -= n;
  300           res = (size_t)(-1);
  301           *incremented = true;
  302           break;
  303         }
            /* No room at all in the output: stop before consuming the
               character.  */
  304       if (outsize == 0)
  305         {
  306           errno = E2BIG;
  307           res = (size_t)(-1);
  308           *incremented = false;
  309           break;
  310         }
            /* Encode the character into the output.  */
  311       m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
  312       if (m == -2)
  313         {
                /* Reported as E2BIG; the character is not consumed.  */
  314           errno = E2BIG;
  315           res = (size_t)(-1);
  316           *incremented = false;
  317           break;
  318         }
            /* From here on the input character counts as consumed, even if
               u8_uctomb returned -1.  */
  319       inptr += n;
  320       insize -= n;
  321       if (m == -1)
  322         {
  323           errno = EILSEQ;
  324           res = (size_t)(-1);
  325           *incremented = true;
  326           break;
  327         }
  328       outptr += m;
  329       outsize -= m;
  330     }
  331   while (!one_character_only && insize > 0);
  332
  333   *inbuf = inptr;
  334   *inbytesleft = insize;
  335   *outbuf = outptr;
  336   *outbytesleft = outsize;
  337   return res;
  338 }
 339
 340 static int
 341 mem_cd_iconveh_internal (const char *src, size_t srclen,
 342                          iconv_t cd, iconv_t cd1, iconv_t cd2,
 343                          enum iconv_ilseq_handler handler,
 344                          size_t extra_alloc,
 345                          size_t *offsets,
 346                          char **resultp, size_t *lengthp)
 347 {
 348   /* When a conversion error occurs, we cannot start using CD1 and CD2 at
 349      this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
 350      Instead, we have to start afresh from the beginning of SRC.  */
 351   /* Use a temporary buffer, so that for small strings, a single malloc()
 352      call will be sufficient.  */
 353 # define tmpbufsize 4096
 354   /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
 355      libiconv's UCS-4-INTERNAL encoding.  */
 356   union { unsigned int align; char buf[tmpbufsize]; } tmp;
 357 # define tmpbuf tmp.buf
 358
 359   char *initial_result;
 360   char *result;
 361   size_t allocated;
 362   size_t length;
 363   size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
 364
 365   if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
 366     {
 367       initial_result = *resultp;
 368       allocated = *lengthp;
 369     }
 370   else
 371     {
 372       initial_result = tmpbuf;
 373       allocated = sizeof (tmpbuf);
 374     }
 375   result = initial_result;
 376
 377   /* Test whether a direct conversion is possible at all.  */
 378   if (cd == (iconv_t)(-1))
 379     goto indirectly;
 380
 381   if (offsets != NULL)
 382     {
 383       size_t i;
 384
 385       for (i = 0; i < srclen; i++)
 386         offsets[i] = (size_t)(-1);
 387
 388       last_length = (size_t)(-1);
 389     }
 390   length = 0;
 391
 392   /* First, try a direct conversion, and see whether a conversion error
 393      occurs at all.  */
 394   {
 395     const char *inptr = src;
 396     size_t insize = srclen;
 397
 398     /* Set to the initial state.  */
 399     iconv (cd, NULL, NULL, NULL, NULL);
 400
 401     while (insize > 0)
 402       {
 403         char *outptr = result + length;
 404         size_t outsize = allocated - extra_alloc - length;
 405         bool incremented;
 406         size_t res;
 407         bool grow;
 408
 409         if (offsets != NULL)
 410           {
 411             if (length != last_length) /* ensure that offset[] be increasing */
 412               {
 413                 offsets[inptr - src] = length;
 414                 last_length = length;
 415               }
 416             res = iconv_carefully_1 (cd,
 417                                      &inptr, &insize,
 418                                      &outptr, &outsize,
 419                                      &incremented);
 420           }
 421         else
 422           /* Use iconv_carefully instead of iconv here, because:
 423              - If TO_CODESET is UTF-8, we can do the error handling in this
 424                loop, no need for a second loop,
 425              - With iconv() implementations other than GNU libiconv and GNU
 426                libc, if we use iconv() in a big swoop, checking for an E2BIG
 427                return, we lose the number of irreversible conversions.  */
 428           res = iconv_carefully (cd,
 429                                  &inptr, &insize,
 430                                  &outptr, &outsize,
 431                                  &incremented);
 432
 433         length = outptr - result;
 434         grow = (length + extra_alloc > allocated / 2);
 435         if (res == (size_t)(-1))
 436           {
 437             if (errno == E2BIG)
 438               grow = true;
 439             else if (errno == EINVAL)
 440               break;
 441             else if (errno == EILSEQ && handler != iconveh_error)
 442               {
 443                 if (cd2 == (iconv_t)(-1))
 444                   {
 445                     /* TO_CODESET is UTF-8.  */
 446                     /* Error handling can produce up to 1 or 3 bytes of
 447                        output.  */
 448                     size_t extra_need =
 449                       (handler == iconveh_replacement_character ? 3 : 1);
 450                     if (length + extra_need + extra_alloc > allocated)
 451                       {
 452                         char *memory;
 453
 454                         allocated = 2 * allocated;
 455                         if (length + extra_need + extra_alloc > allocated)
 456                           allocated = 2 * allocated;
 457                         if (length + extra_need + extra_alloc > allocated)
 458                           abort ();
 459                         if (result == initial_result)
 460                           memory = (char *) malloc (allocated);
 461                         else
 462                           memory = (char *) realloc (result, allocated);
 463                         if (memory == NULL)
 464                           {
 465                             if (result != initial_result)
 466                               free (result);
 467                             errno = ENOMEM;
 468                             return -1;
 469                           }
 470                         if (result == initial_result)
 471                           memcpy (memory, initial_result, length);
 472                         result = memory;
 473                         grow = false;
 474                       }
 475                     /* The input is invalid in FROM_CODESET.  Eat up one byte
 476                        and emit a replacement character or a question mark.  */
 477                     if (!incremented)
 478                       {
 479                         if (insize == 0)
 480                           abort ();
 481                         inptr++;
 482                         insize--;
 483                       }
 484                     if (handler == iconveh_replacement_character)
 485                       {
 486                         /* U+FFFD in UTF-8 encoding.  */
 487                         result[length+0] = '\357';
 488                         result[length+1] = '\277';
 489                         result[length+2] = '\275';
 490                         length += 3;
 491                       }
 492                     else
 493                       {
 494                         result[length] = '?';
 495                         length++;
 496                       }
 497                   }
 498                 else
 499                   goto indirectly;
 500               }
 501             else
 502               {
 503                 if (result != initial_result)
 504                   free (result);
 505                 return -1;
 506               }
 507           }
 508         if (insize == 0)
 509           break;
 510         if (grow)
 511           {
 512             char *memory;
 513
 514             allocated = 2 * allocated;
 515             if (result == initial_result)
 516               memory = (char *) malloc (allocated);
 517             else
 518               memory = (char *) realloc (result, allocated);
 519             if (memory == NULL)
 520               {
 521                 if (result != initial_result)
 522                   free (result);
 523                 errno = ENOMEM;
 524                 return -1;
 525               }
 526             if (result == initial_result)
 527               memcpy (memory, initial_result, length);
 528             result = memory;
 529           }
 530       }
 531   }
 532
 533   /* Now get the conversion state back to the initial state.
 534      But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 535 #if defined _LIBICONV_VERSION \
 536     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 537          || defined __sun)
 538   for (;;)
 539     {
 540       char *outptr = result + length;
 541       size_t outsize = allocated - extra_alloc - length;
 542       size_t res;
 543
 544       res = iconv (cd, NULL, NULL, &outptr, &outsize);
 545       length = outptr - result;
 546       if (res == (size_t)(-1))
 547         {
 548           if (errno == E2BIG)
 549             {
 550               char *memory;
 551
 552               allocated = 2 * allocated;
 553               if (result == initial_result)
 554                 memory = (char *) malloc (allocated);
 555               else
 556                 memory = (char *) realloc (result, allocated);
 557               if (memory == NULL)
 558                 {
 559                   if (result != initial_result)
 560                     free (result);
 561                   errno = ENOMEM;
 562                   return -1;
 563                 }
 564               if (result == initial_result)
 565                 memcpy (memory, initial_result, length);
 566               result = memory;
 567             }
 568           else
 569             {
 570               if (result != initial_result)
 571                 free (result);
 572               return -1;
 573             }
 574         }
 575       else
 576         break;
 577     }
 578 #endif
 579
 580   /* The direct conversion succeeded.  */
 581   goto done;
 582
 583  indirectly:
 584   /* The direct conversion failed.
 585      Use a conversion through UTF-8.  */
 586   if (offsets != NULL)
 587     {
 588       size_t i;
 589
 590       for (i = 0; i < srclen; i++)
 591         offsets[i] = (size_t)(-1);
 592
 593       last_length = (size_t)(-1);
 594     }
 595   length = 0;
 596   {
 597     const bool slowly = (offsets != NULL || handler == iconveh_error);
 598 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
 599     char utf8buf[utf8bufsize + 3];
 600     size_t utf8len = 0;
 601     const char *in1ptr = src;
 602     size_t in1size = srclen;
 603     bool do_final_flush1 = true;
 604     bool do_final_flush2 = true;
 605
 606     /* Set to the initial state.  */
 607     if (cd1 != (iconv_t)(-1))
 608       iconv (cd1, NULL, NULL, NULL, NULL);
 609     if (cd2 != (iconv_t)(-1))
 610       iconv (cd2, NULL, NULL, NULL, NULL);
 611
 612     while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
 613       {
 614         char *out1ptr = utf8buf + utf8len;
 615         size_t out1size = utf8bufsize - utf8len;
 616         bool incremented1;
 617         size_t res1;
 618         int errno1;
 619
 620         /* Conversion step 1: from FROM_CODESET to UTF-8.  */
 621         if (in1size > 0)
 622           {
 623             if (offsets != NULL
 624                 && length != last_length) /* ensure that offset[] be increasing */
 625               {
 626                 offsets[in1ptr - src] = length;
 627                 last_length = length;
 628               }
 629             if (cd1 != (iconv_t)(-1))
 630               {
 631                 if (slowly)
 632                   res1 = iconv_carefully_1 (cd1,
 633                                             &in1ptr, &in1size,
 634                                             &out1ptr, &out1size,
 635                                             &incremented1);
 636                 else
 637                   res1 = iconv_carefully (cd1,
 638                                           &in1ptr, &in1size,
 639                                           &out1ptr, &out1size,
 640                                           &incremented1);
 641               }
 642             else
 643               {
 644                 /* FROM_CODESET is UTF-8.  */
 645                 res1 = utf8conv_carefully (slowly,
 646                                            &in1ptr, &in1size,
 647                                            &out1ptr, &out1size,
 648                                            &incremented1);
 649               }
 650           }
 651         else if (do_final_flush1)
 652           {
 653             /* Now get the conversion state of CD1 back to the initial state.
 654                But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 655 # if defined _LIBICONV_VERSION \
 656      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 657           || defined __sun)
 658             if (cd1 != (iconv_t)(-1))
 659               res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
 660             else
 661 # endif
 662               res1 = 0;
 663             do_final_flush1 = false;
 664             incremented1 = true;
 665           }
 666         else
 667           {
 668             res1 = 0;
 669             incremented1 = true;
 670           }
 671         if (res1 == (size_t)(-1)
 672             && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
 673           {
 674             if (result != initial_result)
 675               free (result);
 676             return -1;
 677           }
 678         if (res1 == (size_t)(-1)
 679             && errno == EILSEQ && handler != iconveh_error)
 680           {
 681             /* The input is invalid in FROM_CODESET.  Eat up one byte and
 682                emit a U+FFFD character or a question mark.  Room for this
 683                character was allocated at the end of utf8buf.  */
 684             if (!incremented1)
 685               {
 686                 if (in1size == 0)
 687                   abort ();
 688                 in1ptr++;
 689                 in1size--;
 690               }
 691             if (handler == iconveh_replacement_character)
 692               {
 693                 /* U+FFFD in UTF-8 encoding.  */
 694                 out1ptr[0] = '\357';
 695                 out1ptr[1] = '\277';
 696                 out1ptr[2] = '\275';
 697                 out1ptr += 3;
 698               }
 699             else
 700               *out1ptr++ = '?';
 701             res1 = 0;
 702           }
 703         errno1 = errno;
 704         utf8len = out1ptr - utf8buf;
 705
 706         if (offsets != NULL
 707             || in1size == 0
 708             || utf8len > utf8bufsize / 2
 709             || (res1 == (size_t)(-1) && errno1 == E2BIG))
 710           {
 711             /* Conversion step 2: from UTF-8 to TO_CODESET.  */
 712             const char *in2ptr = utf8buf;
 713             size_t in2size = utf8len;
 714
 715             while (in2size > 0
 716                    || (in1size == 0 && !do_final_flush1 && do_final_flush2))
 717               {
 718                 char *out2ptr = result + length;
 719                 size_t out2size = allocated - extra_alloc - length;
 720                 bool incremented2;
 721                 size_t res2;
 722                 bool grow;
 723
 724                 if (in2size > 0)
 725                   {
 726                     if (cd2 != (iconv_t)(-1))
 727                       res2 = iconv_carefully (cd2,
 728                                               &in2ptr, &in2size,
 729                                               &out2ptr, &out2size,
 730                                               &incremented2);
 731                     else
 732                       /* TO_CODESET is UTF-8.  */
 733                       res2 = utf8conv_carefully (false,
 734                                                  &in2ptr, &in2size,
 735                                                  &out2ptr, &out2size,
 736                                                  &incremented2);
 737                   }
 738                 else /* in1size == 0 && !do_final_flush1
 739                         && in2size == 0 && do_final_flush2 */
 740                   {
  741                     /* Now get the conversion state of CD2 back to the initial
  742                        state.  But avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 743 # if defined _LIBICONV_VERSION \
 744      || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
 745           || defined __sun)
 746                     if (cd2 != (iconv_t)(-1))
 747                       res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
 748                     else
 749 # endif
 750                       res2 = 0;
 751                     do_final_flush2 = false;
 752                     incremented2 = true;
 753                   }
 754
 755                 length = out2ptr - result;
 756                 grow = (length + extra_alloc > allocated / 2);
 757                 if (res2 == (size_t)(-1))
 758                   {
 759                     if (errno == E2BIG)
 760                       grow = true;
 761                     else if (errno == EINVAL)
 762                       break;
 763                     else if (errno == EILSEQ && handler != iconveh_error)
 764                       {
 765                         /* Error handling can produce up to 10 bytes of UTF-8
 766                            output.  But TO_CODESET may be UCS-2, UTF-16 or
 767                            UCS-4, so use CD2 here as well.  */
 768                         char scratchbuf[10];
 769                         size_t scratchlen;
 770                         ucs4_t uc;
 771                         const char *inptr;
 772                         size_t insize;
 773                         size_t res;
 774
 775                         if (incremented2)
 776                           {
 777                             if (u8_prev (&uc, (const uint8_t *) in2ptr,
 778                                          (const uint8_t *) utf8buf)
 779                                 == NULL)
 780                               abort ();
 781                           }
 782                         else
 783                           {
 784                             int n;
 785                             if (in2size == 0)
 786                               abort ();
 787                             n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
 788                                                   in2size);
 789                             in2ptr += n;
 790                             in2size -= n;
 791                           }
 792
 793                         if (handler == iconveh_escape_sequence)
 794                           {
 795                             static char const hex[16] = "0123456789ABCDEF";
 796                             scratchlen = 0;
 797                             scratchbuf[scratchlen++] = '\\';
 798                             if (uc < 0x10000)
 799                               scratchbuf[scratchlen++] = 'u';
 800                             else
 801                               {
 802                                 scratchbuf[scratchlen++] = 'U';
 803                                 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
 804                                 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
 805                                 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
 806                                 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
 807                               }
 808                             scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
 809                             scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
 810                             scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
 811                             scratchbuf[scratchlen++] = hex[uc & 15];
 812                           }
 813                         else if (handler == iconveh_replacement_character)
 814                           {
 815                             /* U+FFFD in UTF-8 encoding.  */
 816                             scratchbuf[0] = '\357';
 817                             scratchbuf[1] = '\277';
 818                             scratchbuf[2] = '\275';
 819                             scratchlen = 3;
 820                           }
 821                         else
 822                           {
 823                             scratchbuf[0] = '?';
 824                             scratchlen = 1;
 825                           }
 826
 827                         inptr = scratchbuf;
 828                         insize = scratchlen;
 829                         if (cd2 != (iconv_t)(-1))
 830                           {
 831                             char *out2ptr_try = out2ptr;
 832                             size_t out2size_try = out2size;
 833                             res = iconv (cd2,
 834                                          (ICONV_CONST char **) &inptr, &insize,
 835                                          &out2ptr_try, &out2size_try);
 836                             if (handler == iconveh_replacement_character
 837                                 && (res == (size_t)(-1)
 838                                     ? errno == EILSEQ
 839                                     /* FreeBSD iconv(), NetBSD iconv(), and
 840                                        Solaris 11 iconv() insert a '?' if they
 841                                        cannot convert.  This is what we want.
 842                                        But IRIX iconv() inserts a NUL byte if it
 843                                        cannot convert.
 844                                        And musl libc iconv() inserts a '*' if it
 845                                        cannot convert.  */
 846                                     : (res > 0
 847                                        && !(out2ptr_try - out2ptr == 1
 848                                             && *out2ptr == '?'))))
 849                               {
 850                                 /* The iconv() call failed.
 851                                    U+FFFD can't be converted to TO_CODESET.
 852                                    Use '?' instead.  */
 853                                 scratchbuf[0] = '?';
 854                                 scratchlen = 1;
 855                                 inptr = scratchbuf;
 856                                 insize = scratchlen;
 857                                 res = iconv (cd2,
 858                                              (ICONV_CONST char **) &inptr, &insize,
 859                                              &out2ptr, &out2size);
 860                               }
 861                             else
 862                               {
 863                                 /* Accept the results of the iconv() call.  */
 864                                 out2ptr = out2ptr_try;
 865                                 out2size = out2size_try;
 866                                 res = 0;
 867                               }
 868                           }
 869                         else
 870                           {
 871                             /* TO_CODESET is UTF-8.  */
 872                             if (out2size >= insize)
 873                               {
 874                                 memcpy (out2ptr, inptr, insize);
 875                                 out2ptr += insize;
 876                                 out2size -= insize;
 877                                 inptr += insize;
 878                                 insize = 0;
 879                                 res = 0;
 880                               }
 881                             else
 882                               {
 883                                 errno = E2BIG;
 884                                 res = (size_t)(-1);
 885                               }
 886                           }
 887                         length = out2ptr - result;
 888                         if (res == (size_t)(-1) && errno == E2BIG)
 889                           {
 890                             char *memory;
 891
 892                             allocated = 2 * allocated;
 893                             if (length + 1 + extra_alloc > allocated)
 894                               abort ();
 895                             if (result == initial_result)
 896                               memory = (char *) malloc (allocated);
 897                             else
 898                               memory = (char *) realloc (result, allocated);
 899                             if (memory == NULL)
 900                               {
 901                                 if (result != initial_result)
 902                                   free (result);
 903                                 errno = ENOMEM;
 904                                 return -1;
 905                               }
 906                             if (result == initial_result)
 907                               memcpy (memory, initial_result, length);
 908                             result = memory;
 909                             grow = false;
 910
 911                             out2ptr = result + length;
 912                             out2size = allocated - extra_alloc - length;
 913                             if (cd2 != (iconv_t)(-1))
 914                               res = iconv (cd2,
 915                                            (ICONV_CONST char **) &inptr,
 916                                            &insize,
 917                                            &out2ptr, &out2size);
 918                             else
 919                               {
 920                                 /* TO_CODESET is UTF-8.  */
 921                                 if (!(out2size >= insize))
 922                                   abort ();
 923                                 memcpy (out2ptr, inptr, insize);
 924                                 out2ptr += insize;
 925                                 out2size -= insize;
 926                                 inptr += insize;
 927                                 insize = 0;
 928                                 res = 0;
 929                               }
 930                             length = out2ptr - result;
 931                           }
 932 # if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
 933      && !(defined __GLIBC__ && !defined __UCLIBC__)
 934                         /* IRIX iconv() inserts a NUL byte if it cannot convert.
 935                            FreeBSD iconv(), NetBSD iconv(), and Solaris 11
 936                            iconv() insert a '?' if they cannot convert.
 937                            musl libc iconv() inserts a '*' if it cannot convert.
 938                            Only GNU libiconv (excluding the bastard Apple iconv)
 939                            and GNU libc are known to prefer to fail rather than
 940                            doing a lossy conversion.  */
 941                         if (res != (size_t)(-1) && res > 0)
 942                           {
 943                             errno = EILSEQ;
 944                             res = (size_t)(-1);
 945                           }
 946 # endif
 947                         if (res == (size_t)(-1))
 948                           {
 949                             /* Failure converting the ASCII replacement.  */
 950                             if (result != initial_result)
 951                               free (result);
 952                             return -1;
 953                           }
 954                       }
 955                     else
 956                       {
 957                         if (result != initial_result)
 958                           free (result);
 959                         return -1;
 960                       }
 961                   }
 962                 if (!(in2size > 0
 963                       || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
 964                   break;
 965                 if (grow)
 966                   {
 967                     char *memory;
 968
 969                     allocated = 2 * allocated;
 970                     if (result == initial_result)
 971                       memory = (char *) malloc (allocated);
 972                     else
 973                       memory = (char *) realloc (result, allocated);
 974                     if (memory == NULL)
 975                       {
 976                         if (result != initial_result)
 977                           free (result);
 978                         errno = ENOMEM;
 979                         return -1;
 980                       }
 981                     if (result == initial_result)
 982                       memcpy (memory, initial_result, length);
 983                     result = memory;
 984                   }
 985               }
 986
 987             /* Move the remaining bytes to the beginning of utf8buf.  */
 988             if (in2size > 0)
 989               memmove (utf8buf, in2ptr, in2size);
 990             utf8len = in2size;
 991           }
 992
 993         if (res1 == (size_t)(-1))
 994           {
 995             if (errno1 == EINVAL)
 996               in1size = 0;
 997             else if (errno1 == EILSEQ)
 998               {
 999                 if (result != initial_result)
1000                   free (result);
1001                 errno = errno1;
1002                 return -1;
1003               }
1004           }
1005       }
1006 # undef utf8bufsize
1007   }
1008
1009  done:
1010   /* Now the final memory allocation.  */
1011   if (result == tmpbuf)
1012     {
1013       size_t memsize = length + extra_alloc;
1014
1015       if (*resultp != NULL && *lengthp >= memsize)
1016         result = *resultp;
1017       else
1018         {
1019           char *memory;
1020
1021           memory = (char *) malloc (memsize > 0 ? memsize : 1);
1022           if (memory != NULL)
1023             result = memory;
1024           else
1025             {
1026               errno = ENOMEM;
1027               return -1;
1028             }
1029         }
1030       memcpy (result, tmpbuf, length);
1031     }
1032   else if (result != *resultp && length + extra_alloc < allocated)
1033     {
1034       /* Shrink the allocated memory if possible.  */
1035       size_t memsize = length + extra_alloc;
1036       char *memory;
1037
1038       memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
1039       if (memory != NULL)
1040         result = memory;
1041     }
1042   *resultp = result;
1043   *lengthp = length;
1044   return 0;
1045 # undef tmpbuf
1046 # undef tmpbufsize
1047 }
1048
1049 int
1050 mem_cd_iconveh (const char *src, size_t srclen,
1051                 const iconveh_t *cd,
1052                 enum iconv_ilseq_handler handler,
1053                 size_t *offsets,
1054                 char **resultp, size_t *lengthp)
1055 {
1056   return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1057                                   handler, 0, offsets, resultp, lengthp);
1058 }
1059
1060 char *
1061 str_cd_iconveh (const char *src,
1062                 const iconveh_t *cd,
1063                 enum iconv_ilseq_handler handler)
1064 {
1065   /* For most encodings, a trailing NUL byte in the input will be converted
1066      to a trailing NUL byte in the output.  But not for UTF-7.  So that this
1067      function is usable for UTF-7, we have to exclude the NUL byte from the
1068      conversion and add it by hand afterwards.  */
1069   char *result = NULL;
1070   size_t length = 0;
1071   int retval = mem_cd_iconveh_internal (src, strlen (src),
1072                                         cd->cd, cd->cd1, cd->cd2, handler, 1,
1073                                         NULL, &result, &length);
1074
1075   if (retval < 0)
1076     {
1077       free (result);
1078       return NULL;
1079     }
1080
1081   /* Add the terminating NUL byte.  */
1082   result[length] = '\0';
1083
1084   return result;
1085 }
1086
1087 #endif
1088
1089 int
1090 mem_iconveh (const char *src, size_t srclen,
1091              const char *from_codeset, const char *to_codeset,
1092              enum iconv_ilseq_handler handler,
1093              size_t *offsets,
1094              char **resultp, size_t *lengthp)
1095 {
1096   if (srclen == 0)
1097     {
1098       /* Nothing to convert.  */
1099       *lengthp = 0;
1100       return 0;
1101     }
1102   else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1103     {
1104       char *result;
1105
1106       if (*resultp != NULL && *lengthp >= srclen)
1107         result = *resultp;
1108       else
1109         {
1110           result = (char *) malloc (srclen);
1111           if (result == NULL)
1112             {
1113               errno = ENOMEM;
1114               return -1;
1115             }
1116         }
1117       memcpy (result, src, srclen);
1118       *resultp = result;
1119       *lengthp = srclen;
1120       return 0;
1121     }
1122   else
1123     {
1124 #if HAVE_ICONV
1125       iconveh_t cd;
1126       char *result;
1127       size_t length;
1128       int retval;
1129
1130       if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1131         return -1;
1132
1133       result = *resultp;
1134       length = *lengthp;
1135       retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1136                                &result, &length);
1137
1138       if (retval < 0)
1139         {
1140           /* Close cd, but preserve the errno from str_cd_iconv.  */
1141           int saved_errno = errno;
1142           iconveh_close (&cd);
1143           errno = saved_errno;
1144         }
1145       else
1146         {
1147           if (iconveh_close (&cd) < 0)
1148             {
1149               if (result != *resultp)
1150                 free (result);
1151               return -1;
1152             }
1153           *resultp = result;
1154           *lengthp = length;
1155         }
1156       return retval;
1157 #else
1158       /* This is a different error code than if iconv_open existed but didn't
1159          support from_codeset and to_codeset, so that the caller can emit
1160          an error message such as
1161            "iconv() is not supported. Installing GNU libiconv and
1162             then reinstalling this package would fix this."  */
1163       errno = ENOSYS;
1164       return -1;
1165 #endif
1166     }
1167 }
1168
1169 char *
1170 str_iconveh (const char *src,
1171              const char *from_codeset, const char *to_codeset,
1172              enum iconv_ilseq_handler handler)
1173 {
1174   if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1175     {
1176       char *result = strdup (src);
1177
1178       if (result == NULL)
1179         errno = ENOMEM;
1180       return result;
1181     }
1182   else
1183     {
1184 #if HAVE_ICONV
1185       iconveh_t cd;
1186       char *result;
1187
1188       if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1189         return NULL;
1190
1191       result = str_cd_iconveh (src, &cd, handler);
1192
1193       if (result == NULL)
1194         {
1195           /* Close cd, but preserve the errno from str_cd_iconv.  */
1196           int saved_errno = errno;
1197           iconveh_close (&cd);
1198           errno = saved_errno;
1199         }
1200       else
1201         {
1202           if (iconveh_close (&cd) < 0)
1203             {
1204               free (result);
1205               return NULL;
1206             }
1207         }
1208       return result;
1209 #else
1210       /* This is a different error code than if iconv_open existed but didn't
1211          support from_codeset and to_codeset, so that the caller can emit
1212          an error message such as
1213            "iconv() is not supported. Installing GNU libiconv and
1214             then reinstalling this package would fix this."  */
1215       errno = ENOSYS;
1216       return NULL;
1217 #endif
1218     }
1219 }