openat: don’t close (-1)
[gnulib.git] / lib / striconveh.c
blob952754ba5ca55a6539bf03c3c7bd25bf7dc54389
1 /* Character set conversion with error handling.
2 Copyright (C) 2001-2024 Free Software Foundation, Inc.
3 Written by Bruno Haible and Simon Josefsson.
5 This file is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation; either version 2.1 of the
8 License, or (at your option) any later version.
10 This file is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18 #include <config.h>
20 /* Specification. */
21 #include "striconveh.h"
23 #include <errno.h>
24 #include <stdlib.h>
25 #include <string.h>
27 #if HAVE_ICONV
28 # include <iconv.h>
29 # include "unistr.h"
30 #endif
32 #include "c-strcase.h"
33 #include "c-strcaseeq.h"
35 #ifndef SIZE_MAX
36 # define SIZE_MAX ((size_t) -1)
37 #endif
40 #if HAVE_ICONV
42 /* The caller must provide an iconveh_t, not just an iconv_t, because when a
43 conversion error occurs, we may have to determine the Unicode representation
44 of the inconvertible character. */
46 int
47 iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
49 iconv_t cd;
50 iconv_t cd1;
51 iconv_t cd2;
53 cd = iconv_open (to_codeset, from_codeset);
55 if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
56 cd1 = (iconv_t)(-1);
57 else
59 cd1 = iconv_open ("UTF-8", from_codeset);
60 if (cd1 == (iconv_t)(-1))
62 int saved_errno = errno;
63 if (cd != (iconv_t)(-1))
64 iconv_close (cd);
65 errno = saved_errno;
66 return -1;
70 if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
71 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
72 && !defined __UCLIBC__) \
73 || _LIBICONV_VERSION >= 0x0105 \
74 || defined ICONV_SET_TRANSLITERATE
75 || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
76 # endif
78 cd2 = (iconv_t)(-1);
79 else
81 cd2 = iconv_open (to_codeset, "UTF-8");
82 if (cd2 == (iconv_t)(-1))
84 int saved_errno = errno;
85 if (cd1 != (iconv_t)(-1))
86 iconv_close (cd1);
87 if (cd != (iconv_t)(-1))
88 iconv_close (cd);
89 errno = saved_errno;
90 return -1;
94 cdp->cd = cd;
95 cdp->cd1 = cd1;
96 cdp->cd2 = cd2;
97 return 0;
101 iconveh_close (const iconveh_t *cd)
103 if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
105 /* Return -1, but preserve the errno from iconv_close. */
106 int saved_errno = errno;
107 if (cd->cd1 != (iconv_t)(-1))
108 iconv_close (cd->cd1);
109 if (cd->cd != (iconv_t)(-1))
110 iconv_close (cd->cd);
111 errno = saved_errno;
112 return -1;
114 if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
116 /* Return -1, but preserve the errno from iconv_close. */
117 int saved_errno = errno;
118 if (cd->cd != (iconv_t)(-1))
119 iconv_close (cd->cd);
120 errno = saved_errno;
121 return -1;
123 if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
124 return -1;
125 return 0;
/* iconv_carefully works like iconv, with two differences: it stops as soon
   as it hits a conversion error, and it stores in *INCREMENTED a boolean
   telling whether the input pointers were already advanced past the
   location of the error.  */
# if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
     && !(defined __GLIBC__ && !defined __UCLIBC__)
/* Irix iconv() inserts a NUL byte if it cannot convert.
   NetBSD iconv() inserts a question mark if it cannot convert.
   Only GNU libiconv (excluding Apple's variant of it) and GNU libc are
   known to prefer to fail rather than doing a lossy conversion.
   Therefore, on other implementations, feed the input in the smallest
   pieces iconv accepts and treat a positive return value (a count of
   irreversible conversions) as an EILSEQ failure.  */
static size_t
iconv_carefully (iconv_t cd,
                 const char **inbuf, size_t *inbytesleft,
                 char **outbuf, size_t *outbytesleft,
                 bool *incremented)
{
  const char *cursor = *inbuf;
  const char *const limit = cursor + *inbytesleft;
  char *dst = *outbuf;
  size_t dst_left = *outbytesleft;
  const char *step_start;
  size_t rc;

  for (;;)
    {
      size_t chunk;

      step_start = cursor;
      rc = (size_t)(-1);

      /* Offer 1, 2, 3, ... bytes until iconv stops reporting an incomplete
         input sequence.  */
      for (chunk = 1; cursor + chunk <= limit; chunk++)
        {
          rc = iconv (cd,
                      (ICONV_CONST char **) &cursor, &chunk,
                      &dst, &dst_left);
          if (rc != (size_t)(-1) || errno != EINVAL)
            break;
          /* iconv can eat up a shift sequence but give EINVAL while
             attempting to convert the first character.  E.g. libiconv does
             this.  */
          if (cursor > step_start)
            {
              rc = 0;
              break;
            }
        }

      if (rc != 0)
        break;

      /* This step converted cleanly: publish the output progress.  */
      *outbuf = dst;
      *outbytesleft = dst_left;

      if (!(cursor < limit))
        break;
    }

  *inbuf = cursor;
  *inbytesleft = limit - cursor;

  if (rc == (size_t)(-1) || rc == 0)
    {
      *incremented = false;
      return rc;
    }

  /* rc > 0: iconv() has already incremented the input pointer.  We cannot
     go back to a previous position, otherwise the state inside CD would
     become invalid, if FROM_CODESET is a stateful encoding.  So, tell the
     caller that *INBUF has already been incremented.  */
  *incremented = (cursor > step_start);
  errno = EILSEQ;
  return (size_t)(-1);
}
# else
# define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
    (*(incremented) = false, \
     iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
# endif
205 /* iconv_carefully_1 is like iconv_carefully, except that it stops after
206 converting one character or one shift sequence. */
207 static size_t
208 iconv_carefully_1 (iconv_t cd,
209 const char **inbuf, size_t *inbytesleft,
210 char **outbuf, size_t *outbytesleft,
211 bool *incremented)
213 const char *inptr_before = *inbuf;
214 const char *inptr = inptr_before;
215 const char *inptr_end = inptr_before + *inbytesleft;
216 char *outptr = *outbuf;
217 size_t outsize = *outbytesleft;
218 size_t res = (size_t)(-1);
219 size_t insize;
221 for (insize = 1; inptr_before + insize <= inptr_end; insize++)
223 inptr = inptr_before;
224 res = iconv (cd,
225 (ICONV_CONST char **) &inptr, &insize,
226 &outptr, &outsize);
227 if (!(res == (size_t)(-1) && errno == EINVAL))
228 break;
229 /* iconv can eat up a shift sequence but give EINVAL while attempting
230 to convert the first character. E.g. libiconv does this. */
231 if (inptr > inptr_before)
233 res = 0;
234 break;
238 *inbuf = inptr;
239 *inbytesleft = inptr_end - inptr;
240 # if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
241 && !(defined __GLIBC__ && !defined __UCLIBC__)
242 /* Irix iconv() inserts a NUL byte if it cannot convert.
243 NetBSD iconv() inserts a question mark if it cannot convert.
244 Only GNU libiconv (excluding the bastard Apple iconv) and GNU libc are
245 known to prefer to fail rather than doing a lossy conversion. */
246 if (res != (size_t)(-1) && res > 0)
248 /* iconv() has already incremented INPTR. We cannot go back to a
249 previous INPTR, otherwise the state inside CD would become invalid,
250 if FROM_CODESET is a stateful encoding. So, tell the caller that
251 *INBUF has already been incremented. */
252 *incremented = (inptr > inptr_before);
253 errno = EILSEQ;
254 return (size_t)(-1);
256 # endif
258 if (res != (size_t)(-1))
260 *outbuf = outptr;
261 *outbytesleft = outsize;
263 *incremented = false;
264 return res;
267 /* utf8conv_carefully is like iconv, except that
268 - it converts from UTF-8 to UTF-8,
269 - it stops as soon as it encounters a conversion error, and it returns
270 in *INCREMENTED a boolean telling whether it has incremented the input
271 pointers past the error location,
272 - if one_character_only is true, it stops after converting one
273 character. */
274 static size_t
275 utf8conv_carefully (bool one_character_only,
276 const char **inbuf, size_t *inbytesleft,
277 char **outbuf, size_t *outbytesleft,
278 bool *incremented)
280 const char *inptr = *inbuf;
281 size_t insize = *inbytesleft;
282 char *outptr = *outbuf;
283 size_t outsize = *outbytesleft;
284 size_t res;
286 res = 0;
289 ucs4_t uc;
290 int n;
291 int m;
293 n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
294 if (n < 0)
296 errno = (n == -2 ? EINVAL : EILSEQ);
297 n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
298 inptr += n;
299 insize -= n;
300 res = (size_t)(-1);
301 *incremented = true;
302 break;
304 if (outsize == 0)
306 errno = E2BIG;
307 res = (size_t)(-1);
308 *incremented = false;
309 break;
311 m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
312 if (m == -2)
314 errno = E2BIG;
315 res = (size_t)(-1);
316 *incremented = false;
317 break;
319 inptr += n;
320 insize -= n;
321 if (m == -1)
323 errno = EILSEQ;
324 res = (size_t)(-1);
325 *incremented = true;
326 break;
328 outptr += m;
329 outsize -= m;
331 while (!one_character_only && insize > 0);
333 *inbuf = inptr;
334 *inbytesleft = insize;
335 *outbuf = outptr;
336 *outbytesleft = outsize;
337 return res;
/* Convert the SRCLEN bytes starting at SRC and store the converted data in
   *RESULTP with its length in *LENGTHP.
   CD is the direct conversion descriptor; if it is (iconv_t)(-1), or if the
   direct conversion hits EILSEQ while HANDLER != iconveh_error and
   CD2 != (iconv_t)(-1), the conversion is restarted from the beginning of
   SRC and done in two steps through UTF-8, using CD1 (FROM_CODESET to
   UTF-8) and CD2 (UTF-8 to TO_CODESET).  CD1 == (iconv_t)(-1) means that
   FROM_CODESET is UTF-8, CD2 == (iconv_t)(-1) means that TO_CODESET is
   UTF-8.
   HANDLER selects what is emitted for inconvertible input.
   EXTRA_ALLOC is the number of bytes that are kept available in the result
   buffer beyond the converted data.
   If OFFSETS is not NULL, its first SRCLEN elements are set to
   (size_t)(-1) and then, for each input position at which a conversion
   step starts, to the output length reached at that point.
   On entry, *RESULTP and *LENGTHP may describe a caller-provided buffer; it
   is used when it is large enough.  Otherwise the result is malloc()ed.
   Return 0 if successful.  Upon failure, return -1 with errno set; memory
   allocated by this function is freed and *RESULTP and *LENGTHP are left
   unchanged.  */
340 static int
341 mem_cd_iconveh_internal (const char *src, size_t srclen,
342 iconv_t cd, iconv_t cd1, iconv_t cd2,
343 enum iconv_ilseq_handler handler,
344 size_t extra_alloc,
345 size_t *offsets,
346 char **resultp, size_t *lengthp)
348 /* When a conversion error occurs, we cannot start using CD1 and CD2 at
349 this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
350 Instead, we have to start afresh from the beginning of SRC. */
351 /* Use a temporary buffer, so that for small strings, a single malloc()
352 call will be sufficient. */
353 # define tmpbufsize 4096
354 /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
355 libiconv's UCS-4-INTERNAL encoding. */
356 union { unsigned int align; char buf[tmpbufsize]; } tmp;
357 # define tmpbuf tmp.buf
359 char *initial_result;
360 char *result;
361 size_t allocated;
362 size_t length;
363 size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
365 if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
367 initial_result = *resultp;
368 allocated = *lengthp;
370 else
372 initial_result = tmpbuf;
373 allocated = sizeof (tmpbuf);
375 result = initial_result;
377 /* Test whether a direct conversion is possible at all. */
378 if (cd == (iconv_t)(-1))
379 goto indirectly;
381 if (offsets != NULL)
383 size_t i;
385 for (i = 0; i < srclen; i++)
386 offsets[i] = (size_t)(-1);
388 last_length = (size_t)(-1);
390 length = 0;
392 /* First, try a direct conversion, and see whether a conversion error
393 occurs at all. */
395 const char *inptr = src;
396 size_t insize = srclen;
398 /* Set to the initial state. */
399 iconv (cd, NULL, NULL, NULL, NULL);
401 while (insize > 0)
403 char *outptr = result + length;
404 size_t outsize = allocated - extra_alloc - length;
405 bool incremented;
406 size_t res;
407 bool grow;
409 if (offsets != NULL)
411 if (length != last_length) /* ensure that offset[] be increasing */
413 offsets[inptr - src] = length;
414 last_length = length;
416 res = iconv_carefully_1 (cd,
417 &inptr, &insize,
418 &outptr, &outsize,
419 &incremented);
421 else
422 /* Use iconv_carefully instead of iconv here, because:
423 - If TO_CODESET is UTF-8, we can do the error handling in this
424 loop, no need for a second loop,
425 - With iconv() implementations other than GNU libiconv and GNU
426 libc, if we use iconv() in a big swoop, checking for an E2BIG
427 return, we lose the number of irreversible conversions. */
428 res = iconv_carefully (cd,
429 &inptr, &insize,
430 &outptr, &outsize,
431 &incremented);
433 length = outptr - result;
434 grow = (length + extra_alloc > allocated / 2);
435 if (res == (size_t)(-1))
437 if (errno == E2BIG)
438 grow = true;
439 else if (errno == EINVAL)
440 break;
441 else if (errno == EILSEQ && handler != iconveh_error)
443 if (cd2 == (iconv_t)(-1))
445 /* TO_CODESET is UTF-8. */
446 /* Error handling can produce up to 1 or 3 bytes of
447 output. */
448 size_t extra_need =
449 (handler == iconveh_replacement_character ? 3 : 1);
450 if (length + extra_need + extra_alloc > allocated)
452 char *memory;
454 allocated = 2 * allocated;
455 if (length + extra_need + extra_alloc > allocated)
456 allocated = 2 * allocated;
457 if (length + extra_need + extra_alloc > allocated)
458 abort ();
459 if (result == initial_result)
460 memory = (char *) malloc (allocated);
461 else
462 memory = (char *) realloc (result, allocated);
463 if (memory == NULL)
465 if (result != initial_result)
466 free (result);
467 errno = ENOMEM;
468 return -1;
470 if (result == initial_result)
471 memcpy (memory, initial_result, length);
472 result = memory;
473 grow = false;
475 /* The input is invalid in FROM_CODESET. Eat up one byte
476 and emit a replacement character or a question mark. */
477 if (!incremented)
479 if (insize == 0)
480 abort ();
481 inptr++;
482 insize--;
484 if (handler == iconveh_replacement_character)
486 /* U+FFFD in UTF-8 encoding. */
487 result[length+0] = '\357';
488 result[length+1] = '\277';
489 result[length+2] = '\275';
490 length += 3;
492 else
494 result[length] = '?';
495 length++;
498 else
499 goto indirectly;
501 else
503 if (result != initial_result)
504 free (result);
505 return -1;
508 if (insize == 0)
509 break;
510 if (grow)
512 char *memory;
514 allocated = 2 * allocated;
515 if (result == initial_result)
516 memory = (char *) malloc (allocated);
517 else
518 memory = (char *) realloc (result, allocated);
519 if (memory == NULL)
521 if (result != initial_result)
522 free (result);
523 errno = ENOMEM;
524 return -1;
526 if (result == initial_result)
527 memcpy (memory, initial_result, length);
528 result = memory;
533 /* Now get the conversion state back to the initial state.
534 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
535 #if defined _LIBICONV_VERSION \
536 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
537 || defined __sun)
538 for (;;)
540 char *outptr = result + length;
541 size_t outsize = allocated - extra_alloc - length;
542 size_t res;
544 res = iconv (cd, NULL, NULL, &outptr, &outsize);
545 length = outptr - result;
546 if (res == (size_t)(-1))
548 if (errno == E2BIG)
550 char *memory;
552 allocated = 2 * allocated;
553 if (result == initial_result)
554 memory = (char *) malloc (allocated);
555 else
556 memory = (char *) realloc (result, allocated);
557 if (memory == NULL)
559 if (result != initial_result)
560 free (result);
561 errno = ENOMEM;
562 return -1;
564 if (result == initial_result)
565 memcpy (memory, initial_result, length);
566 result = memory;
568 else
570 if (result != initial_result)
571 free (result);
572 return -1;
575 else
576 break;
578 #endif
580 /* The direct conversion succeeded. */
581 goto done;
583 indirectly:
584 /* The direct conversion failed.
585 Use a conversion through UTF-8. */
586 if (offsets != NULL)
588 size_t i;
590 for (i = 0; i < srclen; i++)
591 offsets[i] = (size_t)(-1);
593 last_length = (size_t)(-1);
595 length = 0;
/* SLOWLY selects the one-character-at-a-time variants of the first
   conversion step below.  */
597 const bool slowly = (offsets != NULL || handler == iconveh_error);
598 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
/* The 3 extra bytes leave room for a replacement character (U+FFFD in
   UTF-8, or '?') that the error handling of step 1 appends.  */
599 char utf8buf[utf8bufsize + 3];
600 size_t utf8len = 0;
601 const char *in1ptr = src;
602 size_t in1size = srclen;
603 bool do_final_flush1 = true;
604 bool do_final_flush2 = true;
606 /* Set to the initial state. */
607 if (cd1 != (iconv_t)(-1))
608 iconv (cd1, NULL, NULL, NULL, NULL);
609 if (cd2 != (iconv_t)(-1))
610 iconv (cd2, NULL, NULL, NULL, NULL);
612 while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
614 char *out1ptr = utf8buf + utf8len;
615 size_t out1size = utf8bufsize - utf8len;
616 bool incremented1;
617 size_t res1;
618 int errno1;
620 /* Conversion step 1: from FROM_CODESET to UTF-8. */
621 if (in1size > 0)
623 if (offsets != NULL
624 && length != last_length) /* ensure that offset[] be increasing */
626 offsets[in1ptr - src] = length;
627 last_length = length;
629 if (cd1 != (iconv_t)(-1))
631 if (slowly)
632 res1 = iconv_carefully_1 (cd1,
633 &in1ptr, &in1size,
634 &out1ptr, &out1size,
635 &incremented1);
636 else
637 res1 = iconv_carefully (cd1,
638 &in1ptr, &in1size,
639 &out1ptr, &out1size,
640 &incremented1);
642 else
644 /* FROM_CODESET is UTF-8. */
645 res1 = utf8conv_carefully (slowly,
646 &in1ptr, &in1size,
647 &out1ptr, &out1size,
648 &incremented1);
651 else if (do_final_flush1)
653 /* Now get the conversion state of CD1 back to the initial state.
654 But avoid glibc-2.1 bug and Solaris 2.7 bug. */
655 # if defined _LIBICONV_VERSION \
656 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
657 || defined __sun)
658 if (cd1 != (iconv_t)(-1))
659 res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
660 else
661 # endif
662 res1 = 0;
663 do_final_flush1 = false;
664 incremented1 = true;
666 else
668 res1 = 0;
669 incremented1 = true;
671 if (res1 == (size_t)(-1)
672 && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
674 if (result != initial_result)
675 free (result);
676 return -1;
678 if (res1 == (size_t)(-1)
679 && errno == EILSEQ && handler != iconveh_error)
681 /* The input is invalid in FROM_CODESET. Eat up one byte and
682 emit a U+FFFD character or a question mark. Room for this
683 character was allocated at the end of utf8buf. */
684 if (!incremented1)
686 if (in1size == 0)
687 abort ();
688 in1ptr++;
689 in1size--;
691 if (handler == iconveh_replacement_character)
693 /* U+FFFD in UTF-8 encoding. */
694 out1ptr[0] = '\357';
695 out1ptr[1] = '\277';
696 out1ptr[2] = '\275';
697 out1ptr += 3;
699 else
700 *out1ptr++ = '?';
701 res1 = 0;
703 errno1 = errno;
704 utf8len = out1ptr - utf8buf;
706 if (offsets != NULL
707 || in1size == 0
708 || utf8len > utf8bufsize / 2
709 || (res1 == (size_t)(-1) && errno1 == E2BIG))
711 /* Conversion step 2: from UTF-8 to TO_CODESET. */
712 const char *in2ptr = utf8buf;
713 size_t in2size = utf8len;
715 while (in2size > 0
716 || (in1size == 0 && !do_final_flush1 && do_final_flush2))
718 char *out2ptr = result + length;
719 size_t out2size = allocated - extra_alloc - length;
720 bool incremented2;
721 size_t res2;
722 bool grow;
724 if (in2size > 0)
726 if (cd2 != (iconv_t)(-1))
727 res2 = iconv_carefully (cd2,
728 &in2ptr, &in2size,
729 &out2ptr, &out2size,
730 &incremented2);
731 else
732 /* TO_CODESET is UTF-8. */
733 res2 = utf8conv_carefully (false,
734 &in2ptr, &in2size,
735 &out2ptr, &out2size,
736 &incremented2);
738 else /* in1size == 0 && !do_final_flush1
739 && in2size == 0 && do_final_flush2 */
741 /* Now get the conversion state of CD2 back to the initial
742 state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
743 # if defined _LIBICONV_VERSION \
744 || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
745 || defined __sun)
746 if (cd2 != (iconv_t)(-1))
747 res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
748 else
749 # endif
750 res2 = 0;
751 do_final_flush2 = false;
752 incremented2 = true;
755 length = out2ptr - result;
756 grow = (length + extra_alloc > allocated / 2);
757 if (res2 == (size_t)(-1))
759 if (errno == E2BIG)
760 grow = true;
761 else if (errno == EINVAL)
762 break;
763 else if (errno == EILSEQ && handler != iconveh_error)
765 /* Error handling can produce up to 10 bytes of UTF-8
766 output. But TO_CODESET may be UCS-2, UTF-16 or
767 UCS-4, so use CD2 here as well. */
768 char scratchbuf[10];
769 size_t scratchlen;
770 ucs4_t uc;
771 const char *inptr;
772 size_t insize;
773 size_t res;
775 if (incremented2)
777 if (u8_prev (&uc, (const uint8_t *) in2ptr,
778 (const uint8_t *) utf8buf)
779 == NULL)
780 abort ();
782 else
784 int n;
785 if (in2size == 0)
786 abort ();
787 n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
788 in2size);
789 in2ptr += n;
790 in2size -= n;
793 if (handler == iconveh_escape_sequence)
795 static char const hex[16] = "0123456789ABCDEF";
796 scratchlen = 0;
797 scratchbuf[scratchlen++] = '\\';
798 if (uc < 0x10000)
799 scratchbuf[scratchlen++] = 'u';
800 else
802 scratchbuf[scratchlen++] = 'U';
803 scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
804 scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
805 scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
806 scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
808 scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
809 scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
810 scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
811 scratchbuf[scratchlen++] = hex[uc & 15];
813 else if (handler == iconveh_replacement_character)
815 /* U+FFFD in UTF-8 encoding. */
816 scratchbuf[0] = '\357';
817 scratchbuf[1] = '\277';
818 scratchbuf[2] = '\275';
819 scratchlen = 3;
821 else
823 scratchbuf[0] = '?';
824 scratchlen = 1;
827 inptr = scratchbuf;
828 insize = scratchlen;
829 if (cd2 != (iconv_t)(-1))
831 char *out2ptr_try = out2ptr;
832 size_t out2size_try = out2size;
833 res = iconv (cd2,
834 (ICONV_CONST char **) &inptr, &insize,
835 &out2ptr_try, &out2size_try);
836 if (handler == iconveh_replacement_character
837 && (res == (size_t)(-1)
838 ? errno == EILSEQ
839 /* FreeBSD iconv(), NetBSD iconv(), and
840 Solaris 11 iconv() insert a '?' if they
841 cannot convert. This is what we want.
842 But IRIX iconv() inserts a NUL byte if it
843 cannot convert.
844 And musl libc iconv() inserts a '*' if it
845 cannot convert. */
846 : (res > 0
847 && !(out2ptr_try - out2ptr == 1
848 && *out2ptr == '?'))))
850 /* The iconv() call failed.
851 U+FFFD can't be converted to TO_CODESET.
852 Use '?' instead. */
853 scratchbuf[0] = '?';
854 scratchlen = 1;
855 inptr = scratchbuf;
856 insize = scratchlen;
857 res = iconv (cd2,
858 (ICONV_CONST char **) &inptr, &insize,
859 &out2ptr, &out2size);
861 else
863 /* Accept the results of the iconv() call. */
864 out2ptr = out2ptr_try;
865 out2size = out2size_try;
866 res = 0;
869 else
871 /* TO_CODESET is UTF-8. */
872 if (out2size >= insize)
874 memcpy (out2ptr, inptr, insize);
875 out2ptr += insize;
876 out2size -= insize;
877 inptr += insize;
878 insize = 0;
879 res = 0;
881 else
883 errno = E2BIG;
884 res = (size_t)(-1);
887 length = out2ptr - result;
888 if (res == (size_t)(-1) && errno == E2BIG)
890 char *memory;
892 allocated = 2 * allocated;
893 if (length + 1 + extra_alloc > allocated)
894 abort ();
895 if (result == initial_result)
896 memory = (char *) malloc (allocated);
897 else
898 memory = (char *) realloc (result, allocated);
899 if (memory == NULL)
901 if (result != initial_result)
902 free (result);
903 errno = ENOMEM;
904 return -1;
906 if (result == initial_result)
907 memcpy (memory, initial_result, length);
908 result = memory;
909 grow = false;
911 out2ptr = result + length;
912 out2size = allocated - extra_alloc - length;
913 if (cd2 != (iconv_t)(-1))
914 res = iconv (cd2,
915 (ICONV_CONST char **) &inptr,
916 &insize,
917 &out2ptr, &out2size);
918 else
920 /* TO_CODESET is UTF-8. */
921 if (!(out2size >= insize))
922 abort ();
923 memcpy (out2ptr, inptr, insize);
924 out2ptr += insize;
925 out2size -= insize;
926 inptr += insize;
927 insize = 0;
928 res = 0;
930 length = out2ptr - result;
932 # if !(defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
933 && !(defined __GLIBC__ && !defined __UCLIBC__)
934 /* IRIX iconv() inserts a NUL byte if it cannot convert.
935 FreeBSD iconv(), NetBSD iconv(), and Solaris 11
936 iconv() insert a '?' if they cannot convert.
937 musl libc iconv() inserts a '*' if it cannot convert.
938 Only GNU libiconv (excluding the bastard Apple iconv)
939 and GNU libc are known to prefer to fail rather than
940 doing a lossy conversion. */
941 if (res != (size_t)(-1) && res > 0)
943 errno = EILSEQ;
944 res = (size_t)(-1);
946 # endif
947 if (res == (size_t)(-1))
949 /* Failure converting the ASCII replacement. */
950 if (result != initial_result)
951 free (result);
952 return -1;
955 else
957 if (result != initial_result)
958 free (result);
959 return -1;
962 if (!(in2size > 0
963 || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
964 break;
965 if (grow)
967 char *memory;
969 allocated = 2 * allocated;
970 if (result == initial_result)
971 memory = (char *) malloc (allocated);
972 else
973 memory = (char *) realloc (result, allocated);
974 if (memory == NULL)
976 if (result != initial_result)
977 free (result);
978 errno = ENOMEM;
979 return -1;
981 if (result == initial_result)
982 memcpy (memory, initial_result, length);
983 result = memory;
987 /* Move the remaining bytes to the beginning of utf8buf. */
988 if (in2size > 0)
989 memmove (utf8buf, in2ptr, in2size);
990 utf8len = in2size;
993 if (res1 == (size_t)(-1))
995 if (errno1 == EINVAL)
996 in1size = 0;
997 else if (errno1 == EILSEQ)
999 if (result != initial_result)
1000 free (result);
1001 errno = errno1;
1002 return -1;
1006 # undef utf8bufsize
1009 done:
1010 /* Now the final memory allocation. */
1011 if (result == tmpbuf)
1013 size_t memsize = length + extra_alloc;
1015 if (*resultp != NULL && *lengthp >= memsize)
1016 result = *resultp;
1017 else
1019 char *memory;
1021 memory = (char *) malloc (memsize > 0 ? memsize : 1);
1022 if (memory != NULL)
1023 result = memory;
1024 else
1026 errno = ENOMEM;
1027 return -1;
1030 memcpy (result, tmpbuf, length);
1032 else if (result != *resultp && length + extra_alloc < allocated)
1034 /* Shrink the allocated memory if possible. */
1035 size_t memsize = length + extra_alloc;
1036 char *memory;
1038 memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
1039 if (memory != NULL)
1040 result = memory;
1042 *resultp = result;
1043 *lengthp = length;
1044 return 0;
1045 # undef tmpbuf
1046 # undef tmpbufsize
1050 mem_cd_iconveh (const char *src, size_t srclen,
1051 const iconveh_t *cd,
1052 enum iconv_ilseq_handler handler,
1053 size_t *offsets,
1054 char **resultp, size_t *lengthp)
1056 return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
1057 handler, 0, offsets, resultp, lengthp);
1060 char *
1061 str_cd_iconveh (const char *src,
1062 const iconveh_t *cd,
1063 enum iconv_ilseq_handler handler)
1065 /* For most encodings, a trailing NUL byte in the input will be converted
1066 to a trailing NUL byte in the output. But not for UTF-7. So that this
1067 function is usable for UTF-7, we have to exclude the NUL byte from the
1068 conversion and add it by hand afterwards. */
1069 char *result = NULL;
1070 size_t length = 0;
1071 int retval = mem_cd_iconveh_internal (src, strlen (src),
1072 cd->cd, cd->cd1, cd->cd2, handler, 1,
1073 NULL, &result, &length);
1075 if (retval < 0)
1077 free (result);
1078 return NULL;
1081 /* Add the terminating NUL byte. */
1082 result[length] = '\0';
1084 return result;
1087 #endif
1090 mem_iconveh (const char *src, size_t srclen,
1091 const char *from_codeset, const char *to_codeset,
1092 enum iconv_ilseq_handler handler,
1093 size_t *offsets,
1094 char **resultp, size_t *lengthp)
1096 if (srclen == 0)
1098 /* Nothing to convert. */
1099 *lengthp = 0;
1100 return 0;
1102 else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
1104 char *result;
1106 if (*resultp != NULL && *lengthp >= srclen)
1107 result = *resultp;
1108 else
1110 result = (char *) malloc (srclen);
1111 if (result == NULL)
1113 errno = ENOMEM;
1114 return -1;
1117 memcpy (result, src, srclen);
1118 *resultp = result;
1119 *lengthp = srclen;
1120 return 0;
1122 else
1124 #if HAVE_ICONV
1125 iconveh_t cd;
1126 char *result;
1127 size_t length;
1128 int retval;
1130 if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1131 return -1;
1133 result = *resultp;
1134 length = *lengthp;
1135 retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
1136 &result, &length);
1138 if (retval < 0)
1140 /* Close cd, but preserve the errno from str_cd_iconv. */
1141 int saved_errno = errno;
1142 iconveh_close (&cd);
1143 errno = saved_errno;
1145 else
1147 if (iconveh_close (&cd) < 0)
1149 if (result != *resultp)
1150 free (result);
1151 return -1;
1153 *resultp = result;
1154 *lengthp = length;
1156 return retval;
1157 #else
1158 /* This is a different error code than if iconv_open existed but didn't
1159 support from_codeset and to_codeset, so that the caller can emit
1160 an error message such as
1161 "iconv() is not supported. Installing GNU libiconv and
1162 then reinstalling this package would fix this." */
1163 errno = ENOSYS;
1164 return -1;
1165 #endif
1169 char *
1170 str_iconveh (const char *src,
1171 const char *from_codeset, const char *to_codeset,
1172 enum iconv_ilseq_handler handler)
1174 if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
1176 char *result = strdup (src);
1178 if (result == NULL)
1179 errno = ENOMEM;
1180 return result;
1182 else
1184 #if HAVE_ICONV
1185 iconveh_t cd;
1186 char *result;
1188 if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
1189 return NULL;
1191 result = str_cd_iconveh (src, &cd, handler);
1193 if (result == NULL)
1195 /* Close cd, but preserve the errno from str_cd_iconv. */
1196 int saved_errno = errno;
1197 iconveh_close (&cd);
1198 errno = saved_errno;
1200 else
1202 if (iconveh_close (&cd) < 0)
1204 free (result);
1205 return NULL;
1208 return result;
1209 #else
1210 /* This is a different error code than if iconv_open existed but didn't
1211 support from_codeset and to_codeset, so that the caller can emit
1212 an error message such as
1213 "iconv() is not supported. Installing GNU libiconv and
1214 then reinstalling this package would fix this." */
1215 errno = ENOSYS;
1216 return NULL;
1217 #endif