1 /* Character set conversion with error handling and autodetection.
2 Copyright (C) 2002, 2005, 2007, 2009-2025 Free Software Foundation, Inc.
3 Written by Bruno Haible.
5 This file is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Lesser General Public License as
7 published by the Free Software Foundation; either version 2.1 of the
8 License, or (at your option) any later version.
10 This file is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* Specification.  */
#include "striconveha.h"

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#include "c-strcase.h"
#include "striconveh.h"
/* Number of elements of the array A.  A must be a true array, not a
   pointer: the result is computed from sizeof.  The argument is
   parenthesized so that any array-valued expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
34 /* Autodetection list. */
/* One entry of the singly linked autodetection list.  It associates a
   pseudo-encoding NAME with the encodings that are tried, in order, when
   that name is given as the source encoding.  */
struct autodetect_alias
{
  /* Next entry of the list, or NULL for the last entry.  */
  struct autodetect_alias *next;
  /* Name under which this entry is looked up (compared with strcmp).  */
  const char *name;
  /* NULL-terminated array of encoding names to try, in this order.  */
  const char * const *encodings_to_try;
};
/* Encodings to try, in order, for the "autodetect_utf8" alias.
   The array is NULL-terminated, because its consumers iterate until they
   reach a NULL entry.  */
static const char * const autodetect_utf8_try[] =
{
  /* Try UTF-8 first.  There are very few ISO-8859-1 inputs that would
     be valid UTF-8, but many UTF-8 inputs are valid ISO-8859-1.  */
  "UTF-8", "ISO-8859-1",
  NULL
};
/* Encodings to try, in order, for the "autodetect_jp" alias.
   The array is NULL-terminated, because its consumers iterate until they
   reach a NULL entry.  */
static const char * const autodetect_jp_try[] =
{
  /* Try 7-bit encoding first.  If the input contains bytes >= 0x80,
     it will fail.
     Try EUC-JP next.  Short SHIFT_JIS inputs may come out wrong.  This
     is unavoidable.  People will condemn SHIFT_JIS.
     If we tried SHIFT_JIS first, then some short EUC-JP inputs would
     come out wrong, and people would condemn EUC-JP and Unix, which
     would not be good.
     Finally try SHIFT_JIS.  */
  "ISO-2022-JP-2", "EUC-JP", "SHIFT_JIS",
  NULL
};
/* Encodings to try, in order, for the "autodetect_kr" alias.
   The array is NULL-terminated, because its consumers iterate until they
   reach a NULL entry.  */
static const char * const autodetect_kr_try[] =
{
  /* Try 7-bit encoding first.  If the input contains bytes >= 0x80,
     it will fail.
     Finally try EUC-KR.  */
  "ISO-2022-KR", "EUC-KR",
  NULL
};
72 static struct autodetect_alias autodetect_predefined
[] =
74 { &autodetect_predefined
[1], "autodetect_utf8", autodetect_utf8_try
},
75 { &autodetect_predefined
[2], "autodetect_jp", autodetect_jp_try
},
76 { NULL
, "autodetect_kr", autodetect_kr_try
}
79 static struct autodetect_alias
*autodetect_list
= &autodetect_predefined
[0];
80 static struct autodetect_alias
**autodetect_list_end
=
81 &autodetect_predefined
[SIZEOF(autodetect_predefined
)-1].next
;
84 uniconv_register_autodetect (const char *name
,
85 const char * const *try_in_order
)
92 /* The TRY_IN_ORDER list must not be empty. */
93 if (try_in_order
[0] == NULL
)
99 /* We must deep-copy NAME and TRY_IN_ORDER, because they may be allocated
100 with dynamic extent. */
101 namelen
= strlen (name
) + 1;
102 memneed
= sizeof (struct autodetect_alias
) + namelen
+ sizeof (char *);
103 for (i
= 0; try_in_order
[i
] != NULL
; i
++)
104 memneed
+= sizeof (char *) + strlen (try_in_order
[i
]) + 1;
107 void *memory
= malloc (memneed
);
110 struct autodetect_alias
*new_alias
= memory
;
111 memory
= new_alias
+ 1;
113 char const **new_try_in_order
= memory
;
114 memory
= new_try_in_order
+ listlen
+ 1;
116 char *new_name
= memcpy (memory
, name
, namelen
);
117 memory
= new_name
+ namelen
;
119 for (i
= 0; i
< listlen
; i
++)
121 size_t len
= strlen (try_in_order
[i
]) + 1;
122 char *copy
= memcpy (memory
, try_in_order
[i
], len
);
123 new_try_in_order
[i
] = copy
;
126 new_try_in_order
[i
] = NULL
;
128 /* Now insert the new alias. */
129 new_alias
->name
= new_name
;
130 new_alias
->encodings_to_try
= new_try_in_order
;
131 new_alias
->next
= NULL
;
132 /* FIXME: Not multithread-safe. */
133 *autodetect_list_end
= new_alias
;
134 autodetect_list_end
= &new_alias
->next
;
144 /* Like mem_iconveha, except no handling of transliteration. */
146 mem_iconveha_notranslit (const char *src
, size_t srclen
,
147 const char *from_codeset
, const char *to_codeset
,
148 enum iconv_ilseq_handler handler
,
150 char **resultp
, size_t *lengthp
)
152 int retval
= mem_iconveh (src
, srclen
, from_codeset
, to_codeset
, handler
,
153 offsets
, resultp
, lengthp
);
154 if (retval
>= 0 || errno
!= EINVAL
)
158 struct autodetect_alias
*alias
;
160 /* Unsupported from_codeset or to_codeset. Check whether the caller
161 requested autodetection. */
162 for (alias
= autodetect_list
; alias
!= NULL
; alias
= alias
->next
)
163 if (strcmp (from_codeset
, alias
->name
) == 0)
165 const char * const *encodings
;
167 if (handler
!= iconveh_error
)
169 /* First try all encodings without any forgiving. */
170 encodings
= alias
->encodings_to_try
;
173 retval
= mem_iconveha_notranslit (src
, srclen
,
174 *encodings
, to_codeset
,
175 iconveh_error
, offsets
,
177 if (!(retval
< 0 && errno
== EILSEQ
))
181 while (*encodings
!= NULL
);
184 encodings
= alias
->encodings_to_try
;
187 retval
= mem_iconveha_notranslit (src
, srclen
,
188 *encodings
, to_codeset
,
191 if (!(retval
< 0 && errno
== EILSEQ
))
195 while (*encodings
!= NULL
);
197 /* Return the last call's result. */
201 /* It wasn't an autodetection name. */
208 mem_iconveha (const char *src
, size_t srclen
,
209 const char *from_codeset
, const char *to_codeset
,
211 enum iconv_ilseq_handler handler
,
213 char **resultp
, size_t *lengthp
)
217 /* Nothing to convert. */
222 /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5 or Citrus/FreeBSD/macOS
223 iconv, we want to use transliteration. */
224 #if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
225 && !defined __UCLIBC__) \
226 || _LIBICONV_VERSION >= 0x0105 \
227 || defined ICONV_SET_TRANSLITERATE
231 size_t len
= strlen (to_codeset
);
232 char *to_codeset_suffixed
= (char *) malloca (len
+ 10 + 1);
233 if (to_codeset_suffixed
== NULL
)
238 memcpy (to_codeset_suffixed
, to_codeset
, len
);
239 memcpy (to_codeset_suffixed
+ len
, "//TRANSLIT", 10 + 1);
241 retval
= mem_iconveha_notranslit (src
, srclen
,
242 from_codeset
, to_codeset_suffixed
,
243 handler
, offsets
, resultp
, lengthp
);
245 freea (to_codeset_suffixed
);
251 return mem_iconveha_notranslit (src
, srclen
,
252 from_codeset
, to_codeset
,
253 handler
, offsets
, resultp
, lengthp
);
256 /* Like str_iconveha, except no handling of transliteration. */
258 str_iconveha_notranslit (const char *src
,
259 const char *from_codeset
, const char *to_codeset
,
260 enum iconv_ilseq_handler handler
)
262 char *result
= str_iconveh (src
, from_codeset
, to_codeset
, handler
);
264 if (result
!= NULL
|| errno
!= EINVAL
)
268 struct autodetect_alias
*alias
;
270 /* Unsupported from_codeset or to_codeset. Check whether the caller
271 requested autodetection. */
272 for (alias
= autodetect_list
; alias
!= NULL
; alias
= alias
->next
)
273 if (strcmp (from_codeset
, alias
->name
) == 0)
275 const char * const *encodings
;
277 if (handler
!= iconveh_error
)
279 /* First try all encodings without any forgiving. */
280 encodings
= alias
->encodings_to_try
;
283 result
= str_iconveha_notranslit (src
,
284 *encodings
, to_codeset
,
286 if (!(result
== NULL
&& errno
== EILSEQ
))
290 while (*encodings
!= NULL
);
293 encodings
= alias
->encodings_to_try
;
296 result
= str_iconveha_notranslit (src
,
297 *encodings
, to_codeset
,
299 if (!(result
== NULL
&& errno
== EILSEQ
))
303 while (*encodings
!= NULL
);
305 /* Return the last call's result. */
309 /* It wasn't an autodetection name. */
316 str_iconveha (const char *src
,
317 const char *from_codeset
, const char *to_codeset
,
319 enum iconv_ilseq_handler handler
)
321 if (*src
== '\0' || c_strcasecmp (from_codeset
, to_codeset
) == 0)
323 char *result
= strdup (src
);
330 /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5 or Citrus/FreeBSD/macOS
331 iconv, we want to use transliteration. */
332 #if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
333 && !defined __UCLIBC__) \
334 || _LIBICONV_VERSION >= 0x0105 \
335 || defined ICONV_SET_TRANSLITERATE
339 size_t len
= strlen (to_codeset
);
340 char *to_codeset_suffixed
= (char *) malloca (len
+ 10 + 1);
341 if (to_codeset_suffixed
== NULL
)
346 memcpy (to_codeset_suffixed
, to_codeset
, len
);
347 memcpy (to_codeset_suffixed
+ len
, "//TRANSLIT", 10 + 1);
349 result
= str_iconveha_notranslit (src
, from_codeset
, to_codeset_suffixed
,
352 freea (to_codeset_suffixed
);
358 return str_iconveha_notranslit (src
, from_codeset
, to_codeset
, handler
);