1 /* Test of character set conversion with error handling and autodetection.
2 Copyright (C) 2007-2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Bruno Haible <bruno@clisp.org>, 2007. */
21 #include "striconveha.h"
/* Defined outside this file.  The tests below use its result as a condition
   to decide whether the autodetect_jp cases (which need ISO-2022-JP-2) are
   run at all.  */
extern int iconv_supports_encoding (const char *encoding);
/* Magic number for detecting bounds violations.  It is stored in the extra
   slot one past the last real entry of an offsets array; the tests assert
   afterwards that this slot still holds the value.  */
#define MAGIC 0x1983EFF1

/* Allocate an offsets array with room for N entries plus one trailing guard
   slot, and store MAGIC in the guard slot (index N).  The N real entries are
   left uninitialized; the caller owns the array.

   Aborts if the requested size would overflow or if the allocation fails,
   rather than writing through a null pointer.

   NOTE(review): only the signature and the malloc call of this helper were
   visible; the return type, the guard-slot store and the return statement
   are reconstructed from the call sites
   (size_t *offsets = new_offsets (strlen (input));
    ASSERT (offsets[len] == MAGIC);) -- confirm against the upstream file.  */
static size_t *
new_offsets (size_t n)
{
  size_t *offsets;

  /* Refuse sizes for which (n + 1) * sizeof (size_t) would wrap around.  */
  if (n >= (size_t) -1 / sizeof *offsets)
    abort ();

  offsets = malloc ((n + 1) * sizeof *offsets);
  if (offsets == NULL)
    abort ();

  offsets[n] = MAGIC;
  return offsets;
}
49 static enum iconv_ilseq_handler handlers
[] =
50 { iconveh_error
, iconveh_question_mark
, iconveh_escape_sequence
};
55 /* Assume that iconv() supports at least the encodings ASCII, ISO-8859-1,
56 ISO-8859-2, and UTF-8. */
58 /* ------------------------- Test mem_iconveha() ------------------------- */
60 /* Test conversion from ISO-8859-2 to ISO-8859-1 with no errors. */
61 for (h
= 0; h
< SIZEOF (handlers
); h
++)
63 enum iconv_ilseq_handler handler
= handlers
[h
];
64 static const char input
[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
65 static const char expected
[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
66 for (o
= 0; o
< 2; o
++)
68 size_t *offsets
= (o
? new_offsets (strlen (input
)) : NULL
);
71 int retval
= mem_iconveha (input
, strlen (input
),
72 "ISO-8859-2", "ISO-8859-1",
77 ASSERT (length
== strlen (expected
));
78 ASSERT (result
!= NULL
&& memcmp (result
, expected
, strlen (expected
)) == 0);
81 for (i
= 0; i
< 37; i
++)
82 ASSERT (offsets
[i
] == i
);
83 ASSERT (offsets
[37] == MAGIC
);
90 /* Test conversion from ISO-8859-2 to ISO-8859-1 with EILSEQ. */
91 for (h
= 0; h
< SIZEOF (handlers
); h
++)
93 enum iconv_ilseq_handler handler
= handlers
[h
];
94 static const char input
[] = "Rafa\263 Maszkowski"; /* Rafał Maszkowski */
95 for (o
= 0; o
< 2; o
++)
97 size_t *offsets
= (o
? new_offsets (strlen (input
)) : NULL
);
100 int retval
= mem_iconveha (input
, strlen (input
),
101 "ISO-8859-2", "ISO-8859-1",
108 ASSERT (retval
== -1 && errno
== EILSEQ
);
109 ASSERT (result
== NULL
);
113 case iconveh_question_mark
:
115 static const char expected
[] = "Rafa? Maszkowski";
116 ASSERT (retval
== 0);
117 ASSERT (length
== strlen (expected
));
118 ASSERT (result
!= NULL
&& memcmp (result
, expected
, strlen (expected
)) == 0);
121 for (i
= 0; i
< 16; i
++)
122 ASSERT (offsets
[i
] == i
);
123 ASSERT (offsets
[16] == MAGIC
);
129 case iconveh_escape_sequence
:
131 static const char expected
[] = "Rafa\\u0142 Maszkowski";
132 ASSERT (retval
== 0);
133 ASSERT (length
== strlen (expected
));
134 ASSERT (result
!= NULL
&& memcmp (result
, expected
, strlen (expected
)) == 0);
137 for (i
= 0; i
< 16; i
++)
138 ASSERT (offsets
[i
] == (i
< 5 ? i
:
140 ASSERT (offsets
[16] == MAGIC
);
150 /* Test conversion from ISO-8859-1 to UTF-8 with no errors. */
151 for (h
= 0; h
< SIZEOF (handlers
); h
++)
153 enum iconv_ilseq_handler handler
= handlers
[h
];
154 static const char input
[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
155 static const char expected
[] = "\303\204rger mit b\303\266sen B\303\274bchen ohne Augenma\303\237";
156 for (o
= 0; o
< 2; o
++)
158 size_t *offsets
= (o
? new_offsets (strlen (input
)) : NULL
);
161 int retval
= mem_iconveha (input
, strlen (input
),
162 "ISO-8859-1", "UTF-8",
166 ASSERT (retval
== 0);
167 ASSERT (length
== strlen (expected
));
168 ASSERT (result
!= NULL
&& memcmp (result
, expected
, strlen (expected
)) == 0);
171 for (i
= 0; i
< 37; i
++)
172 ASSERT (offsets
[i
] == (i
< 1 ? i
:
176 ASSERT (offsets
[37] == MAGIC
);
183 /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */
184 for (h
= 0; h
< SIZEOF (handlers
); h
++)
186 enum iconv_ilseq_handler handler
= handlers
[h
];
187 static const char input
[] = "\303\204rger mit b\303\266sen B\303\274bchen ohne Augenma\303\237";
188 static const char expected
[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
189 for (o
= 0; o
< 2; o
++)
191 size_t *offsets
= (o
? new_offsets (strlen (input
)) : NULL
);
194 int retval
= mem_iconveha (input
, strlen (input
),
195 "UTF-8", "ISO-8859-1",
199 ASSERT (retval
== 0);
200 ASSERT (length
== strlen (expected
));
201 ASSERT (result
!= NULL
&& memcmp (result
, expected
, strlen (expected
)) == 0);
204 for (i
= 0; i
< 41; i
++)
205 ASSERT (offsets
[i
] == (i
< 1 ? i
:
206 i
== 1 ? (size_t)(-1) :
208 i
== 13 ? (size_t)(-1) :
210 i
== 20 ? (size_t)(-1) :
213 ASSERT (offsets
[41] == MAGIC
);
220 /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ. */
221 for (h
= 0; h
< SIZEOF (handlers
); h
++)
223 enum iconv_ilseq_handler handler
= handlers
[h
];
224 static const char input
[] = "Rafa\305\202 Maszkowski"; /* Rafał Maszkowski */
225 for (o
= 0; o
< 2; o
++)
227 size_t *offsets
= (o
? new_offsets (strlen (input
)) : NULL
);
230 int retval
= mem_iconveha (input
, strlen (input
),
231 "UTF-8", "ISO-8859-1",
238 ASSERT (retval
== -1 && errno
== EILSEQ
);
239 ASSERT (result
== NULL
);
243 case iconveh_question_mark
:
245 static const char expected
[] = "Rafa? Maszkowski";
246 ASSERT (retval
== 0);
247 ASSERT (length
== strlen (expected
));
248 ASSERT (result
!= NULL
&& memcmp (result
, expected
, strlen (expected
)) == 0);
251 for (i
= 0; i
< 17; i
++)
252 ASSERT (offsets
[i
] == (i
< 5 ? i
:
253 i
== 5 ? (size_t)(-1) :
255 ASSERT (offsets
[17] == MAGIC
);
261 case iconveh_escape_sequence
:
263 static const char expected
[] = "Rafa\\u0142 Maszkowski";
264 ASSERT (retval
== 0);
265 ASSERT (length
== strlen (expected
));
266 ASSERT (result
!= NULL
&& memcmp (result
, expected
, strlen (expected
)) == 0);
269 for (i
= 0; i
< 17; i
++)
270 ASSERT (offsets
[i
] == (i
< 5 ? i
:
271 i
== 5 ? (size_t)(-1) :
273 ASSERT (offsets
[17] == MAGIC
);
283 /* Test conversion from UTF-8 to ISO-8859-1 with EINVAL. */
284 for (h
= 0; h
< SIZEOF (handlers
); h
++)
286 enum iconv_ilseq_handler handler
= handlers
[h
];
287 static const char input
[] = "\342";
288 for (o
= 0; o
< 2; o
++)
290 size_t *offsets
= (o
? new_offsets (strlen (input
)) : NULL
);
293 int retval
= mem_iconveha (input
, strlen (input
),
294 "UTF-8", "ISO-8859-1",
298 ASSERT (retval
== 0);
299 ASSERT (length
== 0);
302 ASSERT (offsets
[0] == 0);
303 ASSERT (offsets
[1] == MAGIC
);
310 /* autodetect_jp is only supported when iconv() supports ISO-2022-JP-2. */
311 # if (defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
312 || !(defined _AIX || defined __sgi || defined __hpux || defined __osf__ || defined __sun)
313 if (iconv_supports_encoding ("ISO-2022-JP-2"))
315 /* Test conversions from autodetect_jp to UTF-8. */
316 for (h
= 0; h
< SIZEOF (handlers
); h
++)
318 enum iconv_ilseq_handler handler
= handlers
[h
];
319 static const char input
[] = "\244\263\244\363\244\313\244\301\244\317"; /* こんにちは in EUC-JP */
320 static const char expected
[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* こんにちは */
321 for (o
= 0; o
< 2; o
++)
323 size_t *offsets
= (o
? new_offsets (strlen (input
)) : NULL
);
326 int retval
= mem_iconveha (input
, strlen (input
),
327 "autodetect_jp", "UTF-8",
331 ASSERT (retval
== 0);
332 ASSERT (length
== strlen (expected
));
333 ASSERT (result
!= NULL
&& memcmp (result
, expected
, strlen (expected
)) == 0);
336 for (i
= 0; i
< 10; i
++)
337 ASSERT (offsets
[i
] == ((i
% 2) == 0 ? (i
/ 2) * 3 : (size_t)(-1)));
338 ASSERT (offsets
[10] == MAGIC
);
344 for (h
= 0; h
< SIZEOF (handlers
); h
++)
346 enum iconv_ilseq_handler handler
= handlers
[h
];
347 static const char input
[] = "\202\261\202\361\202\311\202\277\202\315"; /* こんにちは in Shift_JIS */
348 static const char expected
[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* こんにちは */
349 for (o
= 0; o
< 2; o
++)
351 size_t *offsets
= (o
? new_offsets (strlen (input
)) : NULL
);
354 int retval
= mem_iconveha (input
, strlen (input
),
355 "autodetect_jp", "UTF-8",
359 ASSERT (retval
== 0);
360 ASSERT (length
== strlen (expected
));
361 ASSERT (result
!= NULL
&& memcmp (result
, expected
, strlen (expected
)) == 0);
364 for (i
= 0; i
< 10; i
++)
365 ASSERT (offsets
[i
] == ((i
% 2) == 0 ? (i
/ 2) * 3 : (size_t)(-1)));
366 ASSERT (offsets
[10] == MAGIC
);
372 for (h
= 0; h
< SIZEOF (handlers
); h
++)
374 enum iconv_ilseq_handler handler
= handlers
[h
];
375 static const char input
[] = "\033$B$3$s$K$A$O\033(B"; /* こんにちは in ISO-2022-JP-2 */
376 static const char expected
[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* こんにちは */
377 for (o
= 0; o
< 2; o
++)
379 size_t *offsets
= (o
? new_offsets (strlen (input
)) : NULL
);
382 int retval
= mem_iconveha (input
, strlen (input
),
383 "autodetect_jp", "UTF-8",
387 ASSERT (retval
== 0);
388 ASSERT (length
== strlen (expected
));
389 ASSERT (result
!= NULL
&& memcmp (result
, expected
, strlen (expected
)) == 0);
392 for (i
= 0; i
< 16; i
++)
393 ASSERT (offsets
[i
] == (i
== 0 ? 0 :
400 ASSERT (offsets
[16] == MAGIC
);
409 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) && !defined __UCLIBC__) || (_LIBICONV_VERSION >= 0x0105 && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__))
410 /* Test conversion from UTF-8 to ISO-8859-1 with transliteration. */
411 for (h
= 0; h
< SIZEOF (handlers
); h
++)
413 enum iconv_ilseq_handler handler
= handlers
[h
];
414 static const char input
[] = "Costs: 27 \342\202\254"; /* EURO SIGN */
415 static const char expected
[] = "Costs: 27 EUR";
416 for (o
= 0; o
< 2; o
++)
418 size_t *offsets
= (o
? new_offsets (strlen (input
)) : NULL
);
421 int retval
= mem_iconveha (input
, strlen (input
),
422 "UTF-8", "ISO-8859-1",
426 ASSERT (retval
== 0);
427 ASSERT (length
== strlen (expected
));
428 ASSERT (result
!= NULL
&& memcmp (result
, expected
, strlen (expected
)) == 0);
431 for (i
= 0; i
< 13; i
++)
432 ASSERT (offsets
[i
] == (i
< 11 ? i
: (size_t)(-1)));
433 ASSERT (offsets
[13] == MAGIC
);
441 /* ------------------------- Test str_iconveha() ------------------------- */
443 /* Test conversion from ISO-8859-2 to ISO-8859-1 with no errors. */
444 for (h
= 0; h
< SIZEOF (handlers
); h
++)
446 enum iconv_ilseq_handler handler
= handlers
[h
];
447 static const char input
[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
448 static const char expected
[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
449 char *result
= str_iconveha (input
, "ISO-8859-2", "ISO-8859-1", false, handler
);
450 ASSERT (result
!= NULL
);
451 ASSERT (strcmp (result
, expected
) == 0);
455 /* Test conversion from ISO-8859-2 to ISO-8859-1 with EILSEQ. */
456 for (h
= 0; h
< SIZEOF (handlers
); h
++)
458 enum iconv_ilseq_handler handler
= handlers
[h
];
459 static const char input
[] = "Rafa\263 Maszkowski"; /* Rafał Maszkowski */
460 char *result
= str_iconveha (input
, "ISO-8859-2", "ISO-8859-1", false, handler
);
464 ASSERT (result
== NULL
&& errno
== EILSEQ
);
466 case iconveh_question_mark
:
468 static const char expected
[] = "Rafa? Maszkowski";
469 ASSERT (result
!= NULL
);
470 ASSERT (strcmp (result
, expected
) == 0);
474 case iconveh_escape_sequence
:
476 static const char expected
[] = "Rafa\\u0142 Maszkowski";
477 ASSERT (result
!= NULL
);
478 ASSERT (strcmp (result
, expected
) == 0);
485 /* Test conversion from ISO-8859-1 to UTF-8 with no errors. */
486 for (h
= 0; h
< SIZEOF (handlers
); h
++)
488 enum iconv_ilseq_handler handler
= handlers
[h
];
489 static const char input
[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
490 static const char expected
[] = "\303\204rger mit b\303\266sen B\303\274bchen ohne Augenma\303\237";
491 char *result
= str_iconveha (input
, "ISO-8859-1", "UTF-8", false, handler
);
492 ASSERT (result
!= NULL
);
493 ASSERT (strcmp (result
, expected
) == 0);
497 /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */
498 for (h
= 0; h
< SIZEOF (handlers
); h
++)
500 enum iconv_ilseq_handler handler
= handlers
[h
];
501 static const char input
[] = "\303\204rger mit b\303\266sen B\303\274bchen ohne Augenma\303\237";
502 static const char expected
[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337";
503 char *result
= str_iconveha (input
, "UTF-8", "ISO-8859-1", false, handler
);
504 ASSERT (result
!= NULL
);
505 ASSERT (strcmp (result
, expected
) == 0);
509 /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ. */
510 for (h
= 0; h
< SIZEOF (handlers
); h
++)
512 enum iconv_ilseq_handler handler
= handlers
[h
];
513 static const char input
[] = "Costs: 27 \342\202\254"; /* EURO SIGN */
514 char *result
= str_iconveha (input
, "UTF-8", "ISO-8859-1", false, handler
);
518 ASSERT (result
== NULL
&& errno
== EILSEQ
);
520 case iconveh_question_mark
:
522 static const char expected
[] = "Costs: 27 ?";
523 ASSERT (result
!= NULL
);
524 ASSERT (strcmp (result
, expected
) == 0);
528 case iconveh_escape_sequence
:
530 static const char expected
[] = "Costs: 27 \\u20AC";
531 ASSERT (result
!= NULL
);
532 ASSERT (strcmp (result
, expected
) == 0);
539 /* Test conversion from UTF-8 to ISO-8859-1 with EINVAL. */
540 for (h
= 0; h
< SIZEOF (handlers
); h
++)
542 enum iconv_ilseq_handler handler
= handlers
[h
];
543 static const char input
[] = "\342";
544 char *result
= str_iconveha (input
, "UTF-8", "ISO-8859-1", false, handler
);
545 ASSERT (result
!= NULL
);
546 ASSERT (strcmp (result
, "") == 0);
550 /* autodetect_jp is only supported when iconv() supports ISO-2022-JP-2. */
551 # if (defined _LIBICONV_VERSION && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__)) \
552 || !(defined _AIX || defined __sgi || defined __hpux || defined __osf__ || defined __sun)
553 if (iconv_supports_encoding ("ISO-2022-JP-2"))
555 /* Test conversions from autodetect_jp to UTF-8. */
556 for (h
= 0; h
< SIZEOF (handlers
); h
++)
558 enum iconv_ilseq_handler handler
= handlers
[h
];
559 static const char input
[] = "\244\263\244\363\244\313\244\301\244\317"; /* こんにちは in EUC-JP */
560 static const char expected
[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* こんにちは */
561 char *result
= str_iconveha (input
, "autodetect_jp", "UTF-8", false, handler
);
562 ASSERT (result
!= NULL
);
563 ASSERT (strcmp (result
, expected
) == 0);
566 for (h
= 0; h
< SIZEOF (handlers
); h
++)
568 enum iconv_ilseq_handler handler
= handlers
[h
];
569 static const char input
[] = "\202\261\202\361\202\311\202\277\202\315"; /* こんにちは in Shift_JIS */
570 static const char expected
[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* こんにちは */
571 char *result
= str_iconveha (input
, "autodetect_jp", "UTF-8", false, handler
);
572 ASSERT (result
!= NULL
);
573 ASSERT (strcmp (result
, expected
) == 0);
576 for (h
= 0; h
< SIZEOF (handlers
); h
++)
578 enum iconv_ilseq_handler handler
= handlers
[h
];
579 static const char input
[] = "\033$B$3$s$K$A$O\033(B"; /* こんにちは in ISO-2022-JP-2 */
580 static const char expected
[] = "\343\201\223\343\202\223\343\201\253\343\201\241\343\201\257"; /* こんにちは */
581 char *result
= str_iconveha (input
, "autodetect_jp", "UTF-8", false, handler
);
582 ASSERT (result
!= NULL
);
583 ASSERT (strcmp (result
, expected
) == 0);
589 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) && !defined __UCLIBC__) || (_LIBICONV_VERSION >= 0x0105 && !(_LIBICONV_VERSION == 0x10b && defined __APPLE__))
590 /* Test conversion from UTF-8 to ISO-8859-1 with transliteration. */
591 for (h
= 0; h
< SIZEOF (handlers
); h
++)
593 enum iconv_ilseq_handler handler
= handlers
[h
];
594 static const char input
[] = "Costs: 27 \342\202\254"; /* EURO SIGN */
595 static const char expected
[] = "Costs: 27 EUR";
596 char *result
= str_iconveha (input
, "UTF-8", "ISO-8859-1", true, handler
);
597 ASSERT (result
!= NULL
);
598 ASSERT (strcmp (result
, expected
) == 0);
605 return test_exit_status
;