vfs: check userland buffers before reading them.
[haiku.git] / src / libs / iconv / iconv.c
blob9d522051a8c593bf299667fa403856c13993b372
/*
 * Copyright (C) 1999-2007 Free Software Foundation, Inc.
 * This file is part of the GNU LIBICONV Library.
 *
 * The GNU LIBICONV Library is free software; you can redistribute it
 * and/or modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * The GNU LIBICONV Library is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
 * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
 * Fifth Floor, Boston, MA 02110-1301, USA.
 */
21 #include "iconv.h"
23 #include <stdlib.h>
24 #include <string.h>
25 #include "config.h"
26 #include "localcharset.h"
/*
 * Select which groups of system dependent encodings are compiled in.
 * Each USE_* macro defined here enables the matching encodings_*.def
 * and canonical_*.h includes further down in this file.
 */
#if ENABLE_EXTRA
/*
 * Consider all system dependent encodings, for any system,
 * and the extra encodings.
 */
#define USE_AIX
#define USE_OSF1
#define USE_DOS
#define USE_EXTRA
#else
/*
 * Consider those system dependent encodings that are needed for the
 * current system.
 */
#ifdef _AIX
#define USE_AIX
#endif
#if defined(__osf__) || defined(VMS)
#define USE_OSF1
#endif
#if defined(__DJGPP__) || (defined(_WIN32) && (defined(_MSC_VER) || defined(__MINGW32__)))
#define USE_DOS
#endif
#endif
54 * Data type for general conversion loop.
56 struct loop_funcs {
57 size_t (*loop_convert) (iconv_t icd,
58 const char* * inbuf, size_t *inbytesleft,
59 char* * outbuf, size_t *outbytesleft);
60 size_t (*loop_reset) (iconv_t icd,
61 char* * outbuf, size_t *outbytesleft);
65 * Converters.
67 #include "converters.h"
70 * Transliteration tables.
72 #include "cjk_variants.h"
73 #include "translit.h"
76 * Table of all supported encodings.
78 struct encoding {
79 struct mbtowc_funcs ifuncs; /* conversion multibyte -> unicode */
80 struct wctomb_funcs ofuncs; /* conversion unicode -> multibyte */
81 int oflags; /* flags for unicode -> multibyte conversion */
83 enum {
84 #define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
85 ei_##xxx ,
86 #include "encodings.def"
87 #ifdef USE_AIX
88 #include "encodings_aix.def"
89 #endif
90 #ifdef USE_OSF1
91 #include "encodings_osf1.def"
92 #endif
93 #ifdef USE_DOS
94 #include "encodings_dos.def"
95 #endif
96 #ifdef USE_EXTRA
97 #include "encodings_extra.def"
98 #endif
99 #include "encodings_local.def"
100 #undef DEFENCODING
101 ei_for_broken_compilers_that_dont_like_trailing_commas
103 #include "flags.h"
104 static struct encoding const all_encodings[] = {
105 #define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
106 { xxx_ifuncs1,xxx_ifuncs2, xxx_ofuncs1,xxx_ofuncs2, ei_##xxx##_oflags },
107 #include "encodings.def"
108 #ifdef USE_AIX
109 #include "encodings_aix.def"
110 #endif
111 #ifdef USE_OSF1
112 #include "encodings_osf1.def"
113 #endif
114 #ifdef USE_DOS
115 #include "encodings_dos.def"
116 #endif
117 #ifdef USE_EXTRA
118 #include "encodings_extra.def"
119 #endif
120 #undef DEFENCODING
121 #define DEFENCODING(xxx_names,xxx,xxx_ifuncs1,xxx_ifuncs2,xxx_ofuncs1,xxx_ofuncs2) \
122 { xxx_ifuncs1,xxx_ifuncs2, xxx_ofuncs1,xxx_ofuncs2, 0 },
123 #include "encodings_local.def"
124 #undef DEFENCODING
128 * Conversion loops.
130 #include "loops.h"
/*
 * Alias lookup function.
 * Defines
 *   struct alias { int name; unsigned int encoding_index; };
 *   const struct alias * aliases_lookup (const char *str, unsigned int len);
 *   #define MAX_WORD_LENGTH ...
 */
139 #include "aliases.h"
142 * System dependent alias lookup function.
143 * Defines
144 * const struct alias * aliases2_lookup (const char *str);
146 #if defined(USE_AIX) || defined(USE_OSF1) || defined(USE_DOS) || defined(USE_EXTRA) /* || ... */
147 struct stringpool2_t {
148 #define S(tag,name,encoding_index) char stringpool_##tag[sizeof(name)];
149 #include "aliases2.h"
150 #undef S
152 static const struct stringpool2_t stringpool2_contents = {
153 #define S(tag,name,encoding_index) name,
154 #include "aliases2.h"
155 #undef S
157 #define stringpool2 ((const char *) &stringpool2_contents)
158 static const struct alias sysdep_aliases[] = {
159 #define S(tag,name,encoding_index) { (int)(long)&((struct stringpool2_t *)0)->stringpool_##tag, encoding_index },
160 #include "aliases2.h"
161 #undef S
163 #if defined(__GNUC__) && !defined(DEBUG)
164 __inline
165 #endif
166 const struct alias *
167 aliases2_lookup (register const char *str)
169 const struct alias * ptr;
170 unsigned int count;
171 for (ptr = sysdep_aliases, count = sizeof(sysdep_aliases)/sizeof(sysdep_aliases[0]); count > 0; ptr++, count--)
172 if (!strcmp(str, stringpool2 + ptr->name))
173 return ptr;
174 return NULL;
176 #else
177 #define aliases2_lookup(str) NULL
178 #define stringpool2 NULL
179 #endif
#if 0
/* Like !strcasecmp, except that the both strings can be assumed to be ASCII
   and the first string can be assumed to be in uppercase.
   Returns 1 if the strings are equal under that rule, 0 otherwise.
   (Disabled: kept for reference only.) */
static int strequal (const char* str1, const char* str2)
{
  unsigned char c1;
  unsigned char c2;
  for (;;) {
    c1 = * (unsigned char *) str1++;
    c2 = * (unsigned char *) str2++;
    if (c1 == 0)
      break;
    /* Only the second string needs folding to upper case. */
    if (c2 >= 'a' && c2 <= 'z')
      c2 -= 'a'-'A';
    if (c1 != c2)
      break;
  }
  return (c1 == c2);
}
#endif
202 iconv_t iconv_open (const char* tocode, const char* fromcode)
204 struct conv_struct * cd;
205 char buf[MAX_WORD_LENGTH+10+1];
206 const char* cp;
207 char* bp;
208 const struct alias * ap;
209 unsigned int count;
210 unsigned int from_index;
211 int from_wchar;
212 unsigned int to_index;
213 int to_wchar;
214 int transliterate = 0;
215 int discard_ilseq = 0;
217 /* Before calling aliases_lookup, convert the input string to upper case,
218 * and check whether it's entirely ASCII (we call gperf with option "-7"
219 * to achieve a smaller table) and non-empty. If it's not entirely ASCII,
220 * or if it's too long, it is not a valid encoding name.
222 for (to_wchar = 0;;) {
223 /* Search tocode in the table. */
224 for (cp = tocode, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
225 unsigned char c = * (unsigned char *) cp;
226 if (c >= 0x80)
227 goto invalid;
228 if (c >= 'a' && c <= 'z')
229 c -= 'a'-'A';
230 *bp = c;
231 if (c == '\0')
232 break;
233 if (--count == 0)
234 goto invalid;
236 for (;;) {
237 if (bp-buf >= 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
238 bp -= 10;
239 *bp = '\0';
240 transliterate = 1;
241 continue;
243 if (bp-buf >= 8 && memcmp(bp-8,"//IGNORE",8)==0) {
244 bp -= 8;
245 *bp = '\0';
246 discard_ilseq = 1;
247 continue;
249 break;
251 if (buf[0] == '\0') {
252 tocode = locale_charset();
253 /* Avoid an endless loop that could occur when using an older version
254 of localcharset.c. */
255 if (tocode[0] == '\0')
256 goto invalid;
257 continue;
259 ap = aliases_lookup(buf,bp-buf);
260 if (ap == NULL) {
261 ap = aliases2_lookup(buf);
262 if (ap == NULL)
263 goto invalid;
265 if (ap->encoding_index == ei_local_char) {
266 tocode = locale_charset();
267 /* Avoid an endless loop that could occur when using an older version
268 of localcharset.c. */
269 if (tocode[0] == '\0')
270 goto invalid;
271 continue;
273 if (ap->encoding_index == ei_local_wchar_t) {
274 /* On systems which define __STDC_ISO_10646__, wchar_t is Unicode.
275 This is also the case on native Woe32 systems. */
276 #if __STDC_ISO_10646__ || ((defined _WIN32 || defined __WIN32__) && !defined __CYGWIN__)
277 if (sizeof(wchar_t) == 4) {
278 to_index = ei_ucs4internal;
279 break;
281 if (sizeof(wchar_t) == 2) {
282 to_index = ei_ucs2internal;
283 break;
285 if (sizeof(wchar_t) == 1) {
286 to_index = ei_iso8859_1;
287 break;
289 #endif
290 #if HAVE_MBRTOWC
291 to_wchar = 1;
292 tocode = locale_charset();
293 continue;
294 #endif
295 goto invalid;
297 to_index = ap->encoding_index;
298 break;
300 for (from_wchar = 0;;) {
301 /* Search fromcode in the table. */
302 for (cp = fromcode, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
303 unsigned char c = * (unsigned char *) cp;
304 if (c >= 0x80)
305 goto invalid;
306 if (c >= 'a' && c <= 'z')
307 c -= 'a'-'A';
308 *bp = c;
309 if (c == '\0')
310 break;
311 if (--count == 0)
312 goto invalid;
314 for (;;) {
315 if (bp-buf >= 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
316 bp -= 10;
317 *bp = '\0';
318 continue;
320 if (bp-buf >= 8 && memcmp(bp-8,"//IGNORE",8)==0) {
321 bp -= 8;
322 *bp = '\0';
323 continue;
325 break;
327 if (buf[0] == '\0') {
328 fromcode = locale_charset();
329 /* Avoid an endless loop that could occur when using an older version
330 of localcharset.c. */
331 if (fromcode[0] == '\0')
332 goto invalid;
333 continue;
335 ap = aliases_lookup(buf,bp-buf);
336 if (ap == NULL) {
337 ap = aliases2_lookup(buf);
338 if (ap == NULL)
339 goto invalid;
341 if (ap->encoding_index == ei_local_char) {
342 fromcode = locale_charset();
343 /* Avoid an endless loop that could occur when using an older version
344 of localcharset.c. */
345 if (fromcode[0] == '\0')
346 goto invalid;
347 continue;
349 if (ap->encoding_index == ei_local_wchar_t) {
350 /* On systems which define __STDC_ISO_10646__, wchar_t is Unicode.
351 This is also the case on native Woe32 systems. */
352 #if __STDC_ISO_10646__ || ((defined _WIN32 || defined __WIN32__) && !defined __CYGWIN__)
353 if (sizeof(wchar_t) == 4) {
354 from_index = ei_ucs4internal;
355 break;
357 if (sizeof(wchar_t) == 2) {
358 from_index = ei_ucs2internal;
359 break;
361 if (sizeof(wchar_t) == 1) {
362 from_index = ei_iso8859_1;
363 break;
365 #endif
366 #if HAVE_WCRTOMB
367 from_wchar = 1;
368 fromcode = locale_charset();
369 continue;
370 #endif
371 goto invalid;
373 from_index = ap->encoding_index;
374 break;
376 cd = (struct conv_struct *) malloc(from_wchar != to_wchar
377 ? sizeof(struct wchar_conv_struct)
378 : sizeof(struct conv_struct));
379 if (cd == NULL) {
380 errno = ENOMEM;
381 return (iconv_t)(-1);
383 cd->iindex = from_index;
384 cd->ifuncs = all_encodings[from_index].ifuncs;
385 cd->oindex = to_index;
386 cd->ofuncs = all_encodings[to_index].ofuncs;
387 cd->oflags = all_encodings[to_index].oflags;
388 /* Initialize the loop functions. */
389 #if HAVE_MBRTOWC
390 if (to_wchar) {
391 #if HAVE_WCRTOMB
392 if (from_wchar) {
393 cd->lfuncs.loop_convert = wchar_id_loop_convert;
394 cd->lfuncs.loop_reset = wchar_id_loop_reset;
395 } else
396 #endif
398 cd->lfuncs.loop_convert = wchar_to_loop_convert;
399 cd->lfuncs.loop_reset = wchar_to_loop_reset;
401 } else
402 #endif
404 #if HAVE_WCRTOMB
405 if (from_wchar) {
406 cd->lfuncs.loop_convert = wchar_from_loop_convert;
407 cd->lfuncs.loop_reset = wchar_from_loop_reset;
408 } else
409 #endif
411 cd->lfuncs.loop_convert = unicode_loop_convert;
412 cd->lfuncs.loop_reset = unicode_loop_reset;
415 /* Initialize the states. */
416 memset(&cd->istate,'\0',sizeof(state_t));
417 memset(&cd->ostate,'\0',sizeof(state_t));
418 /* Initialize the operation flags. */
419 cd->transliterate = transliterate;
420 cd->discard_ilseq = discard_ilseq;
421 #ifndef LIBICONV_PLUG
422 cd->fallbacks.mb_to_uc_fallback = NULL;
423 cd->fallbacks.uc_to_mb_fallback = NULL;
424 cd->fallbacks.mb_to_wc_fallback = NULL;
425 cd->fallbacks.wc_to_mb_fallback = NULL;
426 cd->fallbacks.data = NULL;
427 cd->hooks.uc_hook = NULL;
428 cd->hooks.wc_hook = NULL;
429 cd->hooks.data = NULL;
430 #endif
431 /* Initialize additional fields. */
432 if (from_wchar != to_wchar) {
433 struct wchar_conv_struct * wcd = (struct wchar_conv_struct *) cd;
434 memset(&wcd->state,'\0',sizeof(mbstate_t));
436 /* Done. */
437 return (iconv_t)cd;
438 invalid:
439 errno = EINVAL;
440 return (iconv_t)(-1);
443 size_t iconv (iconv_t icd,
444 ICONV_CONST char* * inbuf, size_t *inbytesleft,
445 char* * outbuf, size_t *outbytesleft)
447 conv_t cd = (conv_t) icd;
448 if (inbuf == NULL || *inbuf == NULL)
449 return cd->lfuncs.loop_reset(icd,outbuf,outbytesleft);
450 else
451 return cd->lfuncs.loop_convert(icd,
452 (const char* *)inbuf,inbytesleft,
453 outbuf,outbytesleft);
456 int iconv_close (iconv_t icd)
458 conv_t cd = (conv_t) icd;
459 free(cd);
460 return 0;
463 #ifndef LIBICONV_PLUG
465 int iconvctl (iconv_t icd, int request, void* argument)
467 conv_t cd = (conv_t) icd;
468 switch (request) {
469 case ICONV_TRIVIALP:
470 *(int *)argument =
471 ((cd->lfuncs.loop_convert == unicode_loop_convert
472 && cd->iindex == cd->oindex)
473 || cd->lfuncs.loop_convert == wchar_id_loop_convert
474 ? 1 : 0);
475 return 0;
476 case ICONV_GET_TRANSLITERATE:
477 *(int *)argument = cd->transliterate;
478 return 0;
479 case ICONV_SET_TRANSLITERATE:
480 cd->transliterate = (*(const int *)argument ? 1 : 0);
481 return 0;
482 case ICONV_GET_DISCARD_ILSEQ:
483 *(int *)argument = cd->discard_ilseq;
484 return 0;
485 case ICONV_SET_DISCARD_ILSEQ:
486 cd->discard_ilseq = (*(const int *)argument ? 1 : 0);
487 return 0;
488 case ICONV_SET_HOOKS:
489 if (argument != NULL) {
490 cd->hooks = *(const struct iconv_hooks *)argument;
491 } else {
492 cd->hooks.uc_hook = NULL;
493 cd->hooks.wc_hook = NULL;
494 cd->hooks.data = NULL;
496 return 0;
497 case ICONV_SET_FALLBACKS:
498 if (argument != NULL) {
499 cd->fallbacks = *(const struct iconv_fallbacks *)argument;
500 } else {
501 cd->fallbacks.mb_to_uc_fallback = NULL;
502 cd->fallbacks.uc_to_mb_fallback = NULL;
503 cd->fallbacks.mb_to_wc_fallback = NULL;
504 cd->fallbacks.wc_to_mb_fallback = NULL;
505 cd->fallbacks.data = NULL;
507 return 0;
508 default:
509 errno = EINVAL;
510 return -1;
/* An alias after its name has been converted from 'int' to 'const char*'. */
struct nalias { const char* name; unsigned int encoding_index; };

/*
 * qsort() comparison function for 'struct nalias' elements: orders them
 * by ascending encoding_index.
 * Returns a negative, zero or positive value.  The unsigned indices are
 * compared directly instead of being subtracted as ints, so the result
 * has the right sign for the whole unsigned range and cannot overflow.
 */
static int compare_by_index (const void * arg1, const void * arg2)
{
  const struct nalias * alias1 = (const struct nalias *) arg1;
  const struct nalias * alias2 = (const struct nalias *) arg2;
  return (alias1->encoding_index > alias2->encoding_index)
         - (alias1->encoding_index < alias2->encoding_index);
}
/*
 * qsort() comparison function for an array of 'const char *' names.
 * Names are ordered by strcmp(), except that any name starting with
 * "CS" sorts after every name that does not.
 * Returns 0 for equal names, otherwise a negative or positive value.
 */
static int compare_by_name (const void * arg1, const void * arg2)
{
  const char * name1 = *(const char **)arg1;
  const char * name2 = *(const char **)arg2;
  /* Compare alphabetically, but put "CS" names at the end. */
  int sign = strcmp(name1,name2);
  if (sign != 0) {
    /* The "CS" difference is weighted by 4 so that it always dominates
       the +/-1 contributed by the alphabetical order. */
    sign = ((name1[0]=='C' && name1[1]=='S') - (name2[0]=='C' && name2[1]=='S'))
           * 4 + (sign >= 0 ? 1 : -1);
  }
  return sign;
}
537 void iconvlist (int (*do_one) (unsigned int namescount,
538 const char * const * names,
539 void* data),
540 void* data)
542 #define aliascount1 sizeof(aliases)/sizeof(aliases[0])
543 #ifndef aliases2_lookup
544 #define aliascount2 sizeof(sysdep_aliases)/sizeof(sysdep_aliases[0])
545 #else
546 #define aliascount2 0
547 #endif
548 #define aliascount (aliascount1+aliascount2)
549 struct nalias aliasbuf[aliascount];
550 const char * namesbuf[aliascount];
551 size_t num_aliases;
553 /* Put all existing aliases into a buffer. */
554 size_t i;
555 size_t j;
556 j = 0;
557 for (i = 0; i < aliascount1; i++) {
558 const struct alias * p = &aliases[i];
559 if (p->name >= 0
560 && p->encoding_index != ei_local_char
561 && p->encoding_index != ei_local_wchar_t) {
562 aliasbuf[j].name = stringpool + p->name;
563 aliasbuf[j].encoding_index = p->encoding_index;
564 j++;
567 #ifndef aliases2_lookup
568 for (i = 0; i < aliascount2; i++) {
569 aliasbuf[j].name = stringpool2 + sysdep_aliases[i].name;
570 aliasbuf[j].encoding_index = sysdep_aliases[i].encoding_index;
571 j++;
573 #endif
574 num_aliases = j;
576 /* Sort by encoding_index. */
577 if (num_aliases > 1)
578 qsort(aliasbuf, num_aliases, sizeof(struct nalias), compare_by_index);
580 /* Process all aliases with the same encoding_index together. */
581 size_t j;
582 j = 0;
583 while (j < num_aliases) {
584 unsigned int ei = aliasbuf[j].encoding_index;
585 size_t i = 0;
587 namesbuf[i++] = aliasbuf[j++].name;
588 while (j < num_aliases && aliasbuf[j].encoding_index == ei);
589 if (i > 1)
590 qsort(namesbuf, i, sizeof(const char *), compare_by_name);
591 /* Call the callback. */
592 if (do_one(i,namesbuf,data))
593 break;
596 #undef aliascount
597 #undef aliascount2
598 #undef aliascount1
602 * Table of canonical names of encodings.
603 * Instead of strings, it contains offsets into stringpool and stringpool2.
605 static const unsigned short all_canonical[] = {
606 #include "canonical.h"
607 #ifdef USE_AIX
608 #include "canonical_aix.h"
609 #endif
610 #ifdef USE_OSF1
611 #include "canonical_osf1.h"
612 #endif
613 #ifdef USE_DOS
614 #include "canonical_dos.h"
615 #endif
616 #ifdef USE_EXTRA
617 #include "canonical_extra.h"
618 #endif
619 #include "canonical_local.h"
622 const char * iconv_canonicalize (const char * name)
624 const char* code;
625 char buf[MAX_WORD_LENGTH+10+1];
626 const char* cp;
627 char* bp;
628 const struct alias * ap;
629 unsigned int count;
630 unsigned int index;
631 const char* pool;
633 /* Before calling aliases_lookup, convert the input string to upper case,
634 * and check whether it's entirely ASCII (we call gperf with option "-7"
635 * to achieve a smaller table) and non-empty. If it's not entirely ASCII,
636 * or if it's too long, it is not a valid encoding name.
638 for (code = name;;) {
639 /* Search code in the table. */
640 for (cp = code, bp = buf, count = MAX_WORD_LENGTH+10+1; ; cp++, bp++) {
641 unsigned char c = * (unsigned char *) cp;
642 if (c >= 0x80)
643 goto invalid;
644 if (c >= 'a' && c <= 'z')
645 c -= 'a'-'A';
646 *bp = c;
647 if (c == '\0')
648 break;
649 if (--count == 0)
650 goto invalid;
652 for (;;) {
653 if (bp-buf >= 10 && memcmp(bp-10,"//TRANSLIT",10)==0) {
654 bp -= 10;
655 *bp = '\0';
656 continue;
658 if (bp-buf >= 8 && memcmp(bp-8,"//IGNORE",8)==0) {
659 bp -= 8;
660 *bp = '\0';
661 continue;
663 break;
665 if (buf[0] == '\0') {
666 code = locale_charset();
667 /* Avoid an endless loop that could occur when using an older version
668 of localcharset.c. */
669 if (code[0] == '\0')
670 goto invalid;
671 continue;
673 pool = stringpool;
674 ap = aliases_lookup(buf,bp-buf);
675 if (ap == NULL) {
676 pool = stringpool2;
677 ap = aliases2_lookup(buf);
678 if (ap == NULL)
679 goto invalid;
681 if (ap->encoding_index == ei_local_char) {
682 code = locale_charset();
683 /* Avoid an endless loop that could occur when using an older version
684 of localcharset.c. */
685 if (code[0] == '\0')
686 goto invalid;
687 continue;
689 if (ap->encoding_index == ei_local_wchar_t) {
690 /* On systems which define __STDC_ISO_10646__, wchar_t is Unicode.
691 This is also the case on native Woe32 systems. */
692 #if __STDC_ISO_10646__ || ((defined _WIN32 || defined __WIN32__) && !defined __CYGWIN__)
693 if (sizeof(wchar_t) == 4) {
694 index = ei_ucs4internal;
695 break;
697 if (sizeof(wchar_t) == 2) {
698 index = ei_ucs2internal;
699 break;
701 if (sizeof(wchar_t) == 1) {
702 index = ei_iso8859_1;
703 break;
705 #endif
707 index = ap->encoding_index;
708 break;
710 return all_canonical[index] + pool;
711 invalid:
712 return name;
715 int _libiconv_version = _LIBICONV_VERSION;
#if defined __FreeBSD__ && !defined __gnu_freebsd__
/* GNU libiconv is the native FreeBSD iconv implementation since 2002.
   It wants to define the symbols 'iconv_open', 'iconv', 'iconv_close'.  */
#define strong_alias(name, aliasname) _strong_alias(name, aliasname)
#define _strong_alias(name, aliasname) \
  extern __typeof (name) aliasname __attribute__ ((alias (#name)));
/* Drop any macro definitions of these names so that the plain names
   can be declared as aliases of the libiconv* functions. */
#undef iconv_open
#undef iconv
#undef iconv_close
strong_alias (libiconv_open, iconv_open)
strong_alias (libiconv, iconv)
strong_alias (libiconv_close, iconv_close)
#endif
731 #endif