1 /* utf8conf.c - UTF8 character set conversion
2 * Copyright (C) 1994, 1998, 1999, 2000, 2001,
3 * 2003, 2006, 2008 Free Software Foundation, Inc.
5 * This file is part of JNLIB.
7 * JNLIB is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation; either version 3 of
10 * the License, or (at your option) any later version.
12 * JNLIB is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
26 #ifdef HAVE_LANGINFO_CODESET
30 #ifndef HAVE_W32_SYSTEM
34 #include "libjnlib-config.h"
35 #include "stringhelp.h"
43 static const char *active_charset_name
= "iso-8859-1";
44 static int no_translation
; /* Set to true if we let simply pass through. */
45 static int use_iconv
; /* iconv conversion functions required. */
48 /* Under W32 we dlopen the iconv dll and don't require any iconv
49 related headers at all. However we need to define some stuff. */
50 #ifdef HAVE_W32_SYSTEM
51 typedef void *iconv_t
;
53 #define ICONV_CONST const
55 static iconv_t (* __stdcall iconv_open
) (const char *tocode
,
56 const char *fromcode
);
57 static size_t (* __stdcall iconv
) (iconv_t cd
,
58 const char **inbuf
, size_t *inbytesleft
,
59 char **outbuf
, size_t *outbytesleft
);
60 static int (* __stdcall iconv_close
) (iconv_t cd
);
71 done
= 1; /* Do it right now because we might get called recursively
74 handle
= dlopen ("iconv.dll", RTLD_LAZY
);
77 iconv_open
= dlsym (handle
, "libiconv_open");
79 iconv
= dlsym (handle
, "libiconv");
81 iconv_close
= dlsym (handle
, "libiconv_close");
83 if (!handle
|| !iconv_close
)
85 log_info (_("error loading `%s': %s\n"),
86 "iconv.dll", dlerror ());
87 log_info (_("please see %s for more information\n"),
88 "http://www.gnupg.org/download/iconv.html");
96 return iconv_open
? 0: -1;
98 #endif /*HAVE_W32_SYSTEM*/
101 /* Error handler for iconv failures. This is needed to not clutter the
102 output with repeated diagnostics about a missing conversion. */
104 handle_iconv_error (const char *to
, const char *from
, int use_fallback
)
108 static int shown1
, shown2
;
111 if (to
&& !strcmp (to
, "utf-8"))
123 log_info (_("conversion from `%s' to `%s' not available\n"),
131 log_info (_("iconv_open failed: %s\n"), strerror (errno
));
137 /* To avoid further error messages we fallback to Latin-1 for the
138 native encoding. This is justified as one can expect that on a
139 utf-8 enabled system nl_langinfo() will work and thus we will
140 never get here. Thus Latin-1 seems to be a reasonable
142 active_charset_name
= "iso-8859-1";
151 set_native_charset (const char *newset
)
153 const char *full_newset
;
157 #ifdef HAVE_W32_SYSTEM
158 static char codepage
[30];
162 /* We are a console program thus we need to use the
163 GetConsoleOutputCP function and not the GetACP which
164 would give the codepage for a GUI program. Note this is not
165 a bulletproof detection because GetConsoleCP might return a
166 different one for console input. Not sure how to cope with
167 that. If the console Code page is not known we fall back to
168 the system code page. */
169 cpno
= GetConsoleOutputCP ();
172 sprintf (codepage
, "CP%u", cpno
);
173 /* Resolve alias. We use a long string and not the usual
174 array to optimize if the code is taken to a DSO. Taken from
177 for (aliases
= ("CP936" "\0" "GBK" "\0"
178 "CP1361" "\0" "JOHAB" "\0"
179 "CP20127" "\0" "ASCII" "\0"
180 "CP20866" "\0" "KOI8-R" "\0"
181 "CP21866" "\0" "KOI8-RU" "\0"
182 "CP28591" "\0" "ISO-8859-1" "\0"
183 "CP28592" "\0" "ISO-8859-2" "\0"
184 "CP28593" "\0" "ISO-8859-3" "\0"
185 "CP28594" "\0" "ISO-8859-4" "\0"
186 "CP28595" "\0" "ISO-8859-5" "\0"
187 "CP28596" "\0" "ISO-8859-6" "\0"
188 "CP28597" "\0" "ISO-8859-7" "\0"
189 "CP28598" "\0" "ISO-8859-8" "\0"
190 "CP28599" "\0" "ISO-8859-9" "\0"
191 "CP28605" "\0" "ISO-8859-15" "\0"
192 "CP65001" "\0" "UTF-8" "\0");
194 aliases
+= strlen (aliases
) + 1, aliases
+= strlen (aliases
) + 1)
196 if (!strcmp (codepage
, aliases
) ||(*aliases
== '*' && !aliases
[1]))
198 newset
= aliases
+ strlen (aliases
) + 1;
203 #else /*!HAVE_W32_SYSTEM*/
205 #ifdef HAVE_LANGINFO_CODESET
206 newset
= nl_langinfo (CODESET
);
207 #else /*!HAVE_LANGINFO_CODESET*/
208 /* Try to get the used charset from environment variables. */
209 static char codepage
[30];
210 const char *lc
, *dot
, *mod
;
212 strcpy (codepage
, "iso-8859-1");
213 lc
= getenv ("LC_ALL");
216 lc
= getenv ("LC_CTYPE");
218 lc
= getenv ("LANG");
222 dot
= strchr (lc
, '.');
225 mod
= strchr (++dot
, '@');
227 mod
= dot
+ strlen (dot
);
228 if (mod
- dot
< sizeof codepage
&& dot
!= mod
)
230 memcpy (codepage
, dot
, mod
- dot
);
231 codepage
[mod
- dot
] = 0;
236 #endif /*!HAVE_LANGINFO_CODESET*/
237 #endif /*!HAVE_W32_SYSTEM*/
240 full_newset
= newset
;
241 if (strlen (newset
) > 3 && !ascii_memcasecmp (newset
, "iso", 3))
244 if (*newset
== '-' || *newset
== '_')
248 /* Note that we silently assume that plain ASCII is actually meant
249 as Latin-1. This makes sense because many Unix systems don't have
250 their locale set up properly and thus would get annoying error
251 messages and we have to handle all the "bug" reports. Latin-1 has
252 always been the character set used for 8 bit characters on Unix
255 || !ascii_strcasecmp (newset
, "8859-1" )
256 || !ascii_strcasecmp (newset
, "646" )
257 || !ascii_strcasecmp (newset
, "ASCII" )
258 || !ascii_strcasecmp (newset
, "ANSI_X3.4-1968" )
261 active_charset_name
= "iso-8859-1";
265 else if ( !ascii_strcasecmp (newset
, "utf8" )
266 || !ascii_strcasecmp(newset
, "utf-8") )
268 active_charset_name
= "utf-8";
276 #ifdef HAVE_W32_SYSTEM
277 if (load_libiconv ())
279 #endif /*HAVE_W32_SYSTEM*/
281 cd
= iconv_open (full_newset
, "utf-8");
282 if (cd
== (iconv_t
)-1)
284 handle_iconv_error (full_newset
, "utf-8", 0);
288 cd
= iconv_open ("utf-8", full_newset
);
289 if (cd
== (iconv_t
)-1)
291 handle_iconv_error ("utf-8", full_newset
, 0);
295 active_charset_name
= full_newset
;
303 get_native_charset ()
305 return active_charset_name
;
308 /* Return true if the native charset is utf-8. */
310 is_native_utf8 (void)
312 return no_translation
;
316 /* Convert string, which is in native encoding, to UTF-8 and return a
317 newly allocated UTF-8 string. This function terminates the process
318 on memory shortage. */
320 native_to_utf8 (const char *orig_string
)
322 const unsigned char *string
= (const unsigned char *)orig_string
;
323 const unsigned char *s
;
330 /* Already utf-8 encoded. */
331 buffer
= jnlib_xstrdup (orig_string
);
335 /* For Latin-1 we can avoid the iconv overhead. */
336 for (s
= string
; *s
; s
++)
342 buffer
= jnlib_xmalloc (length
+ 1);
343 for (p
= (unsigned char *)buffer
, s
= string
; *s
; s
++)
347 *p
++ = 0xc0 | ((*s
>> 6) & 3);
348 *p
++ = 0x80 | (*s
& 0x3f);
357 /* Need to use iconv. */
361 size_t inbytes
, outbytes
;
363 cd
= iconv_open ("utf-8", active_charset_name
);
364 if (cd
== (iconv_t
)-1)
366 handle_iconv_error ("utf-8", active_charset_name
, 1);
367 return native_to_utf8 (string
);
370 for (s
=string
; *s
; s
++ )
374 length
+= 5; /* We may need up to 6 bytes for the utf8 output. */
376 buffer
= jnlib_xmalloc (length
+ 1);
379 inbytes
= strlen (string
);
382 if ( iconv (cd
, (ICONV_CONST
char **)&inptr
, &inbytes
,
383 &outptr
, &outbytes
) == (size_t)-1)
388 log_info (_("conversion from `%s' to `%s' failed: %s\n"),
389 active_charset_name
, "utf-8", strerror (errno
));
391 /* We don't do any conversion at all but use the strings as is. */
392 strcpy (buffer
, string
);
397 /* We could realloc the buffer now but I doubt that it makes
398 much sense given that it will get freed anyway soon
409 do_utf8_to_native (const char *string
, size_t length
, int delim
,
414 unsigned char encbuf
[8];
416 const unsigned char *s
;
420 unsigned long val
= 0;
424 /* First pass (p==NULL): count the extended utf-8 characters. */
425 /* Second pass (p!=NULL): create string. */
428 for (slen
= length
, nleft
= encidx
= 0, n
= 0,
429 s
= (const unsigned char *)string
;
435 if (!(*s
< 128 || (*s
>= 0xc0 && *s
<= 0xfd)))
440 sprintf (p
, "\\x%02x", *s
);
454 && (*s
< 0x20 || *s
== 0x7f || *s
== delim
455 || (delim
&& *s
== '\\')))
462 case '\n': n
++; if ( p
) *p
++ = 'n'; break;
463 case '\r': n
++; if ( p
) *p
++ = 'r'; break;
464 case '\f': n
++; if ( p
) *p
++ = 'f'; break;
465 case '\v': n
++; if ( p
) *p
++ = 'v'; break;
466 case '\b': n
++; if ( p
) *p
++ = 'b'; break;
467 case 0: n
++; if ( p
) *p
++ = '0'; break;
472 sprintf (p
, "x%02x", *s
);
485 else if ((*s
& 0xe0) == 0xc0) /* 110x xxxx */
490 encbuf
[encidx
++] = *s
;
492 else if ((*s
& 0xf0) == 0xe0) /* 1110 xxxx */
497 encbuf
[encidx
++] = *s
;
499 else if ((*s
& 0xf8) == 0xf0) /* 1111 0xxx */
504 encbuf
[encidx
++] = *s
;
506 else if ((*s
& 0xfc) == 0xf8) /* 1111 10xx */
511 encbuf
[encidx
++] = *s
;
513 else if ((*s
& 0xfe) == 0xfc) /* 1111 110x */
518 encbuf
[encidx
++] = *s
;
520 else /* Invalid encoding: print as \xNN. */
524 sprintf (p
, "\\x%02x", *s
);
531 else if (*s
< 0x80 || *s
>= 0xc0) /* Invalid utf-8 */
535 for (i
= 0; i
< encidx
; i
++)
537 sprintf (p
, "\\x%02x", encbuf
[i
]);
540 sprintf (p
, "\\x%02x", *s
);
550 encbuf
[encidx
++] = *s
;
553 if (!--nleft
) /* Ready. */
559 for (i
= 0; i
< encidx
; i
++)
567 /* Our strategy for using iconv is a bit strange
568 but it better keeps compatibility with
569 previous versions in regard to how invalid
570 encodings are displayed. What we do is to
571 keep the utf-8 as is and have the real
572 translation step then at the end. Yes, I
573 know that this is ugly. However we are short
574 of the 1.4 release and for this branch we
575 should not mess too much around with iconv
576 things. One reason for this is that we don't
577 know enough about non-GNU iconv
578 implementations and want to minimize the risk
579 of breaking the code on too many platforms. */
582 for (i
=0; i
< encidx
; i
++ )
588 else /* Latin-1 case. */
590 if (val
>= 0x80 && val
< 256)
592 /* We can simply print this character */
599 /* We do not have a translation: print utf8. */
602 for (i
= 0; i
< encidx
; i
++)
604 sprintf (p
, "\\x%02x", encbuf
[i
]);
618 /* Allocate the buffer after the first pass. */
619 buffer
= p
= jnlib_xmalloc (n
+ 1);
623 /* Note: See above for comments. */
626 char *outbuf
, *outptr
;
627 size_t inbytes
, outbytes
;
629 *p
= 0; /* Terminate the buffer. */
631 cd
= iconv_open (active_charset_name
, "utf-8");
632 if (cd
== (iconv_t
)-1)
634 handle_iconv_error (active_charset_name
, "utf-8", 1);
636 return utf8_to_native (string
, length
, delim
);
639 /* Allocate a new buffer large enough to hold all possible
644 outbytes
= n
* MB_LEN_MAX
;
645 if (outbytes
/ MB_LEN_MAX
!= n
)
646 BUG (); /* Actually an overflow. */
647 outbuf
= outptr
= jnlib_xmalloc (outbytes
);
648 if ( iconv (cd
, (ICONV_CONST
char **)&inptr
, &inbytes
,
649 &outptr
, &outbytes
) == (size_t)-1)
654 log_info (_("conversion from `%s' to `%s' failed: %s\n"),
655 "utf-8", active_charset_name
, strerror (errno
));
657 /* Didn't work out. Try again but without iconv. */
661 outbuf
= do_utf8_to_native (string
, length
, delim
, 0);
665 *outptr
= 0; /* Make sure it is a string. */
666 /* We could realloc the buffer now but I doubt that it
667 makes much sense given that it will get freed
668 anyway soon after. */
674 else /* Not using iconv. */
676 *p
= 0; /* Make sure it is a string. */
682 /* Convert string, which is in UTF-8 to native encoding. Replace
683 illegal encodings by some "\xnn" and quote all control
684 characters. A character with value DELIM will always be quoted, it
685 must be a vanilla ASCII character. A DELIM value of -1 is special:
686 it disables all quoting of control characters. This function
687 terminates the process on memory shortage. */
689 utf8_to_native (const char *string
, size_t length
, int delim
)
691 return do_utf8_to_native (string
, length
, delim
, use_iconv
);
697 /* Wrapper function for iconv_open, required for W32 as we dlopen that
698 library on that system. */
700 jnlib_iconv_open (const char *tocode
, const char *fromcode
)
702 #ifdef HAVE_W32_SYSTEM
703 if (load_libiconv ())
704 return (jnlib_iconv_t
)(-1);
705 #endif /*HAVE_W32_SYSTEM*/
707 return (jnlib_iconv_t
)iconv_open (tocode
, fromcode
);
711 /* Wrapper function for iconv, required for W32 as we dlopen that
712 library on that system. */
714 jnlib_iconv (jnlib_iconv_t cd
,
715 const char **inbuf
, size_t *inbytesleft
,
716 char **outbuf
, size_t *outbytesleft
)
719 #ifdef HAVE_W32_SYSTEM
720 if (load_libiconv ())
722 #endif /*HAVE_W32_SYSTEM*/
724 return iconv ((iconv_t
)cd
, (char**)inbuf
, inbytesleft
, outbuf
, outbytesleft
);
727 /* Wrapper function for iconv_close, required for W32 as we dlopen that
728 library on that system. */
730 jnlib_iconv_close (jnlib_iconv_t cd
)
732 #ifdef HAVE_W32_SYSTEM
733 if (load_libiconv ())
735 #endif /*HAVE_W32_SYSTEM*/
737 return iconv_close ((iconv_t
)cd
);