1 /* utf8conf.c - UTF8 character set conversion
2 * Copyright (C) 1994, 1998, 1999, 2000, 2001,
3 * 2003, 2006, 2008 Free Software Foundation, Inc.
5 * This file is part of JNLIB.
7 * JNLIB is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation; either version 3 of
10 * the License, or (at your option) any later version.
12 * JNLIB is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
26 #ifdef HAVE_LANGINFO_CODESET
30 #ifndef HAVE_W32_SYSTEM
34 #include "libjnlib-config.h"
35 #include "stringhelp.h"
43 static const char *active_charset_name
= "iso-8859-1"; /* Name of the native charset in effect; initially Latin-1. Returned by get_native_charset(). */
44 static int no_translation
; /* Set to true if we simply pass data through untranslated. Returned by is_native_utf8(). */
45 static int use_iconv
; /* iconv conversion functions required. Passed by utf8_to_native() to do_utf8_to_native(). */
48 /* Under W32 we dlopen the iconv dll and don't require any iconv
49 related headers at all. However we need to define some stuff. */
50 #ifdef HAVE_W32_SYSTEM
51 typedef void *iconv_t
;
53 #define ICONV_CONST const
55 static iconv_t (* __stdcall iconv_open
) (const char *tocode
,
56 const char *fromcode
);
57 static size_t (* __stdcall iconv
) (iconv_t cd
,
58 const char **inbuf
, size_t *inbytesleft
,
59 char **outbuf
, size_t *outbytesleft
);
60 static int (* __stdcall iconv_close
) (iconv_t cd
);
71 done
= 1; /* Do it right now because we might get called recursively
74 handle
= dlopen ("iconv.dll", RTLD_LAZY
);
77 iconv_open
= dlsym (handle
, "libiconv_open");
79 iconv
= dlsym (handle
, "libiconv");
81 iconv_close
= dlsym (handle
, "libiconv_close");
83 if (!handle
|| !iconv_close
)
85 log_info (_("error loading `%s': %s\n"),
86 "iconv.dll", dlerror ());
87 log_info (_("please see %s for more information\n"),
88 "http://www.gnupg.org/download/iconv.html");
96 return iconv_open
? 0: -1;
98 #endif /*HAVE_W32_SYSTEM*/
101 /* Error handler for iconv failures. This is needed to not clutter the
102 output with repeated diagnostics about a missing conversion. */
104 handle_iconv_error (const char *to
, const char *from
, int use_fallback
)
108 static int shown1
, shown2
;
111 if (to
&& !strcmp (to
, "utf-8"))
123 log_info (_("conversion from `%s' to `%s' not available\n"),
131 log_info (_("iconv_open failed: %s\n"), strerror (errno
));
137 /* To avoid further error messages we fallback to Latin-1 for the
138 native encoding. This is justified as one can expect that on a
139 utf-8 enabled system nl_langinfo() will work and thus we won't
140 ever get here. Thus Latin-1 seems to be a reasonable
142 active_charset_name
= "iso-8859-1";
151 set_native_charset (const char *newset
)
153 const char *full_newset
;
157 #ifdef HAVE_W32_SYSTEM
158 static char codepage
[30];
162 /* We are a console program thus we need to use the
163 GetConsoleOutputCP function and not the GetACP which
164 would give the codepage for a GUI program. Note this is not
165 a bulletproof detection because GetConsoleCP might return a
166 different one for console input. Not sure how to cope with
167 that. If the console Code page is not known we fall back to
168 the system code page. */
169 cpno
= GetConsoleOutputCP ();
172 sprintf (codepage
, "CP%u", cpno
);
173 /* Resolve alias. We use a long string and not the usual
174 array to optimize if the code is taken to a DSO. Taken from
177 for (aliases
= ("CP936" "\0" "GBK" "\0"
178 "CP1361" "\0" "JOHAB" "\0"
179 "CP20127" "\0" "ASCII" "\0"
180 "CP20866" "\0" "KOI8-R" "\0"
181 "CP21866" "\0" "KOI8-RU" "\0"
182 "CP28591" "\0" "ISO-8859-1" "\0"
183 "CP28592" "\0" "ISO-8859-2" "\0"
184 "CP28593" "\0" "ISO-8859-3" "\0"
185 "CP28594" "\0" "ISO-8859-4" "\0"
186 "CP28595" "\0" "ISO-8859-5" "\0"
187 "CP28596" "\0" "ISO-8859-6" "\0"
188 "CP28597" "\0" "ISO-8859-7" "\0"
189 "CP28598" "\0" "ISO-8859-8" "\0"
190 "CP28599" "\0" "ISO-8859-9" "\0"
191 "CP28605" "\0" "ISO-8859-15" "\0"
192 "CP65001" "\0" "UTF-8" "\0");
194 aliases
+= strlen (aliases
) + 1, aliases
+= strlen (aliases
) + 1)
196 if (!strcmp (codepage
, aliases
) ||(*aliases
== '*' && !aliases
[1]))
198 newset
= aliases
+ strlen (aliases
) + 1;
203 #else /*!HAVE_W32_SYSTEM*/
205 #ifdef HAVE_LANGINFO_CODESET
206 newset
= nl_langinfo (CODESET
);
207 #else /*!HAVE_LANGINFO_CODESET*/
208 /* Try to get the used charset from environment variables. */
209 static char codepage
[30];
210 const char *lc
, *dot
, *mod
;
212 strcpy (codepage
, "iso-8859-1");
213 lc
= getenv ("LC_ALL");
216 lc
= getenv ("LC_CTYPE");
218 lc
= getenv ("LANG");
222 dot
= strchr (lc
, '.');
225 mod
= strchr (++dot
, '@');
227 mod
= dot
+ strlen (dot
);
228 if (mod
- dot
< sizeof codepage
&& dot
!= mod
)
230 memcpy (codepage
, dot
, mod
- dot
);
231 codepage
[mod
- dot
] = 0;
236 #endif /*!HAVE_LANGINFO_CODESET*/
237 #endif /*!HAVE_W32_SYSTEM*/
240 full_newset
= newset
;
241 if (strlen (newset
) > 3 && !ascii_memcasecmp (newset
, "iso", 3))
244 if (*newset
== '-' || *newset
== '_')
248 /* Note that we silently assume that plain ASCII is actually meant
249 as Latin-1. This makes sense because many Unix systems don't have
250 their locale set up properly and thus would get annoying error
251 messages and we have to handle all the "bug" reports. Latin-1 has
252 always been the character set used for 8 bit characters on Unix
255 || !ascii_strcasecmp (newset
, "8859-1" )
256 || !ascii_strcasecmp (newset
, "646" )
257 || !ascii_strcasecmp (newset
, "ASCII" )
258 || !ascii_strcasecmp (newset
, "ANSI_X3.4-1968" )
261 active_charset_name
= "iso-8859-1";
265 else if ( !ascii_strcasecmp (newset
, "utf8" )
266 || !ascii_strcasecmp(newset
, "utf-8") )
268 active_charset_name
= "utf-8";
276 #ifdef HAVE_W32_SYSTEM
277 if (load_libiconv ())
279 #endif /*HAVE_W32_SYSTEM*/
281 cd
= iconv_open (full_newset
, "utf-8");
282 if (cd
== (iconv_t
)-1)
284 handle_iconv_error (full_newset
, "utf-8", 0);
288 cd
= iconv_open ("utf-8", full_newset
);
289 if (cd
== (iconv_t
)-1)
291 handle_iconv_error ("utf-8", full_newset
, 0);
295 active_charset_name
= full_newset
;
303 get_native_charset ()
305 return active_charset_name
;
308 /* Return true if the native charset is utf-8. */
310 is_native_utf8 (void)
312 return no_translation
;
316 /* Convert string, which is in native encoding, to UTF-8 and return a
317 newly allocated UTF-8 string. */
319 native_to_utf8 (const char *orig_string
)
321 const unsigned char *string
= (const unsigned char *)orig_string
;
322 const unsigned char *s
;
329 /* Already utf-8 encoded. */
330 buffer
= jnlib_xstrdup (orig_string
);
334 /* For Latin-1 we can avoid the iconv overhead. */
335 for (s
= string
; *s
; s
++)
341 buffer
= jnlib_xmalloc (length
+ 1);
342 for (p
= (unsigned char *)buffer
, s
= string
; *s
; s
++)
346 *p
++ = 0xc0 | ((*s
>> 6) & 3);
347 *p
++ = 0x80 | (*s
& 0x3f);
356 /* Need to use iconv. */
360 size_t inbytes
, outbytes
;
362 cd
= iconv_open ("utf-8", active_charset_name
);
363 if (cd
== (iconv_t
)-1)
365 handle_iconv_error ("utf-8", active_charset_name
, 1);
366 return native_to_utf8 (string
);
369 for (s
=string
; *s
; s
++ )
373 length
+= 5; /* We may need up to 6 bytes for the utf8 output. */
375 buffer
= jnlib_xmalloc (length
+ 1);
378 inbytes
= strlen (string
);
381 if ( iconv (cd
, (ICONV_CONST
char **)&inptr
, &inbytes
,
382 &outptr
, &outbytes
) == (size_t)-1)
387 log_info (_("conversion from `%s' to `%s' failed: %s\n"),
388 active_charset_name
, "utf-8", strerror (errno
));
390 /* We don't do any conversion at all but use the strings as is. */
391 strcpy (buffer
, string
);
396 /* We could realloc the buffer now but I doubt that it makes
397 much sense given that it will get freed anyway soon
408 do_utf8_to_native (const char *string
, size_t length
, int delim
,
413 unsigned char encbuf
[8];
415 const unsigned char *s
;
419 unsigned long val
= 0;
423 /* First pass (p==NULL): count the extended utf-8 characters. */
424 /* Second pass (p!=NULL): create string. */
427 for (slen
= length
, nleft
= encidx
= 0, n
= 0,
428 s
= (const unsigned char *)string
;
434 if (!(*s
< 128 || (*s
>= 0xc0 && *s
<= 0xfd)))
439 sprintf (p
, "\\x%02x", *s
);
453 && (*s
< 0x20 || *s
== 0x7f || *s
== delim
454 || (delim
&& *s
== '\\')))
461 case '\n': n
++; if ( p
) *p
++ = 'n'; break;
462 case '\r': n
++; if ( p
) *p
++ = 'r'; break;
463 case '\f': n
++; if ( p
) *p
++ = 'f'; break;
464 case '\v': n
++; if ( p
) *p
++ = 'v'; break;
465 case '\b': n
++; if ( p
) *p
++ = 'b'; break;
466 case 0: n
++; if ( p
) *p
++ = '0'; break;
471 sprintf (p
, "x%02x", *s
);
484 else if ((*s
& 0xe0) == 0xc0) /* 110x xxxx */
489 encbuf
[encidx
++] = *s
;
491 else if ((*s
& 0xf0) == 0xe0) /* 1110 xxxx */
496 encbuf
[encidx
++] = *s
;
498 else if ((*s
& 0xf8) == 0xf0) /* 1111 0xxx */
503 encbuf
[encidx
++] = *s
;
505 else if ((*s
& 0xfc) == 0xf8) /* 1111 10xx */
510 encbuf
[encidx
++] = *s
;
512 else if ((*s
& 0xfe) == 0xfc) /* 1111 110x */
517 encbuf
[encidx
++] = *s
;
519 else /* Invalid encoding: print as \xNN. */
523 sprintf (p
, "\\x%02x", *s
);
530 else if (*s
< 0x80 || *s
>= 0xc0) /* Invalid utf-8 */
534 for (i
= 0; i
< encidx
; i
++)
536 sprintf (p
, "\\x%02x", encbuf
[i
]);
539 sprintf (p
, "\\x%02x", *s
);
549 encbuf
[encidx
++] = *s
;
552 if (!--nleft
) /* Ready. */
558 for (i
= 0; i
< encidx
; i
++)
566 /* Our strategy for using iconv is a bit strange
567 but it better keeps compatibility with
568 previous versions in regard to how invalid
569 encodings are displayed. What we do is to
570 keep the utf-8 as is and have the real
571 translation step then at the end. Yes, I
572 know that this is ugly. However we are short
573 of the 1.4 release and for this branch we
574 should not mess too much around with iconv
575 things. One reason for this is that we don't
576 know enough about non-GNU iconv
577 implementation and want to minimize the risk
578 of breaking the code on too many platforms. */
581 for (i
=0; i
< encidx
; i
++ )
587 else /* Latin-1 case. */
589 if (val
>= 0x80 && val
< 256)
591 /* We can simply print this character */
598 /* We do not have a translation: print utf8. */
601 for (i
= 0; i
< encidx
; i
++)
603 sprintf (p
, "\\x%02x", encbuf
[i
]);
617 /* Allocate the buffer after the first pass. */
618 buffer
= p
= jnlib_xmalloc (n
+ 1);
622 /* Note: See above for comments. */
625 char *outbuf
, *outptr
;
626 size_t inbytes
, outbytes
;
628 *p
= 0; /* Terminate the buffer. */
630 cd
= iconv_open (active_charset_name
, "utf-8");
631 if (cd
== (iconv_t
)-1)
633 handle_iconv_error (active_charset_name
, "utf-8", 1);
635 return utf8_to_native (string
, length
, delim
);
638 /* Allocate a new buffer large enough to hold all possible
643 outbytes
= n
* MB_LEN_MAX
;
644 if (outbytes
/ MB_LEN_MAX
!= n
)
645 BUG (); /* Actually an overflow. */
646 outbuf
= outptr
= jnlib_xmalloc (outbytes
);
647 if ( iconv (cd
, (ICONV_CONST
char **)&inptr
, &inbytes
,
648 &outptr
, &outbytes
) == (size_t)-1)
653 log_info (_("conversion from `%s' to `%s' failed: %s\n"),
654 "utf-8", active_charset_name
, strerror (errno
));
656 /* Didn't work out. Try again but without iconv. */
660 outbuf
= do_utf8_to_native (string
, length
, delim
, 0);
664 *outptr
= 0; /* Make sure it is a string. */
665 /* We could realloc the buffer now but I doubt that it
666 makes much sense given that it will get freed
667 anyway soon after. */
673 else /* Not using iconv. */
675 *p
= 0; /* Make sure it is a string. */
681 /* Convert string, which is in UTF-8 to native encoding. Replace
682 illegal encodings by some "\xnn" and quote all control
683 characters. A character with value DELIM will always be quoted, it
684 must be a vanilla ASCII character. A DELIM value of -1 is special:
685 it disables all quoting of control characters. */
687 utf8_to_native (const char *string
, size_t length
, int delim
)
689 return do_utf8_to_native (string
, length
, delim
, use_iconv
);
695 /* Wrapper function for iconv_open, required for W32 as we dlopen that
696 library on that system. */
698 jnlib_iconv_open (const char *tocode
, const char *fromcode
)
700 #ifdef HAVE_W32_SYSTEM
701 if (load_libiconv ())
702 return (jnlib_iconv_t
)(-1);
703 #endif /*HAVE_W32_SYSTEM*/
705 return (jnlib_iconv_t
)iconv_open (tocode
, fromcode
);
709 /* Wrapper function for iconv, required for W32 as we dlopen that
710 library on that system. */
712 jnlib_iconv (jnlib_iconv_t cd
,
713 const char **inbuf
, size_t *inbytesleft
,
714 char **outbuf
, size_t *outbytesleft
)
717 #ifdef HAVE_W32_SYSTEM
718 if (load_libiconv ())
720 #endif /*HAVE_W32_SYSTEM*/
722 return iconv ((iconv_t
)cd
, (char**)inbuf
, inbytesleft
, outbuf
, outbytesleft
);
725 /* Wrapper function for iconv_close, required for W32 as we dlopen that
726 library on that system. */
728 jnlib_iconv_close (jnlib_iconv_t cd
)
730 #ifdef HAVE_W32_SYSTEM
731 if (load_libiconv ())
733 #endif /*HAVE_W32_SYSTEM*/
735 return iconv_close ((iconv_t
)cd
);