Enhanced last patch.
[gnupg.git] / jnlib / utf8conv.c
blob5223d473bc9f361665fa70cbcefd3a656d9fc2c5
/* utf8conv.c -  UTF8 character set conversion
 * Copyright (C) 1994, 1998, 1999, 2000, 2001,
 *               2003, 2006, 2008  Free Software Foundation, Inc.
 *
 * This file is part of JNLIB.
 *
 * JNLIB is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 3 of
 * the License, or (at your option) any later version.
 *
 * JNLIB is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
21 #include <config.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <stdarg.h>
25 #include <ctype.h>
26 #ifdef HAVE_LANGINFO_CODESET
27 #include <langinfo.h>
28 #endif
29 #include <errno.h>
30 #ifndef HAVE_W32_SYSTEM
31 # include <iconv.h>
32 #endif
34 #include "libjnlib-config.h"
35 #include "stringhelp.h"
36 #include "dynload.h"
37 #include "utf8conv.h"
/* Fallback for systems where MB_LEN_MAX has not been defined by the
   headers included so far.  The value is used as the per-byte growth
   factor when sizing the output buffer for iconv.  */
#ifndef MB_LEN_MAX
#define MB_LEN_MAX 16
#endif

/* Name of the currently active native character set.  Latin-1 is the
   default until set_native_charset selects something else.  */
static const char *active_charset_name = "iso-8859-1";

/* True if the native charset is utf-8, so that strings are simply
   passed through without any translation.  */
static int no_translation;

/* True if the iconv conversion functions are required for the
   active native charset.  */
static int use_iconv;
/* Under W32 we dlopen the iconv DLL and don't require any iconv
   related headers at all.  However, we need to declare the type, the
   ICONV_CONST qualifier and the three entry points ourselves.  */
#ifdef HAVE_W32_SYSTEM
typedef void *iconv_t;
#ifndef ICONV_CONST
#define ICONV_CONST const
#endif
static iconv_t (* __stdcall iconv_open) (const char *tocode,
                                         const char *fromcode);
static size_t  (* __stdcall iconv) (iconv_t cd,
                                    const char **inbuf, size_t *inbytesleft,
                                    char **outbuf, size_t *outbytesleft);
static int     (* __stdcall iconv_close) (iconv_t cd);

/* Load "iconv.dll" and resolve the three libiconv entry points into
   the function pointers declared above.  The load is attempted only
   once; later calls just report the outcome of that first attempt.
   Returns 0 if the functions are available and -1 if not.  */
static int
load_libiconv (void)
{
  static int done;
  void *handle;

  if (done)
    return iconv_open ? 0 : -1;

  /* Set the flag right now because we might get called recursively
     through gettext.  */
  done = 1;

  handle = dlopen ("iconv.dll", RTLD_LAZY);
  if (handle)
    {
      /* Each symbol is only looked up if the previous one was found,
         thus a non-NULL iconv_close implies that all three are set.  */
      iconv_open = dlsym (handle, "libiconv_open");
      if (iconv_open)
        iconv = dlsym (handle, "libiconv");
      if (iconv)
        iconv_close = dlsym (handle, "libiconv_close");
    }

  if (!handle || !iconv_close)
    {
      log_info (_("error loading `%s': %s\n"),
                "iconv.dll", dlerror ());
      log_info (_("please see %s for more information\n"),
                "http://www.gnupg.org/download/iconv.html");
      /* Do not leave a partially resolved set of pointers behind.  */
      iconv_open = NULL;
      iconv = NULL;
      iconv_close = NULL;
      if (handle)
        dlclose (handle);
    }

  return iconv_open ? 0 : -1;
}
#endif /*HAVE_W32_SYSTEM*/
101 /* Error handler for iconv failures. This is needed to not clutter the
102 output with repeated diagnostics about a missing conversion. */
103 static void
104 handle_iconv_error (const char *to, const char *from, int use_fallback)
106 if (errno == EINVAL)
108 static int shown1, shown2;
109 int x;
111 if (to && !strcmp (to, "utf-8"))
113 x = shown1;
114 shown1 = 1;
116 else
118 x = shown2;
119 shown2 = 1;
122 if (!x)
123 log_info (_("conversion from `%s' to `%s' not available\n"),
124 from, to);
126 else
128 static int shown;
130 if (!shown)
131 log_info (_("iconv_open failed: %s\n"), strerror (errno));
132 shown = 1;
135 if (use_fallback)
137 /* To avoid further error messages we fallback to Latin-1 for the
138 native encoding. This is justified as one can expect that on a
139 utf-8 enabled system nl_langinfo() will work and thus we won't
140 never get to here. Thus Latin-1 seems to be a reasonable
141 default. */
142 active_charset_name = "iso-8859-1";
143 no_translation = 0;
144 use_iconv = 0;
151 set_native_charset (const char *newset)
153 const char *full_newset;
155 if (!newset)
157 #ifdef HAVE_W32_SYSTEM
158 static char codepage[30];
159 unsigned int cpno;
160 const char *aliases;
162 /* We are a console program thus we need to use the
163 GetConsoleOutputCP function and not the the GetACP which
164 would give the codepage for a GUI program. Note this is not
165 a bulletproof detection because GetConsoleCP might return a
166 different one for console input. Not sure how to cope with
167 that. If the console Code page is not known we fall back to
168 the system code page. */
169 cpno = GetConsoleOutputCP ();
170 if (!cpno)
171 cpno = GetACP ();
172 sprintf (codepage, "CP%u", cpno );
173 /* Resolve alias. We use a long string string and not the usual
174 array to optimize if the code is taken to a DSO. Taken from
175 libiconv 1.9.2. */
176 newset = codepage;
177 for (aliases = ("CP936" "\0" "GBK" "\0"
178 "CP1361" "\0" "JOHAB" "\0"
179 "CP20127" "\0" "ASCII" "\0"
180 "CP20866" "\0" "KOI8-R" "\0"
181 "CP21866" "\0" "KOI8-RU" "\0"
182 "CP28591" "\0" "ISO-8859-1" "\0"
183 "CP28592" "\0" "ISO-8859-2" "\0"
184 "CP28593" "\0" "ISO-8859-3" "\0"
185 "CP28594" "\0" "ISO-8859-4" "\0"
186 "CP28595" "\0" "ISO-8859-5" "\0"
187 "CP28596" "\0" "ISO-8859-6" "\0"
188 "CP28597" "\0" "ISO-8859-7" "\0"
189 "CP28598" "\0" "ISO-8859-8" "\0"
190 "CP28599" "\0" "ISO-8859-9" "\0"
191 "CP28605" "\0" "ISO-8859-15" "\0"
192 "CP65001" "\0" "UTF-8" "\0");
193 *aliases;
194 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
196 if (!strcmp (codepage, aliases) ||(*aliases == '*' && !aliases[1]))
198 newset = aliases + strlen (aliases) + 1;
199 break;
203 #else /*!HAVE_W32_SYSTEM*/
205 #ifdef HAVE_LANGINFO_CODESET
206 newset = nl_langinfo (CODESET);
207 #else /*!HAVE_LANGINFO_CODESET*/
208 /* Try to get the used charset from environment variables. */
209 static char codepage[30];
210 const char *lc, *dot, *mod;
212 strcpy (codepage, "iso-8859-1");
213 lc = getenv ("LC_ALL");
214 if (!lc || !*lc)
216 lc = getenv ("LC_CTYPE");
217 if (!lc || !*lc)
218 lc = getenv ("LANG");
220 if (lc && *lc)
222 dot = strchr (lc, '.');
223 if (dot)
225 mod = strchr (++dot, '@');
226 if (!mod)
227 mod = dot + strlen (dot);
228 if (mod - dot < sizeof codepage && dot != mod)
230 memcpy (codepage, dot, mod - dot);
231 codepage [mod - dot] = 0;
235 newset = codepage;
236 #endif /*!HAVE_LANGINFO_CODESET*/
237 #endif /*!HAVE_W32_SYSTEM*/
240 full_newset = newset;
241 if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3))
243 newset += 3;
244 if (*newset == '-' || *newset == '_')
245 newset++;
248 /* Note that we silently assume that plain ASCII is actually meant
249 as Latin-1. This makes sense because many Unix system don't have
250 their locale set up properly and thus would get annoying error
251 messages and we have to handle all the "bug" reports. Latin-1 has
252 always been the character set used for 8 bit characters on Unix
253 systems. */
254 if ( !*newset
255 || !ascii_strcasecmp (newset, "8859-1" )
256 || !ascii_strcasecmp (newset, "646" )
257 || !ascii_strcasecmp (newset, "ASCII" )
258 || !ascii_strcasecmp (newset, "ANSI_X3.4-1968" )
261 active_charset_name = "iso-8859-1";
262 no_translation = 0;
263 use_iconv = 0;
265 else if ( !ascii_strcasecmp (newset, "utf8" )
266 || !ascii_strcasecmp(newset, "utf-8") )
268 active_charset_name = "utf-8";
269 no_translation = 1;
270 use_iconv = 0;
272 else
274 iconv_t cd;
276 #ifdef HAVE_W32_SYSTEM
277 if (load_libiconv ())
278 return -1;
279 #endif /*HAVE_W32_SYSTEM*/
281 cd = iconv_open (full_newset, "utf-8");
282 if (cd == (iconv_t)-1)
284 handle_iconv_error (full_newset, "utf-8", 0);
285 return -1;
287 iconv_close (cd);
288 cd = iconv_open ("utf-8", full_newset);
289 if (cd == (iconv_t)-1)
291 handle_iconv_error ("utf-8", full_newset, 0);
292 return -1;
294 iconv_close (cd);
295 active_charset_name = full_newset;
296 no_translation = 0;
297 use_iconv = 1;
299 return 0;
302 const char *
303 get_native_charset ()
305 return active_charset_name;
308 /* Return true if the native charset is utf-8. */
309 int
310 is_native_utf8 (void)
312 return no_translation;
316 /* Convert string, which is in native encoding to UTF8 and return a
317 new allocated UTF-8 string. */
318 char *
319 native_to_utf8 (const char *orig_string)
321 const unsigned char *string = (const unsigned char *)orig_string;
322 const unsigned char *s;
323 char *buffer;
324 unsigned char *p;
325 size_t length = 0;
327 if (no_translation)
329 /* Already utf-8 encoded. */
330 buffer = jnlib_xstrdup (orig_string);
332 else if (!use_iconv)
334 /* For Latin-1 we can avoid the iconv overhead. */
335 for (s = string; *s; s++)
337 length++;
338 if (*s & 0x80)
339 length++;
341 buffer = jnlib_xmalloc (length + 1);
342 for (p = (unsigned char *)buffer, s = string; *s; s++)
344 if ( (*s & 0x80 ))
346 *p++ = 0xc0 | ((*s >> 6) & 3);
347 *p++ = 0x80 | (*s & 0x3f);
349 else
350 *p++ = *s;
352 *p = 0;
354 else
356 /* Need to use iconv. */
357 iconv_t cd;
358 const char *inptr;
359 char *outptr;
360 size_t inbytes, outbytes;
362 cd = iconv_open ("utf-8", active_charset_name);
363 if (cd == (iconv_t)-1)
365 handle_iconv_error ("utf-8", active_charset_name, 1);
366 return native_to_utf8 (string);
369 for (s=string; *s; s++ )
371 length++;
372 if ((*s & 0x80))
373 length += 5; /* We may need up to 6 bytes for the utf8 output. */
375 buffer = jnlib_xmalloc (length + 1);
377 inptr = string;
378 inbytes = strlen (string);
379 outptr = buffer;
380 outbytes = length;
381 if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes,
382 &outptr, &outbytes) == (size_t)-1)
384 static int shown;
386 if (!shown)
387 log_info (_("conversion from `%s' to `%s' failed: %s\n"),
388 active_charset_name, "utf-8", strerror (errno));
389 shown = 1;
390 /* We don't do any conversion at all but use the strings as is. */
391 strcpy (buffer, string);
393 else /* Success. */
395 *outptr = 0;
396 /* We could realloc the buffer now but I doubt that it makes
397 much sense given that it will get freed anyway soon
398 after. */
400 iconv_close (cd);
402 return buffer;
407 static char *
408 do_utf8_to_native (const char *string, size_t length, int delim,
409 int with_iconv)
411 int nleft;
412 int i;
413 unsigned char encbuf[8];
414 int encidx;
415 const unsigned char *s;
416 size_t n;
417 char *buffer = NULL;
418 char *p = NULL;
419 unsigned long val = 0;
420 size_t slen;
421 int resync = 0;
423 /* First pass (p==NULL): count the extended utf-8 characters. */
424 /* Second pass (p!=NULL): create string. */
425 for (;;)
427 for (slen = length, nleft = encidx = 0, n = 0,
428 s = (const unsigned char *)string;
429 slen;
430 s++, slen--)
432 if (resync)
434 if (!(*s < 128 || (*s >= 0xc0 && *s <= 0xfd)))
436 /* Still invalid. */
437 if (p)
439 sprintf (p, "\\x%02x", *s);
440 p += 4;
442 n += 4;
443 continue;
445 resync = 0;
447 if (!nleft)
449 if (!(*s & 0x80))
451 /* Plain ascii. */
452 if ( delim != -1
453 && (*s < 0x20 || *s == 0x7f || *s == delim
454 || (delim && *s == '\\')))
456 n++;
457 if (p)
458 *p++ = '\\';
459 switch (*s)
461 case '\n': n++; if ( p ) *p++ = 'n'; break;
462 case '\r': n++; if ( p ) *p++ = 'r'; break;
463 case '\f': n++; if ( p ) *p++ = 'f'; break;
464 case '\v': n++; if ( p ) *p++ = 'v'; break;
465 case '\b': n++; if ( p ) *p++ = 'b'; break;
466 case 0: n++; if ( p ) *p++ = '0'; break;
467 default:
468 n += 3;
469 if (p)
471 sprintf (p, "x%02x", *s);
472 p += 3;
474 break;
477 else
479 if (p)
480 *p++ = *s;
481 n++;
484 else if ((*s & 0xe0) == 0xc0) /* 110x xxxx */
486 val = *s & 0x1f;
487 nleft = 1;
488 encidx = 0;
489 encbuf[encidx++] = *s;
491 else if ((*s & 0xf0) == 0xe0) /* 1110 xxxx */
493 val = *s & 0x0f;
494 nleft = 2;
495 encidx = 0;
496 encbuf[encidx++] = *s;
498 else if ((*s & 0xf8) == 0xf0) /* 1111 0xxx */
500 val = *s & 0x07;
501 nleft = 3;
502 encidx = 0;
503 encbuf[encidx++] = *s;
505 else if ((*s & 0xfc) == 0xf8) /* 1111 10xx */
507 val = *s & 0x03;
508 nleft = 4;
509 encidx = 0;
510 encbuf[encidx++] = *s;
512 else if ((*s & 0xfe) == 0xfc) /* 1111 110x */
514 val = *s & 0x01;
515 nleft = 5;
516 encidx = 0;
517 encbuf[encidx++] = *s;
519 else /* Invalid encoding: print as \xNN. */
521 if (p)
523 sprintf (p, "\\x%02x", *s);
524 p += 4;
526 n += 4;
527 resync = 1;
530 else if (*s < 0x80 || *s >= 0xc0) /* Invalid utf-8 */
532 if (p)
534 for (i = 0; i < encidx; i++)
536 sprintf (p, "\\x%02x", encbuf[i]);
537 p += 4;
539 sprintf (p, "\\x%02x", *s);
540 p += 4;
542 n += 4 + 4 * encidx;
543 nleft = 0;
544 encidx = 0;
545 resync = 1;
547 else
549 encbuf[encidx++] = *s;
550 val <<= 6;
551 val |= *s & 0x3f;
552 if (!--nleft) /* Ready. */
554 if (no_translation)
556 if (p)
558 for (i = 0; i < encidx; i++)
559 *p++ = encbuf[i];
561 n += encidx;
562 encidx = 0;
564 else if (with_iconv)
566 /* Our strategy for using iconv is a bit strange
567 but it better keeps compatibility with
568 previous versions in regard to how invalid
569 encodings are displayed. What we do is to
570 keep the utf-8 as is and have the real
571 translation step then at the end. Yes, I
572 know that this is ugly. However we are short
573 of the 1.4 release and for this branch we
574 should not mess too much around with iconv
575 things. One reason for this is that we don't
576 know enough about non-GNU iconv
577 implementation and want to minimize the risk
578 of breaking the code on too many platforms. */
579 if ( p )
581 for (i=0; i < encidx; i++ )
582 *p++ = encbuf[i];
584 n += encidx;
585 encidx = 0;
587 else /* Latin-1 case. */
589 if (val >= 0x80 && val < 256)
591 /* We can simply print this character */
592 n++;
593 if (p)
594 *p++ = val;
596 else
598 /* We do not have a translation: print utf8. */
599 if (p)
601 for (i = 0; i < encidx; i++)
603 sprintf (p, "\\x%02x", encbuf[i]);
604 p += 4;
607 n += encidx * 4;
608 encidx = 0;
615 if (!buffer)
617 /* Allocate the buffer after the first pass. */
618 buffer = p = jnlib_xmalloc (n + 1);
620 else if (with_iconv)
622 /* Note: See above for comments. */
623 iconv_t cd;
624 const char *inptr;
625 char *outbuf, *outptr;
626 size_t inbytes, outbytes;
628 *p = 0; /* Terminate the buffer. */
630 cd = iconv_open (active_charset_name, "utf-8");
631 if (cd == (iconv_t)-1)
633 handle_iconv_error (active_charset_name, "utf-8", 1);
634 jnlib_free (buffer);
635 return utf8_to_native (string, length, delim);
638 /* Allocate a new buffer large enough to hold all possible
639 encodings. */
640 n = p - buffer + 1;
641 inbytes = n - 1;;
642 inptr = buffer;
643 outbytes = n * MB_LEN_MAX;
644 if (outbytes / MB_LEN_MAX != n)
645 BUG (); /* Actually an overflow. */
646 outbuf = outptr = jnlib_xmalloc (outbytes);
647 if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes,
648 &outptr, &outbytes) == (size_t)-1)
650 static int shown;
652 if (!shown)
653 log_info (_("conversion from `%s' to `%s' failed: %s\n"),
654 "utf-8", active_charset_name, strerror (errno));
655 shown = 1;
656 /* Didn't worked out. Try again but without iconv. */
657 jnlib_free (buffer);
658 buffer = NULL;
659 jnlib_free (outbuf);
660 outbuf = do_utf8_to_native (string, length, delim, 0);
662 else /* Success. */
664 *outptr = 0; /* Make sure it is a string. */
665 /* We could realloc the buffer now but I doubt that it
666 makes much sense given that it will get freed
667 anyway soon after. */
668 jnlib_free (buffer);
670 iconv_close (cd);
671 return outbuf;
673 else /* Not using iconv. */
675 *p = 0; /* Make sure it is a string. */
676 return buffer;
681 /* Convert string, which is in UTF-8 to native encoding. Replace
682 illegal encodings by some "\xnn" and quote all control
683 characters. A character with value DELIM will always be quoted, it
684 must be a vanilla ASCII character. A DELIM value of -1 is special:
685 it disables all quoting of control characters. */
686 char *
687 utf8_to_native (const char *string, size_t length, int delim)
689 return do_utf8_to_native (string, length, delim, use_iconv);
695 /* Wrapper function for iconv_open, required for W32 as we dlopen that
696 library on that system. */
697 jnlib_iconv_t
698 jnlib_iconv_open (const char *tocode, const char *fromcode)
700 #ifdef HAVE_W32_SYSTEM
701 if (load_libiconv ())
702 return (jnlib_iconv_t)(-1);
703 #endif /*HAVE_W32_SYSTEM*/
705 return (jnlib_iconv_t)iconv_open (tocode, fromcode);
709 /* Wrapper function for iconv, required for W32 as we dlopen that
710 library on that system. */
711 size_t
712 jnlib_iconv (jnlib_iconv_t cd,
713 const char **inbuf, size_t *inbytesleft,
714 char **outbuf, size_t *outbytesleft)
717 #ifdef HAVE_W32_SYSTEM
718 if (load_libiconv ())
719 return 0;
720 #endif /*HAVE_W32_SYSTEM*/
722 return iconv ((iconv_t)cd, (char**)inbuf, inbytesleft, outbuf, outbytesleft);
725 /* Wrapper function for iconv_close, required for W32 as we dlopen that
726 library on that system. */
728 jnlib_iconv_close (jnlib_iconv_t cd)
730 #ifdef HAVE_W32_SYSTEM
731 if (load_libiconv ())
732 return 0;
733 #endif /*HAVE_W32_SYSTEM*/
735 return iconv_close ((iconv_t)cd);