Implemented gpgsm's IMPORT --re-import feature.
[gnupg.git] / jnlib / utf8conv.c
blobfee4dc6a5e113ccdc3e989859ce9369948b572d1
1 /* utf8conv.c - UTF8 character set conversion
2 * Copyright (C) 1994, 1998, 1999, 2000, 2001,
3 * 2003, 2006, 2008 Free Software Foundation, Inc.
5 * This file is part of JNLIB.
7 * JNLIB is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation; either version 3 of
10 * the License, or (at your option) any later version.
12 * JNLIB is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include <config.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <stdarg.h>
25 #include <ctype.h>
26 #ifdef HAVE_LANGINFO_CODESET
27 #include <langinfo.h>
28 #endif
29 #include <errno.h>
30 #ifndef HAVE_W32_SYSTEM
31 # include <iconv.h>
32 #endif
34 #include "libjnlib-config.h"
35 #include "stringhelp.h"
36 #include "dynload.h"
37 #include "utf8conv.h"
/* Fallback in case the system headers do not define MB_LEN_MAX.  The
   value is used as the per-byte expansion factor when sizing the
   iconv output buffer in do_utf8_to_native.  */
#ifndef MB_LEN_MAX
#define MB_LEN_MAX 16
#endif

/* Name of the currently selected native character set.  */
static const char *active_charset_name = "iso-8859-1";
static int no_translation;     /* Set to true if we let simply pass through. */
static int use_iconv;          /* iconv conversion functions required. */
/* Under W32 we dlopen the iconv dll and don't require any iconv
   related headers at all.  However we need to define some stuff.  */
#ifdef HAVE_W32_SYSTEM
typedef void *iconv_t;
#ifndef ICONV_CONST
#define ICONV_CONST const
#endif
/* Entry points resolved at runtime from iconv.dll; all three are
   either valid or NULL after load_libiconv has run.  */
static iconv_t (* __stdcall iconv_open) (const char *tocode,
                                         const char *fromcode);
static size_t  (* __stdcall iconv) (iconv_t cd,
                                    const char **inbuf, size_t *inbytesleft,
                                    char **outbuf, size_t *outbytesleft);
static int     (* __stdcall iconv_close) (iconv_t cd);

/* Load iconv.dll and resolve the three libiconv entry points.  The
   load is attempted only once; later calls just report the cached
   outcome.  Returns 0 if the functions are available and -1 if not.  */
static int
load_libiconv (void)
{
  static int done;

  if (!done)
    {
      void *handle;

      done = 1; /* Do it right now because we might get called recursively
                   through gettext.  */

      handle = dlopen ("iconv.dll", RTLD_LAZY);
      if (handle)
        {
          /* Each lookup is only tried if the previous one succeeded,
             so a non-NULL iconv_close implies all three are set.  */
          iconv_open = dlsym (handle, "libiconv_open");
          if (iconv_open)
            iconv = dlsym (handle, "libiconv");
          if (iconv)
            iconv_close = dlsym (handle, "libiconv_close");
        }
      if (!handle || !iconv_close)
        {
          log_info (_("error loading `%s': %s\n"),
                    "iconv.dll", dlerror ());
          log_info (_("please see %s for more information\n"),
                    "http://www.gnupg.org/download/iconv.html");
          /* Never leave a partially resolved set of pointers behind.  */
          iconv_open = NULL;
          iconv = NULL;
          iconv_close = NULL;
          if (handle)
            dlclose (handle);
        }
    }
  return iconv_open? 0: -1;
}
#endif /*HAVE_W32_SYSTEM*/
101 /* Error handler for iconv failures. This is needed to not clutter the
102 output with repeated diagnostics about a missing conversion. */
103 static void
104 handle_iconv_error (const char *to, const char *from, int use_fallback)
106 if (errno == EINVAL)
108 static int shown1, shown2;
109 int x;
111 if (to && !strcmp (to, "utf-8"))
113 x = shown1;
114 shown1 = 1;
116 else
118 x = shown2;
119 shown2 = 1;
122 if (!x)
123 log_info (_("conversion from `%s' to `%s' not available\n"),
124 from, to);
126 else
128 static int shown;
130 if (!shown)
131 log_info (_("iconv_open failed: %s\n"), strerror (errno));
132 shown = 1;
135 if (use_fallback)
137 /* To avoid further error messages we fallback to Latin-1 for the
138 native encoding. This is justified as one can expect that on a
139 utf-8 enabled system nl_langinfo() will work and thus we won't
140 never get to here. Thus Latin-1 seems to be a reasonable
141 default. */
142 active_charset_name = "iso-8859-1";
143 no_translation = 0;
144 use_iconv = 0;
151 set_native_charset (const char *newset)
153 const char *full_newset;
155 if (!newset)
157 #ifdef HAVE_W32_SYSTEM
158 static char codepage[30];
159 unsigned int cpno;
160 const char *aliases;
162 /* We are a console program thus we need to use the
163 GetConsoleOutputCP function and not the the GetACP which
164 would give the codepage for a GUI program. Note this is not
165 a bulletproof detection because GetConsoleCP might return a
166 different one for console input. Not sure how to cope with
167 that. If the console Code page is not known we fall back to
168 the system code page. */
169 cpno = GetConsoleOutputCP ();
170 if (!cpno)
171 cpno = GetACP ();
172 sprintf (codepage, "CP%u", cpno );
173 /* Resolve alias. We use a long string string and not the usual
174 array to optimize if the code is taken to a DSO. Taken from
175 libiconv 1.9.2. */
176 newset = codepage;
177 for (aliases = ("CP936" "\0" "GBK" "\0"
178 "CP1361" "\0" "JOHAB" "\0"
179 "CP20127" "\0" "ASCII" "\0"
180 "CP20866" "\0" "KOI8-R" "\0"
181 "CP21866" "\0" "KOI8-RU" "\0"
182 "CP28591" "\0" "ISO-8859-1" "\0"
183 "CP28592" "\0" "ISO-8859-2" "\0"
184 "CP28593" "\0" "ISO-8859-3" "\0"
185 "CP28594" "\0" "ISO-8859-4" "\0"
186 "CP28595" "\0" "ISO-8859-5" "\0"
187 "CP28596" "\0" "ISO-8859-6" "\0"
188 "CP28597" "\0" "ISO-8859-7" "\0"
189 "CP28598" "\0" "ISO-8859-8" "\0"
190 "CP28599" "\0" "ISO-8859-9" "\0"
191 "CP28605" "\0" "ISO-8859-15" "\0"
192 "CP65001" "\0" "UTF-8" "\0");
193 *aliases;
194 aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
196 if (!strcmp (codepage, aliases) ||(*aliases == '*' && !aliases[1]))
198 newset = aliases + strlen (aliases) + 1;
199 break;
203 #else /*!HAVE_W32_SYSTEM*/
205 #ifdef HAVE_LANGINFO_CODESET
206 newset = nl_langinfo (CODESET);
207 #else /*!HAVE_LANGINFO_CODESET*/
208 /* Try to get the used charset from environment variables. */
209 static char codepage[30];
210 const char *lc, *dot, *mod;
212 strcpy (codepage, "iso-8859-1");
213 lc = getenv ("LC_ALL");
214 if (!lc || !*lc)
216 lc = getenv ("LC_CTYPE");
217 if (!lc || !*lc)
218 lc = getenv ("LANG");
220 if (lc && *lc)
222 dot = strchr (lc, '.');
223 if (dot)
225 mod = strchr (++dot, '@');
226 if (!mod)
227 mod = dot + strlen (dot);
228 if (mod - dot < sizeof codepage && dot != mod)
230 memcpy (codepage, dot, mod - dot);
231 codepage [mod - dot] = 0;
235 newset = codepage;
236 #endif /*!HAVE_LANGINFO_CODESET*/
237 #endif /*!HAVE_W32_SYSTEM*/
240 full_newset = newset;
241 if (strlen (newset) > 3 && !ascii_memcasecmp (newset, "iso", 3))
243 newset += 3;
244 if (*newset == '-' || *newset == '_')
245 newset++;
248 /* Note that we silently assume that plain ASCII is actually meant
249 as Latin-1. This makes sense because many Unix system don't have
250 their locale set up properly and thus would get annoying error
251 messages and we have to handle all the "bug" reports. Latin-1 has
252 always been the character set used for 8 bit characters on Unix
253 systems. */
254 if ( !*newset
255 || !ascii_strcasecmp (newset, "8859-1" )
256 || !ascii_strcasecmp (newset, "646" )
257 || !ascii_strcasecmp (newset, "ASCII" )
258 || !ascii_strcasecmp (newset, "ANSI_X3.4-1968" )
261 active_charset_name = "iso-8859-1";
262 no_translation = 0;
263 use_iconv = 0;
265 else if ( !ascii_strcasecmp (newset, "utf8" )
266 || !ascii_strcasecmp(newset, "utf-8") )
268 active_charset_name = "utf-8";
269 no_translation = 1;
270 use_iconv = 0;
272 else
274 iconv_t cd;
276 #ifdef HAVE_W32_SYSTEM
277 if (load_libiconv ())
278 return -1;
279 #endif /*HAVE_W32_SYSTEM*/
281 cd = iconv_open (full_newset, "utf-8");
282 if (cd == (iconv_t)-1)
284 handle_iconv_error (full_newset, "utf-8", 0);
285 return -1;
287 iconv_close (cd);
288 cd = iconv_open ("utf-8", full_newset);
289 if (cd == (iconv_t)-1)
291 handle_iconv_error ("utf-8", full_newset, 0);
292 return -1;
294 iconv_close (cd);
295 active_charset_name = full_newset;
296 no_translation = 0;
297 use_iconv = 1;
299 return 0;
302 const char *
303 get_native_charset ()
305 return active_charset_name;
308 /* Return true if the native charset is utf-8. */
309 int
310 is_native_utf8 (void)
312 return no_translation;
316 /* Convert string, which is in native encoding to UTF8 and return a
317 new allocated UTF-8 string. This function terminates the process
318 on memory shortage. */
319 char *
320 native_to_utf8 (const char *orig_string)
322 const unsigned char *string = (const unsigned char *)orig_string;
323 const unsigned char *s;
324 char *buffer;
325 unsigned char *p;
326 size_t length = 0;
328 if (no_translation)
330 /* Already utf-8 encoded. */
331 buffer = jnlib_xstrdup (orig_string);
333 else if (!use_iconv)
335 /* For Latin-1 we can avoid the iconv overhead. */
336 for (s = string; *s; s++)
338 length++;
339 if (*s & 0x80)
340 length++;
342 buffer = jnlib_xmalloc (length + 1);
343 for (p = (unsigned char *)buffer, s = string; *s; s++)
345 if ( (*s & 0x80 ))
347 *p++ = 0xc0 | ((*s >> 6) & 3);
348 *p++ = 0x80 | (*s & 0x3f);
350 else
351 *p++ = *s;
353 *p = 0;
355 else
357 /* Need to use iconv. */
358 iconv_t cd;
359 const char *inptr;
360 char *outptr;
361 size_t inbytes, outbytes;
363 cd = iconv_open ("utf-8", active_charset_name);
364 if (cd == (iconv_t)-1)
366 handle_iconv_error ("utf-8", active_charset_name, 1);
367 return native_to_utf8 (string);
370 for (s=string; *s; s++ )
372 length++;
373 if ((*s & 0x80))
374 length += 5; /* We may need up to 6 bytes for the utf8 output. */
376 buffer = jnlib_xmalloc (length + 1);
378 inptr = string;
379 inbytes = strlen (string);
380 outptr = buffer;
381 outbytes = length;
382 if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes,
383 &outptr, &outbytes) == (size_t)-1)
385 static int shown;
387 if (!shown)
388 log_info (_("conversion from `%s' to `%s' failed: %s\n"),
389 active_charset_name, "utf-8", strerror (errno));
390 shown = 1;
391 /* We don't do any conversion at all but use the strings as is. */
392 strcpy (buffer, string);
394 else /* Success. */
396 *outptr = 0;
397 /* We could realloc the buffer now but I doubt that it makes
398 much sense given that it will get freed anyway soon
399 after. */
401 iconv_close (cd);
403 return buffer;
408 static char *
409 do_utf8_to_native (const char *string, size_t length, int delim,
410 int with_iconv)
412 int nleft;
413 int i;
414 unsigned char encbuf[8];
415 int encidx;
416 const unsigned char *s;
417 size_t n;
418 char *buffer = NULL;
419 char *p = NULL;
420 unsigned long val = 0;
421 size_t slen;
422 int resync = 0;
424 /* First pass (p==NULL): count the extended utf-8 characters. */
425 /* Second pass (p!=NULL): create string. */
426 for (;;)
428 for (slen = length, nleft = encidx = 0, n = 0,
429 s = (const unsigned char *)string;
430 slen;
431 s++, slen--)
433 if (resync)
435 if (!(*s < 128 || (*s >= 0xc0 && *s <= 0xfd)))
437 /* Still invalid. */
438 if (p)
440 sprintf (p, "\\x%02x", *s);
441 p += 4;
443 n += 4;
444 continue;
446 resync = 0;
448 if (!nleft)
450 if (!(*s & 0x80))
452 /* Plain ascii. */
453 if ( delim != -1
454 && (*s < 0x20 || *s == 0x7f || *s == delim
455 || (delim && *s == '\\')))
457 n++;
458 if (p)
459 *p++ = '\\';
460 switch (*s)
462 case '\n': n++; if ( p ) *p++ = 'n'; break;
463 case '\r': n++; if ( p ) *p++ = 'r'; break;
464 case '\f': n++; if ( p ) *p++ = 'f'; break;
465 case '\v': n++; if ( p ) *p++ = 'v'; break;
466 case '\b': n++; if ( p ) *p++ = 'b'; break;
467 case 0: n++; if ( p ) *p++ = '0'; break;
468 default:
469 n += 3;
470 if (p)
472 sprintf (p, "x%02x", *s);
473 p += 3;
475 break;
478 else
480 if (p)
481 *p++ = *s;
482 n++;
485 else if ((*s & 0xe0) == 0xc0) /* 110x xxxx */
487 val = *s & 0x1f;
488 nleft = 1;
489 encidx = 0;
490 encbuf[encidx++] = *s;
492 else if ((*s & 0xf0) == 0xe0) /* 1110 xxxx */
494 val = *s & 0x0f;
495 nleft = 2;
496 encidx = 0;
497 encbuf[encidx++] = *s;
499 else if ((*s & 0xf8) == 0xf0) /* 1111 0xxx */
501 val = *s & 0x07;
502 nleft = 3;
503 encidx = 0;
504 encbuf[encidx++] = *s;
506 else if ((*s & 0xfc) == 0xf8) /* 1111 10xx */
508 val = *s & 0x03;
509 nleft = 4;
510 encidx = 0;
511 encbuf[encidx++] = *s;
513 else if ((*s & 0xfe) == 0xfc) /* 1111 110x */
515 val = *s & 0x01;
516 nleft = 5;
517 encidx = 0;
518 encbuf[encidx++] = *s;
520 else /* Invalid encoding: print as \xNN. */
522 if (p)
524 sprintf (p, "\\x%02x", *s);
525 p += 4;
527 n += 4;
528 resync = 1;
531 else if (*s < 0x80 || *s >= 0xc0) /* Invalid utf-8 */
533 if (p)
535 for (i = 0; i < encidx; i++)
537 sprintf (p, "\\x%02x", encbuf[i]);
538 p += 4;
540 sprintf (p, "\\x%02x", *s);
541 p += 4;
543 n += 4 + 4 * encidx;
544 nleft = 0;
545 encidx = 0;
546 resync = 1;
548 else
550 encbuf[encidx++] = *s;
551 val <<= 6;
552 val |= *s & 0x3f;
553 if (!--nleft) /* Ready. */
555 if (no_translation)
557 if (p)
559 for (i = 0; i < encidx; i++)
560 *p++ = encbuf[i];
562 n += encidx;
563 encidx = 0;
565 else if (with_iconv)
567 /* Our strategy for using iconv is a bit strange
568 but it better keeps compatibility with
569 previous versions in regard to how invalid
570 encodings are displayed. What we do is to
571 keep the utf-8 as is and have the real
572 translation step then at the end. Yes, I
573 know that this is ugly. However we are short
574 of the 1.4 release and for this branch we
575 should not mess too much around with iconv
576 things. One reason for this is that we don't
577 know enough about non-GNU iconv
578 implementation and want to minimize the risk
579 of breaking the code on too many platforms. */
580 if ( p )
582 for (i=0; i < encidx; i++ )
583 *p++ = encbuf[i];
585 n += encidx;
586 encidx = 0;
588 else /* Latin-1 case. */
590 if (val >= 0x80 && val < 256)
592 /* We can simply print this character */
593 n++;
594 if (p)
595 *p++ = val;
597 else
599 /* We do not have a translation: print utf8. */
600 if (p)
602 for (i = 0; i < encidx; i++)
604 sprintf (p, "\\x%02x", encbuf[i]);
605 p += 4;
608 n += encidx * 4;
609 encidx = 0;
616 if (!buffer)
618 /* Allocate the buffer after the first pass. */
619 buffer = p = jnlib_xmalloc (n + 1);
621 else if (with_iconv)
623 /* Note: See above for comments. */
624 iconv_t cd;
625 const char *inptr;
626 char *outbuf, *outptr;
627 size_t inbytes, outbytes;
629 *p = 0; /* Terminate the buffer. */
631 cd = iconv_open (active_charset_name, "utf-8");
632 if (cd == (iconv_t)-1)
634 handle_iconv_error (active_charset_name, "utf-8", 1);
635 jnlib_free (buffer);
636 return utf8_to_native (string, length, delim);
639 /* Allocate a new buffer large enough to hold all possible
640 encodings. */
641 n = p - buffer + 1;
642 inbytes = n - 1;;
643 inptr = buffer;
644 outbytes = n * MB_LEN_MAX;
645 if (outbytes / MB_LEN_MAX != n)
646 BUG (); /* Actually an overflow. */
647 outbuf = outptr = jnlib_xmalloc (outbytes);
648 if ( iconv (cd, (ICONV_CONST char **)&inptr, &inbytes,
649 &outptr, &outbytes) == (size_t)-1)
651 static int shown;
653 if (!shown)
654 log_info (_("conversion from `%s' to `%s' failed: %s\n"),
655 "utf-8", active_charset_name, strerror (errno));
656 shown = 1;
657 /* Didn't worked out. Try again but without iconv. */
658 jnlib_free (buffer);
659 buffer = NULL;
660 jnlib_free (outbuf);
661 outbuf = do_utf8_to_native (string, length, delim, 0);
663 else /* Success. */
665 *outptr = 0; /* Make sure it is a string. */
666 /* We could realloc the buffer now but I doubt that it
667 makes much sense given that it will get freed
668 anyway soon after. */
669 jnlib_free (buffer);
671 iconv_close (cd);
672 return outbuf;
674 else /* Not using iconv. */
676 *p = 0; /* Make sure it is a string. */
677 return buffer;
682 /* Convert string, which is in UTF-8 to native encoding. Replace
683 illegal encodings by some "\xnn" and quote all control
684 characters. A character with value DELIM will always be quoted, it
685 must be a vanilla ASCII character. A DELIM value of -1 is special:
686 it disables all quoting of control characters. This function
687 terminates the process on memory shortage. */
688 char *
689 utf8_to_native (const char *string, size_t length, int delim)
691 return do_utf8_to_native (string, length, delim, use_iconv);
697 /* Wrapper function for iconv_open, required for W32 as we dlopen that
698 library on that system. */
699 jnlib_iconv_t
700 jnlib_iconv_open (const char *tocode, const char *fromcode)
702 #ifdef HAVE_W32_SYSTEM
703 if (load_libiconv ())
704 return (jnlib_iconv_t)(-1);
705 #endif /*HAVE_W32_SYSTEM*/
707 return (jnlib_iconv_t)iconv_open (tocode, fromcode);
711 /* Wrapper function for iconv, required for W32 as we dlopen that
712 library on that system. */
713 size_t
714 jnlib_iconv (jnlib_iconv_t cd,
715 const char **inbuf, size_t *inbytesleft,
716 char **outbuf, size_t *outbytesleft)
719 #ifdef HAVE_W32_SYSTEM
720 if (load_libiconv ())
721 return 0;
722 #endif /*HAVE_W32_SYSTEM*/
724 return iconv ((iconv_t)cd, (char**)inbuf, inbytesleft, outbuf, outbytesleft);
727 /* Wrapper function for iconv_close, required for W32 as we dlopen that
728 library on that system. */
730 jnlib_iconv_close (jnlib_iconv_t cd)
732 #ifdef HAVE_W32_SYSTEM
733 if (load_libiconv ())
734 return 0;
735 #endif /*HAVE_W32_SYSTEM*/
737 return iconv_close ((iconv_t)cd);