Include <string.h> for memcpy.
[glib.git] / gconvert.c
blobd14b6342889632cceb8e9d49285f96779abeda1d
1 /* GLIB - Library of useful routines for C programming
3 * gconvert.c: Convert between character sets using iconv
4 * Copyright Red Hat Inc., 2000
5 * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the
19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 * Boston, MA 02111-1307, USA.
23 #include <iconv.h>
24 #include <errno.h>
25 #include <string.h>
26 #include <stdlib.h>
28 #include "glib.h"
29 #include "config.h"
31 #ifdef G_OS_WIN32
32 #include <windows.h>
33 #endif
35 #include "glibintl.h"
37 GQuark
38 g_convert_error_quark()
40 static GQuark quark;
41 if (!quark)
42 quark = g_quark_from_static_string ("g_convert_error");
44 return quark;
47 #if defined(USE_LIBICONV) && !defined (_LIBICONV_H)
48 #error libiconv in use but included iconv.h not from libiconv
49 #endif
50 #if !defined(USE_LIBICONV) && defined (_LIBICONV_H)
51 #error libiconv not in use but included iconv.h is from libiconv
52 #endif
54 GIConv
55 g_iconv_open (const gchar *to_codeset,
56 const gchar *from_codeset)
58 iconv_t cd = iconv_open (to_codeset, from_codeset);
60 return (GIConv)cd;
63 size_t
64 g_iconv (GIConv converter,
65 gchar **inbuf,
66 size_t *inbytes_left,
67 gchar **outbuf,
68 size_t *outbytes_left)
70 iconv_t cd = (iconv_t)converter;
72 return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
75 gint
76 g_iconv_close (GIConv converter)
78 iconv_t cd = (iconv_t)converter;
80 return iconv_close (cd);
83 static GIConv
84 open_converter (const gchar *to_codeset,
85 const gchar *from_codeset,
86 GError **error)
88 GIConv cd = g_iconv_open (to_codeset, from_codeset);
90 if (cd == (iconv_t) -1)
92 /* Something went wrong. */
93 if (errno == EINVAL)
94 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
95 _("Conversion from character set `%s' to `%s' is not supported"),
96 from_codeset, to_codeset);
97 else
98 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
99 _("Could not open converter from `%s' to `%s': %s"),
100 from_codeset, to_codeset, strerror (errno));
103 return cd;
108 * g_convert:
109 * @str: the string to convert
110 * @len: the length of the string
111 * @to_codeset: name of character set into which to convert @str
112 * @from_codeset: character set of @str.
113 * @bytes_read: location to store the number of bytes in the
114 * input string that were successfully converted, or %NULL.
115 * Even if the conversion was succesful, this may be
116 * less than len if there were partial characters
117 * at the end of the input. If the error
118 * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
119 * stored will the byte fofset after the last valid
120 * input sequence.
121 * @bytes_written: the stored in the output buffer (not including the
122 * terminating nul.
123 * @error: location to store the error occuring, or %NULL to ignore
124 * errors. Any of the errors in #GConvertError may occur.
126 * Convert a string from one character set to another.
128 * Return value: If the conversion was successful, a newly allocated
129 * NUL-terminated string, which must be freed with
130 * g_free. Otherwise %NULL and @error will be set.
132 gchar*
133 g_convert (const gchar *str,
134 gint len,
135 const gchar *to_codeset,
136 const gchar *from_codeset,
137 gint *bytes_read,
138 gint *bytes_written,
139 GError **error)
141 gchar *dest;
142 gchar *outp;
143 const gchar *p;
144 size_t inbytes_remaining;
145 size_t outbytes_remaining;
146 size_t err;
147 GIConv cd;
148 size_t outbuf_size;
149 gboolean have_error = FALSE;
151 g_return_val_if_fail (str != NULL, NULL);
152 g_return_val_if_fail (to_codeset != NULL, NULL);
153 g_return_val_if_fail (from_codeset != NULL, NULL);
155 cd = open_converter (to_codeset, from_codeset, error);
157 if (cd == (GIConv) -1)
159 if (bytes_read)
160 *bytes_read = 0;
162 if (bytes_written)
163 *bytes_written = 0;
165 return NULL;
168 if (len < 0)
169 len = strlen (str);
171 p = str;
172 inbytes_remaining = len;
174 /* Due to a GLIBC bug, round outbuf_size up to a multiple of 4 */
175 /* + 1 for nul in case len == 1 */
176 outbuf_size = ((len + 3) & ~3) + 1;
178 outbytes_remaining = outbuf_size - 1; /* -1 for nul */
179 outp = dest = g_malloc (outbuf_size);
181 again:
183 err = g_iconv (cd, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
185 if (err == (size_t) -1)
187 switch (errno)
189 case EINVAL:
190 /* Incomplete text, do not report an error */
191 break;
192 case E2BIG:
194 size_t used = outp - dest;
196 /* glibc's iconv can return E2BIG even if there is space
197 * remaining if an internal buffer is exhausted. The
198 * folllowing is a heuristic to catch this. The 16 is
199 * pretty arbitrary.
201 if (used + 16 > outbuf_size)
203 outbuf_size = (outbuf_size - 1) * 2 + 1;
204 dest = g_realloc (dest, outbuf_size);
206 outp = dest + used;
207 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
210 goto again;
212 case EILSEQ:
213 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
214 _("Invalid byte sequence in conversion input"));
215 have_error = TRUE;
216 break;
217 default:
218 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
219 _("Error during conversion: %s"),
220 strerror (errno));
221 have_error = TRUE;
222 break;
226 *outp = '\0';
228 g_iconv_close (cd);
230 if (bytes_read)
231 *bytes_read = p - str;
232 else
234 if ((p - str) != len)
236 if (!have_error)
238 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
239 _("Partial character sequence at end of input"));
240 have_error = TRUE;
245 if (bytes_written)
246 *bytes_written = outp - dest; /* Doesn't include '\0' */
248 if (have_error)
250 g_free (dest);
251 return NULL;
253 else
254 return dest;
258 * g_convert_with_fallback:
259 * @str: the string to convert
260 * @len: the length of the string
261 * @to_codeset: name of character set into which to convert @str
262 * @from_codeset: character set of @str.
263 * @fallback: UTF-8 string to use in place of character not
264 * present in the target encoding. (This must be
265 * in the target encoding), if %NULL, characters
266 * not in the target encoding will be represented
267 * as Unicode escapes \x{XXXX} or \x{XXXXXX}.
268 * @bytes_read: location to store the number of bytes in the
269 * input string that were successfully converted, or %NULL.
270 * Even if the conversion was succesful, this may be
271 * less than len if there were partial characters
272 * at the end of the input. If the error
273 * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
274 * stored will the byte fofset after the last valid
275 * input sequence.
276 * @bytes_written: the stored in the output buffer (not including the
277 * terminating nul.
278 * @error: location to store the error occuring, or %NULL to ignore
279 * errors. Any of the errors in #GConvertError may occur.
281 * Convert a string from one character set to another, possibly
282 * including fallback sequences for characters not representable
283 * in the output. Note that it is not guaranteed that the specification
284 * for the fallback sequences in @fallback will be honored. Some
285 * systems may do a approximate conversion from @from_codeset
286 * to @to_codeset in their iconv() functions, in which case GLib
287 * will simply return that approximate conversion.
289 * Return value: If the conversion was successful, a newly allocated
290 * NUL-terminated string, which must be freed with
291 * g_free. Otherwise %NULL and @error will be set.
293 gchar*
294 g_convert_with_fallback (const gchar *str,
295 gint len,
296 const gchar *to_codeset,
297 const gchar *from_codeset,
298 gchar *fallback,
299 gint *bytes_read,
300 gint *bytes_written,
301 GError **error)
303 gchar *utf8;
304 gchar *dest;
305 gchar *outp;
306 const gchar *insert_str = NULL;
307 const gchar *p;
308 int inbytes_remaining;
309 const gchar *save_p = NULL;
310 size_t save_inbytes = 0;
311 size_t outbytes_remaining;
312 size_t err;
313 GIConv cd;
314 size_t outbuf_size;
315 gboolean have_error = FALSE;
316 gboolean done = FALSE;
318 GError *local_error = NULL;
320 g_return_val_if_fail (str != NULL, NULL);
321 g_return_val_if_fail (to_codeset != NULL, NULL);
322 g_return_val_if_fail (from_codeset != NULL, NULL);
324 if (len < 0)
325 len = strlen (str);
327 /* Try an exact conversion; we only proceed if this fails
328 * due to an illegal sequence in the input string.
330 dest = g_convert (str, len, to_codeset, from_codeset,
331 bytes_read, bytes_written, &local_error);
332 if (!local_error)
333 return dest;
335 if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
337 g_propagate_error (error, local_error);
338 return NULL;
340 else
341 g_error_free (local_error);
343 local_error = NULL;
345 /* No go; to proceed, we need a converter from "UTF-8" to
346 * to_codeset, and the string as UTF-8.
348 cd = open_converter (to_codeset, "UTF-8", error);
349 if (cd == (GIConv) -1)
351 if (bytes_read)
352 *bytes_read = 0;
354 if (bytes_written)
355 *bytes_written = 0;
357 return NULL;
360 utf8 = g_convert (str, len, "UTF-8", from_codeset,
361 bytes_read, &inbytes_remaining, error);
362 if (!utf8)
363 return NULL;
365 /* Now the heart of the code. We loop through the UTF-8 string, and
366 * whenever we hit an offending character, we form fallback, convert
367 * the fallback to the target codeset, and then go back to
368 * converting the original string after finishing with the fallback.
370 * The variables save_p and save_inbytes store the input state
371 * for the original string while we are converting the fallback
373 p = utf8;
374 /* Due to a GLIBC bug, round outbuf_size up to a multiple of 4 */
375 /* + 1 for nul in case len == 1 */
376 outbuf_size = ((len + 3) & ~3) + 1;
377 outbytes_remaining = outbuf_size - 1; /* -1 for nul */
378 outp = dest = g_malloc (outbuf_size);
380 while (!done && !have_error)
382 size_t inbytes_tmp = inbytes_remaining;
383 err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
384 inbytes_remaining = inbytes_tmp;
386 if (err == (size_t) -1)
388 switch (errno)
390 case EINVAL:
391 g_assert_not_reached();
392 break;
393 case E2BIG:
395 size_t used = outp - dest;
397 /* glibc's iconv can return E2BIG even if there is space
398 * remaining if an internal buffer is exhausted. The
399 * folllowing is a heuristic to catch this. The 16 is
400 * pretty arbitrary.
402 if (used + 16 > outbuf_size)
404 outbuf_size = (outbuf_size - 1) * 2 + 1;
405 dest = g_realloc (dest, outbuf_size);
407 outp = dest + used;
408 outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
411 break;
413 case EILSEQ:
414 if (save_p)
416 /* Error converting fallback string - fatal
418 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
419 _("Cannot convert fallback '%s' to codeset '%s'"),
420 insert_str, to_codeset);
421 have_error = TRUE;
422 break;
424 else
426 if (!fallback)
428 gunichar ch = g_utf8_get_char (p);
429 insert_str = g_strdup_printf ("\\x{%0*X}",
430 (ch < 0x10000) ? 4 : 6,
431 ch);
433 else
434 insert_str = fallback;
436 save_p = g_utf8_next_char (p);
437 save_inbytes = inbytes_remaining - (save_p - p);
438 p = insert_str;
439 inbytes_remaining = strlen (p);
441 break;
442 default:
443 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
444 _("Error during conversion: %s"),
445 strerror (errno));
446 have_error = TRUE;
447 break;
450 else
452 if (save_p)
454 if (!fallback)
455 g_free ((gchar *)insert_str);
456 p = save_p;
457 inbytes_remaining = save_inbytes;
458 save_p = NULL;
460 else
461 done = TRUE;
465 /* Cleanup
467 *outp = '\0';
469 g_iconv_close (cd);
471 if (bytes_written)
472 *bytes_written = outp - str; /* Doesn't include '\0' */
474 g_free (utf8);
476 if (have_error)
478 if (save_p && !fallback)
479 g_free ((gchar *)insert_str);
480 g_free (dest);
481 return NULL;
483 else
484 return dest;
488 * g_locale_to_utf8
494 * g_locale_to_utf8:
495 * @opsysstring: a string in the encoding of the current locale
496 * @len: the length of the string, or -1 if the string is
497 * NULL-terminated.
498 * @bytes_read: location to store the number of bytes in the
499 * input string that were successfully converted, or %NULL.
500 * Even if the conversion was succesful, this may be
501 * less than len if there were partial characters
502 * at the end of the input. If the error
503 * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
504 * stored will the byte fofset after the last valid
505 * input sequence.
506 * @bytes_written: the stored in the output buffer (not including the
507 * terminating nul.
508 * @error: location to store the error occuring, or %NULL to ignore
509 * errors. Any of the errors in #GConvertError may occur.
511 * Converts a string which is in the encoding used for strings by
512 * the C runtime (usually the same as that used by the operating
513 * system) in the current locale into a UTF-8 string.
515 * Return value: The converted string, or %NULL on an error.
517 gchar *
518 g_locale_to_utf8 (const gchar *opsysstring,
519 gint len,
520 gint *bytes_read,
521 gint *bytes_written,
522 GError **error)
524 #ifdef G_OS_WIN32
526 gint i, clen, total_len, wclen, first;
527 wchar_t *wcs, wc;
528 gchar *result, *bp;
529 const wchar_t *wcp;
531 if (len == -1)
532 len = strlen (opsysstring);
534 wcs = g_new (wchar_t, len);
535 wclen = MultiByteToWideChar (CP_ACP, 0, opsysstring, len, wcs, len);
537 wcp = wcs;
538 total_len = 0;
539 for (i = 0; i < wclen; i++)
541 wc = *wcp++;
543 if (wc < 0x80)
544 total_len += 1;
545 else if (wc < 0x800)
546 total_len += 2;
547 else if (wc < 0x10000)
548 total_len += 3;
549 else if (wc < 0x200000)
550 total_len += 4;
551 else if (wc < 0x4000000)
552 total_len += 5;
553 else
554 total_len += 6;
557 result = g_malloc (total_len + 1);
559 wcp = wcs;
560 bp = result;
561 for (i = 0; i < wclen; i++)
563 wc = *wcp++;
565 if (wc < 0x80)
567 first = 0;
568 clen = 1;
570 else if (wc < 0x800)
572 first = 0xc0;
573 clen = 2;
575 else if (wc < 0x10000)
577 first = 0xe0;
578 clen = 3;
580 else if (wc < 0x200000)
582 first = 0xf0;
583 clen = 4;
585 else if (wc < 0x4000000)
587 first = 0xf8;
588 clen = 5;
590 else
592 first = 0xfc;
593 clen = 6;
596 /* Woo-hoo! */
597 switch (clen)
599 case 6: bp[5] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
600 case 5: bp[4] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
601 case 4: bp[3] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
602 case 3: bp[2] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
603 case 2: bp[1] = (wc & 0x3f) | 0x80; wc >>= 6; /* Fall through */
604 case 1: bp[0] = wc | first;
607 bp += clen;
609 *bp = 0;
611 g_free (wcs);
613 if (bytes_read)
614 *bytes_read = len;
615 if (bytes_written)
616 *bytes_written = total_len;
618 return result;
620 #else
622 char *charset, *str;
624 if (g_get_charset (&charset))
625 return g_strdup (opsysstring);
627 str = g_convert (opsysstring, len,
628 "UTF-8", charset, bytes_read, bytes_written, error);
630 return str;
631 #endif
635 * g_locale_from_utf8:
636 * @utf8string: a UTF-8 encoded string
637 * @len: the length of the string, or -1 if the string is
638 * NULL-terminated.
639 * @bytes_read: location to store the number of bytes in the
640 * input string that were successfully converted, or %NULL.
641 * Even if the conversion was succesful, this may be
642 * less than len if there were partial characters
643 * at the end of the input. If the error
644 * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
645 * stored will the byte fofset after the last valid
646 * input sequence.
647 * @bytes_written: the stored in the output buffer (not including the
648 * terminating nul.
649 * @error: location to store the error occuring, or %NULL to ignore
650 * errors. Any of the errors in #GConvertError may occur.
652 * Converts a string from UTF-8 to the encoding used for strings by
653 * the C runtime (usually the same as that used by the operating
654 * system) in the current locale.
656 * Return value: The converted string, or %NULL on an error.
658 gchar *
659 g_locale_from_utf8 (const gchar *utf8string,
660 gint len,
661 gint *bytes_read,
662 gint *bytes_written,
663 GError **error)
665 #ifdef G_OS_WIN32
667 gint i, mask, clen, mblen;
668 wchar_t *wcs, *wcp;
669 gchar *result;
670 guchar *cp, *end, c;
671 gint n;
673 if (len == -1)
674 len = strlen (utf8string);
676 /* First convert to wide chars */
677 cp = (guchar *) utf8string;
678 end = cp + len;
679 n = 0;
680 wcs = g_new (wchar_t, len + 1);
681 wcp = wcs;
682 while (cp != end)
684 mask = 0;
685 c = *cp;
687 if (c < 0x80)
689 clen = 1;
690 mask = 0x7f;
692 else if ((c & 0xe0) == 0xc0)
694 clen = 2;
695 mask = 0x1f;
697 else if ((c & 0xf0) == 0xe0)
699 clen = 3;
700 mask = 0x0f;
702 else if ((c & 0xf8) == 0xf0)
704 clen = 4;
705 mask = 0x07;
707 else if ((c & 0xfc) == 0xf8)
709 clen = 5;
710 mask = 0x03;
712 else if ((c & 0xfc) == 0xfc)
714 clen = 6;
715 mask = 0x01;
717 else
719 g_free (wcs);
720 return NULL;
723 if (cp + clen > end)
725 g_free (wcs);
726 return NULL;
729 *wcp = (cp[0] & mask);
730 for (i = 1; i < clen; i++)
732 if ((cp[i] & 0xc0) != 0x80)
734 g_free (wcs);
735 return NULL;
737 *wcp <<= 6;
738 *wcp |= (cp[i] & 0x3f);
741 cp += clen;
742 wcp++;
743 n++;
745 if (cp != end)
747 g_free (wcs);
748 return NULL;
751 /* n is the number of wide chars constructed */
753 /* Convert to a string in the current ANSI codepage */
755 result = g_new (gchar, 3 * n + 1);
756 mblen = WideCharToMultiByte (CP_ACP, 0, wcs, n, result, 3*n, NULL, NULL);
757 result[mblen] = 0;
758 g_free (wcs);
760 if (bytes_read)
761 *bytes_read = len;
762 if (bytes_written)
763 *bytes_written = mblen;
765 return result;
767 #else
769 gchar *charset, *str;
771 if (g_get_charset (&charset))
772 return g_strdup (utf8string);
774 str = g_convert (utf8string, strlen (utf8string),
775 charset, "UTF-8", bytes_read, bytes_written, error);
777 return str;
779 #endif
783 * g_filename_to_utf8:
784 * @opsysstring: a string in the encoding for filenames
785 * @len: the length of the string, or -1 if the string is
786 * NULL-terminated.
787 * @bytes_read: location to store the number of bytes in the
788 * input string that were successfully converted, or %NULL.
789 * Even if the conversion was succesful, this may be
790 * less than len if there were partial characters
791 * at the end of the input. If the error
792 * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
793 * stored will the byte fofset after the last valid
794 * input sequence.
795 * @bytes_written: the stored in the output buffer (not including the
796 * terminating nul.
797 * @error: location to store the error occuring, or %NULL to ignore
798 * errors. Any of the errors in #GConvertError may occur.
800 * Converts a string which is in the encoding used for filenames
801 * into a UTF-8 string.
803 * Return value: The converted string, or %NULL on an error.
805 gchar*
806 g_filename_to_utf8 (const gchar *opsysstring,
807 gint len,
808 gint *bytes_read,
809 gint *bytes_written,
810 GError **error)
812 #ifdef G_OS_WIN32
813 return g_locale_to_utf8 (opsysstring, len,
814 bytes_read, bytes_written,
815 error);
816 #else
817 if (getenv ("G_BROKEN_FILENAMES"))
818 return g_locale_to_utf8 (opsysstring, len,
819 bytes_read, bytes_written,
820 error);
822 if (bytes_read || bytes_written)
824 gint len = strlen (opsysstring);
826 if (bytes_read)
827 *bytes_read = len;
828 if (bytes_written)
829 *bytes_written = len;
832 if (len < 0)
833 return g_strdup (opsysstring);
834 else
835 return g_strndup (opsysstring, len);
836 #endif
840 * g_filename_from_utf8:
841 * @utf8string: a UTF-8 encoded string
842 * @len: the length of the string, or -1 if the string is
843 * NULL-terminated.
844 * @bytes_read: location to store the number of bytes in the
845 * input string that were successfully converted, or %NULL.
846 * Even if the conversion was succesful, this may be
847 * less than len if there were partial characters
848 * at the end of the input. If the error
849 * G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
850 * stored will the byte fofset after the last valid
851 * input sequence.
852 * @bytes_written: the stored in the output buffer (not including the
853 * terminating nul.
854 * @error: location to store the error occuring, or %NULL to ignore
855 * errors. Any of the errors in #GConvertError may occur.
857 * Converts a string from UTF-8 to the encoding used for filenames.
859 * Return value: The converted string, or %NULL on an error.
861 gchar*
862 g_filename_from_utf8 (const gchar *utf8string,
863 gint len,
864 gint *bytes_read,
865 gint *bytes_written,
866 GError **error)
868 #ifdef G_OS_WIN32
869 return g_locale_from_utf8 (utf8string, len,
870 bytes_read, bytes_written,
871 error);
872 #else
873 if (getenv ("G_BROKEN_FILENAMES"))
874 return g_locale_from_utf8 (utf8string, len,
875 bytes_read, bytes_written,
876 error);
878 if (bytes_read || bytes_written)
880 gint len = strlen (utf8string);
882 if (bytes_read)
883 *bytes_read = len;
884 if (bytes_written)
885 *bytes_written = len;
888 if (len < 0)
889 return g_strdup (utf8string);
890 else
891 return g_strndup (utf8string, len);
892 #endif