Remove redundant header inclusions
[glib.git] / glib / gconvert.c
blob12c010d8e18388cca04253080134c61e3161f4a9
1 /* GLIB - Library of useful routines for C programming
3 * gconvert.c: Convert between character sets using iconv
4 * Copyright Red Hat Inc., 2000
5 * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the
19 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
20 * Boston, MA 02111-1307, USA.
23 #include "config.h"
25 #include "glib.h"
27 #ifndef G_OS_WIN32
28 #include <iconv.h>
29 #endif
30 #include <errno.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
35 #include "gprintfint.h"
36 #include "gthreadprivate.h"
37 #include "gunicode.h"
39 #ifdef G_OS_WIN32
40 #include "win_iconv.c"
41 #endif
43 #ifdef G_PLATFORM_WIN32
44 #define STRICT
45 #include <windows.h>
46 #undef STRICT
47 #endif
49 #include "glibintl.h"
51 #if defined(USE_LIBICONV_GNU) && !defined (_LIBICONV_H)
52 #error GNU libiconv in use but included iconv.h not from libiconv
53 #endif
54 #if !defined(USE_LIBICONV_GNU) && defined (_LIBICONV_H)
55 #error GNU libiconv not in use but included iconv.h is from libiconv
56 #endif
59 /**
60 * SECTION:conversions
61 * @title: Character Set Conversion
62 * @short_description: Convert strings between different character sets
64 * The g_convert() family of functions wraps the functionality of iconv(). In
65 * addition to pure character set conversions, GLib has functions to deal
66 * with the extra complications of encodings for file names.
68 * <refsect2 id="file-name-encodings">
69 * <title>File Name Encodings</title>
70 * <para>
71 * Historically, Unix has not had a defined encoding for file
72 * names: a file name is valid as long as it does not have path
73 * separators in it ("/"). However, displaying file names may
74 * require conversion: from the character set in which they were
75 * created, to the character set in which the application
76 * operates. Consider the Spanish file name
77 * "<filename>Presentaci&oacute;n.sxi</filename>". If the
78 * application which created it uses ISO-8859-1 for its encoding, then the actual file name on disk would look like this:
79 * </para>
80 * <programlisting id="filename-iso8859-1">
81 * Character: P r e s e n t a c i &oacute; n . s x i
82 * Hex code: 50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69
83 * </programlisting>
84 * <para>
85 * However, if the application uses UTF-8, the actual file name on
86 * disk would look like this:
87 * </para>
88 * <programlisting id="filename-utf-8">
89 * Character: P r e s e n t a c i &oacute; n . s x i
90 * Hex code: 50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69
91 * </programlisting>
92 * <para>
93 * GLib uses UTF-8 for its strings, and GUI toolkits like GTK+
94 * that use GLib do the same thing. If you get a file name from
95 * the file system, for example, from readdir(3) or from g_dir_read_name(),
96 * and you wish to display the file name to the user, you
97 * <emphasis>will</emphasis> need to convert it into UTF-8. The
98 * opposite case is when the user types the name of a file he
99 * wishes to save: the toolkit will give you that string in
100 * UTF-8 encoding, and you will need to convert it to the
101 * character set used for file names before you can create the
102 * file with open(2) or fopen(3).
103 * </para>
104 * <para>
105 * By default, GLib assumes that file names on disk are in UTF-8
106 * encoding. This is a valid assumption for file systems which
107 * were created relatively recently: most applications use UTF-8
108 * encoding for their strings, and that is also what they use for
109 * the file names they create. However, older file systems may
110 * still contain file names created in "older" encodings, such as
111 * ISO-8859-1. In this case, for compatibility reasons, you may
112 * want to instruct GLib to use that particular encoding for file
113 * names rather than UTF-8. You can do this by specifying the
114 * encoding for file names in the <link
115 * linkend="G_FILENAME_ENCODING"><envar>G_FILENAME_ENCODING</envar></link>
116 * environment variable. For example, if your installation uses
117 * ISO-8859-1 for file names, you can put this in your
118 * <filename>~/.profile</filename>:
119 * </para>
120 * <programlisting>
121 * export G_FILENAME_ENCODING=ISO-8859-1
122 * </programlisting>
123 * <para>
124 * GLib provides the functions g_filename_to_utf8() and
125 * g_filename_from_utf8() to perform the necessary conversions. These
126 * functions convert file names from the encoding specified in
127 * <envar>G_FILENAME_ENCODING</envar> to UTF-8 and vice-versa.
128 * <xref linkend="file-name-encodings-diagram"/> illustrates how
129 * these functions are used to convert between UTF-8 and the
130 * encoding for file names in the file system.
131 * </para>
132 * <figure id="file-name-encodings-diagram">
133 * <title>Conversion between File Name Encodings</title>
134 * <graphic fileref="file-name-encodings.png" format="PNG"/>
135 * </figure>
136 * <refsect3 id="file-name-encodings-checklist">
137 * <title>Checklist for Application Writers</title>
138 * <para>
139 * This section is a practical summary of the detailed
140 * description above. You can use this as a checklist of
141 * things to do to make sure your applications process file
142 * name encodings correctly.
143 * </para>
144 * <orderedlist>
145 * <listitem><para>
146 * If you get a file name from the file system from a function
147 * such as readdir(3) or gtk_file_chooser_get_filename(),
148 * you do not need to do any conversion to pass that
149 * file name to functions like open(2), rename(2), or
150 * fopen(3) &mdash; those are "raw" file names which the file
151 * system understands.
152 * </para></listitem>
153 * <listitem><para>
154 * If you need to display a file name, convert it to UTF-8 first by
155 * using g_filename_to_utf8(). If conversion fails, display a string like
156 * "<literal>Unknown file name</literal>". <emphasis>Do not</emphasis>
157 * convert this string back into the encoding used for file names if you
158 * wish to pass it to the file system; use the original file name instead.
159 * For example, the document window of a word processor could display
160 * "Unknown file name" in its title bar but still let the user save the
161 * file, as it would keep the raw file name internally. This can happen
162 * if the user has not set the <envar>G_FILENAME_ENCODING</envar>
163 * environment variable even though he has files whose names are not
164 * encoded in UTF-8.
165 * </para></listitem>
166 * <listitem><para>
167 * If your user interface lets the user type a file name for saving or
168 * renaming, convert it to the encoding used for file names in the file
169 * system by using g_filename_from_utf8(). Pass the converted file name
170 * to functions like fopen(3). If conversion fails, ask the user to enter
171 * a different file name. This can happen if the user types Japanese
172 * characters when <envar>G_FILENAME_ENCODING</envar> is set to
173 * <literal>ISO-8859-1</literal>, for example.
174 * </para></listitem>
175 * </orderedlist>
176 * </refsect3>
177 * </refsect2>
180 /* We try to terminate strings in unknown charsets with this many zero bytes
181 * to ensure that multibyte strings really are nul-terminated when we return
182 * them from g_convert() and friends.
184 #define NUL_TERMINATOR_LENGTH 4
186 GQuark
187 g_convert_error_quark (void)
189 return g_quark_from_static_string ("g_convert_error");
192 static gboolean
193 try_conversion (const char *to_codeset,
194 const char *from_codeset,
195 iconv_t *cd)
197 *cd = iconv_open (to_codeset, from_codeset);
199 if (*cd == (iconv_t)-1 && errno == EINVAL)
200 return FALSE;
201 else
202 return TRUE;
205 static gboolean
206 try_to_aliases (const char **to_aliases,
207 const char *from_codeset,
208 iconv_t *cd)
210 if (to_aliases)
212 const char **p = to_aliases;
213 while (*p)
215 if (try_conversion (*p, from_codeset, cd))
216 return TRUE;
218 p++;
222 return FALSE;
225 G_GNUC_INTERNAL extern const char **
226 _g_charset_get_aliases (const char *canonical_name);
229 * g_iconv_open:
230 * @to_codeset: destination codeset
231 * @from_codeset: source codeset
233 * Same as the standard UNIX routine iconv_open(), but
234 * may be implemented via libiconv on UNIX flavors that lack
235 * a native implementation.
237 * GLib provides g_convert() and g_locale_to_utf8() which are likely
238 * more convenient than the raw iconv wrappers.
240 * Return value: a "conversion descriptor", or (GIConv)-1 if
241 * opening the converter failed.
243 GIConv
244 g_iconv_open (const gchar *to_codeset,
245 const gchar *from_codeset)
247 iconv_t cd;
249 if (!try_conversion (to_codeset, from_codeset, &cd))
251 const char **to_aliases = _g_charset_get_aliases (to_codeset);
252 const char **from_aliases = _g_charset_get_aliases (from_codeset);
254 if (from_aliases)
256 const char **p = from_aliases;
257 while (*p)
259 if (try_conversion (to_codeset, *p, &cd))
260 goto out;
262 if (try_to_aliases (to_aliases, *p, &cd))
263 goto out;
265 p++;
269 if (try_to_aliases (to_aliases, from_codeset, &cd))
270 goto out;
273 out:
274 return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd;
278 * g_iconv:
279 * @converter: conversion descriptor from g_iconv_open()
280 * @inbuf: bytes to convert
281 * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
282 * @outbuf: converted output bytes
283 * @outbytes_left: inout parameter, bytes available to fill in @outbuf
285 * Same as the standard UNIX routine iconv(), but
286 * may be implemented via libiconv on UNIX flavors that lack
287 * a native implementation.
289 * GLib provides g_convert() and g_locale_to_utf8() which are likely
290 * more convenient than the raw iconv wrappers.
292 * Return value: count of non-reversible conversions, or -1 on error
294 gsize
295 g_iconv (GIConv converter,
296 gchar **inbuf,
297 gsize *inbytes_left,
298 gchar **outbuf,
299 gsize *outbytes_left)
301 iconv_t cd = (iconv_t)converter;
303 return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
307 * g_iconv_close:
308 * @converter: a conversion descriptor from g_iconv_open()
310 * Same as the standard UNIX routine iconv_close(), but
311 * may be implemented via libiconv on UNIX flavors that lack
312 * a native implementation. Should be called to clean up
313 * the conversion descriptor from g_iconv_open() when
314 * you are done converting things.
316 * GLib provides g_convert() and g_locale_to_utf8() which are likely
317 * more convenient than the raw iconv wrappers.
319 * Return value: -1 on error, 0 on success
321 gint
322 g_iconv_close (GIConv converter)
324 iconv_t cd = (iconv_t)converter;
326 return iconv_close (cd);
330 #ifdef NEED_ICONV_CACHE
332 #define ICONV_CACHE_SIZE (16)
334 struct _iconv_cache_bucket {
335 gchar *key;
336 guint32 refcount;
337 gboolean used;
338 GIConv cd;
341 static GList *iconv_cache_list;
342 static GHashTable *iconv_cache;
343 static GHashTable *iconv_open_hash;
344 static guint iconv_cache_size = 0;
345 G_LOCK_DEFINE_STATIC (iconv_cache_lock);
347 /* caller *must* hold the iconv_cache_lock */
348 static void
349 iconv_cache_init (void)
351 static gboolean initialized = FALSE;
353 if (initialized)
354 return;
356 iconv_cache_list = NULL;
357 iconv_cache = g_hash_table_new (g_str_hash, g_str_equal);
358 iconv_open_hash = g_hash_table_new (g_direct_hash, g_direct_equal);
360 initialized = TRUE;
365 * iconv_cache_bucket_new:
366 * @key: cache key
367 * @cd: iconv descriptor
369 * Creates a new cache bucket, inserts it into the cache and
370 * increments the cache size.
372 * This assumes ownership of @key.
374 * Returns a pointer to the newly allocated cache bucket.
376 static struct _iconv_cache_bucket *
377 iconv_cache_bucket_new (gchar *key, GIConv cd)
379 struct _iconv_cache_bucket *bucket;
381 bucket = g_new (struct _iconv_cache_bucket, 1);
382 bucket->key = key;
383 bucket->refcount = 1;
384 bucket->used = TRUE;
385 bucket->cd = cd;
387 g_hash_table_insert (iconv_cache, bucket->key, bucket);
389 /* FIXME: if we sorted the list so items with few refcounts were
390 first, then we could expire them faster in iconv_cache_expire_unused () */
391 iconv_cache_list = g_list_prepend (iconv_cache_list, bucket);
393 iconv_cache_size++;
395 return bucket;
400 * iconv_cache_bucket_expire:
401 * @node: cache bucket's node
402 * @bucket: cache bucket
404 * Expires a single cache bucket @bucket. This should only ever be
405 * called on a bucket that currently has no used iconv descriptors
406 * open.
408 * @node is not a required argument. If @node is not supplied, we
409 * search for it ourselves.
411 static void
412 iconv_cache_bucket_expire (GList *node, struct _iconv_cache_bucket *bucket)
414 g_hash_table_remove (iconv_cache, bucket->key);
416 if (node == NULL)
417 node = g_list_find (iconv_cache_list, bucket);
419 g_assert (node != NULL);
421 if (node->prev)
423 node->prev->next = node->next;
424 if (node->next)
425 node->next->prev = node->prev;
427 else
429 iconv_cache_list = node->next;
430 if (node->next)
431 node->next->prev = NULL;
434 g_list_free_1 (node);
436 g_free (bucket->key);
437 g_iconv_close (bucket->cd);
438 g_free (bucket);
440 iconv_cache_size--;
445 * iconv_cache_expire_unused:
447 * Expires as many unused cache buckets as it needs to in order to get
448 * the total number of buckets < ICONV_CACHE_SIZE.
450 static void
451 iconv_cache_expire_unused (void)
453 struct _iconv_cache_bucket *bucket;
454 GList *node, *next;
456 node = iconv_cache_list;
457 while (node && iconv_cache_size >= ICONV_CACHE_SIZE)
459 next = node->next;
461 bucket = node->data;
462 if (bucket->refcount == 0)
463 iconv_cache_bucket_expire (node, bucket);
465 node = next;
469 static GIConv
470 open_converter (const gchar *to_codeset,
471 const gchar *from_codeset,
472 GError **error)
474 struct _iconv_cache_bucket *bucket;
475 gchar *key, *dyn_key, auto_key[80];
476 GIConv cd;
477 gsize len_from_codeset, len_to_codeset;
479 /* create our key */
480 len_from_codeset = strlen (from_codeset);
481 len_to_codeset = strlen (to_codeset);
482 if (len_from_codeset + len_to_codeset + 2 < sizeof (auto_key))
484 key = auto_key;
485 dyn_key = NULL;
487 else
488 key = dyn_key = g_malloc (len_from_codeset + len_to_codeset + 2);
489 memcpy (key, from_codeset, len_from_codeset);
490 key[len_from_codeset] = ':';
491 strcpy (key + len_from_codeset + 1, to_codeset);
493 G_LOCK (iconv_cache_lock);
495 /* make sure the cache has been initialized */
496 iconv_cache_init ();
498 bucket = g_hash_table_lookup (iconv_cache, key);
499 if (bucket)
501 g_free (dyn_key);
503 if (bucket->used)
505 cd = g_iconv_open (to_codeset, from_codeset);
506 if (cd == (GIConv) -1)
507 goto error;
509 else
511 /* Apparently iconv on Solaris <= 7 segfaults if you pass in
512 * NULL for anything but inbuf; work around that. (NULL outbuf
513 * or NULL *outbuf is allowed by Unix98.)
515 gsize inbytes_left = 0;
516 gchar *outbuf = NULL;
517 gsize outbytes_left = 0;
519 cd = bucket->cd;
520 bucket->used = TRUE;
522 /* reset the descriptor */
523 g_iconv (cd, NULL, &inbytes_left, &outbuf, &outbytes_left);
526 bucket->refcount++;
528 else
530 cd = g_iconv_open (to_codeset, from_codeset);
531 if (cd == (GIConv) -1)
533 g_free (dyn_key);
534 goto error;
537 iconv_cache_expire_unused ();
539 bucket = iconv_cache_bucket_new (dyn_key ? dyn_key : g_strdup (key), cd);
542 g_hash_table_insert (iconv_open_hash, cd, bucket->key);
544 G_UNLOCK (iconv_cache_lock);
546 return cd;
548 error:
550 G_UNLOCK (iconv_cache_lock);
552 /* Something went wrong. */
553 if (error)
555 if (errno == EINVAL)
556 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
557 _("Conversion from character set '%s' to '%s' is not supported"),
558 from_codeset, to_codeset);
559 else
560 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
561 _("Could not open converter from '%s' to '%s'"),
562 from_codeset, to_codeset);
565 return cd;
568 static int
569 close_converter (GIConv converter)
571 struct _iconv_cache_bucket *bucket;
572 const gchar *key;
573 GIConv cd;
575 cd = converter;
577 if (cd == (GIConv) -1)
578 return 0;
580 G_LOCK (iconv_cache_lock);
582 key = g_hash_table_lookup (iconv_open_hash, cd);
583 if (key)
585 g_hash_table_remove (iconv_open_hash, cd);
587 bucket = g_hash_table_lookup (iconv_cache, key);
588 g_assert (bucket);
590 bucket->refcount--;
592 if (cd == bucket->cd)
593 bucket->used = FALSE;
594 else
595 g_iconv_close (cd);
597 if (!bucket->refcount && iconv_cache_size > ICONV_CACHE_SIZE)
599 /* expire this cache bucket */
600 iconv_cache_bucket_expire (NULL, bucket);
603 else
605 G_UNLOCK (iconv_cache_lock);
607 g_warning ("This iconv context wasn't opened using open_converter");
609 return g_iconv_close (converter);
612 G_UNLOCK (iconv_cache_lock);
614 return 0;
617 #else /* !NEED_ICONV_CACHE */
619 static GIConv
620 open_converter (const gchar *to_codeset,
621 const gchar *from_codeset,
622 GError **error)
624 GIConv cd;
626 cd = g_iconv_open (to_codeset, from_codeset);
628 if (cd == (GIConv) -1)
630 /* Something went wrong. */
631 if (error)
633 if (errno == EINVAL)
634 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
635 _("Conversion from character set '%s' to '%s' is not supported"),
636 from_codeset, to_codeset);
637 else
638 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
639 _("Could not open converter from '%s' to '%s'"),
640 from_codeset, to_codeset);
644 return cd;
647 static int
648 close_converter (GIConv cd)
650 if (cd == (GIConv) -1)
651 return 0;
653 return g_iconv_close (cd);
656 #endif /* NEED_ICONV_CACHE */
659 * g_convert_with_iconv:
660 * @str: the string to convert
661 * @len: the length of the string, or -1 if the string is
662 * nul-terminated<footnoteref linkend="nul-unsafe"/>.
663 * @converter: conversion descriptor from g_iconv_open()
664 * @bytes_read: location to store the number of bytes in the
665 * input string that were successfully converted, or %NULL.
666 * Even if the conversion was successful, this may be
667 * less than @len if there were partial characters
668 * at the end of the input. If the error
669 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
670 * stored will the byte offset after the last valid
671 * input sequence.
672 * @bytes_written: the number of bytes stored in the output buffer (not
673 * including the terminating nul).
674 * @error: location to store the error occuring, or %NULL to ignore
675 * errors. Any of the errors in #GConvertError may occur.
677 * Converts a string from one character set to another.
679 * Note that you should use g_iconv() for streaming
680 * conversions<footnote id="streaming-state">
681 * <para>
682 * Despite the fact that @byes_read can return information about partial
683 * characters, the <literal>g_convert_...</literal> functions
684 * are not generally suitable for streaming. If the underlying converter
685 * being used maintains internal state, then this won't be preserved
686 * across successive calls to g_convert(), g_convert_with_iconv() or
687 * g_convert_with_fallback(). (An example of this is the GNU C converter
688 * for CP1255 which does not emit a base character until it knows that
689 * the next character is not a mark that could combine with the base
690 * character.)
691 * </para>
692 * </footnote>.
694 * Return value: If the conversion was successful, a newly allocated
695 * nul-terminated string, which must be freed with
696 * g_free(). Otherwise %NULL and @error will be set.
698 gchar*
699 g_convert_with_iconv (const gchar *str,
700 gssize len,
701 GIConv converter,
702 gsize *bytes_read,
703 gsize *bytes_written,
704 GError **error)
706 gchar *dest;
707 gchar *outp;
708 const gchar *p;
709 gsize inbytes_remaining;
710 gsize outbytes_remaining;
711 gsize err;
712 gsize outbuf_size;
713 gboolean have_error = FALSE;
714 gboolean done = FALSE;
715 gboolean reset = FALSE;
717 g_return_val_if_fail (converter != (GIConv) -1, NULL);
719 if (len < 0)
720 len = strlen (str);
722 p = str;
723 inbytes_remaining = len;
724 outbuf_size = len + NUL_TERMINATOR_LENGTH;
726 outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
727 outp = dest = g_malloc (outbuf_size);
729 while (!done && !have_error)
731 if (reset)
732 err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining);
733 else
734 err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
736 if (err == (gsize) -1)
738 switch (errno)
740 case EINVAL:
741 /* Incomplete text, do not report an error */
742 done = TRUE;
743 break;
744 case E2BIG:
746 gsize used = outp - dest;
748 outbuf_size *= 2;
749 dest = g_realloc (dest, outbuf_size);
751 outp = dest + used;
752 outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
754 break;
755 case EILSEQ:
756 if (error)
757 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
758 _("Invalid byte sequence in conversion input"));
759 have_error = TRUE;
760 break;
761 default:
763 int errsv = errno;
765 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
766 _("Error during conversion: %s"),
767 g_strerror (errsv));
769 have_error = TRUE;
770 break;
773 else
775 if (!reset)
777 /* call g_iconv with NULL inbuf to cleanup shift state */
778 reset = TRUE;
779 inbytes_remaining = 0;
781 else
782 done = TRUE;
786 memset (outp, 0, NUL_TERMINATOR_LENGTH);
788 if (bytes_read)
789 *bytes_read = p - str;
790 else
792 if ((p - str) != len)
794 if (!have_error)
796 if (error)
797 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
798 _("Partial character sequence at end of input"));
799 have_error = TRUE;
804 if (bytes_written)
805 *bytes_written = outp - dest; /* Doesn't include '\0' */
807 if (have_error)
809 g_free (dest);
810 return NULL;
812 else
813 return dest;
817 * g_convert:
818 * @str: the string to convert
819 * @len: the length of the string, or -1 if the string is
820 * nul-terminated<footnote id="nul-unsafe">
821 <para>
822 Note that some encodings may allow nul bytes to
823 occur inside strings. In that case, using -1 for
824 the @len parameter is unsafe.
825 </para>
826 </footnote>.
827 * @to_codeset: name of character set into which to convert @str
828 * @from_codeset: character set of @str.
829 * @bytes_read: location to store the number of bytes in the
830 * input string that were successfully converted, or %NULL.
831 * Even if the conversion was successful, this may be
832 * less than @len if there were partial characters
833 * at the end of the input. If the error
834 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
835 * stored will the byte offset after the last valid
836 * input sequence.
837 * @bytes_written: the number of bytes stored in the output buffer (not
838 * including the terminating nul).
839 * @error: location to store the error occuring, or %NULL to ignore
840 * errors. Any of the errors in #GConvertError may occur.
842 * Converts a string from one character set to another.
844 * Note that you should use g_iconv() for streaming
845 * conversions<footnoteref linkend="streaming-state"/>.
847 * Return value: If the conversion was successful, a newly allocated
848 * nul-terminated string, which must be freed with
849 * g_free(). Otherwise %NULL and @error will be set.
851 gchar*
852 g_convert (const gchar *str,
853 gssize len,
854 const gchar *to_codeset,
855 const gchar *from_codeset,
856 gsize *bytes_read,
857 gsize *bytes_written,
858 GError **error)
860 gchar *res;
861 GIConv cd;
863 g_return_val_if_fail (str != NULL, NULL);
864 g_return_val_if_fail (to_codeset != NULL, NULL);
865 g_return_val_if_fail (from_codeset != NULL, NULL);
867 cd = open_converter (to_codeset, from_codeset, error);
869 if (cd == (GIConv) -1)
871 if (bytes_read)
872 *bytes_read = 0;
874 if (bytes_written)
875 *bytes_written = 0;
877 return NULL;
880 res = g_convert_with_iconv (str, len, cd,
881 bytes_read, bytes_written,
882 error);
884 close_converter (cd);
886 return res;
890 * g_convert_with_fallback:
891 * @str: the string to convert
892 * @len: the length of the string, or -1 if the string is
893 * nul-terminated<footnoteref linkend="nul-unsafe"/>.
894 * @to_codeset: name of character set into which to convert @str
895 * @from_codeset: character set of @str.
896 * @fallback: UTF-8 string to use in place of character not
897 * present in the target encoding. (The string must be
898 * representable in the target encoding).
899 If %NULL, characters not in the target encoding will
900 be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
901 * @bytes_read: location to store the number of bytes in the
902 * input string that were successfully converted, or %NULL.
903 * Even if the conversion was successful, this may be
904 * less than @len if there were partial characters
905 * at the end of the input.
906 * @bytes_written: the number of bytes stored in the output buffer (not
907 * including the terminating nul).
908 * @error: location to store the error occuring, or %NULL to ignore
909 * errors. Any of the errors in #GConvertError may occur.
911 * Converts a string from one character set to another, possibly
912 * including fallback sequences for characters not representable
913 * in the output. Note that it is not guaranteed that the specification
914 * for the fallback sequences in @fallback will be honored. Some
915 * systems may do an approximate conversion from @from_codeset
916 * to @to_codeset in their iconv() functions,
917 * in which case GLib will simply return that approximate conversion.
919 * Note that you should use g_iconv() for streaming
920 * conversions<footnoteref linkend="streaming-state"/>.
922 * Return value: If the conversion was successful, a newly allocated
923 * nul-terminated string, which must be freed with
924 * g_free(). Otherwise %NULL and @error will be set.
926 gchar*
927 g_convert_with_fallback (const gchar *str,
928 gssize len,
929 const gchar *to_codeset,
930 const gchar *from_codeset,
931 const gchar *fallback,
932 gsize *bytes_read,
933 gsize *bytes_written,
934 GError **error)
936 gchar *utf8;
937 gchar *dest;
938 gchar *outp;
939 const gchar *insert_str = NULL;
940 const gchar *p;
941 gsize inbytes_remaining;
942 const gchar *save_p = NULL;
943 gsize save_inbytes = 0;
944 gsize outbytes_remaining;
945 gsize err;
946 GIConv cd;
947 gsize outbuf_size;
948 gboolean have_error = FALSE;
949 gboolean done = FALSE;
951 GError *local_error = NULL;
953 g_return_val_if_fail (str != NULL, NULL);
954 g_return_val_if_fail (to_codeset != NULL, NULL);
955 g_return_val_if_fail (from_codeset != NULL, NULL);
957 if (len < 0)
958 len = strlen (str);
960 /* Try an exact conversion; we only proceed if this fails
961 * due to an illegal sequence in the input string.
963 dest = g_convert (str, len, to_codeset, from_codeset,
964 bytes_read, bytes_written, &local_error);
965 if (!local_error)
966 return dest;
968 if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
970 g_propagate_error (error, local_error);
971 return NULL;
973 else
974 g_error_free (local_error);
976 local_error = NULL;
978 /* No go; to proceed, we need a converter from "UTF-8" to
979 * to_codeset, and the string as UTF-8.
981 cd = open_converter (to_codeset, "UTF-8", error);
982 if (cd == (GIConv) -1)
984 if (bytes_read)
985 *bytes_read = 0;
987 if (bytes_written)
988 *bytes_written = 0;
990 return NULL;
993 utf8 = g_convert (str, len, "UTF-8", from_codeset,
994 bytes_read, &inbytes_remaining, error);
995 if (!utf8)
997 close_converter (cd);
998 if (bytes_written)
999 *bytes_written = 0;
1000 return NULL;
1003 /* Now the heart of the code. We loop through the UTF-8 string, and
1004 * whenever we hit an offending character, we form fallback, convert
1005 * the fallback to the target codeset, and then go back to
1006 * converting the original string after finishing with the fallback.
1008 * The variables save_p and save_inbytes store the input state
1009 * for the original string while we are converting the fallback
1011 p = utf8;
1013 outbuf_size = len + NUL_TERMINATOR_LENGTH;
1014 outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
1015 outp = dest = g_malloc (outbuf_size);
1017 while (!done && !have_error)
1019 gsize inbytes_tmp = inbytes_remaining;
1020 err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
1021 inbytes_remaining = inbytes_tmp;
1023 if (err == (gsize) -1)
1025 switch (errno)
1027 case EINVAL:
1028 g_assert_not_reached();
1029 break;
1030 case E2BIG:
1032 gsize used = outp - dest;
1034 outbuf_size *= 2;
1035 dest = g_realloc (dest, outbuf_size);
1037 outp = dest + used;
1038 outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
1040 break;
1042 case EILSEQ:
1043 if (save_p)
1045 /* Error converting fallback string - fatal
1047 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1048 _("Cannot convert fallback '%s' to codeset '%s'"),
1049 insert_str, to_codeset);
1050 have_error = TRUE;
1051 break;
1053 else if (p)
1055 if (!fallback)
1057 gunichar ch = g_utf8_get_char (p);
1058 insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x",
1059 ch);
1061 else
1062 insert_str = fallback;
1064 save_p = g_utf8_next_char (p);
1065 save_inbytes = inbytes_remaining - (save_p - p);
1066 p = insert_str;
1067 inbytes_remaining = strlen (p);
1068 break;
1070 /* fall thru if p is NULL */
1071 default:
1073 int errsv = errno;
1075 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
1076 _("Error during conversion: %s"),
1077 g_strerror (errsv));
1080 have_error = TRUE;
1081 break;
1084 else
1086 if (save_p)
1088 if (!fallback)
1089 g_free ((gchar *)insert_str);
1090 p = save_p;
1091 inbytes_remaining = save_inbytes;
1092 save_p = NULL;
1094 else if (p)
1096 /* call g_iconv with NULL inbuf to cleanup shift state */
1097 p = NULL;
1098 inbytes_remaining = 0;
1100 else
1101 done = TRUE;
1105 /* Cleanup
1107 memset (outp, 0, NUL_TERMINATOR_LENGTH);
1109 close_converter (cd);
1111 if (bytes_written)
1112 *bytes_written = outp - dest; /* Doesn't include '\0' */
1114 g_free (utf8);
1116 if (have_error)
1118 if (save_p && !fallback)
1119 g_free ((gchar *)insert_str);
1120 g_free (dest);
1121 return NULL;
1123 else
1124 return dest;
1128 * g_locale_to_utf8
1133 static gchar *
1134 strdup_len (const gchar *string,
1135 gssize len,
1136 gsize *bytes_written,
1137 gsize *bytes_read,
1138 GError **error)
1141 gsize real_len;
1143 if (!g_utf8_validate (string, len, NULL))
1145 if (bytes_read)
1146 *bytes_read = 0;
1147 if (bytes_written)
1148 *bytes_written = 0;
1150 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1151 _("Invalid byte sequence in conversion input"));
1152 return NULL;
1155 if (len < 0)
1156 real_len = strlen (string);
1157 else
1159 real_len = 0;
1161 while (real_len < len && string[real_len])
1162 real_len++;
1165 if (bytes_read)
1166 *bytes_read = real_len;
1167 if (bytes_written)
1168 *bytes_written = real_len;
1170 return g_strndup (string, real_len);
1174 * g_locale_to_utf8:
1175 * @opsysstring: a string in the encoding of the current locale. On Windows
1176 * this means the system codepage.
1177 * @len: the length of the string, or -1 if the string is
1178 * nul-terminated<footnoteref linkend="nul-unsafe"/>.
1179 * @bytes_read: location to store the number of bytes in the
1180 * input string that were successfully converted, or %NULL.
1181 * Even if the conversion was successful, this may be
1182 * less than @len if there were partial characters
1183 * at the end of the input. If the error
1184 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1185 * stored will the byte offset after the last valid
1186 * input sequence.
1187 * @bytes_written: the number of bytes stored in the output buffer (not
1188 * including the terminating nul).
1189 * @error: location to store the error occuring, or %NULL to ignore
1190 * errors. Any of the errors in #GConvertError may occur.
1192 * Converts a string which is in the encoding used for strings by
1193 * the C runtime (usually the same as that used by the operating
1194 * system) in the <link linkend="setlocale">current locale</link> into a
1195 * UTF-8 string.
1197 * Return value: The converted string, or %NULL on an error.
1199 gchar *
1200 g_locale_to_utf8 (const gchar *opsysstring,
1201 gssize len,
1202 gsize *bytes_read,
1203 gsize *bytes_written,
1204 GError **error)
1206 const char *charset;
1208 if (g_get_charset (&charset))
1209 return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1210 else
1211 return g_convert (opsysstring, len,
1212 "UTF-8", charset, bytes_read, bytes_written, error);
1216 * g_locale_from_utf8:
1217 * @utf8string: a UTF-8 encoded string
1218 * @len: the length of the string, or -1 if the string is
1219 * nul-terminated<footnoteref linkend="nul-unsafe"/>.
1220 * @bytes_read: location to store the number of bytes in the
1221 * input string that were successfully converted, or %NULL.
1222 * Even if the conversion was successful, this may be
1223 * less than @len if there were partial characters
1224 * at the end of the input. If the error
1225 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1226 * stored will the byte offset after the last valid
1227 * input sequence.
1228 * @bytes_written: the number of bytes stored in the output buffer (not
1229 * including the terminating nul).
1230 * @error: location to store the error occuring, or %NULL to ignore
1231 * errors. Any of the errors in #GConvertError may occur.
1233 * Converts a string from UTF-8 to the encoding used for strings by
1234 * the C runtime (usually the same as that used by the operating
1235 * system) in the <link linkend="setlocale">current locale</link>. On
1236 * Windows this means the system codepage.
1238 * Return value: The converted string, or %NULL on an error.
1240 gchar *
1241 g_locale_from_utf8 (const gchar *utf8string,
1242 gssize len,
1243 gsize *bytes_read,
1244 gsize *bytes_written,
1245 GError **error)
1247 const gchar *charset;
1249 if (g_get_charset (&charset))
1250 return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1251 else
1252 return g_convert (utf8string, len,
1253 charset, "UTF-8", bytes_read, bytes_written, error);
1256 #ifndef G_PLATFORM_WIN32
1258 typedef struct _GFilenameCharsetCache GFilenameCharsetCache;
1260 struct _GFilenameCharsetCache {
1261 gboolean is_utf8;
1262 gchar *charset;
1263 gchar **filename_charsets;
1266 static void
1267 filename_charset_cache_free (gpointer data)
1269 GFilenameCharsetCache *cache = data;
1270 g_free (cache->charset);
1271 g_strfreev (cache->filename_charsets);
1272 g_free (cache);
1276 * g_get_filename_charsets:
1277 * @charsets: return location for the %NULL-terminated list of encoding names
1279 * Determines the preferred character sets used for filenames.
1280 * The first character set from the @charsets is the filename encoding, the
1281 * subsequent character sets are used when trying to generate a displayable
1282 * representation of a filename, see g_filename_display_name().
1284 * On Unix, the character sets are determined by consulting the
1285 * environment variables <envar>G_FILENAME_ENCODING</envar> and
1286 * <envar>G_BROKEN_FILENAMES</envar>. On Windows, the character set
1287 * used in the GLib API is always UTF-8 and said environment variables
1288 * have no effect.
1290 * <envar>G_FILENAME_ENCODING</envar> may be set to a comma-separated list
1291 * of character set names. The special token "&commat;locale" is taken to
1292 * mean the character set for the <link linkend="setlocale">current
1293 * locale</link>. If <envar>G_FILENAME_ENCODING</envar> is not set, but
1294 * <envar>G_BROKEN_FILENAMES</envar> is, the character set of the current
1295 * locale is taken as the filename encoding. If neither environment variable
1296 * is set, UTF-8 is taken as the filename encoding, but the character
1297 * set of the current locale is also put in the list of encodings.
1299 * The returned @charsets belong to GLib and must not be freed.
1301 * Note that on Unix, regardless of the locale character set or
1302 * <envar>G_FILENAME_ENCODING</envar> value, the actual file names present
1303 * on a system might be in any random encoding or just gibberish.
1305 * Return value: %TRUE if the filename encoding is UTF-8.
1307 * Since: 2.6
1309 gboolean
1310 g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets)
1312 static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT;
1313 GFilenameCharsetCache *cache = g_static_private_get (&cache_private);
1314 const gchar *charset;
1316 if (!cache)
1318 cache = g_new0 (GFilenameCharsetCache, 1);
1319 g_static_private_set (&cache_private, cache, filename_charset_cache_free);
1322 g_get_charset (&charset);
1324 if (!(cache->charset && strcmp (cache->charset, charset) == 0))
1326 const gchar *new_charset;
1327 gchar *p;
1328 gint i;
1330 g_free (cache->charset);
1331 g_strfreev (cache->filename_charsets);
1332 cache->charset = g_strdup (charset);
1334 p = getenv ("G_FILENAME_ENCODING");
1335 if (p != NULL && p[0] != '\0')
1337 cache->filename_charsets = g_strsplit (p, ",", 0);
1338 cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);
1340 for (i = 0; cache->filename_charsets[i]; i++)
1342 if (strcmp ("@locale", cache->filename_charsets[i]) == 0)
1344 g_get_charset (&new_charset);
1345 g_free (cache->filename_charsets[i]);
1346 cache->filename_charsets[i] = g_strdup (new_charset);
1350 else if (getenv ("G_BROKEN_FILENAMES") != NULL)
1352 cache->filename_charsets = g_new0 (gchar *, 2);
1353 cache->is_utf8 = g_get_charset (&new_charset);
1354 cache->filename_charsets[0] = g_strdup (new_charset);
1356 else
1358 cache->filename_charsets = g_new0 (gchar *, 3);
1359 cache->is_utf8 = TRUE;
1360 cache->filename_charsets[0] = g_strdup ("UTF-8");
1361 if (!g_get_charset (&new_charset))
1362 cache->filename_charsets[1] = g_strdup (new_charset);
1366 if (filename_charsets)
1367 *filename_charsets = (const gchar **)cache->filename_charsets;
1369 return cache->is_utf8;
1372 #else /* G_PLATFORM_WIN32 */
1374 gboolean
1375 g_get_filename_charsets (G_CONST_RETURN gchar ***filename_charsets)
1377 static const gchar *charsets[] = {
1378 "UTF-8",
1379 NULL
1382 #ifdef G_OS_WIN32
1383 /* On Windows GLib pretends that the filename charset is UTF-8 */
1384 if (filename_charsets)
1385 *filename_charsets = charsets;
1387 return TRUE;
1388 #else
1389 gboolean result;
1391 /* Cygwin works like before */
1392 result = g_get_charset (&(charsets[0]));
1394 if (filename_charsets)
1395 *filename_charsets = charsets;
1397 return result;
1398 #endif
1401 #endif /* G_PLATFORM_WIN32 */
1403 static gboolean
1404 get_filename_charset (const gchar **filename_charset)
1406 const gchar **charsets;
1407 gboolean is_utf8;
1409 is_utf8 = g_get_filename_charsets (&charsets);
1411 if (filename_charset)
1412 *filename_charset = charsets[0];
1414 return is_utf8;
1417 /* This is called from g_thread_init(). It's used to
1418 * initialize some static data in a threadsafe way.
1420 void
1421 _g_convert_thread_init (void)
1423 const gchar **dummy;
1424 (void) g_get_filename_charsets (&dummy);
1428 * g_filename_to_utf8:
1429 * @opsysstring: a string in the encoding for filenames
1430 * @len: the length of the string, or -1 if the string is
1431 * nul-terminated<footnoteref linkend="nul-unsafe"/>.
1432 * @bytes_read: location to store the number of bytes in the
1433 * input string that were successfully converted, or %NULL.
1434 * Even if the conversion was successful, this may be
1435 * less than @len if there were partial characters
1436 * at the end of the input. If the error
1437 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1438 * stored will the byte offset after the last valid
1439 * input sequence.
1440 * @bytes_written: the number of bytes stored in the output buffer (not
1441 * including the terminating nul).
1442 * @error: location to store the error occuring, or %NULL to ignore
1443 * errors. Any of the errors in #GConvertError may occur.
1445 * Converts a string which is in the encoding used by GLib for
1446 * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
1447 * for filenames; on other platforms, this function indirectly depends on
1448 * the <link linkend="setlocale">current locale</link>.
1450 * Return value: The converted string, or %NULL on an error.
1452 gchar*
1453 g_filename_to_utf8 (const gchar *opsysstring,
1454 gssize len,
1455 gsize *bytes_read,
1456 gsize *bytes_written,
1457 GError **error)
1459 const gchar *charset;
1461 if (get_filename_charset (&charset))
1462 return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1463 else
1464 return g_convert (opsysstring, len,
1465 "UTF-8", charset, bytes_read, bytes_written, error);
1468 #if defined (G_OS_WIN32) && !defined (_WIN64)
1470 #undef g_filename_to_utf8
1472 /* Binary compatibility version. Not for newly compiled code. Also not needed for
1473 * 64-bit versions as there should be no old deployed binaries that would use
1474 * the old versions.
1477 gchar*
1478 g_filename_to_utf8 (const gchar *opsysstring,
1479 gssize len,
1480 gsize *bytes_read,
1481 gsize *bytes_written,
1482 GError **error)
1484 const gchar *charset;
1486 if (g_get_charset (&charset))
1487 return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1488 else
1489 return g_convert (opsysstring, len,
1490 "UTF-8", charset, bytes_read, bytes_written, error);
1493 #endif
1496 * g_filename_from_utf8:
1497 * @utf8string: a UTF-8 encoded string.
1498 * @len: the length of the string, or -1 if the string is
1499 * nul-terminated.
1500 * @bytes_read: location to store the number of bytes in the
1501 * input string that were successfully converted, or %NULL.
1502 * Even if the conversion was successful, this may be
1503 * less than @len if there were partial characters
1504 * at the end of the input. If the error
1505 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1506 * stored will the byte offset after the last valid
1507 * input sequence.
1508 * @bytes_written: the number of bytes stored in the output buffer (not
1509 * including the terminating nul).
1510 * @error: location to store the error occuring, or %NULL to ignore
1511 * errors. Any of the errors in #GConvertError may occur.
1513 * Converts a string from UTF-8 to the encoding GLib uses for
1514 * filenames. Note that on Windows GLib uses UTF-8 for filenames;
1515 * on other platforms, this function indirectly depends on the
1516 * <link linkend="setlocale">current locale</link>.
1518 * Return value: The converted string, or %NULL on an error.
1520 gchar*
1521 g_filename_from_utf8 (const gchar *utf8string,
1522 gssize len,
1523 gsize *bytes_read,
1524 gsize *bytes_written,
1525 GError **error)
1527 const gchar *charset;
1529 if (get_filename_charset (&charset))
1530 return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1531 else
1532 return g_convert (utf8string, len,
1533 charset, "UTF-8", bytes_read, bytes_written, error);
1536 #if defined (G_OS_WIN32) && !defined (_WIN64)
1538 #undef g_filename_from_utf8
1540 /* Binary compatibility version. Not for newly compiled code. */
1542 gchar*
1543 g_filename_from_utf8 (const gchar *utf8string,
1544 gssize len,
1545 gsize *bytes_read,
1546 gsize *bytes_written,
1547 GError **error)
1549 const gchar *charset;
1551 if (g_get_charset (&charset))
1552 return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1553 else
1554 return g_convert (utf8string, len,
1555 charset, "UTF-8", bytes_read, bytes_written, error);
1558 #endif
1560 /* Test of haystack has the needle prefix, comparing case
1561 * insensitive. haystack may be UTF-8, but needle must
1562 * contain only ascii. */
1563 static gboolean
1564 has_case_prefix (const gchar *haystack, const gchar *needle)
1566 const gchar *h, *n;
1568 /* Eat one character at a time. */
1569 h = haystack;
1570 n = needle;
1572 while (*n && *h &&
1573 g_ascii_tolower (*n) == g_ascii_tolower (*h))
1575 n++;
1576 h++;
1579 return *n == '\0';
/* Bit flags selecting which column of the acceptable[] table decides
 * whether a character may be left unescaped.
 */
typedef enum {
  UNSAFE_ALL        = 1 << 0,  /* Escape all unsafe characters   */
  UNSAFE_ALLOW_PLUS = 1 << 1,  /* Allows '+'  */
  UNSAFE_PATH       = 1 << 3,  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_HOST       = 1 << 4,  /* Allows '/' and ':' and '@' */
  UNSAFE_SLASHES    = 1 << 5   /* Allows all characters except for '/' and '%' */
} UnsafeCharacterSet;
1590 static const guchar acceptable[96] = {
1591 /* A table of the ASCII chars from space (32) to DEL (127) */
1592 /* ! " # $ % & ' ( ) * + , - . / */
1593 0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C,
1594 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
1595 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20,
1596 /* @ A B C D E F G H I J K L M N O */
1597 0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1598 /* P Q R S T U V W X Y Z [ \ ] ^ _ */
1599 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,
1600 /* ` a b c d e f g h i j k l m n o */
1601 0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1602 /* p q r s t u v w x y z { | } ~ DEL */
1603 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20
1606 static const gchar hex[16] = "0123456789ABCDEF";
1608 /* Note: This escape function works on file: URIs, but if you want to
1609 * escape something else, please read RFC-2396 */
1610 static gchar *
1611 g_escape_uri_string (const gchar *string,
1612 UnsafeCharacterSet mask)
1614 #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
1616 const gchar *p;
1617 gchar *q;
1618 gchar *result;
1619 int c;
1620 gint unacceptable;
1621 UnsafeCharacterSet use_mask;
1623 g_return_val_if_fail (mask == UNSAFE_ALL
1624 || mask == UNSAFE_ALLOW_PLUS
1625 || mask == UNSAFE_PATH
1626 || mask == UNSAFE_HOST
1627 || mask == UNSAFE_SLASHES, NULL);
1629 unacceptable = 0;
1630 use_mask = mask;
1631 for (p = string; *p != '\0'; p++)
1633 c = (guchar) *p;
1634 if (!ACCEPTABLE (c))
1635 unacceptable++;
1638 result = g_malloc (p - string + unacceptable * 2 + 1);
1640 use_mask = mask;
1641 for (q = result, p = string; *p != '\0'; p++)
1643 c = (guchar) *p;
1645 if (!ACCEPTABLE (c))
1647 *q++ = '%'; /* means hex coming */
1648 *q++ = hex[c >> 4];
1649 *q++ = hex[c & 15];
1651 else
1652 *q++ = *p;
1655 *q = '\0';
1657 return result;
1661 static gchar *
1662 g_escape_file_uri (const gchar *hostname,
1663 const gchar *pathname)
1665 char *escaped_hostname = NULL;
1666 char *escaped_path;
1667 char *res;
1669 #ifdef G_OS_WIN32
1670 char *p, *backslash;
1672 /* Turn backslashes into forward slashes. That's what Netscape
1673 * does, and they are actually more or less equivalent in Windows.
1676 pathname = g_strdup (pathname);
1677 p = (char *) pathname;
1679 while ((backslash = strchr (p, '\\')) != NULL)
1681 *backslash = '/';
1682 p = backslash + 1;
1684 #endif
1686 if (hostname && *hostname != '\0')
1688 escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
1691 escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH);
1693 res = g_strconcat ("file://",
1694 (escaped_hostname) ? escaped_hostname : "",
1695 (*escaped_path != '/') ? "/" : "",
1696 escaped_path,
1697 NULL);
1699 #ifdef G_OS_WIN32
1700 g_free ((char *) pathname);
1701 #endif
1703 g_free (escaped_hostname);
1704 g_free (escaped_path);
1706 return res;
/* Decodes the two hex digits at scanner[0..1] into a byte value.
 * Returns -1 if either character is not a hex digit; the second
 * character is only examined when the first one is valid.
 */
static int
unescape_character (const char *scanner)
{
  int hi;
  int lo;

  hi = g_ascii_xdigit_value (scanner[0]);
  if (hi < 0)
    return -1;

  lo = g_ascii_xdigit_value (scanner[1]);
  if (lo < 0)
    return -1;

  return (hi << 4) | lo;
}
1726 static gchar *
1727 g_unescape_uri_string (const char *escaped,
1728 int len,
1729 const char *illegal_escaped_characters,
1730 gboolean ascii_must_not_be_escaped)
1732 const gchar *in, *in_end;
1733 gchar *out, *result;
1734 int c;
1736 if (escaped == NULL)
1737 return NULL;
1739 if (len < 0)
1740 len = strlen (escaped);
1742 result = g_malloc (len + 1);
1744 out = result;
1745 for (in = escaped, in_end = escaped + len; in < in_end; in++)
1747 c = *in;
1749 if (c == '%')
1751 /* catch partial escape sequences past the end of the substring */
1752 if (in + 3 > in_end)
1753 break;
1755 c = unescape_character (in + 1);
1757 /* catch bad escape sequences and NUL characters */
1758 if (c <= 0)
1759 break;
1761 /* catch escaped ASCII */
1762 if (ascii_must_not_be_escaped && c <= 0x7F)
1763 break;
1765 /* catch other illegal escaped characters */
1766 if (strchr (illegal_escaped_characters, c) != NULL)
1767 break;
1769 in += 2;
1772 *out++ = c;
1775 g_assert (out - result <= len);
1776 *out = '\0';
1778 if (in != in_end)
1780 g_free (result);
1781 return NULL;
1784 return result;
1787 static gboolean
1788 is_asciialphanum (gunichar c)
1790 return c <= 0x7F && g_ascii_isalnum (c);
1793 static gboolean
1794 is_asciialpha (gunichar c)
1796 return c <= 0x7F && g_ascii_isalpha (c);
1799 /* allows an empty string */
1800 static gboolean
1801 hostname_validate (const char *hostname)
1803 const char *p;
1804 gunichar c, first_char, last_char;
1806 p = hostname;
1807 if (*p == '\0')
1808 return TRUE;
1811 /* read in a label */
1812 c = g_utf8_get_char (p);
1813 p = g_utf8_next_char (p);
1814 if (!is_asciialphanum (c))
1815 return FALSE;
1816 first_char = c;
1819 last_char = c;
1820 c = g_utf8_get_char (p);
1821 p = g_utf8_next_char (p);
1823 while (is_asciialphanum (c) || c == '-');
1824 if (last_char == '-')
1825 return FALSE;
1827 /* if that was the last label, check that it was a toplabel */
1828 if (c == '\0' || (c == '.' && *p == '\0'))
1829 return is_asciialpha (first_char);
1831 while (c == '.');
1832 return FALSE;
1836 * g_filename_from_uri:
1837 * @uri: a uri describing a filename (escaped, encoded in ASCII).
1838 * @hostname: Location to store hostname for the URI, or %NULL.
1839 * If there is no hostname in the URI, %NULL will be
1840 * stored in this location.
1841 * @error: location to store the error occuring, or %NULL to ignore
1842 * errors. Any of the errors in #GConvertError may occur.
1844 * Converts an escaped ASCII-encoded URI to a local filename in the
1845 * encoding used for filenames.
1847 * Return value: a newly-allocated string holding the resulting
1848 * filename, or %NULL on an error.
1850 gchar *
1851 g_filename_from_uri (const gchar *uri,
1852 gchar **hostname,
1853 GError **error)
1855 const char *path_part;
1856 const char *host_part;
1857 char *unescaped_hostname;
1858 char *result;
1859 char *filename;
1860 int offs;
1861 #ifdef G_OS_WIN32
1862 char *p, *slash;
1863 #endif
1865 if (hostname)
1866 *hostname = NULL;
1868 if (!has_case_prefix (uri, "file:/"))
1870 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1871 _("The URI '%s' is not an absolute URI using the \"file\" scheme"),
1872 uri);
1873 return NULL;
1876 path_part = uri + strlen ("file:");
1878 if (strchr (path_part, '#') != NULL)
1880 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1881 _("The local file URI '%s' may not include a '#'"),
1882 uri);
1883 return NULL;
1886 if (has_case_prefix (path_part, "///"))
1887 path_part += 2;
1888 else if (has_case_prefix (path_part, "//"))
1890 path_part += 2;
1891 host_part = path_part;
1893 path_part = strchr (path_part, '/');
1895 if (path_part == NULL)
1897 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1898 _("The URI '%s' is invalid"),
1899 uri);
1900 return NULL;
1903 unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE);
1905 if (unescaped_hostname == NULL ||
1906 !hostname_validate (unescaped_hostname))
1908 g_free (unescaped_hostname);
1909 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1910 _("The hostname of the URI '%s' is invalid"),
1911 uri);
1912 return NULL;
1915 if (hostname)
1916 *hostname = unescaped_hostname;
1917 else
1918 g_free (unescaped_hostname);
1921 filename = g_unescape_uri_string (path_part, -1, "/", FALSE);
1923 if (filename == NULL)
1925 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1926 _("The URI '%s' contains invalidly escaped characters"),
1927 uri);
1928 return NULL;
1931 offs = 0;
1932 #ifdef G_OS_WIN32
1933 /* Drop localhost */
1934 if (hostname && *hostname != NULL &&
1935 g_ascii_strcasecmp (*hostname, "localhost") == 0)
1937 g_free (*hostname);
1938 *hostname = NULL;
1941 /* Turn slashes into backslashes, because that's the canonical spelling */
1942 p = filename;
1943 while ((slash = strchr (p, '/')) != NULL)
1945 *slash = '\\';
1946 p = slash + 1;
1949 /* Windows URIs with a drive letter can be like "file://host/c:/foo"
1950 * or "file://host/c|/foo" (some Netscape versions). In those cases, start
1951 * the filename from the drive letter.
1953 if (g_ascii_isalpha (filename[1]))
1955 if (filename[2] == ':')
1956 offs = 1;
1957 else if (filename[2] == '|')
1959 filename[2] = ':';
1960 offs = 1;
1963 #endif
1965 result = g_strdup (filename + offs);
1966 g_free (filename);
1968 return result;
1971 #if defined (G_OS_WIN32) && !defined (_WIN64)
1973 #undef g_filename_from_uri
1975 gchar *
1976 g_filename_from_uri (const gchar *uri,
1977 gchar **hostname,
1978 GError **error)
1980 gchar *utf8_filename;
1981 gchar *retval = NULL;
1983 utf8_filename = g_filename_from_uri_utf8 (uri, hostname, error);
1984 if (utf8_filename)
1986 retval = g_locale_from_utf8 (utf8_filename, -1, NULL, NULL, error);
1987 g_free (utf8_filename);
1989 return retval;
1992 #endif
1995 * g_filename_to_uri:
1996 * @filename: an absolute filename specified in the GLib file name encoding,
1997 * which is the on-disk file name bytes on Unix, and UTF-8 on
1998 * Windows
1999 * @hostname: A UTF-8 encoded hostname, or %NULL for none.
2000 * @error: location to store the error occuring, or %NULL to ignore
2001 * errors. Any of the errors in #GConvertError may occur.
2003 * Converts an absolute filename to an escaped ASCII-encoded URI, with the path
2004 * component following Section 3.3. of RFC 2396.
2006 * Return value: a newly-allocated string holding the resulting
2007 * URI, or %NULL on an error.
2009 gchar *
2010 g_filename_to_uri (const gchar *filename,
2011 const gchar *hostname,
2012 GError **error)
2014 char *escaped_uri;
2016 g_return_val_if_fail (filename != NULL, NULL);
2018 if (!g_path_is_absolute (filename))
2020 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
2021 _("The pathname '%s' is not an absolute path"),
2022 filename);
2023 return NULL;
2026 if (hostname &&
2027 !(g_utf8_validate (hostname, -1, NULL)
2028 && hostname_validate (hostname)))
2030 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
2031 _("Invalid hostname"));
2032 return NULL;
2035 #ifdef G_OS_WIN32
2036 /* Don't use localhost unnecessarily */
2037 if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0)
2038 hostname = NULL;
2039 #endif
2041 escaped_uri = g_escape_file_uri (hostname, filename);
2043 return escaped_uri;
2046 #if defined (G_OS_WIN32) && !defined (_WIN64)
2048 #undef g_filename_to_uri
2050 gchar *
2051 g_filename_to_uri (const gchar *filename,
2052 const gchar *hostname,
2053 GError **error)
2055 gchar *utf8_filename;
2056 gchar *retval = NULL;
2058 utf8_filename = g_locale_to_utf8 (filename, -1, NULL, NULL, error);
2060 if (utf8_filename)
2062 retval = g_filename_to_uri_utf8 (utf8_filename, hostname, error);
2063 g_free (utf8_filename);
2066 return retval;
2069 #endif
2072 * g_uri_list_extract_uris:
2073 * @uri_list: an URI list
2075 * Splits an URI list conforming to the text/uri-list
2076 * mime type defined in RFC 2483 into individual URIs,
2077 * discarding any comments. The URIs are not validated.
2079 * Returns: a newly allocated %NULL-terminated list of
2080 * strings holding the individual URIs. The array should
2081 * be freed with g_strfreev().
2083 * Since: 2.6
2085 gchar **
2086 g_uri_list_extract_uris (const gchar *uri_list)
2088 GSList *uris, *u;
2089 const gchar *p, *q;
2090 gchar **result;
2091 gint n_uris = 0;
2093 uris = NULL;
2095 p = uri_list;
2097 /* We don't actually try to validate the URI according to RFC
2098 * 2396, or even check for allowed characters - we just ignore
2099 * comments and trim whitespace off the ends. We also
2100 * allow LF delimination as well as the specified CRLF.
2102 * We do allow comments like specified in RFC 2483.
2104 while (p)
2106 if (*p != '#')
2108 while (g_ascii_isspace (*p))
2109 p++;
2111 q = p;
2112 while (*q && (*q != '\n') && (*q != '\r'))
2113 q++;
2115 if (q > p)
2117 q--;
2118 while (q > p && g_ascii_isspace (*q))
2119 q--;
2121 if (q > p)
2123 uris = g_slist_prepend (uris, g_strndup (p, q - p + 1));
2124 n_uris++;
2128 p = strchr (p, '\n');
2129 if (p)
2130 p++;
2133 result = g_new (gchar *, n_uris + 1);
2135 result[n_uris--] = NULL;
2136 for (u = uris; u; u = u->next)
2137 result[n_uris--] = u->data;
2139 g_slist_free (uris);
2141 return result;
2145 * g_filename_display_basename:
2146 * @filename: an absolute pathname in the GLib file name encoding
2148 * Returns the display basename for the particular filename, guaranteed
2149 * to be valid UTF-8. The display name might not be identical to the filename,
2150 * for instance there might be problems converting it to UTF-8, and some files
2151 * can be translated in the display.
2153 * If GLib can not make sense of the encoding of @filename, as a last resort it
2154 * replaces unknown characters with U+FFFD, the Unicode replacement character.
2155 * You can search the result for the UTF-8 encoding of this character (which is
2156 * "\357\277\275" in octal notation) to find out if @filename was in an invalid
2157 * encoding.
2159 * You must pass the whole absolute pathname to this functions so that
2160 * translation of well known locations can be done.
2162 * This function is preferred over g_filename_display_name() if you know the
2163 * whole path, as it allows translation.
2165 * Return value: a newly allocated string containing
2166 * a rendition of the basename of the filename in valid UTF-8
2168 * Since: 2.6
2170 gchar *
2171 g_filename_display_basename (const gchar *filename)
2173 char *basename;
2174 char *display_name;
2176 g_return_val_if_fail (filename != NULL, NULL);
2178 basename = g_path_get_basename (filename);
2179 display_name = g_filename_display_name (basename);
2180 g_free (basename);
2181 return display_name;
2185 * g_filename_display_name:
2186 * @filename: a pathname hopefully in the GLib file name encoding
2188 * Converts a filename into a valid UTF-8 string. The conversion is
2189 * not necessarily reversible, so you should keep the original around
2190 * and use the return value of this function only for display purposes.
2191 * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL
2192 * even if the filename actually isn't in the GLib file name encoding.
2194 * If GLib can not make sense of the encoding of @filename, as a last resort it
2195 * replaces unknown characters with U+FFFD, the Unicode replacement character.
2196 * You can search the result for the UTF-8 encoding of this character (which is
2197 * "\357\277\275" in octal notation) to find out if @filename was in an invalid
2198 * encoding.
2200 * If you know the whole pathname of the file you should use
2201 * g_filename_display_basename(), since that allows location-based
2202 * translation of filenames.
2204 * Return value: a newly allocated string containing
2205 * a rendition of the filename in valid UTF-8
2207 * Since: 2.6
2209 gchar *
2210 g_filename_display_name (const gchar *filename)
2212 gint i;
2213 const gchar **charsets;
2214 gchar *display_name = NULL;
2215 gboolean is_utf8;
2217 is_utf8 = g_get_filename_charsets (&charsets);
2219 if (is_utf8)
2221 if (g_utf8_validate (filename, -1, NULL))
2222 display_name = g_strdup (filename);
2225 if (!display_name)
2227 /* Try to convert from the filename charsets to UTF-8.
2228 * Skip the first charset if it is UTF-8.
2230 for (i = is_utf8 ? 1 : 0; charsets[i]; i++)
2232 display_name = g_convert (filename, -1, "UTF-8", charsets[i],
2233 NULL, NULL, NULL);
2235 if (display_name)
2236 break;
2240 /* if all conversions failed, we replace invalid UTF-8
2241 * by a question mark
2243 if (!display_name)
2244 display_name = _g_utf8_make_valid (filename);
2246 return display_name;