utf8: add unit test for g_utf8_make_valid
[glib.git] / glib / ghostutils.c
blob4be59f7eec994b2b1d452568090a0f88ccc7b57e
1 /* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*- */
3 /* GLIB - Library of useful routines for C programming
4 * Copyright (C) 2008 Red Hat, Inc.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General
17 * Public License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "config.h"
22 #include <string.h>
24 #include "ghostutils.h"
26 #include "garray.h"
27 #include "gmem.h"
28 #include "gstring.h"
29 #include "gstrfuncs.h"
30 #include "glibintl.h"
33 /**
34 * SECTION:ghostutils
35 * @short_description: Internet hostname utilities
37 * Functions for manipulating internet hostnames; in particular, for
38 * converting between Unicode and ASCII-encoded forms of
39 * Internationalized Domain Names (IDNs).
41 * The
42 * [Internationalized Domain Names for Applications (IDNA)](http://www.ietf.org/rfc/rfc3490.txt)
43 * standards allow for the use
44 * of Unicode domain names in applications, while providing
45 * backward-compatibility with the old ASCII-only DNS, by defining an
46 * ASCII-Compatible Encoding of any given Unicode name, which can be
47 * used with non-IDN-aware applications and protocols. (For example,
48 * "Παν語.org" maps to "xn--4wa8awb4637h.org".)
49 **/
/* Prefix that marks a label as ASCII-compatible-encoded, and its length. */
#define IDNA_ACE_PREFIX     "xn--"
#define IDNA_ACE_PREFIX_LEN 4

/* Punycode constants, from RFC 3492. */

#define PUNYCODE_BASE         36
#define PUNYCODE_TMIN         1
#define PUNYCODE_TMAX         26
#define PUNYCODE_SKEW         38
#define PUNYCODE_DAMP         700
#define PUNYCODE_INITIAL_BIAS 72
#define PUNYCODE_INITIAL_N    0x80

/* A code point counts as "basic" when its value is below 0x80. */
#define PUNYCODE_IS_BASIC(cp) ((guint)(cp) < 0x80)
66 /* Encode/decode a single base-36 digit */
67 static inline gchar
68 encode_digit (guint dig)
70 if (dig < 26)
71 return dig + 'a';
72 else
73 return dig - 26 + '0';
76 static inline guint
77 decode_digit (gchar dig)
79 if (dig >= 'A' && dig <= 'Z')
80 return dig - 'A';
81 else if (dig >= 'a' && dig <= 'z')
82 return dig - 'a';
83 else if (dig >= '0' && dig <= '9')
84 return dig - '0' + 26;
85 else
86 return G_MAXUINT;
89 /* Punycode bias adaptation algorithm, RFC 3492 section 6.1 */
90 static guint
91 adapt (guint delta,
92 guint numpoints,
93 gboolean firsttime)
95 guint k;
97 delta = firsttime ? delta / PUNYCODE_DAMP : delta / 2;
98 delta += delta / numpoints;
100 k = 0;
101 while (delta > ((PUNYCODE_BASE - PUNYCODE_TMIN) * PUNYCODE_TMAX) / 2)
103 delta /= PUNYCODE_BASE - PUNYCODE_TMIN;
104 k += PUNYCODE_BASE;
107 return k + ((PUNYCODE_BASE - PUNYCODE_TMIN + 1) * delta /
108 (delta + PUNYCODE_SKEW));
111 /* Punycode encoder, RFC 3492 section 6.3. The algorithm is
112 * sufficiently bizarre that it's not really worth trying to explain
113 * here.
115 static gboolean
116 punycode_encode (const gchar *input_utf8,
117 gsize input_utf8_length,
118 GString *output)
120 guint delta, handled_chars, num_basic_chars, bias, j, q, k, t, digit;
121 gunichar n, m, *input;
122 glong input_length;
123 gboolean success = FALSE;
125 /* Convert from UTF-8 to Unicode code points */
126 input = g_utf8_to_ucs4 (input_utf8, input_utf8_length, NULL,
127 &input_length, NULL);
128 if (!input)
129 return FALSE;
131 /* Copy basic chars */
132 for (j = num_basic_chars = 0; j < input_length; j++)
134 if (PUNYCODE_IS_BASIC (input[j]))
136 g_string_append_c (output, g_ascii_tolower (input[j]));
137 num_basic_chars++;
140 if (num_basic_chars)
141 g_string_append_c (output, '-');
143 handled_chars = num_basic_chars;
145 /* Encode non-basic chars */
146 delta = 0;
147 bias = PUNYCODE_INITIAL_BIAS;
148 n = PUNYCODE_INITIAL_N;
149 while (handled_chars < input_length)
151 /* let m = the minimum {non-basic} code point >= n in the input */
152 for (m = G_MAXUINT, j = 0; j < input_length; j++)
154 if (input[j] >= n && input[j] < m)
155 m = input[j];
158 if (m - n > (G_MAXUINT - delta) / (handled_chars + 1))
159 goto fail;
160 delta += (m - n) * (handled_chars + 1);
161 n = m;
163 for (j = 0; j < input_length; j++)
165 if (input[j] < n)
167 if (++delta == 0)
168 goto fail;
170 else if (input[j] == n)
172 q = delta;
173 for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE)
175 if (k <= bias)
176 t = PUNYCODE_TMIN;
177 else if (k >= bias + PUNYCODE_TMAX)
178 t = PUNYCODE_TMAX;
179 else
180 t = k - bias;
181 if (q < t)
182 break;
183 digit = t + (q - t) % (PUNYCODE_BASE - t);
184 g_string_append_c (output, encode_digit (digit));
185 q = (q - t) / (PUNYCODE_BASE - t);
188 g_string_append_c (output, encode_digit (q));
189 bias = adapt (delta, handled_chars + 1, handled_chars == num_basic_chars);
190 delta = 0;
191 handled_chars++;
195 delta++;
196 n++;
199 success = TRUE;
201 fail:
202 g_free (input);
203 return success;
/* Code points that remove_junk() strips from a name.
 * From RFC 3454, Table B.1.
 */
#define idna_is_junk(ch) ((ch) == 0x00AD || \
                          (ch) == 0x1806 || \
                          (ch) == 0x200B || \
                          (ch) == 0x2060 || \
                          (ch) == 0xFEFF || \
                          (ch) == 0x034F || \
                          (ch) == 0x180B || \
                          (ch) == 0x180C || \
                          (ch) == 0x180D || \
                          (ch) == 0x200C || \
                          (ch) == 0x200D || \
                          ((ch) >= 0xFE00 && (ch) <= 0xFE0F))
209 /* Scan @str for "junk" and return a cleaned-up string if any junk
210 * is found. Else return %NULL.
212 static gchar *
213 remove_junk (const gchar *str,
214 gint len)
216 GString *cleaned = NULL;
217 const gchar *p;
218 gunichar ch;
220 for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p))
222 ch = g_utf8_get_char (p);
223 if (idna_is_junk (ch))
225 if (!cleaned)
227 cleaned = g_string_new (NULL);
228 g_string_append_len (cleaned, str, p - str);
231 else if (cleaned)
232 g_string_append_unichar (cleaned, ch);
235 if (cleaned)
236 return g_string_free (cleaned, FALSE);
237 else
238 return NULL;
241 static inline gboolean
242 contains_uppercase_letters (const gchar *str,
243 gint len)
245 const gchar *p;
247 for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p))
249 if (g_unichar_isupper (g_utf8_get_char (p)))
250 return TRUE;
252 return FALSE;
255 static inline gboolean
256 contains_non_ascii (const gchar *str,
257 gint len)
259 const gchar *p;
261 for (p = str; len == -1 ? *p : p < str + len; p++)
263 if ((guchar)*p > 0x80)
264 return TRUE;
266 return FALSE;
269 /* RFC 3454, Appendix C. ish. */
270 static inline gboolean
271 idna_is_prohibited (gunichar ch)
273 switch (g_unichar_type (ch))
275 case G_UNICODE_CONTROL:
276 case G_UNICODE_FORMAT:
277 case G_UNICODE_UNASSIGNED:
278 case G_UNICODE_PRIVATE_USE:
279 case G_UNICODE_SURROGATE:
280 case G_UNICODE_LINE_SEPARATOR:
281 case G_UNICODE_PARAGRAPH_SEPARATOR:
282 case G_UNICODE_SPACE_SEPARATOR:
283 return TRUE;
285 case G_UNICODE_OTHER_SYMBOL:
286 if (ch == 0xFFFC || ch == 0xFFFD ||
287 (ch >= 0x2FF0 && ch <= 0x2FFB))
288 return TRUE;
289 return FALSE;
291 case G_UNICODE_NON_SPACING_MARK:
292 if (ch == 0x0340 || ch == 0x0341)
293 return TRUE;
294 return FALSE;
296 default:
297 return FALSE;
301 /* RFC 3491 IDN cleanup algorithm. */
302 static gchar *
303 nameprep (const gchar *hostname,
304 gint len,
305 gboolean *is_unicode)
307 gchar *name, *tmp = NULL, *p;
309 /* It would be nice if we could do this without repeatedly
310 * allocating strings and converting back and forth between
311 * gunichars and UTF-8... The code does at least avoid doing most of
312 * the sub-operations when they would just be equivalent to a
313 * g_strdup().
316 /* Remove presentation-only characters */
317 name = remove_junk (hostname, len);
318 if (name)
320 tmp = name;
321 len = -1;
323 else
324 name = (gchar *)hostname;
326 /* Convert to lowercase */
327 if (contains_uppercase_letters (name, len))
329 name = g_utf8_strdown (name, len);
330 g_free (tmp);
331 tmp = name;
332 len = -1;
335 /* If there are no UTF8 characters, we're done. */
336 if (!contains_non_ascii (name, len))
338 *is_unicode = FALSE;
339 if (name == (gchar *)hostname)
340 return len == -1 ? g_strdup (hostname) : g_strndup (hostname, len);
341 else
342 return name;
345 *is_unicode = TRUE;
347 /* Normalize */
348 name = g_utf8_normalize (name, len, G_NORMALIZE_NFKC);
349 g_free (tmp);
350 tmp = name;
352 if (!name)
353 return NULL;
355 /* KC normalization may have created more capital letters (eg,
356 * angstrom -> capital A with ring). So we have to lowercasify a
357 * second time. (This is more-or-less how the nameprep algorithm
358 * does it. If tolower(nfkc(tolower(X))) is guaranteed to be the
359 * same as tolower(nfkc(X)), then we could skip the first tolower,
360 * but I'm not sure it is.)
362 if (contains_uppercase_letters (name, -1))
364 name = g_utf8_strdown (name, -1);
365 g_free (tmp);
366 tmp = name;
369 /* Check for prohibited characters */
370 for (p = name; *p; p = g_utf8_next_char (p))
372 if (idna_is_prohibited (g_utf8_get_char (p)))
374 name = NULL;
375 g_free (tmp);
376 goto done;
380 /* FIXME: We're supposed to verify certain constraints on bidi
381 * characters, but glib does not appear to have that information.
384 done:
385 return name;
/* Per RFC 3490, section 3.1, '.', 0x3002, 0xFF0E, and 0xFF61 all act
 * as label separators.  The last three are matched through their
 * three-byte UTF-8 encodings, so @str must be '\0'-terminated.
 */
#define idna_is_dot(str) (                                      \
  ((guchar)(str)[0] == '.') ||                                  \
  ((guchar)(str)[0] == 0xE3 && (guchar)(str)[1] == 0x80 &&      \
   (guchar)(str)[2] == 0x82) ||                                 \
  ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBC &&      \
   (guchar)(str)[2] == 0x8E) ||                                 \
  ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBD &&      \
   (guchar)(str)[2] == 0xA1) )
397 static const gchar *
398 idna_end_of_label (const gchar *str)
400 for (; *str; str = g_utf8_next_char (str))
402 if (idna_is_dot (str))
403 return str;
405 return str;
409 * g_hostname_to_ascii:
410 * @hostname: a valid UTF-8 or ASCII hostname
412 * Converts @hostname to its canonical ASCII form; an ASCII-only
413 * string containing no uppercase letters and not ending with a
414 * trailing dot.
416 * Returns: an ASCII hostname, which must be freed, or %NULL if
417 * @hostname is in some way invalid.
419 * Since: 2.22
421 gchar *
422 g_hostname_to_ascii (const gchar *hostname)
424 gchar *name, *label, *p;
425 GString *out;
426 gssize llen, oldlen;
427 gboolean unicode;
429 label = name = nameprep (hostname, -1, &unicode);
430 if (!name || !unicode)
431 return name;
433 out = g_string_new (NULL);
437 unicode = FALSE;
438 for (p = label; *p && !idna_is_dot (p); p++)
440 if ((guchar)*p > 0x80)
441 unicode = TRUE;
444 oldlen = out->len;
445 llen = p - label;
446 if (unicode)
448 if (!strncmp (label, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN))
449 goto fail;
451 g_string_append (out, IDNA_ACE_PREFIX);
452 if (!punycode_encode (label, llen, out))
453 goto fail;
455 else
456 g_string_append_len (out, label, llen);
458 if (out->len - oldlen > 63)
459 goto fail;
461 label += llen;
462 if (*label)
463 label = g_utf8_next_char (label);
464 if (*label)
465 g_string_append_c (out, '.');
467 while (*label);
469 g_free (name);
470 return g_string_free (out, FALSE);
472 fail:
473 g_free (name);
474 g_string_free (out, TRUE);
475 return NULL;
479 * g_hostname_is_non_ascii:
480 * @hostname: a hostname
482 * Tests if @hostname contains Unicode characters. If this returns
483 * %TRUE, you need to encode the hostname with g_hostname_to_ascii()
484 * before using it in non-IDN-aware contexts.
486 * Note that a hostname might contain a mix of encoded and unencoded
487 * segments, and so it is possible for g_hostname_is_non_ascii() and
488 * g_hostname_is_ascii_encoded() to both return %TRUE for a name.
490 * Returns: %TRUE if @hostname contains any non-ASCII characters
492 * Since: 2.22
494 gboolean
495 g_hostname_is_non_ascii (const gchar *hostname)
497 return contains_non_ascii (hostname, -1);
500 /* Punycode decoder, RFC 3492 section 6.2. As with punycode_encode(),
501 * read the RFC if you want to understand what this is actually doing.
503 static gboolean
504 punycode_decode (const gchar *input,
505 gsize input_length,
506 GString *output)
508 GArray *output_chars;
509 gunichar n;
510 guint i, bias;
511 guint oldi, w, k, digit, t;
512 const gchar *split;
514 n = PUNYCODE_INITIAL_N;
515 i = 0;
516 bias = PUNYCODE_INITIAL_BIAS;
518 split = input + input_length - 1;
519 while (split > input && *split != '-')
520 split--;
521 if (split > input)
523 output_chars = g_array_sized_new (FALSE, FALSE, sizeof (gunichar),
524 split - input);
525 input_length -= (split - input) + 1;
526 while (input < split)
528 gunichar ch = (gunichar)*input++;
529 if (!PUNYCODE_IS_BASIC (ch))
530 goto fail;
531 g_array_append_val (output_chars, ch);
533 input++;
535 else
536 output_chars = g_array_new (FALSE, FALSE, sizeof (gunichar));
538 while (input_length)
540 oldi = i;
541 w = 1;
542 for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE)
544 if (!input_length--)
545 goto fail;
546 digit = decode_digit (*input++);
547 if (digit >= PUNYCODE_BASE)
548 goto fail;
549 if (digit > (G_MAXUINT - i) / w)
550 goto fail;
551 i += digit * w;
552 if (k <= bias)
553 t = PUNYCODE_TMIN;
554 else if (k >= bias + PUNYCODE_TMAX)
555 t = PUNYCODE_TMAX;
556 else
557 t = k - bias;
558 if (digit < t)
559 break;
560 if (w > G_MAXUINT / (PUNYCODE_BASE - t))
561 goto fail;
562 w *= (PUNYCODE_BASE - t);
565 bias = adapt (i - oldi, output_chars->len + 1, oldi == 0);
567 if (i / (output_chars->len + 1) > G_MAXUINT - n)
568 goto fail;
569 n += i / (output_chars->len + 1);
570 i %= (output_chars->len + 1);
572 g_array_insert_val (output_chars, i++, n);
575 for (i = 0; i < output_chars->len; i++)
576 g_string_append_unichar (output, g_array_index (output_chars, gunichar, i));
577 g_array_free (output_chars, TRUE);
578 return TRUE;
580 fail:
581 g_array_free (output_chars, TRUE);
582 return FALSE;
586 * g_hostname_to_unicode:
587 * @hostname: a valid UTF-8 or ASCII hostname
589 * Converts @hostname to its canonical presentation form; a UTF-8
590 * string in Unicode normalization form C, containing no uppercase
591 * letters, no forbidden characters, and no ASCII-encoded segments,
592 * and not ending with a trailing dot.
594 * Of course if @hostname is not an internationalized hostname, then
595 * the canonical presentation form will be entirely ASCII.
597 * Returns: a UTF-8 hostname, which must be freed, or %NULL if
598 * @hostname is in some way invalid.
600 * Since: 2.22
602 gchar *
603 g_hostname_to_unicode (const gchar *hostname)
605 GString *out;
606 gssize llen;
608 out = g_string_new (NULL);
612 llen = idna_end_of_label (hostname) - hostname;
613 if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN))
615 hostname += IDNA_ACE_PREFIX_LEN;
616 llen -= IDNA_ACE_PREFIX_LEN;
617 if (!punycode_decode (hostname, llen, out))
619 g_string_free (out, TRUE);
620 return NULL;
623 else
625 gboolean unicode;
626 gchar *canonicalized = nameprep (hostname, llen, &unicode);
628 if (!canonicalized)
630 g_string_free (out, TRUE);
631 return NULL;
633 g_string_append (out, canonicalized);
634 g_free (canonicalized);
637 hostname += llen;
638 if (*hostname)
639 hostname = g_utf8_next_char (hostname);
640 if (*hostname)
641 g_string_append_c (out, '.');
643 while (*hostname);
645 return g_string_free (out, FALSE);
649 * g_hostname_is_ascii_encoded:
650 * @hostname: a hostname
652 * Tests if @hostname contains segments with an ASCII-compatible
653 * encoding of an Internationalized Domain Name. If this returns
654 * %TRUE, you should decode the hostname with g_hostname_to_unicode()
655 * before displaying it to the user.
657 * Note that a hostname might contain a mix of encoded and unencoded
658 * segments, and so it is possible for g_hostname_is_non_ascii() and
659 * g_hostname_is_ascii_encoded() to both return %TRUE for a name.
661 * Returns: %TRUE if @hostname contains any ASCII-encoded
662 * segments.
664 * Since: 2.22
666 gboolean
667 g_hostname_is_ascii_encoded (const gchar *hostname)
669 while (1)
671 if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN))
672 return TRUE;
673 hostname = idna_end_of_label (hostname);
674 if (*hostname)
675 hostname = g_utf8_next_char (hostname);
676 if (!*hostname)
677 return FALSE;
682 * g_hostname_is_ip_address:
683 * @hostname: a hostname (or IP address in string form)
685 * Tests if @hostname is the string form of an IPv4 or IPv6 address.
686 * (Eg, "192.168.0.1".)
688 * Returns: %TRUE if @hostname is an IP address
690 * Since: 2.22
692 gboolean
693 g_hostname_is_ip_address (const gchar *hostname)
695 gchar *p, *end;
696 gint nsegments, octet;
698 /* On Linux we could implement this using inet_pton, but the Windows
699 * equivalent of that requires linking against winsock, so we just
700 * figure this out ourselves. Tested by tests/hostutils.c.
703 p = (char *)hostname;
705 if (strchr (p, ':'))
707 gboolean skipped;
709 /* If it contains a ':', it's an IPv6 address (assuming it's an
710 * IP address at all). This consists of eight ':'-separated
711 * segments, each containing a 1-4 digit hex number, except that
712 * optionally: (a) the last two segments can be replaced by an
713 * IPv4 address, and (b) a single span of 1 to 8 "0000" segments
714 * can be replaced with just "::".
717 nsegments = 0;
718 skipped = FALSE;
719 while (*p && nsegments < 8)
721 /* Each segment after the first must be preceded by a ':'.
722 * (We also handle half of the "string starts with ::" case
723 * here.)
725 if (p != (char *)hostname || (p[0] == ':' && p[1] == ':'))
727 if (*p != ':')
728 return FALSE;
729 p++;
732 /* If there's another ':', it means we're skipping some segments */
733 if (*p == ':' && !skipped)
735 skipped = TRUE;
736 nsegments++;
738 /* Handle the "string ends with ::" case */
739 if (!p[1])
740 p++;
742 continue;
745 /* Read the segment, make sure it's valid. */
746 for (end = p; g_ascii_isxdigit (*end); end++)
748 if (end == p || end > p + 4)
749 return FALSE;
751 if (*end == '.')
753 if ((nsegments == 6 && !skipped) || (nsegments <= 6 && skipped))
754 goto parse_ipv4;
755 else
756 return FALSE;
759 nsegments++;
760 p = end;
763 return !*p && (nsegments == 8 || skipped);
766 parse_ipv4:
768 /* Parse IPv4: N.N.N.N, where each N <= 255 and doesn't have leading 0s. */
769 for (nsegments = 0; nsegments < 4; nsegments++)
771 if (nsegments != 0)
773 if (*p != '.')
774 return FALSE;
775 p++;
778 /* Check the segment; a little tricker than the IPv6 case since
779 * we can't allow extra leading 0s, and we can't assume that all
780 * strings of valid length are within range.
782 octet = 0;
783 if (*p == '0')
784 end = p + 1;
785 else
787 for (end = p; g_ascii_isdigit (*end); end++)
789 octet = 10 * octet + (*end - '0');
791 if (octet > 255)
792 break;
795 if (end == p || end > p + 3 || octet > 255)
796 return FALSE;
798 p = end;
801 /* If there's nothing left to parse, then it's ok. */
802 return !*p;