1 /* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*- */
3 /* GLIB - Library of useful routines for C programming
4 * Copyright (C) 2008 Red Hat, Inc.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General
17 * Public License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
19 * Boston, MA 02111-1307, USA.
33 * @short_description: Internet hostname utilities
36 * Functions for manipulating internet hostnames; in particular, for
37 * converting between Unicode and ASCII-encoded forms of
38 * Internationalized Domain Names (IDNs).
41 * The <ulink url="http://www.ietf.org/rfc/rfc3490.txt">Internationalized Domain
42 * Names for Applications (IDNA)</ulink> standards allow for the use
43 * of Unicode domain names in applications, while providing
44 * backward-compatibility with the old ASCII-only DNS, by defining an
45 * ASCII-Compatible Encoding of any given Unicode name, which can be
46 * used with non-IDN-aware applications and protocols. (For example,
47 * "Παν語.org" maps to "xn--4wa8awb4637h.org".)
50 #define IDNA_ACE_PREFIX "xn--"
51 #define IDNA_ACE_PREFIX_LEN 4
53 /* Punycode constants, from RFC 3492. */
55 #define PUNYCODE_BASE 36
56 #define PUNYCODE_TMIN 1
57 #define PUNYCODE_TMAX 26
58 #define PUNYCODE_SKEW 38
59 #define PUNYCODE_DAMP 700
60 #define PUNYCODE_INITIAL_BIAS 72
61 #define PUNYCODE_INITIAL_N 0x80
63 #define PUNYCODE_IS_BASIC(cp) ((guint)(cp) < 0x80)
65 /* Encode/decode a single base-36 digit */
/* encode_digit: turns a base-36 digit value into its ASCII character.
 * The only branch visible in this excerpt handles values of 26 and
 * above, which land on the decimal digit characters (26 becomes the
 * character for zero).
 * NOTE(review): the branch for values below 26 is elided from this
 * listing; presumably it produces a letter, mirroring the letter
 * ranges tested in decode_digit -- confirm against the full source.
 */
67 encode_digit (guint dig
)
72 return dig
- 26 + '0';
/* decode_digit: the reverse mapping of encode_digit. The character is
 * classified as an uppercase letter, a lowercase letter or a decimal
 * digit; a decimal digit decodes to 26 plus its numeric value.
 * NOTE(review): the values returned for the two letter ranges and the
 * fallback for any other character are elided from this listing.
 * punycode_decode compares the result against PUNYCODE_BASE, so the
 * fallback presumably returns a value >= PUNYCODE_BASE -- confirm.
 */
76 decode_digit (gchar dig
)
78 if (dig
>= 'A' && dig
<= 'Z')
80 else if (dig
>= 'a' && dig
<= 'z')
82 else if (dig
>= '0' && dig
<= '9')
83 return dig
- '0' + 26;
88 /* Punycode bias adaptation algorithm, RFC 3492 section 6.1 */
/* Body of the bias adaptation helper (called as adapt () by the
 * encoder and decoder in this file; its signature line is elided from
 * this listing).
 *
 * delta is first scaled down: divided by PUNYCODE_DAMP when firsttime
 * is set, otherwise halved. delta / numpoints is then added to it.
 * While delta exceeds ((BASE - TMIN) * TMAX) / 2 it is divided by
 * (BASE - TMIN). The result is
 * k + (BASE - TMIN + 1) * delta / (delta + SKEW).
 *
 * NOTE(review): the initialisation of k and its update inside the loop
 * are not visible here; presumably k starts at 0 and grows by
 * PUNYCODE_BASE per iteration -- confirm against the full source.
 */
96 delta
= firsttime
? delta
/ PUNYCODE_DAMP
: delta
/ 2;
97 delta
+= delta
/ numpoints
;
100 while (delta
> ((PUNYCODE_BASE
- PUNYCODE_TMIN
) * PUNYCODE_TMAX
) / 2)
102 delta
/= PUNYCODE_BASE
- PUNYCODE_TMIN
;
106 return k
+ ((PUNYCODE_BASE
- PUNYCODE_TMIN
+ 1) * delta
/
107 (delta
+ PUNYCODE_SKEW
));
110 /* Punycode encoder, RFC 3492 section 6.3. The algorithm is
111 * sufficiently bizarre that it's not really worth trying to explain here. */
/* punycode_encode: appends the Punycode form of one UTF-8 label to
 * output.
 *
 * input_utf8 and input_utf8_length are handed unchanged to
 * g_utf8_to_ucs4 (), which yields the code point array input and its
 * length input_length.
 *
 * Steps visible in this excerpt:
 *  1. every basic (below 0x80) code point is appended to output,
 *     lowercased with g_ascii_tolower;
 *  2. a hyphen is appended to output;
 *  3. handled_chars starts at num_basic_chars, bias and n start at
 *     their RFC initial values;
 *  4. while code points remain unhandled, m is set to the smallest
 *     code point that is >= n, delta is advanced by
 *     (m - n) * (handled_chars + 1) behind an overflow guard, and the
 *     digits of the variable-length integer are emitted through
 *     encode_digit;
 *  5. bias is recomputed with adapt ().
 *
 * success is initialised to FALSE.
 *
 * NOTE(review): the condition guarding the hyphen, the error exits,
 * the updates of n, delta and handled_chars, and the final return are
 * elided from this listing; presumably the function returns success
 * and releases input on every path -- confirm against the full source.
 */
115 punycode_encode (const gchar
*input_utf8
,
116 gsize input_utf8_length
,
119 guint delta
, handled_chars
, num_basic_chars
, bias
, j
, q
, k
, t
, digit
;
120 gunichar n
, m
, *input
;
122 gboolean success
= FALSE
;
124 /* Convert from UTF-8 to Unicode code points */
125 input
= g_utf8_to_ucs4 (input_utf8
, input_utf8_length
, NULL
,
126 &input_length
, NULL
);
130 /* Copy basic chars */
131 for (j
= num_basic_chars
= 0; j
< input_length
; j
++)
133 if (PUNYCODE_IS_BASIC (input
[j
]))
135 g_string_append_c (output
, g_ascii_tolower (input
[j
]));
140 g_string_append_c (output
, '-');
142 handled_chars
= num_basic_chars
;
144 /* Encode non-basic chars */
146 bias
= PUNYCODE_INITIAL_BIAS
;
147 n
= PUNYCODE_INITIAL_N
;
148 while (handled_chars
< input_length
)
150 /* let m = the minimum {non-basic} code point >= n in the input */
151 for (m
= G_MAXUINT
, j
= 0; j
< input_length
; j
++)
153 if (input
[j
] >= n
&& input
[j
] < m
)
/* Overflow guard: true when adding (m - n) * (handled_chars + 1) to
 * delta would exceed G_MAXUINT. */
157 if (m
- n
> (G_MAXUINT
- delta
) / (handled_chars
+ 1))
159 delta
+= (m
- n
) * (handled_chars
+ 1);
162 for (j
= 0; j
< input_length
; j
++)
169 else if (input
[j
] == n
)
172 for (k
= PUNYCODE_BASE
; ; k
+= PUNYCODE_BASE
)
176 else if (k
>= bias
+ PUNYCODE_TMAX
)
182 digit
= t
+ (q
- t
) % (PUNYCODE_BASE
- t
);
183 g_string_append_c (output
, encode_digit (digit
));
184 q
= (q
- t
) / (PUNYCODE_BASE
- t
);
187 g_string_append_c (output
, encode_digit (q
));
/* The third argument (the firsttime flag of adapt) is TRUE only while
 * handled_chars still equals num_basic_chars. */
188 bias
= adapt (delta
, handled_chars
+ 1, handled_chars
== num_basic_chars
);
205 /* From RFC 3454, Table B.1 */
/* idna_is_junk (ch): true when ch is one of the code points listed
 * below; remove_junk tests every character with this macro.
 * The argument is expanded once per comparison, so pass an expression
 * without side effects.
 */
206 #define idna_is_junk(ch) ((ch) == 0x00AD || (ch) == 0x1806 || (ch) == 0x200B || (ch) == 0x2060 || (ch) == 0xFEFF || (ch) == 0x034F || (ch) == 0x180B || (ch) == 0x180C || (ch) == 0x180D || (ch) == 0x200C || (ch) == 0x200D || ((ch) >= 0xFE00 && (ch) <= 0xFE0F))
208 /* Scan @str for "junk" and return a cleaned-up string if any junk
209 * is found. Else return %NULL. */
/* remove_junk: walks str one UTF-8 character at a time and tests each
 * character with idna_is_junk. A len of -1 means str is NUL-terminated;
 * otherwise the walk stops at str + len.
 *
 * cleaned starts out NULL. When it is created it is seeded with the
 * bytes of str that precede the current position, and characters are
 * added to it with g_string_append_unichar. The value returned on the
 * visible exit is g_string_free (cleaned, FALSE), i.e. the character
 * data without the GString wrapper.
 *
 * NOTE(review): the conditions selecting the create and append
 * branches and the exit taken when nothing was removed are elided from
 * this listing. Presumably cleaned is created on the first junk
 * character, only non-junk characters are appended, and NULL is
 * returned when cleaned was never created -- confirm.
 */
212 remove_junk (const gchar
*str
,
215 GString
*cleaned
= NULL
;
219 for (p
= str
; len
== -1 ? *p
: p
< str
+ len
; p
= g_utf8_next_char (p
))
221 ch
= g_utf8_get_char (p
);
222 if (idna_is_junk (ch
))
226 cleaned
= g_string_new (NULL
);
227 g_string_append_len (cleaned
, str
, p
- str
);
231 g_string_append_unichar (cleaned
, ch
);
235 return g_string_free (cleaned
, FALSE
);
/* contains_uppercase_letters: scans str one UTF-8 character at a time
 * (len == -1 means NUL-terminated, otherwise the scan stops at
 * str + len) and tests each character with g_unichar_isupper.
 * NOTE(review): the return statements are elided from this listing;
 * presumably TRUE on the first uppercase character and FALSE
 * otherwise -- confirm.
 */
240 static inline gboolean
241 contains_uppercase_letters (const gchar
*str
,
246 for (p
= str
; len
== -1 ? *p
: p
< str
+ len
; p
= g_utf8_next_char (p
))
248 if (g_unichar_isupper (g_utf8_get_char (p
)))
/* contains_non_ascii: scans str byte by byte (len == -1 means
 * NUL-terminated, otherwise the scan stops at str + len) and tests
 * each byte, viewed as unsigned, against 0x80.
 * NOTE(review): the comparison is a strict greater-than, so a byte
 * equal to 0x80 is not flagged even though ASCII ends at 0x7F. In
 * well-formed UTF-8 a 0x80 byte can only follow a lead byte that is
 * itself above 0x80, so valid input is unaffected; consider >= 0x80
 * if malformed input can reach this function.
 * NOTE(review): the return statements are elided from this listing;
 * presumably TRUE on the first match and FALSE otherwise -- confirm.
 */
254 static inline gboolean
255 contains_non_ascii (const gchar
*str
,
260 for (p
= str
; len
== -1 ? *p
: p
< str
+ len
; p
++)
262 if ((guchar
)*p
> 0x80)
268 /* RFC 3454, Appendix C. ish. */
/* idna_is_prohibited: classifies ch by the general category reported
 * by g_unichar_type.
 *
 * Visible cases:
 *  - control, format, unassigned, private use, surrogate, line
 *    separator, paragraph separator and space separator share a
 *    single branch;
 *  - OTHER_SYMBOL is narrowed to U+FFFC, U+FFFD and U+2FF0..U+2FFB;
 *  - NON_SPACING_MARK is narrowed to U+0340 and U+0341.
 *
 * NOTE(review): every return statement and any default case are elided
 * from this listing; presumably the grouped categories and the listed
 * code points yield TRUE and everything else FALSE -- confirm.
 */
269 static inline gboolean
270 idna_is_prohibited (gunichar ch
)
272 switch (g_unichar_type (ch
))
274 case G_UNICODE_CONTROL
:
275 case G_UNICODE_FORMAT
:
276 case G_UNICODE_UNASSIGNED
:
277 case G_UNICODE_PRIVATE_USE
:
278 case G_UNICODE_SURROGATE
:
279 case G_UNICODE_LINE_SEPARATOR
:
280 case G_UNICODE_PARAGRAPH_SEPARATOR
:
281 case G_UNICODE_SPACE_SEPARATOR
:
284 case G_UNICODE_OTHER_SYMBOL
:
285 if (ch
== 0xFFFC || ch
== 0xFFFD ||
286 (ch
>= 0x2FF0 && ch
<= 0x2FFB))
290 case G_UNICODE_NON_SPACING_MARK
:
291 if (ch
== 0x0340 || ch
== 0x0341)
300 /* RFC 3491 IDN cleanup algorithm. */
/* nameprep: prepares hostname for IDN processing. len is the number of
 * bytes to consider, or -1 when hostname is NUL-terminated.
 *
 * Pipeline visible in this excerpt:
 *  1. remove_junk (hostname, len); on another path name is pointed at
 *     hostname itself (with the const qualifier cast away);
 *  2. if contains_uppercase_letters, name is replaced by
 *     g_utf8_strdown (name, len);
 *  3. if contains_non_ascii finds nothing and name still aliases
 *     hostname, a copy of hostname is returned (g_strdup when len is
 *     -1, g_strndup otherwise);
 *  4. otherwise name is replaced by its NFKC normalization;
 *  5. the uppercase test and g_utf8_strdown are repeated, this time
 *     with a length of -1;
 *  6. each character is tested with idna_is_prohibited.
 *
 * NOTE(review): the use of tmp, the freeing of intermediate strings,
 * the failure exits and the final return are elided from this listing;
 * confirm ownership of every intermediate against the full source.
 */
302 nameprep (const gchar
*hostname
,
305 gchar
*name
, *tmp
= NULL
, *p
;
307 /* It would be nice if we could do this without repeatedly
308 * allocating strings and converting back and forth between
309 * gunichars and UTF-8... The code does at least avoid doing most of
310 * the sub-operations when they would just be equivalent to a
314 /* Remove presentation-only characters */
315 name
= remove_junk (hostname
, len
);
322 name
= (gchar
*)hostname
;
324 /* Convert to lowercase */
325 if (contains_uppercase_letters (name
, len
))
327 name
= g_utf8_strdown (name
, len
);
333 /* If there are no UTF8 characters, we're done. */
334 if (!contains_non_ascii (name
, len
))
336 if (name
== (gchar
*)hostname
)
337 return len
== -1 ? g_strdup (hostname
) : g_strndup (hostname
, len
);
343 name
= g_utf8_normalize (name
, len
, G_NORMALIZE_NFKC
);
350 /* KC normalization may have created more capital letters (eg,
351 * angstrom -> capital A with ring). So we have to lowercasify a
352 * second time. (This is more-or-less how the nameprep algorithm
353 * does it. If tolower(nfkc(tolower(X))) is guaranteed to be the
354 * same as tolower(nfkc(X)), then we could skip the first tolower,
355 * but I'm not sure it is.)
357 if (contains_uppercase_letters (name
, -1))
359 name
= g_utf8_strdown (name
, -1);
364 /* Check for prohibited characters */
365 for (p
= name
; *p
; p
= g_utf8_next_char (p
))
367 if (idna_is_prohibited (g_utf8_get_char (p
)))
375 /* FIXME: We're supposed to verify certain constraints on bidi
376 * characters, but glib does not appear to have that information.
384 * g_hostname_to_ascii:
385 * @hostname: a valid UTF-8 or ASCII hostname
387 * Converts @hostname to its canonical ASCII form; an ASCII-only
388 * string containing no uppercase letters and not ending with a trailing dot.
391 * Return value: an ASCII hostname, which must be freed, or %NULL if
392 * @hostname is in some way invalid.
/* Implementation notes for g_hostname_to_ascii.
 *
 * The whole name is first passed through nameprep (hostname, -1); the
 * result is then processed label by label into the GString out.
 *
 * Visible per-label steps:
 *  - the label is scanned up to the next dot for bytes above 0x80;
 *  - the label is compared with IDNA_ACE_PREFIX using strncmp;
 *  - on the encoding path the ACE prefix is appended to out, followed
 *    by the output of punycode_encode (label, llen, out);
 *  - on the other path the label is appended unchanged with
 *    g_string_append_len;
 *  - the length just added (out->len - oldlen) is compared with 63;
 *  - a dot is appended only when *label && *++label holds, i.e. when
 *    something follows the separator.
 *
 * The visible success exit returns g_string_free (out, FALSE); the
 * other visible exit frees out together with its character data.
 *
 * NOTE(review): the branch structure, the computation of llen and
 * oldlen, the freeing of name and the failure return value are elided
 * from this listing. Presumably a non-ASCII label that already starts
 * with the ACE prefix, a failed punycode_encode, and a label longer
 * than 63 bytes all lead to the failure exit -- confirm.
 * NOTE(review): the byte test is a strict greater-than, so a byte
 * equal to 0x80 is not treated as non-ASCII.
 */
397 g_hostname_to_ascii (const gchar
*hostname
)
399 gchar
*name
, *label
, *p
;
404 label
= name
= nameprep (hostname
, -1);
408 out
= g_string_new (NULL
);
413 for (p
= label
; *p
&& *p
!= '.'; p
++)
415 if ((guchar
)*p
> 0x80)
423 if (!strncmp (label
, IDNA_ACE_PREFIX
, IDNA_ACE_PREFIX_LEN
))
426 g_string_append (out
, IDNA_ACE_PREFIX
);
427 if (!punycode_encode (label
, llen
, out
))
431 g_string_append_len (out
, label
, llen
);
433 if (out
->len
- oldlen
> 63)
437 if (*label
&& *++label
)
438 g_string_append_c (out
, '.');
443 return g_string_free (out
, FALSE
);
447 g_string_free (out
, TRUE
);
452 * g_hostname_is_non_ascii:
453 * @hostname: a hostname
455 * Tests if @hostname contains Unicode characters. If this returns
456 * %TRUE, you need to encode the hostname with g_hostname_to_ascii()
457 * before using it in non-IDN-aware contexts.
459 * Note that a hostname might contain a mix of encoded and unencoded
460 * segments, and so it is possible for g_hostname_is_non_ascii() and
461 * g_hostname_is_ascii_encoded() to both return %TRUE for a name.
463 * Return value: %TRUE if @hostname contains any non-ASCII characters
/* Thin wrapper: forwards to contains_non_ascii with a length of -1,
 * so hostname is scanned up to its NUL terminator. */
468 g_hostname_is_non_ascii (const gchar
*hostname
)
470 return contains_non_ascii (hostname
, -1);
473 /* Punycode decoder, RFC 3492 section 6.2. As with punycode_encode(),
474 * read the RFC if you want to understand what this is actually doing. */
/* punycode_decode: decodes the Punycode text at input, collecting code
 * points in the GArray output_chars and finally appending each of them
 * to output with g_string_append_unichar.
 *
 * Steps visible in this excerpt:
 *  - n and bias start at their RFC initial values;
 *  - split is moved backwards from the last input byte until it
 *    reaches a hyphen or the start of input;
 *  - on one path output_chars is created with g_array_sized_new,
 *    input_length is reduced by (split - input) + 1, and the bytes
 *    before split are checked with PUNYCODE_IS_BASIC and appended;
 *    on the other path output_chars is created empty;
 *  - digits are read with decode_digit while k advances in steps of
 *    PUNYCODE_BASE, guarded by three checks: digit >= PUNYCODE_BASE,
 *    digit > (G_MAXUINT - i) / w, and w > G_MAXUINT / (BASE - t);
 *  - after each decoded number bias is recomputed with
 *    adapt (i - oldi, len + 1, oldi == 0), n is advanced by
 *    i / (len + 1) behind an overflow check, i is reduced modulo
 *    len + 1, and n is inserted into output_chars at index i.
 *
 * Both visible exits release output_chars with g_array_free.
 *
 * NOTE(review): the actions taken when a guard fires, the loop that
 * drives the digit reader, the updates of i, w and t, and the return
 * values are elided from this listing; presumably each guard jumps to
 * the failure exit, which returns FALSE -- confirm.
 */
477 punycode_decode (const gchar
*input
,
481 GArray
*output_chars
;
484 guint oldi
, w
, k
, digit
, t
;
487 n
= PUNYCODE_INITIAL_N
;
489 bias
= PUNYCODE_INITIAL_BIAS
;
491 split
= input
+ input_length
- 1;
492 while (split
> input
&& *split
!= '-')
496 output_chars
= g_array_sized_new (FALSE
, FALSE
, sizeof (gunichar
),
498 input_length
-= (split
- input
) + 1;
499 while (input
< split
)
501 gunichar ch
= (gunichar
)*input
++;
502 if (!PUNYCODE_IS_BASIC (ch
))
504 g_array_append_val (output_chars
, ch
);
509 output_chars
= g_array_new (FALSE
, FALSE
, sizeof (gunichar
));
515 for (k
= PUNYCODE_BASE
; ; k
+= PUNYCODE_BASE
)
519 digit
= decode_digit (*input
++);
520 if (digit
>= PUNYCODE_BASE
)
522 if (digit
> (G_MAXUINT
- i
) / w
)
527 else if (k
>= bias
+ PUNYCODE_TMAX
)
533 if (w
> G_MAXUINT
/ (PUNYCODE_BASE
- t
))
535 w
*= (PUNYCODE_BASE
- t
);
538 bias
= adapt (i
- oldi
, output_chars
->len
+ 1, oldi
== 0);
540 if (i
/ (output_chars
->len
+ 1) > G_MAXUINT
- n
)
542 n
+= i
/ (output_chars
->len
+ 1);
543 i
%= (output_chars
->len
+ 1);
545 g_array_insert_val (output_chars
, i
++, n
);
548 for (i
= 0; i
< output_chars
->len
; i
++)
549 g_string_append_unichar (output
, g_array_index (output_chars
, gunichar
, i
));
550 g_array_free (output_chars
, TRUE
);
554 g_array_free (output_chars
, TRUE
);
559 * g_hostname_to_unicode:
560 * @hostname: a valid UTF-8 or ASCII hostname
562 * Converts @hostname to its canonical presentation form; a UTF-8
563 * string in Unicode normalization form C, containing no uppercase
564 * letters, no forbidden characters, and no ASCII-encoded segments,
565 * and not ending with a trailing dot.
567 * Of course if @hostname is not an internationalized hostname, then
568 * the canonical presentation form will be entirely ASCII.
570 * Return value: a UTF-8 hostname, which must be freed, or %NULL if
571 * @hostname is in some way invalid.
/* Implementation notes for g_hostname_to_unicode.
 *
 * The result is built label by label in the GString out. llen is the
 * length of the current label, measured with strcspn up to the next
 * dot.
 *
 * Visible per-label steps:
 *  - if the label starts with IDNA_ACE_PREFIX (compared without regard
 *    to ASCII case), hostname and llen are moved past the prefix and
 *    the remainder is decoded into out with punycode_decode; the
 *    visible failure handling frees out with its character data;
 *  - otherwise the label is passed through nameprep (hostname, llen),
 *    the result is appended to out and then released with g_free;
 *  - a dot is appended only when *hostname && *++hostname holds, i.e.
 *    when something follows the separator.
 *
 * The visible success exit returns g_string_free (out, FALSE).
 *
 * NOTE(review): the loop construct, the advance of hostname by llen,
 * the check of the nameprep result and the failure return value are
 * elided from this listing; presumably failures return NULL -- confirm.
 */
576 g_hostname_to_unicode (const gchar
*hostname
)
581 out
= g_string_new (NULL
);
585 llen
= strcspn (hostname
, ".");
586 if (!g_ascii_strncasecmp (hostname
, IDNA_ACE_PREFIX
, IDNA_ACE_PREFIX_LEN
))
588 hostname
+= IDNA_ACE_PREFIX_LEN
;
589 llen
-= IDNA_ACE_PREFIX_LEN
;
590 if (!punycode_decode (hostname
, llen
, out
))
592 g_string_free (out
, TRUE
);
598 gchar
*canonicalized
= nameprep (hostname
, llen
);
602 g_string_free (out
, TRUE
);
605 g_string_append (out
, canonicalized
);
606 g_free (canonicalized
);
610 if (*hostname
&& *++hostname
)
611 g_string_append_c (out
, '.');
615 return g_string_free (out
, FALSE
);
619 * g_hostname_is_ascii_encoded:
620 * @hostname: a hostname
622 * Tests if @hostname contains segments with an ASCII-compatible
623 * encoding of an Internationalized Domain Name. If this returns
624 * %TRUE, you should decode the hostname with g_hostname_to_unicode()
625 * before displaying it to the user.
627 * Note that a hostname might contain a mix of encoded and unencoded
628 * segments, and so it is possible for g_hostname_is_non_ascii() and
629 * g_hostname_is_ascii_encoded() to both return %TRUE for a name.
631 * Return value: %TRUE if @hostname contains any ASCII-encoded segments.
/* Implementation notes for g_hostname_is_ascii_encoded.
 *
 * The visible logic tests whether the text at hostname starts with
 * IDNA_ACE_PREFIX (compared without regard to ASCII case) and moves
 * hostname to the next dot with strchr.
 *
 * NOTE(review): the enclosing loop, the step past the dot and the
 * return statements are elided from this listing. Presumably the
 * function returns TRUE at the first label carrying the prefix and
 * FALSE once strchr finds no further dot (strchr yields NULL in that
 * case, so the loop has to stop there) -- confirm.
 */
637 g_hostname_is_ascii_encoded (const gchar
*hostname
)
641 if (!g_ascii_strncasecmp (hostname
, IDNA_ACE_PREFIX
, IDNA_ACE_PREFIX_LEN
))
643 hostname
= strchr (hostname
, '.');
650 * g_hostname_is_ip_address:
651 * @hostname: a hostname (or IP address in string form)
653 * Tests if @hostname is the string form of an IPv4 or IPv6 address.
654 * (Eg, "192.168.0.1".)
656 * Return value: %TRUE if @hostname is an IP address
/* Implementation notes for g_hostname_is_ip_address (a hand-written
 * parser; see the inet_pton remark below).
 *
 * IPv6 path, as far as visible here:
 *  - at most 8 segments are read while input remains;
 *  - a double colon may stand in for skipped segments once, tracked by
 *    the skipped flag;
 *  - each segment is a run of hex digits checked with
 *    end == p || end > p + 4, i.e. empty runs and runs longer than
 *    four digits are singled out;
 *  - the test (nsegments == 6 && !skipped) ||
 *    (nsegments <= 6 && skipped) marks the positions singled out for
 *    special handling; presumably this is where an embedded IPv4 tail
 *    is accepted -- confirm;
 *  - the address is accepted when the whole string was consumed and
 *    either 8 segments were read or a skip occurred.
 *
 * IPv4 path, as far as visible here:
 *  - exactly four segments are iterated;
 *  - each segment accumulates a decimal value in octet and is checked
 *    with end == p || end > p + 3 || octet > 255.
 *
 * NOTE(review): the test that selects the IPv6 path, the
 * initialisation of nsegments, skipped and octet, the separator and
 * leading-zero handling, and all failure returns are elided from this
 * listing -- confirm against the full source.
 */
661 g_hostname_is_ip_address (const gchar
*hostname
)
664 gint nsegments
, octet
;
666 /* On Linux we could implement this using inet_pton, but the Windows
667 * equivalent of that requires linking against winsock, so we just
668 * figure this out ourselves. Tested by tests/hostutils.c.
671 p
= (char *)hostname
;
677 /* If it contains a ':', it's an IPv6 address (assuming it's an
678 * IP address at all). This consists of eight ':'-separated
679 * segments, each containing a 1-4 digit hex number, except that
680 * optionally: (a) the last two segments can be replaced by an
681 * IPv4 address, and (b) a single span of 1 to 8 "0000" segments
682 * can be replaced with just "::".
687 while (*p
&& nsegments
< 8)
689 /* Each segment after the first must be preceded by a ':'.
690 * (We also handle half of the "string starts with ::" case
693 if (p
!= (char *)hostname
|| (p
[0] == ':' && p
[1] == ':'))
700 /* If there's another ':', it means we're skipping some segments */
701 if (*p
== ':' && !skipped
)
706 /* Handle the "string ends with ::" case */
713 /* Read the segment, make sure it's valid. */
714 for (end
= p
; g_ascii_isxdigit (*end
); end
++)
716 if (end
== p
|| end
> p
+ 4)
721 if ((nsegments
== 6 && !skipped
) || (nsegments
<= 6 && skipped
))
731 return !*p
&& (nsegments
== 8 || skipped
);
736 /* Parse IPv4: N.N.N.N, where each N <= 255 and doesn't have leading 0s. */
737 for (nsegments
= 0; nsegments
< 4; nsegments
++)
746 /* Check the segment; a little trickier than the IPv6 case since
747 * we can't allow extra leading 0s, and we can't assume that all
748 * strings of valid length are within range.
755 for (end
= p
; g_ascii_isdigit (*end
); end
++)
756 octet
= 10 * octet
+ (*end
- '0');
758 if (end
== p
|| end
> p
+ 3 || octet
> 255)
764 /* If there's nothing left to parse, then it's ok. */
768 #define __G_HOST_UTILS_C__
769 #include "galiasdef.c"