1 /* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*- */
3 /* GLIB - Library of useful routines for C programming
4 * Copyright (C) 2008 Red Hat, Inc.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General
17 * Public License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
19 * Boston, MA 02111-1307, USA.
26 #include "ghostutils.h"
31 #include "gstrfuncs.h"
/**
 * SECTION:ghostutils
 * @short_description: Internet hostname utilities
 *
 * Functions for manipulating internet hostnames; in particular, for
 * converting between Unicode and ASCII-encoded forms of
 * Internationalized Domain Names (IDNs).
 *
 * The <ulink
 * url="http://www.ietf.org/rfc/rfc3490.txt">Internationalized Domain
 * Names for Applications (IDNA)</ulink> standards allow for the use
 * of Unicode domain names in applications, while providing
 * backward-compatibility with the old ASCII-only DNS, by defining an
 * ASCII-Compatible Encoding of any given Unicode name, which can be
 * used with non-IDN-aware applications and protocols. (For example,
 * "Παν語.org" maps to "xn--4wa8awb4637h.org".)
 **/
/* Prefix that marks a label as being in ASCII-Compatible Encoding, and
 * the length of that prefix in bytes (not counting the trailing NUL).
 */
#define IDNA_ACE_PREFIX     "xn--"
#define IDNA_ACE_PREFIX_LEN 4

/* Punycode constants, from RFC 3492. */

#define PUNYCODE_BASE          36
#define PUNYCODE_TMIN           1
#define PUNYCODE_TMAX          26
#define PUNYCODE_SKEW          38
#define PUNYCODE_DAMP         700
#define PUNYCODE_INITIAL_BIAS  72
#define PUNYCODE_INITIAL_N   0x80

/* A code point is "basic" when it lies below 0x80, i.e. is plain ASCII. */
#define PUNYCODE_IS_BASIC(cp) ((guint)(cp) < 0x80)
68 /* Encode/decode a single base-36 digit */
70 encode_digit (guint dig
)
75 return dig
- 26 + '0';
79 decode_digit (gchar dig
)
81 if (dig
>= 'A' && dig
<= 'Z')
83 else if (dig
>= 'a' && dig
<= 'z')
85 else if (dig
>= '0' && dig
<= '9')
86 return dig
- '0' + 26;
91 /* Punycode bias adaptation algorithm, RFC 3492 section 6.1 */
99 delta
= firsttime
? delta
/ PUNYCODE_DAMP
: delta
/ 2;
100 delta
+= delta
/ numpoints
;
103 while (delta
> ((PUNYCODE_BASE
- PUNYCODE_TMIN
) * PUNYCODE_TMAX
) / 2)
105 delta
/= PUNYCODE_BASE
- PUNYCODE_TMIN
;
109 return k
+ ((PUNYCODE_BASE
- PUNYCODE_TMIN
+ 1) * delta
/
110 (delta
+ PUNYCODE_SKEW
));
113 /* Punycode encoder, RFC 3492 section 6.3. The algorithm is
114 * sufficiently bizarre that it's not really worth trying to explain
118 punycode_encode (const gchar
*input_utf8
,
119 gsize input_utf8_length
,
122 guint delta
, handled_chars
, num_basic_chars
, bias
, j
, q
, k
, t
, digit
;
123 gunichar n
, m
, *input
;
125 gboolean success
= FALSE
;
127 /* Convert from UTF-8 to Unicode code points */
128 input
= g_utf8_to_ucs4 (input_utf8
, input_utf8_length
, NULL
,
129 &input_length
, NULL
);
133 /* Copy basic chars */
134 for (j
= num_basic_chars
= 0; j
< input_length
; j
++)
136 if (PUNYCODE_IS_BASIC (input
[j
]))
138 g_string_append_c (output
, g_ascii_tolower (input
[j
]));
143 g_string_append_c (output
, '-');
145 handled_chars
= num_basic_chars
;
147 /* Encode non-basic chars */
149 bias
= PUNYCODE_INITIAL_BIAS
;
150 n
= PUNYCODE_INITIAL_N
;
151 while (handled_chars
< input_length
)
153 /* let m = the minimum {non-basic} code point >= n in the input */
154 for (m
= G_MAXUINT
, j
= 0; j
< input_length
; j
++)
156 if (input
[j
] >= n
&& input
[j
] < m
)
160 if (m
- n
> (G_MAXUINT
- delta
) / (handled_chars
+ 1))
162 delta
+= (m
- n
) * (handled_chars
+ 1);
165 for (j
= 0; j
< input_length
; j
++)
172 else if (input
[j
] == n
)
175 for (k
= PUNYCODE_BASE
; ; k
+= PUNYCODE_BASE
)
179 else if (k
>= bias
+ PUNYCODE_TMAX
)
185 digit
= t
+ (q
- t
) % (PUNYCODE_BASE
- t
);
186 g_string_append_c (output
, encode_digit (digit
));
187 q
= (q
- t
) / (PUNYCODE_BASE
- t
);
190 g_string_append_c (output
, encode_digit (q
));
191 bias
= adapt (delta
, handled_chars
+ 1, handled_chars
== num_basic_chars
);
208 /* From RFC 3454, Table B.1 */
209 #define idna_is_junk(ch) ((ch) == 0x00AD || (ch) == 0x1806 || (ch) == 0x200B || (ch) == 0x2060 || (ch) == 0xFEFF || (ch) == 0x034F || (ch) == 0x180B || (ch) == 0x180C || (ch) == 0x180D || (ch) == 0x200C || (ch) == 0x200D || ((ch) >= 0xFE00 && (ch) <= 0xFE0F))
211 /* Scan @str for "junk" and return a cleaned-up string if any junk
212 * is found. Else return %NULL.
215 remove_junk (const gchar
*str
,
218 GString
*cleaned
= NULL
;
222 for (p
= str
; len
== -1 ? *p
: p
< str
+ len
; p
= g_utf8_next_char (p
))
224 ch
= g_utf8_get_char (p
);
225 if (idna_is_junk (ch
))
229 cleaned
= g_string_new (NULL
);
230 g_string_append_len (cleaned
, str
, p
- str
);
234 g_string_append_unichar (cleaned
, ch
);
238 return g_string_free (cleaned
, FALSE
);
243 static inline gboolean
244 contains_uppercase_letters (const gchar
*str
,
249 for (p
= str
; len
== -1 ? *p
: p
< str
+ len
; p
= g_utf8_next_char (p
))
251 if (g_unichar_isupper (g_utf8_get_char (p
)))
257 static inline gboolean
258 contains_non_ascii (const gchar
*str
,
263 for (p
= str
; len
== -1 ? *p
: p
< str
+ len
; p
++)
265 if ((guchar
)*p
> 0x80)
271 /* RFC 3454, Appendix C. ish. */
272 static inline gboolean
273 idna_is_prohibited (gunichar ch
)
275 switch (g_unichar_type (ch
))
277 case G_UNICODE_CONTROL
:
278 case G_UNICODE_FORMAT
:
279 case G_UNICODE_UNASSIGNED
:
280 case G_UNICODE_PRIVATE_USE
:
281 case G_UNICODE_SURROGATE
:
282 case G_UNICODE_LINE_SEPARATOR
:
283 case G_UNICODE_PARAGRAPH_SEPARATOR
:
284 case G_UNICODE_SPACE_SEPARATOR
:
287 case G_UNICODE_OTHER_SYMBOL
:
288 if (ch
== 0xFFFC || ch
== 0xFFFD ||
289 (ch
>= 0x2FF0 && ch
<= 0x2FFB))
293 case G_UNICODE_NON_SPACING_MARK
:
294 if (ch
== 0x0340 || ch
== 0x0341)
303 /* RFC 3491 IDN cleanup algorithm. */
305 nameprep (const gchar
*hostname
,
307 gboolean
*is_unicode
)
309 gchar
*name
, *tmp
= NULL
, *p
;
311 /* It would be nice if we could do this without repeatedly
312 * allocating strings and converting back and forth between
313 * gunichars and UTF-8... The code does at least avoid doing most of
314 * the sub-operations when they would just be equivalent to a
318 /* Remove presentation-only characters */
319 name
= remove_junk (hostname
, len
);
326 name
= (gchar
*)hostname
;
328 /* Convert to lowercase */
329 if (contains_uppercase_letters (name
, len
))
331 name
= g_utf8_strdown (name
, len
);
337 /* If there are no UTF8 characters, we're done. */
338 if (!contains_non_ascii (name
, len
))
341 if (name
== (gchar
*)hostname
)
342 return len
== -1 ? g_strdup (hostname
) : g_strndup (hostname
, len
);
350 name
= g_utf8_normalize (name
, len
, G_NORMALIZE_NFKC
);
357 /* KC normalization may have created more capital letters (eg,
358 * angstrom -> capital A with ring). So we have to lowercasify a
359 * second time. (This is more-or-less how the nameprep algorithm
360 * does it. If tolower(nfkc(tolower(X))) is guaranteed to be the
361 * same as tolower(nfkc(X)), then we could skip the first tolower,
362 * but I'm not sure it is.)
364 if (contains_uppercase_letters (name
, -1))
366 name
= g_utf8_strdown (name
, -1);
371 /* Check for prohibited characters */
372 for (p
= name
; *p
; p
= g_utf8_next_char (p
))
374 if (idna_is_prohibited (g_utf8_get_char (p
)))
382 /* FIXME: We're supposed to verify certain constraints on bidi
383 * characters, but glib does not appear to have that information.
390 /* RFC 3490, section 3.1 says '.', 0x3002, 0xFF0E, and 0xFF61 count as
391 * label-separating dots. @str must be '\0'-terminated.
393 #define idna_is_dot(str) ( \
394 ((guchar)(str)[0] == '.') || \
395 ((guchar)(str)[0] == 0xE3 && (guchar)(str)[1] == 0x80 && (guchar)(str)[2] == 0x82) || \
396 ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBC && (guchar)(str)[2] == 0x8E) || \
397 ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBD && (guchar)(str)[2] == 0xA1) )
400 idna_end_of_label (const gchar
*str
)
402 for (; *str
; str
= g_utf8_next_char (str
))
404 if (idna_is_dot (str
))
411 * g_hostname_to_ascii:
412 * @hostname: a valid UTF-8 or ASCII hostname
414 * Converts @hostname to its canonical ASCII form; an ASCII-only
415 * string containing no uppercase letters and not ending with a
418 * Return value: an ASCII hostname, which must be freed, or %NULL if
419 * @hostname is in some way invalid.
424 g_hostname_to_ascii (const gchar
*hostname
)
426 gchar
*name
, *label
, *p
;
431 label
= name
= nameprep (hostname
, -1, &unicode
);
432 if (!name
|| !unicode
)
435 out
= g_string_new (NULL
);
440 for (p
= label
; *p
&& !idna_is_dot (p
); p
++)
442 if ((guchar
)*p
> 0x80)
450 if (!strncmp (label
, IDNA_ACE_PREFIX
, IDNA_ACE_PREFIX_LEN
))
453 g_string_append (out
, IDNA_ACE_PREFIX
);
454 if (!punycode_encode (label
, llen
, out
))
458 g_string_append_len (out
, label
, llen
);
460 if (out
->len
- oldlen
> 63)
465 label
= g_utf8_next_char (label
);
467 g_string_append_c (out
, '.');
472 return g_string_free (out
, FALSE
);
476 g_string_free (out
, TRUE
);
481 * g_hostname_is_non_ascii:
482 * @hostname: a hostname
484 * Tests if @hostname contains Unicode characters. If this returns
485 * %TRUE, you need to encode the hostname with g_hostname_to_ascii()
486 * before using it in non-IDN-aware contexts.
488 * Note that a hostname might contain a mix of encoded and unencoded
489 * segments, and so it is possible for g_hostname_is_non_ascii() and
490 * g_hostname_is_ascii_encoded() to both return %TRUE for a name.
492 * Return value: %TRUE if @hostname contains any non-ASCII characters
497 g_hostname_is_non_ascii (const gchar
*hostname
)
499 return contains_non_ascii (hostname
, -1);
502 /* Punycode decoder, RFC 3492 section 6.2. As with punycode_encode(),
503 * read the RFC if you want to understand what this is actually doing.
506 punycode_decode (const gchar
*input
,
510 GArray
*output_chars
;
513 guint oldi
, w
, k
, digit
, t
;
516 n
= PUNYCODE_INITIAL_N
;
518 bias
= PUNYCODE_INITIAL_BIAS
;
520 split
= input
+ input_length
- 1;
521 while (split
> input
&& *split
!= '-')
525 output_chars
= g_array_sized_new (FALSE
, FALSE
, sizeof (gunichar
),
527 input_length
-= (split
- input
) + 1;
528 while (input
< split
)
530 gunichar ch
= (gunichar
)*input
++;
531 if (!PUNYCODE_IS_BASIC (ch
))
533 g_array_append_val (output_chars
, ch
);
538 output_chars
= g_array_new (FALSE
, FALSE
, sizeof (gunichar
));
544 for (k
= PUNYCODE_BASE
; ; k
+= PUNYCODE_BASE
)
548 digit
= decode_digit (*input
++);
549 if (digit
>= PUNYCODE_BASE
)
551 if (digit
> (G_MAXUINT
- i
) / w
)
556 else if (k
>= bias
+ PUNYCODE_TMAX
)
562 if (w
> G_MAXUINT
/ (PUNYCODE_BASE
- t
))
564 w
*= (PUNYCODE_BASE
- t
);
567 bias
= adapt (i
- oldi
, output_chars
->len
+ 1, oldi
== 0);
569 if (i
/ (output_chars
->len
+ 1) > G_MAXUINT
- n
)
571 n
+= i
/ (output_chars
->len
+ 1);
572 i
%= (output_chars
->len
+ 1);
574 g_array_insert_val (output_chars
, i
++, n
);
577 for (i
= 0; i
< output_chars
->len
; i
++)
578 g_string_append_unichar (output
, g_array_index (output_chars
, gunichar
, i
));
579 g_array_free (output_chars
, TRUE
);
583 g_array_free (output_chars
, TRUE
);
588 * g_hostname_to_unicode:
589 * @hostname: a valid UTF-8 or ASCII hostname
591 * Converts @hostname to its canonical presentation form; a UTF-8
592 * string in Unicode normalization form C, containing no uppercase
593 * letters, no forbidden characters, and no ASCII-encoded segments,
594 * and not ending with a trailing dot.
596 * Of course if @hostname is not an internationalized hostname, then
597 * the canonical presentation form will be entirely ASCII.
599 * Return value: a UTF-8 hostname, which must be freed, or %NULL if
600 * @hostname is in some way invalid.
605 g_hostname_to_unicode (const gchar
*hostname
)
610 out
= g_string_new (NULL
);
614 llen
= idna_end_of_label (hostname
) - hostname
;
615 if (!g_ascii_strncasecmp (hostname
, IDNA_ACE_PREFIX
, IDNA_ACE_PREFIX_LEN
))
617 hostname
+= IDNA_ACE_PREFIX_LEN
;
618 llen
-= IDNA_ACE_PREFIX_LEN
;
619 if (!punycode_decode (hostname
, llen
, out
))
621 g_string_free (out
, TRUE
);
628 gchar
*canonicalized
= nameprep (hostname
, llen
, &unicode
);
632 g_string_free (out
, TRUE
);
635 g_string_append (out
, canonicalized
);
636 g_free (canonicalized
);
641 hostname
= g_utf8_next_char (hostname
);
643 g_string_append_c (out
, '.');
647 return g_string_free (out
, FALSE
);
651 * g_hostname_is_ascii_encoded:
652 * @hostname: a hostname
654 * Tests if @hostname contains segments with an ASCII-compatible
655 * encoding of an Internationalized Domain Name. If this returns
656 * %TRUE, you should decode the hostname with g_hostname_to_unicode()
657 * before displaying it to the user.
659 * Note that a hostname might contain a mix of encoded and unencoded
660 * segments, and so it is possible for g_hostname_is_non_ascii() and
661 * g_hostname_is_ascii_encoded() to both return %TRUE for a name.
663 * Return value: %TRUE if @hostname contains any ASCII-encoded
669 g_hostname_is_ascii_encoded (const gchar
*hostname
)
673 if (!g_ascii_strncasecmp (hostname
, IDNA_ACE_PREFIX
, IDNA_ACE_PREFIX_LEN
))
675 hostname
= idna_end_of_label (hostname
);
677 hostname
= g_utf8_next_char (hostname
);
684 * g_hostname_is_ip_address:
685 * @hostname: a hostname (or IP address in string form)
687 * Tests if @hostname is the string form of an IPv4 or IPv6 address.
688 * (Eg, "192.168.0.1".)
690 * Return value: %TRUE if @hostname is an IP address
695 g_hostname_is_ip_address (const gchar
*hostname
)
698 gint nsegments
, octet
;
700 /* On Linux we could implement this using inet_pton, but the Windows
701 * equivalent of that requires linking against winsock, so we just
702 * figure this out ourselves. Tested by tests/hostutils.c.
705 p
= (char *)hostname
;
711 /* If it contains a ':', it's an IPv6 address (assuming it's an
712 * IP address at all). This consists of eight ':'-separated
713 * segments, each containing a 1-4 digit hex number, except that
714 * optionally: (a) the last two segments can be replaced by an
715 * IPv4 address, and (b) a single span of 1 to 8 "0000" segments
716 * can be replaced with just "::".
721 while (*p
&& nsegments
< 8)
723 /* Each segment after the first must be preceded by a ':'.
724 * (We also handle half of the "string starts with ::" case
727 if (p
!= (char *)hostname
|| (p
[0] == ':' && p
[1] == ':'))
734 /* If there's another ':', it means we're skipping some segments */
735 if (*p
== ':' && !skipped
)
740 /* Handle the "string ends with ::" case */
747 /* Read the segment, make sure it's valid. */
748 for (end
= p
; g_ascii_isxdigit (*end
); end
++)
750 if (end
== p
|| end
> p
+ 4)
755 if ((nsegments
== 6 && !skipped
) || (nsegments
<= 6 && skipped
))
765 return !*p
&& (nsegments
== 8 || skipped
);
770 /* Parse IPv4: N.N.N.N, where each N <= 255 and doesn't have leading 0s. */
771 for (nsegments
= 0; nsegments
< 4; nsegments
++)
780 /* Check the segment; a little tricker than the IPv6 case since
781 * we can't allow extra leading 0s, and we can't assume that all
782 * strings of valid length are within range.
789 for (end
= p
; g_ascii_isdigit (*end
); end
++)
790 octet
= 10 * octet
+ (*end
- '0');
792 if (end
== p
|| end
> p
+ 3 || octet
> 255)
798 /* If there's nothing left to parse, then it's ok. */