/* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*- */

/* GLIB - Library of useful routines for C programming
 * Copyright (C) 2008 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General
 * Public License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
#include <string.h>

#include "ghostutils.h"

#include "gstrfuncs.h"
/**
 * SECTION:ghostutils
 * @short_description: Internet hostname utilities
 *
 * Functions for manipulating internet hostnames; in particular, for
 * converting between Unicode and ASCII-encoded forms of
 * Internationalized Domain Names (IDNs).
 *
 * The
 * [Internationalized Domain Names for Applications (IDNA)](http://www.ietf.org/rfc/rfc3490.txt)
 * standards allow for the use
 * of Unicode domain names in applications, while providing
 * backward-compatibility with the old ASCII-only DNS, by defining an
 * ASCII-Compatible Encoding of any given Unicode name, which can be
 * used with non-IDN-aware applications and protocols. (For example,
 * "Παν語.org" maps to "xn--4wa8awb4637h.org".)
 **/
/* Prefix that marks an ASCII-Compatible-Encoding (Punycode) label, and
 * its length in bytes (not counting the terminating NUL).
 */
#define IDNA_ACE_PREFIX     "xn--"
#define IDNA_ACE_PREFIX_LEN 4

/* Punycode constants, from RFC 3492. */

#define PUNYCODE_BASE          36
#define PUNYCODE_TMIN           1
#define PUNYCODE_TMAX          26
#define PUNYCODE_SKEW          38
#define PUNYCODE_DAMP         700
#define PUNYCODE_INITIAL_BIAS  72
#define PUNYCODE_INITIAL_N   0x80

/* A "basic" code point is one in the ASCII range (below 0x80). */
#define PUNYCODE_IS_BASIC(cp) ((guint)(cp) < 0x80)
66 /* Encode/decode a single base-36 digit */
68 encode_digit (guint dig
)
73 return dig
- 26 + '0';
77 decode_digit (gchar dig
)
79 if (dig
>= 'A' && dig
<= 'Z')
81 else if (dig
>= 'a' && dig
<= 'z')
83 else if (dig
>= '0' && dig
<= '9')
84 return dig
- '0' + 26;
89 /* Punycode bias adaptation algorithm, RFC 3492 section 6.1 */
97 delta
= firsttime
? delta
/ PUNYCODE_DAMP
: delta
/ 2;
98 delta
+= delta
/ numpoints
;
101 while (delta
> ((PUNYCODE_BASE
- PUNYCODE_TMIN
) * PUNYCODE_TMAX
) / 2)
103 delta
/= PUNYCODE_BASE
- PUNYCODE_TMIN
;
107 return k
+ ((PUNYCODE_BASE
- PUNYCODE_TMIN
+ 1) * delta
/
108 (delta
+ PUNYCODE_SKEW
));
111 /* Punycode encoder, RFC 3492 section 6.3. The algorithm is
112 * sufficiently bizarre that it's not really worth trying to explain
116 punycode_encode (const gchar
*input_utf8
,
117 gsize input_utf8_length
,
120 guint delta
, handled_chars
, num_basic_chars
, bias
, j
, q
, k
, t
, digit
;
121 gunichar n
, m
, *input
;
123 gboolean success
= FALSE
;
125 /* Convert from UTF-8 to Unicode code points */
126 input
= g_utf8_to_ucs4 (input_utf8
, input_utf8_length
, NULL
,
127 &input_length
, NULL
);
131 /* Copy basic chars */
132 for (j
= num_basic_chars
= 0; j
< input_length
; j
++)
134 if (PUNYCODE_IS_BASIC (input
[j
]))
136 g_string_append_c (output
, g_ascii_tolower (input
[j
]));
141 g_string_append_c (output
, '-');
143 handled_chars
= num_basic_chars
;
145 /* Encode non-basic chars */
147 bias
= PUNYCODE_INITIAL_BIAS
;
148 n
= PUNYCODE_INITIAL_N
;
149 while (handled_chars
< input_length
)
151 /* let m = the minimum {non-basic} code point >= n in the input */
152 for (m
= G_MAXUINT
, j
= 0; j
< input_length
; j
++)
154 if (input
[j
] >= n
&& input
[j
] < m
)
158 if (m
- n
> (G_MAXUINT
- delta
) / (handled_chars
+ 1))
160 delta
+= (m
- n
) * (handled_chars
+ 1);
163 for (j
= 0; j
< input_length
; j
++)
170 else if (input
[j
] == n
)
173 for (k
= PUNYCODE_BASE
; ; k
+= PUNYCODE_BASE
)
177 else if (k
>= bias
+ PUNYCODE_TMAX
)
183 digit
= t
+ (q
- t
) % (PUNYCODE_BASE
- t
);
184 g_string_append_c (output
, encode_digit (digit
));
185 q
= (q
- t
) / (PUNYCODE_BASE
- t
);
188 g_string_append_c (output
, encode_digit (q
));
189 bias
= adapt (delta
, handled_chars
+ 1, handled_chars
== num_basic_chars
);
/* From RFC 3454, Table B.1
 *
 * Evaluates to non-zero if code point @ch is one of the listed
 * characters. Note that @ch is evaluated multiple times, so it must
 * not have side effects.
 */
#define idna_is_junk(ch) ((ch) == 0x00AD || (ch) == 0x1806 || \
                          (ch) == 0x200B || (ch) == 0x2060 || \
                          (ch) == 0xFEFF || (ch) == 0x034F || \
                          (ch) == 0x180B || (ch) == 0x180C || \
                          (ch) == 0x180D || (ch) == 0x200C || \
                          (ch) == 0x200D || \
                          ((ch) >= 0xFE00 && (ch) <= 0xFE0F))
209 /* Scan @str for "junk" and return a cleaned-up string if any junk
210 * is found. Else return %NULL.
213 remove_junk (const gchar
*str
,
216 GString
*cleaned
= NULL
;
220 for (p
= str
; len
== -1 ? *p
: p
< str
+ len
; p
= g_utf8_next_char (p
))
222 ch
= g_utf8_get_char (p
);
223 if (idna_is_junk (ch
))
227 cleaned
= g_string_new (NULL
);
228 g_string_append_len (cleaned
, str
, p
- str
);
232 g_string_append_unichar (cleaned
, ch
);
236 return g_string_free (cleaned
, FALSE
);
241 static inline gboolean
242 contains_uppercase_letters (const gchar
*str
,
247 for (p
= str
; len
== -1 ? *p
: p
< str
+ len
; p
= g_utf8_next_char (p
))
249 if (g_unichar_isupper (g_utf8_get_char (p
)))
255 static inline gboolean
256 contains_non_ascii (const gchar
*str
,
261 for (p
= str
; len
== -1 ? *p
: p
< str
+ len
; p
++)
263 if ((guchar
)*p
> 0x80)
269 /* RFC 3454, Appendix C. ish. */
270 static inline gboolean
271 idna_is_prohibited (gunichar ch
)
273 switch (g_unichar_type (ch
))
275 case G_UNICODE_CONTROL
:
276 case G_UNICODE_FORMAT
:
277 case G_UNICODE_UNASSIGNED
:
278 case G_UNICODE_PRIVATE_USE
:
279 case G_UNICODE_SURROGATE
:
280 case G_UNICODE_LINE_SEPARATOR
:
281 case G_UNICODE_PARAGRAPH_SEPARATOR
:
282 case G_UNICODE_SPACE_SEPARATOR
:
285 case G_UNICODE_OTHER_SYMBOL
:
286 if (ch
== 0xFFFC || ch
== 0xFFFD ||
287 (ch
>= 0x2FF0 && ch
<= 0x2FFB))
291 case G_UNICODE_NON_SPACING_MARK
:
292 if (ch
== 0x0340 || ch
== 0x0341)
301 /* RFC 3491 IDN cleanup algorithm. */
303 nameprep (const gchar
*hostname
,
305 gboolean
*is_unicode
)
307 gchar
*name
, *tmp
= NULL
, *p
;
309 /* It would be nice if we could do this without repeatedly
310 * allocating strings and converting back and forth between
311 * gunichars and UTF-8... The code does at least avoid doing most of
312 * the sub-operations when they would just be equivalent to a
316 /* Remove presentation-only characters */
317 name
= remove_junk (hostname
, len
);
324 name
= (gchar
*)hostname
;
326 /* Convert to lowercase */
327 if (contains_uppercase_letters (name
, len
))
329 name
= g_utf8_strdown (name
, len
);
335 /* If there are no UTF8 characters, we're done. */
336 if (!contains_non_ascii (name
, len
))
339 if (name
== (gchar
*)hostname
)
340 return len
== -1 ? g_strdup (hostname
) : g_strndup (hostname
, len
);
348 name
= g_utf8_normalize (name
, len
, G_NORMALIZE_NFKC
);
355 /* KC normalization may have created more capital letters (eg,
356 * angstrom -> capital A with ring). So we have to lowercasify a
357 * second time. (This is more-or-less how the nameprep algorithm
358 * does it. If tolower(nfkc(tolower(X))) is guaranteed to be the
359 * same as tolower(nfkc(X)), then we could skip the first tolower,
360 * but I'm not sure it is.)
362 if (contains_uppercase_letters (name
, -1))
364 name
= g_utf8_strdown (name
, -1);
369 /* Check for prohibited characters */
370 for (p
= name
; *p
; p
= g_utf8_next_char (p
))
372 if (idna_is_prohibited (g_utf8_get_char (p
)))
380 /* FIXME: We're supposed to verify certain constraints on bidi
381 * characters, but glib does not appear to have that information.
/* RFC 3490, section 3.1 says '.', 0x3002, 0xFF0E, and 0xFF61 count as
 * label-separating dots. @str must be '\0'-terminated.
 *
 * The three multi-byte alternatives are the UTF-8 encodings of those
 * code points; the && chains stop at the first mismatching byte, so
 * the macro never reads past the terminating NUL.
 */
#define idna_is_dot(str) ( \
  ((guchar)(str)[0] == '.') ||                                                 \
  ((guchar)(str)[0] == 0xE3 && (guchar)(str)[1] == 0x80 && (guchar)(str)[2] == 0x82) || \
  ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBC && (guchar)(str)[2] == 0x8E) || \
  ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBD && (guchar)(str)[2] == 0xA1) )
398 idna_end_of_label (const gchar
*str
)
400 for (; *str
; str
= g_utf8_next_char (str
))
402 if (idna_is_dot (str
))
409 * g_hostname_to_ascii:
410 * @hostname: a valid UTF-8 or ASCII hostname
412 * Converts @hostname to its canonical ASCII form; an ASCII-only
413 * string containing no uppercase letters and not ending with a
416 * Returns: an ASCII hostname, which must be freed, or %NULL if
417 * @hostname is in some way invalid.
422 g_hostname_to_ascii (const gchar
*hostname
)
424 gchar
*name
, *label
, *p
;
429 label
= name
= nameprep (hostname
, -1, &unicode
);
430 if (!name
|| !unicode
)
433 out
= g_string_new (NULL
);
438 for (p
= label
; *p
&& !idna_is_dot (p
); p
++)
440 if ((guchar
)*p
> 0x80)
448 if (!strncmp (label
, IDNA_ACE_PREFIX
, IDNA_ACE_PREFIX_LEN
))
451 g_string_append (out
, IDNA_ACE_PREFIX
);
452 if (!punycode_encode (label
, llen
, out
))
456 g_string_append_len (out
, label
, llen
);
458 if (out
->len
- oldlen
> 63)
463 label
= g_utf8_next_char (label
);
465 g_string_append_c (out
, '.');
470 return g_string_free (out
, FALSE
);
474 g_string_free (out
, TRUE
);
479 * g_hostname_is_non_ascii:
480 * @hostname: a hostname
482 * Tests if @hostname contains Unicode characters. If this returns
483 * %TRUE, you need to encode the hostname with g_hostname_to_ascii()
484 * before using it in non-IDN-aware contexts.
486 * Note that a hostname might contain a mix of encoded and unencoded
487 * segments, and so it is possible for g_hostname_is_non_ascii() and
488 * g_hostname_is_ascii_encoded() to both return %TRUE for a name.
490 * Returns: %TRUE if @hostname contains any non-ASCII characters
495 g_hostname_is_non_ascii (const gchar
*hostname
)
497 return contains_non_ascii (hostname
, -1);
500 /* Punycode decoder, RFC 3492 section 6.2. As with punycode_encode(),
501 * read the RFC if you want to understand what this is actually doing.
504 punycode_decode (const gchar
*input
,
508 GArray
*output_chars
;
511 guint oldi
, w
, k
, digit
, t
;
514 n
= PUNYCODE_INITIAL_N
;
516 bias
= PUNYCODE_INITIAL_BIAS
;
518 split
= input
+ input_length
- 1;
519 while (split
> input
&& *split
!= '-')
523 output_chars
= g_array_sized_new (FALSE
, FALSE
, sizeof (gunichar
),
525 input_length
-= (split
- input
) + 1;
526 while (input
< split
)
528 gunichar ch
= (gunichar
)*input
++;
529 if (!PUNYCODE_IS_BASIC (ch
))
531 g_array_append_val (output_chars
, ch
);
536 output_chars
= g_array_new (FALSE
, FALSE
, sizeof (gunichar
));
542 for (k
= PUNYCODE_BASE
; ; k
+= PUNYCODE_BASE
)
546 digit
= decode_digit (*input
++);
547 if (digit
>= PUNYCODE_BASE
)
549 if (digit
> (G_MAXUINT
- i
) / w
)
554 else if (k
>= bias
+ PUNYCODE_TMAX
)
560 if (w
> G_MAXUINT
/ (PUNYCODE_BASE
- t
))
562 w
*= (PUNYCODE_BASE
- t
);
565 bias
= adapt (i
- oldi
, output_chars
->len
+ 1, oldi
== 0);
567 if (i
/ (output_chars
->len
+ 1) > G_MAXUINT
- n
)
569 n
+= i
/ (output_chars
->len
+ 1);
570 i
%= (output_chars
->len
+ 1);
572 g_array_insert_val (output_chars
, i
++, n
);
575 for (i
= 0; i
< output_chars
->len
; i
++)
576 g_string_append_unichar (output
, g_array_index (output_chars
, gunichar
, i
));
577 g_array_free (output_chars
, TRUE
);
581 g_array_free (output_chars
, TRUE
);
586 * g_hostname_to_unicode:
587 * @hostname: a valid UTF-8 or ASCII hostname
589 * Converts @hostname to its canonical presentation form; a UTF-8
590 * string in Unicode normalization form C, containing no uppercase
591 * letters, no forbidden characters, and no ASCII-encoded segments,
592 * and not ending with a trailing dot.
594 * Of course if @hostname is not an internationalized hostname, then
595 * the canonical presentation form will be entirely ASCII.
597 * Returns: a UTF-8 hostname, which must be freed, or %NULL if
598 * @hostname is in some way invalid.
603 g_hostname_to_unicode (const gchar
*hostname
)
608 out
= g_string_new (NULL
);
612 llen
= idna_end_of_label (hostname
) - hostname
;
613 if (!g_ascii_strncasecmp (hostname
, IDNA_ACE_PREFIX
, IDNA_ACE_PREFIX_LEN
))
615 hostname
+= IDNA_ACE_PREFIX_LEN
;
616 llen
-= IDNA_ACE_PREFIX_LEN
;
617 if (!punycode_decode (hostname
, llen
, out
))
619 g_string_free (out
, TRUE
);
626 gchar
*canonicalized
= nameprep (hostname
, llen
, &unicode
);
630 g_string_free (out
, TRUE
);
633 g_string_append (out
, canonicalized
);
634 g_free (canonicalized
);
639 hostname
= g_utf8_next_char (hostname
);
641 g_string_append_c (out
, '.');
645 return g_string_free (out
, FALSE
);
649 * g_hostname_is_ascii_encoded:
650 * @hostname: a hostname
652 * Tests if @hostname contains segments with an ASCII-compatible
653 * encoding of an Internationalized Domain Name. If this returns
654 * %TRUE, you should decode the hostname with g_hostname_to_unicode()
655 * before displaying it to the user.
657 * Note that a hostname might contain a mix of encoded and unencoded
658 * segments, and so it is possible for g_hostname_is_non_ascii() and
659 * g_hostname_is_ascii_encoded() to both return %TRUE for a name.
661 * Returns: %TRUE if @hostname contains any ASCII-encoded
667 g_hostname_is_ascii_encoded (const gchar
*hostname
)
671 if (!g_ascii_strncasecmp (hostname
, IDNA_ACE_PREFIX
, IDNA_ACE_PREFIX_LEN
))
673 hostname
= idna_end_of_label (hostname
);
675 hostname
= g_utf8_next_char (hostname
);
682 * g_hostname_is_ip_address:
683 * @hostname: a hostname (or IP address in string form)
685 * Tests if @hostname is the string form of an IPv4 or IPv6 address.
686 * (Eg, "192.168.0.1".)
688 * Returns: %TRUE if @hostname is an IP address
693 g_hostname_is_ip_address (const gchar
*hostname
)
696 gint nsegments
, octet
;
698 /* On Linux we could implement this using inet_pton, but the Windows
699 * equivalent of that requires linking against winsock, so we just
700 * figure this out ourselves. Tested by tests/hostutils.c.
703 p
= (char *)hostname
;
709 /* If it contains a ':', it's an IPv6 address (assuming it's an
710 * IP address at all). This consists of eight ':'-separated
711 * segments, each containing a 1-4 digit hex number, except that
712 * optionally: (a) the last two segments can be replaced by an
713 * IPv4 address, and (b) a single span of 1 to 8 "0000" segments
714 * can be replaced with just "::".
719 while (*p
&& nsegments
< 8)
721 /* Each segment after the first must be preceded by a ':'.
722 * (We also handle half of the "string starts with ::" case
725 if (p
!= (char *)hostname
|| (p
[0] == ':' && p
[1] == ':'))
732 /* If there's another ':', it means we're skipping some segments */
733 if (*p
== ':' && !skipped
)
738 /* Handle the "string ends with ::" case */
745 /* Read the segment, make sure it's valid. */
746 for (end
= p
; g_ascii_isxdigit (*end
); end
++)
748 if (end
== p
|| end
> p
+ 4)
753 if ((nsegments
== 6 && !skipped
) || (nsegments
<= 6 && skipped
))
763 return !*p
&& (nsegments
== 8 || skipped
);
768 /* Parse IPv4: N.N.N.N, where each N <= 255 and doesn't have leading 0s. */
769 for (nsegments
= 0; nsegments
< 4; nsegments
++)
778 /* Check the segment; a little tricker than the IPv6 case since
779 * we can't allow extra leading 0s, and we can't assume that all
780 * strings of valid length are within range.
787 for (end
= p
; g_ascii_isdigit (*end
); end
++)
789 octet
= 10 * octet
+ (*end
- '0');
795 if (end
== p
|| end
> p
+ 3 || octet
> 255)
801 /* If there's nothing left to parse, then it's ok. */