4 static char *rcsid
= "Id: punycode.c,v 1.1.1.1 2003/06/04 00:26:06 marka Exp";
8 * Copyright (c) 2001,2002 Japan Network Information Center.
11 * By using this file, you agree to the terms and conditions set forth bellow.
13 * LICENSE TERMS AND CONDITIONS
15 * The following License Terms and Conditions apply, unless a different
16 * license is obtained from Japan Network Information Center ("JPNIC"),
17 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
18 * Chiyoda-ku, Tokyo 101-0047, Japan.
20 * 1. Use, Modification and Redistribution (including distribution of any
21 * modified or derived work) in source and/or binary forms is permitted
22 * under this License Terms and Conditions.
24 * 2. Redistribution of source code must retain the copyright notices as they
25 * appear in each source code file, this License Terms and Conditions.
27 * 3. Redistribution in binary form must reproduce the Copyright Notice,
28 * this License Terms and Conditions, in the documentation and/or other
29 * materials provided with the distribution. For the purposes of binary
30 * distribution the "Copyright Notice" refers to the following language:
31 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
33 * 4. The name of JPNIC may not be used to endorse or promote products
34 * derived from this Software without specific prior written approval of
37 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
40 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
41 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
44 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
45 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
46 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
56 #include <idn/result.h>
57 #include <idn/assert.h>
58 #include <idn/logmacro.h>
59 #include <idn/converter.h>
61 #include <idn/debug.h>
62 #include <idn/punycode.h>
66 * Although draft-ietf-idn-punycode-00.txt doesn't specify the ACE
67 * signature, we have to choose one. In order to prevent the converted
68 * name from beginning with a hyphen, we should choose a prefix rather
71 #ifndef IDN_PUNYCODE_PREFIX
72 #define IDN_PUNYCODE_PREFIX "xn--"
75 #define INVALID_UCS 0x80000000
76 #define MAX_UCS 0x10FFFF
79 * As the draft states, it is possible that `delta' may overflow during
80 * the encoding. The upper bound of 'delta' is:
81 * <# of chars. of input string> + <max. difference in code point> *
82 * <# of chars. of input string + 1>
83 * For this value not to be greater than 0xffffffff (since the calculation
84 * is done using unsigned long, which is at least 32bit long), the maxmum
85 * input string size is about 3850 characters, which is long enough for
88 #define PUNYCODE_MAXINPUT 3800
93 #define PUNYCODE_BASE 36
94 #define PUNYCODE_TMIN 1
95 #define PUNYCODE_TMAX 26
96 #define PUNYCODE_SKEW 38
97 #define PUNYCODE_DAMP 700
98 #define PUNYCODE_INITIAL_BIAS 72
99 #define PUNYCODE_INITIAL_N 0x80
101 static int punycode_getwc(const char *s
, size_t len
,
102 int bias
, unsigned long *vp
);
103 static int punycode_putwc(char *s
, size_t len
,
104 unsigned long delta
, int bias
);
105 static int punycode_update_bias(unsigned long delta
,
106 size_t npoints
, int first
);
109 idn__punycode_decode(idn_converter_t ctx
, void *privdata
,
110 const char *from
, unsigned long *to
, size_t tolen
) {
111 unsigned long *to_org
= to
;
112 unsigned long c
, idx
;
113 size_t prefixlen
= strlen(IDN_PUNYCODE_PREFIX
);
115 size_t uidx
, fidx
, ucslen
;
121 TRACE(("idn__punycode_decode(from=\"%s\", tolen=%d)\n",
122 idn__debug_xstring(from
, 50), (int)tolen
));
124 if (!idn__util_asciihaveaceprefix(from
, IDN_PUNYCODE_PREFIX
)) {
126 r
= idn_ucs4_utf8toucs4(from
, to
, tolen
);
129 r
= idn_invalid_encoding
;
133 fromlen
= strlen(from
);
136 * Find the last delimiter, and copy the characters
137 * before it verbatim.
140 for (fidx
= fromlen
; fidx
> 0; fidx
--) {
141 if (from
[fidx
- 1] == '-') {
143 r
= idn_buffer_overflow
;
146 for (uidx
= 0; uidx
< fidx
- 1; uidx
++) {
147 to
[uidx
] = from
[uidx
];
155 bias
= PUNYCODE_INITIAL_BIAS
;
156 c
= PUNYCODE_INITIAL_N
;
158 while (fidx
< fromlen
) {
163 len
= punycode_getwc(from
+ fidx
, fromlen
- fidx
, bias
, &delta
);
165 r
= idn_invalid_encoding
;
170 bias
= punycode_update_bias(delta
, ucslen
+ 1, first
);
173 c
+= idx
/ (ucslen
+ 1);
174 uidx
= idx
% (ucslen
+ 1);
176 /* Insert 'c' at uidx. */
178 r
= idn_buffer_overflow
;
181 for (i
= ucslen
; i
> uidx
; i
--)
189 /* Terminate with NUL. */
191 r
= idn_buffer_overflow
;
198 if (r
== idn_success
) {
199 TRACE(("idn__punycode_decode(): succcess (to=\"%s\")\n",
200 idn__debug_ucs4xstring(to_org
, 50)));
202 TRACE(("idn__punycode_decode(): %s\n", idn_result_tostring(r
)));
208 idn__punycode_encode(idn_converter_t ctx
, void *privdata
,
209 const unsigned long *from
, char *to
, size_t tolen
) {
211 unsigned long cur_code
, next_code
, delta
;
212 size_t prefixlen
= strlen(IDN_PUNYCODE_PREFIX
);
216 int uidx
, bias
, first
;
221 TRACE(("idn__punycode_encode(from=\"%s\", tolen=%d)\n",
222 idn__debug_ucs4xstring(from
, 50), (int)tolen
));
225 r
= idn_ucs4_ucs4toutf8(from
, to
, tolen
);
227 } else if (idn__util_ucs4haveaceprefix(from
, IDN_PUNYCODE_PREFIX
)) {
232 if (tolen
< prefixlen
) {
233 r
= idn_buffer_overflow
;
236 memcpy(to
, IDN_PUNYCODE_PREFIX
, prefixlen
);
240 fromlen
= idn_ucs4_strlen(from
);
243 * If the input string is too long (actually too long to be sane),
244 * return failure in order to prevent possible overflow.
246 if (fromlen
> PUNYCODE_MAXINPUT
) {
247 ERROR(("idn__punycode_encode(): "
248 "the input string is too long to convert Punycode\n",
249 idn__debug_ucs4xstring(from
, 50)));
254 ucsdone
= 0; /* number of characters processed */
258 * First, pick up basic code points and copy them to 'to'.
260 for (uidx
= 0; uidx
< fromlen
; uidx
++) {
261 if (from
[uidx
] < 0x80) {
262 if (toidx
>= tolen
) {
263 r
= idn_buffer_overflow
;
266 to
[toidx
++] = from
[uidx
];
272 * If there are any basic code points, output a delimiter
276 if (toidx
>= tolen
) {
277 r
= idn_buffer_overflow
;
286 * Then encode non-basic characters.
289 cur_code
= PUNYCODE_INITIAL_N
;
290 bias
= PUNYCODE_INITIAL_BIAS
;
292 while (ucsdone
< fromlen
) {
293 int limit
= -1, rest
;
296 * Find the smallest code point equal to or greater
297 * than 'cur_code'. Also remember the index of the
298 * last occurence of the code point.
300 for (next_code
= MAX_UCS
, uidx
= fromlen
- 1;
302 if (from
[uidx
] >= cur_code
&& from
[uidx
] < next_code
) {
303 next_code
= from
[uidx
];
307 /* There must be such code point. */
310 delta
+= (next_code
- cur_code
) * (ucsdone
+ 1);
311 cur_code
= next_code
;
314 * Scan the input string again, and encode characters
315 * whose code point is 'cur_code'. Use 'limit' to avoid
318 for (uidx
= 0, rest
= ucsdone
; uidx
<= limit
; uidx
++) {
319 if (from
[uidx
] < cur_code
) {
322 } else if (from
[uidx
] == cur_code
) {
323 int sz
= punycode_putwc(to
, tolen
, delta
, bias
);
325 r
= idn_buffer_overflow
;
331 bias
= punycode_update_bias(delta
, ucsdone
,
342 * Terminate with NUL.
345 r
= idn_buffer_overflow
;
352 if (r
== idn_success
) {
353 TRACE(("idn__punycode_encode(): succcess (to=\"%s\")\n",
354 idn__debug_xstring(to_org
, 50)));
356 TRACE(("idn__punycode_encode(): %s\n", idn_result_tostring(r
)));
362 punycode_getwc(const char *s
, size_t len
, int bias
, unsigned long *vp
) {
364 unsigned long v
= 0, w
= 1;
367 for (k
= PUNYCODE_BASE
- bias
; len
> 0; k
+= PUNYCODE_BASE
) {
369 int t
= (k
< PUNYCODE_TMIN
) ? PUNYCODE_TMIN
:
370 (k
> PUNYCODE_TMAX
) ? PUNYCODE_TMAX
: k
;
373 if ('a' <= c
&& c
<= 'z')
375 else if ('A' <= c
&& c
<= 'Z')
377 else if ('0' <= c
&& c
<= '9')
383 return (0); /* invalid character */
389 return (orglen
- len
);
392 w
*= (PUNYCODE_BASE
- t
);
395 return (0); /* final character missing */
399 punycode_putwc(char *s
, size_t len
, unsigned long delta
, int bias
) {
400 const char *punycode_base36
= "abcdefghijklmnopqrstuvwxyz0123456789";
404 for (k
= PUNYCODE_BASE
- bias
; 1; k
+= PUNYCODE_BASE
) {
405 int t
= (k
< PUNYCODE_TMIN
) ? PUNYCODE_TMIN
:
406 (k
> PUNYCODE_TMAX
) ? PUNYCODE_TMAX
: k
;
412 *s
++ = punycode_base36
[t
+ ((delta
- t
) % (PUNYCODE_BASE
- t
))];
414 delta
= (delta
- t
) / (PUNYCODE_BASE
- t
);
418 *s
++ = punycode_base36
[delta
];
423 punycode_update_bias(unsigned long delta
, size_t npoints
, int first
) {
426 delta
/= first
? PUNYCODE_DAMP
: 2;
427 delta
+= delta
/ npoints
;
429 while (delta
> ((PUNYCODE_BASE
- PUNYCODE_TMIN
) * PUNYCODE_TMAX
) / 2) {
430 delta
/= PUNYCODE_BASE
- PUNYCODE_TMIN
;
433 return (PUNYCODE_BASE
* k
+
434 (((PUNYCODE_BASE
- PUNYCODE_TMIN
+ 1) * delta
) /
435 (delta
+ PUNYCODE_SKEW
)));