/* Revision-control identification string embedded in the object file. */
4 static char *rcsid
= "Id: unormalize.c,v 1.1.1.1 2003/06/04 00:26:43 marka Exp";
8 * Copyright (c) 2000,2001,2002 Japan Network Information Center.
11 * By using this file, you agree to the terms and conditions set forth bellow.
13 * LICENSE TERMS AND CONDITIONS
15 * The following License Terms and Conditions apply, unless a different
16 * license is obtained from Japan Network Information Center ("JPNIC"),
17 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
18 * Chiyoda-ku, Tokyo 101-0047, Japan.
20 * 1. Use, Modification and Redistribution (including distribution of any
21 * modified or derived work) in source and/or binary forms is permitted
22 * under this License Terms and Conditions.
24 * 2. Redistribution of source code must retain the copyright notices as they
25 * appear in each source code file, this License Terms and Conditions.
27 * 3. Redistribution in binary form must reproduce the Copyright Notice,
28 * this License Terms and Conditions, in the documentation and/or other
29 * materials provided with the distribution. For the purposes of binary
30 * distribution the "Copyright Notice" refers to the following language:
31 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
33 * 4. The name of JPNIC may not be used to endorse or promote products
34 * derived from this Software without specific prior written approval of
37 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
38 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
39 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
40 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
41 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
42 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
44 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
45 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
46 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
56 #include <idn/result.h>
57 #include <idn/assert.h>
58 #include <idn/logmacro.h>
60 #include <idn/unicode.h>
61 #include <idn/unormalize.h>
62 #include <idn/debug.h>
/*
 * Fallback for platforms that lack memmove(): when HAVE_MEMMOVE is not
 * defined but HAVE_BCOPY is, memmove(a, b, c) is mapped onto bcopy().
 * The two pointer arguments are passed to bcopy() in swapped order
 * (b first, then a) and the length is cast to int.
 */
64 #if !defined(HAVE_MEMMOVE) && defined(HAVE_BCOPY)
65 #define memmove(a,b,c) bcopy((char *)(b),(char *)(a),(int)(c))
/*
 * WORKBUF_SIZE is the element count of the working buffer's embedded
 * arrays (ucs4_buf / class_buf) and the initial value of its `size' field.
 * WORKBUF_SIZE_MAX is the limit decompose() checks `size' against after
 * extending the buffer; exceeding it is reported as idn_nomemory.
 */
68 #define WORKBUF_SIZE 128
69 #define WORKBUF_SIZE_MAX 10000
/*
 * Fields of the working buffer that the helpers below access through
 * `workbuf_t *wb'.  `ucs4' and `class' are parallel arrays: entry i of
 * `class' belongs to the character stored in entry i of `ucs4'.
 * NOTE(review): the enclosing struct/typedef lines are not visible in this
 * view -- presumably this is the body of the workbuf_t definition.
 */
72 idn__unicode_version_t version
; /* Unicode version handed to the idn__unicode_* calls */
73 int cur
; /* index of the character currently being processed */
74 int last
; /* index just past the last stored character */
75 int size
; /* capacity, in elements, of the ucs4 and class arrays */
76 unsigned long *ucs4
; /* UCS-4 characters (ucs4_buf until the buffer is extended) */
77 int *class; /* canonical class of each character; -1 marks a voided slot */
78 unsigned long ucs4_buf
[WORKBUF_SIZE
]; /* embedded initial storage for ucs4 */
79 int class_buf
[WORKBUF_SIZE
]; /* embedded initial storage for class */
/*
 * Forward declarations of the file-local (static) helpers:
 *
 *   normalize          driver taking the do_composition and compat flags
 *   decompose          add one character (or its decomposition) to the buffer
 *   get_class          fill in canonical classes for entries cur..last-1
 *   reorder            move the entry at cur into place by canonical class
 *   compose            combine following characters into the top character
 *   flush_before_cur   copy entries before cur to the output and drop them
 *   workbuf_*          buffer set-up, release, growth, append, shift and
 *                      removal of voided entries
 */
82 static idn_result_t
normalize(idn__unicode_version_t version
,
83 int do_composition
, int compat
,
84 const unsigned long *from
,
85 unsigned long *to
, size_t tolen
);
86 static idn_result_t
decompose(workbuf_t
*wb
, unsigned long c
, int compat
);
87 static void get_class(workbuf_t
*wb
);
88 static void reorder(workbuf_t
*wb
);
89 static void compose(workbuf_t
*wb
);
90 static idn_result_t
flush_before_cur(workbuf_t
*wb
,
91 unsigned long **top
, size_t *tolenp
);
92 static void workbuf_init(workbuf_t
*wb
);
93 static void workbuf_free(workbuf_t
*wb
);
94 static idn_result_t
workbuf_extend(workbuf_t
*wb
);
95 static idn_result_t
workbuf_append(workbuf_t
*wb
, unsigned long c
);
96 static void workbuf_shift(workbuf_t
*wb
, int shift
);
97 static void workbuf_removevoid(workbuf_t
*wb
);
/*
 * idn__unormalize_formkc -- normalize FROM into TO with both the
 * compatibility flag and the composition flag switched on.
 *
 * Asserts its arguments, emits a TRACE line (FROM rendered through
 * idn__debug_ucs4xstring() with 50 as its second argument, plus TOLEN),
 * and returns whatever normalize(version, 1, 1, from, to, tolen) returns,
 * i.e. do_composition = 1 and compat = 1.
 *
 * NOTE(review): the line declaring `tolen' is not visible in this view.
 * If it is size_t, as in normalize()'s prototype, then `tolen >= 0' is
 * always true and printing it with "%d" is a format/argument mismatch --
 * confirm and consider a cast at the TRACE call.
 */
100 idn__unormalize_formkc(idn__unicode_version_t version
,
101 const unsigned long *from
, unsigned long *to
,
103 assert(version
!= NULL
&& from
!= NULL
&& to
!= NULL
&& tolen
>= 0);
104 TRACE(("idn__unormalize_formkc(from=\"%s\", tolen=%d)\n",
105 idn__debug_ucs4xstring(from
, 50), tolen
));
106 return (normalize(version
, 1, 1, from
, to
, tolen
));
/*
 * normalize -- common driver behind the idn__unormalize_form* entry points.
 *
 * version         Unicode version, stored in the working buffer and passed
 *                 on to the idn__unicode_* helpers.
 * do_composition  non-zero enables the composition step.
 * compat          forwarded unchanged to decompose().
 * from            input characters, read until a '\0' element is found.
 * to, tolen       output buffer and its remaining capacity; both are handed
 *                 by address to flush_before_cur().
 *
 * Returns an idn_result_t: idn_success unless a helper reports an error or
 * the function itself sets idn_buffer_overflow.
 *
 * NOTE(review): many statements of this function (working-buffer set-up,
 * fetching `c' from `from', the get_class()/reorder()/compose() calls, the
 * clean-up path) are not visible in this view; the comments added below
 * describe only the lines that are.
 */
110 normalize(idn__unicode_version_t version
, int do_composition
, int compat
,
111 const unsigned long *from
, unsigned long *to
, size_t tolen
) {
113 idn_result_t r
= idn_success
;
116 * Initialize working buffer.
119 wb
.version
= version
;
/* One iteration per input character, until the terminating '\0'. */
121 while (*from
!= '\0') {
/* Everything already in the buffer has been processed at this point. */
124 assert(wb
.cur
== wb
.last
);
127 * Get one character from 'from'.
/* Add `c' (or its decomposition) to the tail of the working buffer. */
134 if ((r
= decompose(&wb
, c
, compat
)) != idn_success
)
138 * Get canonical class.
/* Walk CUR over the entries that decompose() just added. */
145 for (; wb
.cur
< wb
.last
; wb
.cur
++) {
/* A canonical class greater than zero marks a non-starter. */
148 } else if (wb
.class[wb
.cur
] > 0) {
150 * This is not a starter. Try reordering.
151 * Note that characters up to it are
152 * already in canonical order.
159 * This is a starter character, and there are
160 * some characters before it. Those characters
161 * have been reordered properly, and
162 * ready for composition.
/* Composition is attempted only when the first buffered entry is a starter. */
164 if (do_composition
&& wb
.class[0] == 0)
168 * If CUR points to a starter character,
169 * then process of characters before CUR are
170 * already finished, because any further
171 * reordering/composition for them are blocked
172 * by the starter CUR points.
174 if (wb
.cur
> 0 && wb
.class[wb
.cur
] == 0) {
175 /* Flush everything before CUR. */
176 r
= flush_before_cur(&wb
, &to
, &tolen
);
177 if (r
!= idn_success
)
/* End of input: handle whatever is still buffered, if no error so far. */
183 if (r
== idn_success
) {
184 if (do_composition
&& wb
.cur
> 0 && wb
.class[0] == 0) {
186 * There is some characters left in WB.
187 * They are ordered, but not composed yet.
188 * Now CUR points just after the last character in WB,
189 * and since compose() tries to compose characters
190 * between top and CUR inclusive, we must make CUR
191 * one character back during compose().
198 * Call this even when WB.CUR == 0, to make TO
201 r
= flush_before_cur(&wb
, &to
, &tolen
);
202 if (r
!= idn_success
)
/*
 * NOTE(review): the condition guarding this assignment is not visible
 * here -- presumably no room is left in TO for a terminator; confirm.
 */
207 r
= idn_buffer_overflow
;
/*
 * decompose -- add character C, or its decomposition, to the tail of WB.
 *
 * idn__unicode_decompose() is asked to write directly into the unused part
 * of the buffer (wb->ucs4 + wb->last, with room for wb->size - wb->last
 * elements) and reports the number of elements written through dec_len.
 *
 * Visible outcomes:
 *   - idn_success is returned on one path;
 *   - on another path C itself is appended with workbuf_append()
 *     (presumably when C has no decomposition -- the case label is not
 *     visible here);
 *   - on idn_buffer_overflow the buffer is grown with workbuf_extend();
 *     if wb->size then exceeds WORKBUF_SIZE_MAX a warning is logged and
 *     idn_nomemory is returned.
 *
 * NOTE(review): the switch skeleton, the update of wb->last and the retry
 * after a successful extension are not visible in this view.
 */
218 decompose(workbuf_t
*wb
, unsigned long c
, int compat
) {
223 r
= idn__unicode_decompose(wb
->version
, compat
, wb
->ucs4
+ wb
->last
,
224 wb
->size
- wb
->last
, c
, &dec_len
);
228 return (idn_success
);
230 return (workbuf_append(wb
, c
));
231 case idn_buffer_overflow
:
232 if ((r
= workbuf_extend(wb
)) != idn_success
)
/* Refuse to let the working buffer grow past the hard limit. */
234 if (wb
->size
> WORKBUF_SIZE_MAX
) {
235 WARNING(("idn__unormalize_form*: "
236 "working buffer too large\n"));
237 return (idn_nomemory
);
/*
 * get_class -- store a canonical class for every entry from wb->cur up to
 * (but not including) wb->last, using idn__unicode_canonicalclass().
 *
 * NOTE(review): the second argument of the call is not visible in this
 * view -- presumably the character wb->ucs4[i]; confirm.
 */
247 get_class(workbuf_t
*wb
) {
250 for (i
= wb
->cur
; i
< wb
->last
; i
++)
251 wb
->class[i
] = idn__unicode_canonicalclass(wb
->version
,
/*
 * reorder -- move one entry toward the front of the buffer until the entry
 * before it no longer has a greater canonical class.
 *
 * The entry's class is saved, then each predecessor whose class is greater
 * is copied one slot up (both the ucs4 and class arrays), and finally the
 * saved class is written into the slot that opened up.
 *
 * NOTE(review): the initial value of `i', the saved character, the
 * decrement of `i' and the store of the character itself are not visible
 * in this view.
 */
256 reorder(workbuf_t
*wb
) {
265 class = wb
->class[i
];
/* Shift entries with a greater class one position up. */
267 while (i
> 0 && wb
->class[i
- 1] > class) {
268 wb
->ucs4
[i
] = wb
->ucs4
[i
- 1];
269 wb
->class[i
] =wb
->class[i
- 1];
272 wb
->class[i
] = class;
/*
 * compose -- try to combine the characters at indexes 1..cur (inclusive)
 * with the character at index 0.
 *
 * Precondition (asserted): wb is non-NULL and the first entry has
 * canonical class 0.
 *
 * The function first asks idn__unicode_iscompositecandidate() about the
 * first character.  Then, for each following entry, when either its class
 * `cl' is greater than `last_class' or it is 0, and
 * idn__unicode_compose() succeeds for the pair (ucs4[0], ucs4[i]):
 *   - class[0] is recomputed for the composed character `c';
 *   - the consumed entry is voided by setting its class to -1.
 * Voided entries are removed at the end by workbuf_removevoid().
 *
 * NOTE(review): the assignments that set up the locals (ver, ucs4, class,
 * cur, last_class, cl), the store of `c' into ucs4[0] and the maintenance
 * of last_class are not visible in this view.
 */
277 compose(workbuf_t
*wb
) {
284 idn__unicode_version_t ver
;
286 assert(wb
!= NULL
&& wb
->class[0] == 0);
294 * If there are no decomposition sequence that begins with
295 * the top character, composition is impossible.
297 if (!idn__unicode_iscompositecandidate(ver
, ucs4
[0]))
/* Note the inclusive upper bound: the entry at `cur' is examined too. */
302 for (i
= 1; i
<= cur
; i
++) {
306 if ((last_class
< cl
|| cl
== 0) &&
307 idn__unicode_compose(ver
, ucs4
[0], ucs4
[i
],
308 &c
) == idn_success
) {
310 * Replace the top character with the composed one.
313 class[0] = idn__unicode_canonicalclass(ver
, c
);
315 class[i
] = -1; /* void this character */
322 /* Purge void characters, if any. */
324 workbuf_removevoid(wb
);
/*
 * flush_before_cur -- copy the first wb->cur characters of the working
 * buffer to the output and drop them from the buffer.
 *
 * wb      working buffer.
 * top     address of the output pointer; wb->cur elements are copied to
 *         the location it currently points at.
 * tolenp  address of the remaining output capacity, in elements.
 *
 * Returns idn_buffer_overflow, without copying anything, when fewer than
 * wb->cur elements remain; otherwise idn_success.
 *
 * NOTE(review): the lines that advance *top and reduce *tolenp are not
 * visible in this view (original lines 333-334) -- confirm they exist.
 */
328 flush_before_cur(workbuf_t
*wb
, unsigned long **top
, size_t *tolenp
) {
329 if (*tolenp
< wb
->cur
)
330 return (idn_buffer_overflow
);
332 memcpy(*top
, wb
->ucs4
, sizeof(**top
) * wb
->cur
);
/* Discard the flushed entries; what was at `cur' moves to index 0. */
335 workbuf_shift(wb
, wb
->cur
);
337 return (idn_success
);
/*
 * workbuf_init -- put WB into its initial state: capacity WORKBUF_SIZE,
 * with `ucs4' and `class' pointing at the embedded ucs4_buf / class_buf
 * arrays, so no heap memory is used until the buffer is extended.
 *
 * NOTE(review): two lines are not visible in this view (original lines
 * 342-343) -- presumably they reset wb->cur and wb->last; confirm.
 */
341 workbuf_init(workbuf_t
*wb
) {
344 wb
->size
= WORKBUF_SIZE
;
345 wb
->ucs4
= wb
->ucs4_buf
;
346 wb
->class = wb
->class_buf
;
/*
 * workbuf_free -- release resources held by WB.
 *
 * Acts only when wb->ucs4 no longer points at the embedded ucs4_buf,
 * i.e. after the buffer has been moved elsewhere by workbuf_extend().
 *
 * NOTE(review): the body of the conditional is not visible in this view --
 * presumably it frees wb->ucs4 and wb->class; confirm.
 */
350 workbuf_free(workbuf_t
*wb
) {
351 if (wb
->ucs4
!= wb
->ucs4_buf
) {
358 workbuf_extend(workbuf_t
*wb
) {
359 int newsize
= wb
->size
* 3;
361 if (wb
->ucs4
== wb
->ucs4_buf
) {
362 wb
->ucs4
= malloc(sizeof(wb
->ucs4
[0]) * newsize
);
363 wb
->class = malloc(sizeof(wb
->class[0]) * newsize
);
365 wb
->ucs4
= realloc(wb
->ucs4
, sizeof(wb
->ucs4
[0]) * newsize
);
366 wb
->class = realloc(wb
->class, sizeof(wb
->class[0]) * newsize
);
368 if (wb
->ucs4
== NULL
|| wb
->class == NULL
)
369 return (idn_nomemory
);
371 return (idn_success
);
/*
 * workbuf_append -- store character C at index wb->last and advance
 * wb->last by one.
 *
 * When the buffer is full (wb->last >= wb->size) it is first grown with
 * workbuf_extend().  Returns idn_success after storing the character.
 * Only the ucs4 array is written here; the matching class entry is not.
 *
 * NOTE(review): the declaration of `r' and the statement executed when
 * workbuf_extend() fails are not visible in this view -- presumably the
 * error is returned to the caller; confirm.
 */
375 workbuf_append(workbuf_t
*wb
, unsigned long c
) {
378 if (wb
->last
>= wb
->size
&& (r
= workbuf_extend(wb
)) != idn_success
)
380 wb
->ucs4
[wb
->last
++] = c
;
381 return (idn_success
);
/*
 * workbuf_shift -- drop the first SHIFT entries of the working buffer by
 * moving the remaining wb->last - shift entries of both the ucs4 and the
 * class arrays down to index 0.
 *
 * Precondition (asserted): wb is non-NULL and wb->cur >= shift.
 * memmove() is used because source and destination ranges can overlap.
 *
 * NOTE(review): the adjustment of wb->cur and wb->last after the move is
 * not visible in this view (original lines 395-396) -- confirm.
 */
385 workbuf_shift(workbuf_t
*wb
, int shift
) {
388 assert(wb
!= NULL
&& wb
->cur
>= shift
);
/* Number of entries that survive the shift. */
390 nmove
= wb
->last
- shift
;
391 (void)memmove(&wb
->ucs4
[0], &wb
->ucs4
[shift
],
392 nmove
* sizeof(wb
->ucs4
[0]));
393 (void)memmove(&wb
->class[0], &wb
->class[shift
],
394 nmove
* sizeof(wb
->class[0]));
400 workbuf_removevoid(workbuf_t
*wb
) {
404 for (i
= j
= 0; i
< last
; i
++) {
405 if (wb
->class[i
] >= 0) {
407 wb
->ucs4
[j
] = wb
->ucs4
[i
];
408 wb
->class[j
] = wb
->class[i
];