1 /* unicode.c unicode normalization utilities
2 * Copyright (C) 2002 Simon Josefsson
4 * This file is part of libgsasl.
6 * Libgsasl is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * Libgsasl is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with libgsasl; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 /* This file contains functions from GLIB including gutf8.c and
25 * gunidecomp.c, all with the following license.
27 * Copyright (C) 1999, 2000 Tom Tromey
28 * Copyright 2000 Red Hat, Inc.
30 * The Gnome Library is free software; you can redistribute it and/or
31 * modify it under the terms of the GNU Lesser General Public License as
32 * published by the Free Software Foundation; either version 2 of the
33 * License, or (at your option) any later version.
35 * The Gnome Library is distributed in the hope that it will be useful,
36 * but WITHOUT ANY WARRANTY; without even the implied warranty of
37 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
38 * Lesser General Public License for more details.
40 * You should have received a copy of the GNU Lesser General Public
41 * License along with the Gnome Library; see the file COPYING.LIB. If not,
42 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
43 * Boston, MA 02111-1307, USA.
52 typedef gint gboolean
;
54 typedef unsigned char guchar
;
55 typedef unsigned short gushort
;
56 typedef unsigned long gulong
;
57 typedef unsigned int guint
;
61 G_NORMALIZE_NFD
= G_NORMALIZE_DEFAULT
,
62 G_NORMALIZE_DEFAULT_COMPOSE
,
63 G_NORMALIZE_NFC
= G_NORMALIZE_DEFAULT_COMPOSE
,
65 G_NORMALIZE_NFKD
= G_NORMALIZE_ALL
,
66 G_NORMALIZE_ALL_COMPOSE
,
67 G_NORMALIZE_NFKC
= G_NORMALIZE_ALL_COMPOSE
70 #include "gunidecomp.h"
75 typedef signed char gint8
;
76 typedef unsigned char guint8
;
77 typedef signed short gint16
;
78 typedef unsigned short guint16
;
79 typedef signed int gint32
;
80 typedef unsigned int guint32
;
82 typedef guint32 gunichar
;
83 typedef guint16 gunichar2
;
85 typedef signed int gssize
;
86 typedef unsigned int gsize
;
88 #define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
93 #define UTF8_COMPUTE(Char, Mask, Len) \
99 else if ((Char & 0xe0) == 0xc0) \
104 else if ((Char & 0xf0) == 0xe0) \
109 else if ((Char & 0xf8) == 0xf0) \
114 else if ((Char & 0xfc) == 0xf8) \
119 else if ((Char & 0xfe) == 0xfc) \
/* UTF8_LENGTH(Char): number of bytes selected for encoding the code point
 * Char, chosen by magnitude: 1 below 0x80, 2 below 0x800, 3 below 0x10000,
 * 4 below 0x200000, 5 below 0x4000000, and 6 for anything larger.
 * Char is expanded several times, so pass an expression without side
 * effects. */
127 #define UTF8_LENGTH(Char) \
128 ((Char) < 0x80 ? 1 : \
129 ((Char) < 0x800 ? 2 : \
130 ((Char) < 0x10000 ? 3 : \
131 ((Char) < 0x200000 ? 4 : \
132 ((Char) < 0x4000000 ? 5 : 6)))))
135 #define UTF8_GET(Result, Chars, Count, Mask, Len) \
136 (Result) = (Chars)[0] & (Mask); \
137 for ((Count) = 1; (Count) < (Len); ++(Count)) \
139 if (((Chars)[(Count)] & 0xc0) != 0x80) \
145 (Result) |= ((Chars)[(Count)] & 0x3f); \
/* UNICODE_VALID(Char): non-zero when Char is below 0x110000, lies outside
 * the range 0xD800..0xDFFF (the UTF-16 surrogate code points), and is
 * neither 0xFFFE nor 0xFFFF.
 * Char is expanded several times, so pass an expression without side
 * effects. */
148 #define UNICODE_VALID(Char) \
149 ((Char) < 0x110000 && \
150 ((Char) < 0xD800 || (Char) >= 0xE000) && \
151 (Char) != 0xFFFE && (Char) != 0xFFFF)
153 static const gchar utf8_skip_data
[256] = {
154 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
155 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
156 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
157 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
158 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
159 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
160 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
161 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
163 static const gchar
* const g_utf8_skip
= utf8_skip_data
;
165 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
167 #define g_malloc malloc
170 #define g_new(struct_type, n_structs) \
171 ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
175 * @p: a pointer to Unicode character encoded as UTF-8
177 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
178 * If @p does not point to a valid UTF-8 encoded character, results are
179 * undefined. If you are not sure that the bytes are complete
180 * valid Unicode characters, you should use g_utf8_get_char_validated()
183 * Return value: the resulting character
186 g_utf8_get_char (const gchar
*p
)
188 int i
, mask
= 0, len
;
190 unsigned char c
= (unsigned char) *p
;
192 UTF8_COMPUTE (c
, mask
, len
);
195 UTF8_GET (result
, p
, i
, mask
, len
);
/* CC(Page, Char): two-level table lookup. If combining_class_table[Page] is
 * at least G_UNICODE_MAX_TABLE_INDEX, the result is that entry minus
 * G_UNICODE_MAX_TABLE_INDEX (one value shared by the whole page); otherwise
 * the entry selects a row of cclass_data, which is indexed by Char. */
200 #define CC(Page, Char) \
201 ((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
202 ? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
203 : (cclass_data[combining_class_table[Page]][Char]))
/* COMBINING_CLASS(Char): 0 for any Char above G_UNICODE_LAST_CHAR; otherwise
 * the CC() lookup with Char >> 8 as the page and the low byte of Char as the
 * index within the page. Char is expanded several times. */
205 #define COMBINING_CLASS(Char) \
206 (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
209 * g_unicode_canonical_ordering:
210 * @string: a UCS-4 encoded string.
211 * @len: the maximum length of @string to use.
213 * Computes the canonical ordering of a string in-place.
214 * This rearranges decomposed characters in the string
215 * according to their combining classes. See the Unicode
216 * manual for more information.
219 g_unicode_canonical_ordering (gunichar
*string
,
229 last
= COMBINING_CLASS (string
[0]);
230 for (i
= 0; i
< len
- 1; ++i
)
232 int next
= COMBINING_CLASS (string
[i
+ 1]);
233 if (next
!= 0 && last
> next
)
236 /* Percolate item leftward through string. */
237 for (j
= i
; j
> 0; --j
)
240 if (COMBINING_CLASS (string
[j
]) <= next
)
243 string
[j
+ 1] = string
[j
];
247 /* We're re-entering the loop looking at the old
256 static const guchar
*
257 find_decomposition (gunichar ch
,
261 int end
= G_N_ELEMENTS (decomp_table
);
263 if (ch
>= decomp_table
[start
].ch
&&
264 ch
<= decomp_table
[end
- 1].ch
)
268 int half
= (start
+ end
) / 2;
269 if (ch
== decomp_table
[half
].ch
)
275 offset
= decomp_table
[half
].compat_offset
;
277 offset
= decomp_table
[half
].canon_offset
;
281 offset
= decomp_table
[half
].canon_offset
;
286 return &(decomp_expansion_string
[decomp_table
[half
].expansion_offset
+ offset
]);
288 else if (half
== start
)
290 else if (ch
> decomp_table
[half
].ch
)
/* CI(Page, Char): two-level table lookup. If compose_table[Page] is at least
 * G_UNICODE_MAX_TABLE_INDEX, the result is that entry minus
 * G_UNICODE_MAX_TABLE_INDEX (one value shared by the whole page); otherwise
 * the entry selects a row of compose_data, which is indexed by Char. */
300 #define CI(Page, Char) \
301 ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
302 ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
303 : (compose_data[compose_table[Page]][Char]))
/* COMPOSE_INDEX(Char): 0 for any Char above G_UNICODE_LAST_CHAR; otherwise
 * the CI() lookup with Char >> 8 as the page and the low byte of Char as the
 * index within the page. Char is expanded several times. */
305 #define COMPOSE_INDEX(Char) \
306 (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
/* Fragment of the pair-composition routine (presumably combine(); the
 * function header and several statements are not visible in this view).
 * It maps a and b to compose indices and, when one of the lookup tables
 * has a match, stores the composed character through result. */
313 gushort index_a
, index_b
;
315 index_a
= COMPOSE_INDEX(a
);
/* index_a in [COMPOSE_FIRST_SINGLE_START, COMPOSE_SECOND_START): the entry
 * in compose_first_single holds the one expected partner in [0] and the
 * composed result in [1]. */
316 if (index_a
>= COMPOSE_FIRST_SINGLE_START
&& index_a
< COMPOSE_SECOND_START
)
318 if (b
== compose_first_single
[index_a
- COMPOSE_FIRST_SINGLE_START
][0])
320 *result
= compose_first_single
[index_a
- COMPOSE_FIRST_SINGLE_START
][1];
327 index_b
= COMPOSE_INDEX(b
);
/* Mirror case keyed on b: compose_second_single holds the one expected
 * first character in [0] and the composed result in [1]. */
328 if (index_b
>= COMPOSE_SECOND_SINGLE_START
)
330 if (a
== compose_second_single
[index_b
- COMPOSE_SECOND_SINGLE_START
][0])
332 *result
= compose_second_single
[index_b
- COMPOSE_SECOND_SINGLE_START
][1];
/* General case: both indices are rebased and used as subscripts of
 * compose_array.
 * NOTE(review): the last bound below tests index_a against
 * COMPOSE_SECOND_SINGLE_START, but the value used as the second subscript
 * of compose_array is index_b. index_a is already bounded by
 * COMPOSE_FIRST_SINGLE_START in the same condition, so this looks like a
 * typo for index_b; as written, index_b has no visible upper bound here.
 * Confirm against the statements missing from this view (an earlier return
 * may already exclude large index_b) before changing it. */
339 if (index_a
>= COMPOSE_FIRST_START
&& index_a
< COMPOSE_FIRST_SINGLE_START
&&
340 index_b
>= COMPOSE_SECOND_START
&& index_a
< COMPOSE_SECOND_SINGLE_START
)
342 gunichar res
= compose_array
[index_a
- COMPOSE_FIRST_START
][index_b
- COMPOSE_SECOND_START
];
355 _g_utf8_normalize_wc (const gchar
*str
,
363 gboolean do_compat
= (mode
== G_NORMALIZE_NFKC
||
364 mode
== G_NORMALIZE_NFKD
);
365 gboolean do_compose
= (mode
== G_NORMALIZE_NFC
||
366 mode
== G_NORMALIZE_NFKC
);
370 while ((max_len
< 0 || p
< str
+ max_len
) && *p
)
372 gunichar wc
= g_utf8_get_char (p
);
374 const guchar
*decomp
= find_decomposition (wc
, do_compat
);
379 /* We store as a double-nul terminated string. */
380 for (len
= 0; (decomp
[len
] || decomp
[len
+ 1]);
388 p
= g_utf8_next_char (p
);
391 wc_buffer
= g_new (gunichar
, n_wc
+ 1);
396 while ((max_len
< 0 || p
< str
+ max_len
) && *p
)
398 gunichar wc
= g_utf8_get_char (p
);
399 const guchar
*decomp
;
401 gsize old_n_wc
= n_wc
;
403 decomp
= find_decomposition (wc
, do_compat
);
408 /* We store as a double-nul terminated string. */
409 for (len
= 0; (decomp
[len
] || decomp
[len
+ 1]);
411 wc_buffer
[n_wc
++] = (decomp
[len
] << 8 | decomp
[len
+ 1]);
414 wc_buffer
[n_wc
++] = wc
;
418 cc
= COMBINING_CLASS (wc_buffer
[old_n_wc
]);
422 g_unicode_canonical_ordering (wc_buffer
+ last_start
, n_wc
- last_start
);
423 last_start
= old_n_wc
;
427 p
= g_utf8_next_char (p
);
432 g_unicode_canonical_ordering (wc_buffer
+ last_start
, n_wc
- last_start
);
438 /* All decomposed and reordered */
441 if (do_compose
&& n_wc
> 0)
447 for (i
= 0; i
< n_wc
; i
++)
449 int cc
= COMBINING_CLASS (wc_buffer
[i
]);
452 (last_cc
== 0 || last_cc
!= cc
) &&
453 combine (wc_buffer
[last_start
], wc_buffer
[i
],
454 &wc_buffer
[last_start
]))
456 for (j
= i
+ 1; j
< n_wc
; j
++)
457 wc_buffer
[j
-1] = wc_buffer
[j
];
464 last_cc
= COMBINING_CLASS (wc_buffer
[i
-1]);
483 * @c: an ISO10646 character code
484 * @outbuf: output buffer, must have at least 6 bytes of space.
485 * If %NULL, the length will be computed and returned
486 * and nothing will be written to @outbuf.
488 * Converts a single character to UTF-8.
490 * Return value: number of bytes written
493 g_unichar_to_utf8 (gunichar c
,
510 else if (c
< 0x10000)
515 else if (c
< 0x200000)
520 else if (c
< 0x4000000)
533 for (i
= len
- 1; i
> 0; --i
)
535 outbuf
[i
] = (c
& 0x3f) | 0x80;
538 outbuf
[0] = c
| first
;
546 * @str: a UCS-4 encoded string
547 * @len: the maximum length of @str to use. If @len < 0, then
548 * the string is terminated with a 0 character.
549 * @items_read: location to store number of characters read, or %NULL.
550 * @items_written: location to store number of bytes written or %NULL.
551 * The value here stored does not include the trailing 0
553 * @error: location to store the error occurring, or %NULL to ignore
554 * errors. Any of the errors in #GConvertError other than
555 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
557 * Convert a string from a 32-bit fixed width representation (UCS-4)
558 * to UTF-8. The result will be terminated with a 0 byte.
560 * Return value: a pointer to a newly allocated UTF-8 string.
561 * This value must be freed with g_free(). If an
562 * error occurs, %NULL will be returned and
566 g_ucs4_to_utf8 (const gunichar
*str
,
569 glong
*items_written
)
572 gchar
*result
= NULL
;
577 for (i
= 0; len
< 0 || i
< len
; i
++)
582 if (str
[i
] >= 0x80000000)
590 result_length
+= UTF8_LENGTH (str
[i
]);
593 result
= g_malloc (result_length
+ 1);
597 while (p
< result
+ result_length
)
598 p
+= g_unichar_to_utf8 (str
[i
++], p
);
603 *items_written
= p
- result
;
614 * @str: a UTF-8 encoded string.
615 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
616 * @mode: the type of normalization to perform.
618 * Converts a string into canonical form, standardizing
619 * such issues as whether a character with an accent
620 * is represented as a base character and combining
621 * accent or as a single precomposed character. You
622 * should generally call g_utf8_normalize() before
623 * comparing two Unicode strings.
625 * The normalization mode %G_NORMALIZE_DEFAULT only
626 * standardizes differences that do not affect the
627 * text content, such as the above-mentioned accent
628 * representation. %G_NORMALIZE_ALL also standardizes
629 * the "compatibility" characters in Unicode, such
630 * as SUPERSCRIPT THREE to the standard forms
631 * (in this case DIGIT THREE). Formatting information
632 * may be lost but for most text operations such
633 * characters should be considered the same.
634 * For example, g_utf8_collate() normalizes
635 * with %G_NORMALIZE_ALL as its first step.
637 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
638 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
639 * but return a result with composed forms rather
640 * than a maximally decomposed form. This is often
641 * useful if you intend to convert the string to
642 * a legacy encoding or pass it to a system with
643 * less capable Unicode handling.
645 * Return value: a newly allocated string, that is the
646 * normalized form of @str.
649 g_utf8_normalize (const gchar
*str
,
653 gunichar
*result_wc
= _g_utf8_normalize_wc (str
, len
, mode
);
656 result
= g_ucs4_to_utf8 (result_wc
, -1, NULL
, NULL
);
663 gsasl_utf8_nfkc_normalize (const char *str
,
666 return g_utf8_normalize (str
, len
, G_NORMALIZE_NFKC
);