Initial release, version 0.0.0.
[gsasl.git] / lib / unicode.c
blobe7693484d46e07a5561991ffdf303145593fa808
1 /* unicode.c unicode normalization utilities
2 * Copyright (C) 2002 Simon Josefsson
4 * This file is part of libgsasl.
6 * Libgsasl is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * Libgsasl is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with libgsasl; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 #include "internal.h"
24 /* This file contains functions from GLIB including gutf8.c and
25 * gunidecomp.c, all with the following license.
27 * Copyright (C) 1999, 2000 Tom Tromey
28 * Copyright 2000 Red Hat, Inc.
30 * The Gnome Library is free software; you can redistribute it and/or
31 * modify it under the terms of the GNU Lesser General Public License as
32 * published by the Free Software Foundation; either version 2 of the
33 * License, or (at your option) any later version.
35 * The Gnome Library is distributed in the hope that it will be useful,
36 * but WITHOUT ANY WARRANTY; without even the implied warranty of
37 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
38 * Lesser General Public License for more details.
40 * You should have received a copy of the GNU Lesser General Public
41 * License along with the Gnome Library; see the file COPYING.LIB. If not,
42 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
43 * Boston, MA 02111-1307, USA.
46 #include <config.h>
/* Stand-ins for the GLib basic type names used by the code below, so the
 * imported GLib routines build without the GLib headers.  Each signed
 * type is listed next to its unsigned counterpart.  */
typedef char           gchar;
typedef unsigned char  guchar;
typedef short          gshort;
typedef unsigned short gushort;
typedef int            gint;
typedef unsigned int   guint;
typedef long           glong;
typedef unsigned long  gulong;

/* Truth values are plain ints; see the FALSE/TRUE macros.  */
typedef gint           gboolean;
/* Normalization forms accepted by g_utf8_normalize().  Every form has two
 * spellings with the same value: the GLib descriptive name and the
 * Unicode form name.  */
typedef enum {
  /* GLib names. */
  G_NORMALIZE_DEFAULT         = 0,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_ALL             = 2,
  G_NORMALIZE_ALL_COMPOSE     = 3,

  /* Unicode form names, aliases of the above. */
  G_NORMALIZE_NFD  = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFC  = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;
70 #include "gunidecomp.h"
71 #include "gunicomp.h"
73 #include <stdlib.h>
/* Sized integer aliases under their GLib names.  The widths are assumed
 * from the usual sizes of char/short/int; nothing here enforces them.  */
typedef signed char    gint8;
typedef unsigned char  guint8;
typedef short          gint16;
typedef unsigned short guint16;
typedef int            gint32;
typedef unsigned int   guint32;

/* A single Unicode code point (UCS-4) and a UTF-16 code unit. */
typedef guint32 gunichar;
typedef guint16 gunichar2;

/* NOTE(review): these are int-width, not size_t/ptrdiff_t-width, so they
 * are narrower than real object sizes on LP64 platforms.  */
typedef int          gssize;
typedef unsigned int gsize;
/* Number of elements in a true array (not a pointer). */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof (*(arr)))

/* Values for gboolean. */
#define FALSE 0
#define TRUE 1
/* Classify Char, the lead byte of a UTF-8 sequence.
 *
 * On return Len holds the total length of the sequence in bytes (1-6) and
 * Mask holds the bit mask that extracts the payload bits from the lead
 * byte.  If Char cannot start a sequence, Len is set to -1 and Mask is
 * left untouched.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and can be used safely under an unbraced if/else.
 * Char is evaluated several times: pass an expression without side
 * effects.  */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)
/* Number of bytes needed to encode code point Char in UTF-8 (1-6).
 * Char is evaluated more than once.  */
#define UTF8_LENGTH(Char) \
  ((Char) < 0x80      ? 1 : \
   (Char) < 0x800     ? 2 : \
   (Char) < 0x10000   ? 3 : \
   (Char) < 0x200000  ? 4 : \
   (Char) < 0x4000000 ? 5 : 6)
/* Decode the UTF-8 sequence of Len bytes starting at Chars into Result.
 *
 * Mask selects the payload bits of the lead byte (as computed by
 * UTF8_COMPUTE); Count is a caller-supplied integer lvalue used as the
 * loop index.  If a byte after the first is not a continuation byte
 * (10xxxxxx), Result is set to -1 and decoding stops with Count at the
 * index of the offending byte.
 *
 * The body is wrapped in do { } while (0) so that the assignment and the
 * loop form one statement; otherwise only the assignment would be
 * guarded when the macro is used under an unbraced if.  */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)
/* Non-zero when Char is an acceptable Unicode scalar value: rejects
 * anything at or above 0x110000, the surrogate range 0xD800-0xDFFF, and
 * the code points 0xFFFE and 0xFFFF.  Char is evaluated more than once.  */
#define UNICODE_VALID(Char) \
  (!((Char) >= 0x110000 || \
     ((Char) >= 0xD800 && (Char) < 0xE000) || \
     (Char) == 0xFFFE || (Char) == 0xFFFF))
153 static const gchar utf8_skip_data[256] = {
154 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
155 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
156 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
157 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
158 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
159 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
160 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
161 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
163 static const gchar * const g_utf8_skip = utf8_skip_data;
165 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
/* GLib allocator names mapped straight onto the C library.  Unlike the
 * real g_malloc(), this one can return NULL; callers must check.  */
#define g_malloc malloc
#define g_free free

/* Allocate room for n_structs objects of struct_type and return it as a
 * struct_type pointer (NULL if malloc fails).
 *
 * The byte count is computed in size_t.  Doing the multiplication in a
 * 32-bit unsigned type would wrap at 2^32 on LP64 platforms and yield a
 * buffer smaller than the caller is about to fill.  */
#define g_new(struct_type, n_structs) \
  ((struct_type *) g_malloc (sizeof (struct_type) * (size_t) (n_structs)))
174 * g_utf8_get_char:
175 * @p: a pointer to Unicode character encoded as UTF-8
177 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
178 * If @p does not point to a valid UTF-8 encoded character, results are
179 * undefined. If you are not sure that the bytes are complete
180 * valid Unicode characters, you should use g_utf8_get_char_validated()
181 * instead.
183 * Return value: the resulting character
185 static gunichar
186 g_utf8_get_char (const gchar *p)
188 int i, mask = 0, len;
189 gunichar result;
190 unsigned char c = (unsigned char) *p;
192 UTF8_COMPUTE (c, mask, len);
193 if (len == -1)
194 return (gunichar)-1;
195 UTF8_GET (result, p, i, mask, len);
197 return result;
/* Two-level lookup of a combining class.  combining_class_table is
 * indexed by page (the code point's high bits).  An entry below
 * G_UNICODE_MAX_TABLE_INDEX selects a row of cclass_data to index with
 * the low byte; any other entry encodes the class for the whole page,
 * biased by G_UNICODE_MAX_TABLE_INDEX.  */
#define CC(Page, Char) \
  ((combining_class_table[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table[Page]][Char]) \
   : (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Combining class of code point Char; 0 for anything beyond the last
 * character covered by the tables.  Char is evaluated more than once.  */
#define COMBINING_CLASS(Char) \
  (((Char) <= (G_UNICODE_LAST_CHAR)) ? CC((Char) >> 8, (Char) & 0xff) : 0)
209 * g_unicode_canonical_ordering:
210 * @string: a UCS-4 encoded string.
211 * @len: the maximum length of @string to use.
213 * Computes the canonical ordering of a string in-place.
214 * This rearranges decomposed characters in the string
215 * according to their combining classes. See the Unicode
216 * manual for more information.
218 static void
219 g_unicode_canonical_ordering (gunichar *string,
220 gsize len)
222 gsize i;
223 int swap = 1;
225 while (swap)
227 int last;
228 swap = 0;
229 last = COMBINING_CLASS (string[0]);
230 for (i = 0; i < len - 1; ++i)
232 int next = COMBINING_CLASS (string[i + 1]);
233 if (next != 0 && last > next)
235 gsize j;
236 /* Percolate item leftward through string. */
237 for (j = i; j > 0; --j)
239 gunichar t;
240 if (COMBINING_CLASS (string[j]) <= next)
241 break;
242 t = string[j + 1];
243 string[j + 1] = string[j];
244 string[j] = t;
245 swap = 1;
247 /* We're re-entering the loop looking at the old
248 character again. */
249 next = last;
251 last = next;
256 static const guchar *
257 find_decomposition (gunichar ch,
258 gboolean compat)
260 int start = 0;
261 int end = G_N_ELEMENTS (decomp_table);
263 if (ch >= decomp_table[start].ch &&
264 ch <= decomp_table[end - 1].ch)
266 while (TRUE)
268 int half = (start + end) / 2;
269 if (ch == decomp_table[half].ch)
271 int offset;
273 if (compat)
275 offset = decomp_table[half].compat_offset;
276 if (offset == 0xff)
277 offset = decomp_table[half].canon_offset;
279 else
281 offset = decomp_table[half].canon_offset;
282 if (offset == 0xff)
283 return NULL;
286 return &(decomp_expansion_string[decomp_table[half].expansion_offset + offset]);
288 else if (half == start)
289 break;
290 else if (ch > decomp_table[half].ch)
291 start = half;
292 else
293 end = half;
297 return NULL;
/* Two-level lookup of a composition index.  compose_table is indexed by
 * page (the code point's high bits).  An entry below
 * G_UNICODE_MAX_TABLE_INDEX selects a row of compose_data to index with
 * the low byte; any other entry encodes the index for the whole page,
 * biased by G_UNICODE_MAX_TABLE_INDEX.  */
#define CI(Page, Char) \
  ((compose_table[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_data[compose_table[Page]][Char]) \
   : (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Composition index of code point Char; 0 for anything beyond the last
 * character covered by the tables.  Char is evaluated more than once.  */
#define COMPOSE_INDEX(Char) \
  (((Char) <= (G_UNICODE_LAST_CHAR)) ? CI((Char) >> 8, (Char) & 0xff) : 0)
308 static gboolean
309 combine (gunichar a,
310 gunichar b,
311 gunichar *result)
313 gushort index_a, index_b;
315 index_a = COMPOSE_INDEX(a);
316 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
318 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
320 *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
321 return TRUE;
323 else
324 return FALSE;
327 index_b = COMPOSE_INDEX(b);
328 if (index_b >= COMPOSE_SECOND_SINGLE_START)
330 if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
332 *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
333 return TRUE;
335 else
336 return FALSE;
339 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
340 index_b >= COMPOSE_SECOND_START && index_a < COMPOSE_SECOND_SINGLE_START)
342 gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];
344 if (res)
346 *result = res;
347 return TRUE;
351 return FALSE;
354 static gunichar *
355 _g_utf8_normalize_wc (const gchar *str,
356 gssize max_len,
357 GNormalizeMode mode)
359 gsize n_wc;
360 gunichar *wc_buffer;
361 const char *p;
362 gsize last_start;
363 gboolean do_compat = (mode == G_NORMALIZE_NFKC ||
364 mode == G_NORMALIZE_NFKD);
365 gboolean do_compose = (mode == G_NORMALIZE_NFC ||
366 mode == G_NORMALIZE_NFKC);
368 n_wc = 0;
369 p = str;
370 while ((max_len < 0 || p < str + max_len) && *p)
372 gunichar wc = g_utf8_get_char (p);
374 const guchar *decomp = find_decomposition (wc, do_compat);
376 if (decomp)
378 int len;
379 /* We store as a double-nul terminated string. */
380 for (len = 0; (decomp[len] || decomp[len + 1]);
381 len += 2)
383 n_wc += len / 2;
385 else
386 n_wc++;
388 p = g_utf8_next_char (p);
391 wc_buffer = g_new (gunichar, n_wc + 1);
393 last_start = 0;
394 n_wc = 0;
395 p = str;
396 while ((max_len < 0 || p < str + max_len) && *p)
398 gunichar wc = g_utf8_get_char (p);
399 const guchar *decomp;
400 int cc;
401 gsize old_n_wc = n_wc;
403 decomp = find_decomposition (wc, do_compat);
405 if (decomp)
407 int len;
408 /* We store as a double-nul terminated string. */
409 for (len = 0; (decomp[len] || decomp[len + 1]);
410 len += 2)
411 wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]);
413 else
414 wc_buffer[n_wc++] = wc;
416 if (n_wc > 0)
418 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
420 if (cc == 0)
422 g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
423 last_start = old_n_wc;
427 p = g_utf8_next_char (p);
430 if (n_wc > 0)
432 g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
433 last_start = n_wc;
436 wc_buffer[n_wc] = 0;
438 /* All decomposed and reordered */
441 if (do_compose && n_wc > 0)
443 gsize i, j;
444 int last_cc = 0;
445 last_start = 0;
447 for (i = 0; i < n_wc; i++)
449 int cc = COMBINING_CLASS (wc_buffer[i]);
451 if (i > 0 &&
452 (last_cc == 0 || last_cc != cc) &&
453 combine (wc_buffer[last_start], wc_buffer[i],
454 &wc_buffer[last_start]))
456 for (j = i + 1; j < n_wc; j++)
457 wc_buffer[j-1] = wc_buffer[j];
458 n_wc--;
459 i--;
461 if (i == last_start)
462 last_cc = 0;
463 else
464 last_cc = COMBINING_CLASS (wc_buffer[i-1]);
466 continue;
469 if (cc == 0)
470 last_start = i;
472 last_cc = cc;
476 wc_buffer[n_wc] = 0;
478 return wc_buffer;
482 * g_unichar_to_utf8:
483 * @c: a ISO10646 character code
484 * @outbuf: output buffer, must have at least 6 bytes of space.
485 * If %NULL, the length will be computed and returned
486 * and nothing will be written to @outbuf.
488 * Converts a single character to UTF-8.
490 * Return value: number of bytes written
492 static int
493 g_unichar_to_utf8 (gunichar c,
494 gchar *outbuf)
496 guint len = 0;
497 int first;
498 int i;
500 if (c < 0x80)
502 first = 0;
503 len = 1;
505 else if (c < 0x800)
507 first = 0xc0;
508 len = 2;
510 else if (c < 0x10000)
512 first = 0xe0;
513 len = 3;
515 else if (c < 0x200000)
517 first = 0xf0;
518 len = 4;
520 else if (c < 0x4000000)
522 first = 0xf8;
523 len = 5;
525 else
527 first = 0xfc;
528 len = 6;
531 if (outbuf)
533 for (i = len - 1; i > 0; --i)
535 outbuf[i] = (c & 0x3f) | 0x80;
536 c >>= 6;
538 outbuf[0] = c | first;
541 return len;
545 * g_ucs4_to_utf8:
546 * @str: a UCS-4 encoded string
547 * @len: the maximum length of @str to use. If @len < 0, then
548 * the string is terminated with a 0 character.
549 * @items_read: location to store number of characters read read, or %NULL.
550 * @items_written: location to store number of bytes written or %NULL.
551 * The value here stored does not include the trailing 0
552 * byte.
553 * @error: location to store the error occuring, or %NULL to ignore
554 * errors. Any of the errors in #GConvertError other than
555 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
557 * Convert a string from a 32-bit fixed width representation as UCS-4.
558 * to UTF-8. The result will be terminated with a 0 byte.
560 * Return value: a pointer to a newly allocated UTF-8 string.
561 * This value must be freed with g_free(). If an
562 * error occurs, %NULL will be returned and
563 * @error set.
565 static gchar *
566 g_ucs4_to_utf8 (const gunichar *str,
567 glong len,
568 glong *items_read,
569 glong *items_written)
571 gint result_length;
572 gchar *result = NULL;
573 gchar *p;
574 gint i;
576 result_length = 0;
577 for (i = 0; len < 0 || i < len ; i++)
579 if (!str[i])
580 break;
582 if (str[i] >= 0x80000000)
584 if (items_read)
585 *items_read = i;
587 goto err_out;
590 result_length += UTF8_LENGTH (str[i]);
593 result = g_malloc (result_length + 1);
594 p = result;
596 i = 0;
597 while (p < result + result_length)
598 p += g_unichar_to_utf8 (str[i++], p);
600 *p = '\0';
602 if (items_written)
603 *items_written = p - result;
605 err_out:
606 if (items_read)
607 *items_read = i;
609 return result;
613 * g_utf8_normalize:
614 * @str: a UTF-8 encoded string.
615 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
616 * @mode: the type of normalization to perform.
618 * Converts a string into canonical form, standardizing
619 * such issues as whether a character with an accent
620 * is represented as a base character and combining
621 * accent or as a single precomposed character. You
622 * should generally call g_utf8_normalize() before
623 * comparing two Unicode strings.
625 * The normalization mode %G_NORMALIZE_DEFAULT only
626 * standardizes differences that do not affect the
627 * text content, such as the above-mentioned accent
628 * representation. %G_NORMALIZE_ALL also standardizes
629 * the "compatibility" characters in Unicode, such
630 * as SUPERSCRIPT THREE to the standard forms
631 * (in this case DIGIT THREE). Formatting information
632 * may be lost but for most text operations such
633 * characters should be considered the same.
634 * For example, g_utf8_collate() normalizes
635 * with %G_NORMALIZE_ALL as its first step.
637 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
638 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
639 * but returned a result with composed forms rather
640 * than a maximally decomposed form. This is often
641 * useful if you intend to convert the string to
642 * a legacy encoding or pass it to a system with
643 * less capable Unicode handling.
645 * Return value: a newly allocated string, that is the
646 * normalized form of @str.
648 static gchar *
649 g_utf8_normalize (const gchar *str,
650 gssize len,
651 GNormalizeMode mode)
653 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
654 gchar *result;
656 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
657 g_free (result_wc);
659 return result;
662 char *
663 gsasl_utf8_nfkc_normalize (const char *str,
664 int len)
666 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);