lib/unicode.c

   1 /* unicode.c    unicode normalization utilities
   2  * Copyright (C) 2002  Simon Josefsson
   3  *
   4  * This file is part of libgsasl.
   5  *
   6  * Libgsasl is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libgsasl is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with libgsasl; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  */
  21
  22 #include "internal.h"
  23
  24 /* This file contains functions from GLIB including gutf8.c and
  25  * gunidecomp.c, all with the following license.
  26  *
  27  *  Copyright (C) 1999, 2000 Tom Tromey
  28  *  Copyright 2000 Red Hat, Inc.
  29  *
  30  * The Gnome Library is free software; you can redistribute it and/or
  31  * modify it under the terms of the GNU Lesser General Public License as
  32  * published by the Free Software Foundation; either version 2 of the
  33  * License, or (at your option) any later version.
  34  *
  35  * The Gnome Library is distributed in the hope that it will be useful,
  36  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  37  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  38  * Lesser General Public License for more details.
  39  *
  40  * You should have received a copy of the GNU Lesser General Public
  41  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
  42  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  43  *   Boston, MA 02111-1307, USA.
  44  */
  45
  46 #include <config.h>
  47
  48 typedef char   gchar;
  49 typedef short  gshort;
  50 typedef long   glong;
  51 typedef int    gint;
  52 typedef gint   gboolean;
  53
  54 typedef unsigned char   guchar;
  55 typedef unsigned short  gushort;
  56 typedef unsigned long   gulong;
  57 typedef unsigned int    guint;
  58
  59 typedef enum {
  60   G_NORMALIZE_DEFAULT,
  61   G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  62   G_NORMALIZE_DEFAULT_COMPOSE,
  63   G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  64   G_NORMALIZE_ALL,
  65   G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  66   G_NORMALIZE_ALL_COMPOSE,
  67   G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
  68 } GNormalizeMode;
  69
  70 #include "gunidecomp.h"
  71 #include "gunicomp.h"
  72
  73 #include <stdlib.h>
  74
  75 typedef signed char gint8;
  76 typedef unsigned char guint8;
  77 typedef signed short gint16;
  78 typedef unsigned short guint16;
  79 typedef signed int gint32;
  80 typedef unsigned int guint32;
  81
  82 typedef guint32 gunichar;
  83 typedef guint16 gunichar2;
  84
  85 typedef signed int gssize;
  86 typedef unsigned int gsize;
  87
/* Number of elements in the array @arr.  Only meaningful for a true
   array: for a pointer, sizeof (arr) is the pointer size, not the
   size of what it points to.  */
#define G_N_ELEMENTS(arr)               (sizeof (arr) / sizeof ((arr)[0]))

/* Truth values used with gboolean.  */
#define FALSE 0
#define TRUE 1
  92
  93 #define UTF8_COMPUTE(Char, Mask, Len)                                         \
  94   if (Char < 128)                                                             \
  95     {                                                                         \
  96       Len = 1;                                                                \
  97       Mask = 0x7f;                                                            \
  98     }                                                                         \
  99   else if ((Char & 0xe0) == 0xc0)                                             \
 100     {                                                                         \
 101       Len = 2;                                                                \
 102       Mask = 0x1f;                                                            \
 103     }                                                                         \
 104   else if ((Char & 0xf0) == 0xe0)                                             \
 105     {                                                                         \
 106       Len = 3;                                                                \
 107       Mask = 0x0f;                                                            \
 108     }                                                                         \
 109   else if ((Char & 0xf8) == 0xf0)                                             \
 110     {                                                                         \
 111       Len = 4;                                                                \
 112       Mask = 0x07;                                                            \
 113     }                                                                         \
 114   else if ((Char & 0xfc) == 0xf8)                                             \
 115     {                                                                         \
 116       Len = 5;                                                                \
 117       Mask = 0x03;                                                            \
 118     }                                                                         \
 119   else if ((Char & 0xfe) == 0xfc)                                             \
 120     {                                                                         \
 121       Len = 6;                                                                \
 122       Mask = 0x01;                                                            \
 123     }                                                                         \
 124   else                                                                        \
 125     Len = -1;
 126
 127 #define UTF8_LENGTH(Char)              \
 128   ((Char) < 0x80 ? 1 :                 \
 129    ((Char) < 0x800 ? 2 :               \
 130     ((Char) < 0x10000 ? 3 :            \
 131      ((Char) < 0x200000 ? 4 :          \
 132       ((Char) < 0x4000000 ? 5 : 6)))))
 133
 134
 135 #define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
 136   (Result) = (Chars)[0] & (Mask);                                             \
 137   for ((Count) = 1; (Count) < (Len); ++(Count))                               \
 138     {                                                                         \
 139       if (((Chars)[(Count)] & 0xc0) != 0x80)                                  \
 140         {                                                                     \
 141           (Result) = -1;                                                      \
 142           break;                                                              \
 143         }                                                                     \
 144       (Result) <<= 6;                                                         \
 145       (Result) |= ((Chars)[(Count)] & 0x3f);                                  \
 146     }
 147
 148 #define UNICODE_VALID(Char)                   \
 149     ((Char) < 0x110000 &&                     \
 150      ((Char) < 0xD800 || (Char) >= 0xE000) && \
 151      (Char) != 0xFFFE && (Char) != 0xFFFF)
 152
/* Number of bytes to advance past a UTF-8 sequence, indexed by the
   value of its first byte: 1 for 0x00..0xBF (ASCII and stray
   continuation bytes), 2 for 0xC0..0xDF, 3 for 0xE0..0xEF, 4 for
   0xF0..0xF7, 5 for 0xF8..0xFB, 6 for 0xFC..0xFD, and 1 for the
   bytes 0xFE and 0xFF.  */
static const gchar utf8_skip_data[256] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};
/* Pointer through which the skip table is read.  */
static const gchar * const g_utf8_skip = utf8_skip_data;

/* Pointer to the character following the one at @p.  The step is taken
   from the lead byte alone: no continuation byte is examined and no
   bound is checked, so on a truncated sequence the result can point
   past the terminating NUL.  @p is evaluated twice, and the result is
   a non-const char * whatever the type of @p.  */
#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
 166
/* Memory management maps directly onto the C library allocator, so
   g_malloc returns NULL on failure rather than aborting.  */
#define g_malloc malloc
#define g_free free

/* Allocates uninitialized storage for @n_structs objects of type
   @struct_type and casts the result to a pointer to that type.  The
   byte count is computed in gsize and is not checked for overflow;
   callers must bound @n_structs themselves.  */
#define g_new(struct_type, n_structs)           \
    ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
 172
 173 /**
 174  * g_utf8_get_char:
 175  * @p: a pointer to Unicode character encoded as UTF-8
 176  *
 177  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 178  * If @p does not point to a valid UTF-8 encoded character, results are
 179  * undefined. If you are not sure that the bytes are complete
 180  * valid Unicode characters, you should use g_utf8_get_char_validated()
 181  * instead.
 182  *
 183  * Return value: the resulting character
 184  **/
 185 static gunichar
 186 g_utf8_get_char (const gchar *p)
 187 {
 188   int i, mask = 0, len;
 189   gunichar result;
 190   unsigned char c = (unsigned char) *p;
 191
 192   UTF8_COMPUTE (c, mask, len);
 193   if (len == -1)
 194     return (gunichar)-1;
 195   UTF8_GET (result, p, i, mask, len);
 196
 197   return result;
 198 }
 199
/* Combining class of the character @Char within 256-character page
   @Page.  A combining_class_table[] entry that is at least
   G_UNICODE_MAX_TABLE_INDEX encodes a single class for the whole page
   (entry minus G_UNICODE_MAX_TABLE_INDEX); a smaller entry selects a
   row of cclass_data[] that is indexed by @Char.  The tables and the
   constant are not defined in this file (presumably they come from the
   included table headers).  */
#define CC(Page, Char) \
  ((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table[Page]][Char]))

/* Combining class of code point @Char; 0 for anything above
   G_UNICODE_LAST_CHAR.  @Char is evaluated more than once.  */
#define COMBINING_CLASS(Char) \
     (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
 207
 208 /**
 209  * g_unicode_canonical_ordering:
 210  * @string: a UCS-4 encoded string.
 211  * @len: the maximum length of @string to use.
 212  *
 213  * Computes the canonical ordering of a string in-place.
 214  * This rearranges decomposed characters in the string
 215  * according to their combining classes.  See the Unicode
 216  * manual for more information.
 217  **/
 218 static void
 219 g_unicode_canonical_ordering (gunichar *string,
 220                               gsize     len)
 221 {
 222   gsize i;
 223   int swap = 1;
 224
 225   while (swap)
 226     {
 227       int last;
 228       swap = 0;
 229       last = COMBINING_CLASS (string[0]);
 230       for (i = 0; i < len - 1; ++i)
 231         {
 232           int next = COMBINING_CLASS (string[i + 1]);
 233           if (next != 0 && last > next)
 234             {
 235               gsize j;
 236               /* Percolate item leftward through string.  */
 237               for (j = i; j > 0; --j)
 238                 {
 239                   gunichar t;
 240                   if (COMBINING_CLASS (string[j]) <= next)
 241                     break;
 242                   t = string[j + 1];
 243                   string[j + 1] = string[j];
 244                   string[j] = t;
 245                   swap = 1;
 246                 }
 247               /* We're re-entering the loop looking at the old
 248                  character again.  */
 249               next = last;
 250             }
 251           last = next;
 252         }
 253     }
 254 }
 255
 256 static const guchar *
 257 find_decomposition (gunichar ch,
 258                     gboolean compat)
 259 {
 260   int start = 0;
 261   int end = G_N_ELEMENTS (decomp_table);
 262
 263   if (ch >= decomp_table[start].ch &&
 264       ch <= decomp_table[end - 1].ch)
 265     {
 266       while (TRUE)
 267         {
 268           int half = (start + end) / 2;
 269           if (ch == decomp_table[half].ch)
 270             {
 271               int offset;
 272
 273               if (compat)
 274                 {
 275                   offset = decomp_table[half].compat_offset;
 276                   if (offset == 0xff)
 277                     offset = decomp_table[half].canon_offset;
 278                 }
 279               else
 280                 {
 281                   offset = decomp_table[half].canon_offset;
 282                   if (offset == 0xff)
 283                     return NULL;
 284                 }
 285
 286               return &(decomp_expansion_string[decomp_table[half].expansion_offset + offset]);
 287             }
 288           else if (half == start)
 289             break;
 290           else if (ch > decomp_table[half].ch)
 291             start = half;
 292           else
 293             end = half;
 294         }
 295     }
 296
 297   return NULL;
 298 }
 299
/* Composition index of the character @Char within 256-character page
   @Page.  A compose_table[] entry that is at least
   G_UNICODE_MAX_TABLE_INDEX encodes a single index for the whole page
   (entry minus G_UNICODE_MAX_TABLE_INDEX); a smaller entry selects a
   row of compose_data[] that is indexed by @Char.  The tables and the
   constant are not defined in this file (presumably they come from the
   included table headers).  */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of code point @Char; 0 for anything above
   G_UNICODE_LAST_CHAR.  @Char is evaluated more than once.  */
#define COMPOSE_INDEX(Char) \
     (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
 307
 308 static gboolean
 309 combine (gunichar  a,
 310          gunichar  b,
 311          gunichar *result)
 312 {
 313   gushort index_a, index_b;
 314
 315   index_a = COMPOSE_INDEX(a);
 316   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
 317     {
 318       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
 319         {
 320           *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
 321           return TRUE;
 322         }
 323       else
 324         return FALSE;
 325     }
 326
 327   index_b = COMPOSE_INDEX(b);
 328   if (index_b >= COMPOSE_SECOND_SINGLE_START)
 329     {
 330       if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
 331         {
 332           *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
 333           return TRUE;
 334         }
 335       else
 336         return FALSE;
 337     }
 338
 339   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
 340       index_b >= COMPOSE_SECOND_START && index_a < COMPOSE_SECOND_SINGLE_START)
 341     {
 342       gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];
 343
 344       if (res)
 345         {
 346           *result = res;
 347           return TRUE;
 348         }
 349     }
 350
 351   return FALSE;
 352 }
 353
 354 static gunichar *
 355 _g_utf8_normalize_wc (const gchar    *str,
 356                       gssize          max_len,
 357                       GNormalizeMode  mode)
 358 {
 359   gsize n_wc;
 360   gunichar *wc_buffer;
 361   const char *p;
 362   gsize last_start;
 363   gboolean do_compat = (mode == G_NORMALIZE_NFKC ||
 364                         mode == G_NORMALIZE_NFKD);
 365   gboolean do_compose = (mode == G_NORMALIZE_NFC ||
 366                          mode == G_NORMALIZE_NFKC);
 367
 368   n_wc = 0;
 369   p = str;
 370   while ((max_len < 0 || p < str + max_len) && *p)
 371     {
 372       gunichar wc = g_utf8_get_char (p);
 373
 374       const guchar *decomp = find_decomposition (wc, do_compat);
 375
 376       if (decomp)
 377         {
 378           int len;
 379           /* We store as a double-nul terminated string.  */
 380           for (len = 0; (decomp[len] || decomp[len + 1]);
 381                len += 2)
 382             ;
 383           n_wc += len / 2;
 384         }
 385       else
 386         n_wc++;
 387
 388       p = g_utf8_next_char (p);
 389     }
 390
 391   wc_buffer = g_new (gunichar, n_wc + 1);
 392
 393   last_start = 0;
 394   n_wc = 0;
 395   p = str;
 396   while ((max_len < 0 || p < str + max_len) && *p)
 397     {
 398       gunichar wc = g_utf8_get_char (p);
 399       const guchar *decomp;
 400       int cc;
 401       gsize old_n_wc = n_wc;
 402
 403       decomp = find_decomposition (wc, do_compat);
 404
 405       if (decomp)
 406         {
 407           int len;
 408           /* We store as a double-nul terminated string.  */
 409           for (len = 0; (decomp[len] || decomp[len + 1]);
 410                len += 2)
 411             wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]);
 412         }
 413       else
 414         wc_buffer[n_wc++] = wc;
 415
 416       if (n_wc > 0)
 417         {
 418           cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
 419
 420           if (cc == 0)
 421             {
 422               g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
 423               last_start = old_n_wc;
 424             }
 425         }
 426
 427       p = g_utf8_next_char (p);
 428     }
 429
 430   if (n_wc > 0)
 431     {
 432       g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
 433       last_start = n_wc;
 434     }
 435
 436   wc_buffer[n_wc] = 0;
 437
 438   /* All decomposed and reordered */
 439
 440
 441   if (do_compose && n_wc > 0)
 442     {
 443       gsize i, j;
 444       int last_cc = 0;
 445       last_start = 0;
 446
 447       for (i = 0; i < n_wc; i++)
 448         {
 449           int cc = COMBINING_CLASS (wc_buffer[i]);
 450
 451           if (i > 0 &&
 452               (last_cc == 0 || last_cc != cc) &&
 453               combine (wc_buffer[last_start], wc_buffer[i],
 454                        &wc_buffer[last_start]))
 455             {
 456               for (j = i + 1; j < n_wc; j++)
 457                 wc_buffer[j-1] = wc_buffer[j];
 458               n_wc--;
 459               i--;
 460
 461               if (i == last_start)
 462                 last_cc = 0;
 463               else
 464                 last_cc = COMBINING_CLASS (wc_buffer[i-1]);
 465
 466               continue;
 467             }
 468
 469           if (cc == 0)
 470             last_start = i;
 471
 472           last_cc = cc;
 473         }
 474     }
 475
 476   wc_buffer[n_wc] = 0;
 477
 478   return wc_buffer;
 479 }
 480
 481 /**
 482  * g_unichar_to_utf8:
 483  * @c: a ISO10646 character code
 484  * @outbuf: output buffer, must have at least 6 bytes of space.
 485  *       If %NULL, the length will be computed and returned
 486  *       and nothing will be written to @outbuf.
 487  *
 488  * Converts a single character to UTF-8.
 489  *
 490  * Return value: number of bytes written
 491  **/
 492 static int
 493 g_unichar_to_utf8 (gunichar c,
 494                    gchar   *outbuf)
 495 {
 496   guint len = 0;
 497   int first;
 498   int i;
 499
 500   if (c < 0x80)
 501     {
 502       first = 0;
 503       len = 1;
 504     }
 505   else if (c < 0x800)
 506     {
 507       first = 0xc0;
 508       len = 2;
 509     }
 510   else if (c < 0x10000)
 511     {
 512       first = 0xe0;
 513       len = 3;
 514     }
 515    else if (c < 0x200000)
 516     {
 517       first = 0xf0;
 518       len = 4;
 519     }
 520   else if (c < 0x4000000)
 521     {
 522       first = 0xf8;
 523       len = 5;
 524     }
 525   else
 526     {
 527       first = 0xfc;
 528       len = 6;
 529     }
 530
 531   if (outbuf)
 532     {
 533       for (i = len - 1; i > 0; --i)
 534         {
 535           outbuf[i] = (c & 0x3f) | 0x80;
 536           c >>= 6;
 537         }
 538       outbuf[0] = c | first;
 539     }
 540
 541   return len;
 542 }
 543
 544 /**
 545  * g_ucs4_to_utf8:
 546  * @str: a UCS-4 encoded string
 547  * @len: the maximum length of @str to use. If @len < 0, then
 548  *       the string is terminated with a 0 character.
 549  * @items_read: location to store number of characters read read, or %NULL.
 550  * @items_written: location to store number of bytes written or %NULL.
 551  *                 The value here stored does not include the trailing 0
 552  *                 byte.
 553  * @error: location to store the error occuring, or %NULL to ignore
 554  *         errors. Any of the errors in #GConvertError other than
 555  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 556  *
 557  * Convert a string from a 32-bit fixed width representation as UCS-4.
 558  * to UTF-8. The result will be terminated with a 0 byte.
 559  *
 560  * Return value: a pointer to a newly allocated UTF-8 string.
 561  *               This value must be freed with g_free(). If an
 562  *               error occurs, %NULL will be returned and
 563  *               @error set.
 564  **/
 565 static gchar *
 566 g_ucs4_to_utf8 (const gunichar *str,
 567                 glong           len,
 568                 glong          *items_read,
 569                 glong          *items_written)
 570 {
 571   gint result_length;
 572   gchar *result = NULL;
 573   gchar *p;
 574   gint i;
 575
 576   result_length = 0;
 577   for (i = 0; len < 0 || i < len ; i++)
 578     {
 579       if (!str[i])
 580         break;
 581
 582       if (str[i] >= 0x80000000)
 583         {
 584           if (items_read)
 585             *items_read = i;
 586
 587           goto err_out;
 588         }
 589
 590       result_length += UTF8_LENGTH (str[i]);
 591     }
 592
 593   result = g_malloc (result_length + 1);
 594   p = result;
 595
 596   i = 0;
 597   while (p < result + result_length)
 598     p += g_unichar_to_utf8 (str[i++], p);
 599
 600   *p = '\0';
 601
 602   if (items_written)
 603     *items_written = p - result;
 604
 605  err_out:
 606   if (items_read)
 607     *items_read = i;
 608
 609   return result;
 610 }
 611
 612 /**
 613  * g_utf8_normalize:
 614  * @str: a UTF-8 encoded string.
 615  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 616  * @mode: the type of normalization to perform.
 617  *
 618  * Converts a string into canonical form, standardizing
 619  * such issues as whether a character with an accent
 620  * is represented as a base character and combining
 621  * accent or as a single precomposed character. You
 622  * should generally call g_utf8_normalize() before
 623  * comparing two Unicode strings.
 624  *
 625  * The normalization mode %G_NORMALIZE_DEFAULT only
 626  * standardizes differences that do not affect the
 627  * text content, such as the above-mentioned accent
 628  * representation. %G_NORMALIZE_ALL also standardizes
 629  * the "compatibility" characters in Unicode, such
 630  * as SUPERSCRIPT THREE to the standard forms
 631  * (in this case DIGIT THREE). Formatting information
 632  * may be lost but for most text operations such
 633  * characters should be considered the same.
 634  * For example, g_utf8_collate() normalizes
 635  * with %G_NORMALIZE_ALL as its first step.
 636  *
 637  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
 638  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
 639  * but returned a result with composed forms rather
 640  * than a maximally decomposed form. This is often
 641  * useful if you intend to convert the string to
 642  * a legacy encoding or pass it to a system with
 643  * less capable Unicode handling.
 644  *
 645  * Return value: a newly allocated string, that is the
 646  *   normalized form of @str.
 647  **/
 648 static gchar *
 649 g_utf8_normalize (const gchar    *str,
 650                   gssize          len,
 651                   GNormalizeMode  mode)
 652 {
 653   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
 654   gchar *result;
 655
 656   result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
 657   g_free (result_wc);
 658
 659   return result;
 660 }
 661
 662 char *
 663 gsasl_utf8_nfkc_normalize (const char *str,
 664                            int len)
 665 {
 666   return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
 667 }