glib/guniprop.c

   1 /* guniprop.c - Unicode character properties.
   2  *
   3  * Copyright (C) 1999 Tom Tromey
   4  * Copyright (C) 2000 Red Hat, Inc.
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the
  18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  * Boston, MA 02111-1307, USA.
  20  */
  21
  22 #include "config.h"
  23
  24 #include <stddef.h>
  25 #include <string.h>
  26 #include <locale.h>
  27
  28 #include "glib.h"
  29 #include "gunichartables.h"
  30 #include "gunicodeprivate.h"
  31
/* Page-index lookup for the attribute tables.  Pages up to and including
 * G_UNICODE_LAST_PAGE_PART1 are looked up in attr_table_part1; any other
 * page is looked up in attr_table_part2 after subtracting 0xe00 (the page
 * number of U+E0000).  Note that Page is evaluated more than once.
 */
#define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
                          ? attr_table_part1[Page] \
                          : attr_table_part2[(Page) - 0xe00])

/* Attribute value for character Char (low byte) on page Page.  A page
 * whose index entry equals G_UNICODE_MAX_TABLE_INDEX has no per-character
 * data and yields 0 for every character; otherwise the value comes from
 * attr_data.
 */
#define ATTTABLE(Page, Char) \
  ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char]))

/* Type lookup in the first table.  An index entry that is
 * >= G_UNICODE_MAX_TABLE_INDEX encodes one type shared by the whole page
 * (entry minus G_UNICODE_MAX_TABLE_INDEX); a smaller entry selects a row
 * of type_data that is indexed by Char.
 */
#define TTYPE_PART1(Page, Char) \
  ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part1[Page]][Char]))

/* Same scheme as TTYPE_PART1, but using type_table_part2. */
#define TTYPE_PART2(Page, Char) \
  ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part2[Page]][Char]))

/* Type of character Char.  Characters up to G_UNICODE_LAST_CHAR_PART1 use
 * the first table, characters from 0xe0000 up to G_UNICODE_LAST_CHAR use
 * the second table (rebased to 0xe0000), and everything else is reported
 * as G_UNICODE_UNASSIGNED.  Char is evaluated several times, so it must
 * not have side effects.
 */
#define TYPE(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : G_UNICODE_UNASSIGNED))


/* True if Type is any of the three number types (decimal, letter, other). */
#define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER       \
                       || (Type) == G_UNICODE_LETTER_NUMBER     \
                       || (Type) == G_UNICODE_OTHER_NUMBER)

/* True if Type is any of the five letter types. */
#define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER     \
                       || (Type) == G_UNICODE_UPPERCASE_LETTER  \
                       || (Type) == G_UNICODE_TITLECASE_LETTER  \
                       || (Type) == G_UNICODE_MODIFIER_LETTER   \
                       || (Type) == G_UNICODE_OTHER_LETTER)

/* True if Type is a non-spacing, combining or enclosing mark. */
#define ISMARK(Type) ((Type) == G_UNICODE_NON_SPACING_MARK ||   \
                      (Type) == G_UNICODE_COMBINING_MARK ||     \
                      (Type) == G_UNICODE_ENCLOSING_MARK)
  70
  71
  72 /**
  73  * g_unichar_isalnum:
  74  * @c: a Unicode character
  75  *
  76  * Determines whether a character is alphanumeric.
  77  * Given some UTF-8 text, obtain a character value
  78  * with g_utf8_get_char().
  79  *
  80  * Return value: %TRUE if @c is an alphanumeric character
  81  **/
  82 gboolean
  83 g_unichar_isalnum (gunichar c)
  84 {
  85   int t = TYPE (c);
  86   return ISDIGIT (t) || ISALPHA (t);
  87 }
  88
  89 /**
  90  * g_unichar_isalpha:
  91  * @c: a Unicode character
  92  *
  93  * Determines whether a character is alphabetic (i.e. a letter).
  94  * Given some UTF-8 text, obtain a character value with
  95  * g_utf8_get_char().
  96  *
  97  * Return value: %TRUE if @c is an alphabetic character
  98  **/
  99 gboolean
 100 g_unichar_isalpha (gunichar c)
 101 {
 102   int t = TYPE (c);
 103   return ISALPHA (t);
 104 }
 105
 106
 107 /**
 108  * g_unichar_iscntrl:
 109  * @c: a Unicode character
 110  *
 111  * Determines whether a character is a control character.
 112  * Given some UTF-8 text, obtain a character value with
 113  * g_utf8_get_char().
 114  *
 115  * Return value: %TRUE if @c is a control character
 116  **/
 117 gboolean
 118 g_unichar_iscntrl (gunichar c)
 119 {
 120   return TYPE (c) == G_UNICODE_CONTROL;
 121 }
 122
 123 /**
 124  * g_unichar_isdigit:
 125  * @c: a Unicode character
 126  *
 127  * Determines whether a character is numeric (i.e. a digit).  This
 128  * covers ASCII 0-9 and also digits in other languages/scripts.  Given
 129  * some UTF-8 text, obtain a character value with g_utf8_get_char().
 130  *
 131  * Return value: %TRUE if @c is a digit
 132  **/
 133 gboolean
 134 g_unichar_isdigit (gunichar c)
 135 {
 136   return TYPE (c) == G_UNICODE_DECIMAL_NUMBER;
 137 }
 138
 139
 140 /**
 141  * g_unichar_isgraph:
 142  * @c: a Unicode character
 143  *
 144  * Determines whether a character is printable and not a space
 145  * (returns %FALSE for control characters, format characters, and
 146  * spaces). g_unichar_isprint() is similar, but returns %TRUE for
 147  * spaces. Given some UTF-8 text, obtain a character value with
 148  * g_utf8_get_char().
 149  *
 150  * Return value: %TRUE if @c is printable unless it's a space
 151  **/
 152 gboolean
 153 g_unichar_isgraph (gunichar c)
 154 {
 155   int t = TYPE (c);
 156   return (t != G_UNICODE_CONTROL
 157           && t != G_UNICODE_FORMAT
 158           && t != G_UNICODE_UNASSIGNED
 159           && t != G_UNICODE_PRIVATE_USE
 160           && t != G_UNICODE_SURROGATE
 161           && t != G_UNICODE_SPACE_SEPARATOR);
 162 }
 163
 164 /**
 165  * g_unichar_islower:
 166  * @c: a Unicode character
 167  *
 168  * Determines whether a character is a lowercase letter.
 169  * Given some UTF-8 text, obtain a character value with
 170  * g_utf8_get_char().
 171  *
 172  * Return value: %TRUE if @c is a lowercase letter
 173  **/
 174 gboolean
 175 g_unichar_islower (gunichar c)
 176 {
 177   return TYPE (c) == G_UNICODE_LOWERCASE_LETTER;
 178 }
 179
 180
 181 /**
 182  * g_unichar_isprint:
 183  * @c: a Unicode character
 184  *
 185  * Determines whether a character is printable.
 186  * Unlike g_unichar_isgraph(), returns %TRUE for spaces.
 187  * Given some UTF-8 text, obtain a character value with
 188  * g_utf8_get_char().
 189  *
 190  * Return value: %TRUE if @c is printable
 191  **/
 192 gboolean
 193 g_unichar_isprint (gunichar c)
 194 {
 195   int t = TYPE (c);
 196   return (t != G_UNICODE_CONTROL
 197           && t != G_UNICODE_FORMAT
 198           && t != G_UNICODE_UNASSIGNED
 199           && t != G_UNICODE_PRIVATE_USE
 200           && t != G_UNICODE_SURROGATE);
 201 }
 202
 203 /**
 204  * g_unichar_ispunct:
 205  * @c: a Unicode character
 206  *
 207  * Determines whether a character is punctuation or a symbol.
 208  * Given some UTF-8 text, obtain a character value with
 209  * g_utf8_get_char().
 210  *
 211  * Return value: %TRUE if @c is a punctuation or symbol character
 212  **/
 213 gboolean
 214 g_unichar_ispunct (gunichar c)
 215 {
 216   int t = TYPE (c);
 217   return (t == G_UNICODE_CONNECT_PUNCTUATION || t == G_UNICODE_DASH_PUNCTUATION
 218           || t == G_UNICODE_CLOSE_PUNCTUATION || t == G_UNICODE_FINAL_PUNCTUATION
 219           || t == G_UNICODE_INITIAL_PUNCTUATION || t == G_UNICODE_OTHER_PUNCTUATION
 220           || t == G_UNICODE_OPEN_PUNCTUATION || t == G_UNICODE_CURRENCY_SYMBOL
 221           || t == G_UNICODE_MODIFIER_SYMBOL || t == G_UNICODE_MATH_SYMBOL
 222           || t == G_UNICODE_OTHER_SYMBOL);
 223 }
 224
 225 /**
 226  * g_unichar_isspace:
 227  * @c: a Unicode character
 228  *
 229  * Determines whether a character is a space, tab, or line separator
 230  * (newline, carriage return, etc.).  Given some UTF-8 text, obtain a
 231  * character value with g_utf8_get_char().
 232  *
 233  * (Note: don't use this to do word breaking; you have to use
 234  * Pango or equivalent to get word breaking right, the algorithm
 235  * is fairly complex.)
 236  *
 237  * Return value: %TRUE if @c is a punctuation character
 238  **/
 239 gboolean
 240 g_unichar_isspace (gunichar c)
 241 {
 242   switch (c)
 243     {
 244       /* special-case these since Unicode thinks they are not spaces */
 245     case '\t':
 246     case '\n':
 247     case '\r':
 248     case '\f':
 249       return TRUE;
 250       break;
 251
 252     default:
 253       {
 254         int t = TYPE (c);
 255         return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
 256                 || t == G_UNICODE_PARAGRAPH_SEPARATOR);
 257       }
 258       break;
 259     }
 260 }
 261
 262 /**
 263  * g_unichar_isupper:
 264  * @c: a Unicode character
 265  *
 266  * Determines if a character is uppercase.
 267  *
 268  * Return value: %TRUE if @c is an uppercase character
 269  **/
 270 gboolean
 271 g_unichar_isupper (gunichar c)
 272 {
 273   return TYPE (c) == G_UNICODE_UPPERCASE_LETTER;
 274 }
 275
 276 /**
 277  * g_unichar_istitle:
 278  * @c: a Unicode character
 279  *
 280  * Determines if a character is titlecase. Some characters in
 281  * Unicode which are composites, such as the DZ digraph
 282  * have three case variants instead of just two. The titlecase
 283  * form is used at the beginning of a word where only the
 284  * first letter is capitalized. The titlecase form of the DZ
 285  * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z.
 286  *
 287  * Return value: %TRUE if the character is titlecase
 288  **/
 289 gboolean
 290 g_unichar_istitle (gunichar c)
 291 {
 292   unsigned int i;
 293   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 294     if (title_table[i][0] == c)
 295       return 1;
 296   return 0;
 297 }
 298
 299 /**
 300  * g_unichar_isxdigit:
 301  * @c: a Unicode character.
 302  *
 303  * Determines if a character is a hexidecimal digit.
 304  *
 305  * Return value: %TRUE if the character is a hexadecimal digit
 306  **/
 307 gboolean
 308 g_unichar_isxdigit (gunichar c)
 309 {
 310   int t = TYPE (c);
 311   return ((c >= 'a' && c <= 'f')
 312           || (c >= 'A' && c <= 'F')
 313           || ISDIGIT (t));
 314 }
 315
 316 /**
 317  * g_unichar_isdefined:
 318  * @c: a Unicode character
 319  *
 320  * Determines if a given character is assigned in the Unicode
 321  * standard.
 322  *
 323  * Return value: %TRUE if the character has an assigned value
 324  **/
 325 gboolean
 326 g_unichar_isdefined (gunichar c)
 327 {
 328   int t = TYPE (c);
 329   return t != G_UNICODE_UNASSIGNED;
 330 }
 331
 332 /**
 333  * g_unichar_iswide:
 334  * @c: a Unicode character
 335  *
 336  * Determines if a character is typically rendered in a double-width
 337  * cell.
 338  *
 339  * Return value: %TRUE if the character is wide
 340  **/
 341 /* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.  */
 342 gboolean
 343 g_unichar_iswide (gunichar c)
 344 {
 345   if (c < 0x1100)
 346     return FALSE;
 347
 348   return (c <= 0x115f  /* Hangul Jamo init. consonants */
 349           || c == 0x2329 || c == 0x232a     /* angle brackets */
 350           || (c >= 0x2e80 && c <= 0xa4cf && (c < 0x302a || c > 0x302f)
 351               && c != 0x303f && c != 0x3099 && c!= 0x309a) /* CJK ... Yi */
 352           || (c >= 0xac00 && c <= 0xd7a3)   /* Hangul Syllables */
 353           || (c >= 0xf900 && c <= 0xfaff)   /* CJK Compatibility Ideographs */
 354           || (c >= 0xfe30 && c <= 0xfe6f)   /* CJK Compatibility Forms */
 355           || (c >= 0xff00 && c <= 0xff60)   /* Fullwidth Forms */
 356           || (c >= 0xffe0 && c <= 0xffe6)   /* Fullwidth Forms */
 357           || (c >= 0x20000 && c <= 0x2fffd) /* CJK extra stuff */
 358           || (c >= 0x30000 && c <= 0x3fffd));
 359 }
 360
 361 /**
 362  * g_unichar_toupper:
 363  * @c: a Unicode character
 364  *
 365  * Converts a character to uppercase.
 366  *
 367  * Return value: the result of converting @c to uppercase.
 368  *               If @c is not an lowercase or titlecase character,
 369  *               or has no upper case equivalent @c is returned unchanged.
 370  **/
 371 gunichar
 372 g_unichar_toupper (gunichar c)
 373 {
 374   int t = TYPE (c);
 375   if (t == G_UNICODE_LOWERCASE_LETTER)
 376     {
 377       gunichar val = ATTTABLE (c >> 8, c & 0xff);
 378       if (val >= 0x1000000)
 379         {
 380           const gchar *p = special_case_table + val - 0x1000000;
 381           return g_utf8_get_char (p);
 382         }
 383       else
 384         return val ? val : c;
 385     }
 386   else if (t == G_UNICODE_TITLECASE_LETTER)
 387     {
 388       unsigned int i;
 389       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 390         {
 391           if (title_table[i][0] == c)
 392             return title_table[i][1];
 393         }
 394     }
 395   return c;
 396 }
 397
 398 /**
 399  * g_unichar_tolower:
 400  * @c: a Unicode character.
 401  *
 402  * Converts a character to lower case.
 403  *
 404  * Return value: the result of converting @c to lower case.
 405  *               If @c is not an upperlower or titlecase character,
 406  *               or has no lowercase equivalent @c is returned unchanged.
 407  **/
 408 gunichar
 409 g_unichar_tolower (gunichar c)
 410 {
 411   int t = TYPE (c);
 412   if (t == G_UNICODE_UPPERCASE_LETTER)
 413     {
 414       gunichar val = ATTTABLE (c >> 8, c & 0xff);
 415       if (val >= 0x1000000)
 416         {
 417           const gchar *p = special_case_table + val - 0x1000000;
 418           return g_utf8_get_char (p);
 419         }
 420       else
 421         return val ? val : c;
 422     }
 423   else if (t == G_UNICODE_TITLECASE_LETTER)
 424     {
 425       unsigned int i;
 426       for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 427         {
 428           if (title_table[i][0] == c)
 429             return title_table[i][2];
 430         }
 431     }
 432   return c;
 433 }
 434
 435 /**
 436  * g_unichar_totitle:
 437  * @c: a Unicode character
 438  *
 439  * Converts a character to the titlecase.
 440  *
 441  * Return value: the result of converting @c to titlecase.
 442  *               If @c is not an uppercase or lowercase character,
 443  *               @c is returned unchanged.
 444  **/
 445 gunichar
 446 g_unichar_totitle (gunichar c)
 447 {
 448   unsigned int i;
 449   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 450     {
 451       if (title_table[i][0] == c || title_table[i][1] == c
 452           || title_table[i][2] == c)
 453         return title_table[i][0];
 454     }
 455   return (TYPE (c) == G_UNICODE_LOWERCASE_LETTER
 456           ? ATTTABLE (c >> 8, c & 0xff)
 457           : c);
 458 }
 459
 460 /**
 461  * g_unichar_digit_value:
 462  * @c: a Unicode character
 463  *
 464  * Determines the numeric value of a character as a decimal
 465  * digit.
 466  *
 467  * Return value: If @c is a decimal digit (according to
 468  * g_unichar_isdigit()), its numeric value. Otherwise, -1.
 469  **/
 470 int
 471 g_unichar_digit_value (gunichar c)
 472 {
 473   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 474     return ATTTABLE (c >> 8, c & 0xff);
 475   return -1;
 476 }
 477
 478 /**
 479  * g_unichar_xdigit_value:
 480  * @c: a Unicode character
 481  *
 482  * Determines the numeric value of a character as a hexidecimal
 483  * digit.
 484  *
 485  * Return value: If @c is a hex digit (according to
 486  * g_unichar_isxdigit()), its numeric value. Otherwise, -1.
 487  **/
 488 int
 489 g_unichar_xdigit_value (gunichar c)
 490 {
 491   if (c >= 'A' && c <= 'F')
 492     return c - 'A' + 10;
 493   if (c >= 'a' && c <= 'f')
 494     return c - 'a' + 10;
 495   if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
 496     return ATTTABLE (c >> 8, c & 0xff);
 497   return -1;
 498 }
 499
 500 /**
 501  * g_unichar_type:
 502  * @c: a Unicode character
 503  *
 504  * Classifies a Unicode character by type.
 505  *
 506  * Return value: the type of the character.
 507  **/
 508 GUnicodeType
 509 g_unichar_type (gunichar c)
 510 {
 511   return TYPE (c);
 512 }
 513
/*
 * Case mapping functions
 */

/* Locale classes that get special treatment in the string case
 * conversion routines below; the value is derived from the current
 * LC_CTYPE locale name by get_locale_type().
 */
typedef enum {
  LOCALE_NORMAL,      /* no locale-specific rules */
  LOCALE_TURKIC,      /* locale name starts with "az" or "tr" */
  LOCALE_LITHUANIAN   /* locale name starts with "lt" */
} LocaleType;
 523
 524 static LocaleType
 525 get_locale_type (void)
 526 {
 527   const char *locale = setlocale (LC_CTYPE, NULL);
 528
 529   switch (locale[0])
 530     {
 531    case 'a':
 532       if (locale[1] == 'z')
 533         return LOCALE_TURKIC;
 534       break;
 535     case 'l':
 536       if (locale[1] == 't')
 537         return LOCALE_LITHUANIAN;
 538       break;
 539     case 't':
 540       if (locale[1] == 'r')
 541         return LOCALE_TURKIC;
 542       break;
 543     }
 544
 545   return LOCALE_NORMAL;
 546 }
 547
 548 static gint
 549 output_marks (const char **p_inout,
 550               char        *out_buffer,
 551               gboolean     remove_dot)
 552 {
 553   const char *p = *p_inout;
 554   gint len = 0;
 555
 556   while (*p)
 557     {
 558       gunichar c = g_utf8_get_char (p);
 559       int t = TYPE(c);
 560
 561       if (ISMARK(t))
 562         {
 563           if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */)
 564             len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL);
 565           p = g_utf8_next_char (p);
 566         }
 567       else
 568         break;
 569     }
 570
 571   *p_inout = p;
 572   return len;
 573 }
 574
 575 static gint
 576 output_special_case (gchar *out_buffer,
 577                      int    offset,
 578                      int    type,
 579                      int    which)
 580 {
 581   const gchar *p = special_case_table + offset;
 582   gint len;
 583
 584   if (type != G_UNICODE_TITLECASE_LETTER)
 585     p = g_utf8_next_char (p);
 586
 587   if (which == 1)
 588     p += strlen (p) + 1;
 589
 590   len = strlen (p);
 591   if (out_buffer)
 592     memcpy (out_buffer, p, len);
 593
 594   return len;
 595 }
 596
 597 static gsize
 598 real_toupper (const gchar *str,
 599               gssize       max_len,
 600               gchar       *out_buffer,
 601               LocaleType   locale_type)
 602 {
 603   const gchar *p = str;
 604   const char *last = NULL;
 605   gsize len = 0;
 606   gboolean last_was_i = FALSE;
 607
 608   while ((max_len < 0 || p < str + max_len) && *p)
 609     {
 610       gunichar c = g_utf8_get_char (p);
 611       int t = TYPE (c);
 612       gunichar val;
 613
 614       last = p;
 615       p = g_utf8_next_char (p);
 616
 617       if (locale_type == LOCALE_LITHUANIAN)
 618         {
 619           if (c == 'i')
 620             last_was_i = TRUE;
 621           else
 622             {
 623               if (last_was_i)
 624                 {
 625                   /* Nasty, need to remove any dot above. Though
 626                    * I think only E WITH DOT ABOVE occurs in practice
 627                    * which could simplify this considerably.
 628                    */
 629                   gsize decomp_len, i;
 630                   gunichar *decomp;
 631
 632                   decomp = g_unicode_canonical_decomposition (c, &decomp_len);
 633                   for (i=0; i < decomp_len; i++)
 634                     {
 635                       if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */)
 636                         len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL);
 637                     }
 638                   g_free (decomp);
 639
 640                   len += output_marks (&p, out_buffer ? out_buffer + len : NULL, TRUE);
 641
 642                   continue;
 643                 }
 644
 645               if (!ISMARK(t))
 646                 last_was_i = FALSE;
 647             }
 648         }
 649
 650       if (locale_type == LOCALE_TURKIC && c == 'i')
 651         {
 652           /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */
 653           len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL);
 654         }
 655       else if (c == 0x0345)     /* COMBINING GREEK YPOGEGRAMMENI */
 656         {
 657           /* Nasty, need to move it after other combining marks .. this would go away if
 658            * we normalized first.
 659            */
 660           len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE);
 661
 662           /* And output as GREEK CAPITAL LETTER IOTA */
 663           len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL);
 664         }
 665       else if (t == G_UNICODE_LOWERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
 666         {
 667           val = ATTTABLE (c >> 8, c & 0xff);
 668
 669           if (val >= 0x1000000)
 670             {
 671               len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t,
 672                                           t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1);
 673             }
 674           else
 675             {
 676               if (t == G_UNICODE_TITLECASE_LETTER)
 677                 {
 678                   unsigned int i;
 679                   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 680                     {
 681                       if (title_table[i][0] == c)
 682                         val = title_table[i][1];
 683                     }
 684                 }
 685
 686               len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 687             }
 688         }
 689       else
 690         {
 691           gsize char_len = g_utf8_skip[*(guchar *)last];
 692
 693           if (out_buffer)
 694             memcpy (out_buffer + len, last, char_len);
 695
 696           len += char_len;
 697         }
 698
 699     }
 700
 701   return len;
 702 }
 703
 704 /**
 705  * g_utf8_strup:
 706  * @str: a UTF-8 encoded string
 707  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 708  *
 709  * Converts all Unicode characters in the string that have a case
 710  * to uppercase. The exact manner that this is done depends
 711  * on the current locale, and may result in the number of
 712  * characters in the string increasing. (For instance, the
 713  * German ess-zet will be changed to SS.)
 714  *
 715  * Return value: a newly allocated string, with all characters
 716  *    converted to uppercase.
 717  **/
 718 gchar *
 719 g_utf8_strup (const gchar *str,
 720               gssize       len)
 721 {
 722   gsize result_len;
 723   LocaleType locale_type;
 724   gchar *result;
 725
 726   g_return_val_if_fail (str != NULL, NULL);
 727
 728   locale_type = get_locale_type ();
 729
 730   /*
 731    * We use a two pass approach to keep memory management simple
 732    */
 733   result_len = real_toupper (str, len, NULL, locale_type);
 734   result = g_malloc (result_len + 1);
 735   real_toupper (str, len, result, locale_type);
 736   result[result_len] = '\0';
 737
 738   return result;
 739 }
 740
 741 /* traverses the string checking for characters with combining class == 230
 742  * until a base character is found */
 743 static gboolean
 744 has_more_above (const gchar *str)
 745 {
 746   const gchar *p = str;
 747   gint combining_class;
 748
 749   while (*p)
 750     {
 751       combining_class = _g_unichar_combining_class (g_utf8_get_char (p));
 752       if (combining_class == 230)
 753         return TRUE;
 754       else if (combining_class == 0)
 755         break;
 756
 757       p = g_utf8_next_char (p);
 758     }
 759
 760   return FALSE;
 761 }
 762
 763 static gsize
 764 real_tolower (const gchar *str,
 765               gssize       max_len,
 766               gchar       *out_buffer,
 767               LocaleType   locale_type)
 768 {
 769   const gchar *p = str;
 770   const char *last = NULL;
 771   gsize len = 0;
 772
 773   while ((max_len < 0 || p < str + max_len) && *p)
 774     {
 775       gunichar c = g_utf8_get_char (p);
 776       int t = TYPE (c);
 777       gunichar val;
 778
 779       last = p;
 780       p = g_utf8_next_char (p);
 781
 782       if (locale_type == LOCALE_TURKIC && c == 'I')
 783         {
 784           if (g_utf8_get_char (p) == 0x0307)
 785             {
 786               /* I + COMBINING DOT ABOVE => i (U+0069) */
 787               len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL);
 788               p = g_utf8_next_char (p);
 789             }
 790           else
 791             {
 792               /* I => LATIN SMALL LETTER DOTLESS I */
 793               len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL);
 794             }
 795         }
 796       /* Introduce an explicit dot above when lowercasing capital I's and J's
 797        * whenever there are more accents above. [SpecialCasing.txt] */
 798       else if (locale_type == LOCALE_LITHUANIAN &&
 799                (c == 0x00cc || c == 0x00cd || c == 0x0128))
 800         {
 801           len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL);
 802           len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL);
 803
 804           switch (c)
 805             {
 806             case 0x00cc:
 807               len += g_unichar_to_utf8 (0x0300, out_buffer ? out_buffer + len : NULL);
 808               break;
 809             case 0x00cd:
 810               len += g_unichar_to_utf8 (0x0301, out_buffer ? out_buffer + len : NULL);
 811               break;
 812             case 0x0128:
 813               len += g_unichar_to_utf8 (0x0303, out_buffer ? out_buffer + len : NULL);
 814               break;
 815             }
 816         }
 817       else if (locale_type == LOCALE_LITHUANIAN &&
 818                (c == 'I' || c == 'J' || c == 0x012e) &&
 819                has_more_above (p))
 820         {
 821           len += g_unichar_to_utf8 (g_unichar_tolower (c), out_buffer ? out_buffer + len : NULL);
 822           len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL);
 823         }
 824       else if (c == 0x03A3)     /* GREEK CAPITAL LETTER SIGMA */
 825         {
 826           if ((max_len < 0 || p < str + max_len) && *p)
 827             {
 828               gunichar next_c = g_utf8_get_char (p);
 829               int next_type = TYPE(next_c);
 830
 831               /* SIGMA mapps differently depending on whether it is
 832                * final or not. The following simplified test would
 833                * fail in the case of combining marks following the
 834                * sigma, but I don't think that occurs in real text.
 835                * The test here matches that in ICU.
 836                */
 837               if (ISALPHA(next_type)) /* Lu,Ll,Lt,Lm,Lo */
 838                 val = 0x3c3;    /* GREEK SMALL SIGMA */
 839               else
 840                 val = 0x3c2;    /* GREEK SMALL FINAL SIGMA */
 841             }
 842           else
 843             val = 0x3c2;        /* GREEK SMALL FINAL SIGMA */
 844
 845           len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 846         }
 847       else if (t == G_UNICODE_UPPERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
 848         {
 849           val = ATTTABLE (c >> 8, c & 0xff);
 850
 851           if (val >= 0x1000000)
 852             {
 853               len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, 0);
 854             }
 855           else
 856             {
 857               if (t == G_UNICODE_TITLECASE_LETTER)
 858                 {
 859                   unsigned int i;
 860                   for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
 861                     {
 862                       if (title_table[i][0] == c)
 863                         val = title_table[i][2];
 864                     }
 865                 }
 866
 867               len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
 868             }
 869         }
 870       else
 871         {
 872           gsize char_len = g_utf8_skip[*(guchar *)last];
 873
 874           if (out_buffer)
 875             memcpy (out_buffer + len, last, char_len);
 876
 877           len += char_len;
 878         }
 879
 880     }
 881
 882   return len;
 883 }
 884
 885 /**
 886  * g_utf8_strdown:
 887  * @str: a UTF-8 encoded string
 888  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 889  *
 890  * Converts all Unicode characters in the string that have a case
 891  * to lowercase. The exact manner that this is done depends
 892  * on the current locale, and may result in the number of
 893  * characters in the string changing.
 894  *
 895  * Return value: a newly allocated string, with all characters
 896  *    converted to lowercase.
 897  **/
 898 gchar *
 899 g_utf8_strdown (const gchar *str,
 900                 gssize       len)
 901 {
 902   gsize result_len;
 903   LocaleType locale_type;
 904   gchar *result;
 905
 906   g_return_val_if_fail (str != NULL, NULL);
 907
 908   locale_type = get_locale_type ();
 909
 910   /*
 911    * We use a two pass approach to keep memory management simple
 912    */
 913   result_len = real_tolower (str, len, NULL, locale_type);
 914   result = g_malloc (result_len + 1);
 915   real_tolower (str, len, result, locale_type);
 916   result[result_len] = '\0';
 917
 918   return result;
 919 }
 920
 921 /**
 922  * g_utf8_casefold:
 923  * @str: a UTF-8 encoded string
 924  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 925  *
 926  * Converts a string into a form that is independent of case. The
 927  * result will not correspond to any particular case, but can be
 928  * compared for equality or ordered with the results of calling
 929  * g_utf8_casefold() on other strings.
 930  *
 931  * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
 932  * only an approximation to the correct linguistic case insensitive
 933  * ordering, though it is a fairly good one. Getting this exactly
 934  * right would require a more sophisticated collation function that
 935  * takes case sensitivity into account. GLib does not currently
 936  * provide such a function.
 937  *
 938  * Return value: a newly allocated string, that is a
 939  *   case independent form of @str.
 940  **/
 941 gchar *
 942 g_utf8_casefold (const gchar *str,
 943                  gssize       len)
 944 {
 945   GString *result;
 946   const char *p;
 947
 948   g_return_val_if_fail (str != NULL, NULL);
 949
 950   result = g_string_new (NULL);
 951   p = str;
 952   while ((len < 0 || p < str + len) && *p)
 953     {
 954       gunichar ch = g_utf8_get_char (p);
 955
 956       int start = 0;
 957       int end = G_N_ELEMENTS (casefold_table);
 958
 959       if (ch >= casefold_table[start].ch &&
 960           ch <= casefold_table[end - 1].ch)
 961         {
 962           while (TRUE)
 963             {
 964               int half = (start + end) / 2;
 965               if (ch == casefold_table[half].ch)
 966                 {
 967                   g_string_append (result, casefold_table[half].data);
 968                   goto next;
 969                 }
 970               else if (half == start)
 971                 break;
 972               else if (ch > casefold_table[half].ch)
 973                 start = half;
 974               else
 975                 end = half;
 976             }
 977         }
 978
 979       g_string_append_unichar (result, g_unichar_tolower (ch));
 980
 981     next:
 982       p = g_utf8_next_char (p);
 983     }
 984
 985   return g_string_free (result, FALSE);
 986 }
 987
 988 /**
 989  * g_unichar_get_mirror_char:
 990  * @ch: a unicode character
 991  * @mirrored_ch: location to store the mirrored character
 992  *
 993  * In Unicode, some characters are <firstterm>mirrored</firstterm>. This
 994  * means that their images are mirrored horizontally in text that is laid
 995  * out from right to left. For instance, "(" would become its mirror image,
 996  * ")", in right-to-left text.
 997  *
 998  * If @ch has the Unicode mirrored property and there is another unicode
 999  * character that typically has a glyph that is the mirror image of @ch's
1000  * glyph, puts that character in the address pointed to by @mirrored_ch.
1001  *
1002  * Return value: %TRUE if @ch has a mirrored character and @mirrored_ch is
1003  * filled in, %FALSE otherwise
1004  *
1005  * Since: 2.4
1006  **/
1007 /* This code is adapted from FriBidi (http://fribidi.sourceforge.net/).
1008  * FriBidi is: Copyright (C) 1999,2000 Dov Grobgeld, and
1009  *             Copyright (C) 2001,2002 Behdad Esfahbod.
1010  */
1011 gboolean
1012 g_unichar_get_mirror_char (gunichar ch,
1013                            gunichar *mirrored_ch)
1014 {
1015   gint pos, step, size;
1016   gboolean found;
1017
1018   size = G_N_ELEMENTS (bidi_mirroring_table);
1019   pos = step = (size / 2) + 1;
1020
1021   while (step > 1)
1022     {
1023       gunichar cmp_ch = bidi_mirroring_table[pos].ch;
1024       step = (step + 1) / 2;
1025
1026       if (cmp_ch < ch)
1027         {
1028           pos += step;
1029           if (pos > size - 1)
1030             pos = size - 1;
1031         }
1032       else if (cmp_ch > ch)
1033         {
1034           pos -= step;
1035           if (pos < 0)
1036             pos = 0;
1037         }
1038       else
1039         break;
1040     }
1041   found = bidi_mirroring_table[pos].ch == ch;
1042   if (mirrored_ch)
1043     *mirrored_ch = found ? bidi_mirroring_table[pos].mirrored_ch : ch;
1044
1045   return found;
1046
1047 }