glib/gunicollate.c

   1 /* gunicollate.c - Collation
   2  *
   3  *  Copyright 2001,2005 Red Hat, Inc.
   4  *
   5  * The Gnome Library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public License as
   7  * published by the Free Software Foundation; either version 2 of the
   8  * License, or (at your option) any later version.
   9  *
  10  * The Gnome Library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
  17  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18  *   Boston, MA 02111-1307, USA.
  19  */
  20
  21 #include "config.h"
  22
  23 #include <locale.h>
  24 #include <string.h>
  25 #ifdef __STDC_ISO_10646__
  26 #include <wchar.h>
  27 #endif
  28
  29 #ifdef HAVE_CARBON
  30 #include <CoreServices/CoreServices.h>
  31 #endif
  32
  33 #include "glib.h"
  34 #include "gunicodeprivate.h"
  35 #include "galias.h"
  36
  37 #ifdef _MSC_VER
  38 /* Workaround for bug in MSVCR80.DLL */
  39 static gsize
  40 msc_strxfrm_wrapper (char       *string1,
  41                      const char *string2,
  42                      gsize       count)
  43 {
  44   if (!string1 || count <= 0)
  45     {
  46       char tmp;
  47
  48       return strxfrm (&tmp, string2, 1);
  49     }
  50   return strxfrm (string1, string2, count);
  51 }
  52 #define strxfrm msc_strxfrm_wrapper
  53 #endif
  54
  55 /**
  56  * g_utf8_collate:
  57  * @str1: a UTF-8 encoded string
  58  * @str2: a UTF-8 encoded string
  59  *
  60  * Compares two strings for ordering using the linguistically
  61  * correct rules for the <link linkend="setlocale">current locale</link>.
  62  * When sorting a large number of strings, it will be significantly
  63  * faster to obtain collation keys with g_utf8_collate_key() and
  64  * compare the keys with strcmp() when sorting instead of sorting
  65  * the original strings.
  66  *
  67  * Return value: &lt; 0 if @str1 compares before @str2,
  68  *   0 if they compare equal, &gt; 0 if @str1 compares after @str2.
  69  **/
  70 gint
  71 g_utf8_collate (const gchar *str1,
  72                 const gchar *str2)
  73 {
  74   gint result;
  75
  76 #ifdef HAVE_CARBON
  77
  78   UniChar *str1_utf16;
  79   UniChar *str2_utf16;
  80   glong len1;
  81   glong len2;
  82   SInt32 retval = 0;
  83
  84   g_return_val_if_fail (str1 != NULL, 0);
  85   g_return_val_if_fail (str2 != NULL, 0);
  86
  87   str1_utf16 = g_utf8_to_utf16 (str1, -1, NULL, &len1, NULL);
  88   str2_utf16 = g_utf8_to_utf16 (str2, -1, NULL, &len2, NULL);
  89
  90   UCCompareTextDefault (kUCCollateStandardOptions,
  91                         str1_utf16, len1, str2_utf16, len2,
  92                         NULL, &retval);
  93   result = retval;
  94
  95   g_free (str2_utf16);
  96   g_free (str1_utf16);
  97
  98 #elif defined(__STDC_ISO_10646__)
  99
 100   gunichar *str1_norm;
 101   gunichar *str2_norm;
 102
 103   g_return_val_if_fail (str1 != NULL, 0);
 104   g_return_val_if_fail (str2 != NULL, 0);
 105
 106   str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
 107   str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
 108
 109   result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
 110
 111   g_free (str1_norm);
 112   g_free (str2_norm);
 113
 114 #else /* !__STDC_ISO_10646__ */
 115
 116   const gchar *charset;
 117   gchar *str1_norm;
 118   gchar *str2_norm;
 119
 120   g_return_val_if_fail (str1 != NULL, 0);
 121   g_return_val_if_fail (str2 != NULL, 0);
 122
 123   str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
 124   str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
 125
 126   if (g_get_charset (&charset))
 127     {
 128       result = strcoll (str1_norm, str2_norm);
 129     }
 130   else
 131     {
 132       gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 133       gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 134
 135       if (str1_locale && str2_locale)
 136         result =  strcoll (str1_locale, str2_locale);
 137       else if (str1_locale)
 138         result = -1;
 139       else if (str2_locale)
 140         result = 1;
 141       else
 142         result = strcmp (str1_norm, str2_norm);
 143
 144       g_free (str1_locale);
 145       g_free (str2_locale);
 146     }
 147
 148   g_free (str1_norm);
 149   g_free (str2_norm);
 150
 151 #endif /* __STDC_ISO_10646__ */
 152
 153   return result;
 154 }
 155
 156 #if defined(__STDC_ISO_10646__) || defined(HAVE_CARBON)
 157 /* We need UTF-8 encoding of numbers to encode the weights if
 158  * we are using wcsxfrm. However, we aren't encoding Unicode
 159  * characters, so we can't simply use g_unichar_to_utf8.
 160  *
 161  * The following routine is taken (with modification) from GNU
 162  * libc's strxfrm routine:
 163  *
 164  * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
 165  * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
 166  */
 167 static inline int
 168 utf8_encode (char *buf, wchar_t val)
 169 {
 170   int retval;
 171
 172   if (val < 0x80)
 173     {
 174       if (buf)
 175         *buf++ = (char) val;
 176       retval = 1;
 177     }
 178   else
 179     {
 180       int step;
 181
 182       for (step = 2; step < 6; ++step)
 183         if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
 184           break;
 185       retval = step;
 186
 187       if (buf)
 188         {
 189           *buf = (unsigned char) (~0xff >> step);
 190           --step;
 191           do
 192             {
 193               buf[step] = 0x80 | (val & 0x3f);
 194               val >>= 6;
 195             }
 196           while (--step > 0);
 197           *buf |= val;
 198         }
 199     }
 200
 201   return retval;
 202 }
 203 #endif /* __STDC_ISO_10646__ || HAVE_CARBON */
 204
 205 #ifdef HAVE_CARBON
 206
 207 static gchar *
 208 collate_key_to_string (UCCollationValue *key,
 209                        gsize             key_len)
 210 {
 211   gchar *result;
 212   gsize result_len;
 213   gsize i;
 214
 215   /* Pretty smart algorithm here: ignore first eight bytes of the
 216    * collation key. It doesn't produce results equivalent to
 217    * UCCompareCollationKeys's, but the difference seems to be only
 218    * that UCCompareCollationKeys in some cases produces 0 where our
 219    * comparison gets -1 or 1. */
 220
 221   if (key_len * sizeof (UCCollationValue) <= 8)
 222     return g_strdup ("");
 223
 224   result_len = 0;
 225   for (i = 8; i < key_len * sizeof (UCCollationValue); i++)
 226     /* there may be nul bytes, encode byteval+1 */
 227     result_len += utf8_encode (NULL, *((guchar*)key + i) + 1);
 228
 229   result = g_malloc (result_len + 1);
 230   result_len = 0;
 231   for (i = 8; i < key_len * sizeof (UCCollationValue); i++)
 232     result_len += utf8_encode (result + result_len, *((guchar*)key + i) + 1);
 233
 234   result[result_len] = 0;
 235   return result;
 236 }
 237
 238 static gchar *
 239 carbon_collate_key_with_collator (const gchar *str,
 240                                   gssize       len,
 241                                   CollatorRef  collator)
 242 {
 243   UniChar *str_utf16 = NULL;
 244   glong len_utf16;
 245   OSStatus ret;
 246   UCCollationValue staticbuf[512];
 247   UCCollationValue *freeme = NULL;
 248   UCCollationValue *buf;
 249   ItemCount buf_len;
 250   ItemCount key_len;
 251   ItemCount try_len;
 252   gchar *result = NULL;
 253
 254   str_utf16 = g_utf8_to_utf16 (str, len, NULL, &len_utf16, NULL);
 255   try_len = len_utf16 * 5 + 2;
 256
 257   if (try_len <= sizeof staticbuf)
 258     {
 259       buf = staticbuf;
 260       buf_len = sizeof staticbuf;
 261     }
 262   else
 263     {
 264       freeme = g_new (UCCollationValue, try_len);
 265       buf = freeme;
 266       buf_len = try_len;
 267     }
 268
 269   ret = UCGetCollationKey (collator, str_utf16, len_utf16,
 270                            buf_len, &key_len, buf);
 271
 272   if (ret == kCollateBufferTooSmall)
 273     {
 274       freeme = g_renew (UCCollationValue, freeme, try_len * 2);
 275       buf = freeme;
 276       buf_len = try_len * 2;
 277       ret = UCGetCollationKey (collator, str_utf16, len_utf16,
 278                                buf_len, &key_len, buf);
 279     }
 280
 281   if (ret == 0)
 282     result = collate_key_to_string (buf, key_len);
 283   else
 284     result = g_strdup ("");
 285
 286   g_free (freeme);
 287   g_free (str_utf16);
 288   return result;
 289 }
 290
 291 static gchar *
 292 carbon_collate_key (const gchar *str,
 293                     gssize       len)
 294 {
 295   static CollatorRef collator;
 296
 297   if (G_UNLIKELY (!collator))
 298     {
 299       UCCreateCollator (NULL, 0, kUCCollateStandardOptions, &collator);
 300
 301       if (!collator)
 302         {
 303           static gboolean been_here;
 304           if (!been_here)
 305             g_warning ("%s: UCCreateCollator failed", G_STRLOC);
 306           been_here = TRUE;
 307           return g_strdup ("");
 308         }
 309     }
 310
 311   return carbon_collate_key_with_collator (str, len, collator);
 312 }
 313
 314 static gchar *
 315 carbon_collate_key_for_filename (const gchar *str,
 316                                  gssize       len)
 317 {
 318   static CollatorRef collator;
 319
 320   if (G_UNLIKELY (!collator))
 321     {
 322       /* http://developer.apple.com/qa/qa2004/qa1159.html */
 323       UCCreateCollator (NULL, 0,
 324                         kUCCollateComposeInsensitiveMask
 325                          | kUCCollateWidthInsensitiveMask
 326                          | kUCCollateCaseInsensitiveMask
 327                          | kUCCollateDigitsOverrideMask
 328                          | kUCCollateDigitsAsNumberMask
 329                          | kUCCollatePunctuationSignificantMask,
 330                         &collator);
 331
 332       if (!collator)
 333         {
 334           static gboolean been_here;
 335           if (!been_here)
 336             g_warning ("%s: UCCreateCollator failed", G_STRLOC);
 337           been_here = TRUE;
 338           return g_strdup ("");
 339         }
 340     }
 341
 342   return carbon_collate_key_with_collator (str, len, collator);
 343 }
 344
 345 #endif /* HAVE_CARBON */
 346
 347 /**
 348  * g_utf8_collate_key:
 349  * @str: a UTF-8 encoded string.
 350  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 351  *
 352  * Converts a string into a collation key that can be compared
 353  * with other collation keys produced by the same function using
 354  * strcmp().
 355  *
 356  * The results of comparing the collation keys of two strings
 357  * with strcmp() will always be the same as comparing the two
 358  * original keys with g_utf8_collate().
 359  *
 360  * Note that this function depends on the
 361  * <link linkend="setlocale">current locale</link>.
 362  *
 363  * Return value: a newly allocated string. This string should
 364  *   be freed with g_free() when you are done with it.
 365  **/
 366 gchar *
 367 g_utf8_collate_key (const gchar *str,
 368                     gssize       len)
 369 {
 370   gchar *result;
 371
 372 #ifdef HAVE_CARBON
 373
 374   g_return_val_if_fail (str != NULL, NULL);
 375   result = carbon_collate_key (str, len);
 376
 377 #elif defined(__STDC_ISO_10646__)
 378
 379   gsize xfrm_len;
 380   gunichar *str_norm;
 381   wchar_t *result_wc;
 382   gsize i;
 383   gsize result_len = 0;
 384
 385   g_return_val_if_fail (str != NULL, NULL);
 386
 387   str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
 388
 389   xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
 390   result_wc = g_new (wchar_t, xfrm_len + 1);
 391   wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
 392
 393   for (i=0; i < xfrm_len; i++)
 394     result_len += utf8_encode (NULL, result_wc[i]);
 395
 396   result = g_malloc (result_len + 1);
 397   result_len = 0;
 398   for (i=0; i < xfrm_len; i++)
 399     result_len += utf8_encode (result + result_len, result_wc[i]);
 400
 401   result[result_len] = '\0';
 402
 403   g_free (result_wc);
 404   g_free (str_norm);
 405
 406   return result;
 407 #else /* !__STDC_ISO_10646__ */
 408
 409   gsize xfrm_len;
 410   const gchar *charset;
 411   gchar *str_norm;
 412
 413   g_return_val_if_fail (str != NULL, NULL);
 414
 415   str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
 416
 417   result = NULL;
 418
 419   if (g_get_charset (&charset))
 420     {
 421       xfrm_len = strxfrm (NULL, str_norm, 0);
 422       if (xfrm_len >= 0 && xfrm_len < G_MAXINT - 2)
 423         {
 424           result = g_malloc (xfrm_len + 1);
 425           strxfrm (result, str_norm, xfrm_len + 1);
 426         }
 427     }
 428   else
 429     {
 430       gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 431
 432       if (str_locale)
 433         {
 434           xfrm_len = strxfrm (NULL, str_locale, 0);
 435           if (xfrm_len < 0 || xfrm_len >= G_MAXINT - 2)
 436             {
 437               g_free (str_locale);
 438               str_locale = NULL;
 439             }
 440         }
 441       if (str_locale)
 442         {
 443           result = g_malloc (xfrm_len + 2);
 444           result[0] = 'A';
 445           strxfrm (result + 1, str_locale, xfrm_len + 1);
 446
 447           g_free (str_locale);
 448         }
 449     }
 450
 451   if (!result)
 452     {
 453       xfrm_len = strlen (str_norm);
 454       result = g_malloc (xfrm_len + 2);
 455       result[0] = 'B';
 456       memcpy (result + 1, str_norm, xfrm_len);
 457       result[xfrm_len+1] = '\0';
 458     }
 459
 460   g_free (str_norm);
 461 #endif /* __STDC_ISO_10646__ */
 462
 463   return result;
 464 }
 465
 466 /* This is a collation key that is very very likely to sort before any
 467    collation key that libc strxfrm generates. We use this before any
 468    special case (dot or number) to make sure that its sorted before
 469    anything else.
 470  */
 471 #define COLLATION_SENTINEL "\1\1\1"
 472
 473 /**
 474  * g_utf8_collate_key_for_filename:
 475  * @str: a UTF-8 encoded string.
 476  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 477  *
 478  * Converts a string into a collation key that can be compared
 479  * with other collation keys produced by the same function using strcmp().
 480  *
 481  * In order to sort filenames correctly, this function treats the dot '.'
 482  * as a special case. Most dictionary orderings seem to consider it
 483  * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
 484  * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
 485  * would like to treat numbers intelligently so that "file1" "file10" "file5"
 486  * is sorted as "file1" "file5" "file10".
 487  *
 488  * Note that this function depends on the
 489  * <link linkend="setlocale">current locale</link>.
 490  *
 491  * Return value: a newly allocated string. This string should
 492  *   be freed with g_free() when you are done with it.
 493  *
 494  * Since: 2.8
 495  */
 496 gchar*
 497 g_utf8_collate_key_for_filename (const gchar *str,
 498                                  gssize       len)
 499 {
 500 #ifndef HAVE_CARBON
 501   GString *result;
 502   GString *append;
 503   const gchar *p;
 504   const gchar *prev;
 505   const gchar *end;
 506   gchar *collate_key;
 507   gint digits;
 508   gint leading_zeros;
 509
 510   /*
 511    * How it works:
 512    *
 513    * Split the filename into collatable substrings which do
 514    * not contain [.0-9] and special-cased substrings. The collatable
 515    * substrings are run through the normal g_utf8_collate_key() and the
 516    * resulting keys are concatenated with keys generated from the
 517    * special-cased substrings.
 518    *
 519    * Special cases: Dots are handled by replacing them with '\1' which
 520    * implies that short dot-delimited substrings are before long ones,
 521    * e.g.
 522    *
 523    *   a\1a   (a.a)
 524    *   a-\1a  (a-.a)
 525    *   aa\1a  (aa.a)
 526    *
 527    * Numbers are handled by prepending to each number d-1 superdigits
 528    * where d = number of digits in the number and SUPERDIGIT is a
 529    * character with an integer value higher than any digit (for instance
 530    * ':'). This ensures that single-digit numbers are sorted before
 531    * double-digit numbers which in turn are sorted separately from
 532    * triple-digit numbers, etc. To avoid strange side-effects when
 533    * sorting strings that already contain SUPERDIGITs, a '\2'
 534    * is also prepended, like this
 535    *
 536    *   file\21      (file1)
 537    *   file\25      (file5)
 538    *   file\2:10    (file10)
 539    *   file\2:26    (file26)
 540    *   file\2::100  (file100)
 541    *   file:foo     (file:foo)
 542    *
 543    * This has the side-effect of sorting numbers before everything else (except
 544    * dots), but this is probably OK.
 545    *
 546    * Leading digits are ignored when doing the above. To discriminate
 547    * numbers which differ only in the number of leading digits, we append
 548    * the number of leading digits as a byte at the very end of the collation
 549    * key.
 550    *
 551    * To try avoid conflict with any collation key sequence generated by libc we
 552    * start each switch to a special cased part with a sentinel that hopefully
 553    * will sort before anything libc will generate.
 554    */
 555
 556   if (len < 0)
 557     len = strlen (str);
 558
 559   result = g_string_sized_new (len * 2);
 560   append = g_string_sized_new (0);
 561
 562   end = str + len;
 563
 564   /* No need to use utf8 functions, since we're only looking for ascii chars */
 565   for (prev = p = str; p < end; p++)
 566     {
 567       switch (*p)
 568         {
 569         case '.':
 570           if (prev != p)
 571             {
 572               collate_key = g_utf8_collate_key (prev, p - prev);
 573               g_string_append (result, collate_key);
 574               g_free (collate_key);
 575             }
 576
 577           g_string_append (result, COLLATION_SENTINEL "\1");
 578
 579           /* skip the dot */
 580           prev = p + 1;
 581           break;
 582
 583         case '0':
 584         case '1':
 585         case '2':
 586         case '3':
 587         case '4':
 588         case '5':
 589         case '6':
 590         case '7':
 591         case '8':
 592         case '9':
 593           if (prev != p)
 594             {
 595               collate_key = g_utf8_collate_key (prev, p - prev);
 596               g_string_append (result, collate_key);
 597               g_free (collate_key);
 598             }
 599
 600           g_string_append (result, COLLATION_SENTINEL "\2");
 601
 602           prev = p;
 603
 604           /* write d-1 colons */
 605           if (*p == '0')
 606             {
 607               leading_zeros = 1;
 608               digits = 0;
 609             }
 610           else
 611             {
 612               leading_zeros = 0;
 613               digits = 1;
 614             }
 615
 616           while (++p < end)
 617             {
 618               if (*p == '0' && !digits)
 619                 ++leading_zeros;
 620               else if (g_ascii_isdigit(*p))
 621                 ++digits;
 622               else
 623                 {
 624                   /* count an all-zero sequence as
 625                    * one digit plus leading zeros
 626                    */
 627                   if (!digits)
 628                     {
 629                       ++digits;
 630                       --leading_zeros;
 631                     }
 632                   break;
 633                 }
 634             }
 635
 636           while (digits > 1)
 637             {
 638               g_string_append_c (result, ':');
 639               --digits;
 640             }
 641
 642           if (leading_zeros > 0)
 643             {
 644               g_string_append_c (append, (char)leading_zeros);
 645               prev += leading_zeros;
 646             }
 647
 648           /* write the number itself */
 649           g_string_append_len (result, prev, p - prev);
 650
 651           prev = p;
 652           --p;    /* go one step back to avoid disturbing outer loop */
 653           break;
 654
 655         default:
 656           /* other characters just accumulate */
 657           break;
 658         }
 659     }
 660
 661   if (prev != p)
 662     {
 663       collate_key = g_utf8_collate_key (prev, p - prev);
 664       g_string_append (result, collate_key);
 665       g_free (collate_key);
 666     }
 667
 668   g_string_append (result, append->str);
 669   g_string_free (append, TRUE);
 670
 671   return g_string_free (result, FALSE);
 672 #else /* HAVE_CARBON */
 673   return carbon_collate_key_for_filename (str, len);
 674 #endif
 675 }
 676
 677
 678 #define __G_UNICOLLATE_C__
 679 #include "galiasdef.c"