glib/gunicollate.c

   1 /* gunicollate.c - Collation
   2  *
   3  *  Copyright 2001,2005 Red Hat, Inc.
   4  *
   5  * The Gnome Library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public License as
   7  * published by the Free Software Foundation; either version 2 of the
   8  * License, or (at your option) any later version.
   9  *
  10  * The Gnome Library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
  17  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  18  *   Boston, MA 02111-1307, USA.
  19  */
  20
  21 #include "config.h"
  22
  23 #include <locale.h>
  24 #include <string.h>
  25 #ifdef __STDC_ISO_10646__
  26 #include <wchar.h>
  27 #endif
  28
  29 #include "glib.h"
  30 #include "gunicodeprivate.h"
  31 #include "galias.h"
  32
  33 #ifdef _MSC_VER
  34 /* Workaround for bug in MSVCR80.DLL */
  35 static size_t
  36 msc_strxfrm_wrapper (char       *string1,
  37                      const char *string2,
  38                      size_t      count)
  39 {
  40   if (!string1 || count <= 0)
  41     {
  42       char tmp;
  43
  44       return strxfrm (&tmp, string2, 1);
  45     }
  46   return strxfrm (string1, string2, count);
  47 }
  48 #define strxfrm msc_strxfrm_wrapper
  49 #endif
  50
  51 /**
  52  * g_utf8_collate:
  53  * @str1: a UTF-8 encoded string
  54  * @str2: a UTF-8 encoded string
  55  *
  56  * Compares two strings for ordering using the linguistically
  57  * correct rules for the current locale. When sorting a large
  58  * number of strings, it will be significantly faster to
  59  * obtain collation keys with g_utf8_collate_key() and
  60  * compare the keys with strcmp() when
  61  * sorting instead of sorting the original strings.
  62  *
  63  * Return value: &lt; 0 if @str1 compares before @str2,
  64  *   0 if they compare equal, &gt; 0 if @str1 compares after @str2.
  65  **/
  66 gint
  67 g_utf8_collate (const gchar *str1,
  68                 const gchar *str2)
  69 {
  70   gint result;
  71
  72 #ifdef __STDC_ISO_10646__
  73
  74   gunichar *str1_norm;
  75   gunichar *str2_norm;
  76
  77   g_return_val_if_fail (str1 != NULL, 0);
  78   g_return_val_if_fail (str2 != NULL, 0);
  79
  80   str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
  81   str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
  82
  83   result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
  84
  85   g_free (str1_norm);
  86   g_free (str2_norm);
  87
  88 #else /* !__STDC_ISO_10646__ */
  89
  90   const gchar *charset;
  91   gchar *str1_norm;
  92   gchar *str2_norm;
  93
  94   g_return_val_if_fail (str1 != NULL, 0);
  95   g_return_val_if_fail (str2 != NULL, 0);
  96
  97   str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
  98   str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
  99
 100   if (g_get_charset (&charset))
 101     {
 102       result = strcoll (str1_norm, str2_norm);
 103     }
 104   else
 105     {
 106       gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 107       gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 108
 109       if (str1_locale && str2_locale)
 110         result =  strcoll (str1_locale, str2_locale);
 111       else if (str1_locale)
 112         result = -1;
 113       else if (str2_locale)
 114         result = 1;
 115       else
 116         result = strcmp (str1_norm, str2_norm);
 117
 118       g_free (str1_locale);
 119       g_free (str2_locale);
 120     }
 121
 122   g_free (str1_norm);
 123   g_free (str2_norm);
 124
 125 #endif /* __STDC_ISO_10646__ */
 126
 127   return result;
 128 }
 129
 130 #ifdef __STDC_ISO_10646__
 131 /* We need UTF-8 encoding of numbers to encode the weights if
 132  * we are using wcsxfrm. However, we aren't encoding Unicode
 133  * characters, so we can't simply use g_unichar_to_utf8.
 134  *
 135  * The following routine is taken (with modification) from GNU
 136  * libc's strxfrm routine:
 137  *
 138  * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
 139  * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
 140  */
 141 static inline int
 142 utf8_encode (char *buf, wchar_t val)
 143 {
 144   int retval;
 145
 146   if (val < 0x80)
 147     {
 148       if (buf)
 149         *buf++ = (char) val;
 150       retval = 1;
 151     }
 152   else
 153     {
 154       int step;
 155
 156       for (step = 2; step < 6; ++step)
 157         if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
 158           break;
 159       retval = step;
 160
 161       if (buf)
 162         {
 163           *buf = (unsigned char) (~0xff >> step);
 164           --step;
 165           do
 166             {
 167               buf[step] = 0x80 | (val & 0x3f);
 168               val >>= 6;
 169             }
 170           while (--step > 0);
 171           *buf |= val;
 172         }
 173     }
 174
 175   return retval;
 176 }
 177 #endif /* __STDC_ISO_10646__ */
 178
 179 /**
 180  * g_utf8_collate_key:
 181  * @str: a UTF-8 encoded string.
 182  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 183  *
 184  * Converts a string into a collation key that can be compared
 185  * with other collation keys produced by the same function using
 186  * strcmp().
 187  * The results of comparing the collation keys of two strings
 188  * with strcmp() will always be the same as
 189  * comparing the two original keys with g_utf8_collate().
 190  *
 191  * Return value: a newly allocated string. This string should
 192  *   be freed with g_free() when you are done with it.
 193  **/
 194 gchar *
 195 g_utf8_collate_key (const gchar *str,
 196                     gssize       len)
 197 {
 198   gchar *result;
 199   size_t xfrm_len;
 200
 201 #ifdef __STDC_ISO_10646__
 202
 203   gunichar *str_norm;
 204   wchar_t *result_wc;
 205   size_t i;
 206   size_t result_len = 0;
 207
 208   g_return_val_if_fail (str != NULL, NULL);
 209
 210   str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
 211
 212   setlocale (LC_COLLATE, "");
 213
 214   xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
 215   result_wc = g_new (wchar_t, xfrm_len + 1);
 216   wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
 217
 218   for (i=0; i < xfrm_len; i++)
 219     result_len += utf8_encode (NULL, result_wc[i]);
 220
 221   result = g_malloc (result_len + 1);
 222   result_len = 0;
 223   for (i=0; i < xfrm_len; i++)
 224     result_len += utf8_encode (result + result_len, result_wc[i]);
 225
 226   result[result_len] = '\0';
 227
 228   g_free (result_wc);
 229   g_free (str_norm);
 230
 231   return result;
 232 #else /* !__STDC_ISO_10646__ */
 233
 234   const gchar *charset;
 235   gchar *str_norm;
 236
 237   g_return_val_if_fail (str != NULL, NULL);
 238
 239   str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
 240
 241   if (g_get_charset (&charset))
 242     {
 243       xfrm_len = strxfrm (NULL, str_norm, 0);
 244       result = g_malloc (xfrm_len + 1);
 245       strxfrm (result, str_norm, xfrm_len + 1);
 246     }
 247   else
 248     {
 249       gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
 250
 251       if (str_locale)
 252         {
 253           xfrm_len = strxfrm (NULL, str_locale, 0);
 254           if (xfrm_len < 0 || xfrm_len >= G_MAXINT - 2)
 255             {
 256               g_free (str_locale);
 257               str_locale = NULL;
 258             }
 259         }
 260       if (str_locale)
 261         {
 262           result = g_malloc (xfrm_len + 2);
 263           result[0] = 'A';
 264           strxfrm (result + 1, str_locale, xfrm_len + 1);
 265
 266           g_free (str_locale);
 267         }
 268       else
 269         {
 270           xfrm_len = strlen (str_norm);
 271           result = g_malloc (xfrm_len + 2);
 272           result[0] = 'B';
 273           memcpy (result + 1, str_norm, xfrm_len);
 274           result[xfrm_len+1] = '\0';
 275         }
 276     }
 277
 278   g_free (str_norm);
 279 #endif /* __STDC_ISO_10646__ */
 280
 281   return result;
 282 }
 283
 284 /* This is a collation key that is very very likely to sort before any
 285    collation key that libc strxfrm generates. We use this before any
 286    special case (dot or number) to make sure that its sorted before
 287    anything else.
 288  */
 289 #define COLLATION_SENTINEL "\1\1\1"
 290
 291 /**
 292  * g_utf8_collate_key_for_filename:
 293  * @str: a UTF-8 encoded string.
 294  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 295  *
 296  * Converts a string into a collation key that can be compared
 297  * with other collation keys produced by the same function using strcmp().
 298  *
 299  * In order to sort filenames correctly, this function treats the dot '.'
 300  * as a special case. Most dictionary orderings seem to consider it
 301  * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
 302  * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
 303  * would like to treat numbers intelligently so that "file1" "file10" "file5"
 304  * is sorted as "file1" "file5" "file10".
 305  *
 306  * Return value: a newly allocated string. This string should
 307  *   be freed with g_free() when you are done with it.
 308  *
 309  * Since: 2.8
 310  */
 311 gchar*
 312 g_utf8_collate_key_for_filename (const gchar *str,
 313                                  gssize       len)
 314 {
 315   GString *result;
 316   GString *append;
 317   const gchar *p;
 318   const gchar *prev;
 319   const gchar *end;
 320   gchar *collate_key;
 321   gint digits;
 322   gint leading_zeros;
 323
 324   /*
 325    * How it works:
 326    *
 327    * Split the filename into collatable substrings which do
 328    * not contain [.0-9] and special-cased substrings. The collatable
 329    * substrings are run through the normal g_utf8_collate_key() and the
 330    * resulting keys are concatenated with keys generated from the
 331    * special-cased substrings.
 332    *
 333    * Special cases: Dots are handled by replacing them with '\1' which
 334    * implies that short dot-delimited substrings are before long ones,
 335    * e.g.
 336    *
 337    *   a\1a   (a.a)
 338    *   a-\1a  (a-.a)
 339    *   aa\1a  (aa.a)
 340    *
 341    * Numbers are handled by prepending to each number d-1 superdigits
 342    * where d = number of digits in the number and SUPERDIGIT is a
 343    * character with an integer value higher than any digit (for instance
 344    * ':'). This ensures that single-digit numbers are sorted before
 345    * double-digit numbers which in turn are sorted separately from
 346    * triple-digit numbers, etc. To avoid strange side-effects when
 347    * sorting strings that already contain SUPERDIGITs, a '\2'
 348    * is also prepended, like this
 349    *
 350    *   file\21      (file1)
 351    *   file\25      (file5)
 352    *   file\2:10    (file10)
 353    *   file\2:26    (file26)
 354    *   file\2::100  (file100)
 355    *   file:foo     (file:foo)
 356    *
 357    * This has the side-effect of sorting numbers before everything else (except
 358    * dots), but this is probably OK.
 359    *
 360    * Leading digits are ignored when doing the above. To discriminate
 361    * numbers which differ only in the number of leading digits, we append
 362    * the number of leading digits as a byte at the very end of the collation
 363    * key.
 364    *
 365    * To try avoid conflict with any collation key sequence generated by libc we
 366    * start each switch to a special cased part with a sentinel that hopefully
 367    * will sort before anything libc will generate.
 368    */
 369
 370   if (len < 0)
 371     len = strlen (str);
 372
 373   result = g_string_sized_new (len * 2);
 374   append = g_string_sized_new (0);
 375
 376   end = str + len;
 377
 378   /* No need to use utf8 functions, since we're only looking for ascii chars */
 379   for (prev = p = str; p < end; p++)
 380     {
 381       switch (*p)
 382         {
 383         case '.':
 384           if (prev != p)
 385             {
 386               collate_key = g_utf8_collate_key (prev, p - prev);
 387               g_string_append (result, collate_key);
 388               g_free (collate_key);
 389             }
 390
 391           g_string_append (result, COLLATION_SENTINEL "\1");
 392
 393           /* skip the dot */
 394           prev = p + 1;
 395           break;
 396
 397         case '0':
 398         case '1':
 399         case '2':
 400         case '3':
 401         case '4':
 402         case '5':
 403         case '6':
 404         case '7':
 405         case '8':
 406         case '9':
 407           if (prev != p)
 408             {
 409               collate_key = g_utf8_collate_key (prev, p - prev);
 410               g_string_append (result, collate_key);
 411               g_free (collate_key);
 412             }
 413
 414           g_string_append (result, COLLATION_SENTINEL "\2");
 415
 416           prev = p;
 417
 418           /* write d-1 colons */
 419           if (*p == '0')
 420             {
 421               leading_zeros = 1;
 422               digits = 0;
 423             }
 424           else
 425             {
 426               leading_zeros = 0;
 427               digits = 1;
 428             }
 429
 430           while (++p < end)
 431             {
 432               if (*p == '0' && !digits)
 433                 ++leading_zeros;
 434               else if (g_ascii_isdigit(*p))
 435                 ++digits;
 436               else
 437                 {
 438                   /* count an all-zero sequence as
 439                    * one digit plus leading zeros
 440                    */
 441                   if (!digits)
 442                     {
 443                       ++digits;
 444                       --leading_zeros;
 445                     }
 446                   break;
 447                 }
 448             }
 449
 450           while (digits > 1)
 451             {
 452               g_string_append_c (result, ':');
 453               --digits;
 454             }
 455
 456           if (leading_zeros > 0)
 457             {
 458               g_string_append_c (append, (char)leading_zeros);
 459               prev += leading_zeros;
 460             }
 461
 462           /* write the number itself */
 463           g_string_append_len (result, prev, p - prev);
 464
 465           prev = p;
 466           --p;    /* go one step back to avoid disturbing outer loop */
 467           break;
 468
 469         default:
 470           /* other characters just accumulate */
 471           break;
 472         }
 473     }
 474
 475   if (prev != p)
 476     {
 477       collate_key = g_utf8_collate_key (prev, p - prev);
 478       g_string_append (result, collate_key);
 479       g_free (collate_key);
 480     }
 481
 482   g_string_append (result, append->str);
 483   g_string_free (append, TRUE);
 484
 485   return g_string_free (result, FALSE);
 486 }
 487
 488
 489 #define __G_UNICOLLATE_C__
 490 #include "galiasdef.c"