Cosmetic rearrangement
[glib.git] / glib / gunicollate.c
blob165ecbc250de18a62332027a8ce1b5d412dcc6c3
1 /* gunicollate.c - Collation
3 * Copyright 2001,2005 Red Hat, Inc.
5 * The Gnome Library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
10 * The Gnome Library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with the Gnome Library; see the file COPYING.LIB. If not,
17 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 * Boston, MA 02111-1307, USA.
21 #include "config.h"
23 #include <locale.h>
24 #include <string.h>
25 #ifdef __STDC_ISO_10646__
26 #include <wchar.h>
27 #endif
29 #ifdef HAVE_CARBON
30 #include <CoreServices/CoreServices.h>
31 #endif
33 #include "gmem.h"
34 #include "gunicode.h"
35 #include "gunicodeprivate.h"
36 #include "gstring.h"
37 #include "gstrfuncs.h"
38 #include "gtestutils.h"
39 #ifndef __STDC_ISO_10646__
40 #include "gconvert.h"
41 #endif
44 #ifdef _MSC_VER
45 /* Workaround for bug in MSVCR80.DLL */
46 static gsize
47 msc_strxfrm_wrapper (char *string1,
48 const char *string2,
49 gsize count)
51 if (!string1 || count <= 0)
53 char tmp;
55 return strxfrm (&tmp, string2, 1);
57 return strxfrm (string1, string2, count);
59 #define strxfrm msc_strxfrm_wrapper
60 #endif
62 /**
63 * g_utf8_collate:
64 * @str1: a UTF-8 encoded string
65 * @str2: a UTF-8 encoded string
67 * Compares two strings for ordering using the linguistically
68 * correct rules for the <link linkend="setlocale">current locale</link>.
69 * When sorting a large number of strings, it will be significantly
70 * faster to obtain collation keys with g_utf8_collate_key() and
71 * compare the keys with strcmp() when sorting instead of sorting
72 * the original strings.
74 * Return value: &lt; 0 if @str1 compares before @str2,
75 * 0 if they compare equal, &gt; 0 if @str1 compares after @str2.
76 **/
77 gint
78 g_utf8_collate (const gchar *str1,
79 const gchar *str2)
81 gint result;
83 #ifdef HAVE_CARBON
85 UniChar *str1_utf16;
86 UniChar *str2_utf16;
87 glong len1;
88 glong len2;
89 SInt32 retval = 0;
91 g_return_val_if_fail (str1 != NULL, 0);
92 g_return_val_if_fail (str2 != NULL, 0);
94 str1_utf16 = g_utf8_to_utf16 (str1, -1, NULL, &len1, NULL);
95 str2_utf16 = g_utf8_to_utf16 (str2, -1, NULL, &len2, NULL);
97 UCCompareTextDefault (kUCCollateStandardOptions,
98 str1_utf16, len1, str2_utf16, len2,
99 NULL, &retval);
100 result = retval;
102 g_free (str2_utf16);
103 g_free (str1_utf16);
105 #elif defined(__STDC_ISO_10646__)
107 gunichar *str1_norm;
108 gunichar *str2_norm;
110 g_return_val_if_fail (str1 != NULL, 0);
111 g_return_val_if_fail (str2 != NULL, 0);
113 str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
114 str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
116 result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
118 g_free (str1_norm);
119 g_free (str2_norm);
121 #else /* !__STDC_ISO_10646__ */
123 const gchar *charset;
124 gchar *str1_norm;
125 gchar *str2_norm;
127 g_return_val_if_fail (str1 != NULL, 0);
128 g_return_val_if_fail (str2 != NULL, 0);
130 str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
131 str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
133 if (g_get_charset (&charset))
135 result = strcoll (str1_norm, str2_norm);
137 else
139 gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
140 gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
142 if (str1_locale && str2_locale)
143 result = strcoll (str1_locale, str2_locale);
144 else if (str1_locale)
145 result = -1;
146 else if (str2_locale)
147 result = 1;
148 else
149 result = strcmp (str1_norm, str2_norm);
151 g_free (str1_locale);
152 g_free (str2_locale);
155 g_free (str1_norm);
156 g_free (str2_norm);
158 #endif /* __STDC_ISO_10646__ */
160 return result;
163 #if defined(__STDC_ISO_10646__) || defined(HAVE_CARBON)
164 /* We need UTF-8 encoding of numbers to encode the weights if
165 * we are using wcsxfrm. However, we aren't encoding Unicode
166 * characters, so we can't simply use g_unichar_to_utf8.
168 * The following routine is taken (with modification) from GNU
169 * libc's strxfrm routine:
171 * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
172 * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
174 static inline int
175 utf8_encode (char *buf, wchar_t val)
177 int retval;
179 if (val < 0x80)
181 if (buf)
182 *buf++ = (char) val;
183 retval = 1;
185 else
187 int step;
189 for (step = 2; step < 6; ++step)
190 if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
191 break;
192 retval = step;
194 if (buf)
196 *buf = (unsigned char) (~0xff >> step);
197 --step;
200 buf[step] = 0x80 | (val & 0x3f);
201 val >>= 6;
203 while (--step > 0);
204 *buf |= val;
208 return retval;
210 #endif /* __STDC_ISO_10646__ || HAVE_CARBON */
212 #ifdef HAVE_CARBON
214 static gchar *
215 collate_key_to_string (UCCollationValue *key,
216 gsize key_len)
218 gchar *result;
219 gsize result_len;
220 gsize i;
222 /* Pretty smart algorithm here: ignore first eight bytes of the
223 * collation key. It doesn't produce results equivalent to
224 * UCCompareCollationKeys's, but the difference seems to be only
225 * that UCCompareCollationKeys in some cases produces 0 where our
226 * comparison gets -1 or 1. */
228 if (key_len * sizeof (UCCollationValue) <= 8)
229 return g_strdup ("");
231 result_len = 0;
232 for (i = 8; i < key_len * sizeof (UCCollationValue); i++)
233 /* there may be nul bytes, encode byteval+1 */
234 result_len += utf8_encode (NULL, *((guchar*)key + i) + 1);
236 result = g_malloc (result_len + 1);
237 result_len = 0;
238 for (i = 8; i < key_len * sizeof (UCCollationValue); i++)
239 result_len += utf8_encode (result + result_len, *((guchar*)key + i) + 1);
241 result[result_len] = 0;
242 return result;
245 static gchar *
246 carbon_collate_key_with_collator (const gchar *str,
247 gssize len,
248 CollatorRef collator)
250 UniChar *str_utf16 = NULL;
251 glong len_utf16;
252 OSStatus ret;
253 UCCollationValue staticbuf[512];
254 UCCollationValue *freeme = NULL;
255 UCCollationValue *buf;
256 ItemCount buf_len;
257 ItemCount key_len;
258 ItemCount try_len;
259 gchar *result = NULL;
261 str_utf16 = g_utf8_to_utf16 (str, len, NULL, &len_utf16, NULL);
262 try_len = len_utf16 * 5 + 2;
264 if (try_len <= sizeof staticbuf)
266 buf = staticbuf;
267 buf_len = sizeof staticbuf;
269 else
271 freeme = g_new (UCCollationValue, try_len);
272 buf = freeme;
273 buf_len = try_len;
276 ret = UCGetCollationKey (collator, str_utf16, len_utf16,
277 buf_len, &key_len, buf);
279 if (ret == kCollateBufferTooSmall)
281 freeme = g_renew (UCCollationValue, freeme, try_len * 2);
282 buf = freeme;
283 buf_len = try_len * 2;
284 ret = UCGetCollationKey (collator, str_utf16, len_utf16,
285 buf_len, &key_len, buf);
288 if (ret == 0)
289 result = collate_key_to_string (buf, key_len);
290 else
291 result = g_strdup ("");
293 g_free (freeme);
294 g_free (str_utf16);
295 return result;
298 static gchar *
299 carbon_collate_key (const gchar *str,
300 gssize len)
302 static CollatorRef collator;
304 if (G_UNLIKELY (!collator))
306 UCCreateCollator (NULL, 0, kUCCollateStandardOptions, &collator);
308 if (!collator)
310 static gboolean been_here;
311 if (!been_here)
312 g_warning ("%s: UCCreateCollator failed", G_STRLOC);
313 been_here = TRUE;
314 return g_strdup ("");
318 return carbon_collate_key_with_collator (str, len, collator);
321 static gchar *
322 carbon_collate_key_for_filename (const gchar *str,
323 gssize len)
325 static CollatorRef collator;
327 if (G_UNLIKELY (!collator))
329 /* http://developer.apple.com/qa/qa2004/qa1159.html */
330 UCCreateCollator (NULL, 0,
331 kUCCollateComposeInsensitiveMask
332 | kUCCollateWidthInsensitiveMask
333 | kUCCollateCaseInsensitiveMask
334 | kUCCollateDigitsOverrideMask
335 | kUCCollateDigitsAsNumberMask
336 | kUCCollatePunctuationSignificantMask,
337 &collator);
339 if (!collator)
341 static gboolean been_here;
342 if (!been_here)
343 g_warning ("%s: UCCreateCollator failed", G_STRLOC);
344 been_here = TRUE;
345 return g_strdup ("");
349 return carbon_collate_key_with_collator (str, len, collator);
352 #endif /* HAVE_CARBON */
355 * g_utf8_collate_key:
356 * @str: a UTF-8 encoded string.
357 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
359 * Converts a string into a collation key that can be compared
360 * with other collation keys produced by the same function using
361 * strcmp().
363 * The results of comparing the collation keys of two strings
364 * with strcmp() will always be the same as comparing the two
365 * original keys with g_utf8_collate().
367 * Note that this function depends on the
368 * <link linkend="setlocale">current locale</link>.
370 * Return value: a newly allocated string. This string should
371 * be freed with g_free() when you are done with it.
373 gchar *
374 g_utf8_collate_key (const gchar *str,
375 gssize len)
377 gchar *result;
379 #ifdef HAVE_CARBON
381 g_return_val_if_fail (str != NULL, NULL);
382 result = carbon_collate_key (str, len);
384 #elif defined(__STDC_ISO_10646__)
386 gsize xfrm_len;
387 gunichar *str_norm;
388 wchar_t *result_wc;
389 gsize i;
390 gsize result_len = 0;
392 g_return_val_if_fail (str != NULL, NULL);
394 str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
396 xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
397 result_wc = g_new (wchar_t, xfrm_len + 1);
398 wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
400 for (i=0; i < xfrm_len; i++)
401 result_len += utf8_encode (NULL, result_wc[i]);
403 result = g_malloc (result_len + 1);
404 result_len = 0;
405 for (i=0; i < xfrm_len; i++)
406 result_len += utf8_encode (result + result_len, result_wc[i]);
408 result[result_len] = '\0';
410 g_free (result_wc);
411 g_free (str_norm);
413 return result;
414 #else /* !__STDC_ISO_10646__ */
416 gsize xfrm_len;
417 const gchar *charset;
418 gchar *str_norm;
420 g_return_val_if_fail (str != NULL, NULL);
422 str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
424 result = NULL;
426 if (g_get_charset (&charset))
428 xfrm_len = strxfrm (NULL, str_norm, 0);
429 if (xfrm_len >= 0 && xfrm_len < G_MAXINT - 2)
431 result = g_malloc (xfrm_len + 1);
432 strxfrm (result, str_norm, xfrm_len + 1);
435 else
437 gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
439 if (str_locale)
441 xfrm_len = strxfrm (NULL, str_locale, 0);
442 if (xfrm_len < 0 || xfrm_len >= G_MAXINT - 2)
444 g_free (str_locale);
445 str_locale = NULL;
448 if (str_locale)
450 result = g_malloc (xfrm_len + 2);
451 result[0] = 'A';
452 strxfrm (result + 1, str_locale, xfrm_len + 1);
454 g_free (str_locale);
458 if (!result)
460 xfrm_len = strlen (str_norm);
461 result = g_malloc (xfrm_len + 2);
462 result[0] = 'B';
463 memcpy (result + 1, str_norm, xfrm_len);
464 result[xfrm_len+1] = '\0';
467 g_free (str_norm);
468 #endif /* __STDC_ISO_10646__ */
470 return result;
473 /* This is a collation key that is very very likely to sort before any
474 collation key that libc strxfrm generates. We use this before any
475 special case (dot or number) to make sure that its sorted before
476 anything else.
478 #define COLLATION_SENTINEL "\1\1\1"
481 * g_utf8_collate_key_for_filename:
482 * @str: a UTF-8 encoded string.
483 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
485 * Converts a string into a collation key that can be compared
486 * with other collation keys produced by the same function using strcmp().
488 * In order to sort filenames correctly, this function treats the dot '.'
489 * as a special case. Most dictionary orderings seem to consider it
490 * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
491 * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
492 * would like to treat numbers intelligently so that "file1" "file10" "file5"
493 * is sorted as "file1" "file5" "file10".
495 * Note that this function depends on the
496 * <link linkend="setlocale">current locale</link>.
498 * Return value: a newly allocated string. This string should
499 * be freed with g_free() when you are done with it.
501 * Since: 2.8
503 gchar*
504 g_utf8_collate_key_for_filename (const gchar *str,
505 gssize len)
507 #ifndef HAVE_CARBON
508 GString *result;
509 GString *append;
510 const gchar *p;
511 const gchar *prev;
512 const gchar *end;
513 gchar *collate_key;
514 gint digits;
515 gint leading_zeros;
518 * How it works:
520 * Split the filename into collatable substrings which do
521 * not contain [.0-9] and special-cased substrings. The collatable
522 * substrings are run through the normal g_utf8_collate_key() and the
523 * resulting keys are concatenated with keys generated from the
524 * special-cased substrings.
526 * Special cases: Dots are handled by replacing them with '\1' which
527 * implies that short dot-delimited substrings are before long ones,
528 * e.g.
530 * a\1a (a.a)
531 * a-\1a (a-.a)
532 * aa\1a (aa.a)
534 * Numbers are handled by prepending to each number d-1 superdigits
535 * where d = number of digits in the number and SUPERDIGIT is a
536 * character with an integer value higher than any digit (for instance
537 * ':'). This ensures that single-digit numbers are sorted before
538 * double-digit numbers which in turn are sorted separately from
539 * triple-digit numbers, etc. To avoid strange side-effects when
540 * sorting strings that already contain SUPERDIGITs, a '\2'
541 * is also prepended, like this
543 * file\21 (file1)
544 * file\25 (file5)
545 * file\2:10 (file10)
546 * file\2:26 (file26)
547 * file\2::100 (file100)
548 * file:foo (file:foo)
550 * This has the side-effect of sorting numbers before everything else (except
551 * dots), but this is probably OK.
553 * Leading digits are ignored when doing the above. To discriminate
554 * numbers which differ only in the number of leading digits, we append
555 * the number of leading digits as a byte at the very end of the collation
556 * key.
558 * To try avoid conflict with any collation key sequence generated by libc we
559 * start each switch to a special cased part with a sentinel that hopefully
560 * will sort before anything libc will generate.
563 if (len < 0)
564 len = strlen (str);
566 result = g_string_sized_new (len * 2);
567 append = g_string_sized_new (0);
569 end = str + len;
571 /* No need to use utf8 functions, since we're only looking for ascii chars */
572 for (prev = p = str; p < end; p++)
574 switch (*p)
576 case '.':
577 if (prev != p)
579 collate_key = g_utf8_collate_key (prev, p - prev);
580 g_string_append (result, collate_key);
581 g_free (collate_key);
584 g_string_append (result, COLLATION_SENTINEL "\1");
586 /* skip the dot */
587 prev = p + 1;
588 break;
590 case '0':
591 case '1':
592 case '2':
593 case '3':
594 case '4':
595 case '5':
596 case '6':
597 case '7':
598 case '8':
599 case '9':
600 if (prev != p)
602 collate_key = g_utf8_collate_key (prev, p - prev);
603 g_string_append (result, collate_key);
604 g_free (collate_key);
607 g_string_append (result, COLLATION_SENTINEL "\2");
609 prev = p;
611 /* write d-1 colons */
612 if (*p == '0')
614 leading_zeros = 1;
615 digits = 0;
617 else
619 leading_zeros = 0;
620 digits = 1;
623 while (++p < end)
625 if (*p == '0' && !digits)
626 ++leading_zeros;
627 else if (g_ascii_isdigit(*p))
628 ++digits;
629 else
631 /* count an all-zero sequence as
632 * one digit plus leading zeros
634 if (!digits)
636 ++digits;
637 --leading_zeros;
639 break;
643 while (digits > 1)
645 g_string_append_c (result, ':');
646 --digits;
649 if (leading_zeros > 0)
651 g_string_append_c (append, (char)leading_zeros);
652 prev += leading_zeros;
655 /* write the number itself */
656 g_string_append_len (result, prev, p - prev);
658 prev = p;
659 --p; /* go one step back to avoid disturbing outer loop */
660 break;
662 default:
663 /* other characters just accumulate */
664 break;
668 if (prev != p)
670 collate_key = g_utf8_collate_key (prev, p - prev);
671 g_string_append (result, collate_key);
672 g_free (collate_key);
675 g_string_append (result, append->str);
676 g_string_free (append, TRUE);
678 return g_string_free (result, FALSE);
679 #else /* HAVE_CARBON */
680 return carbon_collate_key_for_filename (str, len);
681 #endif