1 /* guniprop.c - Unicode character properties.
3 * Copyright (C) 1999 Tom Tromey
4 * Copyright (C) 2000 Red Hat, Inc.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
29 #include "gunichartables.h"
30 #include "gunicodeprivate.h"
/* Select the attribute page index for a 256-character page.  Pages up to
 * G_UNICODE_LAST_PAGE_PART1 are looked up in attr_table_part1; anything
 * above is looked up in attr_table_part2, rebased by 0xe00 pages.
 */
#define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
                          ? attr_table_part1[Page] \
                          : attr_table_part2[(Page) - 0xe00])

/* Attribute value for character Char within page Page.  A page whose index
 * is G_UNICODE_MAX_TABLE_INDEX has no per-character data and yields 0.
 */
#define ATTTABLE(Page, Char) \
  ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char]))

/* Type lookup in the first table.  A page entry at or above
 * G_UNICODE_MAX_TABLE_INDEX encodes one type shared by the whole page
 * (entry minus G_UNICODE_MAX_TABLE_INDEX); otherwise the entry indexes
 * type_data, which holds one type per character.
 */
#define TTYPE_PART1(Page, Char) \
  ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part1[Page]][Char]))

/* Same scheme as TTYPE_PART1, for the second table. */
#define TTYPE_PART2(Page, Char) \
  ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part2[Page]][Char]))

/* Type of character Char.  Characters up to G_UNICODE_LAST_CHAR_PART1 use
 * the first table; characters from 0xe0000 through G_UNICODE_LAST_CHAR use
 * the second table (rebased to 0xe0000); everything else is reported as
 * G_UNICODE_UNASSIGNED.  Note that Char is evaluated more than once.
 */
#define TYPE(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : G_UNICODE_UNASSIGNED))

/* Any of the three number types (decimal, letter, other). */
#define ISDIGIT(Type) ((Type) == G_UNICODE_DECIMAL_NUMBER \
                       || (Type) == G_UNICODE_LETTER_NUMBER \
                       || (Type) == G_UNICODE_OTHER_NUMBER)

/* Any of the five letter types. */
#define ISALPHA(Type) ((Type) == G_UNICODE_LOWERCASE_LETTER \
                       || (Type) == G_UNICODE_UPPERCASE_LETTER \
                       || (Type) == G_UNICODE_TITLECASE_LETTER \
                       || (Type) == G_UNICODE_MODIFIER_LETTER \
                       || (Type) == G_UNICODE_OTHER_LETTER)

/* Any of the three mark types. */
#define ISMARK(Type) ((Type) == G_UNICODE_NON_SPACING_MARK || \
                      (Type) == G_UNICODE_COMBINING_MARK || \
                      (Type) == G_UNICODE_ENCLOSING_MARK)
74 * @c: a Unicode character
76 * Determines whether a character is alphanumeric.
77 * Given some UTF-8 text, obtain a character value
78 * with g_utf8_get_char().
80 * Return value: %TRUE if @c is an alphanumeric character
83 g_unichar_isalnum (gunichar c
)
86 return ISDIGIT (t
) || ISALPHA (t
);
91 * @c: a Unicode character
93 * Determines whether a character is alphabetic (i.e. a letter).
94 * Given some UTF-8 text, obtain a character value with
97 * Return value: %TRUE if @c is an alphabetic character
100 g_unichar_isalpha (gunichar c
)
109 * @c: a Unicode character
111 * Determines whether a character is a control character.
112 * Given some UTF-8 text, obtain a character value with
115 * Return value: %TRUE if @c is a control character
118 g_unichar_iscntrl (gunichar c
)
120 return TYPE (c
) == G_UNICODE_CONTROL
;
125 * @c: a Unicode character
127 * Determines whether a character is numeric (i.e. a digit). This
128 * covers ASCII 0-9 and also digits in other languages/scripts. Given
129 * some UTF-8 text, obtain a character value with g_utf8_get_char().
131 * Return value: %TRUE if @c is a digit
134 g_unichar_isdigit (gunichar c
)
136 return TYPE (c
) == G_UNICODE_DECIMAL_NUMBER
;
142 * @c: a Unicode character
144 * Determines whether a character is printable and not a space
145 * (returns %FALSE for control characters, format characters, and
146 * spaces). g_unichar_isprint() is similar, but returns %TRUE for
147 * spaces. Given some UTF-8 text, obtain a character value with
150 * Return value: %TRUE if @c is printable unless it's a space
153 g_unichar_isgraph (gunichar c
)
156 return (t
!= G_UNICODE_CONTROL
157 && t
!= G_UNICODE_FORMAT
158 && t
!= G_UNICODE_UNASSIGNED
159 && t
!= G_UNICODE_PRIVATE_USE
160 && t
!= G_UNICODE_SURROGATE
161 && t
!= G_UNICODE_SPACE_SEPARATOR
);
166 * @c: a Unicode character
168 * Determines whether a character is a lowercase letter.
169 * Given some UTF-8 text, obtain a character value with
172 * Return value: %TRUE if @c is a lowercase letter
175 g_unichar_islower (gunichar c
)
177 return TYPE (c
) == G_UNICODE_LOWERCASE_LETTER
;
183 * @c: a Unicode character
185 * Determines whether a character is printable.
186 * Unlike g_unichar_isgraph(), returns %TRUE for spaces.
187 * Given some UTF-8 text, obtain a character value with
190 * Return value: %TRUE if @c is printable
193 g_unichar_isprint (gunichar c
)
196 return (t
!= G_UNICODE_CONTROL
197 && t
!= G_UNICODE_FORMAT
198 && t
!= G_UNICODE_UNASSIGNED
199 && t
!= G_UNICODE_PRIVATE_USE
200 && t
!= G_UNICODE_SURROGATE
);
205 * @c: a Unicode character
207 * Determines whether a character is punctuation or a symbol.
208 * Given some UTF-8 text, obtain a character value with
211 * Return value: %TRUE if @c is a punctuation or symbol character
214 g_unichar_ispunct (gunichar c
)
217 return (t
== G_UNICODE_CONNECT_PUNCTUATION
|| t
== G_UNICODE_DASH_PUNCTUATION
218 || t
== G_UNICODE_CLOSE_PUNCTUATION
|| t
== G_UNICODE_FINAL_PUNCTUATION
219 || t
== G_UNICODE_INITIAL_PUNCTUATION
|| t
== G_UNICODE_OTHER_PUNCTUATION
220 || t
== G_UNICODE_OPEN_PUNCTUATION
|| t
== G_UNICODE_CURRENCY_SYMBOL
221 || t
== G_UNICODE_MODIFIER_SYMBOL
|| t
== G_UNICODE_MATH_SYMBOL
222 || t
== G_UNICODE_OTHER_SYMBOL
);
227 * @c: a Unicode character
229 * Determines whether a character is a space, tab, or line separator
230 * (newline, carriage return, etc.). Given some UTF-8 text, obtain a
231 * character value with g_utf8_get_char().
233 * (Note: don't use this to do word breaking; you have to use
234 * Pango or equivalent to get word breaking right, the algorithm
235 * is fairly complex.)
237 * Return value: %TRUE if @c is a space character
240 g_unichar_isspace (gunichar c
)
244 /* special-case these since Unicode thinks they are not spaces */
255 return (t
== G_UNICODE_SPACE_SEPARATOR
|| t
== G_UNICODE_LINE_SEPARATOR
256 || t
== G_UNICODE_PARAGRAPH_SEPARATOR
);
264 * @c: a Unicode character
266 * Determines if a character is uppercase.
268 * Return value: %TRUE if @c is an uppercase character
271 g_unichar_isupper (gunichar c
)
273 return TYPE (c
) == G_UNICODE_UPPERCASE_LETTER
;
278 * @c: a Unicode character
280 * Determines if a character is titlecase. Some characters in
281 * Unicode which are composites, such as the DZ digraph
282 * have three case variants instead of just two. The titlecase
283 * form is used at the beginning of a word where only the
284 * first letter is capitalized. The titlecase form of the DZ
285 * digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z.
287 * Return value: %TRUE if the character is titlecase
290 g_unichar_istitle (gunichar c
)
293 for (i
= 0; i
< G_N_ELEMENTS (title_table
); ++i
)
294 if (title_table
[i
][0] == c
)
300 * g_unichar_isxdigit:
301 * @c: a Unicode character.
303 * Determines if a character is a hexidecimal digit.
305 * Return value: %TRUE if the character is a hexadecimal digit
308 g_unichar_isxdigit (gunichar c
)
311 return ((c
>= 'a' && c
<= 'f')
312 || (c
>= 'A' && c
<= 'F')
317 * g_unichar_isdefined:
318 * @c: a Unicode character
320 * Determines if a given character is assigned in the Unicode
323 * Return value: %TRUE if the character has an assigned value
326 g_unichar_isdefined (gunichar c
)
329 return t
!= G_UNICODE_UNASSIGNED
;
334 * @c: a Unicode character
336 * Determines if a character is typically rendered in a double-width
339 * Return value: %TRUE if the character is wide
341 /* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>. */
343 g_unichar_iswide (gunichar c
)
348 return (c
<= 0x115f /* Hangul Jamo init. consonants */
349 || c
== 0x2329 || c
== 0x232a /* angle brackets */
350 || (c
>= 0x2e80 && c
<= 0xa4cf && (c
< 0x302a || c
> 0x302f)
351 && c
!= 0x303f && c
!= 0x3099 && c
!= 0x309a) /* CJK ... Yi */
352 || (c
>= 0xac00 && c
<= 0xd7a3) /* Hangul Syllables */
353 || (c
>= 0xf900 && c
<= 0xfaff) /* CJK Compatibility Ideographs */
354 || (c
>= 0xfe30 && c
<= 0xfe6f) /* CJK Compatibility Forms */
355 || (c
>= 0xff00 && c
<= 0xff60) /* Fullwidth Forms */
356 || (c
>= 0xffe0 && c
<= 0xffe6) /* Fullwidth Forms */
357 || (c
>= 0x20000 && c
<= 0x2fffd) /* CJK extra stuff */
358 || (c
>= 0x30000 && c
<= 0x3fffd));
363 * @c: a Unicode character
365 * Converts a character to uppercase.
367 * Return value: the result of converting @c to uppercase.
368 * If @c is not a lowercase or titlecase character,
369 * or has no upper case equivalent @c is returned unchanged.
372 g_unichar_toupper (gunichar c
)
375 if (t
== G_UNICODE_LOWERCASE_LETTER
)
377 gunichar val
= ATTTABLE (c
>> 8, c
& 0xff);
378 if (val
>= 0x1000000)
380 const gchar
*p
= special_case_table
+ val
- 0x1000000;
381 return g_utf8_get_char (p
);
384 return val
? val
: c
;
386 else if (t
== G_UNICODE_TITLECASE_LETTER
)
389 for (i
= 0; i
< G_N_ELEMENTS (title_table
); ++i
)
391 if (title_table
[i
][0] == c
)
392 return title_table
[i
][1];
400 * @c: a Unicode character.
402 * Converts a character to lower case.
404 * Return value: the result of converting @c to lower case.
405 * If @c is not an uppercase or titlecase character,
406 * or has no lowercase equivalent @c is returned unchanged.
409 g_unichar_tolower (gunichar c
)
412 if (t
== G_UNICODE_UPPERCASE_LETTER
)
414 gunichar val
= ATTTABLE (c
>> 8, c
& 0xff);
415 if (val
>= 0x1000000)
417 const gchar
*p
= special_case_table
+ val
- 0x1000000;
418 return g_utf8_get_char (p
);
421 return val
? val
: c
;
423 else if (t
== G_UNICODE_TITLECASE_LETTER
)
426 for (i
= 0; i
< G_N_ELEMENTS (title_table
); ++i
)
428 if (title_table
[i
][0] == c
)
429 return title_table
[i
][2];
437 * @c: a Unicode character
439 * Converts a character to the titlecase.
441 * Return value: the result of converting @c to titlecase.
442 * If @c is not an uppercase or lowercase character,
443 * @c is returned unchanged.
446 g_unichar_totitle (gunichar c
)
449 for (i
= 0; i
< G_N_ELEMENTS (title_table
); ++i
)
451 if (title_table
[i
][0] == c
|| title_table
[i
][1] == c
452 || title_table
[i
][2] == c
)
453 return title_table
[i
][0];
455 return (TYPE (c
) == G_UNICODE_LOWERCASE_LETTER
456 ? ATTTABLE (c
>> 8, c
& 0xff)
461 * g_unichar_digit_value:
462 * @c: a Unicode character
464 * Determines the numeric value of a character as a decimal
467 * Return value: If @c is a decimal digit (according to
468 * g_unichar_isdigit()), its numeric value. Otherwise, -1.
471 g_unichar_digit_value (gunichar c
)
473 if (TYPE (c
) == G_UNICODE_DECIMAL_NUMBER
)
474 return ATTTABLE (c
>> 8, c
& 0xff);
479 * g_unichar_xdigit_value:
480 * @c: a Unicode character
482 * Determines the numeric value of a character as a hexidecimal
485 * Return value: If @c is a hex digit (according to
486 * g_unichar_isxdigit()), its numeric value. Otherwise, -1.
489 g_unichar_xdigit_value (gunichar c
)
491 if (c
>= 'A' && c
<= 'F')
493 if (c
>= 'a' && c
<= 'f')
495 if (TYPE (c
) == G_UNICODE_DECIMAL_NUMBER
)
496 return ATTTABLE (c
>> 8, c
& 0xff);
502 * @c: a Unicode character
504 * Classifies a Unicode character by type.
506 * Return value: the type of the character.
509 g_unichar_type (gunichar c
)
515 * Case mapping functions
525 get_locale_type (void)
527 const char *locale
= setlocale (LC_CTYPE
, NULL
);
532 if (locale
[1] == 'z')
533 return LOCALE_TURKIC
;
536 if (locale
[1] == 't')
537 return LOCALE_LITHUANIAN
;
540 if (locale
[1] == 'r')
541 return LOCALE_TURKIC
;
545 return LOCALE_NORMAL
;
549 output_marks (const char **p_inout
,
553 const char *p
= *p_inout
;
558 gunichar c
= g_utf8_get_char (p
);
563 if (!remove_dot
|| c
!= 0x307 /* COMBINING DOT ABOVE */)
564 len
+= g_unichar_to_utf8 (c
, out_buffer
? out_buffer
+ len
: NULL
);
565 p
= g_utf8_next_char (p
);
576 output_special_case (gchar
*out_buffer
,
581 const gchar
*p
= special_case_table
+ offset
;
584 if (type
!= G_UNICODE_TITLECASE_LETTER
)
585 p
= g_utf8_next_char (p
);
592 memcpy (out_buffer
, p
, len
);
598 real_toupper (const gchar
*str
,
601 LocaleType locale_type
)
603 const gchar
*p
= str
;
604 const char *last
= NULL
;
606 gboolean last_was_i
= FALSE
;
608 while ((max_len
< 0 || p
< str
+ max_len
) && *p
)
610 gunichar c
= g_utf8_get_char (p
);
615 p
= g_utf8_next_char (p
);
617 if (locale_type
== LOCALE_LITHUANIAN
)
625 /* Nasty, need to remove any dot above. Though
626 * I think only E WITH DOT ABOVE occurs in practice
627 * which could simplify this considerably.
632 decomp
= g_unicode_canonical_decomposition (c
, &decomp_len
);
633 for (i
=0; i
< decomp_len
; i
++)
635 if (decomp
[i
] != 0x307 /* COMBINING DOT ABOVE */)
636 len
+= g_unichar_to_utf8 (g_unichar_toupper (decomp
[i
]), out_buffer
? out_buffer
+ len
: NULL
);
640 len
+= output_marks (&p
, out_buffer
? out_buffer
+ len
: NULL
, TRUE
);
650 if (locale_type
== LOCALE_TURKIC
&& c
== 'i')
652 /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */
653 len
+= g_unichar_to_utf8 (0x130, out_buffer
? out_buffer
+ len
: NULL
);
655 else if (c
== 0x0345) /* COMBINING GREEK YPOGEGRAMMENI */
657 /* Nasty, need to move it after other combining marks .. this would go away if
658 * we normalized first.
660 len
+= output_marks (&p
, out_buffer
? out_buffer
+ len
: NULL
, FALSE
);
662 /* And output as GREEK CAPITAL LETTER IOTA */
663 len
+= g_unichar_to_utf8 (0x399, out_buffer
? out_buffer
+ len
: NULL
);
665 else if (t
== G_UNICODE_LOWERCASE_LETTER
|| t
== G_UNICODE_TITLECASE_LETTER
)
667 val
= ATTTABLE (c
>> 8, c
& 0xff);
669 if (val
>= 0x1000000)
671 len
+= output_special_case (out_buffer
? out_buffer
+ len
: NULL
, val
- 0x1000000, t
,
672 t
== G_UNICODE_LOWERCASE_LETTER
? 0 : 1);
676 if (t
== G_UNICODE_TITLECASE_LETTER
)
679 for (i
= 0; i
< G_N_ELEMENTS (title_table
); ++i
)
681 if (title_table
[i
][0] == c
)
682 val
= title_table
[i
][1];
686 len
+= g_unichar_to_utf8 (val
, out_buffer
? out_buffer
+ len
: NULL
);
691 gsize char_len
= g_utf8_skip
[*(guchar
*)last
];
694 memcpy (out_buffer
+ len
, last
, char_len
);
706 * @str: a UTF-8 encoded string
707 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
709 * Converts all Unicode characters in the string that have a case
710 * to uppercase. The exact manner that this is done depends
711 * on the current locale, and may result in the number of
712 * characters in the string increasing. (For instance, the
713 * German ess-zet will be changed to SS.)
715 * Return value: a newly allocated string, with all characters
716 * converted to uppercase.
719 g_utf8_strup (const gchar
*str
,
723 LocaleType locale_type
;
726 g_return_val_if_fail (str
!= NULL
, NULL
);
728 locale_type
= get_locale_type ();
731 * We use a two pass approach to keep memory management simple
733 result_len
= real_toupper (str
, len
, NULL
, locale_type
);
734 result
= g_malloc (result_len
+ 1);
735 real_toupper (str
, len
, result
, locale_type
);
736 result
[result_len
] = '\0';
741 /* traverses the string checking for characters with combining class == 230
742 * until a base character is found */
744 has_more_above (const gchar
*str
)
746 const gchar
*p
= str
;
747 gint combining_class
;
751 combining_class
= _g_unichar_combining_class (g_utf8_get_char (p
));
752 if (combining_class
== 230)
754 else if (combining_class
== 0)
757 p
= g_utf8_next_char (p
);
764 real_tolower (const gchar
*str
,
767 LocaleType locale_type
)
769 const gchar
*p
= str
;
770 const char *last
= NULL
;
773 while ((max_len
< 0 || p
< str
+ max_len
) && *p
)
775 gunichar c
= g_utf8_get_char (p
);
780 p
= g_utf8_next_char (p
);
782 if (locale_type
== LOCALE_TURKIC
&& c
== 'I')
784 if (g_utf8_get_char (p
) == 0x0307)
786 /* I + COMBINING DOT ABOVE => i (U+0069) */
787 len
+= g_unichar_to_utf8 (0x0069, out_buffer
? out_buffer
+ len
: NULL
);
788 p
= g_utf8_next_char (p
);
792 /* I => LATIN SMALL LETTER DOTLESS I */
793 len
+= g_unichar_to_utf8 (0x131, out_buffer
? out_buffer
+ len
: NULL
);
796 /* Introduce an explicit dot above when lowercasing capital I's and J's
797 * whenever there are more accents above. [SpecialCasing.txt] */
798 else if (locale_type
== LOCALE_LITHUANIAN
&&
799 (c
== 0x00cc || c
== 0x00cd || c
== 0x0128))
801 len
+= g_unichar_to_utf8 (0x0069, out_buffer
? out_buffer
+ len
: NULL
);
802 len
+= g_unichar_to_utf8 (0x0307, out_buffer
? out_buffer
+ len
: NULL
);
807 len
+= g_unichar_to_utf8 (0x0300, out_buffer
? out_buffer
+ len
: NULL
);
810 len
+= g_unichar_to_utf8 (0x0301, out_buffer
? out_buffer
+ len
: NULL
);
813 len
+= g_unichar_to_utf8 (0x0303, out_buffer
? out_buffer
+ len
: NULL
);
817 else if (locale_type
== LOCALE_LITHUANIAN
&&
818 (c
== 'I' || c
== 'J' || c
== 0x012e) &&
821 len
+= g_unichar_to_utf8 (g_unichar_tolower (c
), out_buffer
? out_buffer
+ len
: NULL
);
822 len
+= g_unichar_to_utf8 (0x0307, out_buffer
? out_buffer
+ len
: NULL
);
824 else if (c
== 0x03A3) /* GREEK CAPITAL LETTER SIGMA */
826 if ((max_len
< 0 || p
< str
+ max_len
) && *p
)
828 gunichar next_c
= g_utf8_get_char (p
);
829 int next_type
= TYPE(next_c
);
831 /* SIGMA maps differently depending on whether it is
832 * final or not. The following simplified test would
833 * fail in the case of combining marks following the
834 * sigma, but I don't think that occurs in real text.
835 * The test here matches that in ICU.
837 if (ISALPHA(next_type
)) /* Lu,Ll,Lt,Lm,Lo */
838 val
= 0x3c3; /* GREEK SMALL SIGMA */
840 val
= 0x3c2; /* GREEK SMALL FINAL SIGMA */
843 val
= 0x3c2; /* GREEK SMALL FINAL SIGMA */
845 len
+= g_unichar_to_utf8 (val
, out_buffer
? out_buffer
+ len
: NULL
);
847 else if (t
== G_UNICODE_UPPERCASE_LETTER
|| t
== G_UNICODE_TITLECASE_LETTER
)
849 val
= ATTTABLE (c
>> 8, c
& 0xff);
851 if (val
>= 0x1000000)
853 len
+= output_special_case (out_buffer
? out_buffer
+ len
: NULL
, val
- 0x1000000, t
, 0);
857 if (t
== G_UNICODE_TITLECASE_LETTER
)
860 for (i
= 0; i
< G_N_ELEMENTS (title_table
); ++i
)
862 if (title_table
[i
][0] == c
)
863 val
= title_table
[i
][2];
867 len
+= g_unichar_to_utf8 (val
, out_buffer
? out_buffer
+ len
: NULL
);
872 gsize char_len
= g_utf8_skip
[*(guchar
*)last
];
875 memcpy (out_buffer
+ len
, last
, char_len
);
887 * @str: a UTF-8 encoded string
888 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
890 * Converts all Unicode characters in the string that have a case
891 * to lowercase. The exact manner that this is done depends
892 * on the current locale, and may result in the number of
893 * characters in the string changing.
895 * Return value: a newly allocated string, with all characters
896 * converted to lowercase.
899 g_utf8_strdown (const gchar
*str
,
903 LocaleType locale_type
;
906 g_return_val_if_fail (str
!= NULL
, NULL
);
908 locale_type
= get_locale_type ();
911 * We use a two pass approach to keep memory management simple
913 result_len
= real_tolower (str
, len
, NULL
, locale_type
);
914 result
= g_malloc (result_len
+ 1);
915 real_tolower (str
, len
, result
, locale_type
);
916 result
[result_len
] = '\0';
923 * @str: a UTF-8 encoded string
924 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
926 * Converts a string into a form that is independent of case. The
927 * result will not correspond to any particular case, but can be
928 * compared for equality or ordered with the results of calling
929 * g_utf8_casefold() on other strings.
931 * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
932 * only an approximation to the correct linguistic case insensitive
933 * ordering, though it is a fairly good one. Getting this exactly
934 * right would require a more sophisticated collation function that
935 * takes case sensitivity into account. GLib does not currently
936 * provide such a function.
938 * Return value: a newly allocated string, that is a
939 * case independent form of @str.
942 g_utf8_casefold (const gchar
*str
,
948 g_return_val_if_fail (str
!= NULL
, NULL
);
950 result
= g_string_new (NULL
);
952 while ((len
< 0 || p
< str
+ len
) && *p
)
954 gunichar ch
= g_utf8_get_char (p
);
957 int end
= G_N_ELEMENTS (casefold_table
);
959 if (ch
>= casefold_table
[start
].ch
&&
960 ch
<= casefold_table
[end
- 1].ch
)
964 int half
= (start
+ end
) / 2;
965 if (ch
== casefold_table
[half
].ch
)
967 g_string_append (result
, casefold_table
[half
].data
);
970 else if (half
== start
)
972 else if (ch
> casefold_table
[half
].ch
)
979 g_string_append_unichar (result
, g_unichar_tolower (ch
));
982 p
= g_utf8_next_char (p
);
985 return g_string_free (result
, FALSE
);
989 * g_unichar_get_mirror_char:
990 * @ch: a unicode character
991 * @mirrored_ch: location to store the mirrored character
993 * In Unicode, some characters are <firstterm>mirrored</firstterm>. This
994 * means that their images are mirrored horizontally in text that is laid
995 * out from right to left. For instance, "(" would become its mirror image,
996 * ")", in right-to-left text.
998 * If @ch has the Unicode mirrored property and there is another unicode
999 * character that typically has a glyph that is the mirror image of @ch's
1000 * glyph, puts that character in the address pointed to by @mirrored_ch.
1002 * Return value: %TRUE if @ch has a mirrored character and @mirrored_ch is
1003 * filled in, %FALSE otherwise
1007 /* This code is adapted from FriBidi (http://fribidi.sourceforge.net/).
1008 * FriBidi is: Copyright (C) 1999,2000 Dov Grobgeld, and
1009 * Copyright (C) 2001,2002 Behdad Esfahbod.
1012 g_unichar_get_mirror_char (gunichar ch
,
1013 gunichar
*mirrored_ch
)
1015 gint pos
, step
, size
;
1018 size
= G_N_ELEMENTS (bidi_mirroring_table
);
1019 pos
= step
= (size
/ 2) + 1;
1023 gunichar cmp_ch
= bidi_mirroring_table
[pos
].ch
;
1024 step
= (step
+ 1) / 2;
1032 else if (cmp_ch
> ch
)
1041 found
= bidi_mirroring_table
[pos
].ch
== ch
;
1043 *mirrored_ch
= found
? bidi_mirroring_table
[pos
].mirrored_ch
: ch
;