2 UTF-8 strings utilities
4 Copyright (C) 2007-2024
5 Free Software Foundation, Inc.
10 This file is part of the Midnight Commander.
12 The Midnight Commander is free software: you can redistribute it
13 and/or modify it under the terms of the GNU General Public License as
14 published by the Free Software Foundation, either version 3 of the License,
15 or (at your option) any later version.
17 The Midnight Commander is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program. If not, see <http://www.gnu.org/licenses/>.
30 #include <limits.h> /* MB_LEN_MAX */
33 #include "lib/global.h"
34 #include "lib/strutil.h"
36 /* UTF-8 handling in this file relies on GLib's Unicode and UTF-8 functions */
38 /*** global variables ****************************************************************************/
40 /*** file scope macro definitions ****************************************************************/
42 /*** file scope type declarations ****************************************************************/
55 char text
[BUF_MEDIUM
* MB_LEN_MAX
];
60 /*** forward declarations (file scope functions) *************************************************/
62 /*** file scope variables ************************************************************************/
64 static const char replch
[] = "\xEF\xBF\xBD";
66 /* --------------------------------------------------------------------------------------------- */
67 /*** file scope functions ************************************************************************/
68 /* --------------------------------------------------------------------------------------------- */
71 str_unichar_iscombiningmark (gunichar uni
)
75 type
= g_unichar_type (uni
);
76 return (type
== G_UNICODE_SPACING_MARK
)
77 || (type
== G_UNICODE_ENCLOSING_MARK
) || (type
== G_UNICODE_NON_SPACING_MARK
);
80 /* --------------------------------------------------------------------------------------------- */
83 str_utf8_insert_replace_char (GString
*buffer
)
85 g_string_append (buffer
, replch
);
88 /* --------------------------------------------------------------------------------------------- */
91 str_utf8_is_valid_string (const char *text
)
93 return g_utf8_validate (text
, -1, NULL
);
96 /* --------------------------------------------------------------------------------------------- */
99 str_utf8_is_valid_char (const char *ch
, size_t size
)
101 switch (g_utf8_get_char_validated (ch
, size
))
103 case (gunichar
) (-2):
105 case (gunichar
) (-1):
112 /* --------------------------------------------------------------------------------------------- */
115 str_utf8_cnext_char (const char **text
)
117 (*text
) = g_utf8_next_char (*text
);
120 /* --------------------------------------------------------------------------------------------- */
123 str_utf8_cprev_char (const char **text
)
125 (*text
) = g_utf8_prev_char (*text
);
128 /* --------------------------------------------------------------------------------------------- */
131 str_utf8_cnext_char_safe (const char **text
)
133 if (str_utf8_is_valid_char (*text
, -1) == 1)
134 (*text
) = g_utf8_next_char (*text
);
139 /* --------------------------------------------------------------------------------------------- */
142 str_utf8_cprev_char_safe (const char **text
)
144 const char *result
, *t
;
146 result
= g_utf8_prev_char (*text
);
148 str_utf8_cnext_char_safe (&t
);
155 /* --------------------------------------------------------------------------------------------- */
158 str_utf8_fix_string (char *text
)
160 while (text
[0] != '\0')
164 uni
= g_utf8_get_char_validated (text
, -1);
165 if ((uni
!= (gunichar
) (-1)) && (uni
!= (gunichar
) (-2)))
166 text
= g_utf8_next_char (text
);
175 /* --------------------------------------------------------------------------------------------- */
178 str_utf8_isspace (const char *text
)
182 uni
= g_utf8_get_char_validated (text
, -1);
183 return g_unichar_isspace (uni
);
186 /* --------------------------------------------------------------------------------------------- */
189 str_utf8_ispunct (const char *text
)
193 uni
= g_utf8_get_char_validated (text
, -1);
194 return g_unichar_ispunct (uni
);
197 /* --------------------------------------------------------------------------------------------- */
200 str_utf8_isalnum (const char *text
)
204 uni
= g_utf8_get_char_validated (text
, -1);
205 return g_unichar_isalnum (uni
);
208 /* --------------------------------------------------------------------------------------------- */
211 str_utf8_isdigit (const char *text
)
215 uni
= g_utf8_get_char_validated (text
, -1);
216 return g_unichar_isdigit (uni
);
219 /* --------------------------------------------------------------------------------------------- */
222 str_utf8_isprint (const char *ch
)
226 uni
= g_utf8_get_char_validated (ch
, -1);
227 return g_unichar_isprint (uni
);
230 /* --------------------------------------------------------------------------------------------- */
233 str_utf8_iscombiningmark (const char *ch
)
237 uni
= g_utf8_get_char_validated (ch
, -1);
238 return str_unichar_iscombiningmark (uni
);
241 /* --------------------------------------------------------------------------------------------- */
244 str_utf8_cnext_noncomb_char (const char **text
)
248 while ((*text
)[0] != '\0')
250 str_utf8_cnext_char_safe (text
);
252 if (!str_utf8_iscombiningmark (*text
))
259 /* --------------------------------------------------------------------------------------------- */
262 str_utf8_cprev_noncomb_char (const char **text
, const char *begin
)
266 while ((*text
) != begin
)
268 str_utf8_cprev_char_safe (text
);
270 if (!str_utf8_iscombiningmark (*text
))
277 /* --------------------------------------------------------------------------------------------- */
280 str_utf8_toupper (const char *text
, char **out
, size_t *remain
)
285 uni
= g_utf8_get_char_validated (text
, -1);
286 if (uni
== (gunichar
) (-1) || uni
== (gunichar
) (-2))
289 uni
= g_unichar_toupper (uni
);
290 left
= g_unichar_to_utf8 (uni
, NULL
);
294 left
= g_unichar_to_utf8 (uni
, *out
);
300 /* --------------------------------------------------------------------------------------------- */
303 str_utf8_tolower (const char *text
, char **out
, size_t *remain
)
308 uni
= g_utf8_get_char_validated (text
, -1);
309 if (uni
== (gunichar
) (-1) || uni
== (gunichar
) (-2))
312 uni
= g_unichar_tolower (uni
);
313 left
= g_unichar_to_utf8 (uni
, NULL
);
317 left
= g_unichar_to_utf8 (uni
, *out
);
323 /* --------------------------------------------------------------------------------------------- */
326 str_utf8_length (const char *text
)
333 while (!g_utf8_validate (start
, -1, &end
) && start
[0] != '\0')
336 result
+= g_utf8_strlen (start
, end
- start
);
343 result
= g_utf8_strlen (text
, -1);
344 else if (start
[0] != '\0' && start
!= end
)
345 result
+= g_utf8_strlen (start
, end
- start
);
350 /* --------------------------------------------------------------------------------------------- */
353 str_utf8_length2 (const char *text
, int size
)
360 while (!g_utf8_validate (start
, -1, &end
) && start
[0] != '\0' && size
> 0)
364 result
+= g_utf8_strlen (start
, MIN (end
- start
, size
));
367 result
+= (size
> 0);
373 result
= g_utf8_strlen (text
, size
);
374 else if (start
[0] != '\0' && start
!= end
&& size
> 0)
375 result
+= g_utf8_strlen (start
, MIN (end
- start
, size
));
380 /* --------------------------------------------------------------------------------------------- */
383 str_utf8_length_noncomb (const char *text
)
386 const char *t
= text
;
390 str_utf8_cnext_noncomb_char (&t
);
397 /* --------------------------------------------------------------------------------------------- */
401 str_utf8_questmark_sustb (char **string
, size_t *left
, GString
*buffer
)
405 next
= g_utf8_next_char (*string
);
406 (*left
) -= next
- (*string
);
408 g_string_append_c (buffer
, '?');
412 /* --------------------------------------------------------------------------------------------- */
415 str_utf8_conv_gerror_message (GError
*mcerror
, const char *def_msg
)
418 return g_strdup (mcerror
->message
);
420 return g_strdup (def_msg
!= NULL
? def_msg
: "");
423 /* --------------------------------------------------------------------------------------------- */
426 str_utf8_vfs_convert_to (GIConv coder
, const char *string
, int size
, GString
*buffer
)
428 estr_t result
= ESTR_SUCCESS
;
430 if (coder
== str_cnv_not_convert
)
431 g_string_append_len (buffer
, string
, size
);
433 result
= str_nconvert (coder
, string
, size
, buffer
);
438 /* --------------------------------------------------------------------------------------------- */
439 /* utility function that makes the string valid UTF-8 with all characters printable;
440 * the returned term_form also carries the width of the string */
442 static const struct term_form
*
443 str_utf8_make_make_term_form (const char *text
, size_t length
)
445 static struct term_form result
;
450 result
.text
[0] = '\0';
452 result
.compose
= FALSE
;
453 actual
= result
.text
;
455 /* check whether the text starts with a combining character;
456 * if so, add a space at the beginning */
457 if (length
!= 0 && text
[0] != '\0')
459 uni
= g_utf8_get_char_validated (text
, -1);
460 if ((uni
!= (gunichar
) (-1)) && (uni
!= (gunichar
) (-2))
461 && str_unichar_iscombiningmark (uni
))
466 result
.compose
= TRUE
;
470 while (length
!= 0 && text
[0] != '\0')
472 uni
= g_utf8_get_char_validated (text
, -1);
473 if ((uni
!= (gunichar
) (-1)) && (uni
!= (gunichar
) (-2)))
475 if (g_unichar_isprint (uni
))
477 left
= g_unichar_to_utf8 (uni
, actual
);
479 if (str_unichar_iscombiningmark (uni
))
480 result
.compose
= TRUE
;
484 if (g_unichar_iswide (uni
))
494 text
= g_utf8_next_char (text
);
501 /*actual[0] = '?'; */
502 repl_len
= strlen (replch
);
503 memcpy (actual
, replch
, repl_len
);
508 if (length
!= (size_t) (-1))
516 /* --------------------------------------------------------------------------------------------- */
519 str_utf8_term_form (const char *text
)
521 static char result
[BUF_MEDIUM
* MB_LEN_MAX
];
522 const struct term_form
*pre_form
;
524 pre_form
= str_utf8_make_make_term_form (text
, (size_t) (-1));
525 if (pre_form
->compose
)
529 composed
= g_utf8_normalize (pre_form
->text
, -1, G_NORMALIZE_DEFAULT_COMPOSE
);
530 g_strlcpy (result
, composed
, sizeof (result
));
534 g_strlcpy (result
, pre_form
->text
, sizeof (result
));
539 /* --------------------------------------------------------------------------------------------- */
540 /* utility function, that copies all characters from checked to actual */
543 utf8_tool_copy_chars_to_end (struct utf8_tool
*tool
)
545 tool
->compose
= FALSE
;
547 while (tool
->checked
[0] != '\0')
552 uni
= g_utf8_get_char (tool
->checked
);
553 tool
->compose
= tool
->compose
|| str_unichar_iscombiningmark (uni
);
554 left
= g_unichar_to_utf8 (uni
, NULL
);
555 if (tool
->remain
<= left
)
557 left
= g_unichar_to_utf8 (uni
, tool
->actual
);
558 tool
->actual
+= left
;
559 tool
->remain
-= left
;
560 tool
->checked
= g_utf8_next_char (tool
->checked
);
566 /* --------------------------------------------------------------------------------------------- */
567 /* utility function that copies characters from checked to actual until the next
568 * character would push ident past to_ident */
571 utf8_tool_copy_chars_to (struct utf8_tool
*tool
, int to_ident
)
573 tool
->compose
= FALSE
;
575 while (tool
->checked
[0] != '\0')
581 uni
= g_utf8_get_char (tool
->checked
);
582 if (str_unichar_iscombiningmark (uni
))
583 tool
->compose
= TRUE
;
587 if (g_unichar_iswide (uni
))
589 if (tool
->ident
+ w
> to_ident
)
593 left
= g_unichar_to_utf8 (uni
, NULL
);
594 if (tool
->remain
<= left
)
596 left
= g_unichar_to_utf8 (uni
, tool
->actual
);
597 tool
->actual
+= left
;
598 tool
->remain
-= left
;
599 tool
->checked
= g_utf8_next_char (tool
->checked
);
606 /* --------------------------------------------------------------------------------------------- */
607 /* utility function, adds count spaces to actual */
610 utf8_tool_insert_space (struct utf8_tool
*tool
, int count
)
614 if (tool
->remain
<= (gsize
) count
)
617 memset (tool
->actual
, ' ', count
);
618 tool
->actual
+= count
;
619 tool
->remain
-= count
;
623 /* --------------------------------------------------------------------------------------------- */
624 /* utility function, adds one character to actual */
627 utf8_tool_insert_char (struct utf8_tool
*tool
, char ch
)
629 if (tool
->remain
<= 1)
632 tool
->actual
[0] = ch
;
638 /* --------------------------------------------------------------------------------------------- */
639 /* utility function that skips characters from checked until ident is greater than or
640 * equal to to_ident */
643 utf8_tool_skip_chars_to (struct utf8_tool
*tool
, int to_ident
)
647 while (to_ident
> tool
->ident
&& tool
->checked
[0] != '\0')
649 uni
= g_utf8_get_char (tool
->checked
);
650 if (!str_unichar_iscombiningmark (uni
))
653 if (g_unichar_iswide (uni
))
656 tool
->checked
= g_utf8_next_char (tool
->checked
);
659 uni
= g_utf8_get_char (tool
->checked
);
660 while (str_unichar_iscombiningmark (uni
))
662 tool
->checked
= g_utf8_next_char (tool
->checked
);
663 uni
= g_utf8_get_char (tool
->checked
);
669 /* --------------------------------------------------------------------------------------------- */
672 utf8_tool_compose (char *buffer
, size_t size
)
676 composed
= g_utf8_normalize (buffer
, -1, G_NORMALIZE_DEFAULT_COMPOSE
);
677 g_strlcpy (buffer
, composed
, size
);
681 /* --------------------------------------------------------------------------------------------- */
684 str_utf8_fit_to_term (const char *text
, int width
, align_crt_t just_mode
)
686 static char result
[BUF_MEDIUM
* MB_LEN_MAX
];
687 const struct term_form
*pre_form
;
688 struct utf8_tool tool
;
690 pre_form
= str_utf8_make_make_term_form (text
, (size_t) (-1));
691 tool
.checked
= pre_form
->text
;
692 tool
.actual
= result
;
693 tool
.remain
= sizeof (result
);
694 tool
.compose
= FALSE
;
696 if (pre_form
->width
<= (gsize
) width
)
698 switch (HIDE_FIT (just_mode
))
702 tool
.ident
= (width
- pre_form
->width
) / 2;
705 tool
.ident
= width
- pre_form
->width
;
712 utf8_tool_insert_space (&tool
, tool
.ident
);
713 utf8_tool_copy_chars_to_end (&tool
);
714 utf8_tool_insert_space (&tool
, width
- pre_form
->width
- tool
.ident
);
716 else if (IS_FIT (just_mode
))
719 utf8_tool_copy_chars_to (&tool
, width
/ 2);
720 utf8_tool_insert_char (&tool
, '~');
723 utf8_tool_skip_chars_to (&tool
, pre_form
->width
- width
+ 1);
724 utf8_tool_copy_chars_to_end (&tool
);
725 utf8_tool_insert_space (&tool
, width
- (pre_form
->width
- tool
.ident
+ 1));
729 switch (HIDE_FIT (just_mode
))
732 tool
.ident
= (width
- pre_form
->width
) / 2;
735 tool
.ident
= width
- pre_form
->width
;
742 utf8_tool_skip_chars_to (&tool
, 0);
743 utf8_tool_insert_space (&tool
, tool
.ident
);
744 utf8_tool_copy_chars_to (&tool
, width
);
745 utf8_tool_insert_space (&tool
, width
- tool
.ident
);
748 tool
.actual
[0] = '\0';
750 utf8_tool_compose (result
, sizeof (result
));
754 /* --------------------------------------------------------------------------------------------- */
757 str_utf8_term_trim (const char *text
, int width
)
759 static char result
[BUF_MEDIUM
* MB_LEN_MAX
];
760 const struct term_form
*pre_form
;
761 struct utf8_tool tool
;
769 pre_form
= str_utf8_make_make_term_form (text
, (size_t) (-1));
771 tool
.checked
= pre_form
->text
;
772 tool
.actual
= result
;
773 tool
.remain
= sizeof (result
);
774 tool
.compose
= FALSE
;
776 if ((gsize
) width
>= pre_form
->width
)
777 utf8_tool_copy_chars_to_end (&tool
);
780 memset (tool
.actual
, '.', width
);
781 tool
.actual
+= width
;
782 tool
.remain
-= width
;
786 memset (tool
.actual
, '.', 3);
791 utf8_tool_skip_chars_to (&tool
, pre_form
->width
- width
+ 3);
792 utf8_tool_copy_chars_to_end (&tool
);
795 tool
.actual
[0] = '\0';
797 utf8_tool_compose (result
, sizeof (result
));
801 /* --------------------------------------------------------------------------------------------- */
804 str_utf8_term_width2 (const char *text
, size_t length
)
806 const struct term_form
*result
;
808 result
= str_utf8_make_make_term_form (text
, length
);
809 return result
->width
;
812 /* --------------------------------------------------------------------------------------------- */
815 str_utf8_term_width1 (const char *text
)
817 return str_utf8_term_width2 (text
, (size_t) (-1));
820 /* --------------------------------------------------------------------------------------------- */
823 str_utf8_term_char_width (const char *text
)
827 uni
= g_utf8_get_char_validated (text
, -1);
828 return (str_unichar_iscombiningmark (uni
)) ? 0 : ((g_unichar_iswide (uni
)) ? 2 : 1);
831 /* --------------------------------------------------------------------------------------------- */
834 str_utf8_term_substring (const char *text
, int start
, int width
)
836 static char result
[BUF_MEDIUM
* MB_LEN_MAX
];
837 const struct term_form
*pre_form
;
838 struct utf8_tool tool
;
840 pre_form
= str_utf8_make_make_term_form (text
, (size_t) (-1));
842 tool
.checked
= pre_form
->text
;
843 tool
.actual
= result
;
844 tool
.remain
= sizeof (result
);
845 tool
.compose
= FALSE
;
848 utf8_tool_skip_chars_to (&tool
, 0);
851 utf8_tool_insert_space (&tool
, tool
.ident
);
853 utf8_tool_copy_chars_to (&tool
, width
);
854 utf8_tool_insert_space (&tool
, width
- tool
.ident
);
856 tool
.actual
[0] = '\0';
858 utf8_tool_compose (result
, sizeof (result
));
862 /* --------------------------------------------------------------------------------------------- */
865 str_utf8_trunc (const char *text
, int width
)
867 static char result
[MC_MAXPATHLEN
* MB_LEN_MAX
* 2];
868 const struct term_form
*pre_form
;
869 struct utf8_tool tool
;
871 pre_form
= str_utf8_make_make_term_form (text
, (size_t) (-1));
873 tool
.checked
= pre_form
->text
;
874 tool
.actual
= result
;
875 tool
.remain
= sizeof (result
);
876 tool
.compose
= FALSE
;
878 if (pre_form
->width
<= (gsize
) width
)
879 utf8_tool_copy_chars_to_end (&tool
);
883 utf8_tool_copy_chars_to (&tool
, width
/ 2);
884 utf8_tool_insert_char (&tool
, '~');
887 utf8_tool_skip_chars_to (&tool
, pre_form
->width
- width
+ 1);
888 utf8_tool_copy_chars_to_end (&tool
);
891 tool
.actual
[0] = '\0';
893 utf8_tool_compose (result
, sizeof (result
));
897 /* --------------------------------------------------------------------------------------------- */
900 str_utf8_offset_to_pos (const char *text
, size_t length
)
902 if (str_utf8_is_valid_string (text
))
903 return g_utf8_offset_to_pointer (text
, length
) - text
;
909 buffer
= g_strdup (text
);
910 str_utf8_fix_string (buffer
);
911 result
= g_utf8_offset_to_pointer (buffer
, length
) - buffer
;
917 /* --------------------------------------------------------------------------------------------- */
920 str_utf8_column_to_pos (const char *text
, size_t pos
)
925 while (text
[0] != '\0')
929 uni
= g_utf8_get_char_validated (text
, MB_LEN_MAX
);
930 if ((uni
!= (gunichar
) (-1)) && (uni
!= (gunichar
) (-2)))
932 if (g_unichar_isprint (uni
))
934 if (!str_unichar_iscombiningmark (uni
))
937 if (g_unichar_iswide (uni
))
945 text
= g_utf8_next_char (text
);
953 if ((gsize
) width
> pos
)
962 /* --------------------------------------------------------------------------------------------- */
965 str_utf8_create_search_needle (const char *needle
, gboolean case_sen
)
973 return g_utf8_normalize (needle
, -1, G_NORMALIZE_ALL
);
975 fold
= g_utf8_casefold (needle
, -1);
976 result
= g_utf8_normalize (fold
, -1, G_NORMALIZE_ALL
);
981 /* --------------------------------------------------------------------------------------------- */
984 str_utf8_release_search_needle (char *needle
, gboolean case_sen
)
990 /* --------------------------------------------------------------------------------------------- */
993 str_utf8_search_first (const char *text
, const char *search
, gboolean case_sen
)
997 const char *result
= NULL
;
1001 deco_text
= g_utf8_normalize (text
, -1, G_NORMALIZE_ALL
);
1006 fold_text
= g_utf8_casefold (text
, -1);
1007 deco_text
= g_utf8_normalize (fold_text
, -1, G_NORMALIZE_ALL
);
1011 search_len
= strlen (search
);
1016 match
= g_strstr_len (match
, -1, search
);
1019 if ((!str_utf8_iscombiningmark (match
) || (match
== deco_text
)) &&
1020 !str_utf8_iscombiningmark (match
+ search_len
))
1022 const char *m
= deco_text
;
1027 str_utf8_cnext_noncomb_char (&m
);
1028 str_utf8_cnext_noncomb_char (&result
);
1032 str_utf8_cnext_char (&match
);
1035 while (match
!= NULL
&& result
== NULL
);
1042 /* --------------------------------------------------------------------------------------------- */
1045 str_utf8_search_last (const char *text
, const char *search
, gboolean case_sen
)
1049 const char *result
= NULL
;
1053 deco_text
= g_utf8_normalize (text
, -1, G_NORMALIZE_ALL
);
1058 fold_text
= g_utf8_casefold (text
, -1);
1059 deco_text
= g_utf8_normalize (fold_text
, -1, G_NORMALIZE_ALL
);
1063 search_len
= strlen (search
);
1067 match
= g_strrstr_len (deco_text
, -1, search
);
1070 if ((!str_utf8_iscombiningmark (match
) || (match
== deco_text
)) &&
1071 !str_utf8_iscombiningmark (match
+ search_len
))
1073 const char *m
= deco_text
;
1078 str_utf8_cnext_noncomb_char (&m
);
1079 str_utf8_cnext_noncomb_char (&result
);
1086 while (match
!= NULL
&& result
== NULL
);
1093 /* --------------------------------------------------------------------------------------------- */
1096 str_utf8_normalize (const char *text
)
1104 /* g_utf8_normalize() is a heavyweight function that converts UTF-8 into UCS-4,
1105 * does the normalization and then converts UCS-4 back into UTF-8.
1106 * Since file names are composed of ASCII characters in most cases, we can speed up
1107 * UTF-8 normalization by checking whether the heavyweight Unicode normalization is actually
1108 * needed. Normalization of an ASCII string is a no-op. */
1111 /* find out whether text is ASCII only */
1112 for (end
= text
; *end
!= '\0'; end
++)
1113 if ((*end
& 0x80) != 0)
1115 /* found a non-ASCII byte (high bit set), so the text is not ASCII-only */
1119 /* if text is ASCII-only, return copy, normalize otherwise */
1121 return g_strndup (text
, end
- text
);
1123 fixed
= g_string_sized_new (4);
1126 while (!g_utf8_validate (start
, -1, &end
) && start
[0] != '\0')
1130 tmp
= g_utf8_normalize (start
, end
- start
, G_NORMALIZE_ALL
);
1131 g_string_append (fixed
, tmp
);
1134 g_string_append_c (fixed
, end
[0]);
1140 result
= g_utf8_normalize (text
, -1, G_NORMALIZE_ALL
);
1141 g_string_free (fixed
, TRUE
);
1145 if (start
[0] != '\0' && start
!= end
)
1147 tmp
= g_utf8_normalize (start
, end
- start
, G_NORMALIZE_ALL
);
1148 g_string_append (fixed
, tmp
);
1151 result
= g_string_free (fixed
, FALSE
);
1157 /* --------------------------------------------------------------------------------------------- */
1160 str_utf8_casefold_normalize (const char *text
)
1168 fixed
= g_string_sized_new (4);
1171 while (!g_utf8_validate (start
, -1, &end
) && start
[0] != '\0')
1175 fold
= g_utf8_casefold (start
, end
- start
);
1176 tmp
= g_utf8_normalize (fold
, -1, G_NORMALIZE_ALL
);
1177 g_string_append (fixed
, tmp
);
1181 g_string_append_c (fixed
, end
[0]);
1187 fold
= g_utf8_casefold (text
, -1);
1188 result
= g_utf8_normalize (fold
, -1, G_NORMALIZE_ALL
);
1190 g_string_free (fixed
, TRUE
);
1194 if (start
[0] != '\0' && start
!= end
)
1196 fold
= g_utf8_casefold (start
, end
- start
);
1197 tmp
= g_utf8_normalize (fold
, -1, G_NORMALIZE_ALL
);
1198 g_string_append (fixed
, tmp
);
1202 result
= g_string_free (fixed
, FALSE
);
1208 /* --------------------------------------------------------------------------------------------- */
1211 str_utf8_compare (const char *t1
, const char *t2
)
1216 n1
= str_utf8_normalize (t1
);
1217 n2
= str_utf8_normalize (t2
);
1219 result
= strcmp (n1
, n2
);
1227 /* --------------------------------------------------------------------------------------------- */
1230 str_utf8_ncompare (const char *t1
, const char *t2
)
1236 n1
= str_utf8_normalize (t1
);
1237 n2
= str_utf8_normalize (t2
);
1241 result
= strncmp (n1
, n2
, MIN (l1
, l2
));
1249 /* --------------------------------------------------------------------------------------------- */
1252 str_utf8_casecmp (const char *t1
, const char *t2
)
1257 n1
= str_utf8_casefold_normalize (t1
);
1258 n2
= str_utf8_casefold_normalize (t2
);
1260 result
= strcmp (n1
, n2
);
1268 /* --------------------------------------------------------------------------------------------- */
1271 str_utf8_ncasecmp (const char *t1
, const char *t2
)
1277 n1
= str_utf8_casefold_normalize (t1
);
1278 n2
= str_utf8_casefold_normalize (t2
);
1282 result
= strncmp (n1
, n2
, MIN (l1
, l2
));
1290 /* --------------------------------------------------------------------------------------------- */
1293 str_utf8_prefix (const char *text
, const char *prefix
)
1296 const char *nt
, *np
;
1297 const char *nnt
, *nnp
;
1300 t
= str_utf8_normalize (text
);
1301 p
= str_utf8_normalize (prefix
);
1307 while (nt
[0] != '\0' && np
[0] != '\0')
1309 str_utf8_cnext_char_safe (&nnt
);
1310 str_utf8_cnext_char_safe (&nnp
);
1311 if (nnt
- nt
!= nnp
- np
)
1313 if (strncmp (nt
, np
, nnt
- nt
) != 0)
1327 /* --------------------------------------------------------------------------------------------- */
1330 str_utf8_caseprefix (const char *text
, const char *prefix
)
1333 const char *nt
, *np
;
1334 const char *nnt
, *nnp
;
1337 t
= str_utf8_casefold_normalize (text
);
1338 p
= str_utf8_casefold_normalize (prefix
);
1344 while (nt
[0] != '\0' && np
[0] != '\0')
1346 str_utf8_cnext_char_safe (&nnt
);
1347 str_utf8_cnext_char_safe (&nnp
);
1348 if (nnt
- nt
!= nnp
- np
)
1350 if (strncmp (nt
, np
, nnt
- nt
) != 0)
1364 /* --------------------------------------------------------------------------------------------- */
1367 str_utf8_create_key_gen (const char *text
, gboolean case_sen
,
1368 gchar
*(*keygen
) (const gchar
*text
, gssize size
))
1373 result
= str_utf8_normalize (text
);
1378 const char *start
, *end
;
1381 dot
= text
[0] == '.';
1382 fixed
= g_string_sized_new (16);
1389 g_string_append_c (fixed
, '.');
1392 while (!g_utf8_validate (start
, -1, &end
) && start
[0] != '\0')
1396 fold
= g_utf8_casefold (start
, end
- start
);
1397 key
= keygen (fold
, -1);
1398 g_string_append (fixed
, key
);
1402 g_string_append_c (fixed
, end
[0]);
1408 fold
= g_utf8_casefold (start
, -1);
1409 result
= keygen (fold
, -1);
1411 g_string_free (fixed
, TRUE
);
1413 else if (dot
&& (start
== text
+ 1))
1415 fold
= g_utf8_casefold (start
, -1);
1416 key
= keygen (fold
, -1);
1417 g_string_append (fixed
, key
);
1420 result
= g_string_free (fixed
, FALSE
);
1424 if (start
[0] != '\0' && start
!= end
)
1426 fold
= g_utf8_casefold (start
, end
- start
);
1427 key
= keygen (fold
, -1);
1428 g_string_append (fixed
, key
);
1432 result
= g_string_free (fixed
, FALSE
);
1438 /* --------------------------------------------------------------------------------------------- */
1441 str_utf8_create_key (const char *text
, gboolean case_sen
)
1443 return str_utf8_create_key_gen (text
, case_sen
, g_utf8_collate_key
);
1446 /* --------------------------------------------------------------------------------------------- */
1448 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1450 str_utf8_create_key_for_filename (const char *text
, gboolean case_sen
)
1452 return str_utf8_create_key_gen (text
, case_sen
, g_utf8_collate_key_for_filename
);
1456 /* --------------------------------------------------------------------------------------------- */
1459 str_utf8_key_collate (const char *t1
, const char *t2
, gboolean case_sen
)
1462 return strcmp (t1
, t2
);
1465 /* --------------------------------------------------------------------------------------------- */
1468 str_utf8_release_key (char *key
, gboolean case_sen
)
1474 /* --------------------------------------------------------------------------------------------- */
1475 /*** public functions ****************************************************************************/
1476 /* --------------------------------------------------------------------------------------------- */
1479 str_utf8_init (void)
1481 struct str_class result
;
1483 result
.conv_gerror_message
= str_utf8_conv_gerror_message
;
1484 result
.vfs_convert_to
= str_utf8_vfs_convert_to
;
1485 result
.insert_replace_char
= str_utf8_insert_replace_char
;
1486 result
.is_valid_string
= str_utf8_is_valid_string
;
1487 result
.is_valid_char
= str_utf8_is_valid_char
;
1488 result
.cnext_char
= str_utf8_cnext_char
;
1489 result
.cprev_char
= str_utf8_cprev_char
;
1490 result
.cnext_char_safe
= str_utf8_cnext_char_safe
;
1491 result
.cprev_char_safe
= str_utf8_cprev_char_safe
;
1492 result
.cnext_noncomb_char
= str_utf8_cnext_noncomb_char
;
1493 result
.cprev_noncomb_char
= str_utf8_cprev_noncomb_char
;
1494 result
.char_isspace
= str_utf8_isspace
;
1495 result
.char_ispunct
= str_utf8_ispunct
;
1496 result
.char_isalnum
= str_utf8_isalnum
;
1497 result
.char_isdigit
= str_utf8_isdigit
;
1498 result
.char_isprint
= str_utf8_isprint
;
1499 result
.char_iscombiningmark
= str_utf8_iscombiningmark
;
1500 result
.char_toupper
= str_utf8_toupper
;
1501 result
.char_tolower
= str_utf8_tolower
;
1502 result
.length
= str_utf8_length
;
1503 result
.length2
= str_utf8_length2
;
1504 result
.length_noncomb
= str_utf8_length_noncomb
;
1505 result
.fix_string
= str_utf8_fix_string
;
1506 result
.term_form
= str_utf8_term_form
;
1507 result
.fit_to_term
= str_utf8_fit_to_term
;
1508 result
.term_trim
= str_utf8_term_trim
;
1509 result
.term_width2
= str_utf8_term_width2
;
1510 result
.term_width1
= str_utf8_term_width1
;
1511 result
.term_char_width
= str_utf8_term_char_width
;
1512 result
.term_substring
= str_utf8_term_substring
;
1513 result
.trunc
= str_utf8_trunc
;
1514 result
.offset_to_pos
= str_utf8_offset_to_pos
;
1515 result
.column_to_pos
= str_utf8_column_to_pos
;
1516 result
.create_search_needle
= str_utf8_create_search_needle
;
1517 result
.release_search_needle
= str_utf8_release_search_needle
;
1518 result
.search_first
= str_utf8_search_first
;
1519 result
.search_last
= str_utf8_search_last
;
1520 result
.compare
= str_utf8_compare
;
1521 result
.ncompare
= str_utf8_ncompare
;
1522 result
.casecmp
= str_utf8_casecmp
;
1523 result
.ncasecmp
= str_utf8_ncasecmp
;
1524 result
.prefix
= str_utf8_prefix
;
1525 result
.caseprefix
= str_utf8_caseprefix
;
1526 result
.create_key
= str_utf8_create_key
;
1527 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1528 /* case insensitive sort files in "a1 a2 a10" order */
1529 result
.create_key_for_filename
= str_utf8_create_key_for_filename
;
1531 /* case insensitive sort files in "a1 a10 a2" order */
1532 result
.create_key_for_filename
= str_utf8_create_key
;
1534 result
.key_collate
= str_utf8_key_collate
;
1535 result
.release_key
= str_utf8_release_key
;
1540 /* --------------------------------------------------------------------------------------------- */