lib/strutil/strutilutf8.c

   1 /* UTF-8 strings utilities
   2    Copyright (C) 2007 Free Software Foundation, Inc.
   3
   4    Written 2007 by:
   5    Rostislav Benes
   6
   7    The file_date routine is mostly from GNU's fileutils package,
   8    written by Richard Stallman and David MacKenzie.
   9
  10    This program is free software; you can redistribute it and/or modify
  11    it under the terms of the GNU General Public License as published by
  12    the Free Software Foundation; either version 2 of the License, or
  13    (at your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful,
  16    but WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18    GNU General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  23  */
  24
  25 #include <config.h>
  26 #include <stdlib.h>
  27 #include <stdio.h>
  28 #include <errno.h>
  29 #include <glib.h>
  30 #include <langinfo.h>
  31 #include <string.h>
  32
  33 #include "lib/global.h"
  34 #include "lib/strutil.h"
  35
/* The UTF-8 handling in this file is built on glib's Unicode functions. */
  37
/* UTF-8 byte sequence of U+FFFD (REPLACEMENT CHARACTER); emitted in place of
 * bytes that cannot be decoded. */
static const char replch[] = "\xEF\xBF\xBD";
  39
  40 static int
  41 str_unichar_iscombiningmark (gunichar uni)
  42 {
  43     int type = g_unichar_type (uni);
  44     return (type == G_UNICODE_COMBINING_MARK)
  45         || (type == G_UNICODE_ENCLOSING_MARK)
  46         || (type == G_UNICODE_NON_SPACING_MARK);
  47 }
  48
  49 static void
  50 str_utf8_insert_replace_char (GString * buffer)
  51 {
  52     g_string_append (buffer, replch);
  53 }
  54
  55 static int
  56 str_utf8_is_valid_string (const char *text)
  57 {
  58     return g_utf8_validate (text, -1, NULL);
  59 }
  60
  61 static int
  62 str_utf8_is_valid_char (const char *ch, size_t size)
  63 {
  64     switch (g_utf8_get_char_validated (ch, size))
  65     {
  66     case (gunichar) (-2):
  67         return -2;
  68     case (gunichar) (-1):
  69         return -1;
  70     default:
  71         return 1;
  72     }
  73 }
  74
  75 static void
  76 str_utf8_cnext_char (const char **text)
  77 {
  78     (*text) = g_utf8_next_char (*text);
  79 }
  80
  81 static void
  82 str_utf8_cprev_char (const char **text)
  83 {
  84     (*text) = g_utf8_prev_char (*text);
  85 }
  86
  87 static void
  88 str_utf8_cnext_char_safe (const char **text)
  89 {
  90     if (str_utf8_is_valid_char (*text, -1) == 1)
  91         (*text) = g_utf8_next_char (*text);
  92     else
  93         (*text)++;
  94 }
  95
  96 static void
  97 str_utf8_cprev_char_safe (const char **text)
  98 {
  99     const char *result = g_utf8_prev_char (*text);
 100     const char *t = result;
 101     str_utf8_cnext_char_safe (&t);
 102     if (t == *text)
 103         (*text) = result;
 104     else
 105         (*text)--;
 106 }
 107
 108 static void
 109 str_utf8_fix_string (char *text)
 110 {
 111     gunichar uni;
 112
 113     while (text[0] != '\0')
 114     {
 115         uni = g_utf8_get_char_validated (text, -1);
 116         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 117         {
 118             text = g_utf8_next_char (text);
 119         }
 120         else
 121         {
 122             text[0] = '?';
 123             text++;
 124         }
 125     }
 126 }
 127
 128 static int
 129 str_utf8_isspace (const char *text)
 130 {
 131     gunichar uni = g_utf8_get_char_validated (text, -1);
 132     return g_unichar_isspace (uni);
 133 }
 134
 135 static int
 136 str_utf8_ispunct (const char *text)
 137 {
 138     gunichar uni = g_utf8_get_char_validated (text, -1);
 139     return g_unichar_ispunct (uni);
 140 }
 141
 142 static int
 143 str_utf8_isalnum (const char *text)
 144 {
 145     gunichar uni = g_utf8_get_char_validated (text, -1);
 146     return g_unichar_isalnum (uni);
 147 }
 148
 149 static int
 150 str_utf8_isdigit (const char *text)
 151 {
 152     gunichar uni = g_utf8_get_char_validated (text, -1);
 153     return g_unichar_isdigit (uni);
 154 }
 155
 156 static int
 157 str_utf8_isprint (const char *ch)
 158 {
 159     gunichar uni = g_utf8_get_char_validated (ch, -1);
 160     return g_unichar_isprint (uni);
 161 }
 162
 163 static int
 164 str_utf8_iscombiningmark (const char *ch)
 165 {
 166     gunichar uni = g_utf8_get_char_validated (ch, -1);
 167     return str_unichar_iscombiningmark (uni);
 168 }
 169
 170 static int
 171 str_utf8_cnext_noncomb_char (const char **text)
 172 {
 173     int count = 0;
 174     while ((*text)[0] != '\0')
 175     {
 176         str_utf8_cnext_char_safe (text);
 177         count++;
 178         if (!str_utf8_iscombiningmark (*text))
 179             break;
 180     }
 181     return count;
 182 }
 183
 184 static int
 185 str_utf8_cprev_noncomb_char (const char **text, const char *begin)
 186 {
 187     int count = 0;
 188     while ((*text) != begin)
 189     {
 190         str_utf8_cprev_char_safe (text);
 191         count++;
 192         if (!str_utf8_iscombiningmark (*text))
 193             break;
 194     }
 195     return count;
 196 }
 197
 198 static int
 199 str_utf8_toupper (const char *text, char **out, size_t * remain)
 200 {
 201     gunichar uni;
 202     size_t left;
 203
 204     uni = g_utf8_get_char_validated (text, -1);
 205     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 206         return 0;
 207
 208     uni = g_unichar_toupper (uni);
 209     left = g_unichar_to_utf8 (uni, NULL);
 210     if (left >= *remain)
 211         return 0;
 212
 213     left = g_unichar_to_utf8 (uni, *out);
 214     (*out) += left;
 215     (*remain) -= left;
 216     return 1;
 217 }
 218
 219 static int
 220 str_utf8_tolower (const char *text, char **out, size_t * remain)
 221 {
 222     gunichar uni;
 223     size_t left;
 224
 225     uni = g_utf8_get_char_validated (text, -1);
 226     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 227         return 0;
 228
 229     uni = g_unichar_tolower (uni);
 230     left = g_unichar_to_utf8 (uni, NULL);
 231     if (left >= *remain)
 232         return 0;
 233
 234     left = g_unichar_to_utf8 (uni, *out);
 235     (*out) += left;
 236     (*remain) -= left;
 237     return 1;
 238 }
 239
 240 static int
 241 str_utf8_length (const char *text)
 242 {
 243     int result = 0;
 244     const char *start;
 245     const char *end;
 246
 247     start = text;
 248     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 249     {
 250         if (start != end)
 251         {
 252             result += g_utf8_strlen (start, end - start);
 253         }
 254         result++;
 255         start = end + 1;
 256     }
 257
 258     if (start == text)
 259     {
 260         result = g_utf8_strlen (text, -1);
 261     }
 262     else
 263     {
 264         if (start[0] != '\0' && start != end)
 265         {
 266             result += g_utf8_strlen (start, end - start);
 267         }
 268     }
 269
 270     return result;
 271 }
 272
 273 static int
 274 str_utf8_length2 (const char *text, int size)
 275 {
 276     int result = 0;
 277     const char *start;
 278     const char *end;
 279
 280     start = text;
 281     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
 282     {
 283         if (start != end)
 284         {
 285             result += g_utf8_strlen (start, min (end - start, size));
 286             size -= end - start;
 287         }
 288         result += (size > 0);
 289         size--;
 290         start = end + 1;
 291     }
 292
 293     if (start == text)
 294     {
 295         result = g_utf8_strlen (text, size);
 296     }
 297     else
 298     {
 299         if (start[0] != '\0' && start != end && size > 0)
 300         {
 301             result += g_utf8_strlen (start, min (end - start, size));
 302         }
 303     }
 304
 305     return result;
 306 }
 307
 308 static int
 309 str_utf8_length_noncomb (const char *text)
 310 {
 311     int result = 0;
 312     const char *t = text;
 313
 314     while (t[0] != '\0')
 315     {
 316         str_utf8_cnext_noncomb_char (&t);
 317         result++;
 318     }
 319
 320     return result;
 321 }
 322 /*
 323 static void
 324 str_utf8_questmark_sustb (char **string, size_t * left, GString * buffer)
 325 {
 326     char *next = g_utf8_next_char (*string);
 327     (*left) -= next - (*string);
 328     (*string) = next;
 329     g_string_append_c (buffer, '?');
 330 }
 331 */
 332
 333 static gchar *
 334 str_utf8_conv_gerror_message (GError *error, const char *def_msg)
 335 {
 336     if ((error != NULL) && (error->message != NULL))
 337         return g_strdup (error->message);
 338
 339     return g_strdup (def_msg != NULL ? def_msg : "");
 340 }
 341
 342 static estr_t
 343 str_utf8_vfs_convert_to (GIConv coder, const char *string,
 344                          int size, GString * buffer)
 345 {
 346     estr_t result;
 347
 348     if (coder == str_cnv_not_convert)
 349     {
 350         g_string_append_len (buffer, string, size);
 351         result = ESTR_SUCCESS;
 352     }
 353     else
 354         result = str_nconvert (coder, (char *) string, size, buffer);
 355
 356     return result;
 357 }
 358
/* Result of str_utf8_make_make_term_form: a cleaned-up copy of a string
 * together with what was measured while producing it. */
struct term_form
{
    /* NUL-terminated output text */
    char text[BUF_MEDIUM * 6];
    /* width accumulated for the text: 1 per non-combining character,
     * plus 1 more for characters glib reports as wide */
    size_t width;
    /* set to 1 when a combining mark was seen */
    int compose;
};
 365
 366 /* utiliti function, that make string valid in utf8 and all characters printable
 367  * return width of string too*/
 368 static const struct term_form *
 369 str_utf8_make_make_term_form (const char *text, size_t length)
 370 {
 371     static struct term_form result;
 372     gunichar uni;
 373     size_t left;
 374     char *actual;
 375
 376     result.text[0] = '\0';
 377     result.width = 0;
 378     result.compose = 0;
 379     actual = result.text;
 380
 381     /* check if text start with combining character,
 382      * add space at begin in this case */
 383     if (length != 0 && text[0] != '\0')
 384     {
 385         uni = g_utf8_get_char_validated (text, -1);
 386         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 387         {
 388             if (str_unichar_iscombiningmark (uni))
 389             {
 390                 actual[0] = ' ';
 391                 actual++;
 392                 result.width++;
 393                 result.compose = 1;
 394             }
 395         }
 396     }
 397
 398     while (length != 0 && text[0] != '\0') {
 399         uni = g_utf8_get_char_validated (text, -1);
 400         if ((uni != (gunichar)(-1)) && (uni != (gunichar)(-2))) {
 401             if (g_unichar_isprint(uni)) {
 402                 left = g_unichar_to_utf8 (uni, actual);
 403                 actual+= left;
 404                 if (!str_unichar_iscombiningmark (uni)) {
 405                     result.width++;
 406                     if (g_unichar_iswide(uni)) result.width++;
 407                 } else result.compose = 1;
 408             } else {
 409                 actual[0] = '.';
 410                 actual++;
 411                 result.width++;
 412             }
 413             text = g_utf8_next_char (text);
 414         } else {
 415             text++;
 416             /*actual[0] = '?';*/
 417             memcpy (actual, replch, strlen (replch));
 418             actual+= strlen (replch);
 419             result.width++;
 420         }
 421         if (length != (size_t) (-1)) length--;    }
 422     actual[0] = '\0';
 423
 424     return &result;
 425 }
 426
 427 static const char *
 428 str_utf8_term_form (const char *text)
 429 {
 430     static char result[BUF_MEDIUM * 6];
 431     const struct term_form *pre_form;
 432     char *composed;
 433
 434     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 435     if (pre_form->compose)
 436     {
 437         composed =
 438             g_utf8_normalize (pre_form->text, -1,
 439                               G_NORMALIZE_DEFAULT_COMPOSE);
 440         g_strlcpy (result, composed, sizeof (result));
 441         g_free (composed);
 442     }
 443     else
 444     {
 445         g_strlcpy (result, pre_form->text, sizeof (result));
 446     }
 447     return result;
 448 }
 449
/* Working state shared by the utf8_tool_* helpers while they build an
 * output string from a source string. */
struct utf8_tool
{
    /* write position in the output buffer */
    char *actual;
    /* space still available at the write position */
    size_t remain;
    /* read position in the source text (field name spelling is historical) */
    const char *cheked;
    /* column counter advanced by the copy/skip helpers */
    int ident;
    /* set when a copy helper has copied a combining mark */
    int compose;
};
 458
 459 /* utiliti function, that copy all characters from cheked to actual */
 460 static int
 461 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
 462 {
 463     size_t left;
 464     gunichar uni;
 465
 466     tool->compose = 0;
 467
 468     while (tool->cheked[0] != '\0')
 469     {
 470         uni = g_utf8_get_char (tool->cheked);
 471         tool->compose |= str_unichar_iscombiningmark (uni);
 472         left = g_unichar_to_utf8 (uni, NULL);
 473         if (tool->remain <= left)
 474             return 0;
 475         left = g_unichar_to_utf8 (uni, tool->actual);
 476         tool->actual += left;
 477         tool->remain -= left;
 478         tool->cheked = g_utf8_next_char (tool->cheked);
 479     }
 480     return 1;
 481 }
 482
 483 /* utiliti function, that copy characters from cheked to actual until ident is
 484  * smaller than to_ident */
 485 static int
 486 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
 487 {
 488     size_t left;
 489     gunichar uni;
 490     int w;
 491
 492     tool->compose = 0;
 493
 494     while (tool->cheked[0] != '\0')
 495     {
 496         uni = g_utf8_get_char (tool->cheked);
 497         if (!str_unichar_iscombiningmark (uni))
 498         {
 499             w = 1;
 500             if (g_unichar_iswide (uni))
 501                 w++;
 502             if (tool->ident + w > to_ident)
 503                 return 1;
 504         }
 505         else
 506         {
 507             w = 0;
 508             tool->compose = 1;
 509         }
 510
 511         left = g_unichar_to_utf8 (uni, NULL);
 512         if (tool->remain <= left)
 513             return 0;
 514         left = g_unichar_to_utf8 (uni, tool->actual);
 515         tool->actual += left;
 516         tool->remain -= left;
 517         tool->cheked = g_utf8_next_char (tool->cheked);
 518         tool->ident += w;
 519     }
 520     return 1;
 521 }
 522
 523 /* utiliti function, add count spaces to actual */
 524 static int
 525 utf8_tool_insert_space (struct utf8_tool *tool, int count)
 526 {
 527     if (count <= 0)
 528         return 1;
 529     if (tool->remain <= (gsize) count)
 530         return 0;
 531     memset (tool->actual, ' ', count);
 532     tool->actual += count;
 533     tool->remain -= count;
 534     return 1;
 535 }
 536
 537 /* utiliti function, add one characters to actual */
 538 static int
 539 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
 540 {
 541     if (tool->remain <= 1)
 542         return 0;
 543     tool->actual[0] = ch;
 544     tool->actual++;
 545     tool->remain--;
 546     return 1;
 547 }
 548
 549 /* utiliti function, thah skip characters from cheked until ident is greater or
 550  * equal to to_ident */
 551 static int
 552 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
 553 {
 554     gunichar uni;
 555
 556     while (to_ident > tool->ident && tool->cheked[0] != '\0')
 557     {
 558         uni = g_utf8_get_char (tool->cheked);
 559         if (!str_unichar_iscombiningmark (uni))
 560         {
 561             tool->ident++;
 562             if (g_unichar_iswide (uni))
 563                 tool->ident++;
 564         }
 565         tool->cheked = g_utf8_next_char (tool->cheked);
 566     }
 567     uni = g_utf8_get_char (tool->cheked);
 568     while (str_unichar_iscombiningmark (uni))
 569     {
 570         tool->cheked = g_utf8_next_char (tool->cheked);
 571         uni = g_utf8_get_char (tool->cheked);
 572     }
 573     return 1;
 574 }
 575
 576 static void
 577 utf8_tool_compose (char *buffer, size_t size)
 578 {
 579     char *composed =
 580         g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 581     g_strlcpy (buffer, composed, size);
 582     g_free (composed);
 583 }
 584
 585
 586 static const char *
 587 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
 588 {
 589     static char result[BUF_MEDIUM * 6];
 590     const struct term_form *pre_form;
 591     struct utf8_tool tool;
 592
 593     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 594     tool.cheked = pre_form->text;
 595     tool.actual = result;
 596     tool.remain = sizeof (result);
 597     tool.compose = 0;
 598
 599     if (pre_form->width <= (gsize)width)
 600     {
 601         tool.ident = 0;
 602         switch (HIDE_FIT (just_mode))
 603         {
 604         case J_CENTER_LEFT:
 605         case J_CENTER:
 606             tool.ident = (width - pre_form->width) / 2;
 607             break;
 608         case J_RIGHT:
 609             tool.ident = width - pre_form->width;
 610             break;
 611         }
 612
 613         utf8_tool_insert_space (&tool, tool.ident);
 614         utf8_tool_copy_chars_to_end (&tool);
 615         utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
 616     }
 617     else
 618     {
 619         if (IS_FIT (just_mode))
 620         {
 621             tool.ident = 0;
 622             utf8_tool_copy_chars_to (&tool, width / 2);
 623             utf8_tool_insert_char (&tool, '~');
 624
 625             tool.ident = 0;
 626             utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 627             utf8_tool_copy_chars_to_end (&tool);
 628             utf8_tool_insert_space (&tool,
 629                                     width - (pre_form->width - tool.ident +
 630                                              1));
 631         }
 632         else
 633         {
 634             tool.ident = 0;
 635             switch (HIDE_FIT (just_mode))
 636             {
 637             case J_CENTER:
 638                 tool.ident = (width - pre_form->width) / 2;
 639                 break;
 640             case J_RIGHT:
 641                 tool.ident = width - pre_form->width;
 642                 break;
 643             }
 644
 645             utf8_tool_skip_chars_to (&tool, 0);
 646             utf8_tool_insert_space (&tool, tool.ident);
 647             utf8_tool_copy_chars_to (&tool, width);
 648             utf8_tool_insert_space (&tool, width - tool.ident);
 649         }
 650     }
 651
 652     tool.actual[0] = '\0';
 653     if (tool.compose)
 654         utf8_tool_compose (result, sizeof (result));
 655     return result;
 656 }
 657
 658 static const char *
 659 str_utf8_term_trim (const char *text, int width)
 660 {
 661     static char result[BUF_MEDIUM * 6];
 662     const struct term_form *pre_form;
 663     struct utf8_tool tool;
 664
 665     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 666
 667     tool.cheked = pre_form->text;
 668     tool.actual = result;
 669     tool.remain = sizeof (result);
 670     tool.compose = 0;
 671
 672     if ((gsize)width < pre_form->width)
 673     {
 674         if (width <= 3)
 675         {
 676             memset (tool.actual, '.', width);
 677             tool.actual += width;
 678             tool.remain -= width;
 679         }
 680         else
 681         {
 682             memset (tool.actual, '.', 3);
 683             tool.actual += 3;
 684             tool.remain -= 3;
 685
 686             tool.ident = 0;
 687             utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
 688             utf8_tool_copy_chars_to_end (&tool);
 689         }
 690     }
 691     else
 692     {
 693         utf8_tool_copy_chars_to_end (&tool);
 694     }
 695
 696     tool.actual[0] = '\0';
 697     if (tool.compose)
 698         utf8_tool_compose (result, sizeof (result));
 699     return result;
 700 }
 701
 702 static int
 703 str_utf8_term_width2 (const char *text, size_t length)
 704 {
 705     const struct term_form *result;
 706
 707     result = str_utf8_make_make_term_form (text, length);
 708     return result->width;
 709 }
 710
 711 static int
 712 str_utf8_term_width1 (const char *text)
 713 {
 714     return str_utf8_term_width2 (text, (size_t) (-1));
 715 }
 716
 717 static int
 718 str_utf8_term_char_width (const char *text)
 719 {
 720     gunichar uni = g_utf8_get_char_validated (text, -1);
 721     return (str_unichar_iscombiningmark (uni)) ? 0
 722         : ((g_unichar_iswide (uni)) ? 2 : 1);
 723 }
 724
 725 static void
 726 str_utf8_msg_term_size (const char *text, int *lines, int *columns)
 727 {
 728     char *p, *tmp;
 729     char *q;
 730     char c = '\0';
 731     int width;
 732
 733     (*lines) = 1;
 734     (*columns) = 0;
 735
 736     tmp = g_strdup (text);
 737     p = tmp;
 738     for (;;)
 739     {
 740         q = strchr (p, '\n');
 741         if (q != NULL)
 742         {
 743             c = q[0];
 744             q[0] = '\0';
 745         }
 746
 747         width = str_utf8_term_width1 (p);
 748         if (width > (*columns))
 749             (*columns) = width;
 750
 751         if (q == NULL)
 752             break;
 753         q[0] = c;
 754         p = q + 1;
 755         (*lines)++;
 756     }
 757     g_free (tmp);
 758 }
 759
 760 static const char *
 761 str_utf8_term_substring (const char *text, int start, int width)
 762 {
 763     static char result[BUF_MEDIUM * 6];
 764     const struct term_form *pre_form;
 765     struct utf8_tool tool;
 766
 767     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 768
 769     tool.cheked = pre_form->text;
 770     tool.actual = result;
 771     tool.remain = sizeof (result);
 772     tool.compose = 0;
 773
 774     tool.ident = -start;
 775     utf8_tool_skip_chars_to (&tool, 0);
 776     if (tool.ident < 0)
 777         tool.ident = 0;
 778     utf8_tool_insert_space (&tool, tool.ident);
 779
 780     utf8_tool_copy_chars_to (&tool, width);
 781     utf8_tool_insert_space (&tool, width - tool.ident);
 782
 783     tool.actual[0] = '\0';
 784     if (tool.compose)
 785         utf8_tool_compose (result, sizeof (result));
 786     return result;
 787 }
 788
 789 static const char *
 790 str_utf8_trunc (const char *text, int width)
 791 {
 792     static char result[MC_MAXPATHLEN * 6 * 2];
 793     const struct term_form *pre_form;
 794     struct utf8_tool tool;
 795
 796     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 797
 798     tool.cheked = pre_form->text;
 799     tool.actual = result;
 800     tool.remain = sizeof (result);
 801     tool.compose = 0;
 802
 803     if (pre_form->width > (gsize)width)
 804     {
 805         tool.ident = 0;
 806         utf8_tool_copy_chars_to (&tool, width / 2);
 807         utf8_tool_insert_char (&tool, '~');
 808
 809         tool.ident = 0;
 810         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 811         utf8_tool_copy_chars_to_end (&tool);
 812     }
 813     else
 814     {
 815         utf8_tool_copy_chars_to_end (&tool);
 816     }
 817
 818     tool.actual[0] = '\0';
 819     if (tool.compose)
 820         utf8_tool_compose (result, sizeof (result));
 821     return result;
 822 }
 823
 824 static int
 825 str_utf8_offset_to_pos (const char *text, size_t length)
 826 {
 827     if (str_utf8_is_valid_string (text))
 828         return g_utf8_offset_to_pointer (text, length) - text;
 829     else
 830     {
 831         int result;
 832         GString *buffer = g_string_new (text);
 833
 834         str_utf8_fix_string (buffer->str);
 835         result = g_utf8_offset_to_pointer (buffer->str, length) - buffer->str;
 836         g_string_free (buffer, TRUE);
 837         return result;
 838     }
 839 }
 840
 841 static int
 842 str_utf8_column_to_pos (const char *text, size_t pos)
 843 {
 844     static int result;
 845     gunichar uni;
 846     int width;
 847
 848     width = 0;
 849     result = 0;
 850
 851     while (text[0] != '\0')
 852     {
 853         uni = g_utf8_get_char_validated (text, 6);
 854         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 855         {
 856             if (g_unichar_isprint (uni))
 857             {
 858                 if (!str_unichar_iscombiningmark (uni))
 859                 {
 860                     width++;
 861                     if (g_unichar_iswide (uni))
 862                         width++;
 863                 }
 864             }
 865             else
 866             {
 867                 width++;
 868             }
 869             text = g_utf8_next_char (text);
 870         }
 871         else
 872         {
 873             text++;
 874             width++;
 875         }
 876         if ((gsize)width > pos)
 877             return result;
 878
 879         result++;
 880     }
 881
 882     return result;
 883 }
 884
 885 static char *
 886 str_utf8_create_search_needle (const char *needle, int case_sen)
 887 {
 888     if (needle != NULL)
 889     {
 890         if (case_sen)
 891         {
 892             return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
 893         }
 894         else
 895         {
 896             char *fold = g_utf8_casefold (needle, -1);
 897             char *result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
 898             g_free (fold);
 899             return result;
 900         }
 901     }
 902     else
 903         return NULL;
 904 }
 905
 906 static void
 907 str_utf8_release_search_needle (char *needle, int case_sen)
 908 {
 909     (void) case_sen;
 910     if (needle != NULL)
 911         g_free (needle);
 912 }
 913
 914 static const char *
 915 str_utf8_search_first (const char *text, const char *search, int case_sen)
 916 {
 917     char *fold_text;
 918     char *deco_text;
 919     const char *match;
 920     const char *result = NULL;
 921     const char *m;
 922
 923     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
 924     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 925
 926     match = deco_text;
 927     do
 928     {
 929         match = g_strstr_len (match, -1, search);
 930         if (match != NULL)
 931         {
 932             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
 933                 !str_utf8_iscombiningmark (match + strlen (search)))
 934             {
 935
 936                 result = text;
 937                 m = deco_text;
 938                 while (m < match)
 939                 {
 940                     str_utf8_cnext_noncomb_char (&m);
 941                     str_utf8_cnext_noncomb_char (&result);
 942                 }
 943             }
 944             else
 945             {
 946                 str_utf8_cnext_char (&match);
 947             }
 948         }
 949     }
 950     while (match != NULL && result == NULL);
 951
 952     g_free (deco_text);
 953     if (!case_sen)
 954         g_free (fold_text);
 955
 956     return result;
 957 }
 958
 959 static const char *
 960 str_utf8_search_last (const char *text, const char *search, int case_sen)
 961 {
 962     char *fold_text;
 963     char *deco_text;
 964     char *match;
 965     const char *result = NULL;
 966     const char *m;
 967
 968     fold_text = (case_sen) ? (char *) text : g_utf8_casefold (text, -1);
 969     deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
 970
 971     do
 972     {
 973         match = g_strrstr_len (deco_text, -1, search);
 974         if (match != NULL)
 975         {
 976             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
 977                 !str_utf8_iscombiningmark (match + strlen (search)))
 978             {
 979
 980                 result = text;
 981                 m = deco_text;
 982                 while (m < match)
 983                 {
 984                     str_utf8_cnext_noncomb_char (&m);
 985                     str_utf8_cnext_noncomb_char (&result);
 986                 }
 987             }
 988             else
 989             {
 990                 match[0] = '\0';
 991             }
 992         }
 993     }
 994     while (match != NULL && result == NULL);
 995
 996     g_free (deco_text);
 997     if (!case_sen)
 998         g_free (fold_text);
 999
1000     return result;
1001 }
1002
1003 static char *
1004 str_utf8_normalize (const char *text)
1005 {
1006     GString *fixed = g_string_new ("");
1007     char *tmp;
1008     char *result;
1009     const char *start;
1010     const char *end;
1011
1012     start = text;
1013     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1014     {
1015         if (start != end)
1016         {
1017             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1018             g_string_append (fixed, tmp);
1019             g_free (tmp);
1020         }
1021         g_string_append_c (fixed, end[0]);
1022         start = end + 1;
1023     }
1024
1025     if (start == text)
1026     {
1027         result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1028     }
1029     else
1030     {
1031         if (start[0] != '\0' && start != end)
1032         {
1033             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1034             g_string_append (fixed, tmp);
1035             g_free (tmp);
1036         }
1037         result = g_strdup (fixed->str);
1038     }
1039     g_string_free (fixed, TRUE);
1040
1041     return result;
1042 }
1043
1044 static char *
1045 str_utf8_casefold_normalize (const char *text)
1046 {
1047     GString *fixed = g_string_new ("");
1048     char *tmp, *fold;
1049     char *result;
1050     const char *start;
1051     const char *end;
1052
1053     start = text;
1054     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1055     {
1056         if (start != end)
1057         {
1058             fold = g_utf8_casefold (start, end - start);
1059             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1060             g_string_append (fixed, tmp);
1061             g_free (tmp);
1062             g_free (fold);
1063         }
1064         g_string_append_c (fixed, end[0]);
1065         start = end + 1;
1066     }
1067
1068     if (start == text)
1069     {
1070         fold = g_utf8_casefold (text, -1);
1071         result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1072         g_free (fold);
1073     }
1074     else
1075     {
1076         if (start[0] != '\0' && start != end)
1077         {
1078             fold = g_utf8_casefold (start, end - start);
1079             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1080             g_string_append (fixed, tmp);
1081             g_free (tmp);
1082             g_free (fold);
1083         }
1084         result = g_strdup (fixed->str);
1085     }
1086     g_string_free (fixed, TRUE);
1087
1088     return result;
1089 }
1090
/* Compare t1 and t2 with strcmp after passing both through
 * str_utf8_normalize.  Returns the strcmp result. */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1107
/* Compare the normalized forms of t1 and t2 over the length of the shorter
 * of the two.  Returns the strncmp result. */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    const int order = strncmp (left, right, min (left_len, right_len));

    g_free (left);
    g_free (right);

    return order;
}
1124
/* Compare t1 and t2 with strcmp after passing both through
 * str_utf8_casefold_normalize.  Returns the strcmp result. */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1141
/* Case-insensitive comparison of T1 and T2 over the length of the shorter one.
   Both strings are converted with str_utf8_casefold_normalize (); only as
   many bytes as the shorter converted copy holds are compared with
   strncmp ().  Returns the strncmp () result. */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    /* number of bytes both converted strings are guaranteed to have */
    const size_t span = (left_len < right_len) ? left_len : right_len;
    const int order = strncmp (left, right, span);

    g_free (left);
    g_free (right);

    return order;
}
1158
/* Measure how much of PREFIX matches the beginning of TEXT.
   Both arguments are passed through str_utf8_normalize () and the copies are
   walked in parallel, one step of str_utf8_cnext_char_safe () at a time
   (presumably one character per step).  The walk stops at the end of either
   copy or at the first step whose bytes differ in length or content.
   Returns the number of matched bytes, measured in the NORMALIZED prefix. */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *tpos = norm_text;
    const char *ppos = norm_prefix;
    int matched;

    while (*tpos != '\0' && *ppos != '\0')
    {
        const char *tnext = tpos;
        const char *pnext = ppos;

        str_utf8_cnext_char_safe (&tnext);
        str_utf8_cnext_char_safe (&pnext);

        /* a step matches only if it spans the same number of identical bytes */
        if ((tnext - tpos) != (pnext - ppos)
            || strncmp (tpos, ppos, tnext - tpos) != 0)
            break;

        tpos = tnext;
        ppos = pnext;
    }

    matched = ppos - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1189
/* Case-insensitive variant of the prefix measurement.
   Both arguments are converted with str_utf8_casefold_normalize () and the
   copies are walked in parallel, one step of str_utf8_cnext_char_safe () at a
   time (presumably one character per step).  The walk stops at the end of
   either copy or at the first step whose bytes differ in length or content.
   Returns the number of matched bytes, measured in the CONVERTED prefix. */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *fold_text = str_utf8_casefold_normalize (text);
    char *fold_prefix = str_utf8_casefold_normalize (prefix);
    const char *tpos = fold_text;
    const char *ppos = fold_prefix;
    int matched;

    while (*tpos != '\0' && *ppos != '\0')
    {
        const char *tnext = tpos;
        const char *pnext = ppos;

        str_utf8_cnext_char_safe (&tnext);
        str_utf8_cnext_char_safe (&pnext);

        /* a step matches only if it spans the same number of identical bytes */
        if ((tnext - tpos) != (pnext - ppos)
            || strncmp (tpos, ppos, tnext - tpos) != 0)
            break;

        tpos = tnext;
        ppos = pnext;
    }

    matched = ppos - fold_prefix;

    g_free (fold_text);
    g_free (fold_prefix);

    return matched;
}
1220
1221 static char *
1222 str_utf8_create_key_gen (const char *text, int case_sen,
1223                          gchar * (*keygen) (const gchar *, gssize size))
1224 {
1225     char *result;
1226
1227     if (case_sen) {
1228         result = str_utf8_normalize (text);
1229     } else {
1230         const char *start, *end;
1231         char *fold, *key;
1232         GString *fixed = g_string_new ("");
1233
1234         start = text;
1235         while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1236         {
1237             if (start != end)
1238             {
1239                 fold = g_utf8_casefold (start, end - start);
1240                 key = keygen (fold, -1);
1241                 g_string_append (fixed, key);
1242                 g_free (key);
1243                 g_free (fold);
1244             }
1245             g_string_append_c (fixed, end[0]);
1246             start = end + 1;
1247         }
1248
1249         if (start == text)
1250         {
1251             fold = g_utf8_casefold (text, -1);
1252             result = keygen (fold, -1);
1253             g_free (fold);
1254         }
1255         else
1256         {
1257             if (start[0] != '\0' && start != end)
1258             {
1259                 fold = g_utf8_casefold (start, end - start);
1260                 key = keygen (fold, -1);
1261                 g_string_append (fixed, key);
1262                 g_free (key);
1263                 g_free (fold);
1264             }
1265             result = g_strdup (fixed->str);
1266         }
1267         g_string_free (fixed, TRUE);
1268     }
1269     return result;
1270 }
1271
1272 static char *
1273 str_utf8_create_key (const char *text, int case_sen)
1274 {
1275     return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1276 }
1277
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/* Create a sort key for the file name TEXT by calling
   str_utf8_create_key_gen () with g_utf8_collate_key_for_filename () as the
   key generator.  CASE_SEN is passed through unchanged.  Only compiled when
   MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME is defined. */
static char *
str_utf8_create_key_for_filename (const char *text, int case_sen)
{
    char *key;

    key = str_utf8_create_key_gen (text, case_sen,
                                   g_utf8_collate_key_for_filename);

    return key;
}
#endif
1286
/* Order two keys T1 and T2 bytewise with strcmp ().
   CASE_SEN is accepted for interface compatibility and ignored here.
   Returns the strcmp () result: negative, zero or positive. */
static int
str_utf8_key_collate (const char *t1, const char *t2, int case_sen)
{
    int order;

    (void) case_sen;

    order = strcmp (t1, t2);

    return order;
}
1293
/* Release KEY with g_free ().
   CASE_SEN is accepted for interface compatibility and ignored here. */
static void
str_utf8_release_key (char *key, int case_sen)
{
    g_free (key);

    (void) case_sen;
}
1300
1301 struct str_class
1302 str_utf8_init (void)
1303 {
1304     struct str_class result;
1305
1306     result.conv_gerror_message = str_utf8_conv_gerror_message;
1307     result.vfs_convert_to = str_utf8_vfs_convert_to;
1308     result.insert_replace_char = str_utf8_insert_replace_char;
1309     result.is_valid_string = str_utf8_is_valid_string;
1310     result.is_valid_char = str_utf8_is_valid_char;
1311     result.cnext_char = str_utf8_cnext_char;
1312     result.cprev_char = str_utf8_cprev_char;
1313     result.cnext_char_safe = str_utf8_cnext_char_safe;
1314     result.cprev_char_safe = str_utf8_cprev_char_safe;
1315     result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1316     result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1317     result.isspace = str_utf8_isspace;
1318     result.ispunct = str_utf8_ispunct;
1319     result.isalnum = str_utf8_isalnum;
1320     result.isdigit = str_utf8_isdigit;
1321     result.isprint = str_utf8_isprint;
1322     result.iscombiningmark = str_utf8_iscombiningmark;
1323     result.toupper = str_utf8_toupper;
1324     result.tolower = str_utf8_tolower;
1325     result.length = str_utf8_length;
1326     result.length2 = str_utf8_length2;
1327     result.length_noncomb = str_utf8_length_noncomb;
1328     result.fix_string = str_utf8_fix_string;
1329     result.term_form = str_utf8_term_form;
1330     result.fit_to_term = str_utf8_fit_to_term;
1331     result.term_trim = str_utf8_term_trim;
1332     result.term_width2 = str_utf8_term_width2;
1333     result.term_width1 = str_utf8_term_width1;
1334     result.term_char_width = str_utf8_term_char_width;
1335     result.msg_term_size = str_utf8_msg_term_size;
1336     result.term_substring = str_utf8_term_substring;
1337     result.trunc = str_utf8_trunc;
1338     result.offset_to_pos = str_utf8_offset_to_pos;
1339     result.column_to_pos = str_utf8_column_to_pos;
1340     result.create_search_needle = str_utf8_create_search_needle;
1341     result.release_search_needle = str_utf8_release_search_needle;
1342     result.search_first = str_utf8_search_first;
1343     result.search_last = str_utf8_search_last;
1344     result.compare = str_utf8_compare;
1345     result.ncompare = str_utf8_ncompare;
1346     result.casecmp = str_utf8_casecmp;
1347     result.ncasecmp = str_utf8_ncasecmp;
1348     result.prefix = str_utf8_prefix;
1349     result.caseprefix = str_utf8_caseprefix;
1350     result.create_key = str_utf8_create_key;
1351 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1352     /* case insensitive sort files in "a1 a2 a10" order */
1353     result.create_key_for_filename = str_utf8_create_key_for_filename;
1354 #else
1355     /* case insensitive sort files in "a1 a10 a2" order */
1356     result.create_key_for_filename = str_utf8_create_key;
1357 #endif
1358     result.key_collate = str_utf8_key_collate;
1359     result.release_key = str_utf8_release_key;
1360
1361     return result;
1362 }