lib/strutil/strutilutf8.c

   1 /*
   2    UTF-8 strings utilities
   3
   4    Copyright (C) 2007-2024
   5    Free Software Foundation, Inc.
   6
   7    Written by:
   8    Rostislav Benes, 2007
   9
  10    This file is part of the Midnight Commander.
  11
  12    The Midnight Commander is free software: you can redistribute it
  13    and/or modify it under the terms of the GNU General Public License as
  14    published by the Free Software Foundation, either version 3 of the License,
  15    or (at your option) any later version.
  16
  17    The Midnight Commander is distributed in the hope that it will be useful,
  18    but WITHOUT ANY WARRANTY; without even the implied warranty of
  19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20    GNU General Public License for more details.
  21
  22    You should have received a copy of the GNU General Public License
  23    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  24  */
  25
  26 #include <config.h>
  27
  28 #include <stdlib.h>
  29 #include <langinfo.h>
  30 #include <limits.h>             /* MB_LEN_MAX */
  31 #include <string.h>
  32
  33 #include "lib/global.h"
  34 #include "lib/strutil.h"
  35
  36 /* uses the UTF-8 functions provided by GLib */
  37
  38 /*** global variables ****************************************************************************/
  39
  40 /*** file scope macro definitions ****************************************************************/
  41
  42 /*** file scope type declarations ****************************************************************/
  43
  44 struct utf8_tool            /* cursor state shared by the utf8_tool_*() helpers */
  45 {
  46     char *actual;           /* next write position in the output buffer */
  47     size_t remain;          /* bytes still free at 'actual' */
  48     const char *checked;    /* next read position in the source text */
  49     int ident;              /* running column count that the helpers compare against to_ident */
  50     gboolean compose;       /* TRUE once a combining mark was copied; callers then normalize the result */
  51 };
  52
  53 struct term_form            /* result of str_utf8_make_make_term_form() */
  54 {
  55     char text[BUF_MEDIUM * MB_LEN_MAX];     /* sanitized, NUL-terminated copy of the input */
  56     size_t width;           /* number of terminal columns counted for 'text' */
  57     gboolean compose;       /* TRUE if a combining mark was seen while building 'text' */
  58 };
  59
  60 /*** forward declarations (file scope functions) *************************************************/
  61
  62 /*** file scope variables ************************************************************************/
  63
  64 static const char replch[] = "\xEF\xBF\xBD";    /* UTF-8 encoding of U+FFFD, emitted in place of invalid input bytes */
  65
  66 /* --------------------------------------------------------------------------------------------- */
  67 /*** file scope functions ************************************************************************/
  68 /* --------------------------------------------------------------------------------------------- */
  69
  70 static gboolean
  71 str_unichar_iscombiningmark (gunichar uni)
  72 {
  73     GUnicodeType type;
  74
  75     type = g_unichar_type (uni);
  76     return (type == G_UNICODE_SPACING_MARK)
  77         || (type == G_UNICODE_ENCLOSING_MARK) || (type == G_UNICODE_NON_SPACING_MARK);
  78 }
  79
  80 /* --------------------------------------------------------------------------------------------- */
  81
  82 static void
  83 str_utf8_insert_replace_char (GString *buffer)
  84 {
  85     g_string_append (buffer, replch);
  86 }
  87
  88 /* --------------------------------------------------------------------------------------------- */
  89
  90 static gboolean
  91 str_utf8_is_valid_string (const char *text)
  92 {
  93     return g_utf8_validate (text, -1, NULL);
  94 }
  95
  96 /* --------------------------------------------------------------------------------------------- */
  97
  98 static int
  99 str_utf8_is_valid_char (const char *ch, size_t size)
 100 {
 101     switch (g_utf8_get_char_validated (ch, size))
 102     {
 103     case (gunichar) (-2):
 104         return (-2);
 105     case (gunichar) (-1):
 106         return (-1);
 107     default:
 108         return 1;
 109     }
 110 }
 111
 112 /* --------------------------------------------------------------------------------------------- */
 113
 114 static void
 115 str_utf8_cnext_char (const char **text)
 116 {
 117     (*text) = g_utf8_next_char (*text);
 118 }
 119
 120 /* --------------------------------------------------------------------------------------------- */
 121
 122 static void
 123 str_utf8_cprev_char (const char **text)
 124 {
 125     (*text) = g_utf8_prev_char (*text);
 126 }
 127
 128 /* --------------------------------------------------------------------------------------------- */
 129
 130 static void
 131 str_utf8_cnext_char_safe (const char **text)
 132 {
 133     if (str_utf8_is_valid_char (*text, -1) == 1)
 134         (*text) = g_utf8_next_char (*text);
 135     else
 136         (*text)++;
 137 }
 138
 139 /* --------------------------------------------------------------------------------------------- */
 140
 141 static void
 142 str_utf8_cprev_char_safe (const char **text)
 143 {
 144     const char *result, *t;
 145
 146     result = g_utf8_prev_char (*text);
 147     t = result;
 148     str_utf8_cnext_char_safe (&t);
 149     if (t == *text)
 150         (*text) = result;
 151     else
 152         (*text)--;
 153 }
 154
 155 /* --------------------------------------------------------------------------------------------- */
 156
 157 static void
 158 str_utf8_fix_string (char *text)
 159 {
 160     while (text[0] != '\0')
 161     {
 162         gunichar uni;
 163
 164         uni = g_utf8_get_char_validated (text, -1);
 165         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 166             text = g_utf8_next_char (text);
 167         else
 168         {
 169             text[0] = '?';
 170             text++;
 171         }
 172     }
 173 }
 174
 175 /* --------------------------------------------------------------------------------------------- */
 176
 177 static gboolean
 178 str_utf8_isspace (const char *text)
 179 {
 180     gunichar uni;
 181
 182     uni = g_utf8_get_char_validated (text, -1);
 183     return g_unichar_isspace (uni);
 184 }
 185
 186 /* --------------------------------------------------------------------------------------------- */
 187
 188 static gboolean
 189 str_utf8_ispunct (const char *text)
 190 {
 191     gunichar uni;
 192
 193     uni = g_utf8_get_char_validated (text, -1);
 194     return g_unichar_ispunct (uni);
 195 }
 196
 197 /* --------------------------------------------------------------------------------------------- */
 198
 199 static gboolean
 200 str_utf8_isalnum (const char *text)
 201 {
 202     gunichar uni;
 203
 204     uni = g_utf8_get_char_validated (text, -1);
 205     return g_unichar_isalnum (uni);
 206 }
 207
 208 /* --------------------------------------------------------------------------------------------- */
 209
 210 static gboolean
 211 str_utf8_isdigit (const char *text)
 212 {
 213     gunichar uni;
 214
 215     uni = g_utf8_get_char_validated (text, -1);
 216     return g_unichar_isdigit (uni);
 217 }
 218
 219 /* --------------------------------------------------------------------------------------------- */
 220
 221 static gboolean
 222 str_utf8_isprint (const char *ch)
 223 {
 224     gunichar uni;
 225
 226     uni = g_utf8_get_char_validated (ch, -1);
 227     return g_unichar_isprint (uni);
 228 }
 229
 230 /* --------------------------------------------------------------------------------------------- */
 231
 232 static gboolean
 233 str_utf8_iscombiningmark (const char *ch)
 234 {
 235     gunichar uni;
 236
 237     uni = g_utf8_get_char_validated (ch, -1);
 238     return str_unichar_iscombiningmark (uni);
 239 }
 240
 241 /* --------------------------------------------------------------------------------------------- */
 242
 243 static int
 244 str_utf8_cnext_noncomb_char (const char **text)
 245 {
 246     int count = 0;
 247
 248     while ((*text)[0] != '\0')
 249     {
 250         str_utf8_cnext_char_safe (text);
 251         count++;
 252         if (!str_utf8_iscombiningmark (*text))
 253             break;
 254     }
 255
 256     return count;
 257 }
 258
 259 /* --------------------------------------------------------------------------------------------- */
 260
 261 static int
 262 str_utf8_cprev_noncomb_char (const char **text, const char *begin)
 263 {
 264     int count = 0;
 265
 266     while ((*text) != begin)
 267     {
 268         str_utf8_cprev_char_safe (text);
 269         count++;
 270         if (!str_utf8_iscombiningmark (*text))
 271             break;
 272     }
 273
 274     return count;
 275 }
 276
 277 /* --------------------------------------------------------------------------------------------- */
 278
 279 static gboolean
 280 str_utf8_toupper (const char *text, char **out, size_t *remain)
 281 {
 282     gunichar uni;
 283     size_t left;
 284
 285     uni = g_utf8_get_char_validated (text, -1);
 286     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 287         return FALSE;
 288
 289     uni = g_unichar_toupper (uni);
 290     left = g_unichar_to_utf8 (uni, NULL);
 291     if (left >= *remain)
 292         return FALSE;
 293
 294     left = g_unichar_to_utf8 (uni, *out);
 295     (*out) += left;
 296     (*remain) -= left;
 297     return TRUE;
 298 }
 299
 300 /* --------------------------------------------------------------------------------------------- */
 301
 302 static gboolean
 303 str_utf8_tolower (const char *text, char **out, size_t *remain)
 304 {
 305     gunichar uni;
 306     size_t left;
 307
 308     uni = g_utf8_get_char_validated (text, -1);
 309     if (uni == (gunichar) (-1) || uni == (gunichar) (-2))
 310         return FALSE;
 311
 312     uni = g_unichar_tolower (uni);
 313     left = g_unichar_to_utf8 (uni, NULL);
 314     if (left >= *remain)
 315         return FALSE;
 316
 317     left = g_unichar_to_utf8 (uni, *out);
 318     (*out) += left;
 319     (*remain) -= left;
 320     return TRUE;
 321 }
 322
 323 /* --------------------------------------------------------------------------------------------- */
 324
 325 static int
 326 str_utf8_length (const char *text)
 327 {
 328     int result = 0;
 329     const char *start;
 330     const char *end;
 331
 332     start = text;
 333     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
 334     {
 335         if (start != end)
 336             result += g_utf8_strlen (start, end - start);
 337
 338         result++;
 339         start = end + 1;
 340     }
 341
 342     if (start == text)
 343         result = g_utf8_strlen (text, -1);
 344     else if (start[0] != '\0' && start != end)
 345         result += g_utf8_strlen (start, end - start);
 346
 347     return result;
 348 }
 349
 350 /* --------------------------------------------------------------------------------------------- */
 351
 352 static int
 353 str_utf8_length2 (const char *text, int size)
 354 {
 355     int result = 0;
 356     const char *start;
 357     const char *end;
 358
 359     start = text;
 360     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0' && size > 0)
 361     {
 362         if (start != end)
 363         {
 364             result += g_utf8_strlen (start, MIN (end - start, size));
 365             size -= end - start;
 366         }
 367         result += (size > 0);
 368         size--;
 369         start = end + 1;
 370     }
 371
 372     if (start == text)
 373         result = g_utf8_strlen (text, size);
 374     else if (start[0] != '\0' && start != end && size > 0)
 375         result += g_utf8_strlen (start, MIN (end - start, size));
 376
 377     return result;
 378 }
 379
 380 /* --------------------------------------------------------------------------------------------- */
 381
 382 static int
 383 str_utf8_length_noncomb (const char *text)
 384 {
 385     int result = 0;
 386     const char *t = text;
 387
 388     while (t[0] != '\0')
 389     {
 390         str_utf8_cnext_noncomb_char (&t);
 391         result++;
 392     }
 393
 394     return result;
 395 }
 396
 397 /* --------------------------------------------------------------------------------------------- */
 398
 399 #if 0                       /* compiled out */
 400 static void
 401 str_utf8_questmark_sustb (char **string, size_t *left, GString *buffer)
 402 {
 403     char *next;
 404
 405     next = g_utf8_next_char (*string);      /* step over one character of the input */
 406     (*left) -= next - (*string);            /* account for the bytes skipped */
 407     (*string) = next;
 408     g_string_append_c (buffer, '?');        /* emit '?' in its place */
 409 }
 410 #endif
 411
 412 /* --------------------------------------------------------------------------------------------- */
 413
 414 static gchar *
 415 str_utf8_conv_gerror_message (GError *mcerror, const char *def_msg)
 416 {
 417     if (mcerror != NULL)
 418         return g_strdup (mcerror->message);
 419
 420     return g_strdup (def_msg != NULL ? def_msg : "");
 421 }
 422
 423 /* --------------------------------------------------------------------------------------------- */
 424
 425 static estr_t
 426 str_utf8_vfs_convert_to (GIConv coder, const char *string, int size, GString *buffer)
 427 {
 428     estr_t result = ESTR_SUCCESS;
 429
 430     if (coder == str_cnv_not_convert)
 431         g_string_append_len (buffer, string, size);
 432     else
 433         result = str_nconvert (coder, string, size, buffer);
 434
 435     return result;
 436 }
 437
 438 /* --------------------------------------------------------------------------------------------- */
 439 /* utility function that makes the string valid UTF-8 with all characters printable;
 440  * it also computes the width of the string */
 441
 442 static const struct term_form *
 443 str_utf8_make_make_term_form (const char *text, size_t length)
 444 {
 445     static struct term_form result;
 446     gunichar uni;
 447     size_t left;
 448     char *actual;
 449
 450     result.text[0] = '\0';
 451     result.width = 0;
 452     result.compose = FALSE;
 453     actual = result.text;
 454
 455     /* check if text start with combining character,
 456      * add space at begin in this case */
 457     if (length != 0 && text[0] != '\0')
 458     {
 459         uni = g_utf8_get_char_validated (text, -1);
 460         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2))
 461             && str_unichar_iscombiningmark (uni))
 462         {
 463             actual[0] = ' ';
 464             actual++;
 465             result.width++;
 466             result.compose = TRUE;
 467         }
 468     }
 469
 470     while (length != 0 && text[0] != '\0')
 471     {
 472         uni = g_utf8_get_char_validated (text, -1);
 473         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 474         {
 475             if (g_unichar_isprint (uni))
 476             {
 477                 left = g_unichar_to_utf8 (uni, actual);
 478                 actual += left;
 479                 if (str_unichar_iscombiningmark (uni))
 480                     result.compose = TRUE;
 481                 else
 482                 {
 483                     result.width++;
 484                     if (g_unichar_iswide (uni))
 485                         result.width++;
 486                 }
 487             }
 488             else
 489             {
 490                 actual[0] = '.';
 491                 actual++;
 492                 result.width++;
 493             }
 494             text = g_utf8_next_char (text);
 495         }
 496         else
 497         {
 498             size_t repl_len;
 499
 500             text++;
 501             /*actual[0] = '?'; */
 502             repl_len = strlen (replch);
 503             memcpy (actual, replch, repl_len);
 504             actual += repl_len;
 505             result.width++;
 506         }
 507
 508         if (length != (size_t) (-1))
 509             length--;
 510     }
 511     actual[0] = '\0';
 512
 513     return &result;
 514 }
 515
 516 /* --------------------------------------------------------------------------------------------- */
 517
 518 static const char *
 519 str_utf8_term_form (const char *text)
 520 {
 521     static char result[BUF_MEDIUM * MB_LEN_MAX];
 522     const struct term_form *pre_form;
 523
 524     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 525     if (pre_form->compose)
 526     {
 527         char *composed;
 528
 529         composed = g_utf8_normalize (pre_form->text, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 530         g_strlcpy (result, composed, sizeof (result));
 531         g_free (composed);
 532     }
 533     else
 534         g_strlcpy (result, pre_form->text, sizeof (result));
 535
 536     return result;
 537 }
 538
 539 /* --------------------------------------------------------------------------------------------- */
 540 /* utility function, that copies all characters from checked to actual */
 541
 542 static gboolean
 543 utf8_tool_copy_chars_to_end (struct utf8_tool *tool)
 544 {
 545     tool->compose = FALSE;
 546
 547     while (tool->checked[0] != '\0')
 548     {
 549         gunichar uni;
 550         size_t left;
 551
 552         uni = g_utf8_get_char (tool->checked);
 553         tool->compose = tool->compose || str_unichar_iscombiningmark (uni);
 554         left = g_unichar_to_utf8 (uni, NULL);
 555         if (tool->remain <= left)
 556             return FALSE;
 557         left = g_unichar_to_utf8 (uni, tool->actual);
 558         tool->actual += left;
 559         tool->remain -= left;
 560         tool->checked = g_utf8_next_char (tool->checked);
 561     }
 562
 563     return TRUE;
 564 }
 565
 566 /* --------------------------------------------------------------------------------------------- */
 567 /* utility function, that copies characters from checked to actual until ident is
 568  * smaller than to_ident */
 569
 570 static gboolean
 571 utf8_tool_copy_chars_to (struct utf8_tool *tool, int to_ident)
 572 {
 573     tool->compose = FALSE;
 574
 575     while (tool->checked[0] != '\0')
 576     {
 577         gunichar uni;
 578         size_t left;
 579         int w = 0;
 580
 581         uni = g_utf8_get_char (tool->checked);
 582         if (str_unichar_iscombiningmark (uni))
 583             tool->compose = TRUE;
 584         else
 585         {
 586             w = 1;
 587             if (g_unichar_iswide (uni))
 588                 w++;
 589             if (tool->ident + w > to_ident)
 590                 return TRUE;
 591         }
 592
 593         left = g_unichar_to_utf8 (uni, NULL);
 594         if (tool->remain <= left)
 595             return FALSE;
 596         left = g_unichar_to_utf8 (uni, tool->actual);
 597         tool->actual += left;
 598         tool->remain -= left;
 599         tool->checked = g_utf8_next_char (tool->checked);
 600         tool->ident += w;
 601     }
 602
 603     return TRUE;
 604 }
 605
 606 /* --------------------------------------------------------------------------------------------- */
 607 /* utility function, adds count spaces to actual */
 608
 609 static int
 610 utf8_tool_insert_space (struct utf8_tool *tool, int count)
 611 {
 612     if (count <= 0)
 613         return 1;
 614     if (tool->remain <= (gsize) count)
 615         return 0;
 616
 617     memset (tool->actual, ' ', count);
 618     tool->actual += count;
 619     tool->remain -= count;
 620     return 1;
 621 }
 622
 623 /* --------------------------------------------------------------------------------------------- */
 624 /* utility function, adds one character to actual */
 625
 626 static int
 627 utf8_tool_insert_char (struct utf8_tool *tool, char ch)
 628 {
 629     if (tool->remain <= 1)
 630         return 0;
 631
 632     tool->actual[0] = ch;
 633     tool->actual++;
 634     tool->remain--;
 635     return 1;
 636 }
 637
 638 /* --------------------------------------------------------------------------------------------- */
 639 /* utility function that skips characters from checked until ident is greater than or
 640  * equal to to_ident */
 641
 642 static gboolean
 643 utf8_tool_skip_chars_to (struct utf8_tool *tool, int to_ident)
 644 {
 645     gunichar uni;
 646
 647     while (to_ident > tool->ident && tool->checked[0] != '\0')
 648     {
 649         uni = g_utf8_get_char (tool->checked);
 650         if (!str_unichar_iscombiningmark (uni))
 651         {
 652             tool->ident++;
 653             if (g_unichar_iswide (uni))
 654                 tool->ident++;
 655         }
 656         tool->checked = g_utf8_next_char (tool->checked);
 657     }
 658
 659     uni = g_utf8_get_char (tool->checked);
 660     while (str_unichar_iscombiningmark (uni))
 661     {
 662         tool->checked = g_utf8_next_char (tool->checked);
 663         uni = g_utf8_get_char (tool->checked);
 664     }
 665
 666     return TRUE;
 667 }
 668
 669 /* --------------------------------------------------------------------------------------------- */
 670
 671 static void
 672 utf8_tool_compose (char *buffer, size_t size)
 673 {
 674     char *composed;
 675
 676     composed = g_utf8_normalize (buffer, -1, G_NORMALIZE_DEFAULT_COMPOSE);
 677     g_strlcpy (buffer, composed, size);
 678     g_free (composed);
 679 }
 680
 681 /* --------------------------------------------------------------------------------------------- */
 682
 683 static const char *
 684 str_utf8_fit_to_term (const char *text, int width, align_crt_t just_mode)
 685 {
 686     static char result[BUF_MEDIUM * MB_LEN_MAX];
 687     const struct term_form *pre_form;
 688     struct utf8_tool tool;
 689
 690     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 691     tool.checked = pre_form->text;
 692     tool.actual = result;
 693     tool.remain = sizeof (result);
 694     tool.compose = FALSE;
 695
 696     if (pre_form->width <= (gsize) width)
 697     {
 698         switch (HIDE_FIT (just_mode))
 699         {
 700         case J_CENTER_LEFT:
 701         case J_CENTER:
 702             tool.ident = (width - pre_form->width) / 2;
 703             break;
 704         case J_RIGHT:
 705             tool.ident = width - pre_form->width;
 706             break;
 707         default:
 708             tool.ident = 0;
 709             break;
 710         }
 711
 712         utf8_tool_insert_space (&tool, tool.ident);
 713         utf8_tool_copy_chars_to_end (&tool);
 714         utf8_tool_insert_space (&tool, width - pre_form->width - tool.ident);
 715     }
 716     else if (IS_FIT (just_mode))
 717     {
 718         tool.ident = 0;
 719         utf8_tool_copy_chars_to (&tool, width / 2);
 720         utf8_tool_insert_char (&tool, '~');
 721
 722         tool.ident = 0;
 723         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 724         utf8_tool_copy_chars_to_end (&tool);
 725         utf8_tool_insert_space (&tool, width - (pre_form->width - tool.ident + 1));
 726     }
 727     else
 728     {
 729         switch (HIDE_FIT (just_mode))
 730         {
 731         case J_CENTER:
 732             tool.ident = (width - pre_form->width) / 2;
 733             break;
 734         case J_RIGHT:
 735             tool.ident = width - pre_form->width;
 736             break;
 737         default:
 738             tool.ident = 0;
 739             break;
 740         }
 741
 742         utf8_tool_skip_chars_to (&tool, 0);
 743         utf8_tool_insert_space (&tool, tool.ident);
 744         utf8_tool_copy_chars_to (&tool, width);
 745         utf8_tool_insert_space (&tool, width - tool.ident);
 746     }
 747
 748     tool.actual[0] = '\0';
 749     if (tool.compose)
 750         utf8_tool_compose (result, sizeof (result));
 751     return result;
 752 }
 753
 754 /* --------------------------------------------------------------------------------------------- */
 755
 756 static const char *
 757 str_utf8_term_trim (const char *text, int width)
 758 {
 759     static char result[BUF_MEDIUM * MB_LEN_MAX];
 760     const struct term_form *pre_form;
 761     struct utf8_tool tool;
 762
 763     if (width < 1)
 764     {
 765         result[0] = '\0';
 766         return result;
 767     }
 768
 769     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 770
 771     tool.checked = pre_form->text;
 772     tool.actual = result;
 773     tool.remain = sizeof (result);
 774     tool.compose = FALSE;
 775
 776     if ((gsize) width >= pre_form->width)
 777         utf8_tool_copy_chars_to_end (&tool);
 778     else if (width <= 3)
 779     {
 780         memset (tool.actual, '.', width);
 781         tool.actual += width;
 782         tool.remain -= width;
 783     }
 784     else
 785     {
 786         memset (tool.actual, '.', 3);
 787         tool.actual += 3;
 788         tool.remain -= 3;
 789
 790         tool.ident = 0;
 791         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 3);
 792         utf8_tool_copy_chars_to_end (&tool);
 793     }
 794
 795     tool.actual[0] = '\0';
 796     if (tool.compose)
 797         utf8_tool_compose (result, sizeof (result));
 798     return result;
 799 }
 800
 801 /* --------------------------------------------------------------------------------------------- */
 802
 803 static int
 804 str_utf8_term_width2 (const char *text, size_t length)
 805 {
 806     const struct term_form *result;
 807
 808     result = str_utf8_make_make_term_form (text, length);
 809     return result->width;
 810 }
 811
 812 /* --------------------------------------------------------------------------------------------- */
 813
 814 static int
 815 str_utf8_term_width1 (const char *text)
 816 {
 817     return str_utf8_term_width2 (text, (size_t) (-1));
 818 }
 819
 820 /* --------------------------------------------------------------------------------------------- */
 821
 822 static int
 823 str_utf8_term_char_width (const char *text)
 824 {
 825     gunichar uni;
 826
 827     uni = g_utf8_get_char_validated (text, -1);
 828     return (str_unichar_iscombiningmark (uni)) ? 0 : ((g_unichar_iswide (uni)) ? 2 : 1);
 829 }
 830
 831 /* --------------------------------------------------------------------------------------------- */
 832
 833 static const char *
 834 str_utf8_term_substring (const char *text, int start, int width)
 835 {
 836     static char result[BUF_MEDIUM * MB_LEN_MAX];
 837     const struct term_form *pre_form;
 838     struct utf8_tool tool;
 839
 840     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 841
 842     tool.checked = pre_form->text;
 843     tool.actual = result;
 844     tool.remain = sizeof (result);
 845     tool.compose = FALSE;
 846
 847     tool.ident = -start;
 848     utf8_tool_skip_chars_to (&tool, 0);
 849     if (tool.ident < 0)
 850         tool.ident = 0;
 851     utf8_tool_insert_space (&tool, tool.ident);
 852
 853     utf8_tool_copy_chars_to (&tool, width);
 854     utf8_tool_insert_space (&tool, width - tool.ident);
 855
 856     tool.actual[0] = '\0';
 857     if (tool.compose)
 858         utf8_tool_compose (result, sizeof (result));
 859     return result;
 860 }
 861
 862 /* --------------------------------------------------------------------------------------------- */
 863
 864 static const char *
 865 str_utf8_trunc (const char *text, int width)
 866 {
 867     static char result[MC_MAXPATHLEN * MB_LEN_MAX * 2];
 868     const struct term_form *pre_form;
 869     struct utf8_tool tool;
 870
 871     pre_form = str_utf8_make_make_term_form (text, (size_t) (-1));
 872
 873     tool.checked = pre_form->text;
 874     tool.actual = result;
 875     tool.remain = sizeof (result);
 876     tool.compose = FALSE;
 877
 878     if (pre_form->width <= (gsize) width)
 879         utf8_tool_copy_chars_to_end (&tool);
 880     else
 881     {
 882         tool.ident = 0;
 883         utf8_tool_copy_chars_to (&tool, width / 2);
 884         utf8_tool_insert_char (&tool, '~');
 885
 886         tool.ident = 0;
 887         utf8_tool_skip_chars_to (&tool, pre_form->width - width + 1);
 888         utf8_tool_copy_chars_to_end (&tool);
 889     }
 890
 891     tool.actual[0] = '\0';
 892     if (tool.compose)
 893         utf8_tool_compose (result, sizeof (result));
 894     return result;
 895 }
 896
 897 /* --------------------------------------------------------------------------------------------- */
 898
 899 static int
 900 str_utf8_offset_to_pos (const char *text, size_t length)
 901 {
 902     if (str_utf8_is_valid_string (text))
 903         return g_utf8_offset_to_pointer (text, length) - text;
 904     else
 905     {
 906         int result;
 907         char *buffer;
 908
 909         buffer = g_strdup (text);
 910         str_utf8_fix_string (buffer);
 911         result = g_utf8_offset_to_pointer (buffer, length) - buffer;
 912         g_free (buffer);
 913         return result;
 914     }
 915 }
 916
 917 /* --------------------------------------------------------------------------------------------- */
 918
 919 static int
 920 str_utf8_column_to_pos (const char *text, size_t pos)
 921 {
 922     int result = 0;
 923     int width = 0;
 924
 925     while (text[0] != '\0')
 926     {
 927         gunichar uni;
 928
 929         uni = g_utf8_get_char_validated (text, MB_LEN_MAX);
 930         if ((uni != (gunichar) (-1)) && (uni != (gunichar) (-2)))
 931         {
 932             if (g_unichar_isprint (uni))
 933             {
 934                 if (!str_unichar_iscombiningmark (uni))
 935                 {
 936                     width++;
 937                     if (g_unichar_iswide (uni))
 938                         width++;
 939                 }
 940             }
 941             else
 942             {
 943                 width++;
 944             }
 945             text = g_utf8_next_char (text);
 946         }
 947         else
 948         {
 949             text++;
 950             width++;
 951         }
 952
 953         if ((gsize) width > pos)
 954             return result;
 955
 956         result++;
 957     }
 958
 959     return result;
 960 }
 961
 962 /* --------------------------------------------------------------------------------------------- */
 963
 964 static char *
 965 str_utf8_create_search_needle (const char *needle, gboolean case_sen)
 966 {
 967     char *fold, *result;
 968
 969     if (needle == NULL)
 970         return NULL;
 971
 972     if (case_sen)
 973         return g_utf8_normalize (needle, -1, G_NORMALIZE_ALL);
 974
 975     fold = g_utf8_casefold (needle, -1);
 976     result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
 977     g_free (fold);
 978     return result;
 979 }
 980
 981 /* --------------------------------------------------------------------------------------------- */
 982
 983 static void
 984 str_utf8_release_search_needle (char *needle, gboolean case_sen)
 985 {
 986     (void) case_sen;
 987     g_free (needle);
 988 }
 989
 990 /* --------------------------------------------------------------------------------------------- */
 991
 992 static const char *
 993 str_utf8_search_first (const char *text, const char *search, gboolean case_sen)
 994 {
 995     char *deco_text;
 996     const char *match;
 997     const char *result = NULL;
 998     size_t search_len;
 999
1000     if (case_sen)
1001         deco_text = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1002     else
1003     {
1004         char *fold_text;
1005
1006         fold_text = g_utf8_casefold (text, -1);
1007         deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1008         g_free (fold_text);
1009     }
1010
1011     search_len = strlen (search);
1012
1013     match = deco_text;
1014     do
1015     {
1016         match = g_strstr_len (match, -1, search);
1017         if (match != NULL)
1018         {
1019             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1020                 !str_utf8_iscombiningmark (match + search_len))
1021             {
1022                 const char *m = deco_text;
1023
1024                 result = text;
1025                 while (m < match)
1026                 {
1027                     str_utf8_cnext_noncomb_char (&m);
1028                     str_utf8_cnext_noncomb_char (&result);
1029                 }
1030             }
1031             else
1032                 str_utf8_cnext_char (&match);
1033         }
1034     }
1035     while (match != NULL && result == NULL);
1036
1037     g_free (deco_text);
1038
1039     return result;
1040 }
1041
1042 /* --------------------------------------------------------------------------------------------- */
1043
1044 static const char *
1045 str_utf8_search_last (const char *text, const char *search, gboolean case_sen)
1046 {
1047     char *deco_text;
1048     char *match;
1049     const char *result = NULL;
1050     size_t search_len;
1051
1052     if (case_sen)
1053         deco_text = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1054     else
1055     {
1056         char *fold_text;
1057
1058         fold_text = g_utf8_casefold (text, -1);
1059         deco_text = g_utf8_normalize (fold_text, -1, G_NORMALIZE_ALL);
1060         g_free (fold_text);
1061     }
1062
1063     search_len = strlen (search);
1064
1065     do
1066     {
1067         match = g_strrstr_len (deco_text, -1, search);
1068         if (match != NULL)
1069         {
1070             if ((!str_utf8_iscombiningmark (match) || (match == deco_text)) &&
1071                 !str_utf8_iscombiningmark (match + search_len))
1072             {
1073                 const char *m = deco_text;
1074
1075                 result = text;
1076                 while (m < match)
1077                 {
1078                     str_utf8_cnext_noncomb_char (&m);
1079                     str_utf8_cnext_noncomb_char (&result);
1080                 }
1081             }
1082             else
1083                 match[0] = '\0';
1084         }
1085     }
1086     while (match != NULL && result == NULL);
1087
1088     g_free (deco_text);
1089
1090     return result;
1091 }
1092
1093 /* --------------------------------------------------------------------------------------------- */
1094
1095 static char *
1096 str_utf8_normalize (const char *text)
1097 {
1098     GString *fixed;
1099     char *tmp;
1100     char *result;
1101     const char *start;
1102     const char *end;
1103
1104     /* g_utf8_normalize() is a heavyweight function, that converts UTF-8 into UCS-4,
1105      * does the normalization and then converts UCS-4 back into UTF-8.
1106      * Since file names are composed of ASCII characters in most cases, we can speed up
1107      * utf8 normalization by checking if the heavyweight Unicode normalization is actually
1108      * needed. Normalization of ASCII string is no-op.
1109      */
1110
1111     /* find out whether text is ASCII only */
1112     for (end = text; *end != '\0'; end++)
1113         if ((*end & 0x80) != 0)
1114         {
1115             /* found 2nd byte of utf8-encoded symbol */
1116             break;
1117         }
1118
1119     /* if text is ASCII-only, return copy, normalize otherwise */
1120     if (*end == '\0')
1121         return g_strndup (text, end - text);
1122
1123     fixed = g_string_sized_new (4);
1124
1125     start = text;
1126     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1127     {
1128         if (start != end)
1129         {
1130             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1131             g_string_append (fixed, tmp);
1132             g_free (tmp);
1133         }
1134         g_string_append_c (fixed, end[0]);
1135         start = end + 1;
1136     }
1137
1138     if (start == text)
1139     {
1140         result = g_utf8_normalize (text, -1, G_NORMALIZE_ALL);
1141         g_string_free (fixed, TRUE);
1142     }
1143     else
1144     {
1145         if (start[0] != '\0' && start != end)
1146         {
1147             tmp = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL);
1148             g_string_append (fixed, tmp);
1149             g_free (tmp);
1150         }
1151         result = g_string_free (fixed, FALSE);
1152     }
1153
1154     return result;
1155 }
1156
1157 /* --------------------------------------------------------------------------------------------- */
1158
1159 static char *
1160 str_utf8_casefold_normalize (const char *text)
1161 {
1162     GString *fixed;
1163     char *tmp, *fold;
1164     char *result;
1165     const char *start;
1166     const char *end;
1167
1168     fixed = g_string_sized_new (4);
1169
1170     start = text;
1171     while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1172     {
1173         if (start != end)
1174         {
1175             fold = g_utf8_casefold (start, end - start);
1176             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1177             g_string_append (fixed, tmp);
1178             g_free (tmp);
1179             g_free (fold);
1180         }
1181         g_string_append_c (fixed, end[0]);
1182         start = end + 1;
1183     }
1184
1185     if (start == text)
1186     {
1187         fold = g_utf8_casefold (text, -1);
1188         result = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1189         g_free (fold);
1190         g_string_free (fixed, TRUE);
1191     }
1192     else
1193     {
1194         if (start[0] != '\0' && start != end)
1195         {
1196             fold = g_utf8_casefold (start, end - start);
1197             tmp = g_utf8_normalize (fold, -1, G_NORMALIZE_ALL);
1198             g_string_append (fixed, tmp);
1199             g_free (tmp);
1200             g_free (fold);
1201         }
1202         result = g_string_free (fixed, FALSE);
1203     }
1204
1205     return result;
1206 }
1207
1208 /* --------------------------------------------------------------------------------------------- */
1209
/**
 * Compare two strings byte-wise after normalizing both.
 *
 * @return the strcmp() result for the normalized forms of @t1 and @t2
 */
static int
str_utf8_compare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1226
1227 /* --------------------------------------------------------------------------------------------- */
1228
/**
 * Compare the normalized forms of two strings over the length of the shorter one.
 *
 * @return the strncmp() result limited to the byte length of the shorter
 *         normalized string
 */
static int
str_utf8_ncompare (const char *t1, const char *t2)
{
    char *left = str_utf8_normalize (t1);
    char *right = str_utf8_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    /* only the part both strings have in common is compared */
    const size_t span = (left_len < right_len) ? left_len : right_len;
    const int order = strncmp (left, right, span);

    g_free (left);
    g_free (right);

    return order;
}
1248
1249 /* --------------------------------------------------------------------------------------------- */
1250
/**
 * Compare two strings byte-wise after case-folding and normalizing both.
 *
 * @return the strcmp() result for the folded, normalized forms of @t1 and @t2
 */
static int
str_utf8_casecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const int order = strcmp (left, right);

    g_free (left);
    g_free (right);

    return order;
}
1267
1268 /* --------------------------------------------------------------------------------------------- */
1269
/**
 * Compare the case-folded, normalized forms of two strings over the length of the
 * shorter one.
 *
 * @return the strncmp() result limited to the byte length of the shorter
 *         converted string
 */
static int
str_utf8_ncasecmp (const char *t1, const char *t2)
{
    char *left = str_utf8_casefold_normalize (t1);
    char *right = str_utf8_casefold_normalize (t2);
    const size_t left_len = strlen (left);
    const size_t right_len = strlen (right);
    /* only the part both strings have in common is compared */
    const size_t span = (left_len < right_len) ? left_len : right_len;
    const int order = strncmp (left, right, span);

    g_free (left);
    g_free (right);

    return order;
}
1289
1290 /* --------------------------------------------------------------------------------------------- */
1291
/**
 * Measure how much of @prefix matches the beginning of @text.
 *
 * Both strings are normalized and then compared character by character.
 *
 * @return number of bytes of the normalized @prefix that match the start of the
 *         normalized @text
 */
static int
str_utf8_prefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_normalize (text);
    char *norm_prefix = str_utf8_normalize (prefix);
    const char *text_cur = norm_text, *text_next = norm_text;
    const char *prefix_cur = norm_prefix, *prefix_next = norm_prefix;
    int matched;

    for (;;)
    {
        size_t text_step, prefix_step;

        if (text_cur[0] == '\0' || prefix_cur[0] == '\0')
            break;

        /* step one character ahead in both strings */
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        text_step = (size_t) (text_next - text_cur);
        prefix_step = (size_t) (prefix_next - prefix_cur);

        /* characters differ if their encoded lengths or their bytes differ */
        if (text_step != prefix_step || strncmp (text_cur, prefix_cur, text_step) != 0)
            break;

        text_cur = text_next;
        prefix_cur = prefix_next;
    }

    matched = prefix_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1326
1327 /* --------------------------------------------------------------------------------------------- */
1328
/**
 * Measure how much of @prefix matches the beginning of @text, ignoring case.
 *
 * Both strings are case-folded and normalized, then compared character by character.
 *
 * @return number of bytes of the converted @prefix that match the start of the
 *         converted @text
 */
static int
str_utf8_caseprefix (const char *text, const char *prefix)
{
    char *norm_text = str_utf8_casefold_normalize (text);
    char *norm_prefix = str_utf8_casefold_normalize (prefix);
    const char *text_cur = norm_text, *text_next = norm_text;
    const char *prefix_cur = norm_prefix, *prefix_next = norm_prefix;
    int matched;

    for (;;)
    {
        size_t text_step, prefix_step;

        if (text_cur[0] == '\0' || prefix_cur[0] == '\0')
            break;

        /* step one character ahead in both strings */
        str_utf8_cnext_char_safe (&text_next);
        str_utf8_cnext_char_safe (&prefix_next);

        text_step = (size_t) (text_next - text_cur);
        prefix_step = (size_t) (prefix_next - prefix_cur);

        /* characters differ if their encoded lengths or their bytes differ */
        if (text_step != prefix_step || strncmp (text_cur, prefix_cur, text_step) != 0)
            break;

        text_cur = text_next;
        prefix_cur = prefix_next;
    }

    matched = prefix_cur - norm_prefix;

    g_free (norm_text);
    g_free (norm_prefix);

    return matched;
}
1363
1364 /* --------------------------------------------------------------------------------------------- */
1365
1366 static char *
1367 str_utf8_create_key_gen (const char *text, gboolean case_sen,
1368                          gchar *(*keygen) (const gchar *text, gssize size))
1369 {
1370     char *result;
1371
1372     if (case_sen)
1373         result = str_utf8_normalize (text);
1374     else
1375     {
1376         gboolean dot;
1377         GString *fixed;
1378         const char *start, *end;
1379         char *fold, *key;
1380
1381         dot = text[0] == '.';
1382         fixed = g_string_sized_new (16);
1383
1384         if (!dot)
1385             start = text;
1386         else
1387         {
1388             start = text + 1;
1389             g_string_append_c (fixed, '.');
1390         }
1391
1392         while (!g_utf8_validate (start, -1, &end) && start[0] != '\0')
1393         {
1394             if (start != end)
1395             {
1396                 fold = g_utf8_casefold (start, end - start);
1397                 key = keygen (fold, -1);
1398                 g_string_append (fixed, key);
1399                 g_free (key);
1400                 g_free (fold);
1401             }
1402             g_string_append_c (fixed, end[0]);
1403             start = end + 1;
1404         }
1405
1406         if (start == text)
1407         {
1408             fold = g_utf8_casefold (start, -1);
1409             result = keygen (fold, -1);
1410             g_free (fold);
1411             g_string_free (fixed, TRUE);
1412         }
1413         else if (dot && (start == text + 1))
1414         {
1415             fold = g_utf8_casefold (start, -1);
1416             key = keygen (fold, -1);
1417             g_string_append (fixed, key);
1418             g_free (key);
1419             g_free (fold);
1420             result = g_string_free (fixed, FALSE);
1421         }
1422         else
1423         {
1424             if (start[0] != '\0' && start != end)
1425             {
1426                 fold = g_utf8_casefold (start, end - start);
1427                 key = keygen (fold, -1);
1428                 g_string_append (fixed, key);
1429                 g_free (key);
1430                 g_free (fold);
1431             }
1432             result = g_string_free (fixed, FALSE);
1433         }
1434     }
1435     return result;
1436 }
1437
1438 /* --------------------------------------------------------------------------------------------- */
1439
1440 static char *
1441 str_utf8_create_key (const char *text, gboolean case_sen)
1442 {
1443     return str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key);
1444 }
1445
1446 /* --------------------------------------------------------------------------------------------- */
1447
#ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
/**
 * Build a sort key for a file name using g_utf8_collate_key_for_filename() as the
 * key generator.
 *
 * @return newly allocated key as produced by str_utf8_create_key_gen()
 */
static char *
str_utf8_create_key_for_filename (const char *text, gboolean case_sen)
{
    char *key;

    key = str_utf8_create_key_gen (text, case_sen, g_utf8_collate_key_for_filename);

    return key;
}
#endif
1455
1456 /* --------------------------------------------------------------------------------------------- */
1457
1458 static int
1459 str_utf8_key_collate (const char *t1, const char *t2, gboolean case_sen)
1460 {
1461     (void) case_sen;
1462     return strcmp (t1, t2);
1463 }
1464
1465 /* --------------------------------------------------------------------------------------------- */
1466
1467 static void
1468 str_utf8_release_key (char *key, gboolean case_sen)
1469 {
1470     (void) case_sen;
1471     g_free (key);
1472 }
1473
1474 /* --------------------------------------------------------------------------------------------- */
1475 /*** public functions ****************************************************************************/
1476 /* --------------------------------------------------------------------------------------------- */
1477
1478 struct str_class
1479 str_utf8_init (void)
1480 {
1481     struct str_class result;
1482
1483     result.conv_gerror_message = str_utf8_conv_gerror_message;
1484     result.vfs_convert_to = str_utf8_vfs_convert_to;
1485     result.insert_replace_char = str_utf8_insert_replace_char;
1486     result.is_valid_string = str_utf8_is_valid_string;
1487     result.is_valid_char = str_utf8_is_valid_char;
1488     result.cnext_char = str_utf8_cnext_char;
1489     result.cprev_char = str_utf8_cprev_char;
1490     result.cnext_char_safe = str_utf8_cnext_char_safe;
1491     result.cprev_char_safe = str_utf8_cprev_char_safe;
1492     result.cnext_noncomb_char = str_utf8_cnext_noncomb_char;
1493     result.cprev_noncomb_char = str_utf8_cprev_noncomb_char;
1494     result.char_isspace = str_utf8_isspace;
1495     result.char_ispunct = str_utf8_ispunct;
1496     result.char_isalnum = str_utf8_isalnum;
1497     result.char_isdigit = str_utf8_isdigit;
1498     result.char_isprint = str_utf8_isprint;
1499     result.char_iscombiningmark = str_utf8_iscombiningmark;
1500     result.char_toupper = str_utf8_toupper;
1501     result.char_tolower = str_utf8_tolower;
1502     result.length = str_utf8_length;
1503     result.length2 = str_utf8_length2;
1504     result.length_noncomb = str_utf8_length_noncomb;
1505     result.fix_string = str_utf8_fix_string;
1506     result.term_form = str_utf8_term_form;
1507     result.fit_to_term = str_utf8_fit_to_term;
1508     result.term_trim = str_utf8_term_trim;
1509     result.term_width2 = str_utf8_term_width2;
1510     result.term_width1 = str_utf8_term_width1;
1511     result.term_char_width = str_utf8_term_char_width;
1512     result.term_substring = str_utf8_term_substring;
1513     result.trunc = str_utf8_trunc;
1514     result.offset_to_pos = str_utf8_offset_to_pos;
1515     result.column_to_pos = str_utf8_column_to_pos;
1516     result.create_search_needle = str_utf8_create_search_needle;
1517     result.release_search_needle = str_utf8_release_search_needle;
1518     result.search_first = str_utf8_search_first;
1519     result.search_last = str_utf8_search_last;
1520     result.compare = str_utf8_compare;
1521     result.ncompare = str_utf8_ncompare;
1522     result.casecmp = str_utf8_casecmp;
1523     result.ncasecmp = str_utf8_ncasecmp;
1524     result.prefix = str_utf8_prefix;
1525     result.caseprefix = str_utf8_caseprefix;
1526     result.create_key = str_utf8_create_key;
1527 #ifdef MC__USE_STR_UTF8_CREATE_KEY_FOR_FILENAME
1528     /* case insensitive sort files in "a1 a2 a10" order */
1529     result.create_key_for_filename = str_utf8_create_key_for_filename;
1530 #else
1531     /* case insensitive sort files in "a1 a10 a2" order */
1532     result.create_key_for_filename = str_utf8_create_key;
1533 #endif
1534     result.key_collate = str_utf8_key_collate;
1535     result.release_key = str_utf8_release_key;
1536
1537     return result;
1538 }
1539
1540 /* --------------------------------------------------------------------------------------------- */