utf8.c

   1 /* $OpenBSD$ */
   2
   3 /*
   4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
  15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18
  19 #include <sys/types.h>
  20
  21 #include <ctype.h>
  22 #include <errno.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <wchar.h>
  26
  27 #include "tmux.h"
  28
  29 static const wchar_t utf8_force_wide[] = {
  30         0x0261D,
  31         0x026F9,
  32         0x0270A,
  33         0x0270B,
  34         0x0270C,
  35         0x0270D,
  36         0x1F1E6,
  37         0x1F1E7,
  38         0x1F1E8,
  39         0x1F1E9,
  40         0x1F1EA,
  41         0x1F1EB,
  42         0x1F1EC,
  43         0x1F1ED,
  44         0x1F1EE,
  45         0x1F1EF,
  46         0x1F1F0,
  47         0x1F1F1,
  48         0x1F1F2,
  49         0x1F1F3,
  50         0x1F1F4,
  51         0x1F1F5,
  52         0x1F1F6,
  53         0x1F1F7,
  54         0x1F1F8,
  55         0x1F1F9,
  56         0x1F1FA,
  57         0x1F1FB,
  58         0x1F1FC,
  59         0x1F1FD,
  60         0x1F1FE,
  61         0x1F1FF,
  62         0x1F385,
  63         0x1F3C2,
  64         0x1F3C3,
  65         0x1F3C4,
  66         0x1F3C7,
  67         0x1F3CA,
  68         0x1F3CB,
  69         0x1F3CC,
  70         0x1F3FB,
  71         0x1F3FC,
  72         0x1F3FD,
  73         0x1F3FE,
  74         0x1F3FF,
  75         0x1F442,
  76         0x1F443,
  77         0x1F446,
  78         0x1F447,
  79         0x1F448,
  80         0x1F449,
  81         0x1F44A,
  82         0x1F44B,
  83         0x1F44C,
  84         0x1F44D,
  85         0x1F44E,
  86         0x1F44F,
  87         0x1F450,
  88         0x1F466,
  89         0x1F467,
  90         0x1F468,
  91         0x1F469,
  92         0x1F46B,
  93         0x1F46C,
  94         0x1F46D,
  95         0x1F46E,
  96         0x1F470,
  97         0x1F471,
  98         0x1F472,
  99         0x1F473,
 100         0x1F474,
 101         0x1F475,
 102         0x1F476,
 103         0x1F477,
 104         0x1F478,
 105         0x1F47C,
 106         0x1F481,
 107         0x1F482,
 108         0x1F483,
 109         0x1F485,
 110         0x1F486,
 111         0x1F487,
 112         0x1F48F,
 113         0x1F491,
 114         0x1F4AA,
 115         0x1F574,
 116         0x1F575,
 117         0x1F57A,
 118         0x1F590,
 119         0x1F595,
 120         0x1F596,
 121         0x1F645,
 122         0x1F646,
 123         0x1F647,
 124         0x1F64B,
 125         0x1F64C,
 126         0x1F64D,
 127         0x1F64E,
 128         0x1F64F,
 129         0x1F6A3,
 130         0x1F6B4,
 131         0x1F6B5,
 132         0x1F6B6,
 133         0x1F6C0,
 134         0x1F6CC,
 135         0x1F90C,
 136         0x1F90F,
 137         0x1F918,
 138         0x1F919,
 139         0x1F91A,
 140         0x1F91B,
 141         0x1F91C,
 142         0x1F91D,
 143         0x1F91E,
 144         0x1F91F,
 145         0x1F926,
 146         0x1F930,
 147         0x1F931,
 148         0x1F932,
 149         0x1F933,
 150         0x1F934,
 151         0x1F935,
 152         0x1F936,
 153         0x1F937,
 154         0x1F938,
 155         0x1F939,
 156         0x1F93D,
 157         0x1F93E,
 158         0x1F977,
 159         0x1F9B5,
 160         0x1F9B6,
 161         0x1F9B8,
 162         0x1F9B9,
 163         0x1F9BB,
 164         0x1F9CD,
 165         0x1F9CE,
 166         0x1F9CF,
 167         0x1F9D1,
 168         0x1F9D2,
 169         0x1F9D3,
 170         0x1F9D4,
 171         0x1F9D5,
 172         0x1F9D6,
 173         0x1F9D7,
 174         0x1F9D8,
 175         0x1F9D9,
 176         0x1F9DA,
 177         0x1F9DB,
 178         0x1F9DC,
 179         0x1F9DD,
 180         0x1FAC3,
 181         0x1FAC4,
 182         0x1FAC5,
 183         0x1FAF0,
 184         0x1FAF1,
 185         0x1FAF2,
 186         0x1FAF3,
 187         0x1FAF4,
 188         0x1FAF5,
 189         0x1FAF6,
 190         0x1FAF7,
 191         0x1FAF8
 192 };
 193
/*
 * One stored multi-byte character. Each item is linked into two red-black
 * trees at once: utf8_index_tree (keyed by index) and utf8_data_tree (keyed
 * by size, then by the bytes in data).
 */
struct utf8_item {
        RB_ENTRY(utf8_item)     index_entry;    /* linkage in utf8_index_tree */
        u_int                   index;          /* value handed out by utf8_put_item() */

        RB_ENTRY(utf8_item)     data_entry;     /* linkage in utf8_data_tree */
        char                    data[UTF8_SIZE];        /* raw bytes, not NUL terminated */
        u_char                  size;           /* number of bytes used in data */
};
 202
 203 static int
 204 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
 205 {
 206         if (ui1->size < ui2->size)
 207                 return (-1);
 208         if (ui1->size > ui2->size)
 209                 return (1);
 210         return (memcmp(ui1->data, ui2->data, ui1->size));
 211 }
 212 RB_HEAD(utf8_data_tree, utf8_item);
 213 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
 214 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
 215
 216 static int
 217 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
 218 {
 219         if (ui1->index < ui2->index)
 220                 return (-1);
 221         if (ui1->index > ui2->index)
 222                 return (1);
 223         return (0);
 224 }
 225 RB_HEAD(utf8_index_tree, utf8_item);
 226 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
 227 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
 228
/* Index that utf8_put_item() assigns to the next newly stored item. */
static u_int utf8_next_index;

/*
 * Layout of a packed utf8_char as used by the macros below:
 *   bits  0-23  payload (up to three raw bytes, or an item index)
 *   bits 24-28  size in bytes (masked with 0x1f on extraction)
 *   bits 29+    width plus one
 * NOTE(review): this assumes utf8_char is a 32-bit unsigned type, so that
 * nothing remains above the width field after the shift - confirm in tmux.h.
 */
#define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
#define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)

/* Build the size and width fields; OR them together with the payload. */
#define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
#define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
 236
 237 /* Get a UTF-8 item from data. */
 238 static struct utf8_item *
 239 utf8_item_by_data(const u_char *data, size_t size)
 240 {
 241         struct utf8_item        ui;
 242
 243         memcpy(ui.data, data, size);
 244         ui.size = size;
 245
 246         return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
 247 }
 248
 249 /* Get a UTF-8 item from data. */
 250 static struct utf8_item *
 251 utf8_item_by_index(u_int index)
 252 {
 253         struct utf8_item        ui;
 254
 255         ui.index = index;
 256
 257         return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
 258 }
 259
 260 /* Add a UTF-8 item. */
 261 static int
 262 utf8_put_item(const u_char *data, size_t size, u_int *index)
 263 {
 264         struct utf8_item        *ui;
 265
 266         ui = utf8_item_by_data(data, size);
 267         if (ui != NULL) {
 268                 *index = ui->index;
 269                 log_debug("%s: found %.*s = %u", __func__, (int)size, data,
 270                     *index);
 271                 return (0);
 272         }
 273
 274         if (utf8_next_index == 0xffffff + 1)
 275                 return (-1);
 276
 277         ui = xcalloc(1, sizeof *ui);
 278         ui->index = utf8_next_index++;
 279         RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
 280
 281         memcpy(ui->data, data, size);
 282         ui->size = size;
 283         RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
 284
 285         *index = ui->index;
 286         log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
 287         return (0);
 288 }
 289
 290 static int
 291 utf8_table_cmp(const void *vp1, const void *vp2)
 292 {
 293         const wchar_t   *wc1 = vp1, *wc2 = vp2;
 294
 295         if (*wc1 < *wc2)
 296                 return (-1);
 297         if (*wc1 > *wc2)
 298                 return (1);
 299         return (0);
 300 }
 301
 302 /* Check if character in table. */
 303 int
 304 utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
 305 {
 306         wchar_t *found;
 307
 308         found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
 309         return (found != NULL);
 310 }
 311
 312 /* Get UTF-8 character from data. */
 313 enum utf8_state
 314 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
 315 {
 316         u_int   index;
 317
 318         if (ud->width > 2)
 319                 fatalx("invalid UTF-8 width: %u", ud->width);
 320
 321         if (ud->size > UTF8_SIZE)
 322                 goto fail;
 323         if (ud->size <= 3) {
 324                 index = (((utf8_char)ud->data[2] << 16)|
 325                           ((utf8_char)ud->data[1] << 8)|
 326                           ((utf8_char)ud->data[0]));
 327         } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
 328                 goto fail;
 329         *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
 330         log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
 331             (int)ud->size, ud->data, *uc);
 332         return (UTF8_DONE);
 333
 334 fail:
 335         if (ud->width == 0)
 336                 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
 337         else if (ud->width == 1)
 338                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
 339         else
 340                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
 341         return (UTF8_ERROR);
 342 }
 343
 344 /* Get UTF-8 data from character. */
 345 void
 346 utf8_to_data(utf8_char uc, struct utf8_data *ud)
 347 {
 348         struct utf8_item        *ui;
 349         u_int                    index;
 350
 351         memset(ud, 0, sizeof *ud);
 352         ud->size = ud->have = UTF8_GET_SIZE(uc);
 353         ud->width = UTF8_GET_WIDTH(uc);
 354
 355         if (ud->size <= 3) {
 356                 ud->data[2] = (uc >> 16);
 357                 ud->data[1] = ((uc >> 8) & 0xff);
 358                 ud->data[0] = (uc & 0xff);
 359         } else {
 360                 index = (uc & 0xffffff);
 361                 if ((ui = utf8_item_by_index(index)) == NULL)
 362                         memset(ud->data, ' ', ud->size);
 363                 else
 364                         memcpy(ud->data, ui->data, ud->size);
 365         }
 366
 367         log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
 368             (int)ud->size, ud->data);
 369 }
 370
 371 /* Get UTF-8 character from a single ASCII character. */
 372 u_int
 373 utf8_build_one(u_char ch)
 374 {
 375         return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
 376 }
 377
 378 /* Set a single character. */
 379 void
 380 utf8_set(struct utf8_data *ud, u_char ch)
 381 {
 382         static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
 383
 384         memcpy(ud, &empty, sizeof *ud);
 385         *ud->data = ch;
 386 }
 387
 388 /* Copy UTF-8 character. */
 389 void
 390 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
 391 {
 392         u_int   i;
 393
 394         memcpy(to, from, sizeof *to);
 395
 396         for (i = to->size; i < sizeof to->data; i++)
 397                 to->data[i] = '\0';
 398 }
 399
 400 /* Get width of Unicode character. */
 401 static enum utf8_state
 402 utf8_width(struct utf8_data *ud, int *width)
 403 {
 404         wchar_t wc;
 405
 406         if (utf8_towc(ud, &wc) != UTF8_DONE)
 407                 return (UTF8_ERROR);
 408         if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
 409                 *width = 2;
 410                 return (UTF8_DONE);
 411         }
 412         *width = wcwidth(wc);
 413         log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
 414         if (*width < 0) {
 415                 /*
 416                  * C1 control characters are nonprintable, so they are always
 417                  * zero width.
 418                  */
 419                 *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
 420         }
 421         if (*width >= 0 && *width <= 0xff)
 422                 return (UTF8_DONE);
 423         return (UTF8_ERROR);
 424 }
 425
 426 /* Convert UTF-8 character to wide character. */
 427 enum utf8_state
 428 utf8_towc(const struct utf8_data *ud, wchar_t *wc)
 429 {
 430         switch (mbtowc(wc, ud->data, ud->size)) {
 431         case -1:
 432                 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
 433                     errno);
 434                 mbtowc(NULL, NULL, MB_CUR_MAX);
 435                 return (UTF8_ERROR);
 436         case 0:
 437                 return (UTF8_ERROR);
 438         }
 439         log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
 440         return (UTF8_DONE);
 441 }
 442
 443 /*
 444  * Open UTF-8 sequence.
 445  *
 446  * 11000010-11011111 C2-DF start of 2-byte sequence
 447  * 11100000-11101111 E0-EF start of 3-byte sequence
 448  * 11110000-11110100 F0-F4 start of 4-byte sequence
 449  */
 450 enum utf8_state
 451 utf8_open(struct utf8_data *ud, u_char ch)
 452 {
 453         memset(ud, 0, sizeof *ud);
 454         if (ch >= 0xc2 && ch <= 0xdf)
 455                 ud->size = 2;
 456         else if (ch >= 0xe0 && ch <= 0xef)
 457                 ud->size = 3;
 458         else if (ch >= 0xf0 && ch <= 0xf4)
 459                 ud->size = 4;
 460         else
 461                 return (UTF8_ERROR);
 462         utf8_append(ud, ch);
 463         return (UTF8_MORE);
 464 }
 465
 466 /* Append character to UTF-8, closing if finished. */
 467 enum utf8_state
 468 utf8_append(struct utf8_data *ud, u_char ch)
 469 {
 470         int     width;
 471
 472         if (ud->have >= ud->size)
 473                 fatalx("UTF-8 character overflow");
 474         if (ud->size > sizeof ud->data)
 475                 fatalx("UTF-8 character size too large");
 476
 477         if (ud->have != 0 && (ch & 0xc0) != 0x80)
 478                 ud->width = 0xff;
 479
 480         ud->data[ud->have++] = ch;
 481         if (ud->have != ud->size)
 482                 return (UTF8_MORE);
 483
 484         if (ud->width == 0xff)
 485                 return (UTF8_ERROR);
 486         if (utf8_width(ud, &width) != UTF8_DONE)
 487                 return (UTF8_ERROR);
 488         ud->width = width;
 489
 490         return (UTF8_DONE);
 491 }
 492
 493 /*
 494  * Encode len characters from src into dst, which is guaranteed to have four
 495  * bytes available for each character from src (for \abc or UTF-8) plus space
 496  * for \0.
 497  */
 498 int
 499 utf8_strvis(char *dst, const char *src, size_t len, int flag)
 500 {
 501         struct utf8_data         ud;
 502         const char              *start = dst, *end = src + len;
 503         enum utf8_state          more;
 504         size_t                   i;
 505
 506         while (src < end) {
 507                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 508                         while (++src < end && more == UTF8_MORE)
 509                                 more = utf8_append(&ud, *src);
 510                         if (more == UTF8_DONE) {
 511                                 /* UTF-8 character finished. */
 512                                 for (i = 0; i < ud.size; i++)
 513                                         *dst++ = ud.data[i];
 514                                 continue;
 515                         }
 516                         /* Not a complete, valid UTF-8 character. */
 517                         src -= ud.have;
 518                 }
 519                 if (src[0] == '$' && src < end - 1) {
 520                         if (isalpha((u_char)src[1]) ||
 521                             src[1] == '_' ||
 522                             src[1] == '{')
 523                                 *dst++ = '\\';
 524                         *dst++ = '$';
 525                 } else if (src < end - 1)
 526                         dst = vis(dst, src[0], flag, src[1]);
 527                 else if (src < end)
 528                         dst = vis(dst, src[0], flag, '\0');
 529                 src++;
 530         }
 531         *dst = '\0';
 532         return (dst - start);
 533 }
 534
 535 /* Same as utf8_strvis but allocate the buffer. */
 536 int
 537 utf8_stravis(char **dst, const char *src, int flag)
 538 {
 539         char    *buf;
 540         int      len;
 541
 542         buf = xreallocarray(NULL, 4, strlen(src) + 1);
 543         len = utf8_strvis(buf, src, strlen(src), flag);
 544
 545         *dst = xrealloc(buf, len + 1);
 546         return (len);
 547 }
 548
 549 /* Same as utf8_strvis but allocate the buffer. */
 550 int
 551 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
 552 {
 553         char    *buf;
 554         int      len;
 555
 556         buf = xreallocarray(NULL, 4, srclen + 1);
 557         len = utf8_strvis(buf, src, srclen, flag);
 558
 559         *dst = xrealloc(buf, len + 1);
 560         return (len);
 561 }
 562
 563 /* Does this string contain anything that isn't valid UTF-8? */
 564 int
 565 utf8_isvalid(const char *s)
 566 {
 567         struct utf8_data ud;
 568         const char      *end;
 569         enum utf8_state  more;
 570
 571         end = s + strlen(s);
 572         while (s < end) {
 573                 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
 574                         while (++s < end && more == UTF8_MORE)
 575                                 more = utf8_append(&ud, *s);
 576                         if (more == UTF8_DONE)
 577                                 continue;
 578                         return (0);
 579                 }
 580                 if (*s < 0x20 || *s > 0x7e)
 581                         return (0);
 582                 s++;
 583         }
 584         return (1);
 585 }
 586
 587 /*
 588  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
 589  * the returned string. Anything not valid printable ASCII or UTF-8 is
 590  * stripped.
 591  */
 592 char *
 593 utf8_sanitize(const char *src)
 594 {
 595         char            *dst = NULL;
 596         size_t           n = 0;
 597         enum utf8_state  more;
 598         struct utf8_data ud;
 599         u_int            i;
 600
 601         while (*src != '\0') {
 602                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 603                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 604                         while (*++src != '\0' && more == UTF8_MORE)
 605                                 more = utf8_append(&ud, *src);
 606                         if (more == UTF8_DONE) {
 607                                 dst = xreallocarray(dst, n + ud.width,
 608                                     sizeof *dst);
 609                                 for (i = 0; i < ud.width; i++)
 610                                         dst[n++] = '_';
 611                                 continue;
 612                         }
 613                         src -= ud.have;
 614                 }
 615                 if (*src > 0x1f && *src < 0x7f)
 616                         dst[n++] = *src;
 617                 else
 618                         dst[n++] = '_';
 619                 src++;
 620         }
 621         dst = xreallocarray(dst, n + 1, sizeof *dst);
 622         dst[n] = '\0';
 623         return (dst);
 624 }
 625
 626 /* Get UTF-8 buffer length. */
 627 size_t
 628 utf8_strlen(const struct utf8_data *s)
 629 {
 630         size_t  i;
 631
 632         for (i = 0; s[i].size != 0; i++)
 633                 /* nothing */;
 634         return (i);
 635 }
 636
 637 /* Get UTF-8 string width. */
 638 u_int
 639 utf8_strwidth(const struct utf8_data *s, ssize_t n)
 640 {
 641         ssize_t i;
 642         u_int   width = 0;
 643
 644         for (i = 0; s[i].size != 0; i++) {
 645                 if (n != -1 && n == i)
 646                         break;
 647                 width += s[i].width;
 648         }
 649         return (width);
 650 }
 651
 652 /*
 653  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
 654  * Caller frees.
 655  */
 656 struct utf8_data *
 657 utf8_fromcstr(const char *src)
 658 {
 659         struct utf8_data        *dst = NULL;
 660         size_t                   n = 0;
 661         enum utf8_state          more;
 662
 663         while (*src != '\0') {
 664                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 665                 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
 666                         while (*++src != '\0' && more == UTF8_MORE)
 667                                 more = utf8_append(&dst[n], *src);
 668                         if (more == UTF8_DONE) {
 669                                 n++;
 670                                 continue;
 671                         }
 672                         src -= dst[n].have;
 673                 }
 674                 utf8_set(&dst[n], *src);
 675                 n++;
 676                 src++;
 677         }
 678         dst = xreallocarray(dst, n + 1, sizeof *dst);
 679         dst[n].size = 0;
 680         return (dst);
 681 }
 682
 683 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
 684 char *
 685 utf8_tocstr(struct utf8_data *src)
 686 {
 687         char    *dst = NULL;
 688         size_t   n = 0;
 689
 690         for(; src->size != 0; src++) {
 691                 dst = xreallocarray(dst, n + src->size, 1);
 692                 memcpy(dst + n, src->data, src->size);
 693                 n += src->size;
 694         }
 695         dst = xreallocarray(dst, n + 1, 1);
 696         dst[n] = '\0';
 697         return (dst);
 698 }
 699
 700 /* Get width of UTF-8 string. */
 701 u_int
 702 utf8_cstrwidth(const char *s)
 703 {
 704         struct utf8_data        tmp;
 705         u_int                   width;
 706         enum utf8_state         more;
 707
 708         width = 0;
 709         while (*s != '\0') {
 710                 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
 711                         while (*++s != '\0' && more == UTF8_MORE)
 712                                 more = utf8_append(&tmp, *s);
 713                         if (more == UTF8_DONE) {
 714                                 width += tmp.width;
 715                                 continue;
 716                         }
 717                         s -= tmp.have;
 718                 }
 719                 if (*s > 0x1f && *s != 0x7f)
 720                         width++;
 721                 s++;
 722         }
 723         return (width);
 724 }
 725
 726 /* Pad UTF-8 string to width on the left. Caller frees. */
 727 char *
 728 utf8_padcstr(const char *s, u_int width)
 729 {
 730         size_t   slen;
 731         char    *out;
 732         u_int    n, i;
 733
 734         n = utf8_cstrwidth(s);
 735         if (n >= width)
 736                 return (xstrdup(s));
 737
 738         slen = strlen(s);
 739         out = xmalloc(slen + 1 + (width - n));
 740         memcpy(out, s, slen);
 741         for (i = n; i < width; i++)
 742                 out[slen++] = ' ';
 743         out[slen] = '\0';
 744         return (out);
 745 }
 746
 747 /* Pad UTF-8 string to width on the right. Caller frees. */
 748 char *
 749 utf8_rpadcstr(const char *s, u_int width)
 750 {
 751         size_t   slen;
 752         char    *out;
 753         u_int    n, i;
 754
 755         n = utf8_cstrwidth(s);
 756         if (n >= width)
 757                 return (xstrdup(s));
 758
 759         slen = strlen(s);
 760         out = xmalloc(slen + 1 + (width - n));
 761         for (i = 0; i < width - n; i++)
 762                 out[i] = ' ';
 763         memcpy(out + i, s, slen);
 764         out[i + slen] = '\0';
 765         return (out);
 766 }
 767
 768 int
 769 utf8_cstrhas(const char *s, const struct utf8_data *ud)
 770 {
 771         struct utf8_data        *copy, *loop;
 772         int                      found = 0;
 773
 774         copy = utf8_fromcstr(s);
 775         for (loop = copy; loop->size != 0; loop++) {
 776                 if (loop->size != ud->size)
 777                         continue;
 778                 if (memcmp(loop->data, ud->data, loop->size) == 0) {
 779                         found = 1;
 780                         break;
 781                 }
 782         }
 783         free(copy);
 784
 785         return (found);
 786 }