utf8.c

   1 /* $OpenBSD$ */
   2
   3 /*
   4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
  15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18
  19 #include <sys/types.h>
  20
  21 #include <ctype.h>
  22 #include <errno.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <vis.h>
  26
  27 #include "tmux.h"
  28
  29 static const wchar_t utf8_force_wide[] = {
  30         0x0261D,
  31         0x026F9,
  32         0x0270A,
  33         0x0270B,
  34         0x0270C,
  35         0x0270D,
  36         0x1F1E6,
  37         0x1F1E7,
  38         0x1F1E8,
  39         0x1F1E9,
  40         0x1F1EA,
  41         0x1F1EB,
  42         0x1F1EC,
  43         0x1F1ED,
  44         0x1F1EE,
  45         0x1F1EF,
  46         0x1F1F0,
  47         0x1F1F1,
  48         0x1F1F2,
  49         0x1F1F3,
  50         0x1F1F4,
  51         0x1F1F5,
  52         0x1F1F6,
  53         0x1F1F7,
  54         0x1F1F8,
  55         0x1F1F9,
  56         0x1F1FA,
  57         0x1F1FB,
  58         0x1F1FC,
  59         0x1F1FD,
  60         0x1F1FE,
  61         0x1F1FF,
  62         0x1F385,
  63         0x1F3C2,
  64         0x1F3C3,
  65         0x1F3C4,
  66         0x1F3C7,
  67         0x1F3CA,
  68         0x1F3CB,
  69         0x1F3CC,
  70         0x1F3FB,
  71         0x1F3FC,
  72         0x1F3FD,
  73         0x1F3FE,
  74         0x1F3FF,
  75         0x1F442,
  76         0x1F443,
  77         0x1F446,
  78         0x1F447,
  79         0x1F448,
  80         0x1F449,
  81         0x1F44A,
  82         0x1F44B,
  83         0x1F44C,
  84         0x1F44D,
  85         0x1F44E,
  86         0x1F44F,
  87         0x1F450,
  88         0x1F466,
  89         0x1F467,
  90         0x1F468,
  91         0x1F469,
  92         0x1F46B,
  93         0x1F46C,
  94         0x1F46D,
  95         0x1F46E,
  96         0x1F470,
  97         0x1F471,
  98         0x1F472,
  99         0x1F473,
 100         0x1F474,
 101         0x1F475,
 102         0x1F476,
 103         0x1F477,
 104         0x1F478,
 105         0x1F47C,
 106         0x1F481,
 107         0x1F482,
 108         0x1F483,
 109         0x1F485,
 110         0x1F486,
 111         0x1F487,
 112         0x1F48F,
 113         0x1F491,
 114         0x1F4AA,
 115         0x1F574,
 116         0x1F575,
 117         0x1F57A,
 118         0x1F590,
 119         0x1F595,
 120         0x1F596,
 121         0x1F645,
 122         0x1F646,
 123         0x1F647,
 124         0x1F64B,
 125         0x1F64C,
 126         0x1F64D,
 127         0x1F64E,
 128         0x1F64F,
 129         0x1F6A3,
 130         0x1F6B4,
 131         0x1F6B5,
 132         0x1F6B6,
 133         0x1F6C0,
 134         0x1F6CC,
 135         0x1F90C,
 136         0x1F90F,
 137         0x1F918,
 138         0x1F919,
 139         0x1F91A,
 140         0x1F91B,
 141         0x1F91C,
 142         0x1F91D,
 143         0x1F91E,
 144         0x1F91F,
 145         0x1F926,
 146         0x1F930,
 147         0x1F931,
 148         0x1F932,
 149         0x1F933,
 150         0x1F934,
 151         0x1F935,
 152         0x1F936,
 153         0x1F937,
 154         0x1F938,
 155         0x1F939,
 156         0x1F93D,
 157         0x1F93E,
 158         0x1F977,
 159         0x1F9B5,
 160         0x1F9B6,
 161         0x1F9B8,
 162         0x1F9B9,
 163         0x1F9BB,
 164         0x1F9CD,
 165         0x1F9CE,
 166         0x1F9CF,
 167         0x1F9D1,
 168         0x1F9D2,
 169         0x1F9D3,
 170         0x1F9D4,
 171         0x1F9D5,
 172         0x1F9D6,
 173         0x1F9D7,
 174         0x1F9D8,
 175         0x1F9D9,
 176         0x1F9DA,
 177         0x1F9DB,
 178         0x1F9DC,
 179         0x1F9DD,
 180         0x1FAC3,
 181         0x1FAC4,
 182         0x1FAC5,
 183         0x1FAF0,
 184         0x1FAF1,
 185         0x1FAF2,
 186         0x1FAF3,
 187         0x1FAF4,
 188         0x1FAF5,
 189         0x1FAF6,
 190         0x1FAF7,
 191         0x1FAF8
 192 };
 193
 194 struct utf8_item {
 195         RB_ENTRY(utf8_item)     index_entry;
 196         u_int                   index;
 197
 198         RB_ENTRY(utf8_item)     data_entry;
 199         char                    data[UTF8_SIZE];
 200         u_char                  size;
 201 };
 202
 203 static int
 204 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
 205 {
 206         if (ui1->size < ui2->size)
 207                 return (-1);
 208         if (ui1->size > ui2->size)
 209                 return (1);
 210         return (memcmp(ui1->data, ui2->data, ui1->size));
 211 }
 212 RB_HEAD(utf8_data_tree, utf8_item);
 213 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
 214 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
 215
 216 static int
 217 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
 218 {
 219         if (ui1->index < ui2->index)
 220                 return (-1);
 221         if (ui1->index > ui2->index)
 222                 return (1);
 223         return (0);
 224 }
 225 RB_HEAD(utf8_index_tree, utf8_item);
 226 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
 227 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
 228
 229 static u_int utf8_next_index;
 230
 231 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
 232 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
 233
 234 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
 235 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
 236
 237 /* Get a UTF-8 item from data. */
 238 static struct utf8_item *
 239 utf8_item_by_data(const u_char *data, size_t size)
 240 {
 241         struct utf8_item        ui;
 242
 243         memcpy(ui.data, data, size);
 244         ui.size = size;
 245
 246         return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
 247 }
 248
 249 /* Get a UTF-8 item from data. */
 250 static struct utf8_item *
 251 utf8_item_by_index(u_int index)
 252 {
 253         struct utf8_item        ui;
 254
 255         ui.index = index;
 256
 257         return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
 258 }
 259
 260 /* Add a UTF-8 item. */
 261 static int
 262 utf8_put_item(const u_char *data, size_t size, u_int *index)
 263 {
 264         struct utf8_item        *ui;
 265
 266         ui = utf8_item_by_data(data, size);
 267         if (ui != NULL) {
 268                 *index = ui->index;
 269                 log_debug("%s: found %.*s = %u", __func__, (int)size, data,
 270                     *index);
 271                 return (0);
 272         }
 273
 274         if (utf8_next_index == 0xffffff + 1)
 275                 return (-1);
 276
 277         ui = xcalloc(1, sizeof *ui);
 278         ui->index = utf8_next_index++;
 279         RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
 280
 281         memcpy(ui->data, data, size);
 282         ui->size = size;
 283         RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
 284
 285         *index = ui->index;
 286         log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
 287         return (0);
 288 }
 289
 290 static int
 291 utf8_table_cmp(const void *vp1, const void *vp2)
 292 {
 293         const wchar_t   *wc1 = vp1, *wc2 = vp2;
 294
 295         if (*wc1 < *wc2)
 296                 return (-1);
 297         if (*wc1 > *wc2)
 298                 return (1);
 299         return (0);
 300 }
 301
 302 /* Check if character in table. */
 303 int
 304 utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
 305 {
 306         wchar_t *found;
 307
 308         found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
 309         return (found != NULL);
 310 }
 311
 312 /* Get UTF-8 character from data. */
 313 enum utf8_state
 314 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
 315 {
 316         u_int   index;
 317
 318         if (ud->width > 2)
 319                 fatalx("invalid UTF-8 width: %u", ud->width);
 320
 321         if (ud->size > UTF8_SIZE)
 322                 goto fail;
 323         if (ud->size <= 3) {
 324                 index = (((utf8_char)ud->data[2] << 16)|
 325                           ((utf8_char)ud->data[1] << 8)|
 326                           ((utf8_char)ud->data[0]));
 327         } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
 328                 goto fail;
 329         *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
 330         log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
 331             (int)ud->size, ud->data, *uc);
 332         return (UTF8_DONE);
 333
 334 fail:
 335         if (ud->width == 0)
 336                 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
 337         else if (ud->width == 1)
 338                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
 339         else
 340                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
 341         return (UTF8_ERROR);
 342 }
 343
 344 /* Get UTF-8 data from character. */
 345 void
 346 utf8_to_data(utf8_char uc, struct utf8_data *ud)
 347 {
 348         struct utf8_item        *ui;
 349         u_int                    index;
 350
 351         memset(ud, 0, sizeof *ud);
 352         ud->size = ud->have = UTF8_GET_SIZE(uc);
 353         ud->width = UTF8_GET_WIDTH(uc);
 354
 355         if (ud->size <= 3) {
 356                 ud->data[2] = (uc >> 16);
 357                 ud->data[1] = ((uc >> 8) & 0xff);
 358                 ud->data[0] = (uc & 0xff);
 359         } else {
 360                 index = (uc & 0xffffff);
 361                 if ((ui = utf8_item_by_index(index)) == NULL)
 362                         memset(ud->data, ' ', ud->size);
 363                 else
 364                         memcpy(ud->data, ui->data, ud->size);
 365         }
 366
 367         log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
 368             (int)ud->size, ud->data);
 369 }
 370
 371 /* Get UTF-8 character from a single ASCII character. */
 372 u_int
 373 utf8_build_one(u_char ch)
 374 {
 375         return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
 376 }
 377
 378 /* Set a single character. */
 379 void
 380 utf8_set(struct utf8_data *ud, u_char ch)
 381 {
 382         static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
 383
 384         memcpy(ud, &empty, sizeof *ud);
 385         *ud->data = ch;
 386 }
 387
 388 /* Copy UTF-8 character. */
 389 void
 390 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
 391 {
 392         u_int   i;
 393
 394         memcpy(to, from, sizeof *to);
 395
 396         for (i = to->size; i < sizeof to->data; i++)
 397                 to->data[i] = '\0';
 398 }
 399
 400 /* Get width of Unicode character. */
 401 static enum utf8_state
 402 utf8_width(struct utf8_data *ud, int *width)
 403 {
 404         wchar_t wc;
 405
 406         if (utf8_towc(ud, &wc) != UTF8_DONE)
 407                 return (UTF8_ERROR);
 408         if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
 409                 *width = 2;
 410                 return (UTF8_DONE);
 411         }
 412
 413         *width = wcwidth(wc);
 414         log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
 415         if (*width < 0) {
 416                 /*
 417                  * C1 control characters are nonprintable, so they are always
 418                  * zero width.
 419                  */
 420                 *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
 421         }
 422         if (*width >= 0 && *width <= 0xff)
 423                 return (UTF8_DONE);
 424         return (UTF8_ERROR);
 425 }
 426
 427 /* Convert UTF-8 character to wide character. */
 428 enum utf8_state
 429 utf8_towc(const struct utf8_data *ud, wchar_t *wc)
 430 {
 431         switch (mbtowc(wc, ud->data, ud->size)) {
 432         case -1:
 433                 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
 434                     errno);
 435                 mbtowc(NULL, NULL, MB_CUR_MAX);
 436                 return (UTF8_ERROR);
 437         case 0:
 438                 return (UTF8_ERROR);
 439         }
 440         log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
 441         return (UTF8_DONE);
 442 }
 443
 444 /* Convert wide character to UTF-8 character. */
 445 enum utf8_state
 446 utf8_fromwc(wchar_t wc, struct utf8_data *ud)
 447 {
 448         int     size, width;
 449
 450         size = wctomb(ud->data, wc);
 451         if (size < 0) {
 452                 log_debug("UTF-8 %d, wctomb() %d", wc, errno);
 453                 wctomb(NULL, 0);
 454                 return (UTF8_ERROR);
 455         }
 456         if (size == 0)
 457                 return (UTF8_ERROR);
 458         ud->size = ud->have = size;
 459         if (utf8_width(ud, &width) == UTF8_DONE) {
 460                 ud->width = width;
 461                 return (UTF8_DONE);
 462         }
 463         return (UTF8_ERROR);
 464 }
 465
 466 /*
 467  * Open UTF-8 sequence.
 468  *
 469  * 11000010-11011111 C2-DF start of 2-byte sequence
 470  * 11100000-11101111 E0-EF start of 3-byte sequence
 471  * 11110000-11110100 F0-F4 start of 4-byte sequence
 472  */
 473 enum utf8_state
 474 utf8_open(struct utf8_data *ud, u_char ch)
 475 {
 476         memset(ud, 0, sizeof *ud);
 477         if (ch >= 0xc2 && ch <= 0xdf)
 478                 ud->size = 2;
 479         else if (ch >= 0xe0 && ch <= 0xef)
 480                 ud->size = 3;
 481         else if (ch >= 0xf0 && ch <= 0xf4)
 482                 ud->size = 4;
 483         else
 484                 return (UTF8_ERROR);
 485         utf8_append(ud, ch);
 486         return (UTF8_MORE);
 487 }
 488
 489 /* Append character to UTF-8, closing if finished. */
 490 enum utf8_state
 491 utf8_append(struct utf8_data *ud, u_char ch)
 492 {
 493         int     width;
 494
 495         if (ud->have >= ud->size)
 496                 fatalx("UTF-8 character overflow");
 497         if (ud->size > sizeof ud->data)
 498                 fatalx("UTF-8 character size too large");
 499
 500         if (ud->have != 0 && (ch & 0xc0) != 0x80)
 501                 ud->width = 0xff;
 502
 503         ud->data[ud->have++] = ch;
 504         if (ud->have != ud->size)
 505                 return (UTF8_MORE);
 506
 507         if (ud->width == 0xff)
 508                 return (UTF8_ERROR);
 509         if (utf8_width(ud, &width) != UTF8_DONE)
 510                 return (UTF8_ERROR);
 511         ud->width = width;
 512
 513         return (UTF8_DONE);
 514 }
 515
 516 /*
 517  * Encode len characters from src into dst, which is guaranteed to have four
 518  * bytes available for each character from src (for \abc or UTF-8) plus space
 519  * for \0.
 520  */
 521 int
 522 utf8_strvis(char *dst, const char *src, size_t len, int flag)
 523 {
 524         struct utf8_data         ud;
 525         const char              *start = dst, *end = src + len;
 526         enum utf8_state          more;
 527         size_t                   i;
 528
 529         while (src < end) {
 530                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 531                         while (++src < end && more == UTF8_MORE)
 532                                 more = utf8_append(&ud, *src);
 533                         if (more == UTF8_DONE) {
 534                                 /* UTF-8 character finished. */
 535                                 for (i = 0; i < ud.size; i++)
 536                                         *dst++ = ud.data[i];
 537                                 continue;
 538                         }
 539                         /* Not a complete, valid UTF-8 character. */
 540                         src -= ud.have;
 541                 }
 542                 if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) {
 543                         if (isalpha((u_char)src[1]) ||
 544                             src[1] == '_' ||
 545                             src[1] == '{')
 546                                 *dst++ = '\\';
 547                         *dst++ = '$';
 548                 } else if (src < end - 1)
 549                         dst = vis(dst, src[0], flag, src[1]);
 550                 else if (src < end)
 551                         dst = vis(dst, src[0], flag, '\0');
 552                 src++;
 553         }
 554         *dst = '\0';
 555         return (dst - start);
 556 }
 557
 558 /* Same as utf8_strvis but allocate the buffer. */
 559 int
 560 utf8_stravis(char **dst, const char *src, int flag)
 561 {
 562         char    *buf;
 563         int      len;
 564
 565         buf = xreallocarray(NULL, 4, strlen(src) + 1);
 566         len = utf8_strvis(buf, src, strlen(src), flag);
 567
 568         *dst = xrealloc(buf, len + 1);
 569         return (len);
 570 }
 571
 572 /* Same as utf8_strvis but allocate the buffer. */
 573 int
 574 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
 575 {
 576         char    *buf;
 577         int      len;
 578
 579         buf = xreallocarray(NULL, 4, srclen + 1);
 580         len = utf8_strvis(buf, src, srclen, flag);
 581
 582         *dst = xrealloc(buf, len + 1);
 583         return (len);
 584 }
 585
 586 /* Does this string contain anything that isn't valid UTF-8? */
 587 int
 588 utf8_isvalid(const char *s)
 589 {
 590         struct utf8_data ud;
 591         const char      *end;
 592         enum utf8_state  more;
 593
 594         end = s + strlen(s);
 595         while (s < end) {
 596                 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
 597                         while (++s < end && more == UTF8_MORE)
 598                                 more = utf8_append(&ud, *s);
 599                         if (more == UTF8_DONE)
 600                                 continue;
 601                         return (0);
 602                 }
 603                 if (*s < 0x20 || *s > 0x7e)
 604                         return (0);
 605                 s++;
 606         }
 607         return (1);
 608 }
 609
 610 /*
 611  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
 612  * the returned string. Anything not valid printable ASCII or UTF-8 is
 613  * stripped.
 614  */
 615 char *
 616 utf8_sanitize(const char *src)
 617 {
 618         char            *dst = NULL;
 619         size_t           n = 0;
 620         enum utf8_state  more;
 621         struct utf8_data ud;
 622         u_int            i;
 623
 624         while (*src != '\0') {
 625                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 626                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 627                         while (*++src != '\0' && more == UTF8_MORE)
 628                                 more = utf8_append(&ud, *src);
 629                         if (more == UTF8_DONE) {
 630                                 dst = xreallocarray(dst, n + ud.width,
 631                                     sizeof *dst);
 632                                 for (i = 0; i < ud.width; i++)
 633                                         dst[n++] = '_';
 634                                 continue;
 635                         }
 636                         src -= ud.have;
 637                 }
 638                 if (*src > 0x1f && *src < 0x7f)
 639                         dst[n++] = *src;
 640                 else
 641                         dst[n++] = '_';
 642                 src++;
 643         }
 644         dst = xreallocarray(dst, n + 1, sizeof *dst);
 645         dst[n] = '\0';
 646         return (dst);
 647 }
 648
 649 /* Get UTF-8 buffer length. */
 650 size_t
 651 utf8_strlen(const struct utf8_data *s)
 652 {
 653         size_t  i;
 654
 655         for (i = 0; s[i].size != 0; i++)
 656                 /* nothing */;
 657         return (i);
 658 }
 659
 660 /* Get UTF-8 string width. */
 661 u_int
 662 utf8_strwidth(const struct utf8_data *s, ssize_t n)
 663 {
 664         ssize_t i;
 665         u_int   width = 0;
 666
 667         for (i = 0; s[i].size != 0; i++) {
 668                 if (n != -1 && n == i)
 669                         break;
 670                 width += s[i].width;
 671         }
 672         return (width);
 673 }
 674
 675 /*
 676  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
 677  * Caller frees.
 678  */
 679 struct utf8_data *
 680 utf8_fromcstr(const char *src)
 681 {
 682         struct utf8_data        *dst = NULL;
 683         size_t                   n = 0;
 684         enum utf8_state          more;
 685
 686         while (*src != '\0') {
 687                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 688                 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
 689                         while (*++src != '\0' && more == UTF8_MORE)
 690                                 more = utf8_append(&dst[n], *src);
 691                         if (more == UTF8_DONE) {
 692                                 n++;
 693                                 continue;
 694                         }
 695                         src -= dst[n].have;
 696                 }
 697                 utf8_set(&dst[n], *src);
 698                 n++;
 699                 src++;
 700         }
 701         dst = xreallocarray(dst, n + 1, sizeof *dst);
 702         dst[n].size = 0;
 703         return (dst);
 704 }
 705
 706 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
 707 char *
 708 utf8_tocstr(struct utf8_data *src)
 709 {
 710         char    *dst = NULL;
 711         size_t   n = 0;
 712
 713         for(; src->size != 0; src++) {
 714                 dst = xreallocarray(dst, n + src->size, 1);
 715                 memcpy(dst + n, src->data, src->size);
 716                 n += src->size;
 717         }
 718         dst = xreallocarray(dst, n + 1, 1);
 719         dst[n] = '\0';
 720         return (dst);
 721 }
 722
 723 /* Get width of UTF-8 string. */
 724 u_int
 725 utf8_cstrwidth(const char *s)
 726 {
 727         struct utf8_data        tmp;
 728         u_int                   width;
 729         enum utf8_state         more;
 730
 731         width = 0;
 732         while (*s != '\0') {
 733                 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
 734                         while (*++s != '\0' && more == UTF8_MORE)
 735                                 more = utf8_append(&tmp, *s);
 736                         if (more == UTF8_DONE) {
 737                                 width += tmp.width;
 738                                 continue;
 739                         }
 740                         s -= tmp.have;
 741                 }
 742                 if (*s > 0x1f && *s != 0x7f)
 743                         width++;
 744                 s++;
 745         }
 746         return (width);
 747 }
 748
 749 /* Pad UTF-8 string to width on the left. Caller frees. */
 750 char *
 751 utf8_padcstr(const char *s, u_int width)
 752 {
 753         size_t   slen;
 754         char    *out;
 755         u_int    n, i;
 756
 757         n = utf8_cstrwidth(s);
 758         if (n >= width)
 759                 return (xstrdup(s));
 760
 761         slen = strlen(s);
 762         out = xmalloc(slen + 1 + (width - n));
 763         memcpy(out, s, slen);
 764         for (i = n; i < width; i++)
 765                 out[slen++] = ' ';
 766         out[slen] = '\0';
 767         return (out);
 768 }
 769
 770 /* Pad UTF-8 string to width on the right. Caller frees. */
 771 char *
 772 utf8_rpadcstr(const char *s, u_int width)
 773 {
 774         size_t   slen;
 775         char    *out;
 776         u_int    n, i;
 777
 778         n = utf8_cstrwidth(s);
 779         if (n >= width)
 780                 return (xstrdup(s));
 781
 782         slen = strlen(s);
 783         out = xmalloc(slen + 1 + (width - n));
 784         for (i = 0; i < width - n; i++)
 785                 out[i] = ' ';
 786         memcpy(out + i, s, slen);
 787         out[i + slen] = '\0';
 788         return (out);
 789 }
 790
 791 int
 792 utf8_cstrhas(const char *s, const struct utf8_data *ud)
 793 {
 794         struct utf8_data        *copy, *loop;
 795         int                      found = 0;
 796
 797         copy = utf8_fromcstr(s);
 798         for (loop = copy; loop->size != 0; loop++) {
 799                 if (loop->size != ud->size)
 800                         continue;
 801                 if (memcmp(loop->data, ud->data, loop->size) == 0) {
 802                         found = 1;
 803                         break;
 804                 }
 805         }
 806         free(copy);
 807
 808         return (found);
 809 }