utf8.c

   1 /* $OpenBSD$ */
   2
   3 /*
   4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
  15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18
  19 #include <sys/types.h>
  20
  21 #include <ctype.h>
  22 #include <errno.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <wchar.h>
  26
  27 #include "compat.h"
  28 #include "tmux.h"
  29
  30 static const wchar_t utf8_force_wide[] = {
  31         0x0261D,
  32         0x026F9,
  33         0x0270A,
  34         0x0270B,
  35         0x0270C,
  36         0x0270D,
  37         0x1F1E6,
  38         0x1F1E7,
  39         0x1F1E8,
  40         0x1F1E9,
  41         0x1F1EA,
  42         0x1F1EB,
  43         0x1F1EC,
  44         0x1F1ED,
  45         0x1F1EE,
  46         0x1F1EF,
  47         0x1F1F0,
  48         0x1F1F1,
  49         0x1F1F2,
  50         0x1F1F3,
  51         0x1F1F4,
  52         0x1F1F5,
  53         0x1F1F6,
  54         0x1F1F7,
  55         0x1F1F8,
  56         0x1F1F9,
  57         0x1F1FA,
  58         0x1F1FB,
  59         0x1F1FC,
  60         0x1F1FD,
  61         0x1F1FE,
  62         0x1F1FF,
  63         0x1F385,
  64         0x1F3C2,
  65         0x1F3C3,
  66         0x1F3C4,
  67         0x1F3C7,
  68         0x1F3CA,
  69         0x1F3CB,
  70         0x1F3CC,
  71         0x1F3FB,
  72         0x1F3FC,
  73         0x1F3FD,
  74         0x1F3FE,
  75         0x1F3FF,
  76         0x1F442,
  77         0x1F443,
  78         0x1F446,
  79         0x1F447,
  80         0x1F448,
  81         0x1F449,
  82         0x1F44A,
  83         0x1F44B,
  84         0x1F44C,
  85         0x1F44D,
  86         0x1F44E,
  87         0x1F44F,
  88         0x1F450,
  89         0x1F466,
  90         0x1F467,
  91         0x1F468,
  92         0x1F469,
  93         0x1F46B,
  94         0x1F46C,
  95         0x1F46D,
  96         0x1F46E,
  97         0x1F470,
  98         0x1F471,
  99         0x1F472,
 100         0x1F473,
 101         0x1F474,
 102         0x1F475,
 103         0x1F476,
 104         0x1F477,
 105         0x1F478,
 106         0x1F47C,
 107         0x1F481,
 108         0x1F482,
 109         0x1F483,
 110         0x1F485,
 111         0x1F486,
 112         0x1F487,
 113         0x1F48F,
 114         0x1F491,
 115         0x1F4AA,
 116         0x1F574,
 117         0x1F575,
 118         0x1F57A,
 119         0x1F590,
 120         0x1F595,
 121         0x1F596,
 122         0x1F645,
 123         0x1F646,
 124         0x1F647,
 125         0x1F64B,
 126         0x1F64C,
 127         0x1F64D,
 128         0x1F64E,
 129         0x1F64F,
 130         0x1F6A3,
 131         0x1F6B4,
 132         0x1F6B5,
 133         0x1F6B6,
 134         0x1F6C0,
 135         0x1F6CC,
 136         0x1F90C,
 137         0x1F90F,
 138         0x1F918,
 139         0x1F919,
 140         0x1F91A,
 141         0x1F91B,
 142         0x1F91C,
 143         0x1F91D,
 144         0x1F91E,
 145         0x1F91F,
 146         0x1F926,
 147         0x1F930,
 148         0x1F931,
 149         0x1F932,
 150         0x1F933,
 151         0x1F934,
 152         0x1F935,
 153         0x1F936,
 154         0x1F937,
 155         0x1F938,
 156         0x1F939,
 157         0x1F93D,
 158         0x1F93E,
 159         0x1F977,
 160         0x1F9B5,
 161         0x1F9B6,
 162         0x1F9B8,
 163         0x1F9B9,
 164         0x1F9BB,
 165         0x1F9CD,
 166         0x1F9CE,
 167         0x1F9CF,
 168         0x1F9D1,
 169         0x1F9D2,
 170         0x1F9D3,
 171         0x1F9D4,
 172         0x1F9D5,
 173         0x1F9D6,
 174         0x1F9D7,
 175         0x1F9D8,
 176         0x1F9D9,
 177         0x1F9DA,
 178         0x1F9DB,
 179         0x1F9DC,
 180         0x1F9DD,
 181         0x1FAC3,
 182         0x1FAC4,
 183         0x1FAC5,
 184         0x1FAF0,
 185         0x1FAF1,
 186         0x1FAF2,
 187         0x1FAF3,
 188         0x1FAF4,
 189         0x1FAF5,
 190         0x1FAF6,
 191         0x1FAF7,
 192         0x1FAF8
 193 };
 194
/*
 * One stored UTF-8 sequence. Every item is linked into two red-black trees at
 * the same time: utf8_index_tree (ordered by index) and utf8_data_tree
 * (ordered by size, then by the bytes in data).
 */
struct utf8_item {
        RB_ENTRY(utf8_item)     index_entry;    /* links for utf8_index_tree */
        u_int                   index;          /* taken from utf8_next_index */

        RB_ENTRY(utf8_item)     data_entry;     /* links for utf8_data_tree */
        char                    data[UTF8_SIZE]; /* bytes; only size are used */
        u_char                  size;           /* number of bytes in data */
};
 203
 204 static int
 205 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
 206 {
 207         if (ui1->size < ui2->size)
 208                 return (-1);
 209         if (ui1->size > ui2->size)
 210                 return (1);
 211         return (memcmp(ui1->data, ui2->data, ui1->size));
 212 }
 213 RB_HEAD(utf8_data_tree, utf8_item);
 214 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
 215 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
 216
 217 static int
 218 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
 219 {
 220         if (ui1->index < ui2->index)
 221                 return (-1);
 222         if (ui1->index > ui2->index)
 223                 return (1);
 224         return (0);
 225 }
 226 RB_HEAD(utf8_index_tree, utf8_item);
 227 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
 228 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
 229
/* Index that will be handed to the next item added by utf8_put_item(). */
static u_int utf8_next_index;

/*
 * Accessors for a packed utf8_char. As the shifts and masks below show, the
 * size lives in bits 24-28 and the width, stored plus one, in the bits from 29
 * upwards. The remaining low 24 bits carry either the raw bytes or an item
 * index (see utf8_from_data() and utf8_to_data()).
 */
#define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
#define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)

#define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
#define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
 237
 238 /* Get a UTF-8 item from data. */
 239 static struct utf8_item *
 240 utf8_item_by_data(const u_char *data, size_t size)
 241 {
 242         struct utf8_item        ui;
 243
 244         memcpy(ui.data, data, size);
 245         ui.size = size;
 246
 247         return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
 248 }
 249
 250 /* Get a UTF-8 item from data. */
 251 static struct utf8_item *
 252 utf8_item_by_index(u_int index)
 253 {
 254         struct utf8_item        ui;
 255
 256         ui.index = index;
 257
 258         return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
 259 }
 260
 261 /* Add a UTF-8 item. */
 262 static int
 263 utf8_put_item(const u_char *data, size_t size, u_int *index)
 264 {
 265         struct utf8_item        *ui;
 266
 267         ui = utf8_item_by_data(data, size);
 268         if (ui != NULL) {
 269                 *index = ui->index;
 270                 log_debug("%s: found %.*s = %u", __func__, (int)size, data,
 271                     *index);
 272                 return (0);
 273         }
 274
 275         if (utf8_next_index == 0xffffff + 1)
 276                 return (-1);
 277
 278         ui = xcalloc(1, sizeof *ui);
 279         ui->index = utf8_next_index++;
 280         RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
 281
 282         memcpy(ui->data, data, size);
 283         ui->size = size;
 284         RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
 285
 286         *index = ui->index;
 287         log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
 288         return (0);
 289 }
 290
 291 static int
 292 utf8_table_cmp(const void *vp1, const void *vp2)
 293 {
 294         const wchar_t   *wc1 = vp1, *wc2 = vp2;
 295
 296         if (*wc1 < *wc2)
 297                 return (-1);
 298         if (*wc1 > *wc2)
 299                 return (1);
 300         return (0);
 301 }
 302
 303 /* Check if character in table. */
 304 int
 305 utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
 306 {
 307         wchar_t *found;
 308
 309         found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
 310         return (found != NULL);
 311 }
 312
 313 /* Get UTF-8 character from data. */
 314 enum utf8_state
 315 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
 316 {
 317         u_int   index;
 318
 319         if (ud->width > 2)
 320                 fatalx("invalid UTF-8 width: %u", ud->width);
 321
 322         if (ud->size > UTF8_SIZE)
 323                 goto fail;
 324         if (ud->size <= 3) {
 325                 index = (((utf8_char)ud->data[2] << 16)|
 326                           ((utf8_char)ud->data[1] << 8)|
 327                           ((utf8_char)ud->data[0]));
 328         } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
 329                 goto fail;
 330         *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
 331         log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
 332             (int)ud->size, ud->data, *uc);
 333         return (UTF8_DONE);
 334
 335 fail:
 336         if (ud->width == 0)
 337                 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
 338         else if (ud->width == 1)
 339                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
 340         else
 341                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
 342         return (UTF8_ERROR);
 343 }
 344
 345 /* Get UTF-8 data from character. */
 346 void
 347 utf8_to_data(utf8_char uc, struct utf8_data *ud)
 348 {
 349         struct utf8_item        *ui;
 350         u_int                    index;
 351
 352         memset(ud, 0, sizeof *ud);
 353         ud->size = ud->have = UTF8_GET_SIZE(uc);
 354         ud->width = UTF8_GET_WIDTH(uc);
 355
 356         if (ud->size <= 3) {
 357                 ud->data[2] = (uc >> 16);
 358                 ud->data[1] = ((uc >> 8) & 0xff);
 359                 ud->data[0] = (uc & 0xff);
 360         } else {
 361                 index = (uc & 0xffffff);
 362                 if ((ui = utf8_item_by_index(index)) == NULL)
 363                         memset(ud->data, ' ', ud->size);
 364                 else
 365                         memcpy(ud->data, ui->data, ud->size);
 366         }
 367
 368         log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
 369             (int)ud->size, ud->data);
 370 }
 371
 372 /* Get UTF-8 character from a single ASCII character. */
 373 u_int
 374 utf8_build_one(u_char ch)
 375 {
 376         return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
 377 }
 378
 379 /* Set a single character. */
 380 void
 381 utf8_set(struct utf8_data *ud, u_char ch)
 382 {
 383         static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
 384
 385         memcpy(ud, &empty, sizeof *ud);
 386         *ud->data = ch;
 387 }
 388
 389 /* Copy UTF-8 character. */
 390 void
 391 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
 392 {
 393         u_int   i;
 394
 395         memcpy(to, from, sizeof *to);
 396
 397         for (i = to->size; i < sizeof to->data; i++)
 398                 to->data[i] = '\0';
 399 }
 400
 401 /* Get width of Unicode character. */
 402 static enum utf8_state
 403 utf8_width(struct utf8_data *ud, int *width)
 404 {
 405         wchar_t wc;
 406
 407         if (utf8_towc(ud, &wc) != UTF8_DONE)
 408                 return (UTF8_ERROR);
 409         if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
 410                 *width = 2;
 411                 return (UTF8_DONE);
 412         }
 413 #ifdef HAVE_UTF8PROC
 414         *width = utf8proc_wcwidth(wc);
 415         log_debug("utf8proc_wcwidth(%05X) returned %d", (u_int)wc, *width);
 416 #else
 417         *width = wcwidth(wc);
 418         log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
 419         if (*width < 0) {
 420                 /*
 421                  * C1 control characters are nonprintable, so they are always
 422                  * zero width.
 423                  */
 424                 *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
 425         }
 426 #endif
 427         if (*width >= 0 && *width <= 0xff)
 428                 return (UTF8_DONE);
 429         return (UTF8_ERROR);
 430 }
 431
 432 /* Convert UTF-8 character to wide character. */
 433 enum utf8_state
 434 utf8_towc(const struct utf8_data *ud, wchar_t *wc)
 435 {
 436 #ifdef HAVE_UTF8PROC
 437         switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
 438 #else
 439         switch (mbtowc(wc, ud->data, ud->size)) {
 440 #endif
 441         case -1:
 442                 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
 443                     errno);
 444                 mbtowc(NULL, NULL, MB_CUR_MAX);
 445                 return (UTF8_ERROR);
 446         case 0:
 447                 return (UTF8_ERROR);
 448         }
 449         log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
 450         return (UTF8_DONE);
 451 }
 452
 453 /* Convert wide character to UTF-8 character. */
 454 enum utf8_state
 455 utf8_fromwc(wchar_t wc, struct utf8_data *ud)
 456 {
 457         int     size, width;
 458
 459 #ifdef HAVE_UTF8PROC
 460         size = utf8proc_wctomb(ud->data, wc);
 461 #else
 462         size = wctomb(ud->data, wc);
 463 #endif
 464         if (size < 0) {
 465                 log_debug("UTF-8 %d, wctomb() %d", wc, errno);
 466                 wctomb(NULL, 0);
 467                 return (UTF8_ERROR);
 468         }
 469         if (size == 0)
 470                 return (UTF8_ERROR);
 471         ud->size = ud->have = size;
 472         if (utf8_width(ud, &width) == UTF8_DONE) {
 473                 ud->width = width;
 474                 return (UTF8_DONE);
 475         }
 476         return (UTF8_ERROR);
 477 }
 478
 479 /*
 480  * Open UTF-8 sequence.
 481  *
 482  * 11000010-11011111 C2-DF start of 2-byte sequence
 483  * 11100000-11101111 E0-EF start of 3-byte sequence
 484  * 11110000-11110100 F0-F4 start of 4-byte sequence
 485  */
 486 enum utf8_state
 487 utf8_open(struct utf8_data *ud, u_char ch)
 488 {
 489         memset(ud, 0, sizeof *ud);
 490         if (ch >= 0xc2 && ch <= 0xdf)
 491                 ud->size = 2;
 492         else if (ch >= 0xe0 && ch <= 0xef)
 493                 ud->size = 3;
 494         else if (ch >= 0xf0 && ch <= 0xf4)
 495                 ud->size = 4;
 496         else
 497                 return (UTF8_ERROR);
 498         utf8_append(ud, ch);
 499         return (UTF8_MORE);
 500 }
 501
 502 /* Append character to UTF-8, closing if finished. */
 503 enum utf8_state
 504 utf8_append(struct utf8_data *ud, u_char ch)
 505 {
 506         int     width;
 507
 508         if (ud->have >= ud->size)
 509                 fatalx("UTF-8 character overflow");
 510         if (ud->size > sizeof ud->data)
 511                 fatalx("UTF-8 character size too large");
 512
 513         if (ud->have != 0 && (ch & 0xc0) != 0x80)
 514                 ud->width = 0xff;
 515
 516         ud->data[ud->have++] = ch;
 517         if (ud->have != ud->size)
 518                 return (UTF8_MORE);
 519
 520         if (ud->width == 0xff)
 521                 return (UTF8_ERROR);
 522         if (utf8_width(ud, &width) != UTF8_DONE)
 523                 return (UTF8_ERROR);
 524         ud->width = width;
 525
 526         return (UTF8_DONE);
 527 }
 528
/*
 * Encode len characters from src into dst, which is guaranteed to have four
 * bytes available for each character from src (for \abc or UTF-8) plus space
 * for \0.
 *
 * Complete, valid UTF-8 sequences are copied through unchanged; every other
 * byte is passed to vis() with the given flag. The result is always
 * NUL-terminated and the return value is the number of bytes written, not
 * counting the terminator.
 */
int
utf8_strvis(char *dst, const char *src, size_t len, int flag)
{
        struct utf8_data         ud;
        const char              *start = dst, *end = src + len;
        enum utf8_state          more;
        size_t                   i;

        while (src < end) {
                if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
                        /*
                         * Feed following bytes until the sequence completes,
                         * fails or the input runs out. src is advanced before
                         * each test, so it ends up one past the last byte
                         * consumed.
                         */
                        while (++src < end && more == UTF8_MORE)
                                more = utf8_append(&ud, *src);
                        if (more == UTF8_DONE) {
                                /* UTF-8 character finished. */
                                for (i = 0; i < ud.size; i++)
                                        *dst++ = ud.data[i];
                                continue;
                        }
                        /*
                         * Not a complete, valid UTF-8 character. Step back to
                         * its first byte so that byte is encoded on its own
                         * below.
                         */
                        src -= ud.have;
                }
                if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) {
                        /*
                         * With VIS_DQ, a '$' that is followed by a letter, '_'
                         * or '{' gets a backslash in front of it.
                         * NOTE(review): presumably to stop it being read as
                         * the start of a variable reference - confirm.
                         */
                        if (isalpha((u_char)src[1]) ||
                            src[1] == '_' ||
                            src[1] == '{')
                                *dst++ = '\\';
                        *dst++ = '$';
                } else if (src < end - 1)
                        dst = vis(dst, src[0], flag, src[1]);
                else if (src < end) {
                        /* Last byte: there is no next character to pass. */
                        dst = vis(dst, src[0], flag, '\0');
                }
                src++;
        }
        *dst = '\0';
        return (dst - start);
}
 570
 571 /* Same as utf8_strvis but allocate the buffer. */
 572 int
 573 utf8_stravis(char **dst, const char *src, int flag)
 574 {
 575         char    *buf;
 576         int      len;
 577
 578         buf = xreallocarray(NULL, 4, strlen(src) + 1);
 579         len = utf8_strvis(buf, src, strlen(src), flag);
 580
 581         *dst = xrealloc(buf, len + 1);
 582         return (len);
 583 }
 584
 585 /* Same as utf8_strvis but allocate the buffer. */
 586 int
 587 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
 588 {
 589         char    *buf;
 590         int      len;
 591
 592         buf = xreallocarray(NULL, 4, srclen + 1);
 593         len = utf8_strvis(buf, src, srclen, flag);
 594
 595         *dst = xrealloc(buf, len + 1);
 596         return (len);
 597 }
 598
 599 /* Does this string contain anything that isn't valid UTF-8? */
 600 int
 601 utf8_isvalid(const char *s)
 602 {
 603         struct utf8_data ud;
 604         const char      *end;
 605         enum utf8_state  more;
 606
 607         end = s + strlen(s);
 608         while (s < end) {
 609                 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
 610                         while (++s < end && more == UTF8_MORE)
 611                                 more = utf8_append(&ud, *s);
 612                         if (more == UTF8_DONE)
 613                                 continue;
 614                         return (0);
 615                 }
 616                 if (*s < 0x20 || *s > 0x7e)
 617                         return (0);
 618                 s++;
 619         }
 620         return (1);
 621 }
 622
 623 /*
 624  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
 625  * the returned string. Anything not valid printable ASCII or UTF-8 is
 626  * stripped.
 627  */
 628 char *
 629 utf8_sanitize(const char *src)
 630 {
 631         char            *dst = NULL;
 632         size_t           n = 0;
 633         enum utf8_state  more;
 634         struct utf8_data ud;
 635         u_int            i;
 636
 637         while (*src != '\0') {
 638                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 639                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 640                         while (*++src != '\0' && more == UTF8_MORE)
 641                                 more = utf8_append(&ud, *src);
 642                         if (more == UTF8_DONE) {
 643                                 dst = xreallocarray(dst, n + ud.width,
 644                                     sizeof *dst);
 645                                 for (i = 0; i < ud.width; i++)
 646                                         dst[n++] = '_';
 647                                 continue;
 648                         }
 649                         src -= ud.have;
 650                 }
 651                 if (*src > 0x1f && *src < 0x7f)
 652                         dst[n++] = *src;
 653                 else
 654                         dst[n++] = '_';
 655                 src++;
 656         }
 657         dst = xreallocarray(dst, n + 1, sizeof *dst);
 658         dst[n] = '\0';
 659         return (dst);
 660 }
 661
 662 /* Get UTF-8 buffer length. */
 663 size_t
 664 utf8_strlen(const struct utf8_data *s)
 665 {
 666         size_t  i;
 667
 668         for (i = 0; s[i].size != 0; i++)
 669                 /* nothing */;
 670         return (i);
 671 }
 672
 673 /* Get UTF-8 string width. */
 674 u_int
 675 utf8_strwidth(const struct utf8_data *s, ssize_t n)
 676 {
 677         ssize_t i;
 678         u_int   width = 0;
 679
 680         for (i = 0; s[i].size != 0; i++) {
 681                 if (n != -1 && n == i)
 682                         break;
 683                 width += s[i].width;
 684         }
 685         return (width);
 686 }
 687
 688 /*
 689  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
 690  * Caller frees.
 691  */
 692 struct utf8_data *
 693 utf8_fromcstr(const char *src)
 694 {
 695         struct utf8_data        *dst = NULL;
 696         size_t                   n = 0;
 697         enum utf8_state          more;
 698
 699         while (*src != '\0') {
 700                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 701                 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
 702                         while (*++src != '\0' && more == UTF8_MORE)
 703                                 more = utf8_append(&dst[n], *src);
 704                         if (more == UTF8_DONE) {
 705                                 n++;
 706                                 continue;
 707                         }
 708                         src -= dst[n].have;
 709                 }
 710                 utf8_set(&dst[n], *src);
 711                 n++;
 712                 src++;
 713         }
 714         dst = xreallocarray(dst, n + 1, sizeof *dst);
 715         dst[n].size = 0;
 716         return (dst);
 717 }
 718
 719 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
 720 char *
 721 utf8_tocstr(struct utf8_data *src)
 722 {
 723         char    *dst = NULL;
 724         size_t   n = 0;
 725
 726         for(; src->size != 0; src++) {
 727                 dst = xreallocarray(dst, n + src->size, 1);
 728                 memcpy(dst + n, src->data, src->size);
 729                 n += src->size;
 730         }
 731         dst = xreallocarray(dst, n + 1, 1);
 732         dst[n] = '\0';
 733         return (dst);
 734 }
 735
 736 /* Get width of UTF-8 string. */
 737 u_int
 738 utf8_cstrwidth(const char *s)
 739 {
 740         struct utf8_data        tmp;
 741         u_int                   width;
 742         enum utf8_state         more;
 743
 744         width = 0;
 745         while (*s != '\0') {
 746                 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
 747                         while (*++s != '\0' && more == UTF8_MORE)
 748                                 more = utf8_append(&tmp, *s);
 749                         if (more == UTF8_DONE) {
 750                                 width += tmp.width;
 751                                 continue;
 752                         }
 753                         s -= tmp.have;
 754                 }
 755                 if (*s > 0x1f && *s != 0x7f)
 756                         width++;
 757                 s++;
 758         }
 759         return (width);
 760 }
 761
 762 /* Pad UTF-8 string to width on the left. Caller frees. */
 763 char *
 764 utf8_padcstr(const char *s, u_int width)
 765 {
 766         size_t   slen;
 767         char    *out;
 768         u_int    n, i;
 769
 770         n = utf8_cstrwidth(s);
 771         if (n >= width)
 772                 return (xstrdup(s));
 773
 774         slen = strlen(s);
 775         out = xmalloc(slen + 1 + (width - n));
 776         memcpy(out, s, slen);
 777         for (i = n; i < width; i++)
 778                 out[slen++] = ' ';
 779         out[slen] = '\0';
 780         return (out);
 781 }
 782
 783 /* Pad UTF-8 string to width on the right. Caller frees. */
 784 char *
 785 utf8_rpadcstr(const char *s, u_int width)
 786 {
 787         size_t   slen;
 788         char    *out;
 789         u_int    n, i;
 790
 791         n = utf8_cstrwidth(s);
 792         if (n >= width)
 793                 return (xstrdup(s));
 794
 795         slen = strlen(s);
 796         out = xmalloc(slen + 1 + (width - n));
 797         for (i = 0; i < width - n; i++)
 798                 out[i] = ' ';
 799         memcpy(out + i, s, slen);
 800         out[i + slen] = '\0';
 801         return (out);
 802 }
 803
 804 int
 805 utf8_cstrhas(const char *s, const struct utf8_data *ud)
 806 {
 807         struct utf8_data        *copy, *loop;
 808         int                      found = 0;
 809
 810         copy = utf8_fromcstr(s);
 811         for (loop = copy; loop->size != 0; loop++) {
 812                 if (loop->size != ud->size)
 813                         continue;
 814                 if (memcmp(loop->data, ud->data, loop->size) == 0) {
 815                         found = 1;
 816                         break;
 817                 }
 818         }
 819         free(copy);
 820
 821         return (found);
 822 }