utf8.c

   1 /* $OpenBSD$ */
   2
   3 /*
   4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
  15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
  16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18
  19 #include <sys/types.h>
  20
  21 #include <ctype.h>
  22 #include <errno.h>
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <wchar.h>
  26
  27 #include "tmux.h"
  28
  29 struct utf8_item {
  30         RB_ENTRY(utf8_item)     index_entry;
  31         u_int                   index;
  32
  33         RB_ENTRY(utf8_item)     data_entry;
  34         char                    data[UTF8_SIZE];
  35         u_char                  size;
  36 };
  37
  38 static int
  39 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
  40 {
  41         if (ui1->size < ui2->size)
  42                 return (-1);
  43         if (ui1->size > ui2->size)
  44                 return (1);
  45         return (memcmp(ui1->data, ui2->data, ui1->size));
  46 }
  47 RB_HEAD(utf8_data_tree, utf8_item);
  48 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
  49 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
  50
  51 static int
  52 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
  53 {
  54         if (ui1->index < ui2->index)
  55                 return (-1);
  56         if (ui1->index > ui2->index)
  57                 return (1);
  58         return (0);
  59 }
  60 RB_HEAD(utf8_index_tree, utf8_item);
  61 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
  62 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
  63
  64 static u_int utf8_next_index;
  65
  66 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
  67 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
  68
  69 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
  70 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
  71
  72 /* Get a UTF-8 item from data. */
  73 static struct utf8_item *
  74 utf8_item_by_data(const u_char *data, size_t size)
  75 {
  76         struct utf8_item        ui;
  77
  78         memcpy(ui.data, data, size);
  79         ui.size = size;
  80
  81         return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
  82 }
  83
  84 /* Get a UTF-8 item from data. */
  85 static struct utf8_item *
  86 utf8_item_by_index(u_int index)
  87 {
  88         struct utf8_item        ui;
  89
  90         ui.index = index;
  91
  92         return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
  93 }
  94
  95 /* Add a UTF-8 item. */
  96 static int
  97 utf8_put_item(const u_char *data, size_t size, u_int *index)
  98 {
  99         struct utf8_item        *ui;
 100
 101         ui = utf8_item_by_data(data, size);
 102         if (ui != NULL) {
 103                 *index = ui->index;
 104                 log_debug("%s: found %.*s = %u", __func__, (int)size, data,
 105                     *index);
 106                 return (0);
 107         }
 108
 109         if (utf8_next_index == 0xffffff + 1)
 110                 return (-1);
 111
 112         ui = xcalloc(1, sizeof *ui);
 113         ui->index = utf8_next_index++;
 114         RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
 115
 116         memcpy(ui->data, data, size);
 117         ui->size = size;
 118         RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
 119
 120         *index = ui->index;
 121         log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
 122         return (0);
 123 }
 124
 125 /* Get UTF-8 character from data. */
 126 enum utf8_state
 127 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
 128 {
 129         u_int   index;
 130
 131         if (ud->width > 2)
 132                 fatalx("invalid UTF-8 width: %u", ud->width);
 133
 134         if (ud->size > UTF8_SIZE)
 135                 goto fail;
 136         if (ud->size <= 3) {
 137                 index = (((utf8_char)ud->data[2] << 16)|
 138                           ((utf8_char)ud->data[1] << 8)|
 139                           ((utf8_char)ud->data[0]));
 140         } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
 141                 goto fail;
 142         *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
 143         log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
 144             (int)ud->size, ud->data, *uc);
 145         return (UTF8_DONE);
 146
 147 fail:
 148         if (ud->width == 0)
 149                 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
 150         else if (ud->width == 1)
 151                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
 152         else
 153                 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
 154         return (UTF8_ERROR);
 155 }
 156
 157 /* Get UTF-8 data from character. */
 158 void
 159 utf8_to_data(utf8_char uc, struct utf8_data *ud)
 160 {
 161         struct utf8_item        *ui;
 162         u_int                    index;
 163
 164         memset(ud, 0, sizeof *ud);
 165         ud->size = ud->have = UTF8_GET_SIZE(uc);
 166         ud->width = UTF8_GET_WIDTH(uc);
 167
 168         if (ud->size <= 3) {
 169                 ud->data[2] = (uc >> 16);
 170                 ud->data[1] = ((uc >> 8) & 0xff);
 171                 ud->data[0] = (uc & 0xff);
 172         } else {
 173                 index = (uc & 0xffffff);
 174                 if ((ui = utf8_item_by_index(index)) == NULL)
 175                         memset(ud->data, ' ', ud->size);
 176                 else
 177                         memcpy(ud->data, ui->data, ud->size);
 178         }
 179
 180         log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
 181             (int)ud->size, ud->data);
 182 }
 183
 184 /* Get UTF-8 character from a single ASCII character. */
 185 u_int
 186 utf8_build_one(u_char ch)
 187 {
 188         return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
 189 }
 190
 191 /* Set a single character. */
 192 void
 193 utf8_set(struct utf8_data *ud, u_char ch)
 194 {
 195         static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
 196
 197         memcpy(ud, &empty, sizeof *ud);
 198         *ud->data = ch;
 199 }
 200
 201 /* Copy UTF-8 character. */
 202 void
 203 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
 204 {
 205         u_int   i;
 206
 207         memcpy(to, from, sizeof *to);
 208
 209         for (i = to->size; i < sizeof to->data; i++)
 210                 to->data[i] = '\0';
 211 }
 212
 213 /* Get width of Unicode character. */
 214 static enum utf8_state
 215 utf8_width(struct utf8_data *ud, int *width)
 216 {
 217         wchar_t wc;
 218
 219 #ifdef HAVE_UTF8PROC
 220         switch (utf8proc_mbtowc(&wc, ud->data, ud->size)) {
 221 #else
 222         switch (mbtowc(&wc, ud->data, ud->size)) {
 223 #endif
 224         case -1:
 225                 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
 226                     errno);
 227                 mbtowc(NULL, NULL, MB_CUR_MAX);
 228                 return (UTF8_ERROR);
 229         case 0:
 230                 return (UTF8_ERROR);
 231         }
 232         log_debug("UTF-8 %.*s is %08X", (int)ud->size, ud->data, (u_int)wc);
 233 #ifdef HAVE_UTF8PROC
 234         *width = utf8proc_wcwidth(wc);
 235         log_debug("utf8proc_wcwidth(%08X) returned %d", (u_int)wc, *width);
 236 #else
 237         *width = wcwidth(wc);
 238         log_debug("wcwidth(%08X) returned %d", (u_int)wc, *width);
 239         if (*width < 0) {
 240                 /*
 241                  * C1 control characters are nonprintable, so they are always
 242                  * zero width.
 243                  */
 244                 *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
 245         }
 246 #endif
 247         if (*width >= 0 && *width <= 0xff)
 248                 return (UTF8_DONE);
 249         return (UTF8_ERROR);
 250 }
 251
 252 /*
 253  * Open UTF-8 sequence.
 254  *
 255  * 11000010-11011111 C2-DF start of 2-byte sequence
 256  * 11100000-11101111 E0-EF start of 3-byte sequence
 257  * 11110000-11110100 F0-F4 start of 4-byte sequence
 258  */
 259 enum utf8_state
 260 utf8_open(struct utf8_data *ud, u_char ch)
 261 {
 262         memset(ud, 0, sizeof *ud);
 263         if (ch >= 0xc2 && ch <= 0xdf)
 264                 ud->size = 2;
 265         else if (ch >= 0xe0 && ch <= 0xef)
 266                 ud->size = 3;
 267         else if (ch >= 0xf0 && ch <= 0xf4)
 268                 ud->size = 4;
 269         else
 270                 return (UTF8_ERROR);
 271         utf8_append(ud, ch);
 272         return (UTF8_MORE);
 273 }
 274
 275 /* Append character to UTF-8, closing if finished. */
 276 enum utf8_state
 277 utf8_append(struct utf8_data *ud, u_char ch)
 278 {
 279         int     width;
 280
 281         if (ud->have >= ud->size)
 282                 fatalx("UTF-8 character overflow");
 283         if (ud->size > sizeof ud->data)
 284                 fatalx("UTF-8 character size too large");
 285
 286         if (ud->have != 0 && (ch & 0xc0) != 0x80)
 287                 ud->width = 0xff;
 288
 289         ud->data[ud->have++] = ch;
 290         if (ud->have != ud->size)
 291                 return (UTF8_MORE);
 292
 293         if (ud->width == 0xff)
 294                 return (UTF8_ERROR);
 295         if (utf8_width(ud, &width) != UTF8_DONE)
 296                 return (UTF8_ERROR);
 297         ud->width = width;
 298
 299         return (UTF8_DONE);
 300 }
 301
 302 /*
 303  * Encode len characters from src into dst, which is guaranteed to have four
 304  * bytes available for each character from src (for \abc or UTF-8) plus space
 305  * for \0.
 306  */
 307 int
 308 utf8_strvis(char *dst, const char *src, size_t len, int flag)
 309 {
 310         struct utf8_data         ud;
 311         const char              *start = dst, *end = src + len;
 312         enum utf8_state          more;
 313         size_t                   i;
 314
 315         while (src < end) {
 316                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 317                         while (++src < end && more == UTF8_MORE)
 318                                 more = utf8_append(&ud, *src);
 319                         if (more == UTF8_DONE) {
 320                                 /* UTF-8 character finished. */
 321                                 for (i = 0; i < ud.size; i++)
 322                                         *dst++ = ud.data[i];
 323                                 continue;
 324                         }
 325                         /* Not a complete, valid UTF-8 character. */
 326                         src -= ud.have;
 327                 }
 328                 if (src[0] == '$' && src < end - 1) {
 329                         if (isalpha((u_char)src[1]) ||
 330                             src[1] == '_' ||
 331                             src[1] == '{')
 332                                 *dst++ = '\\';
 333                         *dst++ = '$';
 334                 } else if (src < end - 1)
 335                         dst = vis(dst, src[0], flag, src[1]);
 336                 else if (src < end)
 337                         dst = vis(dst, src[0], flag, '\0');
 338                 src++;
 339         }
 340         *dst = '\0';
 341         return (dst - start);
 342 }
 343
 344 /* Same as utf8_strvis but allocate the buffer. */
 345 int
 346 utf8_stravis(char **dst, const char *src, int flag)
 347 {
 348         char    *buf;
 349         int      len;
 350
 351         buf = xreallocarray(NULL, 4, strlen(src) + 1);
 352         len = utf8_strvis(buf, src, strlen(src), flag);
 353
 354         *dst = xrealloc(buf, len + 1);
 355         return (len);
 356 }
 357
 358 /* Same as utf8_strvis but allocate the buffer. */
 359 int
 360 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
 361 {
 362         char    *buf;
 363         int      len;
 364
 365         buf = xreallocarray(NULL, 4, srclen + 1);
 366         len = utf8_strvis(buf, src, srclen, flag);
 367
 368         *dst = xrealloc(buf, len + 1);
 369         return (len);
 370 }
 371
 372 /* Does this string contain anything that isn't valid UTF-8? */
 373 int
 374 utf8_isvalid(const char *s)
 375 {
 376         struct utf8_data ud;
 377         const char      *end;
 378         enum utf8_state  more;
 379
 380         end = s + strlen(s);
 381         while (s < end) {
 382                 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
 383                         while (++s < end && more == UTF8_MORE)
 384                                 more = utf8_append(&ud, *s);
 385                         if (more == UTF8_DONE)
 386                                 continue;
 387                         return (0);
 388                 }
 389                 if (*s < 0x20 || *s > 0x7e)
 390                         return (0);
 391                 s++;
 392         }
 393         return (1);
 394 }
 395
 396 /*
 397  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
 398  * the returned string. Anything not valid printable ASCII or UTF-8 is
 399  * stripped.
 400  */
 401 char *
 402 utf8_sanitize(const char *src)
 403 {
 404         char            *dst = NULL;
 405         size_t           n = 0;
 406         enum utf8_state  more;
 407         struct utf8_data ud;
 408         u_int            i;
 409
 410         while (*src != '\0') {
 411                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 412                 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
 413                         while (*++src != '\0' && more == UTF8_MORE)
 414                                 more = utf8_append(&ud, *src);
 415                         if (more == UTF8_DONE) {
 416                                 dst = xreallocarray(dst, n + ud.width,
 417                                     sizeof *dst);
 418                                 for (i = 0; i < ud.width; i++)
 419                                         dst[n++] = '_';
 420                                 continue;
 421                         }
 422                         src -= ud.have;
 423                 }
 424                 if (*src > 0x1f && *src < 0x7f)
 425                         dst[n++] = *src;
 426                 else
 427                         dst[n++] = '_';
 428                 src++;
 429         }
 430         dst = xreallocarray(dst, n + 1, sizeof *dst);
 431         dst[n] = '\0';
 432         return (dst);
 433 }
 434
 435 /* Get UTF-8 buffer length. */
 436 size_t
 437 utf8_strlen(const struct utf8_data *s)
 438 {
 439         size_t  i;
 440
 441         for (i = 0; s[i].size != 0; i++)
 442                 /* nothing */;
 443         return (i);
 444 }
 445
 446 /* Get UTF-8 string width. */
 447 u_int
 448 utf8_strwidth(const struct utf8_data *s, ssize_t n)
 449 {
 450         ssize_t i;
 451         u_int   width = 0;
 452
 453         for (i = 0; s[i].size != 0; i++) {
 454                 if (n != -1 && n == i)
 455                         break;
 456                 width += s[i].width;
 457         }
 458         return (width);
 459 }
 460
 461 /*
 462  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
 463  * Caller frees.
 464  */
 465 struct utf8_data *
 466 utf8_fromcstr(const char *src)
 467 {
 468         struct utf8_data        *dst = NULL;
 469         size_t                   n = 0;
 470         enum utf8_state          more;
 471
 472         while (*src != '\0') {
 473                 dst = xreallocarray(dst, n + 1, sizeof *dst);
 474                 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
 475                         while (*++src != '\0' && more == UTF8_MORE)
 476                                 more = utf8_append(&dst[n], *src);
 477                         if (more == UTF8_DONE) {
 478                                 n++;
 479                                 continue;
 480                         }
 481                         src -= dst[n].have;
 482                 }
 483                 utf8_set(&dst[n], *src);
 484                 n++;
 485                 src++;
 486         }
 487         dst = xreallocarray(dst, n + 1, sizeof *dst);
 488         dst[n].size = 0;
 489         return (dst);
 490 }
 491
 492 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
 493 char *
 494 utf8_tocstr(struct utf8_data *src)
 495 {
 496         char    *dst = NULL;
 497         size_t   n = 0;
 498
 499         for(; src->size != 0; src++) {
 500                 dst = xreallocarray(dst, n + src->size, 1);
 501                 memcpy(dst + n, src->data, src->size);
 502                 n += src->size;
 503         }
 504         dst = xreallocarray(dst, n + 1, 1);
 505         dst[n] = '\0';
 506         return (dst);
 507 }
 508
 509 /* Get width of UTF-8 string. */
 510 u_int
 511 utf8_cstrwidth(const char *s)
 512 {
 513         struct utf8_data        tmp;
 514         u_int                   width;
 515         enum utf8_state         more;
 516
 517         width = 0;
 518         while (*s != '\0') {
 519                 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
 520                         while (*++s != '\0' && more == UTF8_MORE)
 521                                 more = utf8_append(&tmp, *s);
 522                         if (more == UTF8_DONE) {
 523                                 width += tmp.width;
 524                                 continue;
 525                         }
 526                         s -= tmp.have;
 527                 }
 528                 if (*s > 0x1f && *s != 0x7f)
 529                         width++;
 530                 s++;
 531         }
 532         return (width);
 533 }
 534
 535 /* Pad UTF-8 string to width on the left. Caller frees. */
 536 char *
 537 utf8_padcstr(const char *s, u_int width)
 538 {
 539         size_t   slen;
 540         char    *out;
 541         u_int    n, i;
 542
 543         n = utf8_cstrwidth(s);
 544         if (n >= width)
 545                 return (xstrdup(s));
 546
 547         slen = strlen(s);
 548         out = xmalloc(slen + 1 + (width - n));
 549         memcpy(out, s, slen);
 550         for (i = n; i < width; i++)
 551                 out[slen++] = ' ';
 552         out[slen] = '\0';
 553         return (out);
 554 }
 555
 556 /* Pad UTF-8 string to width on the right. Caller frees. */
 557 char *
 558 utf8_rpadcstr(const char *s, u_int width)
 559 {
 560         size_t   slen;
 561         char    *out;
 562         u_int    n, i;
 563
 564         n = utf8_cstrwidth(s);
 565         if (n >= width)
 566                 return (xstrdup(s));
 567
 568         slen = strlen(s);
 569         out = xmalloc(slen + 1 + (width - n));
 570         for (i = 0; i < width - n; i++)
 571                 out[i] = ' ';
 572         memcpy(out + i, s, slen);
 573         out[i + slen] = '\0';
 574         return (out);
 575 }
 576
 577 int
 578 utf8_cstrhas(const char *s, const struct utf8_data *ud)
 579 {
 580         struct utf8_data        *copy, *loop;
 581         int                      found = 0;
 582
 583         copy = utf8_fromcstr(s);
 584         for (loop = copy; loop->size != 0; loop++) {
 585                 if (loop->size != ud->size)
 586                         continue;
 587                 if (memcmp(loop->data, ud->data, loop->size) == 0) {
 588                         found = 1;
 589                         break;
 590                 }
 591         }
 592         free(copy);
 593
 594         return (found);
 595 }