4 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19 #include <sys/types.h>
30 static const wchar_t utf8_force_wide
[] = {
196 RB_ENTRY(utf8_item
) index_entry
;
199 RB_ENTRY(utf8_item
) data_entry
;
200 char data
[UTF8_SIZE
];
205 utf8_data_cmp(struct utf8_item
*ui1
, struct utf8_item
*ui2
)
207 if (ui1
->size
< ui2
->size
)
209 if (ui1
->size
> ui2
->size
)
211 return (memcmp(ui1
->data
, ui2
->data
, ui1
->size
));
213 RB_HEAD(utf8_data_tree
, utf8_item
);
214 RB_GENERATE_STATIC(utf8_data_tree
, utf8_item
, data_entry
, utf8_data_cmp
);
215 static struct utf8_data_tree utf8_data_tree
= RB_INITIALIZER(utf8_data_tree
);
218 utf8_index_cmp(struct utf8_item
*ui1
, struct utf8_item
*ui2
)
220 if (ui1
->index
< ui2
->index
)
222 if (ui1
->index
> ui2
->index
)
226 RB_HEAD(utf8_index_tree
, utf8_item
);
227 RB_GENERATE_STATIC(utf8_index_tree
, utf8_item
, index_entry
, utf8_index_cmp
);
228 static struct utf8_index_tree utf8_index_tree
= RB_INITIALIZER(utf8_index_tree
);
230 static u_int utf8_next_index
;
/*
 * Accessors for the fields packed into the upper bits of a utf8_char: the
 * size is the five bits starting at bit 24 and the width is stored from
 * bit 29 upwards, biased by one (so the getter subtracts one again).
 */
232 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
233 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
/*
 * Builders for the same two fields, meant to be ORed together. The size is
 * shifted without masking, so a value that does not fit in five bits would
 * spill into the width field.
 */
235 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
236 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
238 /* Get a UTF-8 item from data. */
239 static struct utf8_item
*
240 utf8_item_by_data(const u_char
*data
, size_t size
)
244 memcpy(ui
.data
, data
, size
);
247 return (RB_FIND(utf8_data_tree
, &utf8_data_tree
, &ui
));
250 /* Get a UTF-8 item from index. */
251 static struct utf8_item
*
252 utf8_item_by_index(u_int index
)
258 return (RB_FIND(utf8_index_tree
, &utf8_index_tree
, &ui
));
261 /* Add a UTF-8 item. */
263 utf8_put_item(const u_char
*data
, size_t size
, u_int
*index
)
265 struct utf8_item
*ui
;
267 ui
= utf8_item_by_data(data
, size
);
270 log_debug("%s: found %.*s = %u", __func__
, (int)size
, data
,
275 if (utf8_next_index
== 0xffffff + 1)
278 ui
= xcalloc(1, sizeof *ui
);
279 ui
->index
= utf8_next_index
++;
280 RB_INSERT(utf8_index_tree
, &utf8_index_tree
, ui
);
282 memcpy(ui
->data
, data
, size
);
284 RB_INSERT(utf8_data_tree
, &utf8_data_tree
, ui
);
287 log_debug("%s: added %.*s = %u", __func__
, (int)size
, data
, *index
);
292 utf8_table_cmp(const void *vp1
, const void *vp2
)
294 const wchar_t *wc1
= vp1
, *wc2
= vp2
;
303 /* Check if character in table. */
305 utf8_in_table(wchar_t find
, const wchar_t *table
, u_int count
)
309 found
= bsearch(&find
, table
, count
, sizeof *table
, utf8_table_cmp
);
310 return (found
!= NULL
);
313 /* Get UTF-8 character from data. */
315 utf8_from_data(const struct utf8_data
*ud
, utf8_char
*uc
)
320 fatalx("invalid UTF-8 width: %u", ud
->width
);
322 if (ud
->size
> UTF8_SIZE
)
325 index
= (((utf8_char
)ud
->data
[2] << 16)|
326 ((utf8_char
)ud
->data
[1] << 8)|
327 ((utf8_char
)ud
->data
[0]));
328 } else if (utf8_put_item(ud
->data
, ud
->size
, &index
) != 0)
330 *uc
= UTF8_SET_SIZE(ud
->size
)|UTF8_SET_WIDTH(ud
->width
)|index
;
331 log_debug("%s: (%d %d %.*s) -> %08x", __func__
, ud
->width
, ud
->size
,
332 (int)ud
->size
, ud
->data
, *uc
);
337 *uc
= UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
338 else if (ud
->width
== 1)
339 *uc
= UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
341 *uc
= UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
345 /* Get UTF-8 data from character. */
347 utf8_to_data(utf8_char uc
, struct utf8_data
*ud
)
349 struct utf8_item
*ui
;
352 memset(ud
, 0, sizeof *ud
);
353 ud
->size
= ud
->have
= UTF8_GET_SIZE(uc
);
354 ud
->width
= UTF8_GET_WIDTH(uc
);
357 ud
->data
[2] = (uc
>> 16);
358 ud
->data
[1] = ((uc
>> 8) & 0xff);
359 ud
->data
[0] = (uc
& 0xff);
361 index
= (uc
& 0xffffff);
362 if ((ui
= utf8_item_by_index(index
)) == NULL
)
363 memset(ud
->data
, ' ', ud
->size
);
365 memcpy(ud
->data
, ui
->data
, ud
->size
);
368 log_debug("%s: %08x -> (%d %d %.*s)", __func__
, uc
, ud
->width
, ud
->size
,
369 (int)ud
->size
, ud
->data
);
372 /* Get UTF-8 character from a single ASCII character. */
374 utf8_build_one(u_char ch
)
376 return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch
);
379 /* Set a single character. */
381 utf8_set(struct utf8_data
*ud
, u_char ch
)
383 static const struct utf8_data empty
= { { 0 }, 1, 1, 1 };
385 memcpy(ud
, &empty
, sizeof *ud
);
389 /* Copy UTF-8 character. */
391 utf8_copy(struct utf8_data
*to
, const struct utf8_data
*from
)
395 memcpy(to
, from
, sizeof *to
);
397 for (i
= to
->size
; i
< sizeof to
->data
; i
++)
401 /* Get width of Unicode character. */
402 static enum utf8_state
403 utf8_width(struct utf8_data
*ud
, int *width
)
407 if (utf8_towc(ud
, &wc
) != UTF8_DONE
)
409 if (utf8_in_table(wc
, utf8_force_wide
, nitems(utf8_force_wide
))) {
414 *width
= utf8proc_wcwidth(wc
);
415 log_debug("utf8proc_wcwidth(%05X) returned %d", (u_int
)wc
, *width
);
417 *width
= wcwidth(wc
);
418 log_debug("wcwidth(%05X) returned %d", (u_int
)wc
, *width
);
421 * C1 control characters are nonprintable, so they are always
424 *width
= (wc
>= 0x80 && wc
<= 0x9f) ? 0 : 1;
427 if (*width
>= 0 && *width
<= 0xff)
432 /* Convert UTF-8 character to wide character. */
434 utf8_towc(const struct utf8_data
*ud
, wchar_t *wc
)
437 switch (utf8proc_mbtowc(wc
, ud
->data
, ud
->size
)) {
439 switch (mbtowc(wc
, ud
->data
, ud
->size
)) {
442 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud
->size
, ud
->data
,
444 mbtowc(NULL
, NULL
, MB_CUR_MAX
);
449 log_debug("UTF-8 %.*s is %05X", (int)ud
->size
, ud
->data
, (u_int
)*wc
);
453 /* Convert wide character to UTF-8 character. */
455 utf8_fromwc(wchar_t wc
, struct utf8_data
*ud
)
460 size
= utf8proc_wctomb(ud
->data
, wc
);
462 size
= wctomb(ud
->data
, wc
);
465 log_debug("UTF-8 %d, wctomb() %d", wc
, errno
);
471 ud
->size
= ud
->have
= size
;
472 if (utf8_width(ud
, &width
) == UTF8_DONE
) {
480 * Open UTF-8 sequence.
482 * 11000010-11011111 C2-DF start of 2-byte sequence
483 * 11100000-11101111 E0-EF start of 3-byte sequence
484 * 11110000-11110100 F0-F4 start of 4-byte sequence
487 utf8_open(struct utf8_data
*ud
, u_char ch
)
489 memset(ud
, 0, sizeof *ud
);
490 if (ch
>= 0xc2 && ch
<= 0xdf)
492 else if (ch
>= 0xe0 && ch
<= 0xef)
494 else if (ch
>= 0xf0 && ch
<= 0xf4)
502 /* Append character to UTF-8, closing if finished. */
504 utf8_append(struct utf8_data
*ud
, u_char ch
)
508 if (ud
->have
>= ud
->size
)
509 fatalx("UTF-8 character overflow");
510 if (ud
->size
> sizeof ud
->data
)
511 fatalx("UTF-8 character size too large");
513 if (ud
->have
!= 0 && (ch
& 0xc0) != 0x80)
516 ud
->data
[ud
->have
++] = ch
;
517 if (ud
->have
!= ud
->size
)
520 if (ud
->width
== 0xff)
522 if (utf8_width(ud
, &width
) != UTF8_DONE
)
530 * Encode len characters from src into dst, which is guaranteed to have four
531 * bytes available for each character from src (for \abc or UTF-8) plus space
535 utf8_strvis(char *dst
, const char *src
, size_t len
, int flag
)
538 const char *start
= dst
, *end
= src
+ len
;
539 enum utf8_state more
;
543 if ((more
= utf8_open(&ud
, *src
)) == UTF8_MORE
) {
544 while (++src
< end
&& more
== UTF8_MORE
)
545 more
= utf8_append(&ud
, *src
);
546 if (more
== UTF8_DONE
) {
547 /* UTF-8 character finished. */
548 for (i
= 0; i
< ud
.size
; i
++)
552 /* Not a complete, valid UTF-8 character. */
555 if ((flag
& VIS_DQ
) && src
[0] == '$' && src
< end
- 1) {
556 if (isalpha((u_char
)src
[1]) ||
561 } else if (src
< end
- 1)
562 dst
= vis(dst
, src
[0], flag
, src
[1]);
564 dst
= vis(dst
, src
[0], flag
, '\0');
568 return (dst
- start
);
571 /* Same as utf8_strvis but allocate the buffer. */
573 utf8_stravis(char **dst
, const char *src
, int flag
)
578 buf
= xreallocarray(NULL
, 4, strlen(src
) + 1);
579 len
= utf8_strvis(buf
, src
, strlen(src
), flag
);
581 *dst
= xrealloc(buf
, len
+ 1);
585 /* Same as utf8_stravis but src is given with an explicit length (srclen). */
587 utf8_stravisx(char **dst
, const char *src
, size_t srclen
, int flag
)
592 buf
= xreallocarray(NULL
, 4, srclen
+ 1);
593 len
= utf8_strvis(buf
, src
, srclen
, flag
);
595 *dst
= xrealloc(buf
, len
+ 1);
599 /* Does this string contain anything that isn't valid UTF-8? */
601 utf8_isvalid(const char *s
)
605 enum utf8_state more
;
609 if ((more
= utf8_open(&ud
, *s
)) == UTF8_MORE
) {
610 while (++s
< end
&& more
== UTF8_MORE
)
611 more
= utf8_append(&ud
, *s
);
612 if (more
== UTF8_DONE
)
616 if (*s
< 0x20 || *s
> 0x7e)
624 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
625 * the returned string. Anything not valid printable ASCII or UTF-8 is
629 utf8_sanitize(const char *src
)
633 enum utf8_state more
;
637 while (*src
!= '\0') {
638 dst
= xreallocarray(dst
, n
+ 1, sizeof *dst
);
639 if ((more
= utf8_open(&ud
, *src
)) == UTF8_MORE
) {
640 while (*++src
!= '\0' && more
== UTF8_MORE
)
641 more
= utf8_append(&ud
, *src
);
642 if (more
== UTF8_DONE
) {
643 dst
= xreallocarray(dst
, n
+ ud
.width
,
645 for (i
= 0; i
< ud
.width
; i
++)
651 if (*src
> 0x1f && *src
< 0x7f)
657 dst
= xreallocarray(dst
, n
+ 1, sizeof *dst
);
662 /* Get UTF-8 buffer length. */
664 utf8_strlen(const struct utf8_data
*s
)
668 for (i
= 0; s
[i
].size
!= 0; i
++)
673 /* Get UTF-8 string width. */
675 utf8_strwidth(const struct utf8_data
*s
, ssize_t n
)
680 for (i
= 0; s
[i
].size
!= 0; i
++) {
681 if (n
!= -1 && n
== i
)
689 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
693 utf8_fromcstr(const char *src
)
695 struct utf8_data
*dst
= NULL
;
697 enum utf8_state more
;
699 while (*src
!= '\0') {
700 dst
= xreallocarray(dst
, n
+ 1, sizeof *dst
);
701 if ((more
= utf8_open(&dst
[n
], *src
)) == UTF8_MORE
) {
702 while (*++src
!= '\0' && more
== UTF8_MORE
)
703 more
= utf8_append(&dst
[n
], *src
);
704 if (more
== UTF8_DONE
) {
710 utf8_set(&dst
[n
], *src
);
714 dst
= xreallocarray(dst
, n
+ 1, sizeof *dst
);
719 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
721 utf8_tocstr(struct utf8_data
*src
)
726 for(; src
->size
!= 0; src
++) {
727 dst
= xreallocarray(dst
, n
+ src
->size
, 1);
728 memcpy(dst
+ n
, src
->data
, src
->size
);
731 dst
= xreallocarray(dst
, n
+ 1, 1);
736 /* Get width of UTF-8 string. */
738 utf8_cstrwidth(const char *s
)
740 struct utf8_data tmp
;
742 enum utf8_state more
;
746 if ((more
= utf8_open(&tmp
, *s
)) == UTF8_MORE
) {
747 while (*++s
!= '\0' && more
== UTF8_MORE
)
748 more
= utf8_append(&tmp
, *s
);
749 if (more
== UTF8_DONE
) {
755 if (*s
> 0x1f && *s
!= 0x7f)
762 /* Pad UTF-8 string to width, keeping the string on the left. Caller frees. */
764 utf8_padcstr(const char *s
, u_int width
)
770 n
= utf8_cstrwidth(s
);
775 out
= xmalloc(slen
+ 1 + (width
- n
));
776 memcpy(out
, s
, slen
);
777 for (i
= n
; i
< width
; i
++)
783 /* Pad UTF-8 string to width, keeping the string on the right. Caller frees. */
785 utf8_rpadcstr(const char *s
, u_int width
)
791 n
= utf8_cstrwidth(s
);
796 out
= xmalloc(slen
+ 1 + (width
- n
));
797 for (i
= 0; i
< width
- n
; i
++)
799 memcpy(out
+ i
, s
, slen
);
800 out
[i
+ slen
] = '\0';
805 utf8_cstrhas(const char *s
, const struct utf8_data
*ud
)
807 struct utf8_data
*copy
, *loop
;
810 copy
= utf8_fromcstr(s
);
811 for (loop
= copy
; loop
->size
!= 0; loop
++) {
812 if (loop
->size
!= ud
->size
)
814 if (memcmp(loop
->data
, ud
->data
, loop
->size
) == 0) {