2 #include "st-unicode.h"
7 st_utf8_get_unichar (const char *p
)
14 if ((p
[0] & 0x80) == 0x00) {
16 } else if ((p
[0] & 0xe0) == 0xc0) {
17 ch
= ((p
[0] & 0x1f) << 6) | (p
[1] & 0x3f);
18 } else if ((p
[0] & 0xf0) == 0xe0) {
19 ch
= ((p
[0] & 0xf) << 12) | ((p
[1] & 0x3f) << 6) | (p
[2] & 0x3f);
20 } else if ((p
[0] & 0xf8) == 0xf0) {
21 ch
= ((p
[0] & 0x7) << 18) | ((p
[1] & 0x3f) << 12) | ((p
[2] & 0x3f) << 6) | (p
[3] & 0x3f);
23 ch
= 0x00; /* undefined */
29 * Copyright (C) 2008 Colin Percival
32 #define ONEMASK ((size_t)(-1) / 0xFF)
34 st_utf8_strlen(const char * _s
)
41 /* Handle any initial misaligned bytes. */
42 for (s
= _s
; (uintptr_t)(s
) & (sizeof(size_t) - 1); s
++) {
45 /* Exit if we hit a zero byte. */
49 /* Is this byte NOT the first byte of a character? */
50 count
+= (b
>> 7) & ((~b
) >> 6);
53 /* Handle complete blocks. */
54 for (; ; s
+= sizeof(size_t)) {
55 /* Prefetch 256 bytes ahead. */
56 __builtin_prefetch(&s
[256], 0, 0);
58 /* Grab 4 or 8 bytes of UTF-8 data. */
61 /* Exit the loop if there are any zero bytes. */
62 if ((u
- ONEMASK
) & (~u
) & (ONEMASK
* 0x80))
65 /* Count bytes which are NOT the first byte of a character. */
66 u
= ((u
& (ONEMASK
* 0x80)) >> 7) & ((~u
) >> 6);
67 count
+= (u
* ONEMASK
) >> ((sizeof(size_t) - 1) * 8);
70 /* Take care of any left-over bytes. */
74 /* Exit if we hit a zero byte. */
78 /* Is this byte NOT the first byte of a character? */
79 count
+= (b
>> 7) & ((~b
) >> 6);
83 return ((s
- _s
) - count
);
87 /* Derived from FontConfig
88 * Copyright (C) 2006 Keith Packard
91 st_unichar_to_utf8 (st_unichar ch
, char *outbuf
)
96 if (ch
< 0x80) { *d
++ = ch
; bits
= -6; }
97 else if (ch
< 0x800) { *d
++ = ((ch
>> 6) & 0x1f) | 0xc0; bits
= 0; }
98 else if (ch
< 0x10000) { *d
++ = ((ch
>> 12) & 0x0f) | 0xe0; bits
= 6; }
99 else if (ch
< 0x200000) { *d
++ = ((ch
>> 18) & 0x07) | 0xf0; bits
= 12; }
100 else if (ch
< 0x4000000) { *d
++ = ((ch
>> 24) & 0x03) | 0xf8; bits
= 18; }
101 else if (ch
< 0x80000000) { *d
++ = ((ch
>> 30) & 0x01) | 0xfC; bits
= 24; }
104 for (; bits
>= 0; bits
-= 6) {
105 *d
++= ((ch
>> bits
) & 0x3F) | 0x80;
112 * @string: Pointer to putative UTF-8 encoded string.
114 * Checks @string for being valid UTF-8. @string is assumed to be
115 * null-terminated. This function is not super-strict, as it will
116 * allow longer UTF-8 sequences than necessary (overlong encodings).
117 * Note that Java is capable of producing these sequences if provoked.
118 * Also note, this routine checks for the 4-byte maximum size, but does
119 * not check for the 0x10ffff maximum value.
121 * Return value: true if @string is valid.
123 /* Derived from eglib, libxml2
124 * Copyright (C) 2006 Novell, Inc.
125 * Copyright (C) 1998-2003 Daniel Veillard
128 st_utf8_validate (const char *string
, ssize_t max_len
)
133 max_len
= strlen (string
);
136 * input is a string of 1, 2, 3 or 4 bytes. The valid strings
137 * are as follows (in "bit format"):
138 * 0xxxxxxx valid 1-byte
139 * 110xxxxx 10xxxxxx valid 2-byte
140 * 1110xxxx 10xxxxxx 10xxxxxx valid 3-byte
141 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx valid 4-byte
143 for (ix
= 0; ix
< max_len
;) { /* string is 0-terminated */
147 if ((c
& 0x80) == 0x00) { /* 1-byte code, starts with 0 */
149 } else if ((c
& 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
150 if (((ix
+1) >= max_len
) || (string
[ix
+1] & 0xc0 ) != 0x80)
153 } else if ((c
& 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
154 if (((ix
+ 2) >= max_len
) ||
155 ((string
[ix
+1] & 0xc0) != 0x80) ||
156 ((string
[ix
+2] & 0xc0) != 0x80))
159 } else if ((c
& 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
160 if (((ix
+ 3) >= max_len
) ||
161 ((string
[ix
+1] & 0xc0) != 0x80) ||
162 ((string
[ix
+2] & 0xc0) != 0x80) ||
163 ((string
[ix
+3] & 0xc0) != 0x80))
166 } else {/* unknown encoding */
176 * @string: a sequence of UTF-8 encoded bytes
178 * Computes the length of a UTF-8 string; it does not fully
179 * validate the content of the string.
181 * Returns the number of characters in the string or -1 in case of error
183 /* Derived from libxml2
184 * Copyright (C) 1998-2003 Daniel Veillard
187 st_utf8_strlen (const char *string
)
194 while (*string
!= 0) {
195 if (string
[0] & 0x80) {
196 if ((string
[1] & 0xc0) != 0x80)
198 if ((string
[0] & 0xe0) == 0xe0) {
199 if ((string
[2] & 0xc0) != 0x80)
201 if ((string
[0] & 0xf0) == 0xf0) {
202 if ((string
[0] & 0xf8) != 0xf0 || (string
[3] & 0xc0) != 0x80)
220 st_utf8_offset_to_pointer (const char *string
, st_uint offset
)
222 const char *p
= string
;
224 for (st_uint i
= 0; i
< offset
; i
++)
225 p
= st_utf8_next_char (p
);
231 st_utf8_to_ucs4 (const char *string
)
233 const st_uchar
*p
= string
;
234 st_unichar
*buffer
, c
;
240 buffer
= st_malloc (sizeof (st_unichar
) * (st_utf8_strlen (string
) + 1));
243 if ((p
[0] & 0x80) == 0x00) {
246 } else if ((p
[0] & 0xe0) == 0xc0) {
247 c
= ((p
[0] & 0x1f) << 6) | (p
[1] & 0x3f);
249 } else if ((p
[0] & 0xf0) == 0xe0) {
250 c
= ((p
[0] & 0xf) << 12) | ((p
[1] & 0x3f) << 6) | (p
[2] & 0x3f);
252 } else if ((p
[0] & 0xf8) == 0xf0) {
253 c
= ((p
[0] & 0x7) << 18) | ((p
[1] & 0x3f) << 12) | ((p
[2] & 0x3f) << 6) | (p
[3] & 0x3f);
268 st_ucs4_to_utf8 (const st_unichar
*string
)
274 if (ch
< 0x80) { *d
++ = ch
; bits
= -6; }
275 else if (ch
< 0x800) { *d
++ = ((ch
>> 6) & 0x1f) | 0xc0; bits
= 0; }
276 else if (ch
< 0x10000) { *d
++ = ((ch
>> 12) & 0x0f) | 0xe0; bits
= 6; }
277 else if (ch
< 0x200000) { *d
++ = ((ch
>> 18) & 0x07) | 0xf0; bits
= 12; }
278 else if (ch
< 0x4000000) { *d
++ = ((ch
>> 24) & 0x03) | 0xf8; bits
= 18; }
279 else if (ch
< 0x80000000) { *d
++ = ((ch
>> 30) & 0x01) | 0xfC; bits
= 24; }
282 for (; bits
>= 0; bits
-= 6) {
283 *d
++= ((ch
>> bits
) & 0x3F) | 0x80;