Removed the notion of a "large" context. For simplicity, all contexts
[panda.git] / src / st-unicode.c
blob16ae49de9390cf2ea309bf780d1d6318cdac7401
2 #include "st-unicode.h"
3 #include "st-utils.h"
4 #include <string.h>
6 st_unichar
7 st_utf8_get_unichar (const char *p)
9 st_unichar ch;
11 if (p == NULL)
12 return 0x00;
14 if ((p[0] & 0x80) == 0x00) {
15 ch = p[0];
16 } else if ((p[0] & 0xe0) == 0xc0) {
17 ch = ((p[0] & 0x1f) << 6) | (p[1] & 0x3f);
18 } else if ((p[0] & 0xf0) == 0xe0) {
19 ch = ((p[0] & 0xf) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
20 } else if ((p[0] & 0xf8) == 0xf0) {
21 ch = ((p[0] & 0x7) << 18) | ((p[1] & 0x3f) << 12) | ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
22 } else
23 ch = 0x00; /* undefined */
25 return ch;
/*
 * Word-at-a-time UTF-8 character counter.
 * Copyright (C) 2008 Colin Percival
 *
 * This variant is compiled out with "#if 0"; it shares its name with the
 * st_utf8_strlen defined further down in this file, so only one of the
 * two can be enabled at a time.
 */
#if 0
#define ONEMASK ((size_t)(-1) / 0xFF)
size_t
st_utf8_strlen(const char * _s)
{
    const char *cursor = _s;
    size_t continuation = 0; /* bytes that are NOT the first byte of a character */
    size_t word;
    unsigned char byte;

    /* Walk byte by byte until the cursor is aligned to a size_t boundary. */
    while ((uintptr_t)(cursor) & (sizeof(size_t) - 1)) {
        byte = *cursor;

        /* The string ended inside the misaligned prefix. */
        if (byte == '\0')
            return ((cursor - _s) - continuation);

        /* Adds 1 when this byte is NOT the first byte of a character. */
        continuation += (byte >> 7) & ((~byte) >> 6);
        cursor++;
    }

    /* Consume complete size_t-sized blocks. */
    for (;;) {
        /* Prefetch 256 bytes ahead. */
        __builtin_prefetch(&cursor[256], 0, 0);

        /* Grab 4 or 8 bytes of UTF-8 data. */
        word = *(size_t *)(cursor);

        /* Leave the block loop as soon as the word contains a zero byte. */
        if ((word - ONEMASK) & (~word) & (ONEMASK * 0x80))
            break;

        /* Mark, then sum, the bytes which are NOT the first byte of a character. */
        word = ((word & (ONEMASK * 0x80)) >> 7) & ((~word) >> 6);
        continuation += (word * ONEMASK) >> ((sizeof(size_t) - 1) * 8);

        cursor += sizeof(size_t);
    }

    /* Finish the tail byte by byte, up to the terminator. */
    while ((byte = *cursor) != '\0') {
        continuation += (byte >> 7) & ((~byte) >> 6);
        cursor++;
    }

    /* Characters = total bytes minus the non-leading bytes. */
    return ((cursor - _s) - continuation);
}
#endif
87 /* Derived from FontConfig
88 * Copyright (C) 2006 Keith Packard
90 int
91 st_unichar_to_utf8 (st_unichar ch, char *outbuf)
93 int bits;
94 char *d = outbuf;
96 if (ch < 0x80) { *d++ = ch; bits = -6; }
97 else if (ch < 0x800) { *d++ = ((ch >> 6) & 0x1f) | 0xc0; bits = 0; }
98 else if (ch < 0x10000) { *d++ = ((ch >> 12) & 0x0f) | 0xe0; bits = 6; }
99 else if (ch < 0x200000) { *d++ = ((ch >> 18) & 0x07) | 0xf0; bits = 12; }
100 else if (ch < 0x4000000) { *d++ = ((ch >> 24) & 0x03) | 0xf8; bits = 18; }
101 else if (ch < 0x80000000) { *d++ = ((ch >> 30) & 0x01) | 0xfC; bits = 24; }
102 else return 0;
104 for (; bits >= 0; bits -= 6) {
105 *d++= ((ch >> bits) & 0x3F) | 0x80;
107 return d - outbuf;
/**
 * st_utf8_validate:
 * @string:  pointer to putative UTF-8 encoded bytes
 * @max_len: number of bytes of @string to examine, or -1 to examine
 *           everything up to the terminating NUL byte
 *
 * Checks @string for being valid UTF-8.  This function is not
 * super-strict, as it will allow longer UTF-8 sequences than necessary.
 * Note that Java is capable of producing these sequences if provoked.
 * Also note, this routine checks for the 4-byte maximum size, but does
 * not check for the 0x10ffff maximum value.
 *
 * A multi-byte sequence that would extend beyond @max_len is rejected.
 * A NULL @string is only accepted when @max_len is 0.
 *
 * Return value: true if @string is valid.
 */
/* Derived from eglib, libxml2
 * Copyright (C) 2006 Novell, Inc.
 * Copyright (C) 1998-2003 Daniel Veillard
 */
bool
st_utf8_validate (const char *string, ssize_t max_len)
{
    /* Read bytes as unsigned so the masks never see a sign-extended value. */
    const unsigned char *bytes = (const unsigned char *) string;
    ssize_t ix = 0;

    if (string == NULL)
        return max_len == 0;

    if (max_len == -1)
        max_len = strlen (string);

    /*
     * input is a string of 1, 2, 3 or 4 bytes.  The valid strings
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    while (ix < max_len) {
        unsigned char c = bytes[ix];
        ssize_t seq_len;

        if ((c & 0x80) == 0x00)         /* 1-byte code, starts with 0 */
            seq_len = 1;
        else if ((c & 0xe0) == 0xc0)    /* 2-byte code, starts with 110 */
            seq_len = 2;
        else if ((c & 0xf0) == 0xe0)    /* 3-byte code, starts with 1110 */
            seq_len = 3;
        else if ((c & 0xf8) == 0xf0)    /* 4-byte code, starts with 11110 */
            seq_len = 4;
        else                            /* unknown encoding */
            return false;

        /* The whole sequence must lie inside the examined range. */
        if (seq_len > max_len - ix)
            return false;

        /* Every trailing byte must have the form 10xxxxxx. */
        for (ssize_t k = 1; k < seq_len; k++) {
            if ((bytes[ix + k] & 0xc0) != 0x80)
                return false;
        }

        ix += seq_len;
    }

    return true;
}
/**
 * st_utf8_strlen:
 * @string: a NUL-terminated sequence of UTF-8 encoded bytes
 *
 * Compute the length of a UTF-8 string in characters.  It does not do a
 * full UTF-8 check of the content of the string: trailing bytes are
 * verified to have the form 10xxxxxx and lead bytes of 0xf8 and above
 * are rejected, but a stray continuation byte in lead position is
 * counted as the start of a 2-byte sequence.
 *
 * Returns the number of characters in the string, or -1 in case of
 * error (NULL @string or a malformed sequence).
 */
/* Derived from libxml2
 * Copyright (C) 1998-2003 Daniel Veillard
 */
int
st_utf8_strlen (const char *string)
{
    /* Read bytes as unsigned so the masks never see a sign-extended value. */
    const unsigned char *s = (const unsigned char *) string;
    int ret = 0;

    if (string == NULL)
        return -1;

    while (s[0] != 0) {
        if (s[0] & 0x80) {
            /*
             * Multi-byte sequence.  Each trailing byte is inspected only
             * after the previous one was found to be a (non-NUL)
             * continuation byte, so the terminator is never overrun.
             */
            if ((s[1] & 0xc0) != 0x80)
                return -1;

            if ((s[0] & 0xe0) == 0xe0) {
                if ((s[2] & 0xc0) != 0x80)
                    return -1;

                if ((s[0] & 0xf0) == 0xf0) {
                    /* Only 11110xxx may start a 4-byte sequence. */
                    if ((s[0] & 0xf8) != 0xf0 || (s[3] & 0xc0) != 0x80)
                        return -1;
                    s += 4;
                } else {
                    s += 3;
                }
            } else {
                s += 2;
            }
        } else {
            s++;
        }
        ret++;
    }

    return ret;
}
219 const char *
220 st_utf8_offset_to_pointer (const char *string, st_uint offset)
222 const char *p = string;
224 for (st_uint i = 0; i < offset; i++)
225 p = st_utf8_next_char (p);
227 return p;
230 st_unichar *
231 st_utf8_to_ucs4 (const char *string)
233 const st_uchar *p = string;
234 st_unichar *buffer, c;
235 st_uint index = 0;
237 if (string == NULL)
238 return NULL;
240 buffer = st_malloc (sizeof (st_unichar) * (st_utf8_strlen (string) + 1));
242 while (p[0]) {
243 if ((p[0] & 0x80) == 0x00) {
244 c = p[0];
245 p += 1;
246 } else if ((p[0] & 0xe0) == 0xc0) {
247 c = ((p[0] & 0x1f) << 6) | (p[1] & 0x3f);
248 p += 2;
249 } else if ((p[0] & 0xf0) == 0xe0) {
250 c = ((p[0] & 0xf) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
251 p += 3;
252 } else if ((p[0] & 0xf8) == 0xf0) {
253 c = ((p[0] & 0x7) << 18) | ((p[1] & 0x3f) << 12) | ((p[2] & 0x3f) << 6) | (p[3] & 0x3f);
254 p += 4;
255 } else
256 break;
258 buffer[index++] = c;
261 buffer[index] = 0;
262 return buffer;
#if 0
/*
 * st_ucs4_to_utf8:
 * @string: a zero-terminated array of code points, or NULL
 *
 * Encodes @string as a newly allocated, NUL-terminated UTF-8 string
 * obtained from st_malloc().  Each code point is encoded with
 * st_unichar_to_utf8(); code points it refuses to encode (it returns 0)
 * contribute no bytes to the result.
 *
 * Kept disabled, as in the original; the previous body was an unfinished
 * copy of st_unichar_to_utf8() that did not compile.
 *
 * Returns the new string, or NULL when @string is NULL.
 */
char *
st_ucs4_to_utf8 (const st_unichar *string)
{
    const st_unichar *u;
    char scratch[6]; /* st_unichar_to_utf8() writes at most 6 bytes */
    char *buffer, *d;
    size_t size = 1; /* room for the terminating NUL */

    if (string == NULL)
        return NULL;

    /* First pass: measure the exact number of bytes required. */
    for (u = string; *u != 0; u++)
        size += st_unichar_to_utf8 (*u, scratch);

    buffer = st_malloc (size);
    /* NOTE(review): st_malloc's failure behaviour is not visible here;
     * this check is harmless if it never returns NULL. */
    if (buffer == NULL)
        return NULL;

    /* Second pass: encode straight into the result. */
    d = buffer;
    for (u = string; *u != 0; u++)
        d += st_unichar_to_utf8 (*u, d);
    *d = '\0';

    return buffer;
}
#endif