1 /* GLIB - Library of useful routines for C programming
2 * Copyright (C) 2001 Matthias Clasen <matthiasc@poet.de>
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
/* Evaluates to non-zero when Char is accepted as a valid code point:
 *  - it is below 0x110000,
 *  - it is outside the surrogate range 0xD800-0xDFFF,
 *  - it is outside the range 0xFDD0-0xFDEF,
 *  - its low 16 bits are neither 0xFFFE nor 0xFFFF.
 *
 * Char is expanded several times, so the argument must not have side
 * effects. */
#define UNICODE_VALID(Char)                   \
    ((Char) < 0x110000 &&                     \
     (((Char) & 0xFFFFF800) != 0xD800) &&     \
     ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
     (((Char) & 0xFFFE) != 0xFFFE))
36 /* some tests to check max_len handling */
38 { "abcde", -1, 5, TRUE
},
39 { "abcde", 3, 3, TRUE
},
40 { "abcde", 5, 5, TRUE
},
41 { "abcde", 7, 5, FALSE
},
43 { "\xc2\xa9\xc2\xa9\xc2\xa9", -1, 6, TRUE
},
44 { "\xc2\xa9\xc2\xa9\xc2\xa9", 1, 0, FALSE
},
45 { "\xc2\xa9\xc2\xa9\xc2\xa9", 2, 2, TRUE
},
46 { "\xc2\xa9\xc2\xa9\xc2\xa9", 3, 2, FALSE
},
47 { "\xc2\xa9\xc2\xa9\xc2\xa9", 4, 4, TRUE
},
48 { "\xc2\xa9\xc2\xa9\xc2\xa9", 5, 4, FALSE
},
49 { "\xc2\xa9\xc2\xa9\xc2\xa9", 6, 6, TRUE
},
50 { "\xc2\xa9\xc2\xa9\xc2\xa9", 7, 6, FALSE
},
52 { "\xe2\x89\xa0\xe2\x89\xa0", -1, 6, TRUE
},
53 { "\xe2\x89\xa0\xe2\x89\xa0", 1, 0, FALSE
},
54 { "\xe2\x89\xa0\xe2\x89\xa0", 2, 0, FALSE
},
55 { "\xe2\x89\xa0\xe2\x89\xa0", 3, 3, TRUE
},
56 { "\xe2\x89\xa0\xe2\x89\xa0", 4, 3, FALSE
},
57 { "\xe2\x89\xa0\xe2\x89\xa0", 5, 3, FALSE
},
58 { "\xe2\x89\xa0\xe2\x89\xa0", 6, 6, TRUE
},
59 { "\xe2\x89\xa0\xe2\x89\xa0", 7, 6, FALSE
},
61 /* examples from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt */
63 { "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5", -1, 11, TRUE
},
64 /* first sequence of each length */
65 { "\x00", -1, 0, TRUE
},
66 { "\xc2\x80", -1, 2, TRUE
},
67 { "\xe0\xa0\x80", -1, 3, TRUE
},
68 { "\xf0\x90\x80\x80", -1, 4, TRUE
},
69 { "\xf8\x88\x80\x80\x80", -1, 0, FALSE
},
70 { "\xfc\x84\x80\x80\x80\x80", -1, 0, FALSE
},
71 /* last sequence of each length */
72 { "\x7f", -1, 1, TRUE
},
73 { "\xdf\xbf", -1, 2, TRUE
},
74 { "\xef\xbf\xbf", -1, 3, TRUE
},
75 { "\xf7\xbf\xbf\xbf", -1, 0, FALSE
},
76 { "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE
},
77 { "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE
},
78 /* other boundary conditions */
79 { "\xed\x9f\xbf", -1, 3, TRUE
},
80 { "\xee\x80\x80", -1, 3, TRUE
},
81 { "\xef\xbf\xbd", -1, 3, TRUE
},
82 { "\xf4\x8f\xbf\xbf", -1, 4, TRUE
},
83 { "\xf4\x90\x80\x80", -1, 0, FALSE
},
84 /* malformed sequences */
85 /* continuation bytes */
86 { "\x80", -1, 0, FALSE
},
87 { "\xbf", -1, 0, FALSE
},
88 { "\xbf\x80", -1, 0, FALSE
},
89 { "\x80\xbf", -1, 0, FALSE
},
90 { "\x80\xbf\x80", -1, 0, FALSE
},
91 { "\x80\xbf\x80\xbf", -1, 0, FALSE
},
92 { "\x80\xbf\x80\xbf\x80", -1, 0, FALSE
},
93 { "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE
},
94 { "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE
},
96 /* all possible continuation byte */
97 { "\x80", -1, 0, FALSE
},
98 { "\x81", -1, 0, FALSE
},
99 { "\x82", -1, 0, FALSE
},
100 { "\x83", -1, 0, FALSE
},
101 { "\x84", -1, 0, FALSE
},
102 { "\x85", -1, 0, FALSE
},
103 { "\x86", -1, 0, FALSE
},
104 { "\x87", -1, 0, FALSE
},
105 { "\x88", -1, 0, FALSE
},
106 { "\x89", -1, 0, FALSE
},
107 { "\x8a", -1, 0, FALSE
},
108 { "\x8b", -1, 0, FALSE
},
109 { "\x8c", -1, 0, FALSE
},
110 { "\x8d", -1, 0, FALSE
},
111 { "\x8e", -1, 0, FALSE
},
112 { "\x8f", -1, 0, FALSE
},
113 { "\x90", -1, 0, FALSE
},
114 { "\x91", -1, 0, FALSE
},
115 { "\x92", -1, 0, FALSE
},
116 { "\x93", -1, 0, FALSE
},
117 { "\x94", -1, 0, FALSE
},
118 { "\x95", -1, 0, FALSE
},
119 { "\x96", -1, 0, FALSE
},
120 { "\x97", -1, 0, FALSE
},
121 { "\x98", -1, 0, FALSE
},
122 { "\x99", -1, 0, FALSE
},
123 { "\x9a", -1, 0, FALSE
},
124 { "\x9b", -1, 0, FALSE
},
125 { "\x9c", -1, 0, FALSE
},
126 { "\x9d", -1, 0, FALSE
},
127 { "\x9e", -1, 0, FALSE
},
128 { "\x9f", -1, 0, FALSE
},
129 { "\xa0", -1, 0, FALSE
},
130 { "\xa1", -1, 0, FALSE
},
131 { "\xa2", -1, 0, FALSE
},
132 { "\xa3", -1, 0, FALSE
},
133 { "\xa4", -1, 0, FALSE
},
134 { "\xa5", -1, 0, FALSE
},
135 { "\xa6", -1, 0, FALSE
},
136 { "\xa7", -1, 0, FALSE
},
137 { "\xa8", -1, 0, FALSE
},
138 { "\xa9", -1, 0, FALSE
},
139 { "\xaa", -1, 0, FALSE
},
140 { "\xab", -1, 0, FALSE
},
141 { "\xac", -1, 0, FALSE
},
142 { "\xad", -1, 0, FALSE
},
143 { "\xae", -1, 0, FALSE
},
144 { "\xaf", -1, 0, FALSE
},
145 { "\xb0", -1, 0, FALSE
},
146 { "\xb1", -1, 0, FALSE
},
147 { "\xb2", -1, 0, FALSE
},
148 { "\xb3", -1, 0, FALSE
},
149 { "\xb4", -1, 0, FALSE
},
150 { "\xb5", -1, 0, FALSE
},
151 { "\xb6", -1, 0, FALSE
},
152 { "\xb7", -1, 0, FALSE
},
153 { "\xb8", -1, 0, FALSE
},
154 { "\xb9", -1, 0, FALSE
},
155 { "\xba", -1, 0, FALSE
},
156 { "\xbb", -1, 0, FALSE
},
157 { "\xbc", -1, 0, FALSE
},
158 { "\xbd", -1, 0, FALSE
},
159 { "\xbe", -1, 0, FALSE
},
160 { "\xbf", -1, 0, FALSE
},
161 /* lone start characters */
162 { "\xc0\x20", -1, 0, FALSE
},
163 { "\xc1\x20", -1, 0, FALSE
},
164 { "\xc2\x20", -1, 0, FALSE
},
165 { "\xc3\x20", -1, 0, FALSE
},
166 { "\xc4\x20", -1, 0, FALSE
},
167 { "\xc5\x20", -1, 0, FALSE
},
168 { "\xc6\x20", -1, 0, FALSE
},
169 { "\xc7\x20", -1, 0, FALSE
},
170 { "\xc8\x20", -1, 0, FALSE
},
171 { "\xc9\x20", -1, 0, FALSE
},
172 { "\xca\x20", -1, 0, FALSE
},
173 { "\xcb\x20", -1, 0, FALSE
},
174 { "\xcc\x20", -1, 0, FALSE
},
175 { "\xcd\x20", -1, 0, FALSE
},
176 { "\xce\x20", -1, 0, FALSE
},
177 { "\xcf\x20", -1, 0, FALSE
},
178 { "\xd0\x20", -1, 0, FALSE
},
179 { "\xd1\x20", -1, 0, FALSE
},
180 { "\xd2\x20", -1, 0, FALSE
},
181 { "\xd3\x20", -1, 0, FALSE
},
182 { "\xd4\x20", -1, 0, FALSE
},
183 { "\xd5\x20", -1, 0, FALSE
},
184 { "\xd6\x20", -1, 0, FALSE
},
185 { "\xd7\x20", -1, 0, FALSE
},
186 { "\xd8\x20", -1, 0, FALSE
},
187 { "\xd9\x20", -1, 0, FALSE
},
188 { "\xda\x20", -1, 0, FALSE
},
189 { "\xdb\x20", -1, 0, FALSE
},
190 { "\xdc\x20", -1, 0, FALSE
},
191 { "\xdd\x20", -1, 0, FALSE
},
192 { "\xde\x20", -1, 0, FALSE
},
193 { "\xdf\x20", -1, 0, FALSE
},
194 { "\xe0\x20", -1, 0, FALSE
},
195 { "\xe1\x20", -1, 0, FALSE
},
196 { "\xe2\x20", -1, 0, FALSE
},
197 { "\xe3\x20", -1, 0, FALSE
},
198 { "\xe4\x20", -1, 0, FALSE
},
199 { "\xe5\x20", -1, 0, FALSE
},
200 { "\xe6\x20", -1, 0, FALSE
},
201 { "\xe7\x20", -1, 0, FALSE
},
202 { "\xe8\x20", -1, 0, FALSE
},
203 { "\xe9\x20", -1, 0, FALSE
},
204 { "\xea\x20", -1, 0, FALSE
},
205 { "\xeb\x20", -1, 0, FALSE
},
206 { "\xec\x20", -1, 0, FALSE
},
207 { "\xed\x20", -1, 0, FALSE
},
208 { "\xee\x20", -1, 0, FALSE
},
209 { "\xef\x20", -1, 0, FALSE
},
210 { "\xf0\x20", -1, 0, FALSE
},
211 { "\xf1\x20", -1, 0, FALSE
},
212 { "\xf2\x20", -1, 0, FALSE
},
213 { "\xf3\x20", -1, 0, FALSE
},
214 { "\xf4\x20", -1, 0, FALSE
},
215 { "\xf5\x20", -1, 0, FALSE
},
216 { "\xf6\x20", -1, 0, FALSE
},
217 { "\xf7\x20", -1, 0, FALSE
},
218 { "\xf8\x20", -1, 0, FALSE
},
219 { "\xf9\x20", -1, 0, FALSE
},
220 { "\xfa\x20", -1, 0, FALSE
},
221 { "\xfb\x20", -1, 0, FALSE
},
222 { "\xfc\x20", -1, 0, FALSE
},
223 { "\xfd\x20", -1, 0, FALSE
},
224 /* missing continuation bytes */
225 { "\x20\xc0", -1, 1, FALSE
},
226 { "\x20\xe0\x80", -1, 1, FALSE
},
227 { "\x20\xf0\x80\x80", -1, 1, FALSE
},
228 { "\x20\xf8\x80\x80\x80", -1, 1, FALSE
},
229 { "\x20\xfc\x80\x80\x80\x80", -1, 1, FALSE
},
230 { "\x20\xdf", -1, 1, FALSE
},
231 { "\x20\xef\xbf", -1, 1, FALSE
},
232 { "\x20\xf7\xbf\xbf", -1, 1, FALSE
},
233 { "\x20\xfb\xbf\xbf\xbf", -1, 1, FALSE
},
234 { "\x20\xfd\xbf\xbf\xbf\xbf", -1, 1, FALSE
},
235 /* impossible bytes */
236 { "\x20\xfe\x20", -1, 1, FALSE
},
237 { "\x20\xff\x20", -1, 1, FALSE
},
238 /* overlong sequences */
239 { "\x20\xc0\xaf\x20", -1, 1, FALSE
},
240 { "\x20\xe0\x80\xaf\x20", -1, 1, FALSE
},
241 { "\x20\xf0\x80\x80\xaf\x20", -1, 1, FALSE
},
242 { "\x20\xf8\x80\x80\x80\xaf\x20", -1, 1, FALSE
},
243 { "\x20\xfc\x80\x80\x80\x80\xaf\x20", -1, 1, FALSE
},
244 { "\x20\xc1\xbf\x20", -1, 1, FALSE
},
245 { "\x20\xe0\x9f\xbf\x20", -1, 1, FALSE
},
246 { "\x20\xf0\x8f\xbf\xbf\x20", -1, 1, FALSE
},
247 { "\x20\xf8\x87\xbf\xbf\xbf\x20", -1, 1, FALSE
},
248 { "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20", -1, 1, FALSE
},
249 { "\x20\xc0\x80\x20", -1, 1, FALSE
},
250 { "\x20\xe0\x80\x80\x20", -1, 1, FALSE
},
251 { "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE
},
252 { "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE
},
253 { "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE
},
254 /* illegal code positions */
255 { "\x20\xed\xa0\x80\x20", -1, 1, FALSE
},
256 { "\x20\xed\xad\xbf\x20", -1, 1, FALSE
},
257 { "\x20\xed\xae\x80\x20", -1, 1, FALSE
},
258 { "\x20\xed\xaf\xbf\x20", -1, 1, FALSE
},
259 { "\x20\xed\xb0\x80\x20", -1, 1, FALSE
},
260 { "\x20\xed\xbe\x80\x20", -1, 1, FALSE
},
261 { "\x20\xed\xbf\xbf\x20", -1, 1, FALSE
},
262 { "\x20\xed\xa0\x80\xed\xb0\x80\x20", -1, 1, FALSE
},
263 { "\x20\xed\xa0\x80\xed\xbf\xbf\x20", -1, 1, FALSE
},
264 { "\x20\xed\xad\xbf\xed\xb0\x80\x20", -1, 1, FALSE
},
265 { "\x20\xed\xad\xbf\xed\xbf\xbf\x20", -1, 1, FALSE
},
266 { "\x20\xed\xae\x80\xed\xb0\x80\x20", -1, 1, FALSE
},
267 { "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE
},
268 { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE
},
269 { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE
},
275 do_test (gconstpointer d
)
277 const Test
*test
= d
;
281 result
= g_utf8_validate (test
->text
, test
->max_len
, &end
);
283 g_assert (result
== test
->valid
);
284 g_assert (end
- test
->text
== test
->offset
);
286 if (test
->max_len
< 0)
288 result
= g_utf8_validate (test
->text
, strlen (test
->text
), &end
);
290 g_assert (result
== test
->valid
);
291 g_assert (end
- test
->text
== test
->offset
);
295 /* Test the behaviour of g_utf8_get_char_validated() with various inputs and
296 * length restrictions. */
298 test_utf8_get_char_validated (void)
303 gunichar expected_result
;
306 { "\xC0\x00_45678", 8, (gunichar
) -2 },
307 { "\xC0\x00_45678", -1, (gunichar
) -2 },
308 /* It seems odd that the return value differs with the length input, but
309 * that’s how it’s documented: */
310 { "", 0, (gunichar
) -2 },
311 { "", -1, (gunichar
) 0 },
313 { "hello", 5, (gunichar
) 'h' },
314 { "hello", -1, (gunichar
) 'h' },
315 { "\xD8\x9F", 2, 0x061F },
316 { "\xD8\x9F", -1, 0x061F },
317 { "\xD8\x9Fmore", 6, 0x061F },
318 { "\xD8\x9Fmore", -1, 0x061F },
319 { "\xE2\x96\xB3", 3, 0x25B3 },
320 { "\xE2\x96\xB3", -1, 0x25B3 },
321 { "\xE2\x96\xB3more", 7, 0x25B3 },
322 { "\xE2\x96\xB3more", -1, 0x25B3 },
323 { "\xF0\x9F\x92\xA9", 4, 0x1F4A9 },
324 { "\xF0\x9F\x92\xA9", -1, 0x1F4A9 },
325 { "\xF0\x9F\x92\xA9more", 8, 0x1F4A9 },
326 { "\xF0\x9F\x92\xA9more", -1, 0x1F4A9 },
327 /* Partial unichars: */
328 { "\xD8", -1, (gunichar
) -2 },
329 { "\xD8\x9F", 1, (gunichar
) -2 },
330 { "\xCE", -1, (gunichar
) -2 },
331 { "\xCE", 1, (gunichar
) -2 },
335 for (i
= 0; i
< G_N_ELEMENTS (test_vectors
); i
++)
337 gunichar actual_result
;
339 g_test_message ("Vector %" G_GSIZE_FORMAT
, i
);
340 actual_result
= g_utf8_get_char_validated (test_vectors
[i
].buf
,
341 test_vectors
[i
].max_len
);
342 g_assert_cmpint (actual_result
, ==, test_vectors
[i
].expected_result
);
347 main (int argc
, char *argv
[])
352 g_test_init (&argc
, &argv
, NULL
);
354 for (i
= 0; test
[i
].text
; i
++)
356 path
= g_strdup_printf ("/utf8/validate/%d", i
);
357 g_test_add_data_func (path
, &test
[i
], do_test
);
361 g_test_add_func ("/utf8/get-char-validated", test_utf8_get_char_validated
);
363 return g_test_run ();