Merge branch '976-disable-assert-checks' into 'master'
[glib.git] / tests / unicode-encoding.c
blob 8ac4f9522a2a6908d385fd23a2799ae22858ec69
1 #undef G_DISABLE_ASSERT
2 #undef G_LOG_DOMAIN
4 #include <stdarg.h>
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include <glib.h>
10 static gint exit_status = 0;
/*
 * croak:
 * @format: printf-style format string; never modified
 * @...: arguments consumed by @format
 *
 * Writes the formatted message to stderr and terminates the program
 * immediately with exit status 1.  Used for errors that make it
 * pointless to continue (unreadable or malformed test data).
 */
static void
croak (const char *format, ...)
{
  va_list va;

  va_start (va, format);
  vfprintf (stderr, format, va);
  va_end (va);

  exit (1);
}
24 static void
25 fail (char *format, ...)
27 va_list va;
29 va_start (va, format);
30 vfprintf (stderr, format, va);
31 va_end (va);
33 exit_status |= 1;
/*
 * Expected classification of one UTF-8 test string, as named by the
 * status keyword in the test data file.
 */
typedef enum
{
  /* g_utf8_validate() must accept the string; all conversions are
   * exercised and must round-trip. */
  VALID = 0,

  /* g_utf8_validate() must reject the string and g_utf8_to_ucs4() must
   * report G_CONVERT_ERROR_PARTIAL_INPUT.  No UCS-4 data in the file. */
  INCOMPLETE,

  /* g_utf8_validate() must reject the string, but g_utf8_to_ucs4() is
   * still expected to produce the UCS-4 data listed in the file. */
  NOTUNICODE,

  /* g_utf8_validate() must reject the string.  No UCS-4 data in the file. */
  OVERLONG,

  /* g_utf8_validate() must reject the string.  No UCS-4 data in the file. */
  MALFORMED
} Status;
45 static gboolean
46 ucs4_equal (gunichar *a, gunichar *b)
48 while (*a && *b && (*a == *b))
50 a++;
51 b++;
54 return (*a == *b);
57 static gboolean
58 utf16_equal (gunichar2 *a, gunichar2 *b)
60 while (*a && *b && (*a == *b))
62 a++;
63 b++;
66 return (*a == *b);
69 static gint
70 utf16_count (gunichar2 *a)
72 gint result = 0;
74 while (a[result])
75 result++;
77 return result;
80 static void
81 print_ucs4 (const gchar *prefix, gunichar *ucs4, gint ucs4_len)
83 gint i;
84 g_print ("%s ", prefix);
85 for (i = 0; i < ucs4_len; i++)
86 g_print ("%x ", ucs4[i]);
87 g_print ("\n");
90 static void
91 process (gint line,
92 gchar *utf8,
93 Status status,
94 gunichar *ucs4,
95 gint ucs4_len)
97 const gchar *end;
98 gboolean is_valid = g_utf8_validate (utf8, -1, &end);
99 GError *error = NULL;
100 glong items_read, items_written;
102 switch (status)
104 case VALID:
105 if (!is_valid)
107 fail ("line %d: valid but g_utf8_validate returned FALSE\n", line);
108 return;
110 break;
111 case NOTUNICODE:
112 case INCOMPLETE:
113 case OVERLONG:
114 case MALFORMED:
115 if (is_valid)
117 fail ("line %d: invalid but g_utf8_validate returned TRUE\n", line);
118 return;
120 break;
123 if (status == INCOMPLETE)
125 gunichar *ucs4_result;
127 ucs4_result = g_utf8_to_ucs4 (utf8, -1, NULL, NULL, &error);
129 if (!error || !g_error_matches (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT))
131 fail ("line %d: incomplete input not properly detected\n", line);
132 return;
134 g_clear_error (&error);
136 ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, NULL, &error);
138 if (!ucs4_result || items_read == strlen (utf8))
140 fail ("line %d: incomplete input not properly detected\n", line);
141 return;
144 g_free (ucs4_result);
147 if (status == VALID || status == NOTUNICODE)
149 gunichar *ucs4_result;
151 ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, &items_written, &error);
152 if (!ucs4_result)
154 fail ("line %d: conversion with status %d to ucs4 failed: %s\n", line, status, error->message);
155 return;
158 if (!ucs4_equal (ucs4_result, ucs4) ||
159 items_read != strlen (utf8) ||
160 items_written != ucs4_len)
162 fail ("line %d: results of conversion with status %d to ucs4 do not match expected.\n", line, status);
163 print_ucs4 ("expected: ", ucs4, ucs4_len);
164 print_ucs4 ("received: ", ucs4_result, items_written);
165 return;
168 g_free (ucs4_result);
171 if (status == VALID)
173 gunichar *ucs4_result;
174 gchar *utf8_result;
176 ucs4_result = g_utf8_to_ucs4_fast (utf8, -1, &items_written);
178 if (!ucs4_equal (ucs4_result, ucs4) ||
179 items_written != ucs4_len)
181 fail ("line %d: results of fast conversion with status %d to ucs4 do not match expected.\n", line, status);
182 print_ucs4 ("expected: ", ucs4, ucs4_len);
183 print_ucs4 ("received: ", ucs4_result, items_written);
184 return;
187 utf8_result = g_ucs4_to_utf8 (ucs4_result, -1, &items_read, &items_written, &error);
188 if (!utf8_result)
190 fail ("line %d: conversion back to utf8 failed: %s", line, error->message);
191 return;
194 if (strcmp (utf8_result, utf8) != 0 ||
195 items_read != ucs4_len ||
196 items_written != strlen (utf8))
198 fail ("line %d: conversion back to utf8 did not match original\n", line);
199 return;
202 g_free (utf8_result);
203 g_free (ucs4_result);
206 if (status == VALID)
208 gunichar2 *utf16_expected_tmp;
209 gunichar2 *utf16_expected;
210 gunichar2 *utf16_from_utf8;
211 gunichar2 *utf16_from_ucs4;
212 gunichar *ucs4_result;
213 gsize bytes_written;
214 gint n_chars;
215 gchar *utf8_result;
217 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
218 #define TARGET "UTF-16LE"
219 #else
220 #define TARGET "UTF-16"
221 #endif
223 if (!(utf16_expected_tmp = (gunichar2 *)g_convert (utf8, -1, TARGET, "UTF-8",
224 NULL, &bytes_written, NULL)))
226 fail ("line %d: could not convert to UTF-16 via g_convert\n", line);
227 return;
230 /* zero-terminate and remove BOM
232 n_chars = bytes_written / 2;
233 if (utf16_expected_tmp[0] == 0xfeff) /* BOM */
235 n_chars--;
236 utf16_expected = g_new (gunichar2, n_chars + 1);
237 memcpy (utf16_expected, utf16_expected_tmp + 1, sizeof(gunichar2) * n_chars);
239 else if (utf16_expected_tmp[0] == 0xfffe) /* ANTI-BOM */
241 fail ("line %d: conversion via iconv to \"UTF-16\" is not native-endian\n", line);
242 return;
244 else
246 utf16_expected = g_new (gunichar2, n_chars + 1);
247 memcpy (utf16_expected, utf16_expected_tmp, sizeof(gunichar2) * n_chars);
250 utf16_expected[n_chars] = '\0';
252 if (!(utf16_from_utf8 = g_utf8_to_utf16 (utf8, -1, &items_read, &items_written, &error)))
254 fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message);
255 return;
258 if (items_read != strlen (utf8) ||
259 utf16_count (utf16_from_utf8) != items_written)
261 fail ("line %d: length error in conversion to ucs16\n", line);
262 return;
265 if (!(utf16_from_ucs4 = g_ucs4_to_utf16 (ucs4, -1, &items_read, &items_written, &error)))
267 fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message);
268 return;
271 if (items_read != ucs4_len ||
272 utf16_count (utf16_from_ucs4) != items_written)
274 fail ("line %d: length error in conversion to ucs16\n", line);
275 return;
278 if (!utf16_equal (utf16_from_utf8, utf16_expected) ||
279 !utf16_equal (utf16_from_ucs4, utf16_expected))
281 fail ("line %d: results of conversion to ucs16 do not match\n", line);
282 return;
285 if (!(utf8_result = g_utf16_to_utf8 (utf16_from_utf8, -1, &items_read, &items_written, &error)))
287 fail ("line %d: conversion back to utf8 failed: %s\n", line, error->message);
288 return;
291 if (items_read != utf16_count (utf16_from_utf8) ||
292 items_written != strlen (utf8))
294 fail ("line %d: length error in conversion from ucs16 to utf8\n", line);
295 return;
298 if (!(ucs4_result = g_utf16_to_ucs4 (utf16_from_ucs4, -1, &items_read, &items_written, &error)))
300 fail ("line %d: conversion back to utf8/ucs4 failed\n", line);
301 return;
304 if (items_read != utf16_count (utf16_from_utf8) ||
305 items_written != ucs4_len)
307 fail ("line %d: length error in conversion from ucs16 to ucs4\n", line);
308 return;
311 if (strcmp (utf8, utf8_result) != 0 ||
312 !ucs4_equal (ucs4, ucs4_result))
314 fail ("line %d: conversion back to utf8/ucs4 did not match original\n", line);
315 return;
318 g_free (utf16_expected_tmp);
319 g_free (utf16_expected);
320 g_free (utf16_from_utf8);
321 g_free (utf16_from_ucs4);
322 g_free (utf8_result);
323 g_free (ucs4_result);
328 main (int argc, char **argv)
330 gchar *testfile;
331 gchar *contents;
332 GError *error = NULL;
333 gchar *p, *end;
334 char *tmp;
335 gint state = 0;
336 gint line = 1;
337 gint start_line = 0; /* Quiet GCC */
338 gchar *utf8 = NULL; /* Quiet GCC */
339 GArray *ucs4;
340 Status status = VALID; /* Quiet GCC */
342 g_test_init (&argc, &argv, NULL);
344 testfile = g_test_build_filename (G_TEST_DIST, "utf8.txt", NULL);
346 g_file_get_contents (testfile, &contents, NULL, &error);
347 if (error)
348 croak ("Cannot open utf8.txt: %s", error->message);
350 ucs4 = g_array_new (TRUE, FALSE, sizeof(gunichar));
352 p = contents;
354 /* Loop over lines */
355 while (*p)
357 while (*p && (*p == ' ' || *p == '\t'))
358 p++;
360 end = p;
361 while (*end && (*end != '\r' && *end != '\n'))
362 end++;
364 if (!*p || *p == '#' || *p == '\r' || *p == '\n')
365 goto next_line;
367 tmp = g_strstrip (g_strndup (p, end - p));
369 switch (state)
371 case 0:
372 /* UTF-8 string */
373 start_line = line;
374 utf8 = tmp;
375 tmp = NULL;
376 break;
378 case 1:
379 /* Status */
380 if (!strcmp (tmp, "VALID"))
381 status = VALID;
382 else if (!strcmp (tmp, "INCOMPLETE"))
383 status = INCOMPLETE;
384 else if (!strcmp (tmp, "NOTUNICODE"))
385 status = NOTUNICODE;
386 else if (!strcmp (tmp, "OVERLONG"))
387 status = OVERLONG;
388 else if (!strcmp (tmp, "MALFORMED"))
389 status = MALFORMED;
390 else
391 croak ("Invalid status on line %d\n", line);
393 if (status != VALID && status != NOTUNICODE)
394 state++; /* No UCS-4 data */
396 break;
398 case 2:
399 /* UCS-4 version */
401 p = strtok (tmp, " \t");
402 while (p)
404 gchar *endptr;
406 gunichar ch = strtoul (p, &endptr, 16);
407 if (*endptr != '\0')
408 croak ("Invalid UCS-4 character on line %d\n", line);
410 g_array_append_val (ucs4, ch);
412 p = strtok (NULL, " \t");
415 break;
418 g_free (tmp);
419 state = (state + 1) % 3;
421 if (state == 0)
423 process (start_line, utf8, status, (gunichar *)ucs4->data, ucs4->len);
424 g_array_set_size (ucs4, 0);
425 g_free (utf8);
428 next_line:
429 p = end;
430 if (*p && *p == '\r')
431 p++;
432 if (*p && *p == '\n')
433 p++;
435 line++;
438 g_free (testfile);
439 g_array_free (ucs4, TRUE);
440 g_free (contents);
441 return exit_status;