glib/tests/utf8-validate.c

   1 /* GLIB - Library of useful routines for C programming
   2  * Copyright (C) 2001 Matthias Clasen <matthiasc@poet.de>
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2.1 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  16  */
  17
  18 #include "glib.h"
  19 #include <string.h>
  20
  21 #define UNICODE_VALID(Char)                   \
  22     ((Char) < 0x110000 &&                     \
  23      (((Char) & 0xFFFFF800) != 0xD800) &&     \
  24      ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
  25      ((Char) & 0xFFFE) != 0xFFFE)
  26
  27
/* One g_utf8_validate() test case, as consumed by do_test(). */
typedef struct {
  const gchar *text;  /* input string passed to g_utf8_validate() */
  gint max_len;       /* max_len argument for the call; when negative, do_test()
                       * repeats the check with strlen (text) as the length */
  gint offset;        /* expected distance from text to the returned end pointer */
  gboolean valid;     /* expected return value of g_utf8_validate() */
} Test;
  34
  35 Test test[] = {
  36   /* some tests to check max_len handling */
  37   /* length 1 */
  38   { "abcde", -1, 5, TRUE },
  39   { "abcde", 3, 3, TRUE },
  40   { "abcde", 5, 5, TRUE },
  41   { "abcde", 7, 5, FALSE },
  42   /* length 2 */
  43   { "\xc2\xa9\xc2\xa9\xc2\xa9", -1, 6, TRUE },
  44   { "\xc2\xa9\xc2\xa9\xc2\xa9",  1, 0, FALSE },
  45   { "\xc2\xa9\xc2\xa9\xc2\xa9",  2, 2, TRUE },
  46   { "\xc2\xa9\xc2\xa9\xc2\xa9",  3, 2, FALSE },
  47   { "\xc2\xa9\xc2\xa9\xc2\xa9",  4, 4, TRUE },
  48   { "\xc2\xa9\xc2\xa9\xc2\xa9",  5, 4, FALSE },
  49   { "\xc2\xa9\xc2\xa9\xc2\xa9",  6, 6, TRUE },
  50   { "\xc2\xa9\xc2\xa9\xc2\xa9",  7, 6, FALSE },
  51   /* length 3 */
  52   { "\xe2\x89\xa0\xe2\x89\xa0", -1, 6, TRUE },
  53   { "\xe2\x89\xa0\xe2\x89\xa0",  1, 0, FALSE },
  54   { "\xe2\x89\xa0\xe2\x89\xa0",  2, 0, FALSE },
  55   { "\xe2\x89\xa0\xe2\x89\xa0",  3, 3, TRUE },
  56   { "\xe2\x89\xa0\xe2\x89\xa0",  4, 3, FALSE },
  57   { "\xe2\x89\xa0\xe2\x89\xa0",  5, 3, FALSE },
  58   { "\xe2\x89\xa0\xe2\x89\xa0",  6, 6, TRUE },
  59   { "\xe2\x89\xa0\xe2\x89\xa0",  7, 6, FALSE },
  60
  61   /* examples from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt */
  62   /* greek 'kosme' */
  63   { "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5", -1, 11, TRUE },
  64   /* first sequence of each length */
  65   { "\x00", -1, 0, TRUE },
  66   { "\xc2\x80", -1, 2, TRUE },
  67   { "\xe0\xa0\x80", -1, 3, TRUE },
  68   { "\xf0\x90\x80\x80", -1, 4, TRUE },
  69   { "\xf8\x88\x80\x80\x80", -1, 0, FALSE },
  70   { "\xfc\x84\x80\x80\x80\x80", -1, 0, FALSE },
  71   /* last sequence of each length */
  72   { "\x7f", -1, 1, TRUE },
  73   { "\xdf\xbf", -1, 2, TRUE },
  74   { "\xef\xbf\xbf", -1, 3, TRUE },
  75   { "\xf7\xbf\xbf\xbf", -1, 0, FALSE },
  76   { "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE },
  77   { "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE },
  78   /* other boundary conditions */
  79   { "\xed\x9f\xbf", -1, 3, TRUE },
  80   { "\xee\x80\x80", -1, 3, TRUE },
  81   { "\xef\xbf\xbd", -1, 3, TRUE },
  82   { "\xf4\x8f\xbf\xbf", -1, 4, TRUE },
  83   { "\xf4\x90\x80\x80", -1, 0, FALSE },
  84   /* malformed sequences */
  85   /* continuation bytes */
  86   { "\x80", -1, 0, FALSE },
  87   { "\xbf", -1, 0, FALSE },
  88   { "\xbf\x80", -1, 0, FALSE },
  89   { "\x80\xbf", -1, 0, FALSE },
  90   { "\x80\xbf\x80", -1, 0, FALSE },
  91   { "\x80\xbf\x80\xbf", -1, 0, FALSE },
  92   { "\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
  93   { "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE },
  94   { "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
  95
  96   /* all possible continuation byte */
  97   { "\x80", -1, 0, FALSE },
  98   { "\x81", -1, 0, FALSE },
  99   { "\x82", -1, 0, FALSE },
 100   { "\x83", -1, 0, FALSE },
 101   { "\x84", -1, 0, FALSE },
 102   { "\x85", -1, 0, FALSE },
 103   { "\x86", -1, 0, FALSE },
 104   { "\x87", -1, 0, FALSE },
 105   { "\x88", -1, 0, FALSE },
 106   { "\x89", -1, 0, FALSE },
 107   { "\x8a", -1, 0, FALSE },
 108   { "\x8b", -1, 0, FALSE },
 109   { "\x8c", -1, 0, FALSE },
 110   { "\x8d", -1, 0, FALSE },
 111   { "\x8e", -1, 0, FALSE },
 112   { "\x8f", -1, 0, FALSE },
 113   { "\x90", -1, 0, FALSE },
 114   { "\x91", -1, 0, FALSE },
 115   { "\x92", -1, 0, FALSE },
 116   { "\x93", -1, 0, FALSE },
 117   { "\x94", -1, 0, FALSE },
 118   { "\x95", -1, 0, FALSE },
 119   { "\x96", -1, 0, FALSE },
 120   { "\x97", -1, 0, FALSE },
 121   { "\x98", -1, 0, FALSE },
 122   { "\x99", -1, 0, FALSE },
 123   { "\x9a", -1, 0, FALSE },
 124   { "\x9b", -1, 0, FALSE },
 125   { "\x9c", -1, 0, FALSE },
 126   { "\x9d", -1, 0, FALSE },
 127   { "\x9e", -1, 0, FALSE },
 128   { "\x9f", -1, 0, FALSE },
 129   { "\xa0", -1, 0, FALSE },
 130   { "\xa1", -1, 0, FALSE },
 131   { "\xa2", -1, 0, FALSE },
 132   { "\xa3", -1, 0, FALSE },
 133   { "\xa4", -1, 0, FALSE },
 134   { "\xa5", -1, 0, FALSE },
 135   { "\xa6", -1, 0, FALSE },
 136   { "\xa7", -1, 0, FALSE },
 137   { "\xa8", -1, 0, FALSE },
 138   { "\xa9", -1, 0, FALSE },
 139   { "\xaa", -1, 0, FALSE },
 140   { "\xab", -1, 0, FALSE },
 141   { "\xac", -1, 0, FALSE },
 142   { "\xad", -1, 0, FALSE },
 143   { "\xae", -1, 0, FALSE },
 144   { "\xaf", -1, 0, FALSE },
 145   { "\xb0", -1, 0, FALSE },
 146   { "\xb1", -1, 0, FALSE },
 147   { "\xb2", -1, 0, FALSE },
 148   { "\xb3", -1, 0, FALSE },
 149   { "\xb4", -1, 0, FALSE },
 150   { "\xb5", -1, 0, FALSE },
 151   { "\xb6", -1, 0, FALSE },
 152   { "\xb7", -1, 0, FALSE },
 153   { "\xb8", -1, 0, FALSE },
 154   { "\xb9", -1, 0, FALSE },
 155   { "\xba", -1, 0, FALSE },
 156   { "\xbb", -1, 0, FALSE },
 157   { "\xbc", -1, 0, FALSE },
 158   { "\xbd", -1, 0, FALSE },
 159   { "\xbe", -1, 0, FALSE },
 160   { "\xbf", -1, 0, FALSE },
 161   /* lone start characters */
 162   { "\xc0\x20", -1, 0, FALSE },
 163   { "\xc1\x20", -1, 0, FALSE },
 164   { "\xc2\x20", -1, 0, FALSE },
 165   { "\xc3\x20", -1, 0, FALSE },
 166   { "\xc4\x20", -1, 0, FALSE },
 167   { "\xc5\x20", -1, 0, FALSE },
 168   { "\xc6\x20", -1, 0, FALSE },
 169   { "\xc7\x20", -1, 0, FALSE },
 170   { "\xc8\x20", -1, 0, FALSE },
 171   { "\xc9\x20", -1, 0, FALSE },
 172   { "\xca\x20", -1, 0, FALSE },
 173   { "\xcb\x20", -1, 0, FALSE },
 174   { "\xcc\x20", -1, 0, FALSE },
 175   { "\xcd\x20", -1, 0, FALSE },
 176   { "\xce\x20", -1, 0, FALSE },
 177   { "\xcf\x20", -1, 0, FALSE },
 178   { "\xd0\x20", -1, 0, FALSE },
 179   { "\xd1\x20", -1, 0, FALSE },
 180   { "\xd2\x20", -1, 0, FALSE },
 181   { "\xd3\x20", -1, 0, FALSE },
 182   { "\xd4\x20", -1, 0, FALSE },
 183   { "\xd5\x20", -1, 0, FALSE },
 184   { "\xd6\x20", -1, 0, FALSE },
 185   { "\xd7\x20", -1, 0, FALSE },
 186   { "\xd8\x20", -1, 0, FALSE },
 187   { "\xd9\x20", -1, 0, FALSE },
 188   { "\xda\x20", -1, 0, FALSE },
 189   { "\xdb\x20", -1, 0, FALSE },
 190   { "\xdc\x20", -1, 0, FALSE },
 191   { "\xdd\x20", -1, 0, FALSE },
 192   { "\xde\x20", -1, 0, FALSE },
 193   { "\xdf\x20", -1, 0, FALSE },
 194   { "\xe0\x20", -1, 0, FALSE },
 195   { "\xe1\x20", -1, 0, FALSE },
 196   { "\xe2\x20", -1, 0, FALSE },
 197   { "\xe3\x20", -1, 0, FALSE },
 198   { "\xe4\x20", -1, 0, FALSE },
 199   { "\xe5\x20", -1, 0, FALSE },
 200   { "\xe6\x20", -1, 0, FALSE },
 201   { "\xe7\x20", -1, 0, FALSE },
 202   { "\xe8\x20", -1, 0, FALSE },
 203   { "\xe9\x20", -1, 0, FALSE },
 204   { "\xea\x20", -1, 0, FALSE },
 205   { "\xeb\x20", -1, 0, FALSE },
 206   { "\xec\x20", -1, 0, FALSE },
 207   { "\xed\x20", -1, 0, FALSE },
 208   { "\xee\x20", -1, 0, FALSE },
 209   { "\xef\x20", -1, 0, FALSE },
 210   { "\xf0\x20", -1, 0, FALSE },
 211   { "\xf1\x20", -1, 0, FALSE },
 212   { "\xf2\x20", -1, 0, FALSE },
 213   { "\xf3\x20", -1, 0, FALSE },
 214   { "\xf4\x20", -1, 0, FALSE },
 215   { "\xf5\x20", -1, 0, FALSE },
 216   { "\xf6\x20", -1, 0, FALSE },
 217   { "\xf7\x20", -1, 0, FALSE },
 218   { "\xf8\x20", -1, 0, FALSE },
 219   { "\xf9\x20", -1, 0, FALSE },
 220   { "\xfa\x20", -1, 0, FALSE },
 221   { "\xfb\x20", -1, 0, FALSE },
 222   { "\xfc\x20", -1, 0, FALSE },
 223   { "\xfd\x20", -1, 0, FALSE },
 224   /* missing continuation bytes */
 225   { "\x20\xc0", -1, 1, FALSE },
 226   { "\x20\xe0\x80", -1, 1, FALSE },
 227   { "\x20\xf0\x80\x80", -1, 1, FALSE },
 228   { "\x20\xf8\x80\x80\x80", -1, 1, FALSE },
 229   { "\x20\xfc\x80\x80\x80\x80", -1, 1, FALSE },
 230   { "\x20\xdf", -1, 1, FALSE },
 231   { "\x20\xef\xbf", -1, 1, FALSE },
 232   { "\x20\xf7\xbf\xbf", -1, 1, FALSE },
 233   { "\x20\xfb\xbf\xbf\xbf", -1, 1, FALSE },
 234   { "\x20\xfd\xbf\xbf\xbf\xbf", -1, 1, FALSE },
 235   /* impossible bytes */
 236   { "\x20\xfe\x20", -1, 1, FALSE },
 237   { "\x20\xff\x20", -1, 1, FALSE },
 238   /* overlong sequences */
 239   { "\x20\xc0\xaf\x20", -1, 1, FALSE },
 240   { "\x20\xe0\x80\xaf\x20", -1, 1, FALSE },
 241   { "\x20\xf0\x80\x80\xaf\x20", -1, 1, FALSE },
 242   { "\x20\xf8\x80\x80\x80\xaf\x20", -1, 1, FALSE },
 243   { "\x20\xfc\x80\x80\x80\x80\xaf\x20", -1, 1, FALSE },
 244   { "\x20\xc1\xbf\x20", -1, 1, FALSE },
 245   { "\x20\xe0\x9f\xbf\x20", -1, 1, FALSE },
 246   { "\x20\xf0\x8f\xbf\xbf\x20", -1, 1, FALSE },
 247   { "\x20\xf8\x87\xbf\xbf\xbf\x20", -1, 1, FALSE },
 248   { "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20", -1, 1, FALSE },
 249   { "\x20\xc0\x80\x20", -1, 1, FALSE },
 250   { "\x20\xe0\x80\x80\x20", -1, 1, FALSE },
 251   { "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE },
 252   { "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE },
 253   { "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE },
 254   /* illegal code positions */
 255   { "\x20\xed\xa0\x80\x20", -1, 1, FALSE },
 256   { "\x20\xed\xad\xbf\x20", -1, 1, FALSE },
 257   { "\x20\xed\xae\x80\x20", -1, 1, FALSE },
 258   { "\x20\xed\xaf\xbf\x20", -1, 1, FALSE },
 259   { "\x20\xed\xb0\x80\x20", -1, 1, FALSE },
 260   { "\x20\xed\xbe\x80\x20", -1, 1, FALSE },
 261   { "\x20\xed\xbf\xbf\x20", -1, 1, FALSE },
 262   { "\x20\xed\xa0\x80\xed\xb0\x80\x20", -1, 1, FALSE },
 263   { "\x20\xed\xa0\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
 264   { "\x20\xed\xad\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
 265   { "\x20\xed\xad\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
 266   { "\x20\xed\xae\x80\xed\xb0\x80\x20", -1, 1, FALSE },
 267   { "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
 268   { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
 269   { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
 270
 271   { NULL, }
 272 };
 273
 274 static void
 275 do_test (gconstpointer d)
 276 {
 277   const Test *test = d;
 278   const gchar *end;
 279   gboolean result;
 280
 281   result = g_utf8_validate (test->text, test->max_len, &end);
 282
 283   g_assert (result == test->valid);
 284   g_assert (end - test->text == test->offset);
 285
 286   if (test->max_len < 0)
 287     {
 288       result = g_utf8_validate (test->text, strlen (test->text), &end);
 289
 290       g_assert (result == test->valid);
 291       g_assert (end - test->text == test->offset);
 292     }
 293 }
 294
 295 /* Test the behaviour of g_utf8_get_char_validated() with various inputs and
 296  * length restrictions. */
 297 static void
 298 test_utf8_get_char_validated (void)
 299 {
 300   const struct {
 301     const gchar *buf;
 302     gssize max_len;
 303     gunichar expected_result;
 304   } test_vectors[] = {
 305     /* Bug #780095: */
 306     { "\xC0\x00_45678", 8, (gunichar) -2 },
 307     { "\xC0\x00_45678", -1, (gunichar) -2 },
 308     /* It seems odd that the return value differs with the length input, but
 309      * that’s how it’s documented: */
 310     { "", 0, (gunichar) -2 },
 311     { "", -1, (gunichar) 0 },
 312     /* Normal inputs: */
 313     { "hello", 5, (gunichar) 'h' },
 314     { "hello", -1, (gunichar) 'h' },
 315     { "\xD8\x9F", 2, 0x061F },
 316     { "\xD8\x9F", -1, 0x061F },
 317     { "\xD8\x9Fmore", 6, 0x061F },
 318     { "\xD8\x9Fmore", -1, 0x061F },
 319     { "\xE2\x96\xB3", 3, 0x25B3 },
 320     { "\xE2\x96\xB3", -1, 0x25B3 },
 321     { "\xE2\x96\xB3more", 7, 0x25B3 },
 322     { "\xE2\x96\xB3more", -1, 0x25B3 },
 323     { "\xF0\x9F\x92\xA9", 4, 0x1F4A9 },
 324     { "\xF0\x9F\x92\xA9", -1, 0x1F4A9 },
 325     { "\xF0\x9F\x92\xA9more", 8, 0x1F4A9 },
 326     { "\xF0\x9F\x92\xA9more", -1, 0x1F4A9 },
 327     /* Partial unichars: */
 328     { "\xD8", -1, (gunichar) -2 },
 329     { "\xD8\x9F", 1, (gunichar) -2 },
 330     { "\xCE", -1, (gunichar) -2 },
 331     { "\xCE", 1, (gunichar) -2 },
 332   };
 333   gsize i;
 334
 335   for (i = 0; i < G_N_ELEMENTS (test_vectors); i++)
 336     {
 337       gunichar actual_result;
 338
 339       g_test_message ("Vector %" G_GSIZE_FORMAT, i);
 340       actual_result = g_utf8_get_char_validated (test_vectors[i].buf,
 341                                                  test_vectors[i].max_len);
 342       g_assert_cmpint (actual_result, ==, test_vectors[i].expected_result);
 343     }
 344 }
 345
 346 int
 347 main (int argc, char *argv[])
 348 {
 349   gint i;
 350   gchar *path;
 351
 352   g_test_init (&argc, &argv, NULL);
 353
 354   for (i = 0; test[i].text; i++)
 355     {
 356       path = g_strdup_printf ("/utf8/validate/%d", i);
 357       g_test_add_data_func (path, &test[i], do_test);
 358       g_free (path);
 359     }
 360
 361   g_test_add_func ("/utf8/get-char-validated", test_utf8_get_char_validated);
 362
 363   return g_test_run ();
 364 }