test/fts4unicode.test

   1 # 2012 May 25
   2 #
   3 # The author disclaims copyright to this source code.  In place of
   4 # a legal notice, here is a blessing:
   5 #
   6 #    May you do good and not evil.
   7 #    May you find forgiveness for yourself and forgive others.
   8 #    May you share freely, never taking more than you give.
   9 #
  10 #*************************************************************************
  11 #
  12 # The tests in this file focus on testing the "unicode" FTS tokenizer.
  13 #
  14
     # Locate the directory holding this script and load the shared test
     # harness (tester.tcl) from it.
  15 set testdir [file dirname $argv0]
  16 source $testdir/tester.tcl
     # Skip the entire file when the fts3_unicode capability is absent.
  17 ifcapable !fts3_unicode { finish_test ; return }
     # NOTE(review): presumably read by the harness to prefix test names --
     # confirm against tester.tcl.
  18 set ::testprefix fts4unicode
  19
  20 proc do_unicode_token_test {tn input res} {
       # Run test $tn: tokenize $input with the 'unicode61' tokenizer and the
       # option 'remove_diacritics=0', and expect the result list $res.
       #
       #   tn     test name handed to do_execsql_test
       #   input  text to tokenize; single quotes are doubled so the text can
       #          be embedded in an SQL string literal
       #   res    expected result; re-packed with {*} so that any whitespace
       #          layout of the expected list compares equal
  21   set quoted [string map {' ''} $input]
  22   set sql "
  23     SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$quoted');
  24   "
       set expected [list [list {*}$res]]
       # Evaluate in the caller's frame, as a direct do_execsql_test call would.
       uplevel [list do_execsql_test $tn $sql $expected]
  25 }
  26
  27 proc do_unicode_token_test2 {tn input res} {
       # Run test $tn: tokenize $input with the 'unicode61' tokenizer, passing
       # no tokenizer options at all, and expect the result list $res.
       #
       #   tn     test name handed to do_execsql_test
       #   input  text to tokenize; single quotes are doubled so the text can
       #          be embedded in an SQL string literal
       #   res    expected result; re-packed with {*} so that any whitespace
       #          layout of the expected list compares equal
  28   set quoted [string map {' ''} $input]
  29   set sql "
  30     SELECT fts3_tokenizer_test('unicode61', '$quoted');
  31   "
       set expected [list [list {*}$res]]
       # Evaluate in the caller's frame, as a direct do_execsql_test call would.
       uplevel [list do_execsql_test $tn $sql $expected]
  32 }
  33
  34 proc do_unicode_token_test3 {tn args} {
       # Usage: do_unicode_token_test3 TN ?ARG ...? RES
       #
       # Run test TN: call fts3_tokenizer_test('unicode61', ARG, ...) and
       # expect the result list RES. Every ARG becomes one SQL string
       # literal, with embedded single quotes doubled. The last element of
       # $args is always taken as RES.
  35   set expected [lindex $args end]
  36   set sqlArgs [list 'unicode61']
  37   foreach arg [lrange $args 0 end-1] {
  39     lappend sqlArgs "'[string map {' ''} $arg]'"
  41   }
  42   set sql "SELECT fts3_tokenizer_test([join $sqlArgs {, }])"
  43   uplevel [list do_execsql_test $tn $sql [list [list {*}$expected]]]
  44 }
  45
     # Tests 1.0-1.7 use do_unicode_token_test, i.e. 'remove_diacritics=0'.
     # Judging by the expected values, each result triple is: token position,
     # token as emitted (case folded), original text.
  46 do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
  47 do_unicode_token_test 1.1 {Ä Ö Ü} {0 ä Ä 1 ö Ö 2 ü Ü}
  48 do_unicode_token_test 1.2 {xÄx xÖx xÜx} {0 xäx xÄx 1 xöx xÖx 2 xüx xÜx}
  49
  50 # 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
     # Tests 1.4 and 1.5 expect the same folded token; 1.4 writes U+00DF as a
     # literal character and 1.5 writes it as the \uDF escape.
  51 do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
  52 do_unicode_token_test 1.4 "\u1E9E" "0 ß \u1E9E"
  53 do_unicode_token_test 1.5 "\u1E9E" "0 \uDF \u1E9E"
  54
     # Tests 1.6 and 1.7 expect the same four tokens: in 1.7 the codepoints
     # U+00BF, U+224E and U+2263 separate the words just as spaces do in 1.6.
  55 do_unicode_token_test 1.6 "The quick brown fox" {
  56   0 the The 1 quick quick 2 brown brown 3 fox fox
  57 }
  58 do_unicode_token_test 1.7 "The\u00bfquick\u224ebrown\u2263fox" {
  59   0 the The 1 quick quick 2 brown brown 3 fox fox
  60 }
  61
     # Tests 1.8-1.11 use do_unicode_token_test2, which passes no tokenizer
     # options. The expected tokens show that diacritics are then removed
     # (compare 1.9 and 1.10 with 1.1 and 1.2).
  62 do_unicode_token_test2 1.8  {a B c D} {0 a a 1 b B 2 c c 3 d D}
  63 do_unicode_token_test2 1.9  {Ä Ö Ü} {0 a Ä 1 o Ö 2 u Ü}
  64 do_unicode_token_test2 1.10 {xÄx xÖx xÜx} {0 xax xÄx 1 xox xÖx 2 xux xÜx}
  65
  66 # Check that a standalone combining diacritic (U+0301) is removed when no
  67 # remove_diacritics option is given, and that it does not break the token.
     # This test was previously also named 1.10, duplicating the one above.
  68 do_unicode_token_test2 1.11 "xx\u0301xx" "0 xxxx xx\u301xx"
  69
  70 #-------------------------------------------------------------------------
  71 # $docs is a list of short English texts. Test 2.0 below passes each one
     # through [mapdoc] and inserts the result as one row of an FTS4 table.
     # The text is test data: tests 2.1-2.6 expect exact matches and snippets
     # derived from it, so it must not be edited casually.
  72 set docs [list {
  73   Enhance the INSERT syntax to allow multiple rows to be inserted via the
  74   VALUES clause.
  75 } {
  76   Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
  77 } {
  78   Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
  79 } {
  80   Added the sqlite3_db_readonly() interface.
  81 } {
  82   Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
  83   ability to add new PRAGMA statements or to override built-in PRAGMAs.
  84 } {
  85   Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  86   the same row that contains the maximum x value.
  87 } {
  88   Added support for the FTS4 languageid option.
  89 } {
  90   Documented support for the FTS4 content option. This feature has actually
  91   been in the code since version 3.7.9 but is only now considered to be
  92   officially supported.
  93 } {
  94   Pending statements no longer block ROLLBACK. Instead, the pending statement
  95   will return SQLITE_ABORT upon next access after the ROLLBACK.
  96 } {
  97   Improvements to the handling of CSV inputs in the command-line shell
  98 } {
  99   Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
 100   incorrectly converted into an INNER JOIN if the WHERE clause indexable terms
 101   connected by OR.
 102 }]
 103
     # map(<letter>) holds the {uppercase lowercase} forms of that ASCII
     # letter written with a diaeresis.
 104 set map(a) [list "\u00C4" "\u00E4"]  ; # LATIN LETTER A WITH DIAERESIS
 105 set map(e) [list "\u00CB" "\u00EB"]  ; # LATIN LETTER E WITH DIAERESIS
 106 set map(i) [list "\u00CF" "\u00EF"]  ; # LATIN LETTER I WITH DIAERESIS
 107 set map(o) [list "\u00D6" "\u00F6"]  ; # LATIN LETTER O WITH DIAERESIS
 108 set map(u) [list "\u00DC" "\u00FC"]  ; # LATIN LETTER U WITH DIAERESIS
 109 set map(y) [list "\u0178" "\u00FF"]  ; # LATIN LETTER Y WITH DIAERESIS
 110 set map(h) [list "\u1E26" "\u1E27"]  ; # LATIN LETTER H WITH DIAERESIS
 111 set map(w) [list "\u1E84" "\u1E85"]  ; # LATIN LETTER W WITH DIAERESIS
 112 set map(x) [list "\u1E8C" "\u1E8D"]  ; # LATIN LETTER X WITH DIAERESIS
     # Flatten the table into the {from to from to ...} list that [string map]
     # takes: for every letter, upper case first, then lower case.
 113 foreach k [array names map] {
 114   lappend mappings                                   \
 115       [string toupper $k] [lindex $map($k) 0]        \
           $k                  [lindex $map($k) 1]
 116 }
 117 proc mapdoc {doc} {
       # Return $doc with every run of whitespace collapsed to one space,
       # leading and trailing whitespace trimmed, and every substring listed
       # in the global $::mappings replaced by its mapped form.
 118   regsub -all {[[:space:]]+} $doc " " collapsed
 119   return [string map $::mappings [string trim $collapsed]]
 120 }
 121
     # Test 2.0: create an FTS4 table using the unicode61 tokenizer and insert
     # the [mapdoc] form of every entry of $docs, one row per entry.
 122 do_test 2.0 {
 123   execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
 124   foreach doc $docs {
       # $d is referenced by name from the SQL text below.
 125     set d [mapdoc $doc]
 126     execsql { INSERT INTO t2 VALUES($d) }
 127   }
 128 } {}
 129
     # Test 2.1: the query term goes through [mapdoc] as well. Exactly one
     # row is expected back, and it must equal the [mapdoc] form of the
     # original document.
 130 do_test 2.1 {
 131   set q [mapdoc "row"]
 132   execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
 133 } [list [mapdoc {
 134   Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
 135   the same row that contains the maximum x value.
 136 }]]
 137
     # Tests 2.2-2.6: for each {test-number query snippet} triple, run the
     # [mapdoc] form of the query against t2 and compare snippet() output with
     # the [mapdoc] form of the expected snippet. Queries 3 and 5 differ from
     # 2 and 4 only in letter case and expect the same snippets. Query 6 is a
     # prefix query.
 138 foreach {tn query snippet} {
 139   2 "row" {
 140      ...returns the value of y on the same [row] that contains
 141      the maximum x value.
 142   }
 143   3 "ROW" {
 144      ...returns the value of y on the same [row] that contains
 145      the maximum x value.
 146   }
 147   4 "rollback" {
 148      ...[ROLLBACK]. Instead, the pending statement
 149      will return SQLITE_ABORT upon next access after the [ROLLBACK].
 150   }
 151   5 "rOllback" {
 152      ...[ROLLBACK]. Instead, the pending statement
 153      will return SQLITE_ABORT upon next access after the [ROLLBACK].
 154   }
 155   6 "lang*" {
 156      Added support for the FTS4 [languageid] option.
 157   }
 158 } {
 159   do_test 2.$tn {
       # $q is referenced by name from the SQL text below.
 160     set q [mapdoc $query]
 161     execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
 162   } [list [mapdoc $snippet]]
 163 }
 164
 165 #-------------------------------------------------------------------------
 166 # Make sure the unicode61 tokenizer does not crash if it is passed a
 167 # NULL pointer.
     #
     # Test 3.1 stores an SQL NULL in the first of two indexed columns. Test
     # 3.2 then runs snippet() over that row and expects the snippet to come
     # from the non-NULL column.
 168 reset_db
 169 do_execsql_test 3.1 {
 170   CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);
 171   INSERT INTO t1 VALUES(NULL, 'a b c');
 172 }
 173
 174 do_execsql_test 3.2 {
 175   SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'
 176 } {{a [b] c}}
 177
     # Test 3.3: empty the table, insert one row containing only the token
     # 'b', and double the row count sixteen times with INSERT ... SELECT
     # (1 -> 65536 rows). Finally add two rows whose y column is NULL. All of
     # this happens inside a single transaction.
 178 do_execsql_test 3.3 {
 179   BEGIN;
 180   DELETE FROM t1;
 181   INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');
 182   INSERT INTO t1 SELECT * FROM t1;
 183   INSERT INTO t1 SELECT * FROM t1;
 184   INSERT INTO t1 SELECT * FROM t1;
 185   INSERT INTO t1 SELECT * FROM t1;
 186   INSERT INTO t1 SELECT * FROM t1;
 187   INSERT INTO t1 SELECT * FROM t1;
 188   INSERT INTO t1 SELECT * FROM t1;
 189   INSERT INTO t1 SELECT * FROM t1;
 190   INSERT INTO t1 SELECT * FROM t1;
 191   INSERT INTO t1 SELECT * FROM t1;
 192   INSERT INTO t1 SELECT * FROM t1;
 193   INSERT INTO t1 SELECT * FROM t1;
 194   INSERT INTO t1 SELECT * FROM t1;
 195   INSERT INTO t1 SELECT * FROM t1;
 196   INSERT INTO t1 SELECT * FROM t1;
 197   INSERT INTO t1 SELECT * FROM t1;
 198   INSERT INTO t1 VALUES('a b c', NULL);
 199   INSERT INTO t1 VALUES('a x c', NULL);
 200   COMMIT;
 201 }
 202
     # Test 3.4: only the 'a b c' row contains both query terms. Its NULL y
     # column shows up as the empty element {} in the expected result.
 203 do_execsql_test 3.4 {
 204   SELECT * FROM t1 WHERE t1 MATCH 'a b';
 205 } {{a b c} {}}
 206
 207 #-------------------------------------------------------------------------
 208 # Insert text containing unusual codepoints and malformed UTF-8 into a
     # unicode61 FTS4 table. Every test expects an empty result, so the only
     # thing verified is that the inserts complete without raising an error.
     #
 209 reset_db
 210
     # Test 4.1: U+FFFE and U+D800 (a surrogate codepoint), each placed both in
     # the middle of the text and at its very start.
 211 do_test 4.1 {
 212   set a "abc\uFFFEdef"
 213   set b "abc\uD800def"
 214   set c "\uFFFEdef"
 215   set d "\uD800def"
 216   execsql {
 217     CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);
 218     INSERT INTO t1 VALUES($a);
 219     INSERT INTO t1 VALUES($b);
 220     INSERT INTO t1 VALUES($c);
 221     INSERT INTO t1 VALUES($d);
 222   }
 223 } {}
 224
     # Test 4.2: the bytes 0xF7 0xBF 0xBF 0xBF (a four-byte UTF-8 form whose
     # payload bits are all ones, i.e. 0x1FFFFF, beyond U+10FFFF) followed by
     # zero to three additional 0xBF continuation bytes, placed between the
     # bytes 0x61 ('a') and 0x62 ('b').
 225 do_test 4.2 {
 226   set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
 227   set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
 228   set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
 229   set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
 230   execsql {
 231     INSERT INTO t1 VALUES($a);
 232     INSERT INTO t1 VALUES($b);
 233     INSERT INTO t1 VALUES($c);
 234     INSERT INTO t1 VALUES($d);
 235   }
 236 } {}
 237
     # Test 4.3: the same byte sequences as 4.2 but without the surrounding
     # 'a' and 'b', so each value both begins and ends with the odd bytes.
 238 do_test 4.3 {
 239   set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
 240   set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
 241   set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
 242   set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
 243   execsql {
 244     INSERT INTO t1 VALUES($a);
 245     INSERT INTO t1 VALUES($b);
 246     INSERT INTO t1 VALUES($c);
 247     INSERT INTO t1 VALUES($d);
 248   }
 249 } {}
 250
 251 #-------------------------------------------------------------------------
     # Tests of the "tokenchars=" and "separators=" tokenizer options, passed
     # to the unicode61 tokenizer through do_unicode_token_test3.
 252
     # 5.1: with an empty tokenchars= list, '_' splits tokens.
 253 do_unicode_token_test3 5.1 {tokenchars=} {
 254   sqlite3_reset sqlite3_column_int
 255 } {
 256   0 sqlite3 sqlite3
 257   1 reset reset
 258   2 sqlite3 sqlite3
 259   3 column column
 260   4 int int
 261 }
 262
     # 5.2: listing '_' in tokenchars= keeps each identifier as one token.
 263 do_unicode_token_test3 5.2 {tokenchars=_} {
 264   sqlite3_reset sqlite3_column_int
 265 } {
 266   0 sqlite3_reset sqlite3_reset
 267   1 sqlite3_column_int sqlite3_column_int
 268 }
 269
     # 5.3: the letters x, y and z split tokens once listed in separators=.
 270 do_unicode_token_test3 5.3 {separators=xyz} {
 271   Laotianxhorseyrunszfast
 272 } {
 273   0 laotian Laotian
 274   1 horse horse
 275   2 runs runs
 276   3 fast fast
 277 }
 278
     # 5.4: the same input with x, y and z in tokenchars= yields one token.
 279 do_unicode_token_test3 5.4 {tokenchars=xyz} {
 280   Laotianxhorseyrunszfast
 281 } {
 282   0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
 283 }
 284
     # 5.5: tokenchars= and separators= given together as separate arguments.
 285 do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {
 286   sqlite3_resetxsqlite3_column_intyhonda_phantom
 287 } {
 288   0 sqlite3_reset sqlite3_reset
 289   1 sqlite3_column_int sqlite3_column_int
 290   2 honda_phantom honda_phantom
 291 }
 292
     # 5.6 and 5.7: the same options with non-ASCII codepoints. In 5.7 the '.'
     # still splits "fish" from the following token although U+2445 does not.
 293 do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" {
 294   0 abc abc 1 def def
 295 }
 296
 297 do_unicode_token_test3 5.7                             \
 298   "tokenchars=\u2444\u2445"                            \
 299   "separators=\u05D0\u05D1\u05D2"                      \
 300   "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
 301   [list                                                \
 302     0 \u2444fre\u2445sh \u2444fre\u2445sh              \
 303     1 water water                                      \
 304     2 fish fish                                        \
 305     3 \u2445timer \u2445timer                          \
 306   ]
 307
 308 # Check that it is not possible to add a standalone diacritic codepoint
 309 # to either separators or tokenchars.
     #
     # Tests 5.8 and 5.9 expect identical tokens whether U+0301 is listed as a
     # separator or as a token character: it neither splits "hello\u0301world"
     # nor survives in the emitted token. Tests 5.10 and 5.11 repeat this with
     # remove_diacritics=0: the embedded U+0301 is then kept inside the first
     # token, while the U+0301 at the start of the second word is not part of
     # that token in either case.
 310 do_unicode_token_test3 5.8 "separators=\u0301" \
 311   "hello\u0301world \u0301helloworld"          \
 312   "0 helloworld hello\u0301world 1 helloworld helloworld"
 313
 314 do_unicode_token_test3 5.9 "tokenchars=\u0301" \
 315   "hello\u0301world \u0301helloworld"          \
 316   "0 helloworld hello\u0301world 1 helloworld helloworld"
 317
 318 do_unicode_token_test3 5.10 "separators=\u0301" \
 319   "remove_diacritics=0"                        \
 320   "hello\u0301world \u0301helloworld"          \
 321   "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
 322
 323 do_unicode_token_test3 5.11 "tokenchars=\u0301" \
 324   "remove_diacritics=0"                         \
 325   "hello\u0301world \u0301helloworld"           \
 326   "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
 327
 328
 329 #-------------------------------------------------------------------------
 330
 331 proc do_tokenize {tokenizer txt} {
       # Tokenize $txt with the tokenizer named $tokenizer and return a list
       # holding the second element of every result triple. Going by the
       # expected values used earlier in this file, that element is the token
       # text as emitted by the tokenizer.
       #
       # $tokenizer and $txt are referenced by name from the SQL text.
 332   set triples [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}]
 333   set tokens [list]
 334   foreach {pos token orig} $triples {
         lappend tokens $token
 335   }
 336   return $tokens
 337 }
 338
 339 # Argument $lCp must be a list of codepoints (integers) that correspond
 340 # to whitespace characters. This command builds a string $W out of those
 341 # codepoints, then tokenizes "${W}hello${W}world${W}" using tokenizer
 342 # $tokenizer. Test $tn passes if the tokenizer extracts exactly the two
 343 # 5 character tokens "hello" and "world".
 344 #
 345 proc do_isspace_test {tn tokenizer lCp} {
 346   set W ""
       foreach cp $lCp { append W [format %c $cp] }
 347   set txt "${W}hello${W}world${W}"
 348   uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]
 349 }
 350
     # The unicode61 tokenizer is always tested; the icu tokenizer is added
     # when the build has the icu capability.
 351 set tokenizers [list unicode61]
 352 ifcapable icu { lappend tokenizers icu }
 353
 354 # Some tests to check that the tokenizers can both identify white-space
 355 # codepoints. All codepoints tested below are of type "Zs" in the
 356 # UnicodeData.txt file.
     #
     # Tests .1 to .18 check one codepoint each. Tests .19 to .23 check runs
     # of several different whitespace codepoints. The last five tests used to
     # share the name 6.$T.19; they now have distinct names so that a failure
     # can be traced to a single line.
     #
     # NOTE(review): 6158 (U+180E) was reclassified from Zs to Cf in Unicode
     # 6.3, so test .4 may depend on the Unicode version a tokenizer was
     # built against -- confirm before relying on it for icu.
 357 foreach T $tokenizers {
 358   do_isspace_test 6.$T.1 $T    32
 359   do_isspace_test 6.$T.2 $T    160
 360   do_isspace_test 6.$T.3 $T    5760
 361   do_isspace_test 6.$T.4 $T    6158
 362   do_isspace_test 6.$T.5 $T    8192
 363   do_isspace_test 6.$T.6 $T    8193
 364   do_isspace_test 6.$T.7 $T    8194
 365   do_isspace_test 6.$T.8 $T    8195
 366   do_isspace_test 6.$T.9 $T    8196
 367   do_isspace_test 6.$T.10 $T    8197
 368   do_isspace_test 6.$T.11 $T    8198
 369   do_isspace_test 6.$T.12 $T    8199
 370   do_isspace_test 6.$T.13 $T    8200
 371   do_isspace_test 6.$T.14 $T    8201
 372   do_isspace_test 6.$T.15 $T    8202
 373   do_isspace_test 6.$T.16 $T    8239
 374   do_isspace_test 6.$T.17 $T    8287
 375   do_isspace_test 6.$T.18 $T   12288
 376
 377   do_isspace_test 6.$T.19 $T   {32 160 5760 6158}
 378   do_isspace_test 6.$T.20 $T   {8192 8193 8194 8195}
 379   do_isspace_test 6.$T.21 $T   {8196 8197 8198 8199}
 380   do_isspace_test 6.$T.22 $T   {8200 8201 8202 8239}
 381   do_isspace_test 6.$T.23 $T   {8287 12288}
 382 }
 383
 384
 385 finish_test
 386
 387