3 # The author disclaims copyright to this source code. In place of
4 # a legal notice, here is a blessing:
6 # May you do good and not evil.
7 # May you find forgiveness for yourself and forgive others.
8 # May you share freely, never taking more than you give.
10 #*************************************************************************
12 # The tests in this file focus on testing the "unicode" FTS tokenizer.
# Locate and load the shared SQLite test harness relative to this script.
set testdir [file dirname $argv0]
source $testdir/tester.tcl
# Skip the whole file when this build lacks FTS3/4 unicode support.
ifcapable !fts3_unicode { finish_test ; return }
# Every test name below is automatically prefixed with "fts4unicode".
set ::testprefix fts4unicode
20 proc do_unicode_token_test {tn input res} {
21 set input [string map {' ''} $input]
22 uplevel [list do_execsql_test $tn "
23 SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
24 " [list [list {*}$res]]]
27 proc do_unicode_token_test2 {tn input res} {
28 set input [string map {' ''} $input]
29 uplevel [list do_execsql_test $tn "
30 SELECT fts3_tokenizer_test('unicode61', '$input');
31 " [list [list {*}$res]]]
34 proc do_unicode_token_test3 {tn args} {
35 set res [lindex $args end]
36 set sql "SELECT fts3_tokenizer_test('unicode61'"
37 foreach a [lrange $args 0 end-1] {
39 append sql [string map {' ''} $a]
43 uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]]
# Expected-result format: repeated triples of
#   <token index> <normalized token> <original text>.
# 1.0: plain ASCII is simply case-folded, one token per word.
do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
# 1.1/1.2: with remove_diacritics=0, A/O/U-with-diaeresis (U+00C4/D6/DC)
# case-fold to the small forms (U+00E4/F6/FC) but keep their diacritics,
# both as standalone characters and embedded inside a token.
do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
  "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC"
do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
  "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx"
# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
# 1.3/1.4: small sharp-s is unchanged; the capital folds to the small form.
do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E"
58 do_unicode_token_test 1.5 "The quick brown fox" {
59 0 the The 1 quick quick 2 brown brown 3 fox fox
61 do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
62 0 the The 1 quick quick 2 brown brown 3 fox fox
# 1.7-1.9: same inputs through do_unicode_token_test2 (no explicit
# remove_diacritics=0 option): here the diaereses are stripped from the
# normalized tokens, leaving plain a/o/u.
do_unicode_token_test2 1.7 {a B c D} {0 a a 1 b B 2 c c 3 d D}
do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "0 a \uC4 1 o \uD6 2 u \uDC"
do_unicode_token_test2 1.9 "x\uC4x x\uD6x x\uDCx" \
  "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx"
# Check that diacritics are removed if remove_diacritics=1 is specified.
# And that they do not break tokens (U+0301 is COMBINING ACUTE ACCENT:
# "xx\u0301xx" still produces the single token "xxxx").
do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"
# Title-case mappings work: U+01C5 (Dz with caron) folds to U+01C6.
do_unicode_token_test 1.11 "\u01c5" "0 \u01c6 \u01c5"
78 #-------------------------------------------------------------------------
81 Enhance the INSERT syntax to allow multiple rows to be inserted via the
84 Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
86 Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
88 Added the sqlite3_db_readonly() interface.
90 Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
91 ability to add new PRAGMA statements or to override built-in PRAGMAs.
93 Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
94 the same row that contains the maximum x value.
96 Added support for the FTS4 languageid option.
98 Documented support for the FTS4 content option. This feature has actually
99 been in the code since version 3.7.9 but is only now considered to be
100 officially supported.
102 Pending statements no longer block ROLLBACK. Instead, the pending statement
103 will return SQLITE_ABORT upon next access after the ROLLBACK.
105 Improvements to the handling of CSV inputs in the command-line shell
107 Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
108 incorrectly converted into an INNER JOIN if the WHERE clause indexable terms
# For each plain letter, record an [UPPER lower] pair of the corresponding
# precomposed letter-with-diaeresis codepoint. These pairs are expanded
# into $mappings below and used to rewrite ASCII template text into
# unicode documents for the table-based tests that follow.
set map(a) [list "\u00C4" "\u00E4"] ; # LATIN LETTER A WITH DIAERESIS
set map(e) [list "\u00CB" "\u00EB"] ; # LATIN LETTER E WITH DIAERESIS
set map(i) [list "\u00CF" "\u00EF"] ; # LATIN LETTER I WITH DIAERESIS
set map(o) [list "\u00D6" "\u00F6"] ; # LATIN LETTER O WITH DIAERESIS
set map(u) [list "\u00DC" "\u00FC"] ; # LATIN LETTER U WITH DIAERESIS
set map(y) [list "\u0178" "\u00FF"] ; # LATIN LETTER Y WITH DIAERESIS
set map(h) [list "\u1E26" "\u1E27"] ; # LATIN LETTER H WITH DIAERESIS
set map(w) [list "\u1E84" "\u1E85"] ; # LATIN LETTER W WITH DIAERESIS
set map(x) [list "\u1E8C" "\u1E8D"] ; # LATIN LETTER X WITH DIAERESIS
121 foreach k [array names map] {
122 lappend mappings [string toupper $k] [lindex $map($k) 0]
123 lappend mappings $k [lindex $map($k) 1]
126 set doc [regsub -all {[[:space:]]+} $doc " "]
127 string map $::mappings [string trim $doc]
131 execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
134 execsql { INSERT INTO t2 VALUES($d) }
140 execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
142 Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
143 the same row that contains the maximum x value.
146 foreach {tn query snippet} {
148 ...returns the value of y on the same [row] that contains
152 ...returns the value of y on the same [row] that contains
156 ...[ROLLBACK]. Instead, the pending statement
157 will return SQLITE_ABORT upon next access after the [ROLLBACK].
160 ...[ROLLBACK]. Instead, the pending statement
161 will return SQLITE_ABORT upon next access after the [ROLLBACK].
164 Added support for the FTS4 [languageid] option.
168 set q [mapdoc $query]
169 execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
170 } [list [mapdoc $snippet]]
173 #-------------------------------------------------------------------------
174 # Make sure the unicode61 tokenizer does not crash if it is passed a
177 do_execsql_test 3.1 {
178 CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);
179 INSERT INTO t1 VALUES(NULL, 'a b c');
182 do_execsql_test 3.2 {
183 SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'
186 do_execsql_test 3.3 {
189 INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');
190 INSERT INTO t1 SELECT * FROM t1;
191 INSERT INTO t1 SELECT * FROM t1;
192 INSERT INTO t1 SELECT * FROM t1;
193 INSERT INTO t1 SELECT * FROM t1;
194 INSERT INTO t1 SELECT * FROM t1;
195 INSERT INTO t1 SELECT * FROM t1;
196 INSERT INTO t1 SELECT * FROM t1;
197 INSERT INTO t1 SELECT * FROM t1;
198 INSERT INTO t1 SELECT * FROM t1;
199 INSERT INTO t1 SELECT * FROM t1;
200 INSERT INTO t1 SELECT * FROM t1;
201 INSERT INTO t1 SELECT * FROM t1;
202 INSERT INTO t1 SELECT * FROM t1;
203 INSERT INTO t1 SELECT * FROM t1;
204 INSERT INTO t1 SELECT * FROM t1;
205 INSERT INTO t1 SELECT * FROM t1;
206 INSERT INTO t1 VALUES('a b c', NULL);
207 INSERT INTO t1 VALUES('a x c', NULL);
211 do_execsql_test 3.4 {
212 SELECT * FROM t1 WHERE t1 MATCH 'a b';
215 #-------------------------------------------------------------------------
225 CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);
226 INSERT INTO t1 VALUES($a);
227 INSERT INTO t1 VALUES($b);
228 INSERT INTO t1 VALUES($c);
229 INSERT INTO t1 VALUES($d);
234 set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
235 set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
236 set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
237 set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
239 INSERT INTO t1 VALUES($a);
240 INSERT INTO t1 VALUES($b);
241 INSERT INTO t1 VALUES($c);
242 INSERT INTO t1 VALUES($d);
247 set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
248 set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
249 set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
250 set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
252 INSERT INTO t1 VALUES($a);
253 INSERT INTO t1 VALUES($b);
254 INSERT INTO t1 VALUES($c);
255 INSERT INTO t1 VALUES($d);
259 #-------------------------------------------------------------------------
261 do_unicode_token_test3 5.1 {tokenchars=} {
262 sqlite3_reset sqlite3_column_int
271 do_unicode_token_test3 5.2 {tokenchars=_} {
272 sqlite3_reset sqlite3_column_int
274 0 sqlite3_reset sqlite3_reset
275 1 sqlite3_column_int sqlite3_column_int
278 do_unicode_token_test3 5.3 {separators=xyz} {
279 Laotianxhorseyrunszfast
287 do_unicode_token_test3 5.4 {tokenchars=xyz} {
288 Laotianxhorseyrunszfast
290 0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
293 do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {
294 sqlite3_resetxsqlite3_column_intyhonda_phantom
296 0 sqlite3_reset sqlite3_reset
297 1 sqlite3_column_int sqlite3_column_int
298 2 honda_phantom honda_phantom
301 do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" {
305 do_unicode_token_test3 5.7 \
306 "tokenchars=\u2444\u2445" \
307 "separators=\u05D0\u05D1\u05D2" \
308 "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
310 0 \u2444fre\u2445sh \u2444fre\u2445sh \
313 3 \u2445timer \u2445timer \
# Check that it is not possible to add a standalone diacritic codepoint
# to either separators or tokenchars.
# 5.8/5.9: with default diacritic removal the combining acute (U+0301) is
# stripped during tokenization, so naming it in either option has no
# effect - both inputs still tokenize as two "helloworld" tokens.
do_unicode_token_test3 5.8 "separators=\u0301" \
  "hello\u0301world \u0301helloworld" \
  "0 helloworld hello\u0301world 1 helloworld helloworld"
do_unicode_token_test3 5.9 "tokenchars=\u0301" \
  "hello\u0301world \u0301helloworld" \
  "0 helloworld hello\u0301world 1 helloworld helloworld"
# 5.10/5.11: even with remove_diacritics=0 the accent neither splits nor
# extends a token; it is merely retained inside the normalized token.
do_unicode_token_test3 5.10 "separators=\u0301" \
  "remove_diacritics=0" \
  "hello\u0301world \u0301helloworld" \
  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
do_unicode_token_test3 5.11 "tokenchars=\u0301" \
  "remove_diacritics=0" \
  "hello\u0301world \u0301helloworld" \
  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"
337 #-------------------------------------------------------------------------
339 proc do_tokenize {tokenizer txt} {
341 foreach {a b c} [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}] {
347 # Argument $lCodepoint must be a list of codepoints (integers) that
348 # correspond to whitespace characters. This command creates a string
349 # $W from the codepoints, then tokenizes "${W}hello{$W}world${W}"
350 # using tokenizer $tokenizer. The test passes if the tokenizer successfully
351 # extracts the two 5 character tokens.
353 proc do_isspace_test {tn tokenizer lCp} {
354 set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp]
355 set txt "${whitespace}hello${whitespace}world${whitespace}"
356 uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]
# List of tokenizers exercised by the whitespace tests below: unicode61
# always, plus "icu" when this build was compiled with ICU support.
set tokenizers [list unicode61]
ifcapable icu { lappend tokenizers icu }
362 # Some tests to check that the tokenizers can both identify white-space
363 # codepoints. All codepoints tested below are of type "Zs" in the
364 # UnicodeData.txt file.
365 foreach T $tokenizers {
366 do_isspace_test 6.$T.1 $T 32
367 do_isspace_test 6.$T.2 $T 160
368 do_isspace_test 6.$T.3 $T 5760
369 do_isspace_test 6.$T.4 $T 6158
370 do_isspace_test 6.$T.5 $T 8192
371 do_isspace_test 6.$T.6 $T 8193
372 do_isspace_test 6.$T.7 $T 8194
373 do_isspace_test 6.$T.8 $T 8195
374 do_isspace_test 6.$T.9 $T 8196
375 do_isspace_test 6.$T.10 $T 8197
376 do_isspace_test 6.$T.11 $T 8198
377 do_isspace_test 6.$T.12 $T 8199
378 do_isspace_test 6.$T.13 $T 8200
379 do_isspace_test 6.$T.14 $T 8201
380 do_isspace_test 6.$T.15 $T 8202
381 do_isspace_test 6.$T.16 $T 8239
382 do_isspace_test 6.$T.17 $T 8287
383 do_isspace_test 6.$T.18 $T 12288
385 do_isspace_test 6.$T.19 $T {32 160 5760 6158}
386 do_isspace_test 6.$T.20 $T {8192 8193 8194 8195}
387 do_isspace_test 6.$T.21 $T {8196 8197 8198 8199}
388 do_isspace_test 6.$T.22 $T {8200 8201 8202 8239}
389 do_isspace_test 6.$T.23 $T {8287 12288}
392 #-------------------------------------------------------------------------
393 # Test that the private use ranges are treated as alphanumeric.
396 1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
398 foreach {tn2 config res} {
399 1 "" "0 hello*world hello*world"
400 2 "separators=*" "0 hello hello 1 world world"
402 set config [string map [list * $c] $config]
403 set input [string map [list * $c] "hello*world"]
404 set output [string map [list * $c] $res]
405 do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
409 #-------------------------------------------------------------------------
410 # Cursory test of remove_diacritics=0.
412 # 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
413 # 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
414 # 00E4;LATIN SMALL LETTER A WITH DIAERESIS
415 # 00F6;LATIN SMALL LETTER O WITH DIAERESIS
417 do_execsql_test 8.1.1 "
418 CREATE VIRTUAL TABLE t3 USING fts4(tokenize=unicode61 'remove_diacritics=1');
419 INSERT INTO t3 VALUES('o');
420 INSERT INTO t3 VALUES('a');
421 INSERT INTO t3 VALUES('O');
422 INSERT INTO t3 VALUES('A');
423 INSERT INTO t3 VALUES('\xD6');
424 INSERT INTO t3 VALUES('\xC4');
425 INSERT INTO t3 VALUES('\xF6');
426 INSERT INTO t3 VALUES('\xE4');
428 do_execsql_test 8.1.2 {
429 SELECT rowid FROM t3 WHERE t3 MATCH 'o';
431 do_execsql_test 8.1.3 {
432 SELECT rowid FROM t3 WHERE t3 MATCH 'a';
434 do_execsql_test 8.2.1 {
435 CREATE VIRTUAL TABLE t4 USING fts4(tokenize=unicode61 "remove_diacritics=0");
436 INSERT INTO t4 SELECT * FROM t3;
438 do_execsql_test 8.2.2 {
439 SELECT rowid FROM t4 WHERE t4 MATCH 'o';
441 do_execsql_test 8.2.3 {
442 SELECT rowid FROM t4 WHERE t4 MATCH 'a';
445 #-------------------------------------------------------------------------
449 CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);
450 CREATE VIRTUAL TABLE t6 USING fts4(
451 tokenize=unicode61 [tokenchars=="] "tokenchars=[]");
452 CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 [separators=x\xC4]);
455 CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 "tokenchars= .");
456 CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 "tokenchars=[=""]");
457 CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 "separators=x\xC4");
460 CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 'tokenchars= .');
461 CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 'tokenchars=="[]');
462 CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 'separators=x\xC4');
465 CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 `tokenchars= .`);
466 CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 `tokenchars=[="]`);
467 CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 `separators=x\xC4`);
470 do_execsql_test 9.$tn.0 {
471 DROP TABLE IF EXISTS t5;
472 DROP TABLE IF EXISTS t5aux;
473 DROP TABLE IF EXISTS t6;
474 DROP TABLE IF EXISTS t6aux;
475 DROP TABLE IF EXISTS t7;
476 DROP TABLE IF EXISTS t7aux;
478 do_execsql_test 9.$tn.1 $sql
480 do_execsql_test 9.$tn.2 {
481 CREATE VIRTUAL TABLE t5aux USING fts4aux(t5);
482 INSERT INTO t5 VALUES('one two three/four.five.six');
485 four.five.six * 1 1 four.five.six 0 1 1
486 {one two three} * 1 1 {one two three} 0 1 1
489 do_execsql_test 9.$tn.3 {
490 CREATE VIRTUAL TABLE t6aux USING fts4aux(t6);
491 INSERT INTO t6 VALUES('alpha=beta"gamma/delta[epsilon]zeta');
494 {alpha=beta"gamma} * 1 1 {alpha=beta"gamma} 0 1 1
495 {delta[epsilon]zeta} * 1 1 {delta[epsilon]zeta} 0 1 1
498 do_execsql_test 9.$tn.4 {
499 CREATE VIRTUAL TABLE t7aux USING fts4aux(t7);
500 INSERT INTO t7 VALUES('alephxbeth\xC4gimel');
503 aleph * 1 1 aleph 0 1 1
504 beth * 1 1 beth 0 1 1
505 gimel * 1 1 gimel 0 1 1
509 # Check that multiple options are handled correctly.
511 do_execsql_test 10.1 {
512 DROP TABLE IF EXISTS t1;
513 CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61
514 "tokenchars=xyz" "tokenchars=.=" "separators=.=" "separators=xy"
515 "separators=a" "separators=a" "tokenchars=a" "tokenchars=a"
518 INSERT INTO t1 VALUES('oneatwoxthreeyfour');
519 INSERT INTO t1 VALUES('a.single=word');
520 CREATE VIRTUAL TABLE t1aux USING fts4aux(t1);
523 .single=word * 1 1 .single=word 0 1 1
524 four * 1 1 four 0 1 1
526 three * 1 1 three 0 1 1
530 # Test that case folding happens after tokenization, not before.
532 do_execsql_test 10.2 {
533 DROP TABLE IF EXISTS t2;
534 CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61 "separators=aB");
535 INSERT INTO t2 VALUES('oneatwoBthree');
536 INSERT INTO t2 VALUES('onebtwoAthree');
537 CREATE VIRTUAL TABLE t2aux USING fts4aux(t2);
541 onebtwoathree * 1 1 onebtwoathree 0 1 1
542 three * 1 1 three 0 1 1
546 # Test that the tokenchars and separators options work with the
547 # fts3tokenize table.
549 do_execsql_test 11.1 {
550 CREATE VIRTUAL TABLE ft1 USING fts3tokenize(
551 "unicode61", "tokenchars=@.", "separators=1234567890"
553 SELECT token FROM ft1 WHERE input = 'berlin@street123sydney.road';
555 berlin@street sydney.road