# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
#
# The tests in this file focus on testing the "unicode" FTS tokenizer.
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl
ifcapable !fts3_unicode { finish_test ; return }
set ::testprefix fts4unicode

proc do_unicode_token_test {tn input res} {
  set input [string map {' ''} $input]
  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
  " [list [list {*}$res]]]
}

proc do_unicode_token_test2 {tn input res} {
  set input [string map {' ''} $input]
  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', '$input');
  " [list [list {*}$res]]]
}

proc do_unicode_token_test3 {tn args} {
  set res [lindex $args end]
  set sql "SELECT fts3_tokenizer_test('unicode61'"
  foreach a [lrange $args 0 end-1] {
    append sql ", '"
    append sql [string map {' ''} $a]
    append sql "'"
  }
  append sql ")"
  uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]]
}
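
# A note on the helpers above: fts3_tokenizer_test() returns a flat list of
# (position, folded-token, original-token) triples. As a hypothetical
# illustration (not one of the numbered tests below), a call such as:
#
#   do_unicode_token_test3 x.y {tokenchars=-} {one-Two} {0 one-two one-Two}
#
# expands to and runs the SQL:
#
#   SELECT fts3_tokenizer_test('unicode61', 'tokenchars=-', 'one-Two');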

do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}

do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
    "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC"

do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
    "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx"

# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E"

do_unicode_token_test 1.5 "The quick brown fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}
do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}

do_unicode_token_test2 1.7 {a B c D} {0 a a 1 b B 2 c c 3 d D}
do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "0 a \uC4 1 o \uD6 2 u \uDC"

do_unicode_token_test2 1.9 "x\uC4x x\uD6x x\uDCx" \
    "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx"

# Check that diacritics are removed if remove_diacritics=1 is specified
# (this is the default for unicode61), and that they do not break tokens.
do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"

# Title-case mappings work.
do_unicode_token_test 1.11 "\u01c5" "0 \u01c6 \u01c5"
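# (U+01C5 is the title-case digraph letter "Dž"; unicode61 folds it to its
# lower-case form U+01C6 "dž", as the expected output above shows.)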

#-------------------------------------------------------------------------
#
set docs [list {
  Enhance the INSERT syntax to allow multiple rows to be inserted via the
  VALUES clause.
} {
  Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
} {
  Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
} {
  Added the sqlite3_db_readonly() interface.
} {
  Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
  ability to add new PRAGMA statements or to override built-in PRAGMAs.
} {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
} {
  Added support for the FTS4 languageid option.
} {
  Documented support for the FTS4 content option. This feature has actually
  been in the code since version 3.7.9 but is only now considered to be
  officially supported.
} {
  Pending statements no longer block ROLLBACK. Instead, the pending statement
  will return SQLITE_ABORT upon next access after the ROLLBACK.
} {
  Improvements to the handling of CSV inputs in the command-line shell
} {
  Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
  incorrectly converted into an INNER JOIN if the WHERE clause indexable terms
}]

set map(a) [list "\u00C4" "\u00E4"] ; # LATIN LETTER A WITH DIAERESIS
set map(e) [list "\u00CB" "\u00EB"] ; # LATIN LETTER E WITH DIAERESIS
set map(i) [list "\u00CF" "\u00EF"] ; # LATIN LETTER I WITH DIAERESIS
set map(o) [list "\u00D6" "\u00F6"] ; # LATIN LETTER O WITH DIAERESIS
set map(u) [list "\u00DC" "\u00FC"] ; # LATIN LETTER U WITH DIAERESIS
set map(y) [list "\u0178" "\u00FF"] ; # LATIN LETTER Y WITH DIAERESIS
set map(h) [list "\u1E26" "\u1E27"] ; # LATIN LETTER H WITH DIAERESIS
set map(w) [list "\u1E84" "\u1E85"] ; # LATIN LETTER W WITH DIAERESIS
set map(x) [list "\u1E8C" "\u1E8D"] ; # LATIN LETTER X WITH DIAERESIS
foreach k [array names map] {
  lappend mappings [string toupper $k] [lindex $map($k) 0]
  lappend mappings $k [lindex $map($k) 1]
}
proc mapdoc {doc} {
  set doc [regsub -all {[[:space:]]+} $doc " "]
  string map $::mappings [string trim $doc]
}
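
# As a hypothetical illustration of mapdoc (not one of the numbered tests):
#
#   mapdoc "corn  bread"   ;# -> "c\u00F6rn br\u00EB\u00E4d"
#
# Runs of whitespace collapse to single spaces and each mapped ASCII letter
# is replaced by its diaeresis form, so the diacritic-folding tokenizer can
# be exercised with both mapped documents and mapped queries.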

do_test 2.0 {
  execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
  foreach doc $docs {
    set d [mapdoc $doc]
    execsql { INSERT INTO t2 VALUES($d) }
  }
} {}

do_test 2.1 {
  set q [mapdoc "row"]
  execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
} [list [mapdoc {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
}]]

foreach {tn query snippet} {
  2 "row"      {...returns the value of y on the same [row] that contains
                the maximum x value.}
  3 "ROW"      {...returns the value of y on the same [row] that contains
                the maximum x value.}
  4 "rollback" {...[ROLLBACK]. Instead, the pending statement
                will return SQLITE_ABORT upon next access after the [ROLLBACK].}
  5 "rOllback" {...[ROLLBACK]. Instead, the pending statement
                will return SQLITE_ABORT upon next access after the [ROLLBACK].}
  6 "lang*"    {Added support for the FTS4 [languageid] option.}
} {
  do_test 2.$tn {
    set q [mapdoc $query]
    execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
  } [list [mapdoc $snippet]]
}

#-------------------------------------------------------------------------
# Make sure the unicode61 tokenizer does not crash if it is passed a
# NULL pointer.
reset_db
do_execsql_test 3.1 {
  CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);
  INSERT INTO t1 VALUES(NULL, 'a b c');
}

do_execsql_test 3.2 {
  SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'
} {{a [b] c}}

do_execsql_test 3.3 {
  BEGIN;
  DELETE FROM t1;
  INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 VALUES('a b c', NULL);
  INSERT INTO t1 VALUES('a x c', NULL);
  COMMIT;
}

do_execsql_test 3.4 {
  SELECT * FROM t1 WHERE t1 MATCH 'a b';
} {{a b c} {}}

#-------------------------------------------------------------------------
#
reset_db

do_test 4.1 {
  set a "abc\uFFFEdef"
  set b "abc\uD800def"
  set c "\uFFFEdef"
  set d "\uD800abc"
  execsql {
    CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

do_test 4.2 {
  set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
  set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
  set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

do_test 4.3 {
  set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
  set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
  set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
  set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}
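
# The byte sequences above are deliberately malformed UTF-8: 0xF7 introduces
# a 4-byte sequence, 0xF7 0xBF 0xBF 0xBF would decode to a value past the
# U+10FFFF limit, and the longer runs leave trailing 0xBF continuation bytes
# with no lead byte. The tests only check that tokenizing such input neither
# crashes nor corrupts the index.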

#-------------------------------------------------------------------------

do_unicode_token_test3 5.1 {tokenchars=} {
  sqlite3_reset sqlite3_column_int
} {
  0 sqlite3 sqlite3
  1 reset reset
  2 sqlite3 sqlite3
  3 column column
  4 int int
}

do_unicode_token_test3 5.2 {tokenchars=_} {
  sqlite3_reset sqlite3_column_int
} {
  0 sqlite3_reset sqlite3_reset
  1 sqlite3_column_int sqlite3_column_int
}

do_unicode_token_test3 5.3 {separators=xyz} {
  Laotianxhorseyrunszfast
} {
  0 laotian Laotian
  1 horse horse
  2 runs runs
  3 fast fast
}

do_unicode_token_test3 5.4 {tokenchars=xyz} {
  Laotianxhorseyrunszfast
} {
  0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
}

do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {
  sqlite3_resetxsqlite3_column_intyhonda_phantom
} {
  0 sqlite3_reset sqlite3_reset
  1 sqlite3_column_int sqlite3_column_int
  2 honda_phantom honda_phantom
}

do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" {
  0 abc abc 1 def def
}

do_unicode_token_test3 5.7                             \
  "tokenchars=\u2444\u2445"                            \
  "separators=\u05D0\u05D1\u05D2"                      \
  "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
  [list                                                \
    0 \u2444fre\u2445sh \u2444fre\u2445sh              \
    1 water water                                      \
    2 fish fish                                        \
    3 \u2445timer \u2445timer                          \
  ]

# Check that it is not possible to add a standalone diacritic codepoint
# to either separators or tokenchars.
do_unicode_token_test3 5.8 "separators=\u0301"  \
  "hello\u0301world \u0301helloworld"           \
  "0 helloworld hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.9 "tokenchars=\u0301"  \
  "hello\u0301world \u0301helloworld"           \
  "0 helloworld hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.10 "separators=\u0301" \
  "remove_diacritics=0"                         \
  "hello\u0301world \u0301helloworld"           \
  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.11 "tokenchars=\u0301" \
  "remove_diacritics=0"                         \
  "hello\u0301world \u0301helloworld"           \
  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"

#-------------------------------------------------------------------------

proc do_tokenize {tokenizer txt} {
  set res [list]
  foreach {a b c} [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}] {
    lappend res $b
  }
  set res
}

# Argument $lCp must be a list of codepoints (integers) that
# correspond to whitespace characters. This command creates a string
# $W from the codepoints, then tokenizes "${W}hello${W}world${W}"
# using tokenizer $tokenizer. The test passes if the tokenizer successfully
# extracts the two 5-character tokens.
#
proc do_isspace_test {tn tokenizer lCp} {
  set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp]
  set txt "${whitespace}hello${whitespace}world${whitespace}"
  uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]
}
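
# For example (a hypothetical call, not one of the numbered tests below),
# do_isspace_test x unicode61 {32 160} builds the two-character whitespace
# string "\u0020\u00A0" and passes if tokenizing
# "\u0020\u00A0hello\u0020\u00A0world\u0020\u00A0" yields {hello world}.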

set tokenizers [list unicode61]
ifcapable icu { lappend tokenizers icu }

# Some tests to check that the tokenizers can both identify white-space
# codepoints. All codepoints tested below are of type "Zs" in the
# UnicodeData.txt file.
#
# Note that codepoint 6158 has changed from Zs to Cf in recent versions
# of UnicodeData.txt. So take that into account for the "icu" tests.
#
foreach T $tokenizers {
  do_isspace_test 6.$T.1  $T 32
  do_isspace_test 6.$T.2  $T 160
  do_isspace_test 6.$T.3  $T 5760
  if {$T!="icu"} {
    do_isspace_test 6.$T.4 $T 6158
  }
  do_isspace_test 6.$T.5  $T 8192
  do_isspace_test 6.$T.6  $T 8193
  do_isspace_test 6.$T.7  $T 8194
  do_isspace_test 6.$T.8  $T 8195
  do_isspace_test 6.$T.9  $T 8196
  do_isspace_test 6.$T.10 $T 8197
  do_isspace_test 6.$T.11 $T 8198
  do_isspace_test 6.$T.12 $T 8199
  do_isspace_test 6.$T.13 $T 8200
  do_isspace_test 6.$T.14 $T 8201
  do_isspace_test 6.$T.15 $T 8202
  if {$T!="icu"} {
    do_isspace_test 6.$T.16 $T 8239
  }
  do_isspace_test 6.$T.17 $T 8287
  do_isspace_test 6.$T.18 $T 12288

  if {$T!="icu"} {
    do_isspace_test 6.$T.19 $T {32 160 5760 6158}
  } else {
    do_isspace_test 6.$T.19 $T {32 160 5760 8192}
  }
  do_isspace_test 6.$T.20 $T {8192 8193 8194 8195}
  do_isspace_test 6.$T.21 $T {8196 8197 8198 8199}
  do_isspace_test 6.$T.22 $T {8200 8201 8202 8239}
  do_isspace_test 6.$T.23 $T {8287 12288}
}

#-------------------------------------------------------------------------
# Test that the private use ranges are treated as alphanumeric.
#
foreach {tn1 c} {
  1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
} {
  foreach {tn2 config res} {
    1 ""             "0 hello*world hello*world"
    2 "separators=*" "0 hello hello 1 world world"
  } {
    set config [string map [list * $c] $config]
    set input  [string map [list * $c] "hello*world"]
    set output [string map [list * $c] $res]
    do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
  }
}

#-------------------------------------------------------------------------
# Cursory test of remove_diacritics=0.
#
# 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
# 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
# 00E4;LATIN SMALL LETTER A WITH DIAERESIS
# 00F6;LATIN SMALL LETTER O WITH DIAERESIS
#
do_execsql_test 8.1.1 "
  CREATE VIRTUAL TABLE t3 USING fts4(tokenize=unicode61 'remove_diacritics=1');
  INSERT INTO t3 VALUES('o');
  INSERT INTO t3 VALUES('a');
  INSERT INTO t3 VALUES('O');
  INSERT INTO t3 VALUES('A');
  INSERT INTO t3 VALUES('\xD6');
  INSERT INTO t3 VALUES('\xC4');
  INSERT INTO t3 VALUES('\xF6');
  INSERT INTO t3 VALUES('\xE4');
"
do_execsql_test 8.1.2 {
  SELECT rowid FROM t3 WHERE t3 MATCH 'o';
} {1 3 5 7}
do_execsql_test 8.1.3 {
  SELECT rowid FROM t3 WHERE t3 MATCH 'a';
} {2 4 6 8}
do_execsql_test 8.2.1 {
  CREATE VIRTUAL TABLE t4 USING fts4(tokenize=unicode61 "remove_diacritics=0");
  INSERT INTO t4 SELECT * FROM t3;
}
do_execsql_test 8.2.2 {
  SELECT rowid FROM t4 WHERE t4 MATCH 'o';
} {1 3}
do_execsql_test 8.2.3 {
  SELECT rowid FROM t4 WHERE t4 MATCH 'a';
} {2 4}

#-------------------------------------------------------------------------
#
foreach {tn sql} {
  1 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);
    CREATE VIRTUAL TABLE t6 USING fts4(
        tokenize=unicode61 [tokenchars=="] "tokenchars=[]");
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 [separators=x\xC4]);
  }
  2 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 "tokenchars= .");
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 "tokenchars=[=""]");
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 "separators=x\xC4");
  }
  3 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 'tokenchars= .');
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 'tokenchars=="[]');
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 'separators=x\xC4');
  }
  4 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 `tokenchars= .`);
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 `tokenchars=[="]`);
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 `separators=x\xC4`);
  }
} {
  do_execsql_test 9.$tn.0 {
    DROP TABLE IF EXISTS t5;
    DROP TABLE IF EXISTS t5aux;
    DROP TABLE IF EXISTS t6;
    DROP TABLE IF EXISTS t6aux;
    DROP TABLE IF EXISTS t7;
    DROP TABLE IF EXISTS t7aux;
  }
  do_execsql_test 9.$tn.1 $sql

  do_execsql_test 9.$tn.2 {
    CREATE VIRTUAL TABLE t5aux USING fts4aux(t5);
    INSERT INTO t5 VALUES('one two three/four.five.six');
    SELECT * FROM t5aux;
  } {
    four.five.six   * 1 1 four.five.six   0 1 1
    {one two three} * 1 1 {one two three} 0 1 1
  }

  do_execsql_test 9.$tn.3 {
    CREATE VIRTUAL TABLE t6aux USING fts4aux(t6);
    INSERT INTO t6 VALUES('alpha=beta"gamma/delta[epsilon]zeta');
    SELECT * FROM t6aux;
  } {
    {alpha=beta"gamma}   * 1 1 {alpha=beta"gamma}   0 1 1
    {delta[epsilon]zeta} * 1 1 {delta[epsilon]zeta} 0 1 1
  }

  do_execsql_test 9.$tn.4 {
    CREATE VIRTUAL TABLE t7aux USING fts4aux(t7);
    INSERT INTO t7 VALUES('alephxbeth\xC4gimel');
    SELECT * FROM t7aux;
  } {
    aleph * 1 1 aleph 0 1 1
    beth  * 1 1 beth  0 1 1
    gimel * 1 1 gimel 0 1 1
  }
}

# Check that multiple options are handled correctly.
#
do_execsql_test 10.1 {
  DROP TABLE IF EXISTS t1;
  CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61
    "tokenchars=xyz" "tokenchars=.=" "separators=.=" "separators=xy"
    "separators=a" "separators=a" "tokenchars=a" "tokenchars=a"
  );

  INSERT INTO t1 VALUES('oneatwoxthreeyfour');
  INSERT INTO t1 VALUES('a.single=word');
  CREATE VIRTUAL TABLE t1aux USING fts4aux(t1);
  SELECT * FROM t1aux;
} {
  .single=word * 1 1 .single=word 0 1 1
  four         * 1 1 four         0 1 1
  one          * 1 1 one          0 1 1
  three        * 1 1 three        0 1 1
  two          * 1 1 two          0 1 1
}
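
# A rough note on why the output above looks this way (based on the unicode61
# implementation in fts3_unicode2.c): tokenchars= and separators= record an
# exception only for codepoints whose default class differs from the class
# requested. So "tokenchars=xyz" is a no-op (letters are already token
# characters) and the later "separators=xy" takes effect, while "tokenchars=.="
# takes effect and the later "separators=.=" is a no-op. The net result is
# that x, y and a act as separators, and "." and "=" act as token characters.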

# Test that case folding happens after tokenization, not before.
#
do_execsql_test 10.2 {
  DROP TABLE IF EXISTS t2;
  CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61 "separators=aB");
  INSERT INTO t2 VALUES('oneatwoBthree');
  INSERT INTO t2 VALUES('onebtwoAthree');
  CREATE VIRTUAL TABLE t2aux USING fts4aux(t2);
  SELECT * FROM t2aux;
} {
  one           * 1 1 one           0 1 1
  onebtwoathree * 1 1 onebtwoathree 0 1 1
  three         * 1 1 three         0 1 1
  two           * 1 1 two           0 1 1
}

#-------------------------------------------------------------------------
# Test that the tokenchars and separators options work with the
# fts3tokenize table.
#
do_execsql_test 11.1 {
  CREATE VIRTUAL TABLE ft1 USING fts3tokenize(
    "unicode61", "tokenchars=@.", "separators=1234567890"
  );
  SELECT token FROM ft1 WHERE input = 'berlin@street123sydney.road';
} {
  berlin@street sydney.road
}

#-------------------------------------------------------------------------
# Test for embedded nul characters in fts4 unicode index.
#
do_execsql_test 12.0 {
  CREATE VIRTUAL TABLE t12 USING fts4(tokenize=unicode61);
  INSERT INTO t12 VALUES('abc' || char(0) || 'def');
  SELECT hex(CAST(content AS blob)) FROM t12;
} {61626300646566}
do_execsql_test 12.1 {
  INSERT INTO t12(t12) VALUES('integrity-check');
}
do_execsql_test 12.2 {
  CREATE VIRTUAL TABLE t12aux USING fts4aux(t12);
  SELECT * FROM t12aux;
} {abc * 1 1 abc 0 1 1}
do_execsql_test 12.3 {
  SELECT hex(CAST(content AS blob)) FROM t12 WHERE t12 MATCH 'abc'
} {61626300646566}

finish_test