# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
#
# The tests in this file focus on testing the "unicode" FTS tokenizer.
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl
ifcapable !fts3_unicode { finish_test ; return }
set ::testprefix fts4unicode

proc do_unicode_token_test {tn input res} {
  set input [string map {' ''} $input]
  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
  " [list [list {*}$res]]]
}

proc do_unicode_token_test2 {tn input res} {
  set input [string map {' ''} $input]
  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', '$input');
  " [list [list {*}$res]]]
}

proc do_unicode_token_test3 {tn args} {
  set res [lindex $args end]
  set sql "SELECT fts3_tokenizer_test('unicode61'"
  foreach a [lrange $args 0 end-1] {
    append sql ", '"
    append sql [string map {' ''} $a]
    append sql "'"
  }
  append sql ")"
  uplevel [list do_execsql_test $tn $sql [list [list {*}$res]]]
}
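
# A note on the expected values used throughout this file:
# fts3_tokenizer_test() returns a flat list of (position, folded-token,
# original-text) triples, so the results below read like "0 a a 1 b B".
# A minimal sketch of a direct call, assuming the test build registers
# the fts3_tokenizer_test() SQL function:
#
#   db one {SELECT fts3_tokenizer_test('unicode61', 'a B')}
#   ;# => "0 a a 1 b B"
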
do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
do_unicode_token_test 1.1 {Ä Ö Ü} {0 ä Ä 1 ö Ö 2 ü Ü}
do_unicode_token_test 1.2 {xÄx xÖx xÜx} {0 xäx xÄx 1 xöx xÖx 2 xüx xÜx}

# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
do_unicode_token_test 1.4 "\u1E9E" "0 ß \u1E9E"
do_unicode_token_test 1.5 "\u1E9E" "0 \uDF \u1E9E"

do_unicode_token_test 1.6 "The quick brown fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}
do_unicode_token_test 1.7 "The\u00bfquick\u224ebrown\u2263fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}

do_unicode_token_test2 1.8  {a B c D} {0 a a 1 b B 2 c c 3 d D}
do_unicode_token_test2 1.9  {Ä Ö Ü} {0 a Ä 1 o Ö 2 u Ü}
do_unicode_token_test2 1.10 {xÄx xÖx xÜx} {0 xax xÄx 1 xox xÖx 2 xux xÜx}

# Check that diacritics are removed by default (remove_diacritics=1) and
# that a combining diacritic does not break the token it appears in.
do_unicode_token_test2 1.11 "xx\u0301xx" "0 xxxx xx\u301xx"
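
# A companion sketch (added here for contrast; not one of the original
# tests): with remove_diacritics=0 the combining mark is preserved in the
# folded token, but it still does not split the token.
do_unicode_token_test 1.12 "xx\u0301xx" "0 xx\u0301xx xx\u0301xx"
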
#-------------------------------------------------------------------------
#
set docs [list {
  Enhance the INSERT syntax to allow multiple rows to be inserted via the
  VALUES clause.
} {
  Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
} {
  Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
} {
  Added the sqlite3_db_readonly() interface.
} {
  Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
  ability to add new PRAGMA statements or to override built-in PRAGMAs.
} {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
} {
  Added support for the FTS4 languageid option.
} {
  Documented support for the FTS4 content option. This feature has actually
  been in the code since version 3.7.9 but is only now considered to be
  officially supported.
} {
  Pending statements no longer block ROLLBACK. Instead, the pending statement
  will return SQLITE_ABORT upon next access after the ROLLBACK.
} {
  Improvements to the handling of CSV inputs in the command-line shell.
} {
  Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
  incorrectly converted into an INNER JOIN if the WHERE clause has indexable
  terms on the right table of the LEFT JOIN.
}]

set map(a) [list "\u00C4" "\u00E4"]  ;# LATIN LETTER A WITH DIAERESIS
set map(e) [list "\u00CB" "\u00EB"]  ;# LATIN LETTER E WITH DIAERESIS
set map(i) [list "\u00CF" "\u00EF"]  ;# LATIN LETTER I WITH DIAERESIS
set map(o) [list "\u00D6" "\u00F6"]  ;# LATIN LETTER O WITH DIAERESIS
set map(u) [list "\u00DC" "\u00FC"]  ;# LATIN LETTER U WITH DIAERESIS
set map(y) [list "\u0178" "\u00FF"]  ;# LATIN LETTER Y WITH DIAERESIS
set map(h) [list "\u1E26" "\u1E27"]  ;# LATIN LETTER H WITH DIAERESIS
set map(w) [list "\u1E84" "\u1E85"]  ;# LATIN LETTER W WITH DIAERESIS
set map(x) [list "\u1E8C" "\u1E8D"]  ;# LATIN LETTER X WITH DIAERESIS
foreach k [array names map] {
  lappend mappings [string toupper $k] [lindex $map($k) 0]
  lappend mappings $k [lindex $map($k) 1]
}

proc mapdoc {doc} {
  set doc [regsub -all {[[:space:]]+} $doc " "]
  string map $::mappings [string trim $doc]
}
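
# mapdoc collapses runs of whitespace to single spaces, trims the result,
# and swaps each mapped ASCII letter for its diaeresis form.  A small
# illustration (inferred from the mapping table above, not an original
# test):
#
#   mapdoc "  max  x  "   ;# => "m\u00E4\u1E8D \u1E8D"
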
do_test 2.0 {
  execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
  foreach doc $docs {
    set d [mapdoc $doc]
    execsql { INSERT INTO t2 VALUES($d) }
  }
} {}

do_test 2.1 {
  set q [mapdoc "row"]
  execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
} [list [mapdoc {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
}]]

foreach {tn query snippet} {
  2 "row"      { ...returns the value of y on the same [row] that contains
                 the maximum x value. }
  3 "ROW"      { ...returns the value of y on the same [row] that contains
                 the maximum x value. }
  4 "rollback" { ...[ROLLBACK]. Instead, the pending statement
                 will return SQLITE_ABORT upon next access after the [ROLLBACK]. }
  5 "rOllback" { ...[ROLLBACK]. Instead, the pending statement
                 will return SQLITE_ABORT upon next access after the [ROLLBACK]. }
  6 "lang*"    { Added support for the FTS4 [languageid] option. }
} {
  do_test 2.$tn {
    set q [mapdoc $query]
    execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
  } [list [mapdoc $snippet]]
}

#-------------------------------------------------------------------------
# Make sure the unicode61 tokenizer does not crash if it is passed a
# NULL pointer.
do_execsql_test 3.1 {
  CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);
  INSERT INTO t1 VALUES(NULL, 'a b c');
}

do_execsql_test 3.2 {
  SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'
} {{a [b] c}}

# The 16 INSERT ... SELECT statements below double the table on each pass,
# growing the single seeded row into 65536 rows in which the term "b" is
# very common.
do_execsql_test 3.3 {
  BEGIN;
  DELETE FROM t1;
  INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 VALUES('a b c', NULL);
  INSERT INTO t1 VALUES('a x c', NULL);
  COMMIT;
}

do_execsql_test 3.4 {
  SELECT * FROM t1 WHERE t1 MATCH 'a b';
} {{a b c} {}}

#-------------------------------------------------------------------------
#
reset_db

# Tests 4.* feed the tokenizer noncharacters, lone surrogates and
# ill-formed UTF-8.  The only pass condition is that nothing crashes.
do_test 4.1 {
  set a "abc\uFFFEdef"
  set b "abc\uD800def"
  set c "\uFFFEdef"
  set d "\uD800def"
  execsql {
    CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

do_test 4.2 {
  # 0xF7 introduces a four-byte UTF-8 sequence; the runs of 0xBF
  # continuation bytes make these sequences ill-formed or out of range.
  set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
  set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
  set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

do_test 4.3 {
  # As above, but with no leading or trailing well-formed characters.
  set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
  set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
  set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
  set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

#-------------------------------------------------------------------------

do_unicode_token_test3 5.1 {tokenchars=} {
  sqlite3_reset sqlite3_column_int
} {
  0 sqlite3 sqlite3
  1 reset reset
  2 sqlite3 sqlite3
  3 column column
  4 int int
}

do_unicode_token_test3 5.2 {tokenchars=_} {
  sqlite3_reset sqlite3_column_int
} {
  0 sqlite3_reset sqlite3_reset
  1 sqlite3_column_int sqlite3_column_int
}

do_unicode_token_test3 5.3 {separators=xyz} {
  Laotianxhorseyrunszfast
} {
  0 laotian Laotian
  1 horse horse
  2 runs runs
  3 fast fast
}

do_unicode_token_test3 5.4 {tokenchars=xyz} {
  Laotianxhorseyrunszfast
} {
  0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
}

do_unicode_token_test3 5.5 {tokenchars=_} {separators=zyx} {
  sqlite3_resetxsqlite3_column_intyhonda_phantom
} {
  0 sqlite3_reset sqlite3_reset
  1 sqlite3_column_int sqlite3_column_int
  2 honda_phantom honda_phantom
}

do_unicode_token_test3 5.6 "separators=\u05D1" "abc\u05D1def" {
  0 abc abc 1 def def
}

do_unicode_token_test3 5.7 \
  "tokenchars=\u2444\u2445" \
  "separators=\u05D0\u05D1\u05D2" \
  "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
  [list \
    0 \u2444fre\u2445sh \u2444fre\u2445sh \
    1 water water \
    2 fish fish \
    3 \u2445timer \u2445timer \
  ]

# Check that it is not possible to add a standalone diacritic codepoint
# to either separators or tokenchars.
do_unicode_token_test3 5.8 "separators=\u0301" \
  "hello\u0301world \u0301helloworld" \
  "0 helloworld hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.9 "tokenchars=\u0301" \
  "hello\u0301world \u0301helloworld" \
  "0 helloworld hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.10 "separators=\u0301" \
  "remove_diacritics=0" \
  "hello\u0301world \u0301helloworld" \
  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.11 "tokenchars=\u0301" \
  "remove_diacritics=0" \
  "hello\u0301world \u0301helloworld" \
  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"

#-------------------------------------------------------------------------

proc do_tokenize {tokenizer txt} {
  set res [list]
  foreach {a b c} [db one {SELECT fts3_tokenizer_test($tokenizer, $txt)}] {
    lappend res $b
  }
  set res
}

# Argument $lCp must be a list of codepoints (integers) that correspond
# to whitespace characters. This command creates a string $W from the
# codepoints, then tokenizes "${W}hello${W}world${W}" using tokenizer
# $tokenizer. The test passes if the tokenizer successfully extracts the
# two five-character tokens.
#
proc do_isspace_test {tn tokenizer lCp} {
  set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp]
  set txt "${whitespace}hello${whitespace}world${whitespace}"
  uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]
}
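
# For instance (an inferred illustration, not an original test): with
# lCp {32 160} the string under test is " \u00A0hello \u00A0world \u00A0",
# and the tokenizer is expected to reduce it to the tokens {hello world}.
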
set tokenizers [list unicode61]
ifcapable icu { lappend tokenizers icu }

# Some tests to check that both tokenizers identify white-space
# codepoints. All codepoints tested below are of type "Zs" in the
# UnicodeData.txt file.
foreach T $tokenizers {
  do_isspace_test 6.$T.1  $T 32
  do_isspace_test 6.$T.2  $T 160
  do_isspace_test 6.$T.3  $T 5760
  do_isspace_test 6.$T.4  $T 6158
  do_isspace_test 6.$T.5  $T 8192
  do_isspace_test 6.$T.6  $T 8193
  do_isspace_test 6.$T.7  $T 8194
  do_isspace_test 6.$T.8  $T 8195
  do_isspace_test 6.$T.9  $T 8196
  do_isspace_test 6.$T.10 $T 8197
  do_isspace_test 6.$T.11 $T 8198
  do_isspace_test 6.$T.12 $T 8199
  do_isspace_test 6.$T.13 $T 8200
  do_isspace_test 6.$T.14 $T 8201
  do_isspace_test 6.$T.15 $T 8202
  do_isspace_test 6.$T.16 $T 8239
  do_isspace_test 6.$T.17 $T 8287
  do_isspace_test 6.$T.18 $T 12288

  do_isspace_test 6.$T.19 $T {32 160 5760 6158}
  do_isspace_test 6.$T.20 $T {8192 8193 8194 8195}
  do_isspace_test 6.$T.21 $T {8196 8197 8198 8199}
  do_isspace_test 6.$T.22 $T {8200 8201 8202 8239}
  do_isspace_test 6.$T.23 $T {8287 12288}