Snapshot of upstream SQLite 3.46.1
[sqlcipher.git] / ext / fts5 / test / fts5_common.tcl
blobfda388a6fbb184c95193e9bae8782e1015a3f35d
1 # 2014 Dec 19
3 # The author disclaims copyright to this source code. In place of
4 # a legal notice, here is a blessing:
6 # May you do good and not evil.
7 # May you find forgiveness for yourself and forgive others.
8 # May you share freely, never taking more than you give.
10 #***********************************************************************
13 if {![info exists testdir]} {
14 set testdir [file join [file dirname [info script]] .. .. .. test]
16 source $testdir/tester.tcl
18 ifcapable !fts5 {
19 proc return_if_no_fts5 {} {
20 finish_test
21 return -code return
23 return
24 } else {
25 proc return_if_no_fts5 {} {}
28 catch {
29 sqlite3_fts5_may_be_corrupt 0
30 reset_db
33 proc fts5_test_poslist {cmd} {
34 set res [list]
35 for {set i 0} {$i < [$cmd xInstCount]} {incr i} {
36 lappend res [string map {{ } .} [$cmd xInst $i]]
38 set res
41 proc fts5_test_poslist2 {cmd} {
42 set res [list]
44 for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
45 $cmd xPhraseForeach $i c o {
46 lappend res $i.$c.$o
50 #set res
51 sort_poslist $res
54 proc fts5_test_collist {cmd} {
55 set res [list]
57 for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
58 $cmd xPhraseColumnForeach $i c { lappend res $i.$c }
61 set res
64 proc fts5_collist {cmd iPhrase} {
65 set res [list]
66 $cmd xPhraseColumnForeach $iPhrase c { lappend res $c }
67 set res
70 proc fts5_test_columnsize {cmd} {
71 set res [list]
72 for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
73 lappend res [$cmd xColumnSize $i]
75 set res
78 proc fts5_columntext {cmd iCol} {
79 $cmd xColumnText $iCol
82 proc fts5_test_columntext {cmd} {
83 set res [list]
84 for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
85 lappend res [$cmd xColumnText $i]
87 set res
90 proc fts5_test_columntotalsize {cmd} {
91 set res [list]
92 for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
93 lappend res [$cmd xColumnTotalSize $i]
95 set res
98 proc test_append_token {varname token iStart iEnd} {
99 upvar $varname var
100 lappend var $token
101 return "SQLITE_OK"
103 proc fts5_test_tokenize {cmd} {
104 set res [list]
105 for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
106 set tokens [list]
107 $cmd xTokenize [$cmd xColumnText $i] [list test_append_token tokens]
108 lappend res $tokens
110 set res
113 proc fts5_test_rowcount {cmd} {
114 $cmd xRowCount
117 proc test_queryphrase_cb {cnt cmd} {
118 upvar $cnt L
119 for {set i 0} {$i < [$cmd xInstCount]} {incr i} {
120 foreach {ip ic io} [$cmd xInst $i] break
121 set A($ic) 1
123 foreach ic [array names A] {
124 lset L $ic [expr {[lindex $L $ic] + 1}]
127 proc fts5_test_queryphrase {cmd} {
128 set res [list]
129 for {set i 0} {$i < [$cmd xPhraseCount]} {incr i} {
130 set cnt [list]
131 for {set j 0} {$j < [$cmd xColumnCount]} {incr j} { lappend cnt 0 }
132 $cmd xQueryPhrase $i [list test_queryphrase_cb cnt]
133 lappend res $cnt
135 set res
138 proc fts5_queryphrase {cmd iPhrase} {
139 set cnt [list]
140 for {set j 0} {$j < [$cmd xColumnCount]} {incr j} { lappend cnt 0 }
141 $cmd xQueryPhrase $iPhrase [list test_queryphrase_cb cnt]
142 set cnt
145 proc fts5_test_phrasecount {cmd} {
146 $cmd xPhraseCount
149 proc fts5_test_all {cmd} {
150 set res [list]
151 lappend res columnsize [fts5_test_columnsize $cmd]
152 lappend res columntext [fts5_test_columntext $cmd]
153 lappend res columntotalsize [fts5_test_columntotalsize $cmd]
154 lappend res poslist [fts5_test_poslist $cmd]
155 lappend res tokenize [fts5_test_tokenize $cmd]
156 lappend res rowcount [fts5_test_rowcount $cmd]
157 set res
160 proc fts5_aux_test_functions {db} {
161 foreach f {
162 fts5_test_columnsize
163 fts5_test_columntext
164 fts5_test_columntotalsize
165 fts5_test_poslist
166 fts5_test_poslist2
167 fts5_test_collist
168 fts5_test_tokenize
169 fts5_test_rowcount
170 fts5_test_all
172 fts5_test_queryphrase
173 fts5_test_phrasecount
174 fts5_columntext
175 fts5_queryphrase
176 fts5_collist
178 sqlite3_fts5_create_function $db $f $f
182 proc fts5_segcount {tbl} {
183 set N 0
184 foreach n [fts5_level_segs $tbl] { incr N $n }
185 set N
188 proc fts5_level_segs {tbl} {
189 set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10"
190 set ret [list]
191 foreach L [lrange [db one $sql] 1 end] {
192 lappend ret [expr [llength $L] - 3]
194 set ret
197 proc fts5_level_segids {tbl} {
198 set sql "SELECT fts5_decode(rowid,block) aS r FROM ${tbl}_data WHERE rowid=10"
199 set ret [list]
200 foreach L [lrange [db one $sql] 1 end] {
201 set lvl [list]
202 foreach S [lrange $L 3 end] {
203 regexp {id=([1234567890]*)} $S -> segid
204 lappend lvl $segid
206 lappend ret $lvl
208 set ret
211 proc fts5_rnddoc {n} {
212 set map [list 0 a 1 b 2 c 3 d 4 e 5 f 6 g 7 h 8 i 9 j]
213 set doc [list]
214 for {set i 0} {$i < $n} {incr i} {
215 lappend doc "x[string map $map [format %.3d [expr int(rand()*1000)]]]"
217 set doc
220 #-------------------------------------------------------------------------
221 # Usage:
223 # nearset aCol ?-pc VARNAME? ?-near N? ?-col C? -- phrase1 phrase2...
225 # This command is used to test if a document (set of column values) matches
226 # the logical equivalent of a single FTS5 NEAR() clump and, if so, return
227 # the equivalent of an FTS5 position list.
229 # Parameter $aCol is passed a list of the column values for the document
230 # to test. Parameters $phrase1 and so on are the phrases.
232 # The result is a list of phrase hits. Each phrase hit is formatted as
233 # three integers separated by "." characters, in the following format:
235 # <phrase number> . <column number> . <token offset>
237 # Options:
239 # -near N (NEAR distance. Default 10)
240 # -col C (List of column indexes to match against)
241 # -pc VARNAME (variable in caller frame to use for phrase numbering)
242 # -dict VARNAME (array in caller frame to use for synonyms)
244 proc nearset {aCol args} {
246 # Process the command line options.
248 set O(-near) 10
249 set O(-col) {}
250 set O(-pc) ""
251 set O(-dict) ""
253 set nOpt [lsearch -exact $args --]
254 if {$nOpt<0} { error "no -- option" }
256 # Set $lPhrase to be a list of phrases. $nPhrase its length.
257 set lPhrase [lrange $args [expr $nOpt+1] end]
258 set nPhrase [llength $lPhrase]
260 foreach {k v} [lrange $args 0 [expr $nOpt-1]] {
261 if {[info exists O($k)]==0} { error "unrecognized option $k" }
262 set O($k) $v
265 if {$O(-pc) == ""} {
266 set counter 0
267 } else {
268 upvar $O(-pc) counter
271 if {$O(-dict)!=""} { upvar $O(-dict) aDict }
273 for {set j 0} {$j < [llength $aCol]} {incr j} {
274 for {set i 0} {$i < $nPhrase} {incr i} {
275 set A($j,$i) [list]
279 # Loop through each column of the current row.
280 for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} {
282 # If there is a column filter, test whether this column is excluded. If
283 # so, skip to the next iteration of this loop. Otherwise, set zCol to the
284 # column value and nToken to the number of tokens that comprise it.
285 if {$O(-col)!="" && [lsearch $O(-col) $iCol]<0} continue
286 set zCol [lindex $aCol $iCol]
287 set nToken [llength $zCol]
289 # Each iteration of the following loop searches a substring of the
290 # column value for phrase matches. The last token of the substring
291 # is token $iLast of the column value. The first token is:
293 # iFirst = ($iLast - $O(-near) - 1)
295 # where $sz is the length of the phrase being searched for. A phrase
296 # counts as matching the substring if its first token lies on or before
297 # $iLast and its last token on or after $iFirst.
299 # For example, if the query is "NEAR(a+b c, 2)" and the column value:
301 # "x x x x A B x x C x"
302 # 0 1 2 3 4 5 6 7 8 9"
304 # when (iLast==8 && iFirst=5) the range will contain both phrases and
305 # so both instances can be added to the output poslists.
307 set iLast [expr $O(-near) >= $nToken ? $nToken - 1 : $O(-near)]
308 for { } {$iLast < $nToken} {incr iLast} {
310 catch { array unset B }
312 for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
313 set p [lindex $lPhrase $iPhrase]
314 set nPm1 [expr {[llength $p] - 1}]
315 set iFirst [expr $iLast - $O(-near) - [llength $p]]
317 for {set i $iFirst} {$i <= $iLast} {incr i} {
318 set lCand [lrange $zCol $i [expr $i+$nPm1]]
319 set bMatch 1
320 foreach tok $p term $lCand {
321 if {[nearset_match aDict $tok $term]==0} { set bMatch 0 ; break }
323 if {$bMatch} { lappend B($iPhrase) $i }
326 if {![info exists B($iPhrase)]} break
329 if {$iPhrase==$nPhrase} {
330 for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
331 set A($iCol,$iPhrase) [concat $A($iCol,$iPhrase) $B($iPhrase)]
332 set A($iCol,$iPhrase) [lsort -integer -uniq $A($iCol,$iPhrase)]
338 set res [list]
339 #puts [array names A]
341 for {set iPhrase 0} {$iPhrase<$nPhrase} {incr iPhrase} {
342 for {set iCol 0} {$iCol < [llength $aCol]} {incr iCol} {
343 foreach a $A($iCol,$iPhrase) {
344 lappend res "$counter.$iCol.$a"
347 incr counter
350 #puts "$aCol -> $res"
351 sort_poslist $res
354 proc nearset_match {aDictVar tok term} {
355 if {[string match $tok $term]} { return 1 }
357 upvar $aDictVar aDict
358 if {[info exists aDict($tok)]} {
359 foreach s $aDict($tok) {
360 if {[string match $s $term]} { return 1 }
363 return 0;
366 #-------------------------------------------------------------------------
367 # Usage:
369 # sort_poslist LIST
371 # Sort a position list of the type returned by command [nearset]
373 proc sort_poslist {L} {
374 lsort -command instcompare $L
376 proc instcompare {lhs rhs} {
377 foreach {p1 c1 o1} [split $lhs .] {}
378 foreach {p2 c2 o2} [split $rhs .] {}
380 set res [expr $c1 - $c2]
381 if {$res==0} { set res [expr $o1 - $o2] }
382 if {$res==0} { set res [expr $p1 - $p2] }
384 return $res
387 #-------------------------------------------------------------------------
388 # Logical operators used by the commands returned by fts5_tcl_expr().
390 proc AND {args} {
391 foreach a $args {
392 if {[llength $a]==0} { return [list] }
394 sort_poslist [concat {*}$args]
396 proc OR {args} {
397 sort_poslist [concat {*}$args]
399 proc NOT {a b} {
400 if {[llength $b]>0} { return [list] }
401 return $a
404 #-------------------------------------------------------------------------
405 # This command is similar to [split], except that it also provides the
406 # start and end offsets of each token. For example:
408 # [fts5_tokenize_split "abc d ef"] -> {abc 0 3 d 4 5 ef 6 8}
411 proc gobble_whitespace {textvar} {
412 upvar $textvar t
413 regexp {([ ]*)(.*)} $t -> space t
414 return [string length $space]
417 proc gobble_text {textvar wordvar} {
418 upvar $textvar t
419 upvar $wordvar w
420 regexp {([^ ]*)(.*)} $t -> w t
421 return [string length $w]
424 proc fts5_tokenize_split {text} {
425 set token ""
426 set ret [list]
427 set iOff [gobble_whitespace text]
428 while {[set nToken [gobble_text text word]]} {
429 lappend ret $word $iOff [expr $iOff+$nToken]
430 incr iOff $nToken
431 incr iOff [gobble_whitespace text]
434 set ret
437 #-------------------------------------------------------------------------
439 proc foreach_detail_mode {prefix script} {
440 set saved $::testprefix
441 foreach d [list full col none] {
442 set s [string map [list %DETAIL% $d] $script]
443 set ::detail $d
444 set ::testprefix "$prefix-$d"
445 reset_db
446 uplevel $s
447 unset ::detail
449 set ::testprefix $saved
452 proc detail_check {} {
453 if {$::detail != "none" && $::detail!="full" && $::detail!="col"} {
454 error "not in foreach_detail_mode {...} block"
457 proc detail_is_none {} { detail_check ; expr {$::detail == "none"} }
458 proc detail_is_col {} { detail_check ; expr {$::detail == "col" } }
459 proc detail_is_full {} { detail_check ; expr {$::detail == "full"} }
461 proc foreach_tokenizer_mode {prefix script} {
462 set saved $::testprefix
463 foreach {d mapping} {
464 "" {}
465 "-origintext" {, tokenize="origintext unicode61", tokendata=1}
467 set s [string map [list %TOKENIZER% $mapping] $script]
468 set ::testprefix "$prefix$d"
469 reset_db
470 sqlite3_fts5_register_origintext db
471 uplevel $s
473 set ::testprefix $saved
476 #-------------------------------------------------------------------------
477 # Convert a poslist of the type returned by fts5_test_poslist() to a
478 # collist as returned by fts5_test_collist().
480 proc fts5_poslist2collist {poslist} {
481 set res [list]
482 foreach h $poslist {
483 regexp {(.*)\.[1234567890]+} $h -> cand
484 lappend res $cand
486 set res [lsort -command fts5_collist_elem_compare -unique $res]
487 return $res
490 # Comparison function used by fts5_poslist2collist to sort collist entries.
491 proc fts5_collist_elem_compare {a b} {
492 foreach {a1 a2} [split $a .] {}
493 foreach {b1 b2} [split $b .] {}
495 if {$a1==$b1} { return [expr $a2 - $b2] }
496 return [expr $a1 - $b1]
500 #--------------------------------------------------------------------------
501 # Construct and return a tcl list equivalent to that returned by the SQL
502 # query executed against database handle [db]:
504 # SELECT
505 # rowid,
506 # fts5_test_poslist($tbl),
507 # fts5_test_collist($tbl)
508 # FROM $tbl('$expr')
509 # ORDER BY rowid $order;
511 proc fts5_query_data {expr tbl {order ASC} {aDictVar ""}} {
513 # Figure out the set of columns in the FTS5 table. This routine does
514 # not handle tables with UNINDEXED columns, but if it did, it would
515 # have to be here.
516 db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) }
518 set d ""
519 if {$aDictVar != ""} {
520 upvar $aDictVar aDict
521 set d aDict
524 set cols ""
525 foreach e $lCols { append cols ", '$e'" }
526 set tclexpr [db one [subst -novar {
527 SELECT fts5_expr_tcl( $expr, 'nearset $cols -dict $d -pc ::pc' [set cols] )
530 set res [list]
531 db eval "SELECT rowid, * FROM $tbl ORDER BY rowid $order" x {
532 set cols [list]
533 foreach col $lCols { lappend cols $x($col) }
535 set ::pc 0
536 set rowdata [eval $tclexpr]
537 if {$rowdata != ""} {
538 lappend res $x(rowid) $rowdata [fts5_poslist2collist $rowdata]
542 set res
545 #-------------------------------------------------------------------------
546 # Similar to [fts5_query_data], but omit the collist field.
548 proc fts5_poslist_data {expr tbl {order ASC} {aDictVar ""}} {
549 set res [list]
551 if {$aDictVar!=""} {
552 upvar $aDictVar aDict
553 set dict aDict
554 } else {
555 set dict ""
558 foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] {
559 lappend res $rowid $poslist
561 set res
564 proc fts5_collist_data {expr tbl {order ASC} {aDictVar ""}} {
565 set res [list]
567 if {$aDictVar!=""} {
568 upvar $aDictVar aDict
569 set dict aDict
570 } else {
571 set dict ""
574 foreach {rowid poslist collist} [fts5_query_data $expr $tbl $order $dict] {
575 lappend res $rowid $collist
577 set res
580 #-------------------------------------------------------------------------
583 # This command will only work inside a [foreach_detail_mode] block. It tests
584 # whether or not expression $expr run on FTS5 table $tbl is supported by
585 # the current mode. If so, 1 is returned. If not, 0.
587 # detail=full (all queries supported)
588 # detail=col (all but phrase queries and NEAR queries)
589 # detail=none (all but phrase queries, NEAR queries, and column filters)
591 proc fts5_expr_ok {expr tbl} {
593 if {![detail_is_full]} {
594 set nearset "nearset_rc"
595 if {[detail_is_col]} { set nearset "nearset_rf" }
597 set ::expr_not_ok 0
598 db eval "PRAGMA table_info = $tbl" x { lappend lCols $x(name) }
600 set cols ""
601 foreach e $lCols { append cols ", '$e'" }
602 set ::pc 0
603 set tclexpr [db one [subst -novar {
604 SELECT fts5_expr_tcl( $expr, '[set nearset] $cols -pc ::pc' [set cols] )
606 eval $tclexpr
607 if {$::expr_not_ok} { return 0 }
610 return 1
613 # Helper for [fts5_expr_ok]
614 proc nearset_rf {aCol args} {
615 set idx [lsearch -exact $args --]
616 if {$idx != [llength $args]-2 || [llength [lindex $args end]]!=1} {
617 set ::expr_not_ok 1
619 list
622 # Helper for [fts5_expr_ok]
623 proc nearset_rc {aCol args} {
624 nearset_rf $aCol {*}$args
625 if {[lsearch $args -col]>=0} {
626 set ::expr_not_ok 1
628 list
631 proc dump {tname} {
632 execsql_pp "SELECT * FROM ${tname}_idx"
633 execsql_pp "SELECT id, quote(block), fts5_decode(id,block) FROM ${tname}_data"
636 #-------------------------------------------------------------------------
637 # Code for a simple Tcl tokenizer that supports synonyms at query time.
639 proc tclnum_tokenize {mode tflags text} {
640 foreach {w iStart iEnd} [fts5_tokenize_split $text] {
641 sqlite3_fts5_token $w $iStart $iEnd
642 if {$tflags == $mode && [info exists ::tclnum_syn($w)]} {
643 foreach s $::tclnum_syn($w) { sqlite3_fts5_token -colo $s $iStart $iEnd }
648 proc tclnum_create {args} {
649 set mode query
650 if {[llength $args]} {
651 set mode [lindex $args 0]
653 if {$mode != "query" && $mode != "document"} { error "bad mode: $mode" }
654 return [list tclnum_tokenize $mode]
657 proc fts5_tclnum_register {db} {
658 foreach SYNDICT {
659 {zero 0}
660 {one 1 i}
661 {two 2 ii}
662 {three 3 iii}
663 {four 4 iv}
664 {five 5 v}
665 {six 6 vi}
666 {seven 7 vii}
667 {eight 8 viii}
668 {nine 9 ix}
670 {a1 a2 a3 a4 a5 a6 a7 a8 a9}
671 {b1 b2 b3 b4 b5 b6 b7 b8 b9}
672 {c1 c2 c3 c4 c5 c6 c7 c8 c9}
674 foreach s $SYNDICT {
675 set o [list]
676 foreach x $SYNDICT {if {$x!=$s} {lappend o $x}}
677 set ::tclnum_syn($s) $o
680 sqlite3_fts5_create_tokenizer db tclnum tclnum_create
683 # End of tokenizer code.
684 #-------------------------------------------------------------------------