test/fts3rnd.test

   1 # 2009 December 03
   2 #
   3 #    May you do good and not evil.
   4 #    May you find forgiveness for yourself and forgive others.
   5 #    May you share freely, never taking more than you give.
   6 #
   7 #***********************************************************************
   8 #
   9 # Brute force (random data) tests for FTS3.
  10 #
  11
  12 #-------------------------------------------------------------------------
  13 #
  14 # The FTS3 tests implemented in this file focus on testing that FTS3
  15 # returns the correct set of documents for various types of full-text
  16 # query. This is done using pseudo-randomly generated data and queries.
  17 # The expected result of each query is calculated using Tcl code.
  18 #
  19 #   1. The database is initialized to contain a single table with three
  20 #      columns. 100 rows are inserted into the table. Each of the three
  21 #      values in each row is a document consisting of between 0 and 100
  22 #      terms. Terms are selected from a vocabulary of $G(nVocab) terms.
  23 #
  24 #   2. The following is performed 100 times:
  25 #
  26 #      a. A row is inserted into the database. The row contents are
  27 #         generated as in step 1. The docid is a pseudo-randomly selected
  28 #         value between 0 and 1000000.
  29 #
  30 #      b. A pseudo-randomly selected row is updated. One of its columns is
  31 #         set to contain a new document generated in the same way as the
  32 #         documents in step 1.
  33 #
  34 #      c. A pseudo-randomly selected row is deleted.
  35 #
  36 #      d. For each of several types of fts3 queries, 10 SELECT queries
  37 #         of the form:
  38 #
  39 #           SELECT docid FROM <tbl> WHERE <tbl> MATCH '<query>'
  40 #
  41 #         are evaluated. The results are compared to those calculated by
  42 #         Tcl code in this file. The patterns used for the different query
  43 #         types are:
  44 #
  45 #           1.  query = <term>
  46 #           2.  query = <prefix>
  47 #           3.  query = "<term> <term>"
  48 #           4.  query = "<term> <term> <term>"
  49 #           5.  query = "<prefix> <prefix> <prefix>"
  50 #           6.  query = <term> NEAR <term>
  51 #           7.  query = <term> NEAR/11 <term> NEAR/11 <term>
  52 #           8.  query = <term> OR <term>
  53 #           9.  query = <term> NOT <term>
  54 #           10. query = <term> AND <term>
  55 #           11. query = <term> NEAR <term> OR <term> NEAR <term>
  56 #           12. query = <term> NEAR <term> NOT <term> NEAR <term>
  57 #           13. query = <term> NEAR <term> AND <term> NEAR <term>
  58 #
  59 #         where <term> is a term pseudo-randomly selected from the vocabulary
  60 #         and <prefix> is such a term with its final character removed,
  61 #         followed by a "*" character.
  62 #
  63 #      Every second iteration, steps (a) through (d) above are performed
  64 #      within a single transaction. This forces the queries in (d) to
  65 #      read data from both the database and the in-memory hash table
  66 #      that caches the full-text index entries created by steps (a), (b)
  67 #      and (c) until the transaction is committed.
  68 #
  69 # The procedure above is run 5 times, using advisory fts3 node sizes of 50
  70 # (twice: first with order=DESC, then order=ASC), 500, 1000 and 2000 bytes.
  71 #
  72 # After the test using an advisory node-size of 50, an OOM test is run using
  73 # the database. This test is similar to step (d) above, except that it tests
  74 # the effects of transient and persistent OOM conditions encountered while
  75 # executing each query.
  76 #
  77
  78 set testdir [file dirname $argv0]
  79 source $testdir/tester.tcl
  80
  81 # If this build does not include FTS3, skip the tests in this file.
  82 #
  83 ifcapable !fts3 { finish_test ; return }
  84 source $testdir/fts3_common.tcl
  85 source $testdir/malloc_common.tcl
  86
  87 set G(nVocab) 100
  88
  89 set nVocab 100
  90 set lVocab [list]
  91
  92 expr srand(0)
  93
  94 # Generate a vocabulary of nVocab words. Each word is 3 characters long.
  95 #
  96 set lChar {a b c d e f g h i j k l m n o p q r s t u v w x y z}
  97 for {set i 0} {$i < $nVocab} {incr i} {
  98   set len [expr int(rand()*3)+2]
  99   set    word [lindex $lChar [expr int(rand()*26)]]
 100   append word [lindex $lChar [expr int(rand()*26)]]
 101   if {$len>2} { append word [lindex $lChar [expr int(rand()*26)]] }
 102   if {$len>3} { append word [lindex $lChar [expr int(rand()*26)]] }
 103   lappend lVocab $word
 104 }
 105
 106 proc random_term {} {
 107   lindex $::lVocab [expr {int(rand()*$::nVocab)}]
 108 }
 109
 110 # Return a document consisting of $nWord arbitrarily selected terms
 111 # from the $::lVocab list.
 112 #
 113 proc generate_doc {nWord} {
 114   set doc [list]
 115   for {set i 0} {$i < $nWord} {incr i} {
 116     lappend doc [random_term]
 117   }
 118   return $doc
 119 }
 120
 121
 122
 123 # Primitives to update the table.
 124 #
 125 unset -nocomplain t1
 126 proc insert_row {rowid} {
 127   set a [generate_doc [expr int((rand()*100))]]
 128   set b [generate_doc [expr int((rand()*100))]]
 129   set c [generate_doc [expr int((rand()*100))]]
 130   execsql { INSERT INTO t1(docid, a, b, c) VALUES($rowid, $a, $b, $c) }
 131   set ::t1($rowid) [list $a $b $c]
 132 }
 133 proc delete_row {rowid} {
 134   execsql { DELETE FROM t1 WHERE rowid = $rowid }
 135   catch {unset ::t1($rowid)}
 136 }
 137 proc update_row {rowid} {
 138   set cols {a b c}
 139   set iCol [expr int(rand()*3)]
 140   set doc  [generate_doc [expr int((rand()*100))]]
 141   lset ::t1($rowid) $iCol $doc
 142   execsql "UPDATE t1 SET [lindex $cols $iCol] = \$doc WHERE rowid = \$rowid"
 143 }
 144
 145 proc simple_phrase {zPrefix} {
 146   set ret [list]
 147
 148   set reg [string map {* {[^ ]*}} $zPrefix]
 149   set reg " $reg "
 150
 151   foreach key [lsort -integer [array names ::t1]] {
 152     set value $::t1($key)
 153     set cnt [list]
 154     foreach col $value {
 155       if {[regexp $reg " $col "]} { lappend ret $key ; break }
 156     }
 157   }
 158
 159   #lsort -uniq -integer $ret
 160   set ret
 161 }
 162
 163 # This [proc] is used to test the FTS3 matchinfo() function.
 164 #
 165 proc simple_token_matchinfo {zToken bDesc} {
 166
 167   set nDoc(0) 0
 168   set nDoc(1) 0
 169   set nDoc(2) 0
 170   set nHit(0) 0
 171   set nHit(1) 0
 172   set nHit(2) 0
 173
 174   set dir -inc
 175   if {$bDesc} { set dir -dec }
 176
 177   foreach key [array names ::t1] {
 178     set value $::t1($key)
 179     set a($key) [list]
 180     foreach i {0 1 2} col $value {
 181       set hit [llength [lsearch -all $col $zToken]]
 182       lappend a($key) $hit
 183       incr nHit($i) $hit
 184       if {$hit>0} { incr nDoc($i) }
 185     }
 186   }
 187
 188   set ret [list]
 189   foreach docid [lsort -integer $dir [array names a]] {
 190     if { [lindex [lsort -integer $a($docid)] end] } {
 191       set matchinfo [list 1 3]
 192       foreach i {0 1 2} hit $a($docid) {
 193         lappend matchinfo $hit $nHit($i) $nDoc($i)
 194       }
 195       lappend ret $docid $matchinfo
 196     }
 197   }
 198
 199   set ret
 200 }
 201
 202 proc simple_near {termlist nNear} {
 203   set ret [list]
 204
 205   foreach {key value} [array get ::t1] {
 206     foreach v $value {
 207
 208       set l [lsearch -exact -all $v [lindex $termlist 0]]
 209       foreach T [lrange $termlist 1 end] {
 210         set l2 [list]
 211         foreach i $l {
 212           set iStart [expr $i - $nNear - 1]
 213           set iEnd [expr $i + $nNear + 1]
 214           if {$iStart < 0} {set iStart 0}
 215           foreach i2 [lsearch -exact -all [lrange $v $iStart $iEnd] $T] {
 216             incr i2 $iStart
 217             if {$i2 != $i} { lappend l2 $i2 }
 218           }
 219         }
 220         set l [lsort -uniq -integer $l2]
 221       }
 222
 223       if {[llength $l]} {
 224 #puts "MATCH($key): $v"
 225         lappend ret $key
 226       }
 227     }
 228   }
 229
 230   lsort -unique -integer $ret
 231 }
 232
 233 # The following three procs:
 234 #
 235 #   setop_not A B
 236 #   setop_or  A B
 237 #   setop_and A B
 238 #
 239 # each take two arguments. Both arguments must be lists of integer values
 240 # sorted by value. The return value is the list produced by evaluating
 241 # the equivalent of "A op B", where op is the FTS3 operator NOT, OR or
 242 # AND.
 243 #
 244 proc setop_not {A B} {
 245   foreach b $B { set n($b) {} }
 246   set ret [list]
 247   foreach a $A { if {![info exists n($a)]} {lappend ret $a} }
 248   return $ret
 249 }
 250 proc setop_or {A B} {
 251   lsort -integer -uniq [concat $A $B]
 252 }
 253 proc setop_and {A B} {
 254   foreach b $B { set n($b) {} }
 255   set ret [list]
 256   foreach a $A { if {[info exists n($a)]} {lappend ret $a} }
 257   return $ret
 258 }
 259
 260 proc mit {blob} {
 261   set scan(littleEndian) i*
 262   set scan(bigEndian) I*
 263   binary scan $blob $scan($::tcl_platform(byteOrder)) r
 264   return $r
 265 }
 266 db func mit mit
 267 set sqlite_fts3_enable_parentheses 1
 268
 269 proc do_orderbydocid_test {tn sql res} {
 270   uplevel [list do_select_test $tn.asc "$sql ORDER BY docid ASC" $res]
 271   uplevel [list do_select_test $tn.desc "$sql ORDER BY docid DESC" \
 272     [lsort -int -dec $res]
 273   ]
 274 }
 275
 276 set NUM_TRIALS 100
 277
 278 foreach {nodesize order} {
 279   50    DESC
 280   50    ASC
 281   500   ASC
 282   1000  DESC
 283   2000  ASC
 284 } {
 285   catch { array unset ::t1 }
 286   set testname "$nodesize/$order"
 287
 288   # Create the FTS3 table. Populate it (and the Tcl array) with 100 rows.
 289   #
 290   db transaction {
 291     catchsql { DROP TABLE t1 }
 292     execsql "CREATE VIRTUAL TABLE t1 USING fts4(a, b, c, order=$order)"
 293     execsql "INSERT INTO t1(t1) VALUES('nodesize=$nodesize')"
 294     for {set i 0} {$i < 100} {incr i} { insert_row $i }
 295   }
 296
 297   for {set iTest 1} {$iTest <= $NUM_TRIALS} {incr iTest} {
 298     catchsql COMMIT
 299
 300     set DO_MALLOC_TEST 0
 301     set nRep 10
 302     if {$iTest==100 && $nodesize==50} {
 303       set DO_MALLOC_TEST 1
 304       set nRep 2
 305     }
 306
 307     set ::testprefix fts3rnd-1.$testname.$iTest
 308
 309     # Delete one row, update one row and insert one row.
 310     #
 311     set rows [array names ::t1]
 312     set nRow [llength $rows]
 313     set iUpdate [lindex $rows [expr {int(rand()*$nRow)}]]
 314     set iDelete $iUpdate
 315     while {$iDelete == $iUpdate} {
 316       set iDelete [lindex $rows [expr {int(rand()*$nRow)}]]
 317     }
 318     set iInsert $iUpdate
 319     while {[info exists ::t1($iInsert)]} {
 320       set iInsert [expr {int(rand()*1000000)}]
 321     }
 322     execsql BEGIN
 323       insert_row $iInsert
 324       update_row $iUpdate
 325       delete_row $iDelete
 326     if {0==($iTest%2)} { execsql COMMIT }
 327
 328     if {0==($iTest%2)} {
 329       #do_test 0 { fts3_integrity_check t1 } ok
 330     }
 331
 332     # Pick 10 terms from the vocabulary. Check that the results of querying
 333     # the database for the set of documents containing each of these terms
 334     # is the same as the result obtained by scanning the contents of the Tcl
 335     # array for each term.
 336     #
 337     for {set i 0} {$i < 10} {incr i} {
 338       set term [random_term]
 339       do_select_test 1.$i.asc {
 340         SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
 341         ORDER BY docid ASC
 342       } [simple_token_matchinfo $term 0]
 343       do_select_test 1.$i.desc {
 344         SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
 345         ORDER BY docid DESC
 346       } [simple_token_matchinfo $term 1]
 347     }
 348
 349     # This time, use the first two characters of each term as a term prefix
 350     # to query for. Test that querying the Tcl array produces the same results
 351     # as querying the FTS3 table for the prefix.
 352     #
 353     for {set i 0} {$i < $nRep} {incr i} {
 354       set prefix [string range [random_term] 0 end-1]
 355       set match "${prefix}*"
 356       do_orderbydocid_test 2.$i {
 357         SELECT docid FROM t1 WHERE t1 MATCH $match
 358       } [simple_phrase $match]
 359     }
 360
 361     # Similar to the above, except for phrase queries.
 362     #
 363     for {set i 0} {$i < $nRep} {incr i} {
 364       set term [list [random_term] [random_term]]
 365       set match "\"$term\""
 366       do_orderbydocid_test 3.$i {
 367         SELECT docid FROM t1 WHERE t1 MATCH $match
 368       } [simple_phrase $term]
 369     }
 370
 371     # Three word phrases.
 372     #
 373     for {set i 0} {$i < $nRep} {incr i} {
 374       set term [list [random_term] [random_term] [random_term]]
 375       set match "\"$term\""
 376       do_orderbydocid_test 4.$i {
 377         SELECT docid FROM t1 WHERE t1 MATCH $match
 378       } [simple_phrase $term]
 379     }
 380
 381     # Three word phrases made up of term-prefixes.
 382     #
 383     for {set i 0} {$i < $nRep} {incr i} {
 384       set    query "[string range [random_term] 0 end-1]* "
 385       append query "[string range [random_term] 0 end-1]* "
 386       append query "[string range [random_term] 0 end-1]*"
 387
 388       set match "\"$query\""
 389       do_orderbydocid_test 5.$i {
 390         SELECT docid FROM t1 WHERE t1 MATCH $match
 391       } [simple_phrase $query]
 392     }
 393
 394     # A NEAR query with terms as the arguments:
 395     #
 396     #     ... MATCH '$term1 NEAR $term2' ...
 397     #
 398     for {set i 0} {$i < $nRep} {incr i} {
 399       set terms [list [random_term] [random_term]]
 400       set match [join $terms " NEAR "]
 401       do_orderbydocid_test 6.$i {
 402         SELECT docid FROM t1 WHERE t1 MATCH $match
 403       } [simple_near $terms 10]
 404     }
 405
 406     # A 3-way NEAR query with terms as the arguments.
 407     #
 408     for {set i 0} {$i < $nRep} {incr i} {
 409       set terms [list [random_term] [random_term] [random_term]]
 410       set nNear 11
 411       set match [join $terms " NEAR/$nNear "]
 412       do_orderbydocid_test 7.$i {
 413         SELECT docid FROM t1 WHERE t1 MATCH $match
 414       } [simple_near $terms $nNear]
 415     }
 416
 417     # Set operations on simple term queries.
 418     #
 419     foreach {tn op proc} {
 420       8  OR  setop_or
 421       9  NOT setop_not
 422       10 AND setop_and
 423     } {
 424       for {set i 0} {$i < $nRep} {incr i} {
 425         set term1 [random_term]
 426         set term2 [random_term]
 427         set match "$term1 $op $term2"
 428         do_orderbydocid_test $tn.$i {
 429           SELECT docid FROM t1 WHERE t1 MATCH $match
 430         } [$proc [simple_phrase $term1] [simple_phrase $term2]]
 431       }
 432     }
 433
 434     # Set operations on NEAR queries.
 435     #
 436     foreach {tn op proc} {
 437       11 OR  setop_or
 438       12 NOT setop_not
 439       13 AND setop_and
 440     } {
 441       for {set i 0} {$i < $nRep} {incr i} {
 442         set term1 [random_term]
 443         set term2 [random_term]
 444         set term3 [random_term]
 445         set term4 [random_term]
 446         set match "$term1 NEAR $term2 $op $term3 NEAR $term4"
 447         do_orderbydocid_test $tn.$i {
 448           SELECT docid FROM t1 WHERE t1 MATCH $match
 449         } [$proc                                  \
 450             [simple_near [list $term1 $term2] 10] \
 451             [simple_near [list $term3 $term4] 10]
 452           ]
 453       }
 454     }
 455
 456     catchsql COMMIT
 457   }
 458 }
 459
 460 finish_test