third_party/sqlite/sqlite-src-3080704/ext/fts3/fts3_test.c

   1 /*
   2 ** 2011 Jun 13
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 ******************************************************************************
  12 **
  13 ** This file is not part of the production FTS code. It is only used for
  14 ** testing. It contains a Tcl command that can be used to test if a document
  15 ** matches an FTS NEAR expression.
  16 **
  17 ** As of March 2012, it also contains a version 1 tokenizer used for testing
  18 ** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly.
  19 */
  20
  21 #include <tcl.h>
  22 #include <string.h>
  23 #include <assert.h>
  24
  25 #if defined(SQLITE_TEST)
  26 #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)
  27
  28 /* Required so that the "ifdef SQLITE_ENABLE_FTS3" below works */
  29 #include "fts3Int.h"
  30
  31 #define NM_MAX_TOKEN 12
  32
  33 typedef struct NearPhrase NearPhrase;
  34 typedef struct NearDocument NearDocument;
  35 typedef struct NearToken NearToken;
  36
  37 struct NearDocument {
  38   int nToken;                     /* Length of token in bytes */
  39   NearToken *aToken;              /* Token array */
  40 };
  41
  42 struct NearToken {
  43   int n;                          /* Length of token in bytes */
  44   const char *z;                  /* Pointer to token string */
  45 };
  46
  47 struct NearPhrase {
  48   int nNear;                      /* Preceding NEAR value */
  49   int nToken;                     /* Number of tokens in this phrase */
  50   NearToken aToken[NM_MAX_TOKEN]; /* Array of tokens in this phrase */
  51 };
  52
  53 static int nm_phrase_match(
  54   NearPhrase *p,
  55   NearToken *aToken
  56 ){
  57   int ii;
  58
  59   for(ii=0; ii<p->nToken; ii++){
  60     NearToken *pToken = &p->aToken[ii];
  61     if( pToken->n>0 && pToken->z[pToken->n-1]=='*' ){
  62       if( aToken[ii].n<(pToken->n-1) ) return 0;
  63       if( memcmp(aToken[ii].z, pToken->z, pToken->n-1) ) return 0;
  64     }else{
  65       if( aToken[ii].n!=pToken->n ) return 0;
  66       if( memcmp(aToken[ii].z, pToken->z, pToken->n) ) return 0;
  67     }
  68   }
  69
  70   return 1;
  71 }
  72
  73 static int nm_near_chain(
  74   int iDir,                       /* Direction to iterate through aPhrase[] */
  75   NearDocument *pDoc,             /* Document to match against */
  76   int iPos,                       /* Position at which iPhrase was found */
  77   int nPhrase,                    /* Size of phrase array */
  78   NearPhrase *aPhrase,            /* Phrase array */
  79   int iPhrase                     /* Index of phrase found */
  80 ){
  81   int iStart;
  82   int iStop;
  83   int ii;
  84   int nNear;
  85   int iPhrase2;
  86   NearPhrase *p;
  87   NearPhrase *pPrev;
  88
  89   assert( iDir==1 || iDir==-1 );
  90
  91   if( iDir==1 ){
  92     if( (iPhrase+1)==nPhrase ) return 1;
  93     nNear = aPhrase[iPhrase+1].nNear;
  94   }else{
  95     if( iPhrase==0 ) return 1;
  96     nNear = aPhrase[iPhrase].nNear;
  97   }
  98   pPrev = &aPhrase[iPhrase];
  99   iPhrase2 = iPhrase+iDir;
 100   p = &aPhrase[iPhrase2];
 101
 102   iStart = iPos - nNear - p->nToken;
 103   iStop = iPos + nNear + pPrev->nToken;
 104
 105   if( iStart<0 ) iStart = 0;
 106   if( iStop > pDoc->nToken - p->nToken ) iStop = pDoc->nToken - p->nToken;
 107
 108   for(ii=iStart; ii<=iStop; ii++){
 109     if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
 110       if( nm_near_chain(iDir, pDoc, ii, nPhrase, aPhrase, iPhrase2) ) return 1;
 111     }
 112   }
 113
 114   return 0;
 115 }
 116
 117 static int nm_match_count(
 118   NearDocument *pDoc,             /* Document to match against */
 119   int nPhrase,                    /* Size of phrase array */
 120   NearPhrase *aPhrase,            /* Phrase array */
 121   int iPhrase                     /* Index of phrase to count matches for */
 122 ){
 123   int nOcc = 0;
 124   int ii;
 125   NearPhrase *p = &aPhrase[iPhrase];
 126
 127   for(ii=0; ii<(pDoc->nToken + 1 - p->nToken); ii++){
 128     if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
 129       /* Test forward NEAR chain (i>iPhrase) */
 130       if( 0==nm_near_chain(1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;
 131
 132       /* Test reverse NEAR chain (i<iPhrase) */
 133       if( 0==nm_near_chain(-1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;
 134
 135       /* This is a real match. Increment the counter. */
 136       nOcc++;
 137     }
 138   }
 139
 140   return nOcc;
 141 }
 142
 143 /*
 144 ** Tclcmd: fts3_near_match DOCUMENT EXPR ?OPTIONS?
 145 */
 146 static int fts3_near_match_cmd(
 147   ClientData clientData,
 148   Tcl_Interp *interp,
 149   int objc,
 150   Tcl_Obj *CONST objv[]
 151 ){
 152   int nTotal = 0;
 153   int rc;
 154   int ii;
 155   int nPhrase;
 156   NearPhrase *aPhrase = 0;
 157   NearDocument doc = {0, 0};
 158   Tcl_Obj **apDocToken;
 159   Tcl_Obj *pRet;
 160   Tcl_Obj *pPhrasecount = 0;
 161
 162   Tcl_Obj **apExprToken;
 163   int nExprToken;
 164
 165   UNUSED_PARAMETER(clientData);
 166
 167   /* Must have 3 or more arguments. */
 168   if( objc<3 || (objc%2)==0 ){
 169     Tcl_WrongNumArgs(interp, 1, objv, "DOCUMENT EXPR ?OPTION VALUE?...");
 170     rc = TCL_ERROR;
 171     goto near_match_out;
 172   }
 173
 174   for(ii=3; ii<objc; ii+=2){
 175     enum NM_enum { NM_PHRASECOUNTS };
 176     struct TestnmSubcmd {
 177       char *zName;
 178       enum NM_enum eOpt;
 179     } aOpt[] = {
 180       { "-phrasecountvar", NM_PHRASECOUNTS },
 181       { 0, 0 }
 182     };
 183     int iOpt;
 184     if( Tcl_GetIndexFromObjStruct(
 185         interp, objv[ii], aOpt, sizeof(aOpt[0]), "option", 0, &iOpt)
 186     ){
 187       return TCL_ERROR;
 188     }
 189
 190     switch( aOpt[iOpt].eOpt ){
 191       case NM_PHRASECOUNTS:
 192         pPhrasecount = objv[ii+1];
 193         break;
 194     }
 195   }
 196
 197   rc = Tcl_ListObjGetElements(interp, objv[1], &doc.nToken, &apDocToken);
 198   if( rc!=TCL_OK ) goto near_match_out;
 199   doc.aToken = (NearToken *)ckalloc(doc.nToken*sizeof(NearToken));
 200   for(ii=0; ii<doc.nToken; ii++){
 201     doc.aToken[ii].z = Tcl_GetStringFromObj(apDocToken[ii], &doc.aToken[ii].n);
 202   }
 203
 204   rc = Tcl_ListObjGetElements(interp, objv[2], &nExprToken, &apExprToken);
 205   if( rc!=TCL_OK ) goto near_match_out;
 206
 207   nPhrase = (nExprToken + 1) / 2;
 208   aPhrase = (NearPhrase *)ckalloc(nPhrase * sizeof(NearPhrase));
 209   memset(aPhrase, 0, nPhrase * sizeof(NearPhrase));
 210   for(ii=0; ii<nPhrase; ii++){
 211     Tcl_Obj *pPhrase = apExprToken[ii*2];
 212     Tcl_Obj **apToken;
 213     int nToken;
 214     int jj;
 215
 216     rc = Tcl_ListObjGetElements(interp, pPhrase, &nToken, &apToken);
 217     if( rc!=TCL_OK ) goto near_match_out;
 218     if( nToken>NM_MAX_TOKEN ){
 219       Tcl_AppendResult(interp, "Too many tokens in phrase", 0);
 220       rc = TCL_ERROR;
 221       goto near_match_out;
 222     }
 223     for(jj=0; jj<nToken; jj++){
 224       NearToken *pT = &aPhrase[ii].aToken[jj];
 225       pT->z = Tcl_GetStringFromObj(apToken[jj], &pT->n);
 226     }
 227     aPhrase[ii].nToken = nToken;
 228   }
 229   for(ii=1; ii<nPhrase; ii++){
 230     Tcl_Obj *pNear = apExprToken[2*ii-1];
 231     int nNear;
 232     rc = Tcl_GetIntFromObj(interp, pNear, &nNear);
 233     if( rc!=TCL_OK ) goto near_match_out;
 234     aPhrase[ii].nNear = nNear;
 235   }
 236
 237   pRet = Tcl_NewObj();
 238   Tcl_IncrRefCount(pRet);
 239   for(ii=0; ii<nPhrase; ii++){
 240     int nOcc = nm_match_count(&doc, nPhrase, aPhrase, ii);
 241     Tcl_ListObjAppendElement(interp, pRet, Tcl_NewIntObj(nOcc));
 242     nTotal += nOcc;
 243   }
 244   if( pPhrasecount ){
 245     Tcl_ObjSetVar2(interp, pPhrasecount, 0, pRet, 0);
 246   }
 247   Tcl_DecrRefCount(pRet);
 248   Tcl_SetObjResult(interp, Tcl_NewBooleanObj(nTotal>0));
 249
 250  near_match_out:
 251   ckfree((char *)aPhrase);
 252   ckfree((char *)doc.aToken);
 253   return rc;
 254 }
 255
 256 /*
 257 **   Tclcmd: fts3_configure_incr_load ?CHUNKSIZE THRESHOLD?
 258 **
 259 ** Normally, FTS uses hard-coded values to determine the minimum doclist
 260 ** size eligible for incremental loading, and the size of the chunks loaded
 261 ** when a doclist is incrementally loaded. This command allows the built-in
 262 ** values to be overridden for testing purposes.
 263 **
 264 ** If present, the first argument is the chunksize in bytes to load doclists
 265 ** in. The second argument is the minimum doclist size in bytes to use
 266 ** incremental loading with.
 267 **
 268 ** Whether or not the arguments are present, this command returns a list of
 269 ** two integers - the initial chunksize and threshold when the command is
 270 ** invoked. This can be used to restore the default behavior after running
 271 ** tests. For example:
 272 **
 273 **    # Override incr-load settings for testing:
 274 **    set cfg [fts3_configure_incr_load $new_chunksize $new_threshold]
 275 **
 276 **    .... run tests ....
 277 **
 278 **    # Restore initial incr-load settings:
 279 **    eval fts3_configure_incr_load $cfg
 280 */
 281 static int fts3_configure_incr_load_cmd(
 282   ClientData clientData,
 283   Tcl_Interp *interp,
 284   int objc,
 285   Tcl_Obj *CONST objv[]
 286 ){
 287 #ifdef SQLITE_ENABLE_FTS3
 288   extern int test_fts3_node_chunksize;
 289   extern int test_fts3_node_chunk_threshold;
 290   Tcl_Obj *pRet;
 291
 292   if( objc!=1 && objc!=3 ){
 293     Tcl_WrongNumArgs(interp, 1, objv, "?CHUNKSIZE THRESHOLD?");
 294     return TCL_ERROR;
 295   }
 296
 297   pRet = Tcl_NewObj();
 298   Tcl_IncrRefCount(pRet);
 299   Tcl_ListObjAppendElement(
 300       interp, pRet, Tcl_NewIntObj(test_fts3_node_chunksize));
 301   Tcl_ListObjAppendElement(
 302       interp, pRet, Tcl_NewIntObj(test_fts3_node_chunk_threshold));
 303
 304   if( objc==3 ){
 305     int iArg1;
 306     int iArg2;
 307     if( Tcl_GetIntFromObj(interp, objv[1], &iArg1)
 308      || Tcl_GetIntFromObj(interp, objv[2], &iArg2)
 309     ){
 310       Tcl_DecrRefCount(pRet);
 311       return TCL_ERROR;
 312     }
 313     test_fts3_node_chunksize = iArg1;
 314     test_fts3_node_chunk_threshold = iArg2;
 315   }
 316
 317   Tcl_SetObjResult(interp, pRet);
 318   Tcl_DecrRefCount(pRet);
 319 #endif
 320   UNUSED_PARAMETER(clientData);
 321   return TCL_OK;
 322 }
 323
 324 #ifdef SQLITE_ENABLE_FTS3
 325 /**************************************************************************
 326 ** Beginning of test tokenizer code.
 327 **
 328 ** For language 0, this tokenizer is similar to the default 'simple'
 329 ** tokenizer. For other languages L, the following:
 330 **
 331 **   * Odd numbered languages are case-sensitive. Even numbered
 332 **     languages are not.
 333 **
 334 **   * Language ids 100 or greater are considered an error.
 335 **
 336 ** The implementation assumes that the input contains only ASCII characters
 337 ** (i.e. those that may be encoded in UTF-8 using a single byte).
 338 */
 339 typedef struct test_tokenizer {
 340   sqlite3_tokenizer base;
 341 } test_tokenizer;
 342
 343 typedef struct test_tokenizer_cursor {
 344   sqlite3_tokenizer_cursor base;
 345   const char *aInput;          /* Input being tokenized */
 346   int nInput;                  /* Size of the input in bytes */
 347   int iInput;                  /* Current offset in aInput */
 348   int iToken;                  /* Index of next token to be returned */
 349   char *aBuffer;               /* Buffer containing current token */
 350   int nBuffer;                 /* Number of bytes allocated at pToken */
 351   int iLangid;                 /* Configured language id */
 352 } test_tokenizer_cursor;
 353
 354 static int testTokenizerCreate(
 355   int argc, const char * const *argv,
 356   sqlite3_tokenizer **ppTokenizer
 357 ){
 358   test_tokenizer *pNew;
 359   UNUSED_PARAMETER(argc);
 360   UNUSED_PARAMETER(argv);
 361
 362   pNew = sqlite3_malloc(sizeof(test_tokenizer));
 363   if( !pNew ) return SQLITE_NOMEM;
 364   memset(pNew, 0, sizeof(test_tokenizer));
 365
 366   *ppTokenizer = (sqlite3_tokenizer *)pNew;
 367   return SQLITE_OK;
 368 }
 369
 370 static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){
 371   test_tokenizer *p = (test_tokenizer *)pTokenizer;
 372   sqlite3_free(p);
 373   return SQLITE_OK;
 374 }
 375
 376 static int testTokenizerOpen(
 377   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
 378   const char *pInput, int nBytes,        /* String to be tokenized */
 379   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
 380 ){
 381   int rc = SQLITE_OK;                    /* Return code */
 382   test_tokenizer_cursor *pCsr;           /* New cursor object */
 383
 384   UNUSED_PARAMETER(pTokenizer);
 385
 386   pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor));
 387   if( pCsr==0 ){
 388     rc = SQLITE_NOMEM;
 389   }else{
 390     memset(pCsr, 0, sizeof(test_tokenizer_cursor));
 391     pCsr->aInput = pInput;
 392     if( nBytes<0 ){
 393       pCsr->nInput = (int)strlen(pInput);
 394     }else{
 395       pCsr->nInput = nBytes;
 396     }
 397   }
 398
 399   *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
 400   return rc;
 401 }
 402
 403 static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){
 404   test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
 405   sqlite3_free(pCsr->aBuffer);
 406   sqlite3_free(pCsr);
 407   return SQLITE_OK;
 408 }
 409
 410 static int testIsTokenChar(char c){
 411   return (c>='a' && c<='z') || (c>='A' && c<='Z');
 412 }
 413 static int testTolower(char c){
 414   char ret = c;
 415   if( ret>='A' && ret<='Z') ret = ret - ('A'-'a');
 416   return ret;
 417 }
 418
 419 static int testTokenizerNext(
 420   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by testTokenizerOpen */
 421   const char **ppToken,               /* OUT: *ppToken is the token text */
 422   int *pnBytes,                       /* OUT: Number of bytes in token */
 423   int *piStartOffset,                 /* OUT: Starting offset of token */
 424   int *piEndOffset,                   /* OUT: Ending offset of token */
 425   int *piPosition                     /* OUT: Position integer of token */
 426 ){
 427   test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
 428   int rc = SQLITE_OK;
 429   const char *p;
 430   const char *pEnd;
 431
 432   p = &pCsr->aInput[pCsr->iInput];
 433   pEnd = &pCsr->aInput[pCsr->nInput];
 434
 435   /* Skip past any white-space */
 436   assert( p<=pEnd );
 437   while( p<pEnd && testIsTokenChar(*p)==0 ) p++;
 438
 439   if( p==pEnd ){
 440     rc = SQLITE_DONE;
 441   }else{
 442     /* Advance to the end of the token */
 443     const char *pToken = p;
 444     int nToken;
 445     while( p<pEnd && testIsTokenChar(*p) ) p++;
 446     nToken = (int)(p-pToken);
 447
 448     /* Copy the token into the buffer */
 449     if( nToken>pCsr->nBuffer ){
 450       sqlite3_free(pCsr->aBuffer);
 451       pCsr->aBuffer = sqlite3_malloc(nToken);
 452     }
 453     if( pCsr->aBuffer==0 ){
 454       rc = SQLITE_NOMEM;
 455     }else{
 456       int i;
 457
 458       if( pCsr->iLangid & 0x00000001 ){
 459         for(i=0; i<nToken; i++) pCsr->aBuffer[i] = pToken[i];
 460       }else{
 461         for(i=0; i<nToken; i++) pCsr->aBuffer[i] = testTolower(pToken[i]);
 462       }
 463       pCsr->iToken++;
 464       pCsr->iInput = (int)(p - pCsr->aInput);
 465
 466       *ppToken = pCsr->aBuffer;
 467       *pnBytes = nToken;
 468       *piStartOffset = (int)(pToken - pCsr->aInput);
 469       *piEndOffset = (int)(p - pCsr->aInput);
 470       *piPosition = pCsr->iToken;
 471     }
 472   }
 473
 474   return rc;
 475 }
 476
 477 static int testTokenizerLanguage(
 478   sqlite3_tokenizer_cursor *pCursor,
 479   int iLangid
 480 ){
 481   int rc = SQLITE_OK;
 482   test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
 483   pCsr->iLangid = iLangid;
 484   if( pCsr->iLangid>=100 ){
 485     rc = SQLITE_ERROR;
 486   }
 487   return rc;
 488 }
 489 #endif
 490
 491 static int fts3_test_tokenizer_cmd(
 492   ClientData clientData,
 493   Tcl_Interp *interp,
 494   int objc,
 495   Tcl_Obj *CONST objv[]
 496 ){
 497 #ifdef SQLITE_ENABLE_FTS3
 498   static const sqlite3_tokenizer_module testTokenizerModule = {
 499     1,
 500     testTokenizerCreate,
 501     testTokenizerDestroy,
 502     testTokenizerOpen,
 503     testTokenizerClose,
 504     testTokenizerNext,
 505     testTokenizerLanguage
 506   };
 507   const sqlite3_tokenizer_module *pPtr = &testTokenizerModule;
 508   if( objc!=1 ){
 509     Tcl_WrongNumArgs(interp, 1, objv, "");
 510     return TCL_ERROR;
 511   }
 512   Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(
 513     (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *)
 514   ));
 515 #endif
 516   UNUSED_PARAMETER(clientData);
 517   return TCL_OK;
 518 }
 519
 520 static int fts3_test_varint_cmd(
 521   ClientData clientData,
 522   Tcl_Interp *interp,
 523   int objc,
 524   Tcl_Obj *CONST objv[]
 525 ){
 526 #ifdef SQLITE_ENABLE_FTS3
 527   char aBuf[24];
 528   int rc;
 529   Tcl_WideInt w, w2;
 530   int nByte, nByte2;
 531
 532   if( objc!=2 ){
 533     Tcl_WrongNumArgs(interp, 1, objv, "INTEGER");
 534     return TCL_ERROR;
 535   }
 536
 537   rc = Tcl_GetWideIntFromObj(interp, objv[1], &w);
 538   if( rc!=TCL_OK ) return rc;
 539
 540   nByte = sqlite3Fts3PutVarint(aBuf, w);
 541   nByte2 = sqlite3Fts3GetVarint(aBuf, &w2);
 542   if( w!=w2 || nByte!=nByte2 ){
 543     char *zErr = sqlite3_mprintf("error testing %lld", w);
 544     Tcl_ResetResult(interp);
 545     Tcl_AppendResult(interp, zErr, 0);
 546     return TCL_ERROR;
 547   }
 548
 549   if( w<=2147483647 && w>=0 ){
 550     int i;
 551     nByte2 = fts3GetVarint32(aBuf, &i);
 552     if( (int)w!=i || nByte!=nByte2 ){
 553       char *zErr = sqlite3_mprintf("error testing %lld (32-bit)", w);
 554       Tcl_ResetResult(interp);
 555       Tcl_AppendResult(interp, zErr, 0);
 556       return TCL_ERROR;
 557     }
 558   }
 559
 560 #endif
 561   UNUSED_PARAMETER(clientData);
 562   return TCL_OK;
 563 }
 564
 565 /*
 566 ** End of tokenizer code.
 567 **************************************************************************/
 568
 569 int Sqlitetestfts3_Init(Tcl_Interp *interp){
 570   Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0);
 571   Tcl_CreateObjCommand(interp,
 572       "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0
 573   );
 574   Tcl_CreateObjCommand(
 575       interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0
 576   );
 577
 578   Tcl_CreateObjCommand(
 579       interp, "fts3_test_varint", fts3_test_varint_cmd, 0, 0
 580   );
 581   return TCL_OK;
 582 }
 583 #endif                  /* SQLITE_ENABLE_FTS3 || SQLITE_ENABLE_FTS4 */
 584 #endif                  /* ifdef SQLITE_TEST */