Snapshot of upstream SQLite 3.46.1
[sqlcipher.git] / ext / fts3 / fts3_test.c
blob49a8476bf3e8b6f6ea4083d3a6db3b6e0ccec322
1 /*
2 ** 2011 Jun 13
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
13 ** This file is not part of the production FTS code. It is only used for
14 ** testing. It contains a Tcl command that can be used to test if a document
15 ** matches an FTS NEAR expression.
17 ** As of March 2012, it also contains a version 1 tokenizer used for testing
18 ** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly.
21 #if defined(INCLUDE_SQLITE_TCL_H)
22 # include "sqlite_tcl.h"
23 #else
24 # include "tcl.h"
25 # ifndef SQLITE_TCLAPI
26 # define SQLITE_TCLAPI
27 # endif
28 #endif
29 #include <string.h>
30 #include <assert.h>
32 #if defined(SQLITE_TEST)
33 #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)
35 /* Required so that the "ifdef SQLITE_ENABLE_FTS3" below works */
36 #include "fts3Int.h"
#define NM_MAX_TOKEN 12     /* Maximum number of tokens in a single phrase */

typedef struct NearPhrase NearPhrase;
typedef struct NearDocument NearDocument;
typedef struct NearToken NearToken;

/*
** A document to match against, represented as an array of tokens.
*/
struct NearDocument {
  int nToken;                     /* Number of tokens in aToken[] */
  NearToken *aToken;              /* Token array */
};

/*
** A single token. The length is carried explicitly in n; the matching
** code only ever compares n bytes starting at z.
*/
struct NearToken {
  int n;                          /* Length of token in bytes */
  const char *z;                  /* Pointer to token string */
};

/*
** One phrase of a NEAR expression. nNear is the NEAR distance between this
** phrase and the one before it in the expression; it is left at zero for
** the first phrase.
*/
struct NearPhrase {
  int nNear;                      /* Preceding NEAR value */
  int nToken;                     /* Number of tokens in this phrase */
  NearToken aToken[NM_MAX_TOKEN]; /* Array of tokens in this phrase */
};
60 static int nm_phrase_match(
61 NearPhrase *p,
62 NearToken *aToken
64 int ii;
66 for(ii=0; ii<p->nToken; ii++){
67 NearToken *pToken = &p->aToken[ii];
68 if( pToken->n>0 && pToken->z[pToken->n-1]=='*' ){
69 if( aToken[ii].n<(pToken->n-1) ) return 0;
70 if( memcmp(aToken[ii].z, pToken->z, pToken->n-1) ) return 0;
71 }else{
72 if( aToken[ii].n!=pToken->n ) return 0;
73 if( memcmp(aToken[ii].z, pToken->z, pToken->n) ) return 0;
77 return 1;
80 static int nm_near_chain(
81 int iDir, /* Direction to iterate through aPhrase[] */
82 NearDocument *pDoc, /* Document to match against */
83 int iPos, /* Position at which iPhrase was found */
84 int nPhrase, /* Size of phrase array */
85 NearPhrase *aPhrase, /* Phrase array */
86 int iPhrase /* Index of phrase found */
88 int iStart;
89 int iStop;
90 int ii;
91 int nNear;
92 int iPhrase2;
93 NearPhrase *p;
94 NearPhrase *pPrev;
96 assert( iDir==1 || iDir==-1 );
98 if( iDir==1 ){
99 if( (iPhrase+1)==nPhrase ) return 1;
100 nNear = aPhrase[iPhrase+1].nNear;
101 }else{
102 if( iPhrase==0 ) return 1;
103 nNear = aPhrase[iPhrase].nNear;
105 pPrev = &aPhrase[iPhrase];
106 iPhrase2 = iPhrase+iDir;
107 p = &aPhrase[iPhrase2];
109 iStart = iPos - nNear - p->nToken;
110 iStop = iPos + nNear + pPrev->nToken;
112 if( iStart<0 ) iStart = 0;
113 if( iStop > pDoc->nToken - p->nToken ) iStop = pDoc->nToken - p->nToken;
115 for(ii=iStart; ii<=iStop; ii++){
116 if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
117 if( nm_near_chain(iDir, pDoc, ii, nPhrase, aPhrase, iPhrase2) ) return 1;
121 return 0;
124 static int nm_match_count(
125 NearDocument *pDoc, /* Document to match against */
126 int nPhrase, /* Size of phrase array */
127 NearPhrase *aPhrase, /* Phrase array */
128 int iPhrase /* Index of phrase to count matches for */
130 int nOcc = 0;
131 int ii;
132 NearPhrase *p = &aPhrase[iPhrase];
134 for(ii=0; ii<(pDoc->nToken + 1 - p->nToken); ii++){
135 if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
136 /* Test forward NEAR chain (i>iPhrase) */
137 if( 0==nm_near_chain(1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;
139 /* Test reverse NEAR chain (i<iPhrase) */
140 if( 0==nm_near_chain(-1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;
142 /* This is a real match. Increment the counter. */
143 nOcc++;
147 return nOcc;
151 ** Tclcmd: fts3_near_match DOCUMENT EXPR ?OPTIONS?
153 static int SQLITE_TCLAPI fts3_near_match_cmd(
154 ClientData clientData,
155 Tcl_Interp *interp,
156 int objc,
157 Tcl_Obj *CONST objv[]
159 int nTotal = 0;
160 int rc;
161 int ii;
162 int nPhrase;
163 NearPhrase *aPhrase = 0;
164 NearDocument doc = {0, 0};
165 Tcl_Obj **apDocToken;
166 Tcl_Obj *pRet;
167 Tcl_Obj *pPhrasecount = 0;
169 Tcl_Obj **apExprToken;
170 int nExprToken;
172 UNUSED_PARAMETER(clientData);
174 /* Must have 3 or more arguments. */
175 if( objc<3 || (objc%2)==0 ){
176 Tcl_WrongNumArgs(interp, 1, objv, "DOCUMENT EXPR ?OPTION VALUE?...");
177 rc = TCL_ERROR;
178 goto near_match_out;
181 for(ii=3; ii<objc; ii+=2){
182 enum NM_enum { NM_PHRASECOUNTS };
183 struct TestnmSubcmd {
184 char *zName;
185 enum NM_enum eOpt;
186 } aOpt[] = {
187 { "-phrasecountvar", NM_PHRASECOUNTS },
188 { 0, 0 }
190 int iOpt;
191 if( Tcl_GetIndexFromObjStruct(
192 interp, objv[ii], aOpt, sizeof(aOpt[0]), "option", 0, &iOpt)
194 return TCL_ERROR;
197 switch( aOpt[iOpt].eOpt ){
198 case NM_PHRASECOUNTS:
199 pPhrasecount = objv[ii+1];
200 break;
204 rc = Tcl_ListObjGetElements(interp, objv[1], &doc.nToken, &apDocToken);
205 if( rc!=TCL_OK ) goto near_match_out;
206 doc.aToken = (NearToken *)ckalloc(doc.nToken*sizeof(NearToken));
207 for(ii=0; ii<doc.nToken; ii++){
208 doc.aToken[ii].z = Tcl_GetStringFromObj(apDocToken[ii], &doc.aToken[ii].n);
211 rc = Tcl_ListObjGetElements(interp, objv[2], &nExprToken, &apExprToken);
212 if( rc!=TCL_OK ) goto near_match_out;
214 nPhrase = (nExprToken + 1) / 2;
215 aPhrase = (NearPhrase *)ckalloc(nPhrase * sizeof(NearPhrase));
216 memset(aPhrase, 0, nPhrase * sizeof(NearPhrase));
217 for(ii=0; ii<nPhrase; ii++){
218 Tcl_Obj *pPhrase = apExprToken[ii*2];
219 Tcl_Obj **apToken;
220 int nToken;
221 int jj;
223 rc = Tcl_ListObjGetElements(interp, pPhrase, &nToken, &apToken);
224 if( rc!=TCL_OK ) goto near_match_out;
225 if( nToken>NM_MAX_TOKEN ){
226 Tcl_AppendResult(interp, "Too many tokens in phrase", 0);
227 rc = TCL_ERROR;
228 goto near_match_out;
230 for(jj=0; jj<nToken; jj++){
231 NearToken *pT = &aPhrase[ii].aToken[jj];
232 pT->z = Tcl_GetStringFromObj(apToken[jj], &pT->n);
234 aPhrase[ii].nToken = nToken;
236 for(ii=1; ii<nPhrase; ii++){
237 Tcl_Obj *pNear = apExprToken[2*ii-1];
238 int nNear;
239 rc = Tcl_GetIntFromObj(interp, pNear, &nNear);
240 if( rc!=TCL_OK ) goto near_match_out;
241 aPhrase[ii].nNear = nNear;
244 pRet = Tcl_NewObj();
245 Tcl_IncrRefCount(pRet);
246 for(ii=0; ii<nPhrase; ii++){
247 int nOcc = nm_match_count(&doc, nPhrase, aPhrase, ii);
248 Tcl_ListObjAppendElement(interp, pRet, Tcl_NewIntObj(nOcc));
249 nTotal += nOcc;
251 if( pPhrasecount ){
252 Tcl_ObjSetVar2(interp, pPhrasecount, 0, pRet, 0);
254 Tcl_DecrRefCount(pRet);
255 Tcl_SetObjResult(interp, Tcl_NewBooleanObj(nTotal>0));
257 near_match_out:
258 ckfree((char *)aPhrase);
259 ckfree((char *)doc.aToken);
260 return rc;
264 ** Tclcmd: fts3_configure_incr_load ?CHUNKSIZE THRESHOLD?
266 ** Normally, FTS uses hard-coded values to determine the minimum doclist
267 ** size eligible for incremental loading, and the size of the chunks loaded
268 ** when a doclist is incrementally loaded. This command allows the built-in
269 ** values to be overridden for testing purposes.
271 ** If present, the first argument is the chunksize in bytes to load doclists
272 ** in. The second argument is the minimum doclist size in bytes to use
273 ** incremental loading with.
275 ** Whether or not the arguments are present, this command returns a list of
276 ** two integers - the initial chunksize and threshold when the command is
277 ** invoked. This can be used to restore the default behavior after running
278 ** tests. For example:
280 ** # Override incr-load settings for testing:
281 ** set cfg [fts3_configure_incr_load $new_chunksize $new_threshold]
283 ** .... run tests ....
285 ** # Restore initial incr-load settings:
286 ** eval fts3_configure_incr_load $cfg
288 static int SQLITE_TCLAPI fts3_configure_incr_load_cmd(
289 ClientData clientData,
290 Tcl_Interp *interp,
291 int objc,
292 Tcl_Obj *CONST objv[]
294 #ifdef SQLITE_ENABLE_FTS3
295 extern int test_fts3_node_chunksize;
296 extern int test_fts3_node_chunk_threshold;
297 Tcl_Obj *pRet;
299 if( objc!=1 && objc!=3 ){
300 Tcl_WrongNumArgs(interp, 1, objv, "?CHUNKSIZE THRESHOLD?");
301 return TCL_ERROR;
304 pRet = Tcl_NewObj();
305 Tcl_IncrRefCount(pRet);
306 Tcl_ListObjAppendElement(
307 interp, pRet, Tcl_NewIntObj(test_fts3_node_chunksize));
308 Tcl_ListObjAppendElement(
309 interp, pRet, Tcl_NewIntObj(test_fts3_node_chunk_threshold));
311 if( objc==3 ){
312 int iArg1;
313 int iArg2;
314 if( Tcl_GetIntFromObj(interp, objv[1], &iArg1)
315 || Tcl_GetIntFromObj(interp, objv[2], &iArg2)
317 Tcl_DecrRefCount(pRet);
318 return TCL_ERROR;
320 test_fts3_node_chunksize = iArg1;
321 test_fts3_node_chunk_threshold = iArg2;
324 Tcl_SetObjResult(interp, pRet);
325 Tcl_DecrRefCount(pRet);
326 #endif
327 UNUSED_PARAMETER(clientData);
328 return TCL_OK;
331 #ifdef SQLITE_ENABLE_FTS3
332 /**************************************************************************
333 ** Beginning of test tokenizer code.
335 ** For language 0, this tokenizer is similar to the default 'simple'
336 ** tokenizer. For other languages L, the following:
338 ** * Odd numbered languages are case-sensitive. Even numbered
339 ** languages are not.
341 ** * Language ids 100 or greater are considered an error.
343 ** The implementation assumes that the input contains only ASCII characters
344 ** (i.e. those that may be encoded in UTF-8 using a single byte).
346 typedef struct test_tokenizer {
347 sqlite3_tokenizer base;
348 } test_tokenizer;
350 typedef struct test_tokenizer_cursor {
351 sqlite3_tokenizer_cursor base;
352 const char *aInput; /* Input being tokenized */
353 int nInput; /* Size of the input in bytes */
354 int iInput; /* Current offset in aInput */
355 int iToken; /* Index of next token to be returned */
356 char *aBuffer; /* Buffer containing current token */
357 int nBuffer; /* Number of bytes allocated at pToken */
358 int iLangid; /* Configured language id */
359 } test_tokenizer_cursor;
361 static int testTokenizerCreate(
362 int argc, const char * const *argv,
363 sqlite3_tokenizer **ppTokenizer
365 test_tokenizer *pNew;
366 UNUSED_PARAMETER(argc);
367 UNUSED_PARAMETER(argv);
369 pNew = sqlite3_malloc(sizeof(test_tokenizer));
370 if( !pNew ) return SQLITE_NOMEM;
371 memset(pNew, 0, sizeof(test_tokenizer));
373 *ppTokenizer = (sqlite3_tokenizer *)pNew;
374 return SQLITE_OK;
377 static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){
378 test_tokenizer *p = (test_tokenizer *)pTokenizer;
379 sqlite3_free(p);
380 return SQLITE_OK;
383 static int testTokenizerOpen(
384 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
385 const char *pInput, int nBytes, /* String to be tokenized */
386 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
388 int rc = SQLITE_OK; /* Return code */
389 test_tokenizer_cursor *pCsr; /* New cursor object */
391 UNUSED_PARAMETER(pTokenizer);
393 pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor));
394 if( pCsr==0 ){
395 rc = SQLITE_NOMEM;
396 }else{
397 memset(pCsr, 0, sizeof(test_tokenizer_cursor));
398 pCsr->aInput = pInput;
399 if( nBytes<0 ){
400 pCsr->nInput = (int)strlen(pInput);
401 }else{
402 pCsr->nInput = nBytes;
406 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
407 return rc;
410 static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){
411 test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
412 sqlite3_free(pCsr->aBuffer);
413 sqlite3_free(pCsr);
414 return SQLITE_OK;
/*
** Return non-zero if c is an ASCII letter ('a'..'z' or 'A'..'Z'), or zero
** otherwise. Only letters are considered to be part of a token.
*/
static int testIsTokenChar(char c){
  return (c>='a' && c<='z') || (c>='A' && c<='Z');
}
/*
** If c is an ASCII upper-case letter, return the corresponding lower-case
** letter. Otherwise return c unchanged.
*/
static int testTolower(char c){
  char ret = c;
  if( ret>='A' && ret<='Z' ) ret = (char)(ret + ('a'-'A'));
  return ret;
}
426 static int testTokenizerNext(
427 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by testTokenizerOpen */
428 const char **ppToken, /* OUT: *ppToken is the token text */
429 int *pnBytes, /* OUT: Number of bytes in token */
430 int *piStartOffset, /* OUT: Starting offset of token */
431 int *piEndOffset, /* OUT: Ending offset of token */
432 int *piPosition /* OUT: Position integer of token */
434 test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
435 int rc = SQLITE_OK;
436 const char *p;
437 const char *pEnd;
439 p = &pCsr->aInput[pCsr->iInput];
440 pEnd = &pCsr->aInput[pCsr->nInput];
442 /* Skip past any white-space */
443 assert( p<=pEnd );
444 while( p<pEnd && testIsTokenChar(*p)==0 ) p++;
446 if( p==pEnd ){
447 rc = SQLITE_DONE;
448 }else{
449 /* Advance to the end of the token */
450 const char *pToken = p;
451 sqlite3_int64 nToken;
452 while( p<pEnd && testIsTokenChar(*p) ) p++;
453 nToken = (sqlite3_int64)(p-pToken);
455 /* Copy the token into the buffer */
456 if( nToken>pCsr->nBuffer ){
457 sqlite3_free(pCsr->aBuffer);
458 pCsr->aBuffer = sqlite3_malloc64(nToken);
460 if( pCsr->aBuffer==0 ){
461 rc = SQLITE_NOMEM;
462 }else{
463 int i;
465 if( pCsr->iLangid & 0x00000001 ){
466 for(i=0; i<nToken; i++) pCsr->aBuffer[i] = pToken[i];
467 }else{
468 for(i=0; i<nToken; i++) pCsr->aBuffer[i] = (char)testTolower(pToken[i]);
470 pCsr->iToken++;
471 pCsr->iInput = (int)(p - pCsr->aInput);
473 *ppToken = pCsr->aBuffer;
474 *pnBytes = (int)nToken;
475 *piStartOffset = (int)(pToken - pCsr->aInput);
476 *piEndOffset = (int)(p - pCsr->aInput);
477 *piPosition = pCsr->iToken;
481 return rc;
484 static int testTokenizerLanguage(
485 sqlite3_tokenizer_cursor *pCursor,
486 int iLangid
488 int rc = SQLITE_OK;
489 test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
490 pCsr->iLangid = iLangid;
491 if( pCsr->iLangid>=100 ){
492 rc = SQLITE_ERROR;
494 return rc;
496 #endif
498 static int SQLITE_TCLAPI fts3_test_tokenizer_cmd(
499 ClientData clientData,
500 Tcl_Interp *interp,
501 int objc,
502 Tcl_Obj *CONST objv[]
504 #ifdef SQLITE_ENABLE_FTS3
505 static const sqlite3_tokenizer_module testTokenizerModule = {
507 testTokenizerCreate,
508 testTokenizerDestroy,
509 testTokenizerOpen,
510 testTokenizerClose,
511 testTokenizerNext,
512 testTokenizerLanguage
514 const sqlite3_tokenizer_module *pPtr = &testTokenizerModule;
515 if( objc!=1 ){
516 Tcl_WrongNumArgs(interp, 1, objv, "");
517 return TCL_ERROR;
519 Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(
520 (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *)
522 #endif
523 UNUSED_PARAMETER(clientData);
524 return TCL_OK;
527 static int SQLITE_TCLAPI fts3_test_varint_cmd(
528 ClientData clientData,
529 Tcl_Interp *interp,
530 int objc,
531 Tcl_Obj *CONST objv[]
533 #ifdef SQLITE_ENABLE_FTS3
534 char aBuf[24];
535 int rc;
536 Tcl_WideInt w;
537 sqlite3_int64 w2;
538 int nByte, nByte2;
540 if( objc!=2 ){
541 Tcl_WrongNumArgs(interp, 1, objv, "INTEGER");
542 return TCL_ERROR;
545 rc = Tcl_GetWideIntFromObj(interp, objv[1], &w);
546 if( rc!=TCL_OK ) return rc;
548 nByte = sqlite3Fts3PutVarint(aBuf, w);
549 nByte2 = sqlite3Fts3GetVarint(aBuf, &w2);
550 if( w!=w2 || nByte!=nByte2 ){
551 char *zErr = sqlite3_mprintf("error testing %lld", w);
552 Tcl_ResetResult(interp);
553 Tcl_AppendResult(interp, zErr, 0);
554 return TCL_ERROR;
557 if( w<=2147483647 && w>=0 ){
558 int i;
559 nByte2 = fts3GetVarint32(aBuf, &i);
560 if( (int)w!=i || nByte!=nByte2 ){
561 char *zErr = sqlite3_mprintf("error testing %lld (32-bit)", w);
562 Tcl_ResetResult(interp);
563 Tcl_AppendResult(interp, zErr, 0);
564 return TCL_ERROR;
568 #endif
569 UNUSED_PARAMETER(clientData);
570 return TCL_OK;
/*
** End of tokenizer code.
**************************************************************************/
578 ** sqlite3_fts3_may_be_corrupt BOOLEAN
580 ** Set or clear the global "may-be-corrupt" flag. Return the old value.
582 static int SQLITE_TCLAPI fts3_may_be_corrupt(
583 void * clientData,
584 Tcl_Interp *interp,
585 int objc,
586 Tcl_Obj *CONST objv[]
588 #ifdef SQLITE_DEBUG
589 int bOld = sqlite3_fts3_may_be_corrupt;
591 if( objc!=2 && objc!=1 ){
592 Tcl_WrongNumArgs(interp, 1, objv, "?BOOLEAN?");
593 return TCL_ERROR;
595 if( objc==2 ){
596 int bNew;
597 if( Tcl_GetBooleanFromObj(interp, objv[1], &bNew) ) return TCL_ERROR;
598 sqlite3_fts3_may_be_corrupt = bNew;
601 Tcl_SetObjResult(interp, Tcl_NewIntObj(bOld));
602 #endif
603 return TCL_OK;
606 int Sqlitetestfts3_Init(Tcl_Interp *interp){
607 Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0);
608 Tcl_CreateObjCommand(interp,
609 "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0
611 Tcl_CreateObjCommand(
612 interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0
614 Tcl_CreateObjCommand(
615 interp, "fts3_test_varint", fts3_test_varint_cmd, 0, 0
617 Tcl_CreateObjCommand(
618 interp, "sqlite3_fts3_may_be_corrupt", fts3_may_be_corrupt, 0, 0
620 return TCL_OK;
622 #endif /* SQLITE_ENABLE_FTS3 || SQLITE_ENABLE_FTS4 */
623 #endif /* ifdef SQLITE_TEST */