4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
13 ** This file is not part of the production FTS code. It is only used for
14 ** testing. It contains a Tcl command that can be used to test if a document
15 ** matches an FTS NEAR expression.
17 ** As of March 2012, it also contains a version 1 tokenizer used for testing
18 ** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly.
25 #if defined(SQLITE_TEST)
26 #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)
28 /* Required so that the "ifdef SQLITE_ENABLE_FTS3" below works */
/* Upper bound on the number of tokens in one phrase. It sizes the aToken[]
** array embedded in each phrase, and fts3_near_match reports
** "Too many tokens in phrase" for anything longer. */
31 #define NM_MAX_TOKEN 12
/* Forward typedefs for the three structures used by the NEAR matcher. */
33 typedef struct NearPhrase NearPhrase
;
34 typedef struct NearDocument NearDocument
;
35 typedef struct NearToken NearToken
;
/* Fields of the tokenized document that NEAR expressions are matched
** against (fts3_near_match fills them in from its DOCUMENT list). */
38 int nToken
; /* Number of tokens in aToken[] */
39 NearToken
*aToken
; /* Token array */
/* Fields of a single token, held as a (length, pointer) pair. */
43 int n
; /* Length of token in bytes */
44 const char *z
; /* Pointer to token string; compared with memcmp() over n bytes */
/* Fields of one phrase of a NEAR expression. fts3_near_match zeroes the
** whole phrase array before filling it, and only sets nNear for phrases
** 1..nPhrase-1, so nNear of the first phrase stays 0. */
48 int nNear
; /* Preceding NEAR value */
49 int nToken
; /* Number of tokens in this phrase */
50 NearToken aToken
[NM_MAX_TOKEN
]; /* Array of tokens in this phrase */
/*
** Test whether phrase p matches the run of document tokens that begins at
** aToken[0]. Phrase token ii is compared against aToken[ii]:
**
**   * A phrase token whose last byte is '*' is a prefix query. The document
**     token must be at least as long as the text before the '*' and must
**     begin with that text.
**   * Any other phrase token must equal the document token in both length
**     and bytes.
**
** The first mismatch returns 0. Callers treat a non-zero return as a match.
** The loop reads aToken[0] .. aToken[p->nToken-1], so the caller must make
** sure that many document tokens exist at the given position.
*/
53 static int nm_phrase_match(
59 for(ii
=0; ii
<p
->nToken
; ii
++){
60 NearToken
*pToken
= &p
->aToken
[ii
];
/* Prefix token: compare everything except the trailing '*'. */
61 if( pToken
->n
>0 && pToken
->z
[pToken
->n
-1]=='*' ){
62 if( aToken
[ii
].n
<(pToken
->n
-1) ) return 0;
63 if( memcmp(aToken
[ii
].z
, pToken
->z
, pToken
->n
-1) ) return 0;
/* Plain token: length and bytes must both match exactly. */
65 if( aToken
[ii
].n
!=pToken
->n
) return 0;
66 if( memcmp(aToken
[ii
].z
, pToken
->z
, pToken
->n
) ) return 0;
/*
** Given that phrase iPhrase matched the document at token position iPos,
** check whether the remaining phrases in direction iDir (+1 walks towards
** the end of aPhrase[], -1 towards its start) can each be found within
** NEAR distance of the phrase before it in the walk.
**
** Returns 1 immediately when there is no further phrase in that direction.
** Otherwise the neighbouring phrase aPhrase[iPhrase+iDir] is looked for at
** every start position in
**
**     [iPos - nNear - p->nToken, iPos + nNear + pPrev->nToken]
**
** clipped to [0, pDoc->nToken - p->nToken] so that the phrase fits inside
** the document. Each hit recurses from the new position to test the rest of
** the chain, and 1 is returned as soon as one recursion succeeds.
**
** NOTE(review): the test that selects between the two "nNear =" assignments
** is not visible here. Presumably it is on iDir, in which case nNear is
** always read from the higher-indexed phrase of the pair. Confirm against
** the full source, along with the failure return value.
*/
73 static int nm_near_chain(
74 int iDir
, /* Direction to iterate through aPhrase[] */
75 NearDocument
*pDoc
, /* Document to match against */
76 int iPos
, /* Position at which iPhrase was found */
77 int nPhrase
, /* Size of phrase array */
78 NearPhrase
*aPhrase
, /* Phrase array */
79 int iPhrase
/* Index of phrase found */
89 assert( iDir
==1 || iDir
==-1 );
/* iPhrase is the last phrase: nothing further to check going forwards. */
92 if( (iPhrase
+1)==nPhrase
) return 1;
93 nNear
= aPhrase
[iPhrase
+1].nNear
;
/* iPhrase is the first phrase: nothing further to check going backwards. */
95 if( iPhrase
==0 ) return 1;
96 nNear
= aPhrase
[iPhrase
].nNear
;
/* pPrev is the phrase already located; p is the neighbour to look for. */
98 pPrev
= &aPhrase
[iPhrase
];
99 iPhrase2
= iPhrase
+iDir
;
100 p
= &aPhrase
[iPhrase2
];
/* Candidate start positions for p, before clipping to the document. */
102 iStart
= iPos
- nNear
- p
->nToken
;
103 iStop
= iPos
+ nNear
+ pPrev
->nToken
;
/* Clip so that nm_phrase_match() never reads outside pDoc->aToken[]. */
105 if( iStart
<0 ) iStart
= 0;
106 if( iStop
> pDoc
->nToken
- p
->nToken
) iStop
= pDoc
->nToken
- p
->nToken
;
108 for(ii
=iStart
; ii
<=iStop
; ii
++){
109 if( nm_phrase_match(p
, &pDoc
->aToken
[ii
]) ){
110 if( nm_near_chain(iDir
, pDoc
, ii
, nPhrase
, aPhrase
, iPhrase2
) ) return 1;
/*
** Examine every document position at which phrase iPhrase could start
** (0 through pDoc->nToken - p->nToken). A position is a real match only if
** the phrase matches there and both the forward and the reverse NEAR chains
** starting from it succeed. Positions failing any of the three tests are
** skipped with "continue".
**
** NOTE(review): the counter and the return statement are not visible here.
** The caller stores the result as the occurrence count of the phrase.
*/
117 static int nm_match_count(
118 NearDocument
*pDoc
, /* Document to match against */
119 int nPhrase
, /* Size of phrase array */
120 NearPhrase
*aPhrase
, /* Phrase array */
121 int iPhrase
/* Index of phrase to count matches for */
125 NearPhrase
*p
= &aPhrase
[iPhrase
];
/* The upper bound keeps the whole phrase inside the document. */
127 for(ii
=0; ii
<(pDoc
->nToken
+ 1 - p
->nToken
); ii
++){
128 if( nm_phrase_match(p
, &pDoc
->aToken
[ii
]) ){
129 /* Test forward NEAR chain (i>iPhrase) */
130 if( 0==nm_near_chain(1, pDoc
, ii
, nPhrase
, aPhrase
, iPhrase
) ) continue;
132 /* Test reverse NEAR chain (i<iPhrase) */
133 if( 0==nm_near_chain(-1, pDoc
, ii
, nPhrase
, aPhrase
, iPhrase
) ) continue;
135 /* This is a real match. Increment the counter. */
144 ** Tclcmd: fts3_near_match DOCUMENT EXPR ?OPTION VALUE?...
/*
** Implementation of the fts3_near_match Tcl command.
**
**   objv[1]  DOCUMENT: a Tcl list, one element per document token.
**   objv[2]  EXPR: a Tcl list that alternates phrases and NEAR distances.
**            Even-indexed elements are phrases (each itself a list of at
**            most NM_MAX_TOKEN tokens); odd-indexed elements are the integer
**            NEAR value between the phrases on either side.
**   objv[3+] OPTION VALUE pairs. The only option is "-phrasecountvar", whose
**            value is used as the name of a variable that receives the list
**            of per-phrase match counts.
**
** The interpreter result is set to a boolean, true when nTotal>0.
**
** Token strings are not copied. doc.aToken[] and aPhrase[] point into the
** string representations owned by the Tcl objects. Both arrays are
** allocated with ckalloc() and released with ckfree() before returning.
**
** NOTE(review): the declarations, the accumulation of nTotal, the guard
** around Tcl_ObjSetVar2() and the near_match_out label are not visible
** here. Confirm them against the full source.
*/
146 static int fts3_near_match_cmd(
147 ClientData clientData
,
150 Tcl_Obj
*CONST objv
[]
156 NearPhrase
*aPhrase
= 0;
157 NearDocument doc
= {0, 0};
158 Tcl_Obj
**apDocToken
;
160 Tcl_Obj
*pPhrasecount
= 0;
162 Tcl_Obj
**apExprToken
;
165 UNUSED_PARAMETER(clientData
);
167 /* The command word, DOCUMENT and EXPR are required, followed by zero or more OPTION VALUE pairs: objc must be odd and at least 3. */
168 if( objc
<3 || (objc
%2)==0 ){
169 Tcl_WrongNumArgs(interp
, 1, objv
, "DOCUMENT EXPR ?OPTION VALUE?...");
/* Process the OPTION VALUE pairs that follow DOCUMENT and EXPR. */
174 for(ii
=3; ii
<objc
; ii
+=2){
175 enum NM_enum
{ NM_PHRASECOUNTS
};
176 struct TestnmSubcmd
{
180 { "-phrasecountvar", NM_PHRASECOUNTS
},
184 if( Tcl_GetIndexFromObjStruct(
185 interp
, objv
[ii
], aOpt
, sizeof(aOpt
[0]), "option", 0, &iOpt
)
190 switch( aOpt
[iOpt
].eOpt
){
191 case NM_PHRASECOUNTS
:
192 pPhrasecount
= objv
[ii
+1];
/* Split DOCUMENT into tokens; doc.nToken receives the list length. */
197 rc
= Tcl_ListObjGetElements(interp
, objv
[1], &doc
.nToken
, &apDocToken
);
198 if( rc
!=TCL_OK
) goto near_match_out
;
199 doc
.aToken
= (NearToken
*)ckalloc(doc
.nToken
*sizeof(NearToken
));
200 for(ii
=0; ii
<doc
.nToken
; ii
++){
201 doc
.aToken
[ii
].z
= Tcl_GetStringFromObj(apDocToken
[ii
], &doc
.aToken
[ii
].n
);
/* Split EXPR. With phrases at even indexes and NEAR values between them,
** a list of nExprToken elements holds (nExprToken+1)/2 phrases. */
204 rc
= Tcl_ListObjGetElements(interp
, objv
[2], &nExprToken
, &apExprToken
);
205 if( rc
!=TCL_OK
) goto near_match_out
;
207 nPhrase
= (nExprToken
+ 1) / 2;
208 aPhrase
= (NearPhrase
*)ckalloc(nPhrase
* sizeof(NearPhrase
));
209 memset(aPhrase
, 0, nPhrase
* sizeof(NearPhrase
));
/* First pass: load the tokens of each phrase. */
210 for(ii
=0; ii
<nPhrase
; ii
++){
211 Tcl_Obj
*pPhrase
= apExprToken
[ii
*2];
216 rc
= Tcl_ListObjGetElements(interp
, pPhrase
, &nToken
, &apToken
);
217 if( rc
!=TCL_OK
) goto near_match_out
;
218 if( nToken
>NM_MAX_TOKEN
){
219 Tcl_AppendResult(interp
, "Too many tokens in phrase", 0);
223 for(jj
=0; jj
<nToken
; jj
++){
224 NearToken
*pT
= &aPhrase
[ii
].aToken
[jj
];
225 pT
->z
= Tcl_GetStringFromObj(apToken
[jj
], &pT
->n
);
227 aPhrase
[ii
].nToken
= nToken
;
/* Second pass: the NEAR value before phrase ii is EXPR element 2*ii-1. */
229 for(ii
=1; ii
<nPhrase
; ii
++){
230 Tcl_Obj
*pNear
= apExprToken
[2*ii
-1];
232 rc
= Tcl_GetIntFromObj(interp
, pNear
, &nNear
);
233 if( rc
!=TCL_OK
) goto near_match_out
;
234 aPhrase
[ii
].nNear
= nNear
;
/* Build the list of per-phrase occurrence counts in pRet. */
238 Tcl_IncrRefCount(pRet
);
239 for(ii
=0; ii
<nPhrase
; ii
++){
240 int nOcc
= nm_match_count(&doc
, nPhrase
, aPhrase
, ii
);
241 Tcl_ListObjAppendElement(interp
, pRet
, Tcl_NewIntObj(nOcc
));
/* Publish the counts through the -phrasecountvar variable. */
245 Tcl_ObjSetVar2(interp
, pPhrasecount
, 0, pRet
, 0);
247 Tcl_DecrRefCount(pRet
);
248 Tcl_SetObjResult(interp
, Tcl_NewBooleanObj(nTotal
>0));
251 ckfree((char *)aPhrase
);
252 ckfree((char *)doc
.aToken
);
257 ** Tclcmd: fts3_configure_incr_load ?CHUNKSIZE THRESHOLD?
259 ** Normally, FTS uses hard-coded values to determine the minimum doclist
260 ** size eligible for incremental loading, and the size of the chunks loaded
261 ** when a doclist is incrementally loaded. This command allows the built-in
262 ** values to be overridden for testing purposes.
264 ** If present, the first argument is the chunksize in bytes to load doclists
265 ** in. The second argument is the minimum doclist size in bytes to use
266 ** incremental loading with.
268 ** Whether or not the arguments are present, this command returns a list of
269 ** two integers - the initial chunksize and threshold when the command is
270 ** invoked. This can be used to restore the default behavior after running
271 ** tests. For example:
273 ** # Override incr-load settings for testing:
274 ** set cfg [fts3_configure_incr_load $new_chunksize $new_threshold]
276 ** .... run tests ....
278 ** # Restore initial incr-load settings:
279 ** eval fts3_configure_incr_load $cfg
/*
** Implementation of the fts3_configure_incr_load Tcl command. It accepts
** either no arguments or exactly two (CHUNKSIZE THRESHOLD).
**
** The result list is built from test_fts3_node_chunksize and
** test_fts3_node_chunk_threshold before the arguments are parsed, so it
** always reports the values in force when the command was invoked. If
** both integer arguments parse, they are then stored into those two
** globals. If either fails to parse, pRet is released.
**
** NOTE(review): the declarations, the return statements and the #endif
** matching the #ifdef below are not visible here.
*/
281 static int fts3_configure_incr_load_cmd(
282 ClientData clientData
,
285 Tcl_Obj
*CONST objv
[]
287 #ifdef SQLITE_ENABLE_FTS3
288 extern int test_fts3_node_chunksize
;
289 extern int test_fts3_node_chunk_threshold
;
292 if( objc
!=1 && objc
!=3 ){
293 Tcl_WrongNumArgs(interp
, 1, objv
, "?CHUNKSIZE THRESHOLD?");
/* Snapshot the current settings; this list becomes the command result. */
298 Tcl_IncrRefCount(pRet
);
299 Tcl_ListObjAppendElement(
300 interp
, pRet
, Tcl_NewIntObj(test_fts3_node_chunksize
));
301 Tcl_ListObjAppendElement(
302 interp
, pRet
, Tcl_NewIntObj(test_fts3_node_chunk_threshold
));
/* Parse both new values before changing either global. */
307 if( Tcl_GetIntFromObj(interp
, objv
[1], &iArg1
)
308 || Tcl_GetIntFromObj(interp
, objv
[2], &iArg2
)
310 Tcl_DecrRefCount(pRet
);
313 test_fts3_node_chunksize
= iArg1
;
314 test_fts3_node_chunk_threshold
= iArg2
;
317 Tcl_SetObjResult(interp
, pRet
);
318 Tcl_DecrRefCount(pRet
);
320 UNUSED_PARAMETER(clientData
);
324 #ifdef SQLITE_ENABLE_FTS3
325 /**************************************************************************
326 ** Beginning of test tokenizer code.
328 ** For language 0, this tokenizer is similar to the default 'simple'
329 ** tokenizer. For other languages L, the following:
331 ** * Odd numbered languages are case-sensitive. Even numbered
332 ** languages are not.
334 ** * Language ids 100 or greater are considered an error.
336 ** The implementation assumes that the input contains only ASCII characters
337 ** (i.e. those that may be encoded in UTF-8 using a single byte).
/* Tokenizer object for the test tokenizer. testTokenizerCreate() and
** testTokenizerDestroy() cast between this type and sqlite3_tokenizer. */
339 typedef struct test_tokenizer
{
340 sqlite3_tokenizer base
;
/* Cursor for one tokenization pass over an input string. It is allocated
** zeroed by testTokenizerOpen(), so aBuffer starts out NULL and the
** counters start at 0. */
343 typedef struct test_tokenizer_cursor
{
344 sqlite3_tokenizer_cursor base
;
345 const char *aInput
; /* Input being tokenized */
346 int nInput
; /* Size of the input in bytes */
347 int iInput
; /* Current offset in aInput */
348 int iToken
; /* Index of next token to be returned */
349 char *aBuffer
; /* Buffer containing current token */
350 int nBuffer
; /* Number of bytes allocated at aBuffer */
351 int iLangid
; /* Configured language id */
352 } test_tokenizer_cursor
;
/*
** Create a test tokenizer. The argc/argv arguments are ignored. A zeroed
** test_tokenizer is allocated with sqlite3_malloc() and handed back through
** *ppTokenizer. Returns SQLITE_NOMEM if the allocation fails.
** NOTE(review): the success return is not visible here.
*/
354 static int testTokenizerCreate(
355 int argc
, const char * const *argv
,
356 sqlite3_tokenizer
**ppTokenizer
358 test_tokenizer
*pNew
;
359 UNUSED_PARAMETER(argc
);
360 UNUSED_PARAMETER(argv
);
362 pNew
= sqlite3_malloc(sizeof(test_tokenizer
));
363 if( !pNew
) return SQLITE_NOMEM
;
364 memset(pNew
, 0, sizeof(test_tokenizer
));
366 *ppTokenizer
= (sqlite3_tokenizer
*)pNew
;
/*
** Destroy a tokenizer created by testTokenizerCreate(). The handle is cast
** back to the test_tokenizer it was allocated as.
** NOTE(review): the statements that release p and return are not visible
** here.
*/
370 static int testTokenizerDestroy(sqlite3_tokenizer
*pTokenizer
){
371 test_tokenizer
*p
= (test_tokenizer
*)pTokenizer
;
/*
** Open a cursor over the nBytes-byte string pInput. The cursor is allocated
** with sqlite3_malloc(), zeroed, and returned through *ppCursor. Only the
** pInput pointer is stored, not a copy of the text, so the input must
** outlive the cursor.
**
** nInput is set either to strlen(pInput) or to nBytes.
** NOTE(review): the condition choosing between the two, and the handling
** of a failed allocation, are not visible here. Presumably strlen() is
** used when nBytes is negative; confirm against the full source.
*/
376 static int testTokenizerOpen(
377 sqlite3_tokenizer
*pTokenizer
, /* The tokenizer */
378 const char *pInput
, int nBytes
, /* String to be tokenized */
379 sqlite3_tokenizer_cursor
**ppCursor
/* OUT: Tokenization cursor */
381 int rc
= SQLITE_OK
; /* Return code */
382 test_tokenizer_cursor
*pCsr
; /* New cursor object */
384 UNUSED_PARAMETER(pTokenizer
);
386 pCsr
= (test_tokenizer_cursor
*)sqlite3_malloc(sizeof(test_tokenizer_cursor
));
390 memset(pCsr
, 0, sizeof(test_tokenizer_cursor
));
391 pCsr
->aInput
= pInput
;
393 pCsr
->nInput
= (int)strlen(pInput
);
395 pCsr
->nInput
= nBytes
;
399 *ppCursor
= (sqlite3_tokenizer_cursor
*)pCsr
;
/*
** Close a cursor opened by testTokenizerOpen(), releasing its token buffer.
** NOTE(review): the statements that free the cursor itself and return are
** not visible here.
*/
403 static int testTokenizerClose(sqlite3_tokenizer_cursor
*pCursor
){
404 test_tokenizer_cursor
*pCsr
= (test_tokenizer_cursor
*)pCursor
;
405 sqlite3_free(pCsr
->aBuffer
);
/* Return non-zero if c is an ASCII letter ('a'..'z' or 'A'..'Z'), which is
** the only kind of character that may appear in a token. Digits,
** punctuation and white-space all act as separators. */
410 static int testIsTokenChar(char c
){
411 return (c
>='a' && c
<='z') || (c
>='A' && c
<='Z');
/* ASCII-only lower-casing: a value in 'A'..'Z' is shifted to the matching
** lower-case letter by subtracting ('A'-'a').
** NOTE(review): the declaration of ret and the return statement are not
** visible here. */
413 static int testTolower(char c
){
415 if( ret
>='A' && ret
<='Z') ret
= ret
- ('A'-'a');
/*
** Extract the next token from the cursor's input.
**
** Scanning resumes at offset iInput. Characters rejected by
** testIsTokenChar() are skipped, then the following run of accepted
** characters forms the token. The token text is copied into pCsr->aBuffer:
** verbatim when the language id is odd, lower-cased with testTolower()
** when it is even. iInput is then advanced to the end of the token.
**
** aBuffer is replaced by a fresh sqlite3_malloc(nToken) allocation whenever
** the token is longer than nBuffer. The visible code writes exactly nToken
** bytes and stores no terminator.
**
** Outputs: *ppToken points at aBuffer (valid until the next call or until
** the cursor is closed); *piStartOffset and *piEndOffset are byte offsets
** into the input; *piPosition is pCsr->iToken.
**
** NOTE(review): the end-of-input and out-of-memory returns, the update of
** nBuffer, and the assignment to *pnBytes are not visible here.
*/
419 static int testTokenizerNext(
420 sqlite3_tokenizer_cursor
*pCursor
, /* Cursor returned by testTokenizerOpen */
421 const char **ppToken
, /* OUT: *ppToken is the token text */
422 int *pnBytes
, /* OUT: Number of bytes in token */
423 int *piStartOffset
, /* OUT: Starting offset of token */
424 int *piEndOffset
, /* OUT: Ending offset of token */
425 int *piPosition
/* OUT: Position integer of token */
427 test_tokenizer_cursor
*pCsr
= (test_tokenizer_cursor
*)pCursor
;
432 p
= &pCsr
->aInput
[pCsr
->iInput
];
433 pEnd
= &pCsr
->aInput
[pCsr
->nInput
];
435 /* Skip past any separator characters (anything that is not an ASCII letter) */
437 while( p
<pEnd
&& testIsTokenChar(*p
)==0 ) p
++;
442 /* Advance to the end of the token */
443 const char *pToken
= p
;
445 while( p
<pEnd
&& testIsTokenChar(*p
) ) p
++;
446 nToken
= (int)(p
-pToken
);
448 /* Copy the token into the buffer */
449 if( nToken
>pCsr
->nBuffer
){
450 sqlite3_free(pCsr
->aBuffer
);
451 pCsr
->aBuffer
= sqlite3_malloc(nToken
);
453 if( pCsr
->aBuffer
==0 ){
/* Odd language ids are case-sensitive, so the text is copied unchanged. */
458 if( pCsr
->iLangid
& 0x00000001 ){
459 for(i
=0; i
<nToken
; i
++) pCsr
->aBuffer
[i
] = pToken
[i
];
/* Even language ids fold the token to lower case. */
461 for(i
=0; i
<nToken
; i
++) pCsr
->aBuffer
[i
] = testTolower(pToken
[i
]);
464 pCsr
->iInput
= (int)(p
- pCsr
->aInput
);
466 *ppToken
= pCsr
->aBuffer
;
468 *piStartOffset
= (int)(pToken
- pCsr
->aInput
);
469 *piEndOffset
= (int)(p
- pCsr
->aInput
);
470 *piPosition
= pCsr
->iToken
;
/*
** Record iLangid in the cursor. testTokenizerNext() later uses the low bit
** of the stored value to decide whether tokens are case-folded. Values of
** 100 or more take a separate branch.
** NOTE(review): the body of that branch and the return statements are not
** visible here. The description of this tokenizer says such ids are
** considered an error; confirm against the full source.
*/
477 static int testTokenizerLanguage(
478 sqlite3_tokenizer_cursor
*pCursor
,
482 test_tokenizer_cursor
*pCsr
= (test_tokenizer_cursor
*)pCursor
;
483 pCsr
->iLangid
= iLangid
;
484 if( pCsr
->iLangid
>=100 ){
/*
** Implementation of the fts3_test_tokenizer Tcl command, which takes no
** arguments (the usage string is empty).
**
** The result is a Tcl byte array holding the raw bytes of a pointer to the
** static testTokenizerModule, sizeof(sqlite3_tokenizer_module*) bytes long.
** Note that &pPtr is passed, so it is the pointer value that is copied,
** not the module structure.
**
** NOTE(review): most of the module initializer, the argument-count test
** and the return statements are not visible here.
*/
491 static int fts3_test_tokenizer_cmd(
492 ClientData clientData
,
495 Tcl_Obj
*CONST objv
[]
497 #ifdef SQLITE_ENABLE_FTS3
498 static const sqlite3_tokenizer_module testTokenizerModule
= {
501 testTokenizerDestroy
,
505 testTokenizerLanguage
507 const sqlite3_tokenizer_module
*pPtr
= &testTokenizerModule
;
509 Tcl_WrongNumArgs(interp
, 1, objv
, "");
512 Tcl_SetObjResult(interp
, Tcl_NewByteArrayObj(
513 (const unsigned char *)&pPtr
, sizeof(sqlite3_tokenizer_module
*)
516 UNUSED_PARAMETER(clientData
);
/*
** Implementation of the fts3_test_varint Tcl command, which takes a single
** INTEGER argument and round-trips it through the FTS3 varint routines.
**
**   1. The value w is encoded into aBuf with sqlite3Fts3PutVarint() and
**      decoded again with sqlite3Fts3GetVarint(). The decoded value and
**      the byte counts reported by the two calls must agree, otherwise the
**      result is set to "error testing <w>".
**   2. If w is in the range 0..2147483647, aBuf is also decoded with
**      fts3GetVarint32(), and again the value and byte count must agree,
**      otherwise the result is set to "error testing <w> (32-bit)".
**
** NOTE(review): the declarations, the release of zErr and the return
** statements are not visible here.
*/
520 static int fts3_test_varint_cmd(
521 ClientData clientData
,
524 Tcl_Obj
*CONST objv
[]
526 #ifdef SQLITE_ENABLE_FTS3
533 Tcl_WrongNumArgs(interp
, 1, objv
, "INTEGER");
537 rc
= Tcl_GetWideIntFromObj(interp
, objv
[1], &w
);
538 if( rc
!=TCL_OK
) return rc
;
/* 64-bit round trip: encode, decode, then compare value and length. */
540 nByte
= sqlite3Fts3PutVarint(aBuf
, w
);
541 nByte2
= sqlite3Fts3GetVarint(aBuf
, &w2
);
542 if( w
!=w2
|| nByte
!=nByte2
){
543 char *zErr
= sqlite3_mprintf("error testing %lld", w
);
544 Tcl_ResetResult(interp
);
545 Tcl_AppendResult(interp
, zErr
, 0);
/* The 32-bit decoder is only exercised for non-negative values that fit
** in a signed 32-bit integer. */
549 if( w
<=2147483647 && w
>=0 ){
551 nByte2
= fts3GetVarint32(aBuf
, &i
);
552 if( (int)w
!=i
|| nByte
!=nByte2
){
553 char *zErr
= sqlite3_mprintf("error testing %lld (32-bit)", w
);
554 Tcl_ResetResult(interp
);
555 Tcl_AppendResult(interp
, zErr
, 0);
561 UNUSED_PARAMETER(clientData
);
566 ** End of tokenizer code.
567 **************************************************************************/
/*
** Register the four FTS3 test commands with the Tcl interpreter:
** fts3_near_match, fts3_configure_incr_load, fts3_test_tokenizer and
** fts3_test_varint. Each is created with a zero clientData and no delete
** callback (the trailing "0, 0" arguments).
** NOTE(review): the return statement is not visible here.
*/
569 int Sqlitetestfts3_Init(Tcl_Interp
*interp
){
570 Tcl_CreateObjCommand(interp
, "fts3_near_match", fts3_near_match_cmd
, 0, 0);
571 Tcl_CreateObjCommand(interp
,
572 "fts3_configure_incr_load", fts3_configure_incr_load_cmd
, 0, 0
574 Tcl_CreateObjCommand(
575 interp
, "fts3_test_tokenizer", fts3_test_tokenizer_cmd
, 0, 0
578 Tcl_CreateObjCommand(
579 interp
, "fts3_test_varint", fts3_test_varint_cmd
, 0, 0
583 #endif /* SQLITE_ENABLE_FTS3 || SQLITE_ENABLE_FTS4 */
584 #endif /* ifdef SQLITE_TEST */