Roll src/third_party/WebKit a3b4a2e:7441784 (svn 202551:202552)
[chromium-blink-merge.git] / third_party / sqlite / src / ext / fts3 / fts3_test.c
blob36dcc94e6dabcca9aa220eb04cf49bf4cad30878
/*
** 2011 Jun 13
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** This file is not part of the production FTS code. It is only used for
** testing. It contains a Tcl command that can be used to test if a document
** matches an FTS NEAR expression.
**
** As of March 2012, it also contains a version 1 tokenizer used for testing
** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly.
*/
#include <tcl.h>
#include <string.h>
#include <assert.h>
#include <stddef.h>

#if defined(SQLITE_TEST)
#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)

/* Required so that the "ifdef SQLITE_ENABLE_FTS3" below works */
#include "fts3Int.h"

/* Maximum number of tokens permitted in one phrase of a NEAR expression. */
#define NM_MAX_TOKEN 12

/* Forward declarations of the structures used by the NEAR matcher. */
typedef struct NearPhrase NearPhrase;
typedef struct NearDocument NearDocument;
typedef struct NearToken NearToken;

37 struct NearDocument {
38 int nToken; /* Length of token in bytes */
39 NearToken *aToken; /* Token array */
/*
** A single token. The text at z is not required to be nul-terminated;
** the matcher only ever inspects the first n bytes.
*/
struct NearToken {
  int n;                          /* Length of token in bytes */
  const char *z;                  /* Pointer to token string */
};

47 struct NearPhrase {
48 int nNear; /* Preceding NEAR value */
49 int nToken; /* Number of tokens in this phrase */
50 NearToken aToken[NM_MAX_TOKEN]; /* Array of tokens in this phrase */
53 static int nm_phrase_match(
54 NearPhrase *p,
55 NearToken *aToken
57 int ii;
59 for(ii=0; ii<p->nToken; ii++){
60 NearToken *pToken = &p->aToken[ii];
61 if( pToken->n>0 && pToken->z[pToken->n-1]=='*' ){
62 if( aToken[ii].n<(pToken->n-1) ) return 0;
63 if( memcmp(aToken[ii].z, pToken->z, pToken->n-1) ) return 0;
64 }else{
65 if( aToken[ii].n!=pToken->n ) return 0;
66 if( memcmp(aToken[ii].z, pToken->z, pToken->n) ) return 0;
70 return 1;
73 static int nm_near_chain(
74 int iDir, /* Direction to iterate through aPhrase[] */
75 NearDocument *pDoc, /* Document to match against */
76 int iPos, /* Position at which iPhrase was found */
77 int nPhrase, /* Size of phrase array */
78 NearPhrase *aPhrase, /* Phrase array */
79 int iPhrase /* Index of phrase found */
81 int iStart;
82 int iStop;
83 int ii;
84 int nNear;
85 int iPhrase2;
86 NearPhrase *p;
87 NearPhrase *pPrev;
89 assert( iDir==1 || iDir==-1 );
91 if( iDir==1 ){
92 if( (iPhrase+1)==nPhrase ) return 1;
93 nNear = aPhrase[iPhrase+1].nNear;
94 }else{
95 if( iPhrase==0 ) return 1;
96 nNear = aPhrase[iPhrase].nNear;
98 pPrev = &aPhrase[iPhrase];
99 iPhrase2 = iPhrase+iDir;
100 p = &aPhrase[iPhrase2];
102 iStart = iPos - nNear - p->nToken;
103 iStop = iPos + nNear + pPrev->nToken;
105 if( iStart<0 ) iStart = 0;
106 if( iStop > pDoc->nToken - p->nToken ) iStop = pDoc->nToken - p->nToken;
108 for(ii=iStart; ii<=iStop; ii++){
109 if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
110 if( nm_near_chain(iDir, pDoc, ii, nPhrase, aPhrase, iPhrase2) ) return 1;
114 return 0;
117 static int nm_match_count(
118 NearDocument *pDoc, /* Document to match against */
119 int nPhrase, /* Size of phrase array */
120 NearPhrase *aPhrase, /* Phrase array */
121 int iPhrase /* Index of phrase to count matches for */
123 int nOcc = 0;
124 int ii;
125 NearPhrase *p = &aPhrase[iPhrase];
127 for(ii=0; ii<(pDoc->nToken + 1 - p->nToken); ii++){
128 if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
129 /* Test forward NEAR chain (i>iPhrase) */
130 if( 0==nm_near_chain(1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;
132 /* Test reverse NEAR chain (i<iPhrase) */
133 if( 0==nm_near_chain(-1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;
135 /* This is a real match. Increment the counter. */
136 nOcc++;
140 return nOcc;
144 ** Tclcmd: fts3_near_match DOCUMENT EXPR ?OPTIONS?
146 static int fts3_near_match_cmd(
147 ClientData clientData,
148 Tcl_Interp *interp,
149 int objc,
150 Tcl_Obj *CONST objv[]
152 int nTotal = 0;
153 int rc;
154 int ii;
155 int nPhrase;
156 NearPhrase *aPhrase = 0;
157 NearDocument doc = {0, 0};
158 Tcl_Obj **apDocToken;
159 Tcl_Obj *pRet;
160 Tcl_Obj *pPhrasecount = 0;
162 Tcl_Obj **apExprToken;
163 int nExprToken;
165 UNUSED_PARAMETER(clientData);
167 /* Must have 3 or more arguments. */
168 if( objc<3 || (objc%2)==0 ){
169 Tcl_WrongNumArgs(interp, 1, objv, "DOCUMENT EXPR ?OPTION VALUE?...");
170 rc = TCL_ERROR;
171 goto near_match_out;
174 for(ii=3; ii<objc; ii+=2){
175 enum NM_enum { NM_PHRASECOUNTS };
176 struct TestnmSubcmd {
177 char *zName;
178 enum NM_enum eOpt;
179 } aOpt[] = {
180 { "-phrasecountvar", NM_PHRASECOUNTS },
181 { 0, 0 }
183 int iOpt;
184 if( Tcl_GetIndexFromObjStruct(
185 interp, objv[ii], aOpt, sizeof(aOpt[0]), "option", 0, &iOpt)
187 return TCL_ERROR;
190 switch( aOpt[iOpt].eOpt ){
191 case NM_PHRASECOUNTS:
192 pPhrasecount = objv[ii+1];
193 break;
197 rc = Tcl_ListObjGetElements(interp, objv[1], &doc.nToken, &apDocToken);
198 if( rc!=TCL_OK ) goto near_match_out;
199 doc.aToken = (NearToken *)ckalloc(doc.nToken*sizeof(NearToken));
200 for(ii=0; ii<doc.nToken; ii++){
201 doc.aToken[ii].z = Tcl_GetStringFromObj(apDocToken[ii], &doc.aToken[ii].n);
204 rc = Tcl_ListObjGetElements(interp, objv[2], &nExprToken, &apExprToken);
205 if( rc!=TCL_OK ) goto near_match_out;
207 nPhrase = (nExprToken + 1) / 2;
208 aPhrase = (NearPhrase *)ckalloc(nPhrase * sizeof(NearPhrase));
209 memset(aPhrase, 0, nPhrase * sizeof(NearPhrase));
210 for(ii=0; ii<nPhrase; ii++){
211 Tcl_Obj *pPhrase = apExprToken[ii*2];
212 Tcl_Obj **apToken;
213 int nToken;
214 int jj;
216 rc = Tcl_ListObjGetElements(interp, pPhrase, &nToken, &apToken);
217 if( rc!=TCL_OK ) goto near_match_out;
218 if( nToken>NM_MAX_TOKEN ){
219 Tcl_AppendResult(interp, "Too many tokens in phrase", 0);
220 rc = TCL_ERROR;
221 goto near_match_out;
223 for(jj=0; jj<nToken; jj++){
224 NearToken *pT = &aPhrase[ii].aToken[jj];
225 pT->z = Tcl_GetStringFromObj(apToken[jj], &pT->n);
227 aPhrase[ii].nToken = nToken;
229 for(ii=1; ii<nPhrase; ii++){
230 Tcl_Obj *pNear = apExprToken[2*ii-1];
231 int nNear;
232 rc = Tcl_GetIntFromObj(interp, pNear, &nNear);
233 if( rc!=TCL_OK ) goto near_match_out;
234 aPhrase[ii].nNear = nNear;
237 pRet = Tcl_NewObj();
238 Tcl_IncrRefCount(pRet);
239 for(ii=0; ii<nPhrase; ii++){
240 int nOcc = nm_match_count(&doc, nPhrase, aPhrase, ii);
241 Tcl_ListObjAppendElement(interp, pRet, Tcl_NewIntObj(nOcc));
242 nTotal += nOcc;
244 if( pPhrasecount ){
245 Tcl_ObjSetVar2(interp, pPhrasecount, 0, pRet, 0);
247 Tcl_DecrRefCount(pRet);
248 Tcl_SetObjResult(interp, Tcl_NewBooleanObj(nTotal>0));
250 near_match_out:
251 ckfree((char *)aPhrase);
252 ckfree((char *)doc.aToken);
253 return rc;
257 ** Tclcmd: fts3_configure_incr_load ?CHUNKSIZE THRESHOLD?
259 ** Normally, FTS uses hard-coded values to determine the minimum doclist
260 ** size eligible for incremental loading, and the size of the chunks loaded
261 ** when a doclist is incrementally loaded. This command allows the built-in
262 ** values to be overridden for testing purposes.
264 ** If present, the first argument is the chunksize in bytes to load doclists
265 ** in. The second argument is the minimum doclist size in bytes to use
266 ** incremental loading with.
268 ** Whether or not the arguments are present, this command returns a list of
269 ** two integers - the initial chunksize and threshold when the command is
270 ** invoked. This can be used to restore the default behavior after running
271 ** tests. For example:
273 ** # Override incr-load settings for testing:
274 ** set cfg [fts3_configure_incr_load $new_chunksize $new_threshold]
276 ** .... run tests ....
278 ** # Restore initial incr-load settings:
279 ** eval fts3_configure_incr_load $cfg
281 static int fts3_configure_incr_load_cmd(
282 ClientData clientData,
283 Tcl_Interp *interp,
284 int objc,
285 Tcl_Obj *CONST objv[]
287 #ifdef SQLITE_ENABLE_FTS3
288 extern int test_fts3_node_chunksize;
289 extern int test_fts3_node_chunk_threshold;
290 Tcl_Obj *pRet;
292 if( objc!=1 && objc!=3 ){
293 Tcl_WrongNumArgs(interp, 1, objv, "?CHUNKSIZE THRESHOLD?");
294 return TCL_ERROR;
297 pRet = Tcl_NewObj();
298 Tcl_IncrRefCount(pRet);
299 Tcl_ListObjAppendElement(
300 interp, pRet, Tcl_NewIntObj(test_fts3_node_chunksize));
301 Tcl_ListObjAppendElement(
302 interp, pRet, Tcl_NewIntObj(test_fts3_node_chunk_threshold));
304 if( objc==3 ){
305 int iArg1;
306 int iArg2;
307 if( Tcl_GetIntFromObj(interp, objv[1], &iArg1)
308 || Tcl_GetIntFromObj(interp, objv[2], &iArg2)
310 Tcl_DecrRefCount(pRet);
311 return TCL_ERROR;
313 test_fts3_node_chunksize = iArg1;
314 test_fts3_node_chunk_threshold = iArg2;
317 Tcl_SetObjResult(interp, pRet);
318 Tcl_DecrRefCount(pRet);
319 #endif
320 UNUSED_PARAMETER(clientData);
321 return TCL_OK;
324 #ifdef SQLITE_ENABLE_FTS3
325 /**************************************************************************
326 ** Beginning of test tokenizer code.
328 ** For language 0, this tokenizer is similar to the default 'simple'
329 ** tokenizer. For other languages L, the following:
331 ** * Odd numbered languages are case-sensitive. Even numbered
332 ** languages are not.
334 ** * Language ids 100 or greater are considered an error.
336 ** The implementation assumes that the input contains only ASCII characters
337 ** (i.e. those that may be encoded in UTF-8 using a single byte).
339 typedef struct test_tokenizer {
340 sqlite3_tokenizer base;
341 } test_tokenizer;
343 typedef struct test_tokenizer_cursor {
344 sqlite3_tokenizer_cursor base;
345 const char *aInput; /* Input being tokenized */
346 int nInput; /* Size of the input in bytes */
347 int iInput; /* Current offset in aInput */
348 int iToken; /* Index of next token to be returned */
349 char *aBuffer; /* Buffer containing current token */
350 int nBuffer; /* Number of bytes allocated at pToken */
351 int iLangid; /* Configured language id */
352 } test_tokenizer_cursor;
354 static int testTokenizerCreate(
355 int argc, const char * const *argv,
356 sqlite3_tokenizer **ppTokenizer
358 test_tokenizer *pNew;
359 UNUSED_PARAMETER(argc);
360 UNUSED_PARAMETER(argv);
362 pNew = sqlite3_malloc(sizeof(test_tokenizer));
363 if( !pNew ) return SQLITE_NOMEM;
364 memset(pNew, 0, sizeof(test_tokenizer));
366 *ppTokenizer = (sqlite3_tokenizer *)pNew;
367 return SQLITE_OK;
370 static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){
371 test_tokenizer *p = (test_tokenizer *)pTokenizer;
372 sqlite3_free(p);
373 return SQLITE_OK;
376 static int testTokenizerOpen(
377 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
378 const char *pInput, int nBytes, /* String to be tokenized */
379 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
381 int rc = SQLITE_OK; /* Return code */
382 test_tokenizer_cursor *pCsr; /* New cursor object */
384 UNUSED_PARAMETER(pTokenizer);
386 pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor));
387 if( pCsr==0 ){
388 rc = SQLITE_NOMEM;
389 }else{
390 memset(pCsr, 0, sizeof(test_tokenizer_cursor));
391 pCsr->aInput = pInput;
392 if( nBytes<0 ){
393 pCsr->nInput = (int)strlen(pInput);
394 }else{
395 pCsr->nInput = nBytes;
399 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
400 return rc;
403 static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){
404 test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
405 sqlite3_free(pCsr->aBuffer);
406 sqlite3_free(pCsr);
407 return SQLITE_OK;
/*
** Return non-zero if c is an ASCII letter ('a'..'z' or 'A'..'Z'). These
** are the only characters that may appear inside a token.
*/
static int testIsTokenChar(char c){
  return (c>='a' && c<='z') || (c>='A' && c<='Z');
}
/*
** If c is an ASCII upper-case letter, return the corresponding lower-case
** letter. Otherwise return c unchanged.
*/
static int testTolower(char c){
  char ret = c;
  if( ret>='A' && ret<='Z') ret = ret - ('A'-'a');
  return ret;
}

419 static int testTokenizerNext(
420 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by testTokenizerOpen */
421 const char **ppToken, /* OUT: *ppToken is the token text */
422 int *pnBytes, /* OUT: Number of bytes in token */
423 int *piStartOffset, /* OUT: Starting offset of token */
424 int *piEndOffset, /* OUT: Ending offset of token */
425 int *piPosition /* OUT: Position integer of token */
427 test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
428 int rc = SQLITE_OK;
429 const char *p;
430 const char *pEnd;
432 p = &pCsr->aInput[pCsr->iInput];
433 pEnd = &pCsr->aInput[pCsr->nInput];
435 /* Skip past any white-space */
436 assert( p<=pEnd );
437 while( p<pEnd && testIsTokenChar(*p)==0 ) p++;
439 if( p==pEnd ){
440 rc = SQLITE_DONE;
441 }else{
442 /* Advance to the end of the token */
443 const char *pToken = p;
444 int nToken;
445 while( p<pEnd && testIsTokenChar(*p) ) p++;
446 nToken = (int)(p-pToken);
448 /* Copy the token into the buffer */
449 if( nToken>pCsr->nBuffer ){
450 sqlite3_free(pCsr->aBuffer);
451 pCsr->aBuffer = sqlite3_malloc(nToken);
453 if( pCsr->aBuffer==0 ){
454 rc = SQLITE_NOMEM;
455 }else{
456 int i;
458 if( pCsr->iLangid & 0x00000001 ){
459 for(i=0; i<nToken; i++) pCsr->aBuffer[i] = pToken[i];
460 }else{
461 for(i=0; i<nToken; i++) pCsr->aBuffer[i] = testTolower(pToken[i]);
463 pCsr->iToken++;
464 pCsr->iInput = (int)(p - pCsr->aInput);
466 *ppToken = pCsr->aBuffer;
467 *pnBytes = nToken;
468 *piStartOffset = (int)(pToken - pCsr->aInput);
469 *piEndOffset = (int)(p - pCsr->aInput);
470 *piPosition = pCsr->iToken;
474 return rc;
477 static int testTokenizerLanguage(
478 sqlite3_tokenizer_cursor *pCursor,
479 int iLangid
481 int rc = SQLITE_OK;
482 test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
483 pCsr->iLangid = iLangid;
484 if( pCsr->iLangid>=100 ){
485 rc = SQLITE_ERROR;
487 return rc;
489 #endif
491 static int fts3_test_tokenizer_cmd(
492 ClientData clientData,
493 Tcl_Interp *interp,
494 int objc,
495 Tcl_Obj *CONST objv[]
497 #ifdef SQLITE_ENABLE_FTS3
498 static const sqlite3_tokenizer_module testTokenizerModule = {
500 testTokenizerCreate,
501 testTokenizerDestroy,
502 testTokenizerOpen,
503 testTokenizerClose,
504 testTokenizerNext,
505 testTokenizerLanguage
507 const sqlite3_tokenizer_module *pPtr = &testTokenizerModule;
508 if( objc!=1 ){
509 Tcl_WrongNumArgs(interp, 1, objv, "");
510 return TCL_ERROR;
512 Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(
513 (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *)
515 #endif
516 UNUSED_PARAMETER(clientData);
517 return TCL_OK;
520 static int fts3_test_varint_cmd(
521 ClientData clientData,
522 Tcl_Interp *interp,
523 int objc,
524 Tcl_Obj *CONST objv[]
526 #ifdef SQLITE_ENABLE_FTS3
527 char aBuf[24];
528 int rc;
529 Tcl_WideInt w, w2;
530 int nByte, nByte2;
532 if( objc!=2 ){
533 Tcl_WrongNumArgs(interp, 1, objv, "INTEGER");
534 return TCL_ERROR;
537 rc = Tcl_GetWideIntFromObj(interp, objv[1], &w);
538 if( rc!=TCL_OK ) return rc;
540 nByte = sqlite3Fts3PutVarint(aBuf, w);
541 nByte2 = sqlite3Fts3GetVarint(aBuf, &w2);
542 if( w!=w2 || nByte!=nByte2 ){
543 char *zErr = sqlite3_mprintf("error testing %lld", w);
544 Tcl_ResetResult(interp);
545 Tcl_AppendResult(interp, zErr, 0);
546 return TCL_ERROR;
549 if( w<=2147483647 && w>=0 ){
550 int i;
551 nByte2 = fts3GetVarint32(aBuf, &i);
552 if( (int)w!=i || nByte!=nByte2 ){
553 char *zErr = sqlite3_mprintf("error testing %lld (32-bit)", w);
554 Tcl_ResetResult(interp);
555 Tcl_AppendResult(interp, zErr, 0);
556 return TCL_ERROR;
560 #endif
561 UNUSED_PARAMETER(clientData);
562 return TCL_OK;
/*
** End of tokenizer code.
**************************************************************************/

569 int Sqlitetestfts3_Init(Tcl_Interp *interp){
570 Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0);
571 Tcl_CreateObjCommand(interp,
572 "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0
574 Tcl_CreateObjCommand(
575 interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0
578 Tcl_CreateObjCommand(
579 interp, "fts3_test_varint", fts3_test_varint_cmd, 0, 0
581 return TCL_OK;
583 #endif /* SQLITE_ENABLE_FTS3 || SQLITE_ENABLE_FTS4 */
584 #endif /* ifdef SQLITE_TEST */