Snapshot of upstream SQLite 3.40.1
[sqlcipher.git] / ext / fts5 / fts5_tokenize.c
blobe61d6b1eddc3ebbe73d0023c870690f38af92099
/*
** 2014 May 31
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
*/
15 #include "fts5Int.h"
/**************************************************************************
** Start of ascii tokenizer implementation.
*/
/*
** For tokenizers with no "unicode" modifier, the set of token characters
** is the same as the set of ASCII range alphanumeric characters. Entry i
** of this table is 1 if byte value i is one of 0-9, A-Z or a-z, and 0
** otherwise.
*/
static unsigned char aAsciiTokenChar[128] = {
  /* 0x00..0x0F */  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0x10..0x1F */  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0x20..0x2F */  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0x30..0x3F */  1,1,1,1, 1,1,1,1, 1,1,0,0, 0,0,0,0,
  /* 0x40..0x4F */  0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x50..0x5F */  1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
  /* 0x60..0x6F */  0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x70..0x7F */  1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0
};
/*
** State of one "ascii" tokenizer instance.
*/
typedef struct AsciiTokenizer {
  unsigned char aTokenChar[128];  /* Non-zero for each ASCII token character */
} AsciiTokenizer;
41 static void fts5AsciiAddExceptions(
42 AsciiTokenizer *p,
43 const char *zArg,
44 int bTokenChars
46 int i;
47 for(i=0; zArg[i]; i++){
48 if( (zArg[i] & 0x80)==0 ){
49 p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars;
55 ** Delete a "ascii" tokenizer.
57 static void fts5AsciiDelete(Fts5Tokenizer *p){
58 sqlite3_free(p);
62 ** Create an "ascii" tokenizer.
64 static int fts5AsciiCreate(
65 void *pUnused,
66 const char **azArg, int nArg,
67 Fts5Tokenizer **ppOut
69 int rc = SQLITE_OK;
70 AsciiTokenizer *p = 0;
71 UNUSED_PARAM(pUnused);
72 if( nArg%2 ){
73 rc = SQLITE_ERROR;
74 }else{
75 p = sqlite3_malloc(sizeof(AsciiTokenizer));
76 if( p==0 ){
77 rc = SQLITE_NOMEM;
78 }else{
79 int i;
80 memset(p, 0, sizeof(AsciiTokenizer));
81 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
82 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
83 const char *zArg = azArg[i+1];
84 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
85 fts5AsciiAddExceptions(p, zArg, 1);
86 }else
87 if( 0==sqlite3_stricmp(azArg[i], "separators") ){
88 fts5AsciiAddExceptions(p, zArg, 0);
89 }else{
90 rc = SQLITE_ERROR;
93 if( rc!=SQLITE_OK ){
94 fts5AsciiDelete((Fts5Tokenizer*)p);
95 p = 0;
100 *ppOut = (Fts5Tokenizer*)p;
101 return rc;
/*
** Copy nByte bytes from aIn to aOut, converting the ASCII letters 'A'..'Z'
** to lower case. All other bytes are copied unchanged.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int iByte = 0;
  while( iByte<nByte ){
    char ch = aIn[iByte];
    if( ch>='A' && ch<='Z' ){
      ch += 32;                     /* 'A'..'Z' -> 'a'..'z' */
    }
    aOut[iByte] = ch;
    iByte++;
  }
}
115 ** Tokenize some text using the ascii tokenizer.
117 static int fts5AsciiTokenize(
118 Fts5Tokenizer *pTokenizer,
119 void *pCtx,
120 int iUnused,
121 const char *pText, int nText,
122 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
124 AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
125 int rc = SQLITE_OK;
126 int ie;
127 int is = 0;
129 char aFold[64];
130 int nFold = sizeof(aFold);
131 char *pFold = aFold;
132 unsigned char *a = p->aTokenChar;
134 UNUSED_PARAM(iUnused);
136 while( is<nText && rc==SQLITE_OK ){
137 int nByte;
139 /* Skip any leading divider characters. */
140 while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
141 is++;
143 if( is==nText ) break;
145 /* Count the token characters */
146 ie = is+1;
147 while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
148 ie++;
151 /* Fold to lower case */
152 nByte = ie-is;
153 if( nByte>nFold ){
154 if( pFold!=aFold ) sqlite3_free(pFold);
155 pFold = sqlite3_malloc64((sqlite3_int64)nByte*2);
156 if( pFold==0 ){
157 rc = SQLITE_NOMEM;
158 break;
160 nFold = nByte*2;
162 asciiFold(pFold, &pText[is], nByte);
164 /* Invoke the token callback */
165 rc = xToken(pCtx, 0, pFold, nByte, is, ie);
166 is = ie+1;
169 if( pFold!=aFold ) sqlite3_free(pFold);
170 if( rc==SQLITE_DONE ) rc = SQLITE_OK;
171 return rc;
/**************************************************************************
** Start of unicode61 tokenizer implementation.
*/
/*
** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
** from the sqlite3 source file utf.c. If this file is compiled as part
** of the amalgamation, they are not required.
*/
#ifndef SQLITE_AMALGAMATION

/*
** Lookup table indexed by (lead byte - 0xC0). Each entry is the payload
** bits carried by that UTF-8 lead byte.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  /* 0xC0..0xC7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xC8..0xCF */  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xD0..0xD7 */  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  /* 0xD8..0xDF */  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* 0xE0..0xE7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xE8..0xEF */  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xF0..0xF7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xF8..0xFF */  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

/*
** Read one character from *zIn into c, advancing zIn past it. The first
** byte is always read; continuation bytes (10xxxxxx) are then consumed
** until zTerm or a non-continuation byte is reached. A multi-byte
** sequence that decodes to a value below 0x80, to a value in the range
** 0xD800..0xDFFF, or to 0xFFFE/0xFFFF is replaced by 0xFFFD.
**
** This expands to several statements that are not wrapped in braces.
*/
#define READ_UTF8(zIn, zTerm, c)                           \
  c = *(zIn++);                                            \
  if( c>=0xc0 ){                                           \
    c = sqlite3Utf8Trans1[c-0xc0];                         \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
      c = (c<<6) + (0x3f & *(zIn++));                      \
    }                                                      \
    if( c<0x80                                             \
        || (c&0xFFFFF800)==0xD800                          \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
  }

/*
** Append the UTF-8 encoding of codepoint c (one to four bytes) at *zOut,
** advancing zOut past the bytes written.
*/
#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (unsigned char)(c&0xFF);                 \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);     \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
  else if( c<0x10000 ){                                \
    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);    \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }else{                                               \
    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);  \
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);  \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
}

#endif /* ifndef SQLITE_AMALGAMATION */
/*
** State of one "unicode61" tokenizer instance.
*/
typedef struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int eRemoveDiacritic;           /* One of the FTS5_REMOVE_DIACRITICS_* values */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Sorted array of exception codepoints */

  unsigned char aCategory[32];    /* True for token char categories */
} Unicode61Tokenizer;
/*
** Values for Unicode61Tokenizer.eRemoveDiacritic. These must match the
** internals of fts5_unicode2.c. The numeric value is the digit accepted
** by the "remove_diacritics" tokenizer option.
*/
#define FTS5_REMOVE_DIACRITICS_NONE    0
#define FTS5_REMOVE_DIACRITICS_SIMPLE  1
#define FTS5_REMOVE_DIACRITICS_COMPLEX 2
249 static int fts5UnicodeAddExceptions(
250 Unicode61Tokenizer *p, /* Tokenizer object */
251 const char *z, /* Characters to treat as exceptions */
252 int bTokenChars /* 1 for 'tokenchars', 0 for 'separators' */
254 int rc = SQLITE_OK;
255 int n = (int)strlen(z);
256 int *aNew;
258 if( n>0 ){
259 aNew = (int*)sqlite3_realloc64(p->aiException,
260 (n+p->nException)*sizeof(int));
261 if( aNew ){
262 int nNew = p->nException;
263 const unsigned char *zCsr = (const unsigned char*)z;
264 const unsigned char *zTerm = (const unsigned char*)&z[n];
265 while( zCsr<zTerm ){
266 u32 iCode;
267 int bToken;
268 READ_UTF8(zCsr, zTerm, iCode);
269 if( iCode<128 ){
270 p->aTokenChar[iCode] = (unsigned char)bTokenChars;
271 }else{
272 bToken = p->aCategory[sqlite3Fts5UnicodeCategory(iCode)];
273 assert( (bToken==0 || bToken==1) );
274 assert( (bTokenChars==0 || bTokenChars==1) );
275 if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
276 int i;
277 for(i=0; i<nNew; i++){
278 if( (u32)aNew[i]>iCode ) break;
280 memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
281 aNew[i] = iCode;
282 nNew++;
286 p->aiException = aNew;
287 p->nException = nNew;
288 }else{
289 rc = SQLITE_NOMEM;
293 return rc;
297 ** Return true if the p->aiException[] array contains the value iCode.
299 static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
300 if( p->nException>0 ){
301 int *a = p->aiException;
302 int iLo = 0;
303 int iHi = p->nException-1;
305 while( iHi>=iLo ){
306 int iTest = (iHi + iLo) / 2;
307 if( iCode==a[iTest] ){
308 return 1;
309 }else if( iCode>a[iTest] ){
310 iLo = iTest+1;
311 }else{
312 iHi = iTest-1;
317 return 0;
321 ** Delete a "unicode61" tokenizer.
323 static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
324 if( pTok ){
325 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
326 sqlite3_free(p->aiException);
327 sqlite3_free(p->aFold);
328 sqlite3_free(p);
330 return;
333 static int unicodeSetCategories(Unicode61Tokenizer *p, const char *zCat){
334 const char *z = zCat;
336 while( *z ){
337 while( *z==' ' || *z=='\t' ) z++;
338 if( *z && sqlite3Fts5UnicodeCatParse(z, p->aCategory) ){
339 return SQLITE_ERROR;
341 while( *z!=' ' && *z!='\t' && *z!='\0' ) z++;
344 sqlite3Fts5UnicodeAscii(p->aCategory, p->aTokenChar);
345 return SQLITE_OK;
349 ** Create a "unicode61" tokenizer.
351 static int fts5UnicodeCreate(
352 void *pUnused,
353 const char **azArg, int nArg,
354 Fts5Tokenizer **ppOut
356 int rc = SQLITE_OK; /* Return code */
357 Unicode61Tokenizer *p = 0; /* New tokenizer object */
359 UNUSED_PARAM(pUnused);
361 if( nArg%2 ){
362 rc = SQLITE_ERROR;
363 }else{
364 p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
365 if( p ){
366 const char *zCat = "L* N* Co";
367 int i;
368 memset(p, 0, sizeof(Unicode61Tokenizer));
370 p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
371 p->nFold = 64;
372 p->aFold = sqlite3_malloc64(p->nFold * sizeof(char));
373 if( p->aFold==0 ){
374 rc = SQLITE_NOMEM;
377 /* Search for a "categories" argument */
378 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
379 if( 0==sqlite3_stricmp(azArg[i], "categories") ){
380 zCat = azArg[i+1];
384 if( rc==SQLITE_OK ){
385 rc = unicodeSetCategories(p, zCat);
388 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
389 const char *zArg = azArg[i+1];
390 if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
391 if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
392 rc = SQLITE_ERROR;
393 }else{
394 p->eRemoveDiacritic = (zArg[0] - '0');
395 assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
396 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
397 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
400 }else
401 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
402 rc = fts5UnicodeAddExceptions(p, zArg, 1);
403 }else
404 if( 0==sqlite3_stricmp(azArg[i], "separators") ){
405 rc = fts5UnicodeAddExceptions(p, zArg, 0);
406 }else
407 if( 0==sqlite3_stricmp(azArg[i], "categories") ){
408 /* no-op */
409 }else{
410 rc = SQLITE_ERROR;
414 }else{
415 rc = SQLITE_NOMEM;
417 if( rc!=SQLITE_OK ){
418 fts5UnicodeDelete((Fts5Tokenizer*)p);
419 p = 0;
421 *ppOut = (Fts5Tokenizer*)p;
423 return rc;
427 ** Return true if, for the purposes of tokenizing with the tokenizer
428 ** passed as the first argument, codepoint iCode is considered a token
429 ** character (not a separator).
431 static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
432 return (
433 p->aCategory[sqlite3Fts5UnicodeCategory((u32)iCode)]
434 ^ fts5UnicodeIsException(p, iCode)
438 static int fts5UnicodeTokenize(
439 Fts5Tokenizer *pTokenizer,
440 void *pCtx,
441 int iUnused,
442 const char *pText, int nText,
443 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
445 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
446 int rc = SQLITE_OK;
447 unsigned char *a = p->aTokenChar;
449 unsigned char *zTerm = (unsigned char*)&pText[nText];
450 unsigned char *zCsr = (unsigned char *)pText;
452 /* Output buffer */
453 char *aFold = p->aFold;
454 int nFold = p->nFold;
455 const char *pEnd = &aFold[nFold-6];
457 UNUSED_PARAM(iUnused);
459 /* Each iteration of this loop gobbles up a contiguous run of separators,
460 ** then the next token. */
461 while( rc==SQLITE_OK ){
462 u32 iCode; /* non-ASCII codepoint read from input */
463 char *zOut = aFold;
464 int is;
465 int ie;
467 /* Skip any separator characters. */
468 while( 1 ){
469 if( zCsr>=zTerm ) goto tokenize_done;
470 if( *zCsr & 0x80 ) {
471 /* A character outside of the ascii range. Skip past it if it is
472 ** a separator character. Or break out of the loop if it is not. */
473 is = zCsr - (unsigned char*)pText;
474 READ_UTF8(zCsr, zTerm, iCode);
475 if( fts5UnicodeIsAlnum(p, iCode) ){
476 goto non_ascii_tokenchar;
478 }else{
479 if( a[*zCsr] ){
480 is = zCsr - (unsigned char*)pText;
481 goto ascii_tokenchar;
483 zCsr++;
487 /* Run through the tokenchars. Fold them into the output buffer along
488 ** the way. */
489 while( zCsr<zTerm ){
491 /* Grow the output buffer so that there is sufficient space to fit the
492 ** largest possible utf-8 character. */
493 if( zOut>pEnd ){
494 aFold = sqlite3_malloc64((sqlite3_int64)nFold*2);
495 if( aFold==0 ){
496 rc = SQLITE_NOMEM;
497 goto tokenize_done;
499 zOut = &aFold[zOut - p->aFold];
500 memcpy(aFold, p->aFold, nFold);
501 sqlite3_free(p->aFold);
502 p->aFold = aFold;
503 p->nFold = nFold = nFold*2;
504 pEnd = &aFold[nFold-6];
507 if( *zCsr & 0x80 ){
508 /* An non-ascii-range character. Fold it into the output buffer if
509 ** it is a token character, or break out of the loop if it is not. */
510 READ_UTF8(zCsr, zTerm, iCode);
511 if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
512 non_ascii_tokenchar:
513 iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
514 if( iCode ) WRITE_UTF8(zOut, iCode);
515 }else{
516 break;
518 }else if( a[*zCsr]==0 ){
519 /* An ascii-range separator character. End of token. */
520 break;
521 }else{
522 ascii_tokenchar:
523 if( *zCsr>='A' && *zCsr<='Z' ){
524 *zOut++ = *zCsr + 32;
525 }else{
526 *zOut++ = *zCsr;
528 zCsr++;
530 ie = zCsr - (unsigned char*)pText;
533 /* Invoke the token callback */
534 rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
537 tokenize_done:
538 if( rc==SQLITE_DONE ) rc = SQLITE_OK;
539 return rc;
/**************************************************************************
** Start of porter stemmer implementation.
*/
/*
** Any tokens larger than this (in bytes) are passed through without
** stemming.
*/
#define FTS5_PORTER_MAX_TOKEN 64
550 typedef struct PorterTokenizer PorterTokenizer;
551 struct PorterTokenizer {
552 fts5_tokenizer tokenizer; /* Parent tokenizer module */
553 Fts5Tokenizer *pTokenizer; /* Parent tokenizer instance */
554 char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
558 ** Delete a "porter" tokenizer.
560 static void fts5PorterDelete(Fts5Tokenizer *pTok){
561 if( pTok ){
562 PorterTokenizer *p = (PorterTokenizer*)pTok;
563 if( p->pTokenizer ){
564 p->tokenizer.xDelete(p->pTokenizer);
566 sqlite3_free(p);
571 ** Create a "porter" tokenizer.
573 static int fts5PorterCreate(
574 void *pCtx,
575 const char **azArg, int nArg,
576 Fts5Tokenizer **ppOut
578 fts5_api *pApi = (fts5_api*)pCtx;
579 int rc = SQLITE_OK;
580 PorterTokenizer *pRet;
581 void *pUserdata = 0;
582 const char *zBase = "unicode61";
584 if( nArg>0 ){
585 zBase = azArg[0];
588 pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
589 if( pRet ){
590 memset(pRet, 0, sizeof(PorterTokenizer));
591 rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer);
592 }else{
593 rc = SQLITE_NOMEM;
595 if( rc==SQLITE_OK ){
596 int nArg2 = (nArg>0 ? nArg-1 : 0);
597 const char **azArg2 = (nArg2 ? &azArg[1] : 0);
598 rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer);
601 if( rc!=SQLITE_OK ){
602 fts5PorterDelete((Fts5Tokenizer*)pRet);
603 pRet = 0;
605 *ppOut = (Fts5Tokenizer*)pRet;
606 return rc;
/*
** Context passed to fts5PorterCb() while a porter tokenizer is running.
*/
typedef struct PorterContext {
  void *pCtx;                     /* First argument for xToken() */
  int (*xToken)(void*, int, const char*, int, int, int);  /* Real callback */
  char *aBuf;                     /* Scratch buffer used for stemming */
} PorterContext;
/*
** A single suffix-rewriting rule, as used by fts5PorterApply().
*/
typedef struct PorterRule {
  const char *zSuffix;            /* Suffix to match */
  int nSuffix;                    /* Length of zSuffix in bytes */
  int (*xCond)(char *zStem, int nStem);  /* Condition on the stem, or 0 */
  const char *zOutput;            /* Replacement text */
  int nOutput;                    /* Length of zOutput in bytes */
} PorterRule;
#if 0
/*
** Table-driven rule application; compiled out. aRule[] is terminated by
** an entry whose zSuffix is NULL. The first rule whose suffix matches the
** end of aBuf is selected; if it has no condition or its condition holds
** for the stem, the suffix is replaced and the rule's index is returned.
** Otherwise -1 is returned.
*/
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int nWord = *pnBuf;
  PorterRule *pRule = aRule;

  /* Locate the first rule whose suffix matches the end of the word. */
  while( pRule->zSuffix ){
    assert( strlen(pRule->zSuffix)==pRule->nSuffix );
    assert( strlen(pRule->zOutput)==pRule->nOutput );
    if( nWord>=pRule->nSuffix
     && 0==memcmp(&aBuf[nWord - pRule->nSuffix], pRule->zSuffix, pRule->nSuffix)
    ){
      break;
    }
    pRule++;
  }

  if( pRule->zSuffix ){
    int nStem = nWord - pRule->nSuffix;
    if( pRule->xCond==0 || pRule->xCond(aBuf, nStem) ){
      memcpy(&aBuf[nStem], pRule->zOutput, pRule->nOutput);
      *pnBuf = nStem + pRule->nOutput;
      return pRule - aRule;
    }
  }

  return -1;
}
#endif
/*
** Return 1 if c is one of the lower-case vowels a, e, i, o, u, or if c is
** 'y' and bYIsVowel is true. Return 0 otherwise.
*/
static int fts5PorterIsVowel(char c, int bYIsVowel){
  switch( c ){
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return 1;
    case 'y':
      return bYIsVowel ? 1 : 0;
    default:
      return 0;
  }
}
/*
** Scan zStem[0..nStem-1] for the first vowel, then for the first
** consonant after it. If both are found, return the offset just past that
** consonant; otherwise return 0. bPrevCons says whether the character
** before zStem[0] was a consonant, which decides whether a leading 'y'
** counts as a vowel.
*/
static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
  int iPos = 0;
  int bCons = bPrevCons;

  /* Phase 1: advance to the first vowel. */
  while( iPos<nStem ){
    bCons = !fts5PorterIsVowel(zStem[iPos], bCons);
    if( bCons==0 ) break;
    iPos++;
  }

  /* Phase 2: starting after that vowel, look for a consonant. */
  for(iPos=iPos+1; iPos<nStem; iPos++){
    bCons = !fts5PorterIsVowel(zStem[iPos], bCons);
    if( bCons ) return iPos+1;
  }
  return 0;
}
/*
** porter rule condition: (m > 0)
**
** True if the stem contains at least one vowel followed by a consonant.
*/
static int fts5Porter_MGt0(char *zStem, int nStem){
  return fts5PorterGobbleVC(zStem, nStem, 0)!=0;
}
/*
** porter rule condition: (m > 1)
**
** True if two consecutive vowel-consonant sequences can be gobbled from
** the stem.
*/
static int fts5Porter_MGt1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)!=0;
}
/*
** porter rule condition: (m = 1)
**
** True if exactly one vowel-consonant sequence can be gobbled from the
** stem.
*/
static int fts5Porter_MEq1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)==0;
}
/*
** porter rule condition: (*o)
**
** True if the stem ends consonant-vowel-consonant and the final consonant
** is not 'w', 'x' or 'y'. The caller must pass nStem>0, because
** zStem[nStem-1] is read unconditionally.
**
** The consonant/vowel class of each character depends on the class of the
** character before it (for 'y'), so the whole stem is scanned. Only the
** classes of the last three characters matter, so the history mask is
** truncated to three bits on every step. This keeps the left shift well
** inside the range of int regardless of the stem length.
*/
static int fts5Porter_Ostar(char *zStem, int nStem){
  if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
    return 0;
  }else{
    int i;
    int mask = 0;                 /* Bit k set: k-th char from the end is a consonant */
    int bCons = 0;
    for(i=0; i<nStem; i++){
      bCons = !fts5PorterIsVowel(zStem[i], bCons);
      assert( bCons==0 || bCons==1 );
      mask = ((mask << 1) + bCons) & 0x0007;
    }
    /* 101 binary: consonant, vowel, consonant. */
    return (mask==0x0005);
  }
}
/*
** porter rule condition: (m > 1 and (*S or *T))
**
** True if the stem ends in 's' or 't' and also satisfies (m > 1). The
** stem must not be empty.
*/
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  char cLast;
  assert( nStem>0 );
  cLast = zStem[nStem-1];
  if( cLast!='s' && cLast!='t' ) return 0;
  return fts5Porter_MGt1(zStem, nStem);
}
/*
** porter rule condition: (*v*)
**
** True if the stem contains a vowel. A 'y' counts as a vowel anywhere
** except in the first position.
*/
static int fts5Porter_Vowel(char *zStem, int nStem){
  int iPos;
  if( nStem>0 && fts5PorterIsVowel(zStem[0], 0) ) return 1;
  for(iPos=1; iPos<nStem; iPos++){
    if( fts5PorterIsVowel(zStem[iPos], 1) ) return 1;
  }
  return 0;
}
/**************************************************************************
***************************************************************************
** GENERATED CODE STARTS HERE (mkportersteps.tcl)
*/
/*
** Return true if the nSfx-byte string zSfx is a proper suffix of the
** nBuf-byte word in aBuf, i.e. the word ends with zSfx and is longer
** than it.
*/
static int fts5PorterStep4Ends(
  const char *aBuf, int nBuf, const char *zSfx, int nSfx
){
  return nBuf>nSfx && 0==memcmp(zSfx, &aBuf[nBuf-nSfx], nSfx);
}

/*
** Remove the last nSfx bytes of the word if the stem that remains
** satisfies the porter condition (m > 1).
*/
static void fts5PorterStep4Trim(char *aBuf, int *pnBuf, int nSfx){
  int nStem = *pnBuf - nSfx;
  if( fts5Porter_MGt1(aBuf, nStem) ) *pnBuf = nStem;
}

/*
** Porter stemmer step 4: strip one of a fixed set of suffixes if the
** remaining stem satisfies (m > 1) (for "ion": m > 1 and the stem ends in
** 's' or 't'). aBuf holds the word and *pnBuf its length in bytes;
** *pnBuf is updated if a suffix is removed. Within each group the first
** suffix that matches is the only one considered, whether or not its
** condition holds. Always returns 0.
**
** Words shorter than two bytes are left alone. No rule can match them,
** and the penultimate byte used to select the rule group does not exist.
*/
static int fts5PorterStep4(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;
  if( nBuf<2 ) return 0;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( fts5PorterStep4Ends(aBuf, nBuf, "al", 2) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 2);
      }
      break;

    case 'c':
      if( fts5PorterStep4Ends(aBuf, nBuf, "ance", 4) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 4);
      }else if( fts5PorterStep4Ends(aBuf, nBuf, "ence", 4) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 4);
      }
      break;

    case 'e':
      if( fts5PorterStep4Ends(aBuf, nBuf, "er", 2) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 2);
      }
      break;

    case 'i':
      if( fts5PorterStep4Ends(aBuf, nBuf, "ic", 2) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 2);
      }
      break;

    case 'l':
      if( fts5PorterStep4Ends(aBuf, nBuf, "able", 4) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 4);
      }else if( fts5PorterStep4Ends(aBuf, nBuf, "ible", 4) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 4);
      }
      break;

    case 'n':
      if( fts5PorterStep4Ends(aBuf, nBuf, "ant", 3) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 3);
      }else if( fts5PorterStep4Ends(aBuf, nBuf, "ement", 5) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 5);
      }else if( fts5PorterStep4Ends(aBuf, nBuf, "ment", 4) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 4);
      }else if( fts5PorterStep4Ends(aBuf, nBuf, "ent", 3) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 3);
      }
      break;

    case 'o':
      if( fts5PorterStep4Ends(aBuf, nBuf, "ion", 3) ){
        /* "ion" has its own condition: m>1 and stem ends in 's' or 't'. */
        if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( fts5PorterStep4Ends(aBuf, nBuf, "ou", 2) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 2);
      }
      break;

    case 's':
      if( fts5PorterStep4Ends(aBuf, nBuf, "ism", 3) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 3);
      }
      break;

    case 't':
      if( fts5PorterStep4Ends(aBuf, nBuf, "ate", 3) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 3);
      }else if( fts5PorterStep4Ends(aBuf, nBuf, "iti", 3) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 3);
      }
      break;

    case 'u':
      if( fts5PorterStep4Ends(aBuf, nBuf, "ous", 3) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 3);
      }
      break;

    case 'v':
      if( fts5PorterStep4Ends(aBuf, nBuf, "ive", 3) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 3);
      }
      break;

    case 'z':
      if( fts5PorterStep4Ends(aBuf, nBuf, "ize", 3) ){
        fts5PorterStep4Trim(aBuf, pnBuf, 3);
      }
      break;

  }
  return 0;
}
/*
** Porter stemmer step 1b, second part: if the word in aBuf (length
** *pnBuf bytes) ends in "at", "bl" or "iz" and is longer than two bytes,
** append an 'e' ("ate", "ble", "ize"), increment *pnBuf and return 1.
** Otherwise leave the word unchanged and return 0.
**
** The caller must provide room for one extra byte in aBuf. Words shorter
** than two bytes are left alone. No rule can match them, and the
** penultimate byte used to select the rule does not exist.
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;
  const char *zNew = 0;           /* Three-byte replacement, if any */

  if( nBuf<2 ) return 0;

  /* The switch fixes the penultimate byte, so only the last byte remains
  ** to be compared. */
  switch( aBuf[nBuf-2] ){
    case 'a':
      if( nBuf>2 && aBuf[nBuf-1]=='t' ) zNew = "ate";
      break;
    case 'b':
      if( nBuf>2 && aBuf[nBuf-1]=='l' ) zNew = "ble";
      break;
    case 'i':
      if( nBuf>2 && aBuf[nBuf-1]=='z' ) zNew = "ize";
      break;
  }

  if( zNew==0 ) return 0;
  memcpy(&aBuf[nBuf-2], zNew, 3);
  *pnBuf = nBuf - 2 + 3;
  return 1;
}
/*
** Return true if the nSfx-byte string zSfx is a proper suffix of the
** nBuf-byte word in aBuf, i.e. the word ends with zSfx and is longer
** than it.
*/
static int fts5PorterStep2Ends(
  const char *aBuf, int nBuf, const char *zSfx, int nSfx
){
  return nBuf>nSfx && 0==memcmp(zSfx, &aBuf[nBuf-nSfx], nSfx);
}

/*
** If the stem left after removing the last nSfx bytes of the word
** satisfies the porter condition (m > 0), replace those bytes with the
** nNew-byte string zNew and update *pnBuf.
*/
static void fts5PorterStep2Swap(
  char *aBuf, int *pnBuf, int nSfx, const char *zNew, int nNew
){
  int nStem = *pnBuf - nSfx;
  if( fts5Porter_MGt0(aBuf, nStem) ){
    memcpy(&aBuf[nStem], zNew, nNew);
    *pnBuf = nStem + nNew;
  }
}

/*
** Porter stemmer step 2: map one of a fixed set of suffixes to a shorter
** form if the remaining stem satisfies (m > 0). aBuf holds the word and
** *pnBuf its length in bytes. Within each group the first suffix that
** matches is the only one considered, whether or not its condition
** holds. Always returns 0.
**
** Words shorter than two bytes are left alone. No rule can match them,
** and the penultimate byte used to select the rule group does not exist.
*/
static int fts5PorterStep2(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;
  if( nBuf<2 ) return 0;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( fts5PorterStep2Ends(aBuf, nBuf, "ational", 7) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 7, "ate", 3);
      }else if( fts5PorterStep2Ends(aBuf, nBuf, "tional", 6) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 6, "tion", 4);
      }
      break;

    case 'c':
      if( fts5PorterStep2Ends(aBuf, nBuf, "enci", 4) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 4, "ence", 4);
      }else if( fts5PorterStep2Ends(aBuf, nBuf, "anci", 4) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 4, "ance", 4);
      }
      break;

    case 'e':
      if( fts5PorterStep2Ends(aBuf, nBuf, "izer", 4) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 4, "ize", 3);
      }
      break;

    case 'g':
      if( fts5PorterStep2Ends(aBuf, nBuf, "logi", 4) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 4, "log", 3);
      }
      break;

    case 'l':
      if( fts5PorterStep2Ends(aBuf, nBuf, "bli", 3) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 3, "ble", 3);
      }else if( fts5PorterStep2Ends(aBuf, nBuf, "alli", 4) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 4, "al", 2);
      }else if( fts5PorterStep2Ends(aBuf, nBuf, "entli", 5) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 5, "ent", 3);
      }else if( fts5PorterStep2Ends(aBuf, nBuf, "eli", 3) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 3, "e", 1);
      }else if( fts5PorterStep2Ends(aBuf, nBuf, "ousli", 5) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 5, "ous", 3);
      }
      break;

    case 'o':
      if( fts5PorterStep2Ends(aBuf, nBuf, "ization", 7) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 7, "ize", 3);
      }else if( fts5PorterStep2Ends(aBuf, nBuf, "ation", 5) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 5, "ate", 3);
      }else if( fts5PorterStep2Ends(aBuf, nBuf, "ator", 4) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 4, "ate", 3);
      }
      break;

    case 's':
      if( fts5PorterStep2Ends(aBuf, nBuf, "alism", 5) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 5, "al", 2);
      }else if( fts5PorterStep2Ends(aBuf, nBuf, "iveness", 7) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 7, "ive", 3);
      }else if( fts5PorterStep2Ends(aBuf, nBuf, "fulness", 7) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 7, "ful", 3);
      }else if( fts5PorterStep2Ends(aBuf, nBuf, "ousness", 7) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 7, "ous", 3);
      }
      break;

    case 't':
      if( fts5PorterStep2Ends(aBuf, nBuf, "aliti", 5) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 5, "al", 2);
      }else if( fts5PorterStep2Ends(aBuf, nBuf, "iviti", 5) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 5, "ive", 3);
      }else if( fts5PorterStep2Ends(aBuf, nBuf, "biliti", 6) ){
        fts5PorterStep2Swap(aBuf, pnBuf, 6, "ble", 3);
      }
      break;

  }
  return 0;
}
/*
** Return true if the nSfx-byte string zSfx is a proper suffix of the
** nBuf-byte word in aBuf, i.e. the word ends with zSfx and is longer
** than it.
*/
static int fts5PorterStep3Ends(
  const char *aBuf, int nBuf, const char *zSfx, int nSfx
){
  return nBuf>nSfx && 0==memcmp(zSfx, &aBuf[nBuf-nSfx], nSfx);
}

/*
** If the stem left after removing the last nSfx bytes of the word
** satisfies the porter condition (m > 0), replace those bytes with the
** nNew-byte string zNew (nNew may be 0 to simply remove the suffix) and
** update *pnBuf.
*/
static void fts5PorterStep3Swap(
  char *aBuf, int *pnBuf, int nSfx, const char *zNew, int nNew
){
  int nStem = *pnBuf - nSfx;
  if( fts5Porter_MGt0(aBuf, nStem) ){
    if( nNew>0 ) memcpy(&aBuf[nStem], zNew, nNew);
    *pnBuf = nStem + nNew;
  }
}

/*
** Porter stemmer step 3: shorten or remove one of a fixed set of suffixes
** if the remaining stem satisfies (m > 0). aBuf holds the word and
** *pnBuf its length in bytes. Within each group the first suffix that
** matches is the only one considered, whether or not its condition
** holds. Always returns 0.
**
** Words shorter than two bytes are left alone. No rule can match them,
** and the penultimate byte used to select the rule group does not exist.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;
  if( nBuf<2 ) return 0;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( fts5PorterStep3Ends(aBuf, nBuf, "ical", 4) ){
        fts5PorterStep3Swap(aBuf, pnBuf, 4, "ic", 2);
      }
      break;

    case 's':
      if( fts5PorterStep3Ends(aBuf, nBuf, "ness", 4) ){
        fts5PorterStep3Swap(aBuf, pnBuf, 4, "", 0);
      }
      break;

    case 't':
      if( fts5PorterStep3Ends(aBuf, nBuf, "icate", 5) ){
        fts5PorterStep3Swap(aBuf, pnBuf, 5, "ic", 2);
      }else if( fts5PorterStep3Ends(aBuf, nBuf, "iciti", 5) ){
        fts5PorterStep3Swap(aBuf, pnBuf, 5, "ic", 2);
      }
      break;

    case 'u':
      if( fts5PorterStep3Ends(aBuf, nBuf, "ful", 3) ){
        fts5PorterStep3Swap(aBuf, pnBuf, 3, "", 0);
      }
      break;

    case 'v':
      if( fts5PorterStep3Ends(aBuf, nBuf, "ative", 5) ){
        fts5PorterStep3Swap(aBuf, pnBuf, 5, "", 0);
      }
      break;

    case 'z':
      if( fts5PorterStep3Ends(aBuf, nBuf, "alize", 5) ){
        fts5PorterStep3Swap(aBuf, pnBuf, 5, "al", 2);
      }
      break;

  }
  return 0;
}
/*
** Porter stemmer step 1b, first part. aBuf holds the word and *pnBuf its
** length in bytes.
**
**   "eed" -> "ee"   if the stem satisfies (m > 0)
**   "ed"  -> ""     if the stem contains a vowel
**   "ing" -> ""     if the stem contains a vowel
**
** Returns 1 if "ed" or "ing" was removed and 0 otherwise (including when
** "eed" was rewritten). A word ending in "eed" is never tested against
** the "ed" rule.
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  const int nWord = *pnBuf;
  int bStripped = 0;

  switch( aBuf[nWord-2] ){

    case 'e':
      if( nWord>3 && memcmp(&aBuf[nWord-3], "eed", 3)==0 ){
        if( fts5Porter_MGt0(aBuf, nWord-3) ){
          memcpy(&aBuf[nWord-3], "ee", 2);
          *pnBuf = nWord - 3 + 2;
        }
      }else if( nWord>2 && memcmp(&aBuf[nWord-2], "ed", 2)==0 ){
        if( fts5Porter_Vowel(aBuf, nWord-2) ){
          *pnBuf = nWord - 2;
          bStripped = 1;
        }
      }
      break;

    case 'n':
      if( nWord>3 && memcmp(&aBuf[nWord-3], "ing", 3)==0 ){
        if( fts5Porter_Vowel(aBuf, nWord-3) ){
          *pnBuf = nWord - 3;
          bStripped = 1;
        }
      }
      break;

  }
  return bStripped;
}
/*
** GENERATED CODE ENDS HERE (mkportersteps.tcl)
***************************************************************************
**************************************************************************/
/*
** Porter stemmer step 1a: plural handling. aBuf holds the word and *pnBuf
** its length in bytes; the word must be at least two bytes long.
**
**   "sses" -> "ss"      (word longer than 4 bytes)
**   "ies"  -> "i"       (word longer than 3 bytes)
**   other "es" -> drop the final 's'
**   "ss"   -> unchanged
**   other "s" -> drop the final 's'
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  const int nWord = *pnBuf;

  if( aBuf[nWord-1]!='s' ) return;

  if( aBuf[nWord-2]=='e' ){
    if( nWord>4 && aBuf[nWord-4]=='s' && aBuf[nWord-3]=='s' ){
      *pnBuf = nWord-2;
    }else if( nWord>3 && aBuf[nWord-3]=='i' ){
      *pnBuf = nWord-2;
    }else{
      *pnBuf = nWord-1;
    }
    return;
  }

  if( aBuf[nWord-2]!='s' ){
    *pnBuf = nWord-1;
  }
}
1175 static int fts5PorterCb(
1176 void *pCtx,
1177 int tflags,
1178 const char *pToken,
1179 int nToken,
1180 int iStart,
1181 int iEnd
1183 PorterContext *p = (PorterContext*)pCtx;
1185 char *aBuf;
1186 int nBuf;
1188 if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
1189 aBuf = p->aBuf;
1190 nBuf = nToken;
1191 memcpy(aBuf, pToken, nBuf);
1193 /* Step 1. */
1194 fts5PorterStep1A(aBuf, &nBuf);
1195 if( fts5PorterStep1B(aBuf, &nBuf) ){
1196 if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
1197 char c = aBuf[nBuf-1];
1198 if( fts5PorterIsVowel(c, 0)==0
1199 && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
1201 nBuf--;
1202 }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
1203 aBuf[nBuf++] = 'e';
1208 /* Step 1C. */
1209 if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
1210 aBuf[nBuf-1] = 'i';
1213 /* Steps 2 through 4. */
1214 fts5PorterStep2(aBuf, &nBuf);
1215 fts5PorterStep3(aBuf, &nBuf);
1216 fts5PorterStep4(aBuf, &nBuf);
1218 /* Step 5a. */
1219 assert( nBuf>0 );
1220 if( aBuf[nBuf-1]=='e' ){
1221 if( fts5Porter_MGt1(aBuf, nBuf-1)
1222 || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
1224 nBuf--;
1228 /* Step 5b. */
1229 if( nBuf>1 && aBuf[nBuf-1]=='l'
1230 && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1)
1232 nBuf--;
1235 return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
1237 pass_through:
1238 return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
1242 ** Tokenize using the porter tokenizer.
1244 static int fts5PorterTokenize(
1245 Fts5Tokenizer *pTokenizer,
1246 void *pCtx,
1247 int flags,
1248 const char *pText, int nText,
1249 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
1251 PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
1252 PorterContext sCtx;
1253 sCtx.xToken = xToken;
1254 sCtx.pCtx = pCtx;
1255 sCtx.aBuf = p->aBuf;
1256 return p->tokenizer.xTokenize(
1257 p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
/**************************************************************************
** Start of trigram implementation.
*/
/*
** State of one "trigram" tokenizer instance.
*/
typedef struct TrigramTokenizer {
  int bFold;                      /* True to fold to lower-case */
} TrigramTokenizer;
1270 ** Free a trigram tokenizer.
1272 static void fts5TriDelete(Fts5Tokenizer *p){
1273 sqlite3_free(p);
1277 ** Allocate a trigram tokenizer.
1279 static int fts5TriCreate(
1280 void *pUnused,
1281 const char **azArg,
1282 int nArg,
1283 Fts5Tokenizer **ppOut
1285 int rc = SQLITE_OK;
1286 TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew));
1287 UNUSED_PARAM(pUnused);
1288 if( pNew==0 ){
1289 rc = SQLITE_NOMEM;
1290 }else{
1291 int i;
1292 pNew->bFold = 1;
1293 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
1294 const char *zArg = azArg[i+1];
1295 if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){
1296 if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
1297 rc = SQLITE_ERROR;
1298 }else{
1299 pNew->bFold = (zArg[0]=='0');
1301 }else{
1302 rc = SQLITE_ERROR;
1305 if( rc!=SQLITE_OK ){
1306 fts5TriDelete((Fts5Tokenizer*)pNew);
1307 pNew = 0;
1310 *ppOut = (Fts5Tokenizer*)pNew;
1311 return rc;
1315 ** Trigram tokenizer tokenize routine.
1317 static int fts5TriTokenize(
1318 Fts5Tokenizer *pTok,
1319 void *pCtx,
1320 int unusedFlags,
1321 const char *pText, int nText,
1322 int (*xToken)(void*, int, const char*, int, int, int)
1324 TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1325 int rc = SQLITE_OK;
1326 char aBuf[32];
1327 const unsigned char *zIn = (const unsigned char*)pText;
1328 const unsigned char *zEof = &zIn[nText];
1329 u32 iCode;
1331 UNUSED_PARAM(unusedFlags);
1332 while( 1 ){
1333 char *zOut = aBuf;
1334 int iStart = zIn - (const unsigned char*)pText;
1335 const unsigned char *zNext;
1337 READ_UTF8(zIn, zEof, iCode);
1338 if( iCode==0 ) break;
1339 zNext = zIn;
1340 if( zIn<zEof ){
1341 if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
1342 WRITE_UTF8(zOut, iCode);
1343 READ_UTF8(zIn, zEof, iCode);
1344 if( iCode==0 ) break;
1345 }else{
1346 break;
1348 if( zIn<zEof ){
1349 if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
1350 WRITE_UTF8(zOut, iCode);
1351 READ_UTF8(zIn, zEof, iCode);
1352 if( iCode==0 ) break;
1353 if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
1354 WRITE_UTF8(zOut, iCode);
1355 }else{
1356 break;
1358 rc = xToken(pCtx, 0, aBuf, zOut-aBuf, iStart, iStart + zOut-aBuf);
1359 if( rc!=SQLITE_OK ) break;
1360 zIn = zNext;
1363 return rc;
1367 ** Argument xCreate is a pointer to a constructor function for a tokenizer.
1368 ** pTok is a tokenizer previously created using the same method. This function
1369 ** returns one of FTS5_PATTERN_NONE, FTS5_PATTERN_LIKE or FTS5_PATTERN_GLOB
1370 ** indicating the style of pattern matching that the tokenizer can support.
1371 ** In practice, this is:
1373 ** "trigram" tokenizer, case_sensitive=1 - FTS5_PATTERN_GLOB
1374 ** "trigram" tokenizer, case_sensitive=0 (the default) - FTS5_PATTERN_LIKE
1375 ** all other tokenizers - FTS5_PATTERN_NONE
1377 int sqlite3Fts5TokenizerPattern(
1378 int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
1379 Fts5Tokenizer *pTok
1381 if( xCreate==fts5TriCreate ){
1382 TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1383 return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB;
1385 return FTS5_PATTERN_NONE;
1389 ** Register all built-in tokenizers with FTS5.
1391 int sqlite3Fts5TokenizerInit(fts5_api *pApi){
1392 struct BuiltinTokenizer {
1393 const char *zName;
1394 fts5_tokenizer x;
1395 } aBuiltin[] = {
1396 { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
1397 { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
1398 { "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
1399 { "trigram", {fts5TriCreate, fts5TriDelete, fts5TriTokenize}},
1402 int rc = SQLITE_OK; /* Return code */
1403 int i; /* To iterate through builtin functions */
1405 for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
1406 rc = pApi->xCreateTokenizer(pApi,
1407 aBuiltin[i].zName,
1408 (void*)pApi,
1409 &aBuiltin[i].x,
1414 return rc;