skip unnecessary sqlcipher_free calls
[sqlcipher.git] / ext / fts5 / fts5_tokenize.c
blobf12056170fa296f769a5a8aa3d9cadb631476b52
1 /*
2 ** 2014 May 31
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
15 #include "fts5Int.h"
17 /**************************************************************************
18 ** Start of ascii tokenizer implementation.
22 ** For tokenizers with no "unicode" modifier, the set of token characters
23 ** is the same as the set of ASCII range alphanumeric characters.
/* Default token-character table for the "ascii" tokenizer, indexed by
** ASCII code. Entries are 1 for '0'-'9', 'A'-'Z' and 'a'-'z', and 0 for
** every other value in the range 0x00..0x7F. */
25 static unsigned char aAsciiTokenChar[128] = {
26 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00..0x0F */
27 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10..0x1F */
28 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20..0x2F */
29 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 0x30..0x3F */
30 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40..0x4F */
31 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x50..0x5F */
32 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60..0x6F */
33 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x70..0x7F */
/*
** State of one "ascii" tokenizer instance: a lookup table, indexed by
** ASCII code, holding non-zero for token characters and zero for
** separators.
*/
typedef struct AsciiTokenizer {
  unsigned char aTokenChar[128];
} AsciiTokenizer;
41 static void fts5AsciiAddExceptions(
42 AsciiTokenizer *p,
43 const char *zArg,
44 int bTokenChars
46 int i;
47 for(i=0; zArg[i]; i++){
48 if( (zArg[i] & 0x80)==0 ){
49 p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars;
55 ** Delete a "ascii" tokenizer.
57 static void fts5AsciiDelete(Fts5Tokenizer *p){
58 sqlite3_free(p);
62 ** Create an "ascii" tokenizer.
64 static int fts5AsciiCreate(
65 void *pUnused,
66 const char **azArg, int nArg,
67 Fts5Tokenizer **ppOut
69 int rc = SQLITE_OK;
70 AsciiTokenizer *p = 0;
71 UNUSED_PARAM(pUnused);
72 if( nArg%2 ){
73 rc = SQLITE_ERROR;
74 }else{
75 p = sqlite3_malloc(sizeof(AsciiTokenizer));
76 if( p==0 ){
77 rc = SQLITE_NOMEM;
78 }else{
79 int i;
80 memset(p, 0, sizeof(AsciiTokenizer));
81 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
82 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
83 const char *zArg = azArg[i+1];
84 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
85 fts5AsciiAddExceptions(p, zArg, 1);
86 }else
87 if( 0==sqlite3_stricmp(azArg[i], "separators") ){
88 fts5AsciiAddExceptions(p, zArg, 0);
89 }else{
90 rc = SQLITE_ERROR;
93 if( rc!=SQLITE_OK ){
94 fts5AsciiDelete((Fts5Tokenizer*)p);
95 p = 0;
100 *ppOut = (Fts5Tokenizer*)p;
101 return rc;
/*
** Copy nByte bytes from aIn[] to aOut[], converting 'A'..'Z' to
** 'a'..'z'. All other bytes are copied unchanged.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int iByte = 0;
  while( iByte<nByte ){
    char ch = aIn[iByte];
    if( ch>='A' && ch<='Z' ){
      ch += 32;
    }
    aOut[iByte] = ch;
    iByte++;
  }
}
115 ** Tokenize some text using the ascii tokenizer.
117 static int fts5AsciiTokenize(
118 Fts5Tokenizer *pTokenizer,
119 void *pCtx,
120 int iUnused,
121 const char *pText, int nText,
122 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
124 AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
125 int rc = SQLITE_OK;
126 int ie;
127 int is = 0;
129 char aFold[64];
130 int nFold = sizeof(aFold);
131 char *pFold = aFold;
132 unsigned char *a = p->aTokenChar;
134 UNUSED_PARAM(iUnused);
136 while( is<nText && rc==SQLITE_OK ){
137 int nByte;
139 /* Skip any leading divider characters. */
140 while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
141 is++;
143 if( is==nText ) break;
145 /* Count the token characters */
146 ie = is+1;
147 while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
148 ie++;
151 /* Fold to lower case */
152 nByte = ie-is;
153 if( nByte>nFold ){
154 if( pFold!=aFold ) sqlite3_free(pFold);
155 pFold = sqlite3_malloc64((sqlite3_int64)nByte*2);
156 if( pFold==0 ){
157 rc = SQLITE_NOMEM;
158 break;
160 nFold = nByte*2;
162 asciiFold(pFold, &pText[is], nByte);
164 /* Invoke the token callback */
165 rc = xToken(pCtx, 0, pFold, nByte, is, ie);
166 is = ie+1;
169 if( pFold!=aFold ) sqlite3_free(pFold);
170 if( rc==SQLITE_DONE ) rc = SQLITE_OK;
171 return rc;
174 /**************************************************************************
175 ** Start of unicode61 tokenizer implementation.
180 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
181 ** from the sqlite3 source file utf.c. If this file is compiled as part
182 ** of the amalgamation, they are not required.
#ifndef SQLITE_AMALGAMATION

/*
** Lookup table giving the payload bits of a UTF-8 lead byte, indexed by
** (lead byte - 0xC0).
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,

  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,

  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,

  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

/*
** Decode one character starting at zIn into c, leaving zIn pointing at
** the byte that follows it. Continuation bytes are consumed up to, but
** not including, zTerm. Values below 0x80 produced by a multi-byte
** sequence, surrogates and 0xFFFE/0xFFFF are replaced by 0xFFFD.
*/
#define READ_UTF8(zIn, zTerm, c)                  \
  c = *(zIn++);                                   \
  if( c>=0xc0 ){                                  \
    c = sqlite3Utf8Trans1[c-0xc0];                \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){   \
      c = (c<<6) + (0x3f & *(zIn++));             \
    }                                             \
    if( c<0x80                                    \
        || (c&0xFFFFF800)==0xD800                 \
        || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \
  }

/*
** Append the UTF-8 encoding of codepoint c (one to four bytes) at zOut,
** advancing zOut past the bytes written.
*/
#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (unsigned char)(c&0xFF);                 \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);     \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
  else if( c<0x10000 ){                                \
    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);    \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }else{                                               \
    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);  \
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);  \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
}

#endif /* ifndef SQLITE_AMALGAMATION */
/*
** Advance zIn past one UTF-8 character: the first byte is always
** consumed and, if it is 0xC0 or greater, so are any continuation bytes
** (10xxxxxx) that follow it.
*/
#define FTS5_SKIP_UTF8(zIn) {                                  \
  if( ((unsigned char)(*(zIn++)))>=0xc0 ){                     \
    while( (((unsigned char)*zIn) & 0xc0)==0x80 ){ zIn++; }    \
  }                                                            \
}
/*
** State of one "unicode61" tokenizer instance.
*/
typedef struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int eRemoveDiacritic;           /* An FTS5_REMOVE_DIACRITICS_XXX value */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Sorted array of exception codepoints */

  unsigned char aCategory[32];    /* True for token char categories */
} Unicode61Tokenizer;
/*
** Legal values for Unicode61Tokenizer.eRemoveDiacritic. These must match
** the internals of fts5_unicode2.c.
*/
#define FTS5_REMOVE_DIACRITICS_NONE     0
#define FTS5_REMOVE_DIACRITICS_SIMPLE   1
#define FTS5_REMOVE_DIACRITICS_COMPLEX  2
255 static int fts5UnicodeAddExceptions(
256 Unicode61Tokenizer *p, /* Tokenizer object */
257 const char *z, /* Characters to treat as exceptions */
258 int bTokenChars /* 1 for 'tokenchars', 0 for 'separators' */
260 int rc = SQLITE_OK;
261 int n = (int)strlen(z);
262 int *aNew;
264 if( n>0 ){
265 aNew = (int*)sqlite3_realloc64(p->aiException,
266 (n+p->nException)*sizeof(int));
267 if( aNew ){
268 int nNew = p->nException;
269 const unsigned char *zCsr = (const unsigned char*)z;
270 const unsigned char *zTerm = (const unsigned char*)&z[n];
271 while( zCsr<zTerm ){
272 u32 iCode;
273 int bToken;
274 READ_UTF8(zCsr, zTerm, iCode);
275 if( iCode<128 ){
276 p->aTokenChar[iCode] = (unsigned char)bTokenChars;
277 }else{
278 bToken = p->aCategory[sqlite3Fts5UnicodeCategory(iCode)];
279 assert( (bToken==0 || bToken==1) );
280 assert( (bTokenChars==0 || bTokenChars==1) );
281 if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
282 int i;
283 for(i=0; i<nNew; i++){
284 if( (u32)aNew[i]>iCode ) break;
286 memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
287 aNew[i] = iCode;
288 nNew++;
292 p->aiException = aNew;
293 p->nException = nNew;
294 }else{
295 rc = SQLITE_NOMEM;
299 return rc;
303 ** Return true if the p->aiException[] array contains the value iCode.
305 static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
306 if( p->nException>0 ){
307 int *a = p->aiException;
308 int iLo = 0;
309 int iHi = p->nException-1;
311 while( iHi>=iLo ){
312 int iTest = (iHi + iLo) / 2;
313 if( iCode==a[iTest] ){
314 return 1;
315 }else if( iCode>a[iTest] ){
316 iLo = iTest+1;
317 }else{
318 iHi = iTest-1;
323 return 0;
327 ** Delete a "unicode61" tokenizer.
329 static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
330 if( pTok ){
331 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
332 sqlite3_free(p->aiException);
333 sqlite3_free(p->aFold);
334 sqlite3_free(p);
336 return;
339 static int unicodeSetCategories(Unicode61Tokenizer *p, const char *zCat){
340 const char *z = zCat;
342 while( *z ){
343 while( *z==' ' || *z=='\t' ) z++;
344 if( *z && sqlite3Fts5UnicodeCatParse(z, p->aCategory) ){
345 return SQLITE_ERROR;
347 while( *z!=' ' && *z!='\t' && *z!='\0' ) z++;
350 sqlite3Fts5UnicodeAscii(p->aCategory, p->aTokenChar);
351 return SQLITE_OK;
355 ** Create a "unicode61" tokenizer.
357 static int fts5UnicodeCreate(
358 void *pUnused,
359 const char **azArg, int nArg,
360 Fts5Tokenizer **ppOut
362 int rc = SQLITE_OK; /* Return code */
363 Unicode61Tokenizer *p = 0; /* New tokenizer object */
365 UNUSED_PARAM(pUnused);
367 if( nArg%2 ){
368 rc = SQLITE_ERROR;
369 }else{
370 p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
371 if( p ){
372 const char *zCat = "L* N* Co";
373 int i;
374 memset(p, 0, sizeof(Unicode61Tokenizer));
376 p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
377 p->nFold = 64;
378 p->aFold = sqlite3_malloc64(p->nFold * sizeof(char));
379 if( p->aFold==0 ){
380 rc = SQLITE_NOMEM;
383 /* Search for a "categories" argument */
384 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
385 if( 0==sqlite3_stricmp(azArg[i], "categories") ){
386 zCat = azArg[i+1];
390 if( rc==SQLITE_OK ){
391 rc = unicodeSetCategories(p, zCat);
394 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
395 const char *zArg = azArg[i+1];
396 if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
397 if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
398 rc = SQLITE_ERROR;
399 }else{
400 p->eRemoveDiacritic = (zArg[0] - '0');
401 assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
402 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
403 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
406 }else
407 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
408 rc = fts5UnicodeAddExceptions(p, zArg, 1);
409 }else
410 if( 0==sqlite3_stricmp(azArg[i], "separators") ){
411 rc = fts5UnicodeAddExceptions(p, zArg, 0);
412 }else
413 if( 0==sqlite3_stricmp(azArg[i], "categories") ){
414 /* no-op */
415 }else{
416 rc = SQLITE_ERROR;
420 }else{
421 rc = SQLITE_NOMEM;
423 if( rc!=SQLITE_OK ){
424 fts5UnicodeDelete((Fts5Tokenizer*)p);
425 p = 0;
427 *ppOut = (Fts5Tokenizer*)p;
429 return rc;
433 ** Return true if, for the purposes of tokenizing with the tokenizer
434 ** passed as the first argument, codepoint iCode is considered a token
435 ** character (not a separator).
437 static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
438 return (
439 p->aCategory[sqlite3Fts5UnicodeCategory((u32)iCode)]
440 ^ fts5UnicodeIsException(p, iCode)
/* Tokenize pText[0..nText) with the unicode61 tokenizer. Each pass of the
** outer loop skips separators, folds one token into p->aFold and calls
** xToken with the folded bytes and the token's start/end byte offsets
** (is, ie) in pText. Returns SQLITE_NOMEM if the fold buffer cannot be
** enlarged; otherwise the xToken result, with SQLITE_DONE mapped to
** SQLITE_OK. */
444 static int fts5UnicodeTokenize(
445 Fts5Tokenizer *pTokenizer,
446 void *pCtx,
447 int iUnused,
448 const char *pText, int nText,
449 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
451 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
452 int rc = SQLITE_OK;
453 unsigned char *a = p->aTokenChar;
455 unsigned char *zTerm = (unsigned char*)&pText[nText];
456 unsigned char *zCsr = (unsigned char *)pText;
458 /* Output buffer */
459 char *aFold = p->aFold;
460 int nFold = p->nFold;
/* pEnd sits 6 bytes before the end of the buffer; once zOut passes it the
** buffer is enlarged before anything more is written. */
461 const char *pEnd = &aFold[nFold-6];
463 UNUSED_PARAM(iUnused);
465 /* Each iteration of this loop gobbles up a contiguous run of separators,
466 ** then the next token. */
467 while( rc==SQLITE_OK ){
468 u32 iCode; /* non-ASCII codepoint read from input */
469 char *zOut = aFold;
470 int is;
471 int ie;
473 /* Skip any separator characters. */
474 while( 1 ){
475 if( zCsr>=zTerm ) goto tokenize_done;
476 if( *zCsr & 0x80 ) {
477 /* A character outside of the ascii range. Skip past it if it is
478 ** a separator character. Or break out of the loop if it is not. */
479 is = zCsr - (unsigned char*)pText;
480 READ_UTF8(zCsr, zTerm, iCode);
481 if( fts5UnicodeIsAlnum(p, iCode) ){
/* Jumps into the body of the token loop below, past its buffer-size
** check, with iCode already decoded and zCsr already advanced. */
482 goto non_ascii_tokenchar;
484 }else{
485 if( a[*zCsr] ){
486 is = zCsr - (unsigned char*)pText;
/* Likewise jumps into the token loop; zCsr still points at the char. */
487 goto ascii_tokenchar;
489 zCsr++;
493 /* Run through the tokenchars. Fold them into the output buffer along
494 ** the way. */
495 while( zCsr<zTerm ){
497 /* Grow the output buffer so that there is sufficient space to fit the
498 ** largest possible utf-8 character. */
499 if( zOut>pEnd ){
/* Double the buffer, copy the bytes folded so far, and store the new
** buffer and size back into the tokenizer object. */
500 aFold = sqlite3_malloc64((sqlite3_int64)nFold*2);
501 if( aFold==0 ){
502 rc = SQLITE_NOMEM;
503 goto tokenize_done;
505 zOut = &aFold[zOut - p->aFold];
506 memcpy(aFold, p->aFold, nFold);
507 sqlite3_free(p->aFold);
508 p->aFold = aFold;
509 p->nFold = nFold = nFold*2;
510 pEnd = &aFold[nFold-6];
513 if( *zCsr & 0x80 ){
514 /* An non-ascii-range character. Fold it into the output buffer if
515 ** it is a token character, or break out of the loop if it is not. */
516 READ_UTF8(zCsr, zTerm, iCode);
517 if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
518 non_ascii_tokenchar:
519 iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
/* A zero result from the fold writes nothing to the output. */
520 if( iCode ) WRITE_UTF8(zOut, iCode);
521 }else{
522 break;
524 }else if( a[*zCsr]==0 ){
525 /* An ascii-range separator character. End of token. */
526 break;
527 }else{
528 ascii_tokenchar:
529 if( *zCsr>='A' && *zCsr<='Z' ){
530 *zOut++ = *zCsr + 32;
531 }else{
532 *zOut++ = *zCsr;
534 zCsr++;
536 ie = zCsr - (unsigned char*)pText;
539 /* Invoke the token callback */
540 rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
543 tokenize_done:
544 if( rc==SQLITE_DONE ) rc = SQLITE_OK;
545 return rc;
548 /**************************************************************************
549 ** Start of porter stemmer implementation.
552 /* Any tokens larger than this (in bytes) are passed through without
553 ** stemming. */
554 #define FTS5_PORTER_MAX_TOKEN 64
556 typedef struct PorterTokenizer PorterTokenizer;
557 struct PorterTokenizer {
558 fts5_tokenizer tokenizer; /* Parent tokenizer module */
559 Fts5Tokenizer *pTokenizer; /* Parent tokenizer instance */
560 char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
564 ** Delete a "porter" tokenizer.
566 static void fts5PorterDelete(Fts5Tokenizer *pTok){
567 if( pTok ){
568 PorterTokenizer *p = (PorterTokenizer*)pTok;
569 if( p->pTokenizer ){
570 p->tokenizer.xDelete(p->pTokenizer);
572 sqlite3_free(p);
577 ** Create a "porter" tokenizer.
579 static int fts5PorterCreate(
580 void *pCtx,
581 const char **azArg, int nArg,
582 Fts5Tokenizer **ppOut
584 fts5_api *pApi = (fts5_api*)pCtx;
585 int rc = SQLITE_OK;
586 PorterTokenizer *pRet;
587 void *pUserdata = 0;
588 const char *zBase = "unicode61";
590 if( nArg>0 ){
591 zBase = azArg[0];
594 pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
595 if( pRet ){
596 memset(pRet, 0, sizeof(PorterTokenizer));
597 rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer);
598 }else{
599 rc = SQLITE_NOMEM;
601 if( rc==SQLITE_OK ){
602 int nArg2 = (nArg>0 ? nArg-1 : 0);
603 const char **azArg2 = (nArg2 ? &azArg[1] : 0);
604 rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer);
607 if( rc!=SQLITE_OK ){
608 fts5PorterDelete((Fts5Tokenizer*)pRet);
609 pRet = 0;
611 *ppOut = (Fts5Tokenizer*)pRet;
612 return rc;
/*
** Context handed to fts5PorterCb() while the parent tokenizer runs.
*/
typedef struct PorterContext {
  void *pCtx;                                        /* Caller's context */
  int (*xToken)(void*, int, const char*, int, int, int);  /* Caller's callback */
  char *aBuf;                                        /* Stemming buffer */
} PorterContext;
/*
** One suffix-rewriting rule: if the token ends with zSuffix and xCond
** (when not NULL) accepts the stem, the suffix is replaced by zOutput.
*/
typedef struct PorterRule {
  const char *zSuffix;                    /* Suffix to match */
  int nSuffix;                            /* Length of zSuffix in bytes */
  int (*xCond)(char *zStem, int nStem);   /* Condition on the stem, or NULL */
  const char *zOutput;                    /* Replacement text */
  int nOutput;                            /* Length of zOutput in bytes */
} PorterRule;
#if 0
/*
** Disabled code. Apply the first rule in aRule[] (terminated by an entry
** with zSuffix==NULL) whose suffix matches the end of aBuf[0..*pnBuf).
** If that rule has no condition, or its condition accepts the stem, the
** suffix is replaced and *pnBuf updated. Returns the index of the rule
** applied, or -1 if none was.
*/
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  const int nBuf = *pnBuf;
  PorterRule *pRule = aRule;
  int nStem;

  while( pRule->zSuffix ){
    assert( strlen(pRule->zSuffix)==pRule->nSuffix );
    assert( strlen(pRule->zOutput)==pRule->nOutput );
    if( nBuf>=pRule->nSuffix
     && 0==memcmp(&aBuf[nBuf - pRule->nSuffix], pRule->zSuffix, pRule->nSuffix)
    ){
      break;
    }
    pRule++;
  }
  if( pRule->zSuffix==0 ) return -1;

  nStem = nBuf - pRule->nSuffix;
  if( pRule->xCond!=0 && !pRule->xCond(aBuf, nStem) ) return -1;

  memcpy(&aBuf[nStem], pRule->zOutput, pRule->nOutput);
  *pnBuf = nStem + pRule->nOutput;
  return pRule - aRule;
}
#endif
/*
** Return 1 if c is one of a, e, i, o, u - or 'y' when bYIsVowel is
** non-zero. Return 0 otherwise.
*/
static int fts5PorterIsVowel(char c, int bYIsVowel){
  switch( c ){
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return 1;
    case 'y':
      return bYIsVowel!=0;
    default:
      return 0;
  }
}
/*
** Scan zStem[0..nStem) for a vowel followed, anywhere later, by a
** consonant. bPrevCons says whether the character preceding zStem[0]
** counts as a consonant, which decides whether a leading 'y' is a vowel.
**
** Returns the offset just past the first consonant found after the first
** vowel, or 0 if there is no such vowel/consonant sequence.
*/
static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
  int bCons = bPrevCons;
  int i = 0;

  /* Find the first vowel. */
  while( i<nStem ){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons==0 ) break;
    i++;
  }

  /* Find the first consonant after that vowel. */
  i++;
  while( i<nStem ){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons ) return i+1;
    i++;
  }
  return 0;
}
/* Porter rule condition (m > 0): the stem contains at least one
** vowel-consonant sequence. */
static int fts5Porter_MGt0(char *zStem, int nStem){
  return fts5PorterGobbleVC(zStem, nStem, 0)!=0;
}
/* Porter rule condition (m > 1): the stem contains at least two
** vowel-consonant sequences. */
static int fts5Porter_MGt1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)!=0;
}
/* Porter rule condition (m = 1): the stem contains exactly one
** vowel-consonant sequence. */
static int fts5Porter_MEq1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)==0;
}
/*
** Porter rule condition (*o): the stem ends consonant-vowel-consonant
** and the final consonant is not 'w', 'x' or 'y'.
**
** Whether a 'y' counts as a vowel depends on the character before it, so
** the stem is classified from its first character. Only the last three
** classifications matter, so the history is truncated to three bits on
** every step. This keeps the left shift well defined for stems longer
** than the width of an int.
*/
static int fts5Porter_Ostar(char *zStem, int nStem){
  if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
    return 0;
  }else{
    int i;
    unsigned int mask = 0;        /* Bit set => consonant; bit 0 is newest */
    int bCons = 0;
    for(i=0; i<nStem; i++){
      bCons = !fts5PorterIsVowel(zStem[i], bCons);
      assert( bCons==0 || bCons==1 );
      mask = ((mask << 1) | (unsigned int)bCons) & 0x0007;
    }
    return (mask==0x0005);
  }
}
/* Porter rule condition (m > 1 and (*S or *T)): the stem ends in 's' or
** 't' and contains at least two vowel-consonant sequences. */
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  char cLast;
  assert( nStem>0 );
  cLast = zStem[nStem-1];
  if( cLast!='s' && cLast!='t' ) return 0;
  return fts5Porter_MGt1(zStem, nStem);
}
/* Porter rule condition (*v*): the stem contains a vowel. A 'y' counts
** as a vowel anywhere except in the first position. */
static int fts5Porter_Vowel(char *zStem, int nStem){
  int iPos = 0;
  while( iPos<nStem ){
    if( fts5PorterIsVowel(zStem[iPos], iPos>0) ) return 1;
    iPos++;
  }
  return 0;
}
740 /**************************************************************************
741 ***************************************************************************
742 ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
/* Generated code (mkportersteps.tcl) - change the generator, not this text.
** Porter step 4: dispatch on aBuf[nBuf-2], then test the suffixes listed for
** that case in order. For the first suffix found at the end of
** aBuf[0..nBuf), *pnBuf is reduced by the suffix length if
** fts5Porter_MGt1() accepts the remaining stem (for "ion",
** fts5Porter_MGt1_and_S_or_T()). Later suffixes in the same case are not
** tried once one has matched. In the lines visible here "ret" is never
** assigned after its initialisation to 0.
** NOTE(review): aBuf[nBuf-2] is read before any length check; presumably
** callers guarantee nBuf>=2 - confirm. */
745 static int fts5PorterStep4(char *aBuf, int *pnBuf){
746 int ret = 0;
747 int nBuf = *pnBuf;
748 switch( aBuf[nBuf-2] ){
750 case 'a':
751 if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
752 if( fts5Porter_MGt1(aBuf, nBuf-2) ){
753 *pnBuf = nBuf - 2;
756 break;
758 case 'c':
759 if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
760 if( fts5Porter_MGt1(aBuf, nBuf-4) ){
761 *pnBuf = nBuf - 4;
763 }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
764 if( fts5Porter_MGt1(aBuf, nBuf-4) ){
765 *pnBuf = nBuf - 4;
768 break;
770 case 'e':
771 if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
772 if( fts5Porter_MGt1(aBuf, nBuf-2) ){
773 *pnBuf = nBuf - 2;
776 break;
778 case 'i':
779 if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
780 if( fts5Porter_MGt1(aBuf, nBuf-2) ){
781 *pnBuf = nBuf - 2;
784 break;
786 case 'l':
787 if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
788 if( fts5Porter_MGt1(aBuf, nBuf-4) ){
789 *pnBuf = nBuf - 4;
791 }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
792 if( fts5Porter_MGt1(aBuf, nBuf-4) ){
793 *pnBuf = nBuf - 4;
796 break;
798 case 'n':
799 if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
800 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
801 *pnBuf = nBuf - 3;
803 }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
804 if( fts5Porter_MGt1(aBuf, nBuf-5) ){
805 *pnBuf = nBuf - 5;
807 }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
808 if( fts5Porter_MGt1(aBuf, nBuf-4) ){
809 *pnBuf = nBuf - 4;
811 }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
812 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
813 *pnBuf = nBuf - 3;
816 break;
818 case 'o':
819 if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
820 if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
821 *pnBuf = nBuf - 3;
823 }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
824 if( fts5Porter_MGt1(aBuf, nBuf-2) ){
825 *pnBuf = nBuf - 2;
828 break;
830 case 's':
831 if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
832 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
833 *pnBuf = nBuf - 3;
836 break;
838 case 't':
839 if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
840 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
841 *pnBuf = nBuf - 3;
843 }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
844 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
845 *pnBuf = nBuf - 3;
848 break;
850 case 'u':
851 if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
852 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
853 *pnBuf = nBuf - 3;
856 break;
858 case 'v':
859 if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
860 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
861 *pnBuf = nBuf - 3;
864 break;
866 case 'z':
867 if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
868 if( fts5Porter_MGt1(aBuf, nBuf-3) ){
869 *pnBuf = nBuf - 3;
872 break;
875 return ret;
/* Generated code (mkportersteps.tcl) - change the generator, not this text.
** Porter step 1b, second part: if aBuf[0..nBuf) is longer than two bytes
** and ends in "at", "bl" or "iz", an 'e' is appended (giving "ate", "ble"
** or "ize"), *pnBuf grows by one and 1 is returned. Otherwise the buffer
** is left alone and 0 is returned.
** NOTE(review): aBuf[nBuf-2] is read before any length check; presumably
** callers guarantee nBuf>=2 - confirm. */
879 static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
880 int ret = 0;
881 int nBuf = *pnBuf;
882 switch( aBuf[nBuf-2] ){
884 case 'a':
885 if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
886 memcpy(&aBuf[nBuf-2], "ate", 3);
887 *pnBuf = nBuf - 2 + 3;
888 ret = 1;
890 break;
892 case 'b':
893 if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
894 memcpy(&aBuf[nBuf-2], "ble", 3);
895 *pnBuf = nBuf - 2 + 3;
896 ret = 1;
898 break;
900 case 'i':
901 if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
902 memcpy(&aBuf[nBuf-2], "ize", 3);
903 *pnBuf = nBuf - 2 + 3;
904 ret = 1;
906 break;
909 return ret;
/* Generated code (mkportersteps.tcl) - change the generator, not this text.
** Porter step 2: dispatch on aBuf[nBuf-2], then test the suffixes listed for
** that case in order. For the first suffix found at the end of
** aBuf[0..nBuf), the suffix is overwritten with its replacement and *pnBuf
** adjusted, provided fts5Porter_MGt0() accepts the stem that precedes it.
** Later suffixes in the same case are not tried once one has matched. In
** the lines visible here "ret" is never assigned after its initialisation
** to 0.
** NOTE(review): aBuf[nBuf-2] is read before any length check; presumably
** callers guarantee nBuf>=2 - confirm. */
913 static int fts5PorterStep2(char *aBuf, int *pnBuf){
914 int ret = 0;
915 int nBuf = *pnBuf;
916 switch( aBuf[nBuf-2] ){
918 case 'a':
919 if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
920 if( fts5Porter_MGt0(aBuf, nBuf-7) ){
921 memcpy(&aBuf[nBuf-7], "ate", 3);
922 *pnBuf = nBuf - 7 + 3;
924 }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
925 if( fts5Porter_MGt0(aBuf, nBuf-6) ){
926 memcpy(&aBuf[nBuf-6], "tion", 4);
927 *pnBuf = nBuf - 6 + 4;
930 break;
932 case 'c':
933 if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
934 if( fts5Porter_MGt0(aBuf, nBuf-4) ){
935 memcpy(&aBuf[nBuf-4], "ence", 4);
936 *pnBuf = nBuf - 4 + 4;
938 }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
939 if( fts5Porter_MGt0(aBuf, nBuf-4) ){
940 memcpy(&aBuf[nBuf-4], "ance", 4);
941 *pnBuf = nBuf - 4 + 4;
944 break;
946 case 'e':
947 if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
948 if( fts5Porter_MGt0(aBuf, nBuf-4) ){
949 memcpy(&aBuf[nBuf-4], "ize", 3);
950 *pnBuf = nBuf - 4 + 3;
953 break;
955 case 'g':
956 if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
957 if( fts5Porter_MGt0(aBuf, nBuf-4) ){
958 memcpy(&aBuf[nBuf-4], "log", 3);
959 *pnBuf = nBuf - 4 + 3;
962 break;
964 case 'l':
965 if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
966 if( fts5Porter_MGt0(aBuf, nBuf-3) ){
967 memcpy(&aBuf[nBuf-3], "ble", 3);
968 *pnBuf = nBuf - 3 + 3;
970 }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
971 if( fts5Porter_MGt0(aBuf, nBuf-4) ){
972 memcpy(&aBuf[nBuf-4], "al", 2);
973 *pnBuf = nBuf - 4 + 2;
975 }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
976 if( fts5Porter_MGt0(aBuf, nBuf-5) ){
977 memcpy(&aBuf[nBuf-5], "ent", 3);
978 *pnBuf = nBuf - 5 + 3;
980 }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
981 if( fts5Porter_MGt0(aBuf, nBuf-3) ){
982 memcpy(&aBuf[nBuf-3], "e", 1);
983 *pnBuf = nBuf - 3 + 1;
985 }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
986 if( fts5Porter_MGt0(aBuf, nBuf-5) ){
987 memcpy(&aBuf[nBuf-5], "ous", 3);
988 *pnBuf = nBuf - 5 + 3;
991 break;
993 case 'o':
994 if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
995 if( fts5Porter_MGt0(aBuf, nBuf-7) ){
996 memcpy(&aBuf[nBuf-7], "ize", 3);
997 *pnBuf = nBuf - 7 + 3;
999 }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
1000 if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1001 memcpy(&aBuf[nBuf-5], "ate", 3);
1002 *pnBuf = nBuf - 5 + 3;
1004 }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
1005 if( fts5Porter_MGt0(aBuf, nBuf-4) ){
1006 memcpy(&aBuf[nBuf-4], "ate", 3);
1007 *pnBuf = nBuf - 4 + 3;
1010 break;
1012 case 's':
1013 if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
1014 if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1015 memcpy(&aBuf[nBuf-5], "al", 2);
1016 *pnBuf = nBuf - 5 + 2;
1018 }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
1019 if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1020 memcpy(&aBuf[nBuf-7], "ive", 3);
1021 *pnBuf = nBuf - 7 + 3;
1023 }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
1024 if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1025 memcpy(&aBuf[nBuf-7], "ful", 3);
1026 *pnBuf = nBuf - 7 + 3;
1028 }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
1029 if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1030 memcpy(&aBuf[nBuf-7], "ous", 3);
1031 *pnBuf = nBuf - 7 + 3;
1034 break;
1036 case 't':
1037 if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
1038 if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1039 memcpy(&aBuf[nBuf-5], "al", 2);
1040 *pnBuf = nBuf - 5 + 2;
1042 }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
1043 if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1044 memcpy(&aBuf[nBuf-5], "ive", 3);
1045 *pnBuf = nBuf - 5 + 3;
1047 }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
1048 if( fts5Porter_MGt0(aBuf, nBuf-6) ){
1049 memcpy(&aBuf[nBuf-6], "ble", 3);
1050 *pnBuf = nBuf - 6 + 3;
1053 break;
1056 return ret;
/* Generated code (mkportersteps.tcl) - change the generator, not this text.
** Porter step 3: dispatch on aBuf[nBuf-2], then test the suffixes listed for
** that case in order. For the first suffix found at the end of
** aBuf[0..nBuf), the suffix is replaced ("ical", "icate", "iciti" -> "ic";
** "alize" -> "al") or removed ("ness", "ful", "ative"), provided
** fts5Porter_MGt0() accepts the stem that precedes it. In the lines
** visible here "ret" is never assigned after its initialisation to 0.
** NOTE(review): aBuf[nBuf-2] is read before any length check; presumably
** callers guarantee nBuf>=2 - confirm. */
1060 static int fts5PorterStep3(char *aBuf, int *pnBuf){
1061 int ret = 0;
1062 int nBuf = *pnBuf;
1063 switch( aBuf[nBuf-2] ){
1065 case 'a':
1066 if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
1067 if( fts5Porter_MGt0(aBuf, nBuf-4) ){
1068 memcpy(&aBuf[nBuf-4], "ic", 2);
1069 *pnBuf = nBuf - 4 + 2;
1072 break;
1074 case 's':
1075 if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
1076 if( fts5Porter_MGt0(aBuf, nBuf-4) ){
1077 *pnBuf = nBuf - 4;
1080 break;
1082 case 't':
1083 if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
1084 if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1085 memcpy(&aBuf[nBuf-5], "ic", 2);
1086 *pnBuf = nBuf - 5 + 2;
1088 }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
1089 if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1090 memcpy(&aBuf[nBuf-5], "ic", 2);
1091 *pnBuf = nBuf - 5 + 2;
1094 break;
1096 case 'u':
1097 if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
1098 if( fts5Porter_MGt0(aBuf, nBuf-3) ){
1099 *pnBuf = nBuf - 3;
1102 break;
1104 case 'v':
1105 if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
1106 if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1107 *pnBuf = nBuf - 5;
1110 break;
1112 case 'z':
1113 if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
1114 if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1115 memcpy(&aBuf[nBuf-5], "al", 2);
1116 *pnBuf = nBuf - 5 + 2;
1119 break;
1122 return ret;
/* Generated code (mkportersteps.tcl) - change the generator, not this text.
** Porter step 1b, first part. "eed" becomes "ee" if fts5Porter_MGt0()
** accepts the preceding stem. Otherwise "ed", and likewise "ing", is
** removed if the preceding stem contains a vowel (fts5Porter_Vowel()).
** Only the removal of "ed" or "ing" sets the return value to 1; every
** other path returns 0.
** NOTE(review): aBuf[nBuf-2] is read before any length check; presumably
** callers guarantee nBuf>=2 - confirm. */
1126 static int fts5PorterStep1B(char *aBuf, int *pnBuf){
1127 int ret = 0;
1128 int nBuf = *pnBuf;
1129 switch( aBuf[nBuf-2] ){
1131 case 'e':
1132 if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){
1133 if( fts5Porter_MGt0(aBuf, nBuf-3) ){
1134 memcpy(&aBuf[nBuf-3], "ee", 2);
1135 *pnBuf = nBuf - 3 + 2;
1137 }else if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){
1138 if( fts5Porter_Vowel(aBuf, nBuf-2) ){
1139 *pnBuf = nBuf - 2;
1140 ret = 1;
1143 break;
1145 case 'n':
1146 if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){
1147 if( fts5Porter_Vowel(aBuf, nBuf-3) ){
1148 *pnBuf = nBuf - 3;
1149 ret = 1;
1152 break;
1155 return ret;
1159 ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
1160 ***************************************************************************
1161 **************************************************************************/
/*
** Porter step 1a, applied to aBuf[0..*pnBuf). Only *pnBuf is modified:
**
**   ...sses, ...ies   (longer than 4 and 3 bytes)   drop 2 bytes
**   ...es             (any other)                   drop 1 byte
**   ...ss                                           unchanged
**   ...s              (any other)                   drop 1 byte
**
** The caller must supply at least two bytes.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  const int nBuf = *pnBuf;
  char cPrev;

  if( aBuf[nBuf-1]!='s' ) return;

  cPrev = aBuf[nBuf-2];
  if( cPrev=='s' ) return;

  if( cPrev=='e' ){
    int bSses = (nBuf>4 && aBuf[nBuf-4]=='s' && aBuf[nBuf-3]=='s');
    int bIes = (nBuf>3 && aBuf[nBuf-3]=='i');
    *pnBuf = nBuf - ((bSses || bIes) ? 2 : 1);
    return;
  }

  *pnBuf = nBuf-1;
}
1181 static int fts5PorterCb(
1182 void *pCtx,
1183 int tflags,
1184 const char *pToken,
1185 int nToken,
1186 int iStart,
1187 int iEnd
1189 PorterContext *p = (PorterContext*)pCtx;
1191 char *aBuf;
1192 int nBuf;
1194 if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
1195 aBuf = p->aBuf;
1196 nBuf = nToken;
1197 memcpy(aBuf, pToken, nBuf);
1199 /* Step 1. */
1200 fts5PorterStep1A(aBuf, &nBuf);
1201 if( fts5PorterStep1B(aBuf, &nBuf) ){
1202 if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
1203 char c = aBuf[nBuf-1];
1204 if( fts5PorterIsVowel(c, 0)==0
1205 && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
1207 nBuf--;
1208 }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
1209 aBuf[nBuf++] = 'e';
1214 /* Step 1C. */
1215 if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
1216 aBuf[nBuf-1] = 'i';
1219 /* Steps 2 through 4. */
1220 fts5PorterStep2(aBuf, &nBuf);
1221 fts5PorterStep3(aBuf, &nBuf);
1222 fts5PorterStep4(aBuf, &nBuf);
1224 /* Step 5a. */
1225 assert( nBuf>0 );
1226 if( aBuf[nBuf-1]=='e' ){
1227 if( fts5Porter_MGt1(aBuf, nBuf-1)
1228 || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
1230 nBuf--;
1234 /* Step 5b. */
1235 if( nBuf>1 && aBuf[nBuf-1]=='l'
1236 && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1)
1238 nBuf--;
1241 return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
1243 pass_through:
1244 return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
1248 ** Tokenize using the porter tokenizer.
1250 static int fts5PorterTokenize(
1251 Fts5Tokenizer *pTokenizer,
1252 void *pCtx,
1253 int flags,
1254 const char *pText, int nText,
1255 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
1257 PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
1258 PorterContext sCtx;
1259 sCtx.xToken = xToken;
1260 sCtx.pCtx = pCtx;
1261 sCtx.aBuf = p->aBuf;
1262 return p->tokenizer.xTokenize(
1263 p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
1267 /**************************************************************************
1268 ** Start of trigram implementation.
/*
** State of one "trigram" tokenizer instance.
*/
typedef struct TrigramTokenizer {
  int bFold;                      /* True to fold to lower-case */
  int iFoldParam;                 /* Parameter to pass to Fts5UnicodeFold() */
} TrigramTokenizer;
1277 ** Free a trigram tokenizer.
1279 static void fts5TriDelete(Fts5Tokenizer *p){
1280 sqlite3_free(p);
1284 ** Allocate a trigram tokenizer.
1286 static int fts5TriCreate(
1287 void *pUnused,
1288 const char **azArg,
1289 int nArg,
1290 Fts5Tokenizer **ppOut
1292 int rc = SQLITE_OK;
1293 TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew));
1294 UNUSED_PARAM(pUnused);
1295 if( pNew==0 ){
1296 rc = SQLITE_NOMEM;
1297 }else{
1298 int i;
1299 pNew->bFold = 1;
1300 pNew->iFoldParam = 0;
1301 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
1302 const char *zArg = azArg[i+1];
1303 if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){
1304 if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
1305 rc = SQLITE_ERROR;
1306 }else{
1307 pNew->bFold = (zArg[0]=='0');
1309 }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
1310 if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
1311 rc = SQLITE_ERROR;
1312 }else{
1313 pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0;
1315 }else{
1316 rc = SQLITE_ERROR;
1320 if( pNew->iFoldParam!=0 && pNew->bFold==0 ){
1321 rc = SQLITE_ERROR;
1324 if( rc!=SQLITE_OK ){
1325 fts5TriDelete((Fts5Tokenizer*)pNew);
1326 pNew = 0;
1329 *ppOut = (Fts5Tokenizer*)pNew;
1330 return rc;
1334 ** Trigram tokenizer tokenize routine.
1336 static int fts5TriTokenize(
1337 Fts5Tokenizer *pTok,
1338 void *pCtx,
1339 int unusedFlags,
1340 const char *pText, int nText,
1341 int (*xToken)(void*, int, const char*, int, int, int)
1343 TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1344 int rc = SQLITE_OK;
1345 char aBuf[32];
1346 char *zOut = aBuf;
1347 int ii;
1348 const unsigned char *zIn = (const unsigned char*)pText;
1349 const unsigned char *zEof = &zIn[nText];
1350 u32 iCode;
1351 int aStart[3]; /* Input offset of each character in aBuf[] */
1353 UNUSED_PARAM(unusedFlags);
1355 /* Populate aBuf[] with the characters for the first trigram. */
1356 for(ii=0; ii<3; ii++){
1357 do {
1358 aStart[ii] = zIn - (const unsigned char*)pText;
1359 READ_UTF8(zIn, zEof, iCode);
1360 if( iCode==0 ) return SQLITE_OK;
1361 if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
1362 }while( iCode==0 );
1363 WRITE_UTF8(zOut, iCode);
1366 /* At the start of each iteration of this loop:
1368 ** aBuf: Contains 3 characters. The 3 characters of the next trigram.
1369 ** zOut: Points to the byte following the last character in aBuf.
1370 ** aStart[3]: Contains the byte offset in the input text corresponding
1371 ** to the start of each of the three characters in the buffer.
1373 assert( zIn<=zEof );
1374 while( 1 ){
1375 int iNext; /* Start of character following current tri */
1376 const char *z1;
1378 /* Read characters from the input up until the first non-diacritic */
1379 do {
1380 iNext = zIn - (const unsigned char*)pText;
1381 READ_UTF8(zIn, zEof, iCode);
1382 if( iCode==0 ) break;
1383 if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
1384 }while( iCode==0 );
1386 /* Pass the current trigram back to fts5 */
1387 rc = xToken(pCtx, 0, aBuf, zOut-aBuf, aStart[0], iNext);
1388 if( iCode==0 || rc!=SQLITE_OK ) break;
1390 /* Remove the first character from buffer aBuf[]. Append the character
1391 ** with codepoint iCode. */
1392 z1 = aBuf;
1393 FTS5_SKIP_UTF8(z1);
1394 memmove(aBuf, z1, zOut - z1);
1395 zOut -= (z1 - aBuf);
1396 WRITE_UTF8(zOut, iCode);
1398 /* Update the aStart[] array */
1399 aStart[0] = aStart[1];
1400 aStart[1] = aStart[2];
1401 aStart[2] = iNext;
1404 return rc;
1408 ** Argument xCreate is a pointer to a constructor function for a tokenizer.
1409 ** pTok is a tokenizer previously created using the same method. This function
1410 ** returns one of FTS5_PATTERN_NONE, FTS5_PATTERN_LIKE or FTS5_PATTERN_GLOB
1411 ** indicating the style of pattern matching that the tokenizer can support.
1412 ** In practice, this is:
1414 ** "trigram" tokenizer, case_sensitive=1 - FTS5_PATTERN_GLOB
1415 ** "trigram" tokenizer, case_sensitive=0 (the default) - FTS5_PATTERN_LIKE
1416 ** all other tokenizers - FTS5_PATTERN_NONE
1418 int sqlite3Fts5TokenizerPattern(
1419 int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
1420 Fts5Tokenizer *pTok
1422 if( xCreate==fts5TriCreate ){
1423 TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1424 if( p->iFoldParam==0 ){
1425 return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB;
1428 return FTS5_PATTERN_NONE;
1432 ** Register all built-in tokenizers with FTS5.
1434 int sqlite3Fts5TokenizerInit(fts5_api *pApi){
1435 struct BuiltinTokenizer {
1436 const char *zName;
1437 fts5_tokenizer x;
1438 } aBuiltin[] = {
1439 { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
1440 { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
1441 { "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
1442 { "trigram", {fts5TriCreate, fts5TriDelete, fts5TriTokenize}},
1445 int rc = SQLITE_OK; /* Return code */
1446 int i; /* To iterate through builtin functions */
1448 for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
1449 rc = pApi->xCreateTokenizer(pApi,
1450 aBuiltin[i].zName,
1451 (void*)pApi,
1452 &aBuiltin[i].x,
1457 return rc;