Snapshot of upstream SQLite 3.46.1
[sqlcipher.git] / ext / fts5 / fts5_tokenize.c
blob2200e78375e3b88cde88e09f0d556d12e6a79dc2
1 /*
2 ** 2014 May 31
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
15 #include "fts5Int.h"
17 /**************************************************************************
18 ** Start of ascii tokenizer implementation.
/*
** Default token-character map used by the "ascii" tokenizer. Entry i is 1
** if ASCII code i is a token character and 0 if it is a separator. Only
** the digits '0'..'9' and the letters 'A'..'Z' and 'a'..'z' are set.
*/
static unsigned char aAsciiTokenChar[128] = {
  /* 0x00..0x0F */  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10..0x1F */  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20..0x2F */  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30..0x3F */  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,
  /* 0x40..0x4F */  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50..0x5F */  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,
  /* 0x60..0x6F */  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70..0x7F */  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0
};
/*
** State of one "ascii" tokenizer instance. aTokenChar[c] is non-zero if
** ASCII character c is part of a token and zero if it is a separator.
*/
typedef struct AsciiTokenizer AsciiTokenizer;
struct AsciiTokenizer {
  unsigned char aTokenChar[128];  /* Token/separator flag per ASCII code */
};
41 static void fts5AsciiAddExceptions(
42 AsciiTokenizer *p,
43 const char *zArg,
44 int bTokenChars
46 int i;
47 for(i=0; zArg[i]; i++){
48 if( (zArg[i] & 0x80)==0 ){
49 p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars;
55 ** Delete a "ascii" tokenizer.
57 static void fts5AsciiDelete(Fts5Tokenizer *p){
58 sqlite3_free(p);
62 ** Create an "ascii" tokenizer.
64 static int fts5AsciiCreate(
65 void *pUnused,
66 const char **azArg, int nArg,
67 Fts5Tokenizer **ppOut
69 int rc = SQLITE_OK;
70 AsciiTokenizer *p = 0;
71 UNUSED_PARAM(pUnused);
72 if( nArg%2 ){
73 rc = SQLITE_ERROR;
74 }else{
75 p = sqlite3_malloc(sizeof(AsciiTokenizer));
76 if( p==0 ){
77 rc = SQLITE_NOMEM;
78 }else{
79 int i;
80 memset(p, 0, sizeof(AsciiTokenizer));
81 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
82 for(i=0; rc==SQLITE_OK && i<nArg-1; i+=2){
83 const char *zArg = azArg[i+1];
84 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
85 fts5AsciiAddExceptions(p, zArg, 1);
86 }else
87 if( 0==sqlite3_stricmp(azArg[i], "separators") ){
88 fts5AsciiAddExceptions(p, zArg, 0);
89 }else{
90 rc = SQLITE_ERROR;
93 if( rc==SQLITE_OK && i<nArg ) rc = SQLITE_ERROR;
94 if( rc!=SQLITE_OK ){
95 fts5AsciiDelete((Fts5Tokenizer*)p);
96 p = 0;
101 *ppOut = (Fts5Tokenizer*)p;
102 return rc;
/*
** Copy nByte bytes from aIn[] to aOut[], converting upper-case ASCII
** letters ('A'..'Z') to lower case. All other bytes are copied unchanged.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int k = 0;
  while( k<nByte ){
    char ch = aIn[k];
    aOut[k] = (ch>='A' && ch<='Z') ? (char)(ch + 32) : ch;
    k++;
  }
}
116 ** Tokenize some text using the ascii tokenizer.
118 static int fts5AsciiTokenize(
119 Fts5Tokenizer *pTokenizer,
120 void *pCtx,
121 int iUnused,
122 const char *pText, int nText,
123 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
125 AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
126 int rc = SQLITE_OK;
127 int ie;
128 int is = 0;
130 char aFold[64];
131 int nFold = sizeof(aFold);
132 char *pFold = aFold;
133 unsigned char *a = p->aTokenChar;
135 UNUSED_PARAM(iUnused);
137 while( is<nText && rc==SQLITE_OK ){
138 int nByte;
140 /* Skip any leading divider characters. */
141 while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
142 is++;
144 if( is==nText ) break;
146 /* Count the token characters */
147 ie = is+1;
148 while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
149 ie++;
152 /* Fold to lower case */
153 nByte = ie-is;
154 if( nByte>nFold ){
155 if( pFold!=aFold ) sqlite3_free(pFold);
156 pFold = sqlite3_malloc64((sqlite3_int64)nByte*2);
157 if( pFold==0 ){
158 rc = SQLITE_NOMEM;
159 break;
161 nFold = nByte*2;
163 asciiFold(pFold, &pText[is], nByte);
165 /* Invoke the token callback */
166 rc = xToken(pCtx, 0, pFold, nByte, is, ie);
167 is = ie+1;
170 if( pFold!=aFold ) sqlite3_free(pFold);
171 if( rc==SQLITE_DONE ) rc = SQLITE_OK;
172 return rc;
175 /**************************************************************************
176 ** Start of unicode61 tokenizer implementation.
181 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
182 ** from the sqlite3 source file utf.c. If this file is compiled as part
183 ** of the amalgamation, they are not required.
185 #ifndef SQLITE_AMALGAMATION
/*
** Lookup table used by READ_UTF8. It is indexed by (lead byte - 0xC0)
** and yields the initial value of the code point accumulator for that
** lead byte.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  /* 0xC0..0xC7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xC8..0xCF */  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xD0..0xD7 */  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  /* 0xD8..0xDF */  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* 0xE0..0xE7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xE8..0xEF */  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xF0..0xF7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xF8..0xFF */  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
/*
** Read one UTF-8 encoded character starting at zIn, store its code point
** in c and advance zIn past the bytes consumed. Continuation bytes are
** consumed only while zIn!=zTerm. A multi-byte sequence that decodes to a
** value below 0x80, to a surrogate (0xD800..0xDFFF) or to a value whose
** upper 31 bits equal those of 0xFFFE is replaced by 0xFFFD.
**
** The expansion is a sequence of statements, not a single statement, and
** the arguments are evaluated more than once.
*/
#define READ_UTF8(zIn, zTerm, c)                                   \
  c = *(zIn++);                                                    \
  if( c>=0xc0 ){                                                   \
    c = sqlite3Utf8Trans1[c-0xc0];                                 \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){                    \
      c = (c<<6) + (0x3f & *(zIn++));                              \
    }                                                              \
    if( c<0x80                                                     \
     || (c&0xFFFFF800)==0xD800                                     \
     || (c&0xFFFFFFFE)==0xFFFE                                     \
    ){                                                             \
      c = 0xFFFD;                                                  \
    }                                                              \
  }
/*
** Write the UTF-8 encoding of code point c to the buffer at zOut and
** advance zOut past the 1 to 4 bytes written. The arguments are evaluated
** more than once.
*/
#define WRITE_UTF8(zOut, c) {                                      \
  if( c<0x00080 ){                                                 \
    *zOut++ = (unsigned char)(c&0xFF);                             \
  }else if( c<0x00800 ){                                           \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);                 \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);                    \
  }else if( c<0x10000 ){                                           \
    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);                \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);               \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);                    \
  }else{                                                           \
    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);              \
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);              \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);               \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);                    \
  }                                                                \
}
231 #endif /* ifndef SQLITE_AMALGAMATION */
/*
** Advance pointer zIn past one UTF-8 character: one byte is always
** consumed and, if it is 0xC0 or greater, so are all continuation bytes
** (10xxxxxx) that follow. No end-of-buffer check is made; the scan relies
** on a non-continuation byte following the character.
*/
#define FTS5_SKIP_UTF8(zIn) {                                      \
  if( ((unsigned char)(*(zIn++)))>=0xc0 ){                         \
    while( (((unsigned char)*zIn) & 0xc0)==0x80 ){                 \
      zIn++;                                                       \
    }                                                              \
  }                                                                \
}
/*
** State of one "unicode61" tokenizer instance.
*/
typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* Non-zero for ASCII token characters */
  char *aFold;                    /* Output buffer that tokens are folded into */
  int nFold;                      /* Allocated size of aFold[] in bytes */
  int eRemoveDiacritic;           /* One of the FTS5_REMOVE_DIACRITICS_* values */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Sorted array of exception code points */

  unsigned char aCategory[32];    /* True for token char categories */
};
/*
** Legal values for Unicode61Tokenizer.eRemoveDiacritic. These are parsed
** directly from the digit given to the "remove_diacritics" option and
** must match the values used internally by fts5_unicode2.c.
*/
#define FTS5_REMOVE_DIACRITICS_NONE    0
#define FTS5_REMOVE_DIACRITICS_SIMPLE  1
#define FTS5_REMOVE_DIACRITICS_COMPLEX 2
256 static int fts5UnicodeAddExceptions(
257 Unicode61Tokenizer *p, /* Tokenizer object */
258 const char *z, /* Characters to treat as exceptions */
259 int bTokenChars /* 1 for 'tokenchars', 0 for 'separators' */
261 int rc = SQLITE_OK;
262 int n = (int)strlen(z);
263 int *aNew;
265 if( n>0 ){
266 aNew = (int*)sqlite3_realloc64(p->aiException,
267 (n+p->nException)*sizeof(int));
268 if( aNew ){
269 int nNew = p->nException;
270 const unsigned char *zCsr = (const unsigned char*)z;
271 const unsigned char *zTerm = (const unsigned char*)&z[n];
272 while( zCsr<zTerm ){
273 u32 iCode;
274 int bToken;
275 READ_UTF8(zCsr, zTerm, iCode);
276 if( iCode<128 ){
277 p->aTokenChar[iCode] = (unsigned char)bTokenChars;
278 }else{
279 bToken = p->aCategory[sqlite3Fts5UnicodeCategory(iCode)];
280 assert( (bToken==0 || bToken==1) );
281 assert( (bTokenChars==0 || bTokenChars==1) );
282 if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
283 int i;
284 for(i=0; i<nNew; i++){
285 if( (u32)aNew[i]>iCode ) break;
287 memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
288 aNew[i] = iCode;
289 nNew++;
293 p->aiException = aNew;
294 p->nException = nNew;
295 }else{
296 rc = SQLITE_NOMEM;
300 return rc;
304 ** Return true if the p->aiException[] array contains the value iCode.
306 static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
307 if( p->nException>0 ){
308 int *a = p->aiException;
309 int iLo = 0;
310 int iHi = p->nException-1;
312 while( iHi>=iLo ){
313 int iTest = (iHi + iLo) / 2;
314 if( iCode==a[iTest] ){
315 return 1;
316 }else if( iCode>a[iTest] ){
317 iLo = iTest+1;
318 }else{
319 iHi = iTest-1;
324 return 0;
328 ** Delete a "unicode61" tokenizer.
330 static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
331 if( pTok ){
332 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
333 sqlite3_free(p->aiException);
334 sqlite3_free(p->aFold);
335 sqlite3_free(p);
337 return;
340 static int unicodeSetCategories(Unicode61Tokenizer *p, const char *zCat){
341 const char *z = zCat;
343 while( *z ){
344 while( *z==' ' || *z=='\t' ) z++;
345 if( *z && sqlite3Fts5UnicodeCatParse(z, p->aCategory) ){
346 return SQLITE_ERROR;
348 while( *z!=' ' && *z!='\t' && *z!='\0' ) z++;
351 sqlite3Fts5UnicodeAscii(p->aCategory, p->aTokenChar);
352 return SQLITE_OK;
356 ** Create a "unicode61" tokenizer.
358 static int fts5UnicodeCreate(
359 void *pUnused,
360 const char **azArg, int nArg,
361 Fts5Tokenizer **ppOut
363 int rc = SQLITE_OK; /* Return code */
364 Unicode61Tokenizer *p = 0; /* New tokenizer object */
366 UNUSED_PARAM(pUnused);
368 if( nArg%2 ){
369 rc = SQLITE_ERROR;
370 }else{
371 p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
372 if( p ){
373 const char *zCat = "L* N* Co";
374 int i;
375 memset(p, 0, sizeof(Unicode61Tokenizer));
377 p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
378 p->nFold = 64;
379 p->aFold = sqlite3_malloc64(p->nFold * sizeof(char));
380 if( p->aFold==0 ){
381 rc = SQLITE_NOMEM;
384 /* Search for a "categories" argument */
385 for(i=0; rc==SQLITE_OK && i<nArg-1; i+=2){
386 if( 0==sqlite3_stricmp(azArg[i], "categories") ){
387 zCat = azArg[i+1];
390 if( rc==SQLITE_OK ){
391 rc = unicodeSetCategories(p, zCat);
394 for(i=0; rc==SQLITE_OK && i<nArg-1; i+=2){
395 const char *zArg = azArg[i+1];
396 if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
397 if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
398 rc = SQLITE_ERROR;
399 }else{
400 p->eRemoveDiacritic = (zArg[0] - '0');
401 assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
402 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
403 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
406 }else
407 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
408 rc = fts5UnicodeAddExceptions(p, zArg, 1);
409 }else
410 if( 0==sqlite3_stricmp(azArg[i], "separators") ){
411 rc = fts5UnicodeAddExceptions(p, zArg, 0);
412 }else
413 if( 0==sqlite3_stricmp(azArg[i], "categories") ){
414 /* no-op */
415 }else{
416 rc = SQLITE_ERROR;
419 if( i<nArg && rc==SQLITE_OK ) rc = SQLITE_ERROR;
421 }else{
422 rc = SQLITE_NOMEM;
424 if( rc!=SQLITE_OK ){
425 fts5UnicodeDelete((Fts5Tokenizer*)p);
426 p = 0;
428 *ppOut = (Fts5Tokenizer*)p;
430 return rc;
434 ** Return true if, for the purposes of tokenizing with the tokenizer
435 ** passed as the first argument, codepoint iCode is considered a token
436 ** character (not a separator).
438 static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
439 return (
440 p->aCategory[sqlite3Fts5UnicodeCategory((u32)iCode)]
441 ^ fts5UnicodeIsException(p, iCode)
445 static int fts5UnicodeTokenize(
446 Fts5Tokenizer *pTokenizer,
447 void *pCtx,
448 int iUnused,
449 const char *pText, int nText,
450 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
452 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
453 int rc = SQLITE_OK;
454 unsigned char *a = p->aTokenChar;
456 unsigned char *zTerm = (unsigned char*)&pText[nText];
457 unsigned char *zCsr = (unsigned char *)pText;
459 /* Output buffer */
460 char *aFold = p->aFold;
461 int nFold = p->nFold;
462 const char *pEnd = &aFold[nFold-6];
464 UNUSED_PARAM(iUnused);
466 /* Each iteration of this loop gobbles up a contiguous run of separators,
467 ** then the next token. */
468 while( rc==SQLITE_OK ){
469 u32 iCode; /* non-ASCII codepoint read from input */
470 char *zOut = aFold;
471 int is;
472 int ie;
474 /* Skip any separator characters. */
475 while( 1 ){
476 if( zCsr>=zTerm ) goto tokenize_done;
477 if( *zCsr & 0x80 ) {
478 /* A character outside of the ascii range. Skip past it if it is
479 ** a separator character. Or break out of the loop if it is not. */
480 is = zCsr - (unsigned char*)pText;
481 READ_UTF8(zCsr, zTerm, iCode);
482 if( fts5UnicodeIsAlnum(p, iCode) ){
483 goto non_ascii_tokenchar;
485 }else{
486 if( a[*zCsr] ){
487 is = zCsr - (unsigned char*)pText;
488 goto ascii_tokenchar;
490 zCsr++;
494 /* Run through the tokenchars. Fold them into the output buffer along
495 ** the way. */
496 while( zCsr<zTerm ){
498 /* Grow the output buffer so that there is sufficient space to fit the
499 ** largest possible utf-8 character. */
500 if( zOut>pEnd ){
501 aFold = sqlite3_malloc64((sqlite3_int64)nFold*2);
502 if( aFold==0 ){
503 rc = SQLITE_NOMEM;
504 goto tokenize_done;
506 zOut = &aFold[zOut - p->aFold];
507 memcpy(aFold, p->aFold, nFold);
508 sqlite3_free(p->aFold);
509 p->aFold = aFold;
510 p->nFold = nFold = nFold*2;
511 pEnd = &aFold[nFold-6];
514 if( *zCsr & 0x80 ){
515 /* An non-ascii-range character. Fold it into the output buffer if
516 ** it is a token character, or break out of the loop if it is not. */
517 READ_UTF8(zCsr, zTerm, iCode);
518 if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
519 non_ascii_tokenchar:
520 iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
521 if( iCode ) WRITE_UTF8(zOut, iCode);
522 }else{
523 break;
525 }else if( a[*zCsr]==0 ){
526 /* An ascii-range separator character. End of token. */
527 break;
528 }else{
529 ascii_tokenchar:
530 if( *zCsr>='A' && *zCsr<='Z' ){
531 *zOut++ = *zCsr + 32;
532 }else{
533 *zOut++ = *zCsr;
535 zCsr++;
537 ie = zCsr - (unsigned char*)pText;
540 /* Invoke the token callback */
541 rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
544 tokenize_done:
545 if( rc==SQLITE_DONE ) rc = SQLITE_OK;
546 return rc;
549 /**************************************************************************
550 ** Start of porter stemmer implementation.
/*
** Maximum size, in bytes, of a token that the porter tokenizer will stem.
** Larger tokens are passed through unmodified.
*/
#define FTS5_PORTER_MAX_TOKEN 64
557 typedef struct PorterTokenizer PorterTokenizer;
558 struct PorterTokenizer {
559 fts5_tokenizer tokenizer; /* Parent tokenizer module */
560 Fts5Tokenizer *pTokenizer; /* Parent tokenizer instance */
561 char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
565 ** Delete a "porter" tokenizer.
567 static void fts5PorterDelete(Fts5Tokenizer *pTok){
568 if( pTok ){
569 PorterTokenizer *p = (PorterTokenizer*)pTok;
570 if( p->pTokenizer ){
571 p->tokenizer.xDelete(p->pTokenizer);
573 sqlite3_free(p);
578 ** Create a "porter" tokenizer.
580 static int fts5PorterCreate(
581 void *pCtx,
582 const char **azArg, int nArg,
583 Fts5Tokenizer **ppOut
585 fts5_api *pApi = (fts5_api*)pCtx;
586 int rc = SQLITE_OK;
587 PorterTokenizer *pRet;
588 void *pUserdata = 0;
589 const char *zBase = "unicode61";
591 if( nArg>0 ){
592 zBase = azArg[0];
595 pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
596 if( pRet ){
597 memset(pRet, 0, sizeof(PorterTokenizer));
598 rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer);
599 }else{
600 rc = SQLITE_NOMEM;
602 if( rc==SQLITE_OK ){
603 int nArg2 = (nArg>0 ? nArg-1 : 0);
604 const char **azArg2 = (nArg2 ? &azArg[1] : 0);
605 rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer);
608 if( rc!=SQLITE_OK ){
609 fts5PorterDelete((Fts5Tokenizer*)pRet);
610 pRet = 0;
612 *ppOut = (Fts5Tokenizer*)pRet;
613 return rc;
/*
** Context passed to fts5PorterCb() while the parent tokenizer runs. It
** carries the caller's original callback and context pointer, plus the
** buffer in which tokens are stemmed.
*/
typedef struct PorterContext PorterContext;
struct PorterContext {
  void *pCtx;                     /* Context to pass to xToken */
  int (*xToken)(void*, int, const char*, int, int, int);  /* Caller callback */
  char *aBuf;                     /* Scratch buffer for stemming */
};
/*
** One suffix-rewriting rule as consumed by fts5PorterApply(): if a word
** ends in zSuffix and xCond is NULL or returns true for the stem, the
** suffix is replaced by zOutput.
*/
typedef struct PorterRule PorterRule;
struct PorterRule {
  const char *zSuffix;            /* Suffix to match */
  int nSuffix;                    /* Length of zSuffix in bytes */
  int (*xCond)(char *zStem, int nStem);  /* Condition on the stem, or NULL */
  const char *zOutput;            /* Replacement text */
  int nOutput;                    /* Length of zOutput in bytes */
};
#if 0
/*
** Table-driven rule application, currently compiled out. aRule[] is an
** array of rules terminated by an entry with a NULL zSuffix. The first
** rule whose suffix matches the end of the nBuf byte word in aBuf[] is
** selected. If that rule has no condition, or its condition holds for the
** stem, the suffix is replaced, *pnBuf is updated and the index of the
** rule is returned. Otherwise -1 is returned.
*/
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int nBuf = *pnBuf;
  PorterRule *pRule;
  int nStem;

  /* Find the first rule with a matching suffix */
  for(pRule=aRule; pRule->zSuffix; pRule++){
    assert( strlen(pRule->zSuffix)==pRule->nSuffix );
    assert( strlen(pRule->zOutput)==pRule->nOutput );
    if( nBuf>=pRule->nSuffix
     && 0==memcmp(&aBuf[nBuf - pRule->nSuffix], pRule->zSuffix, pRule->nSuffix)
    ){
      break;
    }
  }
  if( pRule->zSuffix==0 ) return -1;

  nStem = nBuf - pRule->nSuffix;
  if( pRule->xCond!=0 && pRule->xCond(aBuf, nStem)==0 ) return -1;

  memcpy(&aBuf[nStem], pRule->zOutput, pRule->nOutput);
  *pnBuf = nStem + pRule->nOutput;
  return pRule - aRule;
}
#endif
/*
** Return 1 if c is one of the lower-case vowels 'a', 'e', 'i', 'o' or
** 'u', or if c is 'y' and bYIsVowel is non-zero. Return 0 otherwise.
*/
static int fts5PorterIsVowel(char c, int bYIsVowel){
  switch( c ){
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return 1;
    case 'y':
      return bYIsVowel!=0;
    default:
      return 0;
  }
}
/*
** Scan the nStem byte string zStem[] for the first vowel and then for the
** first consonant that follows it. If both are found, return the offset
** one past that consonant. Otherwise return 0.
**
** bPrevCons is true if the character before zStem[0] was a consonant. A
** 'y' counts as a vowel only when it follows a consonant.
*/
static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
  int iPos = 0;
  int bCons = bPrevCons;

  /* Scan for a vowel */
  while( iPos<nStem ){
    bCons = !fts5PorterIsVowel(zStem[iPos], bCons);
    if( bCons==0 ) break;
    iPos++;
  }

  /* Scan for a consonant following the vowel */
  for(iPos=iPos+1; iPos<nStem; iPos++){
    bCons = !fts5PorterIsVowel(zStem[iPos], bCons);
    if( bCons ) return iPos+1;
  }
  return 0;
}
/*
** porter rule condition: (m > 0)
**
** Return 1 if the stem contains at least one vowel-consonant sequence,
** or 0 otherwise.
*/
static int fts5Porter_MGt0(char *zStem, int nStem){
  return fts5PorterGobbleVC(zStem, nStem, 0)!=0;
}
/*
** porter rule condition: (m > 1)
**
** Return 1 if the stem contains at least two vowel-consonant sequences,
** or 0 otherwise.
*/
static int fts5Porter_MGt1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)!=0;
}
/*
** porter rule condition: (m = 1)
**
** Return 1 if the stem contains exactly one vowel-consonant sequence,
** or 0 otherwise.
*/
static int fts5Porter_MEq1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)==0;
}
/*
** porter rule condition: (*o)
**
** Return 1 if the stem ends consonant-vowel-consonant and the final
** consonant is not 'w', 'x' or 'y'. Return 0 otherwise. The caller must
** ensure nStem>0.
**
** The consonant/vowel pattern is accumulated one bit per character, with
** only the three low-order bits inspected at the end. An unsigned
** accumulator is used so that shifting out the high-order bits of a long
** stem is well defined.
*/
static int fts5Porter_Ostar(char *zStem, int nStem){
  if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
    return 0;
  }else{
    int i;
    unsigned int mask = 0;
    int bCons = 0;
    for(i=0; i<nStem; i++){
      bCons = !fts5PorterIsVowel(zStem[i], bCons);
      assert( bCons==0 || bCons==1 );
      mask = (mask << 1) | (unsigned int)bCons;
    }
    /* Binary 101: consonant, vowel, consonant */
    return ((mask & 0x0007u)==0x0005u);
  }
}
/*
** porter rule condition: (m > 1 and (*S or *T))
**
** Return non-zero if the stem ends in 's' or 't' and contains at least
** two vowel-consonant sequences. nStem must be greater than zero.
*/
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  char cLast;
  assert( nStem>0 );
  cLast = zStem[nStem-1];
  if( cLast!='s' && cLast!='t' ) return 0;
  return fts5Porter_MGt1(zStem, nStem);
}
/*
** porter rule condition: (*v*)
**
** Return 1 if the stem contains a vowel, or 0 otherwise. A 'y' counts as
** a vowel anywhere except in the first position.
*/
static int fts5Porter_Vowel(char *zStem, int nStem){
  int iPos = 0;
  while( iPos<nStem ){
    if( fts5PorterIsVowel(zStem[iPos], iPos>0) ) return 1;
    iPos++;
  }
  return 0;
}
741 /**************************************************************************
742 ***************************************************************************
743 ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
/*
** Porter stemmer step 4.
**
** If the nBuf byte word in aBuf[] ends with one of the suffixes handled
** below, and the stem preceding the suffix satisfies (m>1) - for "ion"
** the stem must additionally end in 's' or 't' - the suffix is removed by
** reducing *pnBuf. Only the first matching suffix is considered.
**
** Always returns 0.
*/
static int fts5PorterStep4(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* Every rule below requires nBuf>2. Returning early for shorter words
  ** avoids evaluating aBuf[nBuf-2] with a negative index when earlier
  ** steps have reduced the word to a single byte. */
  if( nBuf<3 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'e':
      if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'l':
      if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt1(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'o':
      if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 's':
      if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 't':
      if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'z':
      if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

  }
  return ret;
}
/*
** Porter stemmer step 1B, second part. If the nBuf byte word in aBuf[]
** (longer than two bytes) ends in "at", "bl" or "iz", append an 'e',
** increase *pnBuf by one and return 1. Otherwise leave the word unchanged
** and return 0.
**
** The caller must ensure there is room for one extra byte in aBuf[].
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* Every rule below requires nBuf>2. Returning early for shorter words
  ** avoids evaluating aBuf[nBuf-2] with a negative index when step 1B
  ** has reduced the word to a single byte. */
  if( nBuf<3 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ate", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'b':
      if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ble", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ize", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

  }
  return ret;
}
/*
** Porter stemmer step 2.
**
** If the nBuf byte word in aBuf[] ends with one of the suffixes handled
** below and the stem preceding the suffix satisfies (m>0), the suffix is
** replaced by a shorter form (e.g. "ational" -> "ate") and *pnBuf is
** updated. Only the first matching suffix is considered. No replacement
** is longer than the suffix it replaces.
**
** Always returns 0.
*/
static int fts5PorterStep2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* Every rule below requires nBuf>3. Returning early for words shorter
  ** than three bytes avoids evaluating aBuf[nBuf-2] with a negative index
  ** when earlier steps have reduced the word to a single byte. */
  if( nBuf<3 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ate", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "tion", 4);
          *pnBuf = nBuf - 6 + 4;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ence", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ance", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }
      break;

    case 'e':
      if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ize", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'g':
      if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "log", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'l':
      if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ble", 3);
          *pnBuf = nBuf - 3 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "al", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ent", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "e", 1);
          *pnBuf = nBuf - 3 + 1;
        }
      }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ous", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }
      break;

    case 'o':
      if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ize", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ate", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ate", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 's':
      if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ive", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ful", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ous", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ive", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "ble", 3);
          *pnBuf = nBuf - 6 + 3;
        }
      }
      break;

  }
  return ret;
}
/*
** Porter stemmer step 3.
**
** If the nBuf byte word in aBuf[] ends with one of the suffixes handled
** below and the stem preceding the suffix satisfies (m>0), the suffix is
** shortened or removed (e.g. "icate" -> "ic", "ness" -> "") and *pnBuf is
** updated. Only the first matching suffix is considered.
**
** Always returns 0.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* Every rule below requires nBuf>3. Returning early for words shorter
  ** than three bytes avoids evaluating aBuf[nBuf-2] with a negative index
  ** when earlier steps have reduced the word to a single byte. */
  if( nBuf<3 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ic", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }
      break;

    case 's':
      if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }
      break;

    case 'z':
      if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

  }
  return ret;
}
/*
** Porter stemmer step 1B, first part. The nBuf byte word in aBuf[] is
** handled as follows:
**
**   * ends in "eed": if the stem satisfies (m>0), replace with "ee".
**     Returns 0 either way.
**   * otherwise ends in "ed", or ends in "ing": if the stem contains a
**     vowel, remove the suffix and return 1.
**
** In all other cases the word is unchanged and 0 is returned. A return
** value of 1 tells the caller to go on to the second part of step 1B.
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* Every rule below requires nBuf>2. Returning early for shorter words
  ** also guarantees that aBuf[nBuf-2] is never evaluated with a negative
  ** index. */
  if( nBuf<3 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'e':
      if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ee", 2);
          *pnBuf = nBuf - 3 + 2;
        }
      }else if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_Vowel(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
          ret = 1;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_Vowel(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
          ret = 1;
        }
      }
      break;

  }
  return ret;
}
1160 ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
1161 ***************************************************************************
1162 **************************************************************************/
/*
** Porter stemmer step 1A: plural removal. For the nBuf byte word in
** aBuf[] (nBuf must be at least 2):
**
**   * "sses" -> "ss" and "ies" -> "i"  (two bytes removed)
**   * any other word ending in "es"    (one byte removed)
**   * a word ending in "s" but not "ss" (one byte removed)
**
** Only *pnBuf is modified; the contents of aBuf[] are left untouched.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  const int nBuf = *pnBuf;
  int nDrop;                      /* Number of trailing bytes to remove */

  if( aBuf[nBuf-1]!='s' ) return;

  if( aBuf[nBuf-2]=='e' ){
    int bSses = (nBuf>4 && aBuf[nBuf-4]=='s' && aBuf[nBuf-3]=='s');
    int bIes = (nBuf>3 && aBuf[nBuf-3]=='i');
    nDrop = (bSses || bIes) ? 2 : 1;
  }else if( aBuf[nBuf-2]!='s' ){
    nDrop = 1;
  }else{
    return;                       /* Ends in "ss": leave unchanged */
  }
  *pnBuf = nBuf - nDrop;
}
1182 static int fts5PorterCb(
1183 void *pCtx,
1184 int tflags,
1185 const char *pToken,
1186 int nToken,
1187 int iStart,
1188 int iEnd
1190 PorterContext *p = (PorterContext*)pCtx;
1192 char *aBuf;
1193 int nBuf;
1195 if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
1196 aBuf = p->aBuf;
1197 nBuf = nToken;
1198 memcpy(aBuf, pToken, nBuf);
1200 /* Step 1. */
1201 fts5PorterStep1A(aBuf, &nBuf);
1202 if( fts5PorterStep1B(aBuf, &nBuf) ){
1203 if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
1204 char c = aBuf[nBuf-1];
1205 if( fts5PorterIsVowel(c, 0)==0
1206 && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
1208 nBuf--;
1209 }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
1210 aBuf[nBuf++] = 'e';
1215 /* Step 1C. */
1216 if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
1217 aBuf[nBuf-1] = 'i';
1220 /* Steps 2 through 4. */
1221 fts5PorterStep2(aBuf, &nBuf);
1222 fts5PorterStep3(aBuf, &nBuf);
1223 fts5PorterStep4(aBuf, &nBuf);
1225 /* Step 5a. */
1226 assert( nBuf>0 );
1227 if( aBuf[nBuf-1]=='e' ){
1228 if( fts5Porter_MGt1(aBuf, nBuf-1)
1229 || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
1231 nBuf--;
1235 /* Step 5b. */
1236 if( nBuf>1 && aBuf[nBuf-1]=='l'
1237 && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1)
1239 nBuf--;
1242 return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
1244 pass_through:
1245 return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
1249 ** Tokenize using the porter tokenizer.
1251 static int fts5PorterTokenize(
1252 Fts5Tokenizer *pTokenizer,
1253 void *pCtx,
1254 int flags,
1255 const char *pText, int nText,
1256 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
1258 PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
1259 PorterContext sCtx;
1260 sCtx.xToken = xToken;
1261 sCtx.pCtx = pCtx;
1262 sCtx.aBuf = p->aBuf;
1263 return p->tokenizer.xTokenize(
1264 p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
1268 /**************************************************************************
1269 ** Start of trigram implementation.
/*
** State of one "trigram" tokenizer instance.
*/
typedef struct TrigramTokenizer TrigramTokenizer;
struct TrigramTokenizer {
  int bFold;                      /* True to fold to lower-case */
  int iFoldParam;                 /* Second argument for sqlite3Fts5UnicodeFold() */
};
1278 ** Free a trigram tokenizer.
1280 static void fts5TriDelete(Fts5Tokenizer *p){
1281 sqlite3_free(p);
1285 ** Allocate a trigram tokenizer.
1287 static int fts5TriCreate(
1288 void *pUnused,
1289 const char **azArg,
1290 int nArg,
1291 Fts5Tokenizer **ppOut
1293 int rc = SQLITE_OK;
1294 TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew));
1295 UNUSED_PARAM(pUnused);
1296 if( pNew==0 ){
1297 rc = SQLITE_NOMEM;
1298 }else{
1299 int i;
1300 pNew->bFold = 1;
1301 pNew->iFoldParam = 0;
1302 for(i=0; rc==SQLITE_OK && i<nArg-1; i+=2){
1303 const char *zArg = azArg[i+1];
1304 if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){
1305 if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
1306 rc = SQLITE_ERROR;
1307 }else{
1308 pNew->bFold = (zArg[0]=='0');
1310 }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
1311 if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
1312 rc = SQLITE_ERROR;
1313 }else{
1314 pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0;
1316 }else{
1317 rc = SQLITE_ERROR;
1320 if( i<nArg && rc==SQLITE_OK ) rc = SQLITE_ERROR;
1322 if( pNew->iFoldParam!=0 && pNew->bFold==0 ){
1323 rc = SQLITE_ERROR;
1326 if( rc!=SQLITE_OK ){
1327 fts5TriDelete((Fts5Tokenizer*)pNew);
1328 pNew = 0;
1331 *ppOut = (Fts5Tokenizer*)pNew;
1332 return rc;
1336 ** Trigram tokenizer tokenize routine.
1338 static int fts5TriTokenize(
1339 Fts5Tokenizer *pTok,
1340 void *pCtx,
1341 int unusedFlags,
1342 const char *pText, int nText,
1343 int (*xToken)(void*, int, const char*, int, int, int)
1345 TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1346 int rc = SQLITE_OK;
1347 char aBuf[32];
1348 char *zOut = aBuf;
1349 int ii;
1350 const unsigned char *zIn = (const unsigned char*)pText;
1351 const unsigned char *zEof = &zIn[nText];
1352 u32 iCode;
1353 int aStart[3]; /* Input offset of each character in aBuf[] */
1355 UNUSED_PARAM(unusedFlags);
1357 /* Populate aBuf[] with the characters for the first trigram. */
1358 for(ii=0; ii<3; ii++){
1359 do {
1360 aStart[ii] = zIn - (const unsigned char*)pText;
1361 READ_UTF8(zIn, zEof, iCode);
1362 if( iCode==0 ) return SQLITE_OK;
1363 if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
1364 }while( iCode==0 );
1365 WRITE_UTF8(zOut, iCode);
1368 /* At the start of each iteration of this loop:
1370 ** aBuf: Contains 3 characters. The 3 characters of the next trigram.
1371 ** zOut: Points to the byte following the last character in aBuf.
1372 ** aStart[3]: Contains the byte offset in the input text corresponding
1373 ** to the start of each of the three characters in the buffer.
1375 assert( zIn<=zEof );
1376 while( 1 ){
1377 int iNext; /* Start of character following current tri */
1378 const char *z1;
1380 /* Read characters from the input up until the first non-diacritic */
1381 do {
1382 iNext = zIn - (const unsigned char*)pText;
1383 READ_UTF8(zIn, zEof, iCode);
1384 if( iCode==0 ) break;
1385 if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
1386 }while( iCode==0 );
1388 /* Pass the current trigram back to fts5 */
1389 rc = xToken(pCtx, 0, aBuf, zOut-aBuf, aStart[0], iNext);
1390 if( iCode==0 || rc!=SQLITE_OK ) break;
1392 /* Remove the first character from buffer aBuf[]. Append the character
1393 ** with codepoint iCode. */
1394 z1 = aBuf;
1395 FTS5_SKIP_UTF8(z1);
1396 memmove(aBuf, z1, zOut - z1);
1397 zOut -= (z1 - aBuf);
1398 WRITE_UTF8(zOut, iCode);
1400 /* Update the aStart[] array */
1401 aStart[0] = aStart[1];
1402 aStart[1] = aStart[2];
1403 aStart[2] = iNext;
1406 return rc;
1410 ** Argument xCreate is a pointer to a constructor function for a tokenizer.
1411 ** pTok is a tokenizer previously created using the same method. This function
1412 ** returns one of FTS5_PATTERN_NONE, FTS5_PATTERN_LIKE or FTS5_PATTERN_GLOB
1413 ** indicating the style of pattern matching that the tokenizer can support.
1414 ** In practice, this is:
1416 ** "trigram" tokenizer, case_sensitive=1 - FTS5_PATTERN_GLOB
1417 ** "trigram" tokenizer, case_sensitive=0 (the default) - FTS5_PATTERN_LIKE
1418 ** all other tokenizers - FTS5_PATTERN_NONE
1420 int sqlite3Fts5TokenizerPattern(
1421 int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
1422 Fts5Tokenizer *pTok
1424 if( xCreate==fts5TriCreate ){
1425 TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1426 if( p->iFoldParam==0 ){
1427 return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB;
1430 return FTS5_PATTERN_NONE;
1434 ** Register all built-in tokenizers with FTS5.
1436 int sqlite3Fts5TokenizerInit(fts5_api *pApi){
1437 struct BuiltinTokenizer {
1438 const char *zName;
1439 fts5_tokenizer x;
1440 } aBuiltin[] = {
1441 { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
1442 { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
1443 { "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
1444 { "trigram", {fts5TriCreate, fts5TriDelete, fts5TriTokenize}},
1447 int rc = SQLITE_OK; /* Return code */
1448 int i; /* To iterate through builtin functions */
1450 for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
1451 rc = pApi->xCreateTokenizer(pApi,
1452 aBuiltin[i].zName,
1453 (void*)pApi,
1454 &aBuiltin[i].x,
1459 return rc;