4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
17 /**************************************************************************
18 ** Start of ascii tokenizer implementation.
22 ** For tokenizers with no "unicode" modifier, the set of token characters
23 ** is the same as the set of ASCII range alphanumeric characters.
25 static unsigned char aAsciiTokenChar
[128] = {
26 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00..0x0F */
27 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10..0x1F */
28 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20..0x2F */
29 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 0x30..0x3F */
30 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40..0x4F */
31 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x50..0x5F */
32 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60..0x6F */
33 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x70..0x7F */
36 typedef struct AsciiTokenizer AsciiTokenizer
;
37 struct AsciiTokenizer
{
38 unsigned char aTokenChar
[128];
41 static void fts5AsciiAddExceptions(
47 for(i
=0; zArg
[i
]; i
++){
48 if( (zArg
[i
] & 0x80)==0 ){
49 p
->aTokenChar
[(int)zArg
[i
]] = (unsigned char)bTokenChars
;
55 ** Delete an "ascii" tokenizer.
57 static void fts5AsciiDelete(Fts5Tokenizer
*p
){
62 ** Create an "ascii" tokenizer.
64 static int fts5AsciiCreate(
66 const char **azArg
, int nArg
,
70 AsciiTokenizer
*p
= 0;
71 UNUSED_PARAM(pUnused
);
75 p
= sqlite3_malloc(sizeof(AsciiTokenizer
));
80 memset(p
, 0, sizeof(AsciiTokenizer
));
81 memcpy(p
->aTokenChar
, aAsciiTokenChar
, sizeof(aAsciiTokenChar
));
82 for(i
=0; rc
==SQLITE_OK
&& i
<nArg
; i
+=2){
83 const char *zArg
= azArg
[i
+1];
84 if( 0==sqlite3_stricmp(azArg
[i
], "tokenchars") ){
85 fts5AsciiAddExceptions(p
, zArg
, 1);
87 if( 0==sqlite3_stricmp(azArg
[i
], "separators") ){
88 fts5AsciiAddExceptions(p
, zArg
, 0);
94 fts5AsciiDelete((Fts5Tokenizer
*)p
);
100 *ppOut
= (Fts5Tokenizer
*)p
;
105 static void asciiFold(char *aOut
, const char *aIn
, int nByte
){
107 for(i
=0; i
<nByte
; i
++){
109 if( c
>='A' && c
<='Z' ) c
+= 32;
115 ** Tokenize some text using the ascii tokenizer.
117 static int fts5AsciiTokenize(
118 Fts5Tokenizer
*pTokenizer
,
121 const char *pText
, int nText
,
122 int (*xToken
)(void*, int, const char*, int nToken
, int iStart
, int iEnd
)
124 AsciiTokenizer
*p
= (AsciiTokenizer
*)pTokenizer
;
130 int nFold
= sizeof(aFold
);
132 unsigned char *a
= p
->aTokenChar
;
134 UNUSED_PARAM(iUnused
);
136 while( is
<nText
&& rc
==SQLITE_OK
){
139 /* Skip any leading divider characters. */
140 while( is
<nText
&& ((pText
[is
]&0x80)==0 && a
[(int)pText
[is
]]==0) ){
143 if( is
==nText
) break;
145 /* Count the token characters */
147 while( ie
<nText
&& ((pText
[ie
]&0x80) || a
[(int)pText
[ie
]] ) ){
151 /* Fold to lower case */
154 if( pFold
!=aFold
) sqlite3_free(pFold
);
155 pFold
= sqlite3_malloc64((sqlite3_int64
)nByte
*2);
162 asciiFold(pFold
, &pText
[is
], nByte
);
164 /* Invoke the token callback */
165 rc
= xToken(pCtx
, 0, pFold
, nByte
, is
, ie
);
169 if( pFold
!=aFold
) sqlite3_free(pFold
);
170 if( rc
==SQLITE_DONE
) rc
= SQLITE_OK
;
174 /**************************************************************************
175 ** Start of unicode61 tokenizer implementation.
180 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
181 ** from the sqlite3 source file utf.c. If this file is compiled as part
182 ** of the amalgamation, they are not required.
184 #ifndef SQLITE_AMALGAMATION
186 static const unsigned char sqlite3Utf8Trans1
[] = {
187 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
188 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
189 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
190 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
191 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
192 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
193 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
194 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
197 #define READ_UTF8(zIn, zTerm, c) \
200 c = sqlite3Utf8Trans1[c-0xc0]; \
201 while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){ \
202 c = (c<<6) + (0x3f & *(zIn++)); \
205 || (c&0xFFFFF800)==0xD800 \
206 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \
210 #define WRITE_UTF8(zOut, c) { \
212 *zOut++ = (unsigned char)(c&0xFF); \
214 else if( c<0x00800 ){ \
215 *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F); \
216 *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \
218 else if( c<0x10000 ){ \
219 *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F); \
220 *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F); \
221 *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \
223 *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07); \
224 *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F); \
225 *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F); \
226 *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \
230 #endif /* ifndef SQLITE_AMALGAMATION */
232 typedef struct Unicode61Tokenizer Unicode61Tokenizer
;
233 struct Unicode61Tokenizer
{
234 unsigned char aTokenChar
[128]; /* ASCII range token characters */
235 char *aFold
; /* Buffer to fold text into */
236 int nFold
; /* Size of aFold[] in bytes */
237 int eRemoveDiacritic
; /* One of the FTS5_REMOVE_DIACRITICS_* values (0, 1 or 2) */
241 unsigned char aCategory
[32]; /* True for token char categories */
244 /* Values for eRemoveDiacritic (must match internals of fts5_unicode2.c) */
245 #define FTS5_REMOVE_DIACRITICS_NONE 0
246 #define FTS5_REMOVE_DIACRITICS_SIMPLE 1
247 #define FTS5_REMOVE_DIACRITICS_COMPLEX 2
249 static int fts5UnicodeAddExceptions(
250 Unicode61Tokenizer
*p
, /* Tokenizer object */
251 const char *z
, /* Characters to treat as exceptions */
252 int bTokenChars
/* 1 for 'tokenchars', 0 for 'separators' */
255 int n
= (int)strlen(z
);
259 aNew
= (int*)sqlite3_realloc64(p
->aiException
,
260 (n
+p
->nException
)*sizeof(int));
262 int nNew
= p
->nException
;
263 const unsigned char *zCsr
= (const unsigned char*)z
;
264 const unsigned char *zTerm
= (const unsigned char*)&z
[n
];
268 READ_UTF8(zCsr
, zTerm
, iCode
);
270 p
->aTokenChar
[iCode
] = (unsigned char)bTokenChars
;
272 bToken
= p
->aCategory
[sqlite3Fts5UnicodeCategory(iCode
)];
273 assert( (bToken
==0 || bToken
==1) );
274 assert( (bTokenChars
==0 || bTokenChars
==1) );
275 if( bToken
!=bTokenChars
&& sqlite3Fts5UnicodeIsdiacritic(iCode
)==0 ){
277 for(i
=0; i
<nNew
; i
++){
278 if( (u32
)aNew
[i
]>iCode
) break;
280 memmove(&aNew
[i
+1], &aNew
[i
], (nNew
-i
)*sizeof(int));
286 p
->aiException
= aNew
;
287 p
->nException
= nNew
;
297 ** Return true if the p->aiException[] array contains the value iCode.
299 static int fts5UnicodeIsException(Unicode61Tokenizer
*p
, int iCode
){
300 if( p
->nException
>0 ){
301 int *a
= p
->aiException
;
303 int iHi
= p
->nException
-1;
306 int iTest
= (iHi
+ iLo
) / 2;
307 if( iCode
==a
[iTest
] ){
309 }else if( iCode
>a
[iTest
] ){
321 ** Delete a "unicode61" tokenizer.
323 static void fts5UnicodeDelete(Fts5Tokenizer
*pTok
){
325 Unicode61Tokenizer
*p
= (Unicode61Tokenizer
*)pTok
;
326 sqlite3_free(p
->aiException
);
327 sqlite3_free(p
->aFold
);
333 static int unicodeSetCategories(Unicode61Tokenizer
*p
, const char *zCat
){
334 const char *z
= zCat
;
337 while( *z
==' ' || *z
=='\t' ) z
++;
338 if( *z
&& sqlite3Fts5UnicodeCatParse(z
, p
->aCategory
) ){
341 while( *z
!=' ' && *z
!='\t' && *z
!='\0' ) z
++;
344 sqlite3Fts5UnicodeAscii(p
->aCategory
, p
->aTokenChar
);
349 ** Create a "unicode61" tokenizer.
351 static int fts5UnicodeCreate(
353 const char **azArg
, int nArg
,
354 Fts5Tokenizer
**ppOut
356 int rc
= SQLITE_OK
; /* Return code */
357 Unicode61Tokenizer
*p
= 0; /* New tokenizer object */
359 UNUSED_PARAM(pUnused
);
364 p
= (Unicode61Tokenizer
*)sqlite3_malloc(sizeof(Unicode61Tokenizer
));
366 const char *zCat
= "L* N* Co";
368 memset(p
, 0, sizeof(Unicode61Tokenizer
));
370 p
->eRemoveDiacritic
= FTS5_REMOVE_DIACRITICS_SIMPLE
;
372 p
->aFold
= sqlite3_malloc64(p
->nFold
* sizeof(char));
377 /* Search for a "categories" argument */
378 for(i
=0; rc
==SQLITE_OK
&& i
<nArg
; i
+=2){
379 if( 0==sqlite3_stricmp(azArg
[i
], "categories") ){
385 rc
= unicodeSetCategories(p
, zCat
);
388 for(i
=0; rc
==SQLITE_OK
&& i
<nArg
; i
+=2){
389 const char *zArg
= azArg
[i
+1];
390 if( 0==sqlite3_stricmp(azArg
[i
], "remove_diacritics") ){
391 if( (zArg
[0]!='0' && zArg
[0]!='1' && zArg
[0]!='2') || zArg
[1] ){
394 p
->eRemoveDiacritic
= (zArg
[0] - '0');
395 assert( p
->eRemoveDiacritic
==FTS5_REMOVE_DIACRITICS_NONE
396 || p
->eRemoveDiacritic
==FTS5_REMOVE_DIACRITICS_SIMPLE
397 || p
->eRemoveDiacritic
==FTS5_REMOVE_DIACRITICS_COMPLEX
401 if( 0==sqlite3_stricmp(azArg
[i
], "tokenchars") ){
402 rc
= fts5UnicodeAddExceptions(p
, zArg
, 1);
404 if( 0==sqlite3_stricmp(azArg
[i
], "separators") ){
405 rc
= fts5UnicodeAddExceptions(p
, zArg
, 0);
407 if( 0==sqlite3_stricmp(azArg
[i
], "categories") ){
418 fts5UnicodeDelete((Fts5Tokenizer
*)p
);
421 *ppOut
= (Fts5Tokenizer
*)p
;
427 ** Return true if, for the purposes of tokenizing with the tokenizer
428 ** passed as the first argument, codepoint iCode is considered a token
429 ** character (not a separator).
431 static int fts5UnicodeIsAlnum(Unicode61Tokenizer
*p
, int iCode
){
433 p
->aCategory
[sqlite3Fts5UnicodeCategory((u32
)iCode
)]
434 ^ fts5UnicodeIsException(p
, iCode
)
438 static int fts5UnicodeTokenize(
439 Fts5Tokenizer
*pTokenizer
,
442 const char *pText
, int nText
,
443 int (*xToken
)(void*, int, const char*, int nToken
, int iStart
, int iEnd
)
445 Unicode61Tokenizer
*p
= (Unicode61Tokenizer
*)pTokenizer
;
447 unsigned char *a
= p
->aTokenChar
;
449 unsigned char *zTerm
= (unsigned char*)&pText
[nText
];
450 unsigned char *zCsr
= (unsigned char *)pText
;
453 char *aFold
= p
->aFold
;
454 int nFold
= p
->nFold
;
455 const char *pEnd
= &aFold
[nFold
-6];
457 UNUSED_PARAM(iUnused
);
459 /* Each iteration of this loop gobbles up a contiguous run of separators,
460 ** then the next token. */
461 while( rc
==SQLITE_OK
){
462 u32 iCode
; /* non-ASCII codepoint read from input */
467 /* Skip any separator characters. */
469 if( zCsr
>=zTerm
) goto tokenize_done
;
471 /* A character outside of the ascii range. Skip past it if it is
472 ** a separator character. Or break out of the loop if it is not. */
473 is
= zCsr
- (unsigned char*)pText
;
474 READ_UTF8(zCsr
, zTerm
, iCode
);
475 if( fts5UnicodeIsAlnum(p
, iCode
) ){
476 goto non_ascii_tokenchar
;
480 is
= zCsr
- (unsigned char*)pText
;
481 goto ascii_tokenchar
;
487 /* Run through the tokenchars. Fold them into the output buffer along
491 /* Grow the output buffer so that there is sufficient space to fit the
492 ** largest possible utf-8 character. */
494 aFold
= sqlite3_malloc64((sqlite3_int64
)nFold
*2);
499 zOut
= &aFold
[zOut
- p
->aFold
];
500 memcpy(aFold
, p
->aFold
, nFold
);
501 sqlite3_free(p
->aFold
);
503 p
->nFold
= nFold
= nFold
*2;
504 pEnd
= &aFold
[nFold
-6];
508 /* A non-ascii-range character. Fold it into the output buffer if
509 ** it is a token character, or break out of the loop if it is not. */
510 READ_UTF8(zCsr
, zTerm
, iCode
);
511 if( fts5UnicodeIsAlnum(p
,iCode
)||sqlite3Fts5UnicodeIsdiacritic(iCode
) ){
513 iCode
= sqlite3Fts5UnicodeFold(iCode
, p
->eRemoveDiacritic
);
514 if( iCode
) WRITE_UTF8(zOut
, iCode
);
518 }else if( a
[*zCsr
]==0 ){
519 /* An ascii-range separator character. End of token. */
523 if( *zCsr
>='A' && *zCsr
<='Z' ){
524 *zOut
++ = *zCsr
+ 32;
530 ie
= zCsr
- (unsigned char*)pText
;
533 /* Invoke the token callback */
534 rc
= xToken(pCtx
, 0, aFold
, zOut
-aFold
, is
, ie
);
538 if( rc
==SQLITE_DONE
) rc
= SQLITE_OK
;
542 /**************************************************************************
543 ** Start of porter stemmer implementation.
546 /* Any tokens larger than this (in bytes) are passed through without
548 #define FTS5_PORTER_MAX_TOKEN 64
550 typedef struct PorterTokenizer PorterTokenizer
;
551 struct PorterTokenizer
{
552 fts5_tokenizer tokenizer
; /* Parent tokenizer module */
553 Fts5Tokenizer
*pTokenizer
; /* Parent tokenizer instance */
554 char aBuf
[FTS5_PORTER_MAX_TOKEN
+ 64];
558 ** Delete a "porter" tokenizer.
560 static void fts5PorterDelete(Fts5Tokenizer
*pTok
){
562 PorterTokenizer
*p
= (PorterTokenizer
*)pTok
;
564 p
->tokenizer
.xDelete(p
->pTokenizer
);
571 ** Create a "porter" tokenizer.
573 static int fts5PorterCreate(
575 const char **azArg
, int nArg
,
576 Fts5Tokenizer
**ppOut
578 fts5_api
*pApi
= (fts5_api
*)pCtx
;
580 PorterTokenizer
*pRet
;
582 const char *zBase
= "unicode61";
588 pRet
= (PorterTokenizer
*)sqlite3_malloc(sizeof(PorterTokenizer
));
590 memset(pRet
, 0, sizeof(PorterTokenizer
));
591 rc
= pApi
->xFindTokenizer(pApi
, zBase
, &pUserdata
, &pRet
->tokenizer
);
596 int nArg2
= (nArg
>0 ? nArg
-1 : 0);
597 const char **azArg2
= (nArg2
? &azArg
[1] : 0);
598 rc
= pRet
->tokenizer
.xCreate(pUserdata
, azArg2
, nArg2
, &pRet
->pTokenizer
);
602 fts5PorterDelete((Fts5Tokenizer
*)pRet
);
605 *ppOut
= (Fts5Tokenizer
*)pRet
;
609 typedef struct PorterContext PorterContext
;
610 struct PorterContext
{
612 int (*xToken
)(void*, int, const char*, int, int, int);
616 typedef struct PorterRule PorterRule
;
620 int (*xCond
)(char *zStem
, int nStem
);
626 static int fts5PorterApply(char *aBuf
, int *pnBuf
, PorterRule
*aRule
){
631 for(p
=aRule
; p
->zSuffix
; p
++){
632 assert( strlen(p
->zSuffix
)==p
->nSuffix
);
633 assert( strlen(p
->zOutput
)==p
->nOutput
);
634 if( nBuf
<p
->nSuffix
) continue;
635 if( 0==memcmp(&aBuf
[nBuf
- p
->nSuffix
], p
->zSuffix
, p
->nSuffix
) ) break;
639 int nStem
= nBuf
- p
->nSuffix
;
640 if( p
->xCond
==0 || p
->xCond(aBuf
, nStem
) ){
641 memcpy(&aBuf
[nStem
], p
->zOutput
, p
->nOutput
);
642 *pnBuf
= nStem
+ p
->nOutput
;
651 static int fts5PorterIsVowel(char c
, int bYIsVowel
){
653 c
=='a' || c
=='e' || c
=='i' || c
=='o' || c
=='u' || (bYIsVowel
&& c
=='y')
657 static int fts5PorterGobbleVC(char *zStem
, int nStem
, int bPrevCons
){
659 int bCons
= bPrevCons
;
661 /* Scan for a vowel */
662 for(i
=0; i
<nStem
; i
++){
663 if( 0==(bCons
= !fts5PorterIsVowel(zStem
[i
], bCons
)) ) break;
666 /* Scan for a consonant */
667 for(i
++; i
<nStem
; i
++){
668 if( (bCons
= !fts5PorterIsVowel(zStem
[i
], bCons
)) ) return i
+1;
673 /* porter rule condition: (m > 0) */
674 static int fts5Porter_MGt0(char *zStem
, int nStem
){
675 return !!fts5PorterGobbleVC(zStem
, nStem
, 0);
678 /* porter rule condition: (m > 1) */
679 static int fts5Porter_MGt1(char *zStem
, int nStem
){
681 n
= fts5PorterGobbleVC(zStem
, nStem
, 0);
682 if( n
&& fts5PorterGobbleVC(&zStem
[n
], nStem
-n
, 1) ){
688 /* porter rule condition: (m = 1) */
689 static int fts5Porter_MEq1(char *zStem
, int nStem
){
691 n
= fts5PorterGobbleVC(zStem
, nStem
, 0);
692 if( n
&& 0==fts5PorterGobbleVC(&zStem
[n
], nStem
-n
, 1) ){
698 /* porter rule condition: (*o) */
699 static int fts5Porter_Ostar(char *zStem
, int nStem
){
700 if( zStem
[nStem
-1]=='w' || zStem
[nStem
-1]=='x' || zStem
[nStem
-1]=='y' ){
706 for(i
=0; i
<nStem
; i
++){
707 bCons
= !fts5PorterIsVowel(zStem
[i
], bCons
);
708 assert( bCons
==0 || bCons
==1 );
709 mask
= (mask
<< 1) + bCons
;
711 return ((mask
& 0x0007)==0x0005);
715 /* porter rule condition: (m > 1 and (*S or *T)) */
716 static int fts5Porter_MGt1_and_S_or_T(char *zStem
, int nStem
){
718 return (zStem
[nStem
-1]=='s' || zStem
[nStem
-1]=='t')
719 && fts5Porter_MGt1(zStem
, nStem
);
722 /* porter rule condition: (*v*) */
723 static int fts5Porter_Vowel(char *zStem
, int nStem
){
725 for(i
=0; i
<nStem
; i
++){
726 if( fts5PorterIsVowel(zStem
[i
], i
>0) ){
734 /**************************************************************************
735 ***************************************************************************
736 ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
739 static int fts5PorterStep4(char *aBuf
, int *pnBuf
){
742 switch( aBuf
[nBuf
-2] ){
745 if( nBuf
>2 && 0==memcmp("al", &aBuf
[nBuf
-2], 2) ){
746 if( fts5Porter_MGt1(aBuf
, nBuf
-2) ){
753 if( nBuf
>4 && 0==memcmp("ance", &aBuf
[nBuf
-4], 4) ){
754 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
757 }else if( nBuf
>4 && 0==memcmp("ence", &aBuf
[nBuf
-4], 4) ){
758 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
765 if( nBuf
>2 && 0==memcmp("er", &aBuf
[nBuf
-2], 2) ){
766 if( fts5Porter_MGt1(aBuf
, nBuf
-2) ){
773 if( nBuf
>2 && 0==memcmp("ic", &aBuf
[nBuf
-2], 2) ){
774 if( fts5Porter_MGt1(aBuf
, nBuf
-2) ){
781 if( nBuf
>4 && 0==memcmp("able", &aBuf
[nBuf
-4], 4) ){
782 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
785 }else if( nBuf
>4 && 0==memcmp("ible", &aBuf
[nBuf
-4], 4) ){
786 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
793 if( nBuf
>3 && 0==memcmp("ant", &aBuf
[nBuf
-3], 3) ){
794 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
797 }else if( nBuf
>5 && 0==memcmp("ement", &aBuf
[nBuf
-5], 5) ){
798 if( fts5Porter_MGt1(aBuf
, nBuf
-5) ){
801 }else if( nBuf
>4 && 0==memcmp("ment", &aBuf
[nBuf
-4], 4) ){
802 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
805 }else if( nBuf
>3 && 0==memcmp("ent", &aBuf
[nBuf
-3], 3) ){
806 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
813 if( nBuf
>3 && 0==memcmp("ion", &aBuf
[nBuf
-3], 3) ){
814 if( fts5Porter_MGt1_and_S_or_T(aBuf
, nBuf
-3) ){
817 }else if( nBuf
>2 && 0==memcmp("ou", &aBuf
[nBuf
-2], 2) ){
818 if( fts5Porter_MGt1(aBuf
, nBuf
-2) ){
825 if( nBuf
>3 && 0==memcmp("ism", &aBuf
[nBuf
-3], 3) ){
826 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
833 if( nBuf
>3 && 0==memcmp("ate", &aBuf
[nBuf
-3], 3) ){
834 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
837 }else if( nBuf
>3 && 0==memcmp("iti", &aBuf
[nBuf
-3], 3) ){
838 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
845 if( nBuf
>3 && 0==memcmp("ous", &aBuf
[nBuf
-3], 3) ){
846 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
853 if( nBuf
>3 && 0==memcmp("ive", &aBuf
[nBuf
-3], 3) ){
854 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
861 if( nBuf
>3 && 0==memcmp("ize", &aBuf
[nBuf
-3], 3) ){
862 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
873 static int fts5PorterStep1B2(char *aBuf
, int *pnBuf
){
876 switch( aBuf
[nBuf
-2] ){
879 if( nBuf
>2 && 0==memcmp("at", &aBuf
[nBuf
-2], 2) ){
880 memcpy(&aBuf
[nBuf
-2], "ate", 3);
881 *pnBuf
= nBuf
- 2 + 3;
887 if( nBuf
>2 && 0==memcmp("bl", &aBuf
[nBuf
-2], 2) ){
888 memcpy(&aBuf
[nBuf
-2], "ble", 3);
889 *pnBuf
= nBuf
- 2 + 3;
895 if( nBuf
>2 && 0==memcmp("iz", &aBuf
[nBuf
-2], 2) ){
896 memcpy(&aBuf
[nBuf
-2], "ize", 3);
897 *pnBuf
= nBuf
- 2 + 3;
907 static int fts5PorterStep2(char *aBuf
, int *pnBuf
){
910 switch( aBuf
[nBuf
-2] ){
913 if( nBuf
>7 && 0==memcmp("ational", &aBuf
[nBuf
-7], 7) ){
914 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
915 memcpy(&aBuf
[nBuf
-7], "ate", 3);
916 *pnBuf
= nBuf
- 7 + 3;
918 }else if( nBuf
>6 && 0==memcmp("tional", &aBuf
[nBuf
-6], 6) ){
919 if( fts5Porter_MGt0(aBuf
, nBuf
-6) ){
920 memcpy(&aBuf
[nBuf
-6], "tion", 4);
921 *pnBuf
= nBuf
- 6 + 4;
927 if( nBuf
>4 && 0==memcmp("enci", &aBuf
[nBuf
-4], 4) ){
928 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
929 memcpy(&aBuf
[nBuf
-4], "ence", 4);
930 *pnBuf
= nBuf
- 4 + 4;
932 }else if( nBuf
>4 && 0==memcmp("anci", &aBuf
[nBuf
-4], 4) ){
933 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
934 memcpy(&aBuf
[nBuf
-4], "ance", 4);
935 *pnBuf
= nBuf
- 4 + 4;
941 if( nBuf
>4 && 0==memcmp("izer", &aBuf
[nBuf
-4], 4) ){
942 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
943 memcpy(&aBuf
[nBuf
-4], "ize", 3);
944 *pnBuf
= nBuf
- 4 + 3;
950 if( nBuf
>4 && 0==memcmp("logi", &aBuf
[nBuf
-4], 4) ){
951 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
952 memcpy(&aBuf
[nBuf
-4], "log", 3);
953 *pnBuf
= nBuf
- 4 + 3;
959 if( nBuf
>3 && 0==memcmp("bli", &aBuf
[nBuf
-3], 3) ){
960 if( fts5Porter_MGt0(aBuf
, nBuf
-3) ){
961 memcpy(&aBuf
[nBuf
-3], "ble", 3);
962 *pnBuf
= nBuf
- 3 + 3;
964 }else if( nBuf
>4 && 0==memcmp("alli", &aBuf
[nBuf
-4], 4) ){
965 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
966 memcpy(&aBuf
[nBuf
-4], "al", 2);
967 *pnBuf
= nBuf
- 4 + 2;
969 }else if( nBuf
>5 && 0==memcmp("entli", &aBuf
[nBuf
-5], 5) ){
970 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
971 memcpy(&aBuf
[nBuf
-5], "ent", 3);
972 *pnBuf
= nBuf
- 5 + 3;
974 }else if( nBuf
>3 && 0==memcmp("eli", &aBuf
[nBuf
-3], 3) ){
975 if( fts5Porter_MGt0(aBuf
, nBuf
-3) ){
976 memcpy(&aBuf
[nBuf
-3], "e", 1);
977 *pnBuf
= nBuf
- 3 + 1;
979 }else if( nBuf
>5 && 0==memcmp("ousli", &aBuf
[nBuf
-5], 5) ){
980 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
981 memcpy(&aBuf
[nBuf
-5], "ous", 3);
982 *pnBuf
= nBuf
- 5 + 3;
988 if( nBuf
>7 && 0==memcmp("ization", &aBuf
[nBuf
-7], 7) ){
989 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
990 memcpy(&aBuf
[nBuf
-7], "ize", 3);
991 *pnBuf
= nBuf
- 7 + 3;
993 }else if( nBuf
>5 && 0==memcmp("ation", &aBuf
[nBuf
-5], 5) ){
994 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
995 memcpy(&aBuf
[nBuf
-5], "ate", 3);
996 *pnBuf
= nBuf
- 5 + 3;
998 }else if( nBuf
>4 && 0==memcmp("ator", &aBuf
[nBuf
-4], 4) ){
999 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
1000 memcpy(&aBuf
[nBuf
-4], "ate", 3);
1001 *pnBuf
= nBuf
- 4 + 3;
1007 if( nBuf
>5 && 0==memcmp("alism", &aBuf
[nBuf
-5], 5) ){
1008 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1009 memcpy(&aBuf
[nBuf
-5], "al", 2);
1010 *pnBuf
= nBuf
- 5 + 2;
1012 }else if( nBuf
>7 && 0==memcmp("iveness", &aBuf
[nBuf
-7], 7) ){
1013 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
1014 memcpy(&aBuf
[nBuf
-7], "ive", 3);
1015 *pnBuf
= nBuf
- 7 + 3;
1017 }else if( nBuf
>7 && 0==memcmp("fulness", &aBuf
[nBuf
-7], 7) ){
1018 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
1019 memcpy(&aBuf
[nBuf
-7], "ful", 3);
1020 *pnBuf
= nBuf
- 7 + 3;
1022 }else if( nBuf
>7 && 0==memcmp("ousness", &aBuf
[nBuf
-7], 7) ){
1023 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
1024 memcpy(&aBuf
[nBuf
-7], "ous", 3);
1025 *pnBuf
= nBuf
- 7 + 3;
1031 if( nBuf
>5 && 0==memcmp("aliti", &aBuf
[nBuf
-5], 5) ){
1032 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1033 memcpy(&aBuf
[nBuf
-5], "al", 2);
1034 *pnBuf
= nBuf
- 5 + 2;
1036 }else if( nBuf
>5 && 0==memcmp("iviti", &aBuf
[nBuf
-5], 5) ){
1037 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1038 memcpy(&aBuf
[nBuf
-5], "ive", 3);
1039 *pnBuf
= nBuf
- 5 + 3;
1041 }else if( nBuf
>6 && 0==memcmp("biliti", &aBuf
[nBuf
-6], 6) ){
1042 if( fts5Porter_MGt0(aBuf
, nBuf
-6) ){
1043 memcpy(&aBuf
[nBuf
-6], "ble", 3);
1044 *pnBuf
= nBuf
- 6 + 3;
1054 static int fts5PorterStep3(char *aBuf
, int *pnBuf
){
1057 switch( aBuf
[nBuf
-2] ){
1060 if( nBuf
>4 && 0==memcmp("ical", &aBuf
[nBuf
-4], 4) ){
1061 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
1062 memcpy(&aBuf
[nBuf
-4], "ic", 2);
1063 *pnBuf
= nBuf
- 4 + 2;
1069 if( nBuf
>4 && 0==memcmp("ness", &aBuf
[nBuf
-4], 4) ){
1070 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
1077 if( nBuf
>5 && 0==memcmp("icate", &aBuf
[nBuf
-5], 5) ){
1078 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1079 memcpy(&aBuf
[nBuf
-5], "ic", 2);
1080 *pnBuf
= nBuf
- 5 + 2;
1082 }else if( nBuf
>5 && 0==memcmp("iciti", &aBuf
[nBuf
-5], 5) ){
1083 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1084 memcpy(&aBuf
[nBuf
-5], "ic", 2);
1085 *pnBuf
= nBuf
- 5 + 2;
1091 if( nBuf
>3 && 0==memcmp("ful", &aBuf
[nBuf
-3], 3) ){
1092 if( fts5Porter_MGt0(aBuf
, nBuf
-3) ){
1099 if( nBuf
>5 && 0==memcmp("ative", &aBuf
[nBuf
-5], 5) ){
1100 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1107 if( nBuf
>5 && 0==memcmp("alize", &aBuf
[nBuf
-5], 5) ){
1108 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1109 memcpy(&aBuf
[nBuf
-5], "al", 2);
1110 *pnBuf
= nBuf
- 5 + 2;
1120 static int fts5PorterStep1B(char *aBuf
, int *pnBuf
){
1123 switch( aBuf
[nBuf
-2] ){
1126 if( nBuf
>3 && 0==memcmp("eed", &aBuf
[nBuf
-3], 3) ){
1127 if( fts5Porter_MGt0(aBuf
, nBuf
-3) ){
1128 memcpy(&aBuf
[nBuf
-3], "ee", 2);
1129 *pnBuf
= nBuf
- 3 + 2;
1131 }else if( nBuf
>2 && 0==memcmp("ed", &aBuf
[nBuf
-2], 2) ){
1132 if( fts5Porter_Vowel(aBuf
, nBuf
-2) ){
1140 if( nBuf
>3 && 0==memcmp("ing", &aBuf
[nBuf
-3], 3) ){
1141 if( fts5Porter_Vowel(aBuf
, nBuf
-3) ){
1153 ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
1154 ***************************************************************************
1155 **************************************************************************/
1157 static void fts5PorterStep1A(char *aBuf
, int *pnBuf
){
1159 if( aBuf
[nBuf
-1]=='s' ){
1160 if( aBuf
[nBuf
-2]=='e' ){
1161 if( (nBuf
>4 && aBuf
[nBuf
-4]=='s' && aBuf
[nBuf
-3]=='s')
1162 || (nBuf
>3 && aBuf
[nBuf
-3]=='i' )
1169 else if( aBuf
[nBuf
-2]!='s' ){
1175 static int fts5PorterCb(
1183 PorterContext
*p
= (PorterContext
*)pCtx
;
1188 if( nToken
>FTS5_PORTER_MAX_TOKEN
|| nToken
<3 ) goto pass_through
;
1191 memcpy(aBuf
, pToken
, nBuf
);
1194 fts5PorterStep1A(aBuf
, &nBuf
);
1195 if( fts5PorterStep1B(aBuf
, &nBuf
) ){
1196 if( fts5PorterStep1B2(aBuf
, &nBuf
)==0 ){
1197 char c
= aBuf
[nBuf
-1];
1198 if( fts5PorterIsVowel(c
, 0)==0
1199 && c
!='l' && c
!='s' && c
!='z' && c
==aBuf
[nBuf
-2]
1202 }else if( fts5Porter_MEq1(aBuf
, nBuf
) && fts5Porter_Ostar(aBuf
, nBuf
) ){
1209 if( aBuf
[nBuf
-1]=='y' && fts5Porter_Vowel(aBuf
, nBuf
-1) ){
1213 /* Steps 2 through 4. */
1214 fts5PorterStep2(aBuf
, &nBuf
);
1215 fts5PorterStep3(aBuf
, &nBuf
);
1216 fts5PorterStep4(aBuf
, &nBuf
);
1220 if( aBuf
[nBuf
-1]=='e' ){
1221 if( fts5Porter_MGt1(aBuf
, nBuf
-1)
1222 || (fts5Porter_MEq1(aBuf
, nBuf
-1) && !fts5Porter_Ostar(aBuf
, nBuf
-1))
1229 if( nBuf
>1 && aBuf
[nBuf
-1]=='l'
1230 && aBuf
[nBuf
-2]=='l' && fts5Porter_MGt1(aBuf
, nBuf
-1)
1235 return p
->xToken(p
->pCtx
, tflags
, aBuf
, nBuf
, iStart
, iEnd
);
1238 return p
->xToken(p
->pCtx
, tflags
, pToken
, nToken
, iStart
, iEnd
);
1242 ** Tokenize using the porter tokenizer.
1244 static int fts5PorterTokenize(
1245 Fts5Tokenizer
*pTokenizer
,
1248 const char *pText
, int nText
,
1249 int (*xToken
)(void*, int, const char*, int nToken
, int iStart
, int iEnd
)
1251 PorterTokenizer
*p
= (PorterTokenizer
*)pTokenizer
;
1253 sCtx
.xToken
= xToken
;
1255 sCtx
.aBuf
= p
->aBuf
;
1256 return p
->tokenizer
.xTokenize(
1257 p
->pTokenizer
, (void*)&sCtx
, flags
, pText
, nText
, fts5PorterCb
1261 /**************************************************************************
1262 ** Start of trigram implementation.
1264 typedef struct TrigramTokenizer TrigramTokenizer
;
1265 struct TrigramTokenizer
{
1266 int bFold
; /* True to fold to lower-case */
1270 ** Free a trigram tokenizer.
1272 static void fts5TriDelete(Fts5Tokenizer
*p
){
1277 ** Allocate a trigram tokenizer.
1279 static int fts5TriCreate(
1283 Fts5Tokenizer
**ppOut
1286 TrigramTokenizer
*pNew
= (TrigramTokenizer
*)sqlite3_malloc(sizeof(*pNew
));
1287 UNUSED_PARAM(pUnused
);
1293 for(i
=0; rc
==SQLITE_OK
&& i
<nArg
; i
+=2){
1294 const char *zArg
= azArg
[i
+1];
1295 if( 0==sqlite3_stricmp(azArg
[i
], "case_sensitive") ){
1296 if( (zArg
[0]!='0' && zArg
[0]!='1') || zArg
[1] ){
1299 pNew
->bFold
= (zArg
[0]=='0');
1305 if( rc
!=SQLITE_OK
){
1306 fts5TriDelete((Fts5Tokenizer
*)pNew
);
1310 *ppOut
= (Fts5Tokenizer
*)pNew
;
1315 ** Trigram tokenizer tokenize routine.
1317 static int fts5TriTokenize(
1318 Fts5Tokenizer
*pTok
,
1321 const char *pText
, int nText
,
1322 int (*xToken
)(void*, int, const char*, int, int, int)
1324 TrigramTokenizer
*p
= (TrigramTokenizer
*)pTok
;
1327 const unsigned char *zIn
= (const unsigned char*)pText
;
1328 const unsigned char *zEof
= &zIn
[nText
];
1331 UNUSED_PARAM(unusedFlags
);
1334 int iStart
= zIn
- (const unsigned char*)pText
;
1335 const unsigned char *zNext
;
1337 READ_UTF8(zIn
, zEof
, iCode
);
1338 if( iCode
==0 ) break;
1341 if( p
->bFold
) iCode
= sqlite3Fts5UnicodeFold(iCode
, 0);
1342 WRITE_UTF8(zOut
, iCode
);
1343 READ_UTF8(zIn
, zEof
, iCode
);
1344 if( iCode
==0 ) break;
1349 if( p
->bFold
) iCode
= sqlite3Fts5UnicodeFold(iCode
, 0);
1350 WRITE_UTF8(zOut
, iCode
);
1351 READ_UTF8(zIn
, zEof
, iCode
);
1352 if( iCode
==0 ) break;
1353 if( p
->bFold
) iCode
= sqlite3Fts5UnicodeFold(iCode
, 0);
1354 WRITE_UTF8(zOut
, iCode
);
1358 rc
= xToken(pCtx
, 0, aBuf
, zOut
-aBuf
, iStart
, iStart
+ zOut
-aBuf
);
1359 if( rc
!=SQLITE_OK
) break;
1367 ** Argument xCreate is a pointer to a constructor function for a tokenizer.
1368 ** pTok is a tokenizer previously created using the same method. This function
1369 ** returns one of FTS5_PATTERN_NONE, FTS5_PATTERN_LIKE or FTS5_PATTERN_GLOB
1370 ** indicating the style of pattern matching that the tokenizer can support.
1371 ** In practice, this is:
1373 ** "trigram" tokenizer, case_sensitive=1 - FTS5_PATTERN_GLOB
1374 ** "trigram" tokenizer, case_sensitive=0 (the default) - FTS5_PATTERN_LIKE
1375 ** all other tokenizers - FTS5_PATTERN_NONE
1377 int sqlite3Fts5TokenizerPattern(
1378 int (*xCreate
)(void*, const char**, int, Fts5Tokenizer
**),
1381 if( xCreate
==fts5TriCreate
){
1382 TrigramTokenizer
*p
= (TrigramTokenizer
*)pTok
;
1383 return p
->bFold
? FTS5_PATTERN_LIKE
: FTS5_PATTERN_GLOB
;
1385 return FTS5_PATTERN_NONE
;
1389 ** Register all built-in tokenizers with FTS5.
1391 int sqlite3Fts5TokenizerInit(fts5_api
*pApi
){
1392 struct BuiltinTokenizer
{
1396 { "unicode61", {fts5UnicodeCreate
, fts5UnicodeDelete
, fts5UnicodeTokenize
}},
1397 { "ascii", {fts5AsciiCreate
, fts5AsciiDelete
, fts5AsciiTokenize
}},
1398 { "porter", {fts5PorterCreate
, fts5PorterDelete
, fts5PorterTokenize
}},
1399 { "trigram", {fts5TriCreate
, fts5TriDelete
, fts5TriTokenize
}},
1402 int rc
= SQLITE_OK
; /* Return code */
1403 int i
; /* To iterate through builtin functions */
1405 for(i
=0; rc
==SQLITE_OK
&& i
<ArraySize(aBuiltin
); i
++){
1406 rc
= pApi
->xCreateTokenizer(pApi
,