4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
17 /**************************************************************************
18 ** Start of ascii tokenizer implementation.
/*
** For tokenizers with no "unicode" modifier, the set of token characters
** is the same as the set of ASCII range alphanumeric characters.
**
** The table is indexed by byte value (0x00..0x7F). An entry of 1 marks a
** token character, 0 marks a separator. It is only ever read (it serves
** as the template copied into each new tokenizer), so it is declared const.
*/
static const unsigned char aAsciiTokenChar[128] = {
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
};
/*
** State of an "ascii" tokenizer instance: one flag per ASCII byte value,
** non-zero if that byte is treated as a token character.
*/
typedef struct AsciiTokenizer AsciiTokenizer;
struct AsciiTokenizer {
  unsigned char aTokenChar[128];
};
41 static void fts5AsciiAddExceptions(
47 for(i
=0; zArg
[i
]; i
++){
48 if( (zArg
[i
] & 0x80)==0 ){
49 p
->aTokenChar
[(int)zArg
[i
]] = (unsigned char)bTokenChars
;
/*
** Delete an "ascii" tokenizer.
*/
57 static void fts5AsciiDelete(Fts5Tokenizer
*p
){
62 ** Create an "ascii" tokenizer.
64 static int fts5AsciiCreate(
66 const char **azArg
, int nArg
,
70 AsciiTokenizer
*p
= 0;
71 UNUSED_PARAM(pUnused
);
75 p
= sqlite3_malloc(sizeof(AsciiTokenizer
));
80 memset(p
, 0, sizeof(AsciiTokenizer
));
81 memcpy(p
->aTokenChar
, aAsciiTokenChar
, sizeof(aAsciiTokenChar
));
82 for(i
=0; rc
==SQLITE_OK
&& i
<nArg
; i
+=2){
83 const char *zArg
= azArg
[i
+1];
84 if( 0==sqlite3_stricmp(azArg
[i
], "tokenchars") ){
85 fts5AsciiAddExceptions(p
, zArg
, 1);
87 if( 0==sqlite3_stricmp(azArg
[i
], "separators") ){
88 fts5AsciiAddExceptions(p
, zArg
, 0);
94 fts5AsciiDelete((Fts5Tokenizer
*)p
);
100 *ppOut
= (Fts5Tokenizer
*)p
;
/*
** Copy nByte bytes from aIn to aOut, converting ASCII upper-case letters
** ('A'..'Z') to lower case. All other bytes are copied unchanged. The
** caller must ensure aOut has room for nByte bytes.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int i;
  for(i=0; i<nByte; i++){
    char c = aIn[i];
    if( c>='A' && c<='Z' ) c += 32;   /* 'a'-'A' == 32 */
    aOut[i] = c;
  }
}
115 ** Tokenize some text using the ascii tokenizer.
117 static int fts5AsciiTokenize(
118 Fts5Tokenizer
*pTokenizer
,
121 const char *pText
, int nText
,
122 int (*xToken
)(void*, int, const char*, int nToken
, int iStart
, int iEnd
)
124 AsciiTokenizer
*p
= (AsciiTokenizer
*)pTokenizer
;
130 int nFold
= sizeof(aFold
);
132 unsigned char *a
= p
->aTokenChar
;
134 UNUSED_PARAM(iUnused
);
136 while( is
<nText
&& rc
==SQLITE_OK
){
139 /* Skip any leading divider characters. */
140 while( is
<nText
&& ((pText
[is
]&0x80)==0 && a
[(int)pText
[is
]]==0) ){
143 if( is
==nText
) break;
145 /* Count the token characters */
147 while( ie
<nText
&& ((pText
[ie
]&0x80) || a
[(int)pText
[ie
]] ) ){
151 /* Fold to lower case */
154 if( pFold
!=aFold
) sqlite3_free(pFold
);
155 pFold
= sqlite3_malloc64((sqlite3_int64
)nByte
*2);
162 asciiFold(pFold
, &pText
[is
], nByte
);
164 /* Invoke the token callback */
165 rc
= xToken(pCtx
, 0, pFold
, nByte
, is
, ie
);
169 if( pFold
!=aFold
) sqlite3_free(pFold
);
170 if( rc
==SQLITE_DONE
) rc
= SQLITE_OK
;
174 /**************************************************************************
175 ** Start of unicode61 tokenizer implementation.
180 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
181 ** from the sqlite3 source file utf.c. If this file is compiled as part
182 ** of the amalgamation, they are not required.
184 #ifndef SQLITE_AMALGAMATION
186 static const unsigned char sqlite3Utf8Trans1
[] = {
187 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
188 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
189 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
190 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
191 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
192 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
193 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
194 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
197 #define READ_UTF8(zIn, zTerm, c) \
200 c = sqlite3Utf8Trans1[c-0xc0]; \
201 while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){ \
202 c = (c<<6) + (0x3f & *(zIn++)); \
205 || (c&0xFFFFF800)==0xD800 \
206 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \
210 #define WRITE_UTF8(zOut, c) { \
212 *zOut++ = (unsigned char)(c&0xFF); \
214 else if( c<0x00800 ){ \
215 *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F); \
216 *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \
218 else if( c<0x10000 ){ \
219 *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F); \
220 *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F); \
221 *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \
223 *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07); \
224 *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F); \
225 *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F); \
226 *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \
230 #endif /* ifndef SQLITE_AMALGAMATION */
232 #define FTS5_SKIP_UTF8(zIn) { \
233 if( ((unsigned char)(*(zIn++)))>=0xc0 ){ \
234 while( (((unsigned char)*zIn) & 0xc0)==0x80 ){ zIn++; } \
/*
** State of a "unicode61" tokenizer instance.
*/
typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int eRemoveDiacritic;           /* One of the FTS5_REMOVE_DIACRITICS_* values */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Exception codepoints, in ascending order */

  unsigned char aCategory[32];    /* True for token char categories */
};
/* Values for eRemoveDiacritic (must match internals of fts5_unicode2.c) */
#define FTS5_REMOVE_DIACRITICS_NONE    0
#define FTS5_REMOVE_DIACRITICS_SIMPLE  1
#define FTS5_REMOVE_DIACRITICS_COMPLEX 2
255 static int fts5UnicodeAddExceptions(
256 Unicode61Tokenizer
*p
, /* Tokenizer object */
257 const char *z
, /* Characters to treat as exceptions */
258 int bTokenChars
/* 1 for 'tokenchars', 0 for 'separators' */
261 int n
= (int)strlen(z
);
265 aNew
= (int*)sqlite3_realloc64(p
->aiException
,
266 (n
+p
->nException
)*sizeof(int));
268 int nNew
= p
->nException
;
269 const unsigned char *zCsr
= (const unsigned char*)z
;
270 const unsigned char *zTerm
= (const unsigned char*)&z
[n
];
274 READ_UTF8(zCsr
, zTerm
, iCode
);
276 p
->aTokenChar
[iCode
] = (unsigned char)bTokenChars
;
278 bToken
= p
->aCategory
[sqlite3Fts5UnicodeCategory(iCode
)];
279 assert( (bToken
==0 || bToken
==1) );
280 assert( (bTokenChars
==0 || bTokenChars
==1) );
281 if( bToken
!=bTokenChars
&& sqlite3Fts5UnicodeIsdiacritic(iCode
)==0 ){
283 for(i
=0; i
<nNew
; i
++){
284 if( (u32
)aNew
[i
]>iCode
) break;
286 memmove(&aNew
[i
+1], &aNew
[i
], (nNew
-i
)*sizeof(int));
292 p
->aiException
= aNew
;
293 p
->nException
= nNew
;
303 ** Return true if the p->aiException[] array contains the value iCode.
305 static int fts5UnicodeIsException(Unicode61Tokenizer
*p
, int iCode
){
306 if( p
->nException
>0 ){
307 int *a
= p
->aiException
;
309 int iHi
= p
->nException
-1;
312 int iTest
= (iHi
+ iLo
) / 2;
313 if( iCode
==a
[iTest
] ){
315 }else if( iCode
>a
[iTest
] ){
327 ** Delete a "unicode61" tokenizer.
329 static void fts5UnicodeDelete(Fts5Tokenizer
*pTok
){
331 Unicode61Tokenizer
*p
= (Unicode61Tokenizer
*)pTok
;
332 sqlite3_free(p
->aiException
);
333 sqlite3_free(p
->aFold
);
339 static int unicodeSetCategories(Unicode61Tokenizer
*p
, const char *zCat
){
340 const char *z
= zCat
;
343 while( *z
==' ' || *z
=='\t' ) z
++;
344 if( *z
&& sqlite3Fts5UnicodeCatParse(z
, p
->aCategory
) ){
347 while( *z
!=' ' && *z
!='\t' && *z
!='\0' ) z
++;
350 sqlite3Fts5UnicodeAscii(p
->aCategory
, p
->aTokenChar
);
355 ** Create a "unicode61" tokenizer.
357 static int fts5UnicodeCreate(
359 const char **azArg
, int nArg
,
360 Fts5Tokenizer
**ppOut
362 int rc
= SQLITE_OK
; /* Return code */
363 Unicode61Tokenizer
*p
= 0; /* New tokenizer object */
365 UNUSED_PARAM(pUnused
);
370 p
= (Unicode61Tokenizer
*)sqlite3_malloc(sizeof(Unicode61Tokenizer
));
372 const char *zCat
= "L* N* Co";
374 memset(p
, 0, sizeof(Unicode61Tokenizer
));
376 p
->eRemoveDiacritic
= FTS5_REMOVE_DIACRITICS_SIMPLE
;
378 p
->aFold
= sqlite3_malloc64(p
->nFold
* sizeof(char));
383 /* Search for a "categories" argument */
384 for(i
=0; rc
==SQLITE_OK
&& i
<nArg
; i
+=2){
385 if( 0==sqlite3_stricmp(azArg
[i
], "categories") ){
391 rc
= unicodeSetCategories(p
, zCat
);
394 for(i
=0; rc
==SQLITE_OK
&& i
<nArg
; i
+=2){
395 const char *zArg
= azArg
[i
+1];
396 if( 0==sqlite3_stricmp(azArg
[i
], "remove_diacritics") ){
397 if( (zArg
[0]!='0' && zArg
[0]!='1' && zArg
[0]!='2') || zArg
[1] ){
400 p
->eRemoveDiacritic
= (zArg
[0] - '0');
401 assert( p
->eRemoveDiacritic
==FTS5_REMOVE_DIACRITICS_NONE
402 || p
->eRemoveDiacritic
==FTS5_REMOVE_DIACRITICS_SIMPLE
403 || p
->eRemoveDiacritic
==FTS5_REMOVE_DIACRITICS_COMPLEX
407 if( 0==sqlite3_stricmp(azArg
[i
], "tokenchars") ){
408 rc
= fts5UnicodeAddExceptions(p
, zArg
, 1);
410 if( 0==sqlite3_stricmp(azArg
[i
], "separators") ){
411 rc
= fts5UnicodeAddExceptions(p
, zArg
, 0);
413 if( 0==sqlite3_stricmp(azArg
[i
], "categories") ){
424 fts5UnicodeDelete((Fts5Tokenizer
*)p
);
427 *ppOut
= (Fts5Tokenizer
*)p
;
433 ** Return true if, for the purposes of tokenizing with the tokenizer
434 ** passed as the first argument, codepoint iCode is considered a token
435 ** character (not a separator).
437 static int fts5UnicodeIsAlnum(Unicode61Tokenizer
*p
, int iCode
){
439 p
->aCategory
[sqlite3Fts5UnicodeCategory((u32
)iCode
)]
440 ^ fts5UnicodeIsException(p
, iCode
)
444 static int fts5UnicodeTokenize(
445 Fts5Tokenizer
*pTokenizer
,
448 const char *pText
, int nText
,
449 int (*xToken
)(void*, int, const char*, int nToken
, int iStart
, int iEnd
)
451 Unicode61Tokenizer
*p
= (Unicode61Tokenizer
*)pTokenizer
;
453 unsigned char *a
= p
->aTokenChar
;
455 unsigned char *zTerm
= (unsigned char*)&pText
[nText
];
456 unsigned char *zCsr
= (unsigned char *)pText
;
459 char *aFold
= p
->aFold
;
460 int nFold
= p
->nFold
;
461 const char *pEnd
= &aFold
[nFold
-6];
463 UNUSED_PARAM(iUnused
);
465 /* Each iteration of this loop gobbles up a contiguous run of separators,
466 ** then the next token. */
467 while( rc
==SQLITE_OK
){
468 u32 iCode
; /* non-ASCII codepoint read from input */
473 /* Skip any separator characters. */
475 if( zCsr
>=zTerm
) goto tokenize_done
;
477 /* A character outside of the ascii range. Skip past it if it is
478 ** a separator character. Or break out of the loop if it is not. */
479 is
= zCsr
- (unsigned char*)pText
;
480 READ_UTF8(zCsr
, zTerm
, iCode
);
481 if( fts5UnicodeIsAlnum(p
, iCode
) ){
482 goto non_ascii_tokenchar
;
486 is
= zCsr
- (unsigned char*)pText
;
487 goto ascii_tokenchar
;
493 /* Run through the tokenchars. Fold them into the output buffer along
497 /* Grow the output buffer so that there is sufficient space to fit the
498 ** largest possible utf-8 character. */
500 aFold
= sqlite3_malloc64((sqlite3_int64
)nFold
*2);
505 zOut
= &aFold
[zOut
- p
->aFold
];
506 memcpy(aFold
, p
->aFold
, nFold
);
507 sqlite3_free(p
->aFold
);
509 p
->nFold
= nFold
= nFold
*2;
510 pEnd
= &aFold
[nFold
-6];
/* A non-ascii-range character. Fold it into the output buffer if
** it is a token character, or break out of the loop if it is not. */
516 READ_UTF8(zCsr
, zTerm
, iCode
);
517 if( fts5UnicodeIsAlnum(p
,iCode
)||sqlite3Fts5UnicodeIsdiacritic(iCode
) ){
519 iCode
= sqlite3Fts5UnicodeFold(iCode
, p
->eRemoveDiacritic
);
520 if( iCode
) WRITE_UTF8(zOut
, iCode
);
524 }else if( a
[*zCsr
]==0 ){
525 /* An ascii-range separator character. End of token. */
529 if( *zCsr
>='A' && *zCsr
<='Z' ){
530 *zOut
++ = *zCsr
+ 32;
536 ie
= zCsr
- (unsigned char*)pText
;
539 /* Invoke the token callback */
540 rc
= xToken(pCtx
, 0, aFold
, zOut
-aFold
, is
, ie
);
544 if( rc
==SQLITE_DONE
) rc
= SQLITE_OK
;
548 /**************************************************************************
549 ** Start of porter stemmer implementation.
/* Any tokens larger than this (in bytes) are passed through without
** stemming. */
#define FTS5_PORTER_MAX_TOKEN 64
556 typedef struct PorterTokenizer PorterTokenizer
;
557 struct PorterTokenizer
{
558 fts5_tokenizer tokenizer
; /* Parent tokenizer module */
559 Fts5Tokenizer
*pTokenizer
; /* Parent tokenizer instance */
560 char aBuf
[FTS5_PORTER_MAX_TOKEN
+ 64];
564 ** Delete a "porter" tokenizer.
566 static void fts5PorterDelete(Fts5Tokenizer
*pTok
){
568 PorterTokenizer
*p
= (PorterTokenizer
*)pTok
;
570 p
->tokenizer
.xDelete(p
->pTokenizer
);
577 ** Create a "porter" tokenizer.
579 static int fts5PorterCreate(
581 const char **azArg
, int nArg
,
582 Fts5Tokenizer
**ppOut
584 fts5_api
*pApi
= (fts5_api
*)pCtx
;
586 PorterTokenizer
*pRet
;
588 const char *zBase
= "unicode61";
594 pRet
= (PorterTokenizer
*)sqlite3_malloc(sizeof(PorterTokenizer
));
596 memset(pRet
, 0, sizeof(PorterTokenizer
));
597 rc
= pApi
->xFindTokenizer(pApi
, zBase
, &pUserdata
, &pRet
->tokenizer
);
602 int nArg2
= (nArg
>0 ? nArg
-1 : 0);
603 const char **azArg2
= (nArg2
? &azArg
[1] : 0);
604 rc
= pRet
->tokenizer
.xCreate(pUserdata
, azArg2
, nArg2
, &pRet
->pTokenizer
);
608 fts5PorterDelete((Fts5Tokenizer
*)pRet
);
611 *ppOut
= (Fts5Tokenizer
*)pRet
;
615 typedef struct PorterContext PorterContext
;
616 struct PorterContext
{
618 int (*xToken
)(void*, int, const char*, int, int, int);
622 typedef struct PorterRule PorterRule
;
626 int (*xCond
)(char *zStem
, int nStem
);
632 static int fts5PorterApply(char *aBuf
, int *pnBuf
, PorterRule
*aRule
){
637 for(p
=aRule
; p
->zSuffix
; p
++){
638 assert( strlen(p
->zSuffix
)==p
->nSuffix
);
639 assert( strlen(p
->zOutput
)==p
->nOutput
);
640 if( nBuf
<p
->nSuffix
) continue;
641 if( 0==memcmp(&aBuf
[nBuf
- p
->nSuffix
], p
->zSuffix
, p
->nSuffix
) ) break;
645 int nStem
= nBuf
- p
->nSuffix
;
646 if( p
->xCond
==0 || p
->xCond(aBuf
, nStem
) ){
647 memcpy(&aBuf
[nStem
], p
->zOutput
, p
->nOutput
);
648 *pnBuf
= nStem
+ p
->nOutput
;
/*
** Return true if character c is a vowel for the purposes of the porter
** rule conditions. 'y' counts as a vowel only if bYIsVowel is non-zero.
*/
static int fts5PorterIsVowel(char c, int bYIsVowel){
  return (
      c=='a' || c=='e' || c=='i' || c=='o' || c=='u' || (bYIsVowel && c=='y')
  );
}

/*
** Scan zStem[0..nStem-1] for the first vowel, then for the first
** consonant that follows it. Return the offset of the byte immediately
** after that consonant, or 0 if no such vowel/consonant sequence exists.
**
** bPrevCons is true if the character preceding zStem[0] was a consonant;
** it decides whether a leading 'y' is treated as a vowel.
*/
static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
  int i;
  int bCons = bPrevCons;

  /* Scan for a vowel */
  for(i=0; i<nStem; i++){
    if( 0==(bCons = !fts5PorterIsVowel(zStem[i], bCons)) ) break;
  }

  /* Scan for a consonant */
  for(i++; i<nStem; i++){
    if( (bCons = !fts5PorterIsVowel(zStem[i], bCons)) ) return i+1;
  }
  return 0;
}

/* porter rule condition: (m > 0) */
static int fts5Porter_MGt0(char *zStem, int nStem){
  return !!fts5PorterGobbleVC(zStem, nStem, 0);
}

/* porter rule condition: (m > 1) */
static int fts5Porter_MGt1(char *zStem, int nStem){
  int n;
  n = fts5PorterGobbleVC(zStem, nStem, 0);
  if( n && fts5PorterGobbleVC(&zStem[n], nStem-n, 1) ){
    return 1;
  }
  return 0;
}

/* porter rule condition: (m = 1) */
static int fts5Porter_MEq1(char *zStem, int nStem){
  int n;
  n = fts5PorterGobbleVC(zStem, nStem, 0);
  if( n && 0==fts5PorterGobbleVC(&zStem[n], nStem-n, 1) ){
    return 1;
  }
  return 0;
}

/* porter rule condition: (*o)
**
** True if the stem ends consonant-vowel-consonant and the final consonant
** is not 'w', 'x' or 'y'. An empty stem never matches.
*/
static int fts5Porter_Ostar(char *zStem, int nStem){
  if( nStem<1 ) return 0;
  if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
    return 0;
  }else{
    int i;
    unsigned int mask = 0;
    int bCons = 0;
    for(i=0; i<nStem; i++){
      bCons = !fts5PorterIsVowel(zStem[i], bCons);
      assert( bCons==0 || bCons==1 );
      /* Only the three most recent flags matter. Keeping just those bits
      ** in an unsigned value avoids shifting a signed int past its range
      ** when the stem is longer than 31 bytes. */
      mask = ((mask << 1) + (unsigned int)bCons) & 0x0007;
    }
    return (mask==0x0005);
  }
}

/* porter rule condition: (m > 1 and (*S or *T)) */
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  return nStem>0
      && (zStem[nStem-1]=='s' || zStem[nStem-1]=='t')
      && fts5Porter_MGt1(zStem, nStem);
}

/* porter rule condition: (*v*) */
static int fts5Porter_Vowel(char *zStem, int nStem){
  int i;
  for(i=0; i<nStem; i++){
    /* A 'y' in the first position is never counted as a vowel. */
    if( fts5PorterIsVowel(zStem[i], i>0) ){
      return 1;
    }
  }
  return 0;
}
740 /**************************************************************************
741 ***************************************************************************
742 ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
745 static int fts5PorterStep4(char *aBuf
, int *pnBuf
){
748 switch( aBuf
[nBuf
-2] ){
751 if( nBuf
>2 && 0==memcmp("al", &aBuf
[nBuf
-2], 2) ){
752 if( fts5Porter_MGt1(aBuf
, nBuf
-2) ){
759 if( nBuf
>4 && 0==memcmp("ance", &aBuf
[nBuf
-4], 4) ){
760 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
763 }else if( nBuf
>4 && 0==memcmp("ence", &aBuf
[nBuf
-4], 4) ){
764 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
771 if( nBuf
>2 && 0==memcmp("er", &aBuf
[nBuf
-2], 2) ){
772 if( fts5Porter_MGt1(aBuf
, nBuf
-2) ){
779 if( nBuf
>2 && 0==memcmp("ic", &aBuf
[nBuf
-2], 2) ){
780 if( fts5Porter_MGt1(aBuf
, nBuf
-2) ){
787 if( nBuf
>4 && 0==memcmp("able", &aBuf
[nBuf
-4], 4) ){
788 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
791 }else if( nBuf
>4 && 0==memcmp("ible", &aBuf
[nBuf
-4], 4) ){
792 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
799 if( nBuf
>3 && 0==memcmp("ant", &aBuf
[nBuf
-3], 3) ){
800 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
803 }else if( nBuf
>5 && 0==memcmp("ement", &aBuf
[nBuf
-5], 5) ){
804 if( fts5Porter_MGt1(aBuf
, nBuf
-5) ){
807 }else if( nBuf
>4 && 0==memcmp("ment", &aBuf
[nBuf
-4], 4) ){
808 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
811 }else if( nBuf
>3 && 0==memcmp("ent", &aBuf
[nBuf
-3], 3) ){
812 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
819 if( nBuf
>3 && 0==memcmp("ion", &aBuf
[nBuf
-3], 3) ){
820 if( fts5Porter_MGt1_and_S_or_T(aBuf
, nBuf
-3) ){
823 }else if( nBuf
>2 && 0==memcmp("ou", &aBuf
[nBuf
-2], 2) ){
824 if( fts5Porter_MGt1(aBuf
, nBuf
-2) ){
831 if( nBuf
>3 && 0==memcmp("ism", &aBuf
[nBuf
-3], 3) ){
832 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
839 if( nBuf
>3 && 0==memcmp("ate", &aBuf
[nBuf
-3], 3) ){
840 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
843 }else if( nBuf
>3 && 0==memcmp("iti", &aBuf
[nBuf
-3], 3) ){
844 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
851 if( nBuf
>3 && 0==memcmp("ous", &aBuf
[nBuf
-3], 3) ){
852 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
859 if( nBuf
>3 && 0==memcmp("ive", &aBuf
[nBuf
-3], 3) ){
860 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
867 if( nBuf
>3 && 0==memcmp("ize", &aBuf
[nBuf
-3], 3) ){
868 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
/*
** Porter step 1b, second part. If the buffer ends in "at", "bl" or "iz"
** (and is longer than that suffix), append an 'e' so that the ending
** becomes "ate", "ble" or "ize".
**
** aBuf holds the word and *pnBuf its length in bytes; the caller must
** provide room for one extra byte. On a match *pnBuf is updated and 1 is
** returned. Otherwise the buffer is left untouched and 0 is returned.
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* Nothing can match a word shorter than two bytes, and aBuf[nBuf-2]
  ** would be out of bounds. */
  if( nBuf<2 ) return 0;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ate", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'b':
      if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ble", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ize", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

  }
  return ret;
}
913 static int fts5PorterStep2(char *aBuf
, int *pnBuf
){
916 switch( aBuf
[nBuf
-2] ){
919 if( nBuf
>7 && 0==memcmp("ational", &aBuf
[nBuf
-7], 7) ){
920 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
921 memcpy(&aBuf
[nBuf
-7], "ate", 3);
922 *pnBuf
= nBuf
- 7 + 3;
924 }else if( nBuf
>6 && 0==memcmp("tional", &aBuf
[nBuf
-6], 6) ){
925 if( fts5Porter_MGt0(aBuf
, nBuf
-6) ){
926 memcpy(&aBuf
[nBuf
-6], "tion", 4);
927 *pnBuf
= nBuf
- 6 + 4;
933 if( nBuf
>4 && 0==memcmp("enci", &aBuf
[nBuf
-4], 4) ){
934 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
935 memcpy(&aBuf
[nBuf
-4], "ence", 4);
936 *pnBuf
= nBuf
- 4 + 4;
938 }else if( nBuf
>4 && 0==memcmp("anci", &aBuf
[nBuf
-4], 4) ){
939 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
940 memcpy(&aBuf
[nBuf
-4], "ance", 4);
941 *pnBuf
= nBuf
- 4 + 4;
947 if( nBuf
>4 && 0==memcmp("izer", &aBuf
[nBuf
-4], 4) ){
948 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
949 memcpy(&aBuf
[nBuf
-4], "ize", 3);
950 *pnBuf
= nBuf
- 4 + 3;
956 if( nBuf
>4 && 0==memcmp("logi", &aBuf
[nBuf
-4], 4) ){
957 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
958 memcpy(&aBuf
[nBuf
-4], "log", 3);
959 *pnBuf
= nBuf
- 4 + 3;
965 if( nBuf
>3 && 0==memcmp("bli", &aBuf
[nBuf
-3], 3) ){
966 if( fts5Porter_MGt0(aBuf
, nBuf
-3) ){
967 memcpy(&aBuf
[nBuf
-3], "ble", 3);
968 *pnBuf
= nBuf
- 3 + 3;
970 }else if( nBuf
>4 && 0==memcmp("alli", &aBuf
[nBuf
-4], 4) ){
971 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
972 memcpy(&aBuf
[nBuf
-4], "al", 2);
973 *pnBuf
= nBuf
- 4 + 2;
975 }else if( nBuf
>5 && 0==memcmp("entli", &aBuf
[nBuf
-5], 5) ){
976 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
977 memcpy(&aBuf
[nBuf
-5], "ent", 3);
978 *pnBuf
= nBuf
- 5 + 3;
980 }else if( nBuf
>3 && 0==memcmp("eli", &aBuf
[nBuf
-3], 3) ){
981 if( fts5Porter_MGt0(aBuf
, nBuf
-3) ){
982 memcpy(&aBuf
[nBuf
-3], "e", 1);
983 *pnBuf
= nBuf
- 3 + 1;
985 }else if( nBuf
>5 && 0==memcmp("ousli", &aBuf
[nBuf
-5], 5) ){
986 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
987 memcpy(&aBuf
[nBuf
-5], "ous", 3);
988 *pnBuf
= nBuf
- 5 + 3;
994 if( nBuf
>7 && 0==memcmp("ization", &aBuf
[nBuf
-7], 7) ){
995 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
996 memcpy(&aBuf
[nBuf
-7], "ize", 3);
997 *pnBuf
= nBuf
- 7 + 3;
999 }else if( nBuf
>5 && 0==memcmp("ation", &aBuf
[nBuf
-5], 5) ){
1000 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1001 memcpy(&aBuf
[nBuf
-5], "ate", 3);
1002 *pnBuf
= nBuf
- 5 + 3;
1004 }else if( nBuf
>4 && 0==memcmp("ator", &aBuf
[nBuf
-4], 4) ){
1005 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
1006 memcpy(&aBuf
[nBuf
-4], "ate", 3);
1007 *pnBuf
= nBuf
- 4 + 3;
1013 if( nBuf
>5 && 0==memcmp("alism", &aBuf
[nBuf
-5], 5) ){
1014 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1015 memcpy(&aBuf
[nBuf
-5], "al", 2);
1016 *pnBuf
= nBuf
- 5 + 2;
1018 }else if( nBuf
>7 && 0==memcmp("iveness", &aBuf
[nBuf
-7], 7) ){
1019 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
1020 memcpy(&aBuf
[nBuf
-7], "ive", 3);
1021 *pnBuf
= nBuf
- 7 + 3;
1023 }else if( nBuf
>7 && 0==memcmp("fulness", &aBuf
[nBuf
-7], 7) ){
1024 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
1025 memcpy(&aBuf
[nBuf
-7], "ful", 3);
1026 *pnBuf
= nBuf
- 7 + 3;
1028 }else if( nBuf
>7 && 0==memcmp("ousness", &aBuf
[nBuf
-7], 7) ){
1029 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
1030 memcpy(&aBuf
[nBuf
-7], "ous", 3);
1031 *pnBuf
= nBuf
- 7 + 3;
1037 if( nBuf
>5 && 0==memcmp("aliti", &aBuf
[nBuf
-5], 5) ){
1038 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1039 memcpy(&aBuf
[nBuf
-5], "al", 2);
1040 *pnBuf
= nBuf
- 5 + 2;
1042 }else if( nBuf
>5 && 0==memcmp("iviti", &aBuf
[nBuf
-5], 5) ){
1043 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1044 memcpy(&aBuf
[nBuf
-5], "ive", 3);
1045 *pnBuf
= nBuf
- 5 + 3;
1047 }else if( nBuf
>6 && 0==memcmp("biliti", &aBuf
[nBuf
-6], 6) ){
1048 if( fts5Porter_MGt0(aBuf
, nBuf
-6) ){
1049 memcpy(&aBuf
[nBuf
-6], "ble", 3);
1050 *pnBuf
= nBuf
- 6 + 3;
1060 static int fts5PorterStep3(char *aBuf
, int *pnBuf
){
1063 switch( aBuf
[nBuf
-2] ){
1066 if( nBuf
>4 && 0==memcmp("ical", &aBuf
[nBuf
-4], 4) ){
1067 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
1068 memcpy(&aBuf
[nBuf
-4], "ic", 2);
1069 *pnBuf
= nBuf
- 4 + 2;
1075 if( nBuf
>4 && 0==memcmp("ness", &aBuf
[nBuf
-4], 4) ){
1076 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
1083 if( nBuf
>5 && 0==memcmp("icate", &aBuf
[nBuf
-5], 5) ){
1084 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1085 memcpy(&aBuf
[nBuf
-5], "ic", 2);
1086 *pnBuf
= nBuf
- 5 + 2;
1088 }else if( nBuf
>5 && 0==memcmp("iciti", &aBuf
[nBuf
-5], 5) ){
1089 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1090 memcpy(&aBuf
[nBuf
-5], "ic", 2);
1091 *pnBuf
= nBuf
- 5 + 2;
1097 if( nBuf
>3 && 0==memcmp("ful", &aBuf
[nBuf
-3], 3) ){
1098 if( fts5Porter_MGt0(aBuf
, nBuf
-3) ){
1105 if( nBuf
>5 && 0==memcmp("ative", &aBuf
[nBuf
-5], 5) ){
1106 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1113 if( nBuf
>5 && 0==memcmp("alize", &aBuf
[nBuf
-5], 5) ){
1114 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1115 memcpy(&aBuf
[nBuf
-5], "al", 2);
1116 *pnBuf
= nBuf
- 5 + 2;
1126 static int fts5PorterStep1B(char *aBuf
, int *pnBuf
){
1129 switch( aBuf
[nBuf
-2] ){
1132 if( nBuf
>3 && 0==memcmp("eed", &aBuf
[nBuf
-3], 3) ){
1133 if( fts5Porter_MGt0(aBuf
, nBuf
-3) ){
1134 memcpy(&aBuf
[nBuf
-3], "ee", 2);
1135 *pnBuf
= nBuf
- 3 + 2;
1137 }else if( nBuf
>2 && 0==memcmp("ed", &aBuf
[nBuf
-2], 2) ){
1138 if( fts5Porter_Vowel(aBuf
, nBuf
-2) ){
1146 if( nBuf
>3 && 0==memcmp("ing", &aBuf
[nBuf
-3], 3) ){
1147 if( fts5Porter_Vowel(aBuf
, nBuf
-3) ){
1159 ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
1160 ***************************************************************************
1161 **************************************************************************/
/*
** Porter step 1a: strip plural endings from the word in aBuf (length
** *pnBuf bytes) by shortening *pnBuf. The buffer contents are not
** modified.
**
**   "...sses" -> "...ss"     (drop 2)
**   "...ies"  -> "...i"      (drop 2)
**   "...es"   -> "...e"      (drop 1, any other word ending in "es")
**   "...ss"   -> unchanged
**   "...s"    -> "..."       (drop 1)
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;

  /* The tests below look at aBuf[nBuf-2]; shorter words cannot match. */
  if( nBuf<2 ) return;

  if( aBuf[nBuf-1]=='s' ){
    if( aBuf[nBuf-2]=='e' ){
      if( (nBuf>4 && aBuf[nBuf-4]=='s' && aBuf[nBuf-3]=='s')
       || (nBuf>3 && aBuf[nBuf-3]=='i' )
      ){
        *pnBuf = nBuf-2;
      }else{
        *pnBuf = nBuf-1;
      }
    }
    else if( aBuf[nBuf-2]!='s' ){
      *pnBuf = nBuf-1;
    }
  }
}
1181 static int fts5PorterCb(
1189 PorterContext
*p
= (PorterContext
*)pCtx
;
1194 if( nToken
>FTS5_PORTER_MAX_TOKEN
|| nToken
<3 ) goto pass_through
;
1197 memcpy(aBuf
, pToken
, nBuf
);
1200 fts5PorterStep1A(aBuf
, &nBuf
);
1201 if( fts5PorterStep1B(aBuf
, &nBuf
) ){
1202 if( fts5PorterStep1B2(aBuf
, &nBuf
)==0 ){
1203 char c
= aBuf
[nBuf
-1];
1204 if( fts5PorterIsVowel(c
, 0)==0
1205 && c
!='l' && c
!='s' && c
!='z' && c
==aBuf
[nBuf
-2]
1208 }else if( fts5Porter_MEq1(aBuf
, nBuf
) && fts5Porter_Ostar(aBuf
, nBuf
) ){
1215 if( aBuf
[nBuf
-1]=='y' && fts5Porter_Vowel(aBuf
, nBuf
-1) ){
1219 /* Steps 2 through 4. */
1220 fts5PorterStep2(aBuf
, &nBuf
);
1221 fts5PorterStep3(aBuf
, &nBuf
);
1222 fts5PorterStep4(aBuf
, &nBuf
);
1226 if( aBuf
[nBuf
-1]=='e' ){
1227 if( fts5Porter_MGt1(aBuf
, nBuf
-1)
1228 || (fts5Porter_MEq1(aBuf
, nBuf
-1) && !fts5Porter_Ostar(aBuf
, nBuf
-1))
1235 if( nBuf
>1 && aBuf
[nBuf
-1]=='l'
1236 && aBuf
[nBuf
-2]=='l' && fts5Porter_MGt1(aBuf
, nBuf
-1)
1241 return p
->xToken(p
->pCtx
, tflags
, aBuf
, nBuf
, iStart
, iEnd
);
1244 return p
->xToken(p
->pCtx
, tflags
, pToken
, nToken
, iStart
, iEnd
);
1248 ** Tokenize using the porter tokenizer.
1250 static int fts5PorterTokenize(
1251 Fts5Tokenizer
*pTokenizer
,
1254 const char *pText
, int nText
,
1255 int (*xToken
)(void*, int, const char*, int nToken
, int iStart
, int iEnd
)
1257 PorterTokenizer
*p
= (PorterTokenizer
*)pTokenizer
;
1259 sCtx
.xToken
= xToken
;
1261 sCtx
.aBuf
= p
->aBuf
;
1262 return p
->tokenizer
.xTokenize(
1263 p
->pTokenizer
, (void*)&sCtx
, flags
, pText
, nText
, fts5PorterCb
1267 /**************************************************************************
1268 ** Start of trigram implementation.
/*
** State of a "trigram" tokenizer instance.
*/
typedef struct TrigramTokenizer TrigramTokenizer;
struct TrigramTokenizer {
  int bFold;                      /* True to fold to lower-case */
  int iFoldParam;                 /* Parameter to pass to Fts5UnicodeFold() */
};
1277 ** Free a trigram tokenizer.
1279 static void fts5TriDelete(Fts5Tokenizer
*p
){
1284 ** Allocate a trigram tokenizer.
1286 static int fts5TriCreate(
1290 Fts5Tokenizer
**ppOut
1293 TrigramTokenizer
*pNew
= (TrigramTokenizer
*)sqlite3_malloc(sizeof(*pNew
));
1294 UNUSED_PARAM(pUnused
);
1300 pNew
->iFoldParam
= 0;
1301 for(i
=0; rc
==SQLITE_OK
&& i
<nArg
; i
+=2){
1302 const char *zArg
= azArg
[i
+1];
1303 if( 0==sqlite3_stricmp(azArg
[i
], "case_sensitive") ){
1304 if( (zArg
[0]!='0' && zArg
[0]!='1') || zArg
[1] ){
1307 pNew
->bFold
= (zArg
[0]=='0');
1309 }else if( 0==sqlite3_stricmp(azArg
[i
], "remove_diacritics") ){
1310 if( (zArg
[0]!='0' && zArg
[0]!='1' && zArg
[0]!='2') || zArg
[1] ){
1313 pNew
->iFoldParam
= (zArg
[0]!='0') ? 2 : 0;
1320 if( pNew
->iFoldParam
!=0 && pNew
->bFold
==0 ){
1324 if( rc
!=SQLITE_OK
){
1325 fts5TriDelete((Fts5Tokenizer
*)pNew
);
1329 *ppOut
= (Fts5Tokenizer
*)pNew
;
1334 ** Trigram tokenizer tokenize routine.
1336 static int fts5TriTokenize(
1337 Fts5Tokenizer
*pTok
,
1340 const char *pText
, int nText
,
1341 int (*xToken
)(void*, int, const char*, int, int, int)
1343 TrigramTokenizer
*p
= (TrigramTokenizer
*)pTok
;
1348 const unsigned char *zIn
= (const unsigned char*)pText
;
1349 const unsigned char *zEof
= &zIn
[nText
];
1351 int aStart
[3]; /* Input offset of each character in aBuf[] */
1353 UNUSED_PARAM(unusedFlags
);
1355 /* Populate aBuf[] with the characters for the first trigram. */
1356 for(ii
=0; ii
<3; ii
++){
1358 aStart
[ii
] = zIn
- (const unsigned char*)pText
;
1359 READ_UTF8(zIn
, zEof
, iCode
);
1360 if( iCode
==0 ) return SQLITE_OK
;
1361 if( p
->bFold
) iCode
= sqlite3Fts5UnicodeFold(iCode
, p
->iFoldParam
);
1363 WRITE_UTF8(zOut
, iCode
);
1366 /* At the start of each iteration of this loop:
1368 ** aBuf: Contains 3 characters. The 3 characters of the next trigram.
1369 ** zOut: Points to the byte following the last character in aBuf.
1370 ** aStart[3]: Contains the byte offset in the input text corresponding
1371 ** to the start of each of the three characters in the buffer.
1373 assert( zIn
<=zEof
);
1375 int iNext
; /* Start of character following current tri */
1378 /* Read characters from the input up until the first non-diacritic */
1380 iNext
= zIn
- (const unsigned char*)pText
;
1381 READ_UTF8(zIn
, zEof
, iCode
);
1382 if( iCode
==0 ) break;
1383 if( p
->bFold
) iCode
= sqlite3Fts5UnicodeFold(iCode
, p
->iFoldParam
);
1386 /* Pass the current trigram back to fts5 */
1387 rc
= xToken(pCtx
, 0, aBuf
, zOut
-aBuf
, aStart
[0], iNext
);
1388 if( iCode
==0 || rc
!=SQLITE_OK
) break;
1390 /* Remove the first character from buffer aBuf[]. Append the character
1391 ** with codepoint iCode. */
1394 memmove(aBuf
, z1
, zOut
- z1
);
1395 zOut
-= (z1
- aBuf
);
1396 WRITE_UTF8(zOut
, iCode
);
1398 /* Update the aStart[] array */
1399 aStart
[0] = aStart
[1];
1400 aStart
[1] = aStart
[2];
1408 ** Argument xCreate is a pointer to a constructor function for a tokenizer.
1409 ** pTok is a tokenizer previously created using the same method. This function
1410 ** returns one of FTS5_PATTERN_NONE, FTS5_PATTERN_LIKE or FTS5_PATTERN_GLOB
1411 ** indicating the style of pattern matching that the tokenizer can support.
1412 ** In practice, this is:
1414 ** "trigram" tokenizer, case_sensitive=1 - FTS5_PATTERN_GLOB
1415 ** "trigram" tokenizer, case_sensitive=0 (the default) - FTS5_PATTERN_LIKE
1416 ** all other tokenizers - FTS5_PATTERN_NONE
1418 int sqlite3Fts5TokenizerPattern(
1419 int (*xCreate
)(void*, const char**, int, Fts5Tokenizer
**),
1422 if( xCreate
==fts5TriCreate
){
1423 TrigramTokenizer
*p
= (TrigramTokenizer
*)pTok
;
1424 if( p
->iFoldParam
==0 ){
1425 return p
->bFold
? FTS5_PATTERN_LIKE
: FTS5_PATTERN_GLOB
;
1428 return FTS5_PATTERN_NONE
;
1432 ** Register all built-in tokenizers with FTS5.
1434 int sqlite3Fts5TokenizerInit(fts5_api
*pApi
){
1435 struct BuiltinTokenizer
{
1439 { "unicode61", {fts5UnicodeCreate
, fts5UnicodeDelete
, fts5UnicodeTokenize
}},
1440 { "ascii", {fts5AsciiCreate
, fts5AsciiDelete
, fts5AsciiTokenize
}},
1441 { "porter", {fts5PorterCreate
, fts5PorterDelete
, fts5PorterTokenize
}},
1442 { "trigram", {fts5TriCreate
, fts5TriDelete
, fts5TriTokenize
}},
1445 int rc
= SQLITE_OK
; /* Return code */
1446 int i
; /* To iterate through builtin functions */
1448 for(i
=0; rc
==SQLITE_OK
&& i
<ArraySize(aBuiltin
); i
++){
1449 rc
= pApi
->xCreateTokenizer(pApi
,