4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
/**************************************************************************
** Start of ascii tokenizer implementation.
*/

/*
** For tokenizers with no "unicode" modifier, the set of token characters
** is the same as the set of ASCII range alphanumeric characters.
**
** The table is indexed by byte value (0x00..0x7F). An entry of 1 marks a
** token character, 0 marks a separator.
*/
static unsigned char aAsciiTokenChar[128] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
};
/*
** State of an "ascii" tokenizer: a per-instance copy of the token
** character table, indexed by byte value 0x00..0x7F.
*/
typedef struct AsciiTokenizer AsciiTokenizer;
struct AsciiTokenizer {
  unsigned char aTokenChar[128];  /* 1 for token chars, 0 for separators */
};

/*
** For each byte of nul-terminated string zArg that is in the ASCII range,
** set the corresponding entry of p->aTokenChar[] to bTokenChars (1 to make
** the character a token character, 0 to make it a separator). Bytes with
** the high bit set are skipped, so the table index is always 0..127.
*/
static void fts5AsciiAddExceptions(
  AsciiTokenizer *p,
  const char *zArg,
  int bTokenChars
){
  int i;
  for(i=0; zArg[i]; i++){
    if( (zArg[i] & 0x80)==0 ){
      p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars;
    }
  }
}
55 ** Delete a "ascii" tokenizer.
57 static void fts5AsciiDelete(Fts5Tokenizer
*p
){
62 ** Create an "ascii" tokenizer.
64 static int fts5AsciiCreate(
66 const char **azArg
, int nArg
,
70 AsciiTokenizer
*p
= 0;
71 UNUSED_PARAM(pUnused
);
75 p
= sqlite3_malloc(sizeof(AsciiTokenizer
));
80 memset(p
, 0, sizeof(AsciiTokenizer
));
81 memcpy(p
->aTokenChar
, aAsciiTokenChar
, sizeof(aAsciiTokenChar
));
82 for(i
=0; rc
==SQLITE_OK
&& i
<nArg
-1; i
+=2){
83 const char *zArg
= azArg
[i
+1];
84 if( 0==sqlite3_stricmp(azArg
[i
], "tokenchars") ){
85 fts5AsciiAddExceptions(p
, zArg
, 1);
87 if( 0==sqlite3_stricmp(azArg
[i
], "separators") ){
88 fts5AsciiAddExceptions(p
, zArg
, 0);
93 if( rc
==SQLITE_OK
&& i
<nArg
) rc
= SQLITE_ERROR
;
95 fts5AsciiDelete((Fts5Tokenizer
*)p
);
101 *ppOut
= (Fts5Tokenizer
*)p
;
/*
** Copy nByte bytes from aIn to aOut, converting the ASCII upper-case
** letters 'A'..'Z' to lower case. All other bytes are copied unchanged.
** No nul-terminator is written.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int i;
  for(i=0; i<nByte; i++){
    char c = aIn[i];
    if( c>='A' && c<='Z' ) c += 32;
    aOut[i] = c;
  }
}
116 ** Tokenize some text using the ascii tokenizer.
118 static int fts5AsciiTokenize(
119 Fts5Tokenizer
*pTokenizer
,
122 const char *pText
, int nText
,
123 int (*xToken
)(void*, int, const char*, int nToken
, int iStart
, int iEnd
)
125 AsciiTokenizer
*p
= (AsciiTokenizer
*)pTokenizer
;
131 int nFold
= sizeof(aFold
);
133 unsigned char *a
= p
->aTokenChar
;
135 UNUSED_PARAM(iUnused
);
137 while( is
<nText
&& rc
==SQLITE_OK
){
140 /* Skip any leading divider characters. */
141 while( is
<nText
&& ((pText
[is
]&0x80)==0 && a
[(int)pText
[is
]]==0) ){
144 if( is
==nText
) break;
146 /* Count the token characters */
148 while( ie
<nText
&& ((pText
[ie
]&0x80) || a
[(int)pText
[ie
]] ) ){
152 /* Fold to lower case */
155 if( pFold
!=aFold
) sqlite3_free(pFold
);
156 pFold
= sqlite3_malloc64((sqlite3_int64
)nByte
*2);
163 asciiFold(pFold
, &pText
[is
], nByte
);
165 /* Invoke the token callback */
166 rc
= xToken(pCtx
, 0, pFold
, nByte
, is
, ie
);
170 if( pFold
!=aFold
) sqlite3_free(pFold
);
171 if( rc
==SQLITE_DONE
) rc
= SQLITE_OK
;
/**************************************************************************
** Start of unicode61 tokenizer implementation.
*/


/*
** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
** from the sqlite3 source file utf.c. If this file is compiled as part
** of the amalgamation, they are not required.
*/
#ifndef SQLITE_AMALGAMATION

/*
** Lookup table indexed by (lead byte - 0xC0). Each entry holds the payload
** bits contributed by the first byte of a multi-byte utf-8 sequence.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

/*
** Read one utf-8 encoded character from zIn into c, advancing zIn past it.
** Continuation bytes are consumed only while zIn!=zTerm. Over-long
** encodings of values below 0x80, surrogates (0xD800..0xDFFF) and the
** values 0xFFFE/0xFFFF are replaced by 0xFFFD. The first byte is read
** unconditionally, so the caller must ensure zIn is readable.
*/
#define READ_UTF8(zIn, zTerm, c)                           \
  c = *(zIn++);                                            \
  if( c>=0xc0 ){                                           \
    c = sqlite3Utf8Trans1[c-0xc0];                         \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
      c = (c<<6) + (0x3f & *(zIn++));                      \
    }                                                      \
    if( c<0x80                                             \
        || (c&0xFFFFF800)==0xD800                          \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
  }


/*
** Append the utf-8 encoding of codepoint c (1 to 4 bytes) at zOut,
** advancing zOut past the bytes written.
*/
#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (unsigned char)(c&0xFF);                 \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);     \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
  else if( c<0x10000 ){                                \
    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);    \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }else{                                               \
    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);  \
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);  \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
}

#endif /* ifndef SQLITE_AMALGAMATION */
/*
** Advance pointer zIn past one utf-8 character: one byte is always
** consumed and, if it was a lead byte (>=0xC0), so are any continuation
** bytes (10xxxxxx) that follow. The scan is not bounded, so the buffer
** must end in a non-continuation byte.
*/
#define FTS5_SKIP_UTF8(zIn) {                               \
  if( ((unsigned char)(*(zIn++)))>=0xc0 ){                              \
    while( (((unsigned char)*zIn) & 0xc0)==0x80 ){ zIn++; }             \
  }                                                    \
}
/*
** State of a "unicode61" tokenizer.
*/
typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int eRemoveDiacritic;           /* One of the FTS5_REMOVE_DIACRITICS_* values */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Sorted array of exception codepoints */

  unsigned char aCategory[32];    /* True for token char categories */
};
/* Values for eRemoveDiacritic (must match internals of fts5_unicode2.c) */
#define FTS5_REMOVE_DIACRITICS_NONE    0
#define FTS5_REMOVE_DIACRITICS_SIMPLE  1
#define FTS5_REMOVE_DIACRITICS_COMPLEX 2
256 static int fts5UnicodeAddExceptions(
257 Unicode61Tokenizer
*p
, /* Tokenizer object */
258 const char *z
, /* Characters to treat as exceptions */
259 int bTokenChars
/* 1 for 'tokenchars', 0 for 'separators' */
262 int n
= (int)strlen(z
);
266 aNew
= (int*)sqlite3_realloc64(p
->aiException
,
267 (n
+p
->nException
)*sizeof(int));
269 int nNew
= p
->nException
;
270 const unsigned char *zCsr
= (const unsigned char*)z
;
271 const unsigned char *zTerm
= (const unsigned char*)&z
[n
];
275 READ_UTF8(zCsr
, zTerm
, iCode
);
277 p
->aTokenChar
[iCode
] = (unsigned char)bTokenChars
;
279 bToken
= p
->aCategory
[sqlite3Fts5UnicodeCategory(iCode
)];
280 assert( (bToken
==0 || bToken
==1) );
281 assert( (bTokenChars
==0 || bTokenChars
==1) );
282 if( bToken
!=bTokenChars
&& sqlite3Fts5UnicodeIsdiacritic(iCode
)==0 ){
284 for(i
=0; i
<nNew
; i
++){
285 if( (u32
)aNew
[i
]>iCode
) break;
287 memmove(&aNew
[i
+1], &aNew
[i
], (nNew
-i
)*sizeof(int));
293 p
->aiException
= aNew
;
294 p
->nException
= nNew
;
304 ** Return true if the p->aiException[] array contains the value iCode.
306 static int fts5UnicodeIsException(Unicode61Tokenizer
*p
, int iCode
){
307 if( p
->nException
>0 ){
308 int *a
= p
->aiException
;
310 int iHi
= p
->nException
-1;
313 int iTest
= (iHi
+ iLo
) / 2;
314 if( iCode
==a
[iTest
] ){
316 }else if( iCode
>a
[iTest
] ){
328 ** Delete a "unicode61" tokenizer.
330 static void fts5UnicodeDelete(Fts5Tokenizer
*pTok
){
332 Unicode61Tokenizer
*p
= (Unicode61Tokenizer
*)pTok
;
333 sqlite3_free(p
->aiException
);
334 sqlite3_free(p
->aFold
);
340 static int unicodeSetCategories(Unicode61Tokenizer
*p
, const char *zCat
){
341 const char *z
= zCat
;
344 while( *z
==' ' || *z
=='\t' ) z
++;
345 if( *z
&& sqlite3Fts5UnicodeCatParse(z
, p
->aCategory
) ){
348 while( *z
!=' ' && *z
!='\t' && *z
!='\0' ) z
++;
351 sqlite3Fts5UnicodeAscii(p
->aCategory
, p
->aTokenChar
);
356 ** Create a "unicode61" tokenizer.
358 static int fts5UnicodeCreate(
360 const char **azArg
, int nArg
,
361 Fts5Tokenizer
**ppOut
363 int rc
= SQLITE_OK
; /* Return code */
364 Unicode61Tokenizer
*p
= 0; /* New tokenizer object */
366 UNUSED_PARAM(pUnused
);
371 p
= (Unicode61Tokenizer
*)sqlite3_malloc(sizeof(Unicode61Tokenizer
));
373 const char *zCat
= "L* N* Co";
375 memset(p
, 0, sizeof(Unicode61Tokenizer
));
377 p
->eRemoveDiacritic
= FTS5_REMOVE_DIACRITICS_SIMPLE
;
379 p
->aFold
= sqlite3_malloc64(p
->nFold
* sizeof(char));
384 /* Search for a "categories" argument */
385 for(i
=0; rc
==SQLITE_OK
&& i
<nArg
-1; i
+=2){
386 if( 0==sqlite3_stricmp(azArg
[i
], "categories") ){
391 rc
= unicodeSetCategories(p
, zCat
);
394 for(i
=0; rc
==SQLITE_OK
&& i
<nArg
-1; i
+=2){
395 const char *zArg
= azArg
[i
+1];
396 if( 0==sqlite3_stricmp(azArg
[i
], "remove_diacritics") ){
397 if( (zArg
[0]!='0' && zArg
[0]!='1' && zArg
[0]!='2') || zArg
[1] ){
400 p
->eRemoveDiacritic
= (zArg
[0] - '0');
401 assert( p
->eRemoveDiacritic
==FTS5_REMOVE_DIACRITICS_NONE
402 || p
->eRemoveDiacritic
==FTS5_REMOVE_DIACRITICS_SIMPLE
403 || p
->eRemoveDiacritic
==FTS5_REMOVE_DIACRITICS_COMPLEX
407 if( 0==sqlite3_stricmp(azArg
[i
], "tokenchars") ){
408 rc
= fts5UnicodeAddExceptions(p
, zArg
, 1);
410 if( 0==sqlite3_stricmp(azArg
[i
], "separators") ){
411 rc
= fts5UnicodeAddExceptions(p
, zArg
, 0);
413 if( 0==sqlite3_stricmp(azArg
[i
], "categories") ){
419 if( i
<nArg
&& rc
==SQLITE_OK
) rc
= SQLITE_ERROR
;
425 fts5UnicodeDelete((Fts5Tokenizer
*)p
);
428 *ppOut
= (Fts5Tokenizer
*)p
;
434 ** Return true if, for the purposes of tokenizing with the tokenizer
435 ** passed as the first argument, codepoint iCode is considered a token
436 ** character (not a separator).
438 static int fts5UnicodeIsAlnum(Unicode61Tokenizer
*p
, int iCode
){
440 p
->aCategory
[sqlite3Fts5UnicodeCategory((u32
)iCode
)]
441 ^ fts5UnicodeIsException(p
, iCode
)
445 static int fts5UnicodeTokenize(
446 Fts5Tokenizer
*pTokenizer
,
449 const char *pText
, int nText
,
450 int (*xToken
)(void*, int, const char*, int nToken
, int iStart
, int iEnd
)
452 Unicode61Tokenizer
*p
= (Unicode61Tokenizer
*)pTokenizer
;
454 unsigned char *a
= p
->aTokenChar
;
456 unsigned char *zTerm
= (unsigned char*)&pText
[nText
];
457 unsigned char *zCsr
= (unsigned char *)pText
;
460 char *aFold
= p
->aFold
;
461 int nFold
= p
->nFold
;
462 const char *pEnd
= &aFold
[nFold
-6];
464 UNUSED_PARAM(iUnused
);
466 /* Each iteration of this loop gobbles up a contiguous run of separators,
467 ** then the next token. */
468 while( rc
==SQLITE_OK
){
469 u32 iCode
; /* non-ASCII codepoint read from input */
474 /* Skip any separator characters. */
476 if( zCsr
>=zTerm
) goto tokenize_done
;
478 /* A character outside of the ascii range. Skip past it if it is
479 ** a separator character. Or break out of the loop if it is not. */
480 is
= zCsr
- (unsigned char*)pText
;
481 READ_UTF8(zCsr
, zTerm
, iCode
);
482 if( fts5UnicodeIsAlnum(p
, iCode
) ){
483 goto non_ascii_tokenchar
;
487 is
= zCsr
- (unsigned char*)pText
;
488 goto ascii_tokenchar
;
494 /* Run through the tokenchars. Fold them into the output buffer along
498 /* Grow the output buffer so that there is sufficient space to fit the
499 ** largest possible utf-8 character. */
501 aFold
= sqlite3_malloc64((sqlite3_int64
)nFold
*2);
506 zOut
= &aFold
[zOut
- p
->aFold
];
507 memcpy(aFold
, p
->aFold
, nFold
);
508 sqlite3_free(p
->aFold
);
510 p
->nFold
= nFold
= nFold
*2;
511 pEnd
= &aFold
[nFold
-6];
515 /* An non-ascii-range character. Fold it into the output buffer if
516 ** it is a token character, or break out of the loop if it is not. */
517 READ_UTF8(zCsr
, zTerm
, iCode
);
518 if( fts5UnicodeIsAlnum(p
,iCode
)||sqlite3Fts5UnicodeIsdiacritic(iCode
) ){
520 iCode
= sqlite3Fts5UnicodeFold(iCode
, p
->eRemoveDiacritic
);
521 if( iCode
) WRITE_UTF8(zOut
, iCode
);
525 }else if( a
[*zCsr
]==0 ){
526 /* An ascii-range separator character. End of token. */
530 if( *zCsr
>='A' && *zCsr
<='Z' ){
531 *zOut
++ = *zCsr
+ 32;
537 ie
= zCsr
- (unsigned char*)pText
;
540 /* Invoke the token callback */
541 rc
= xToken(pCtx
, 0, aFold
, zOut
-aFold
, is
, ie
);
545 if( rc
==SQLITE_DONE
) rc
= SQLITE_OK
;
/**************************************************************************
** Start of porter stemmer implementation.
*/

/* Any tokens larger than this (in bytes) are passed through without
** stemming. */
#define FTS5_PORTER_MAX_TOKEN 64
557 typedef struct PorterTokenizer PorterTokenizer
;
558 struct PorterTokenizer
{
559 fts5_tokenizer tokenizer
; /* Parent tokenizer module */
560 Fts5Tokenizer
*pTokenizer
; /* Parent tokenizer instance */
561 char aBuf
[FTS5_PORTER_MAX_TOKEN
+ 64];
565 ** Delete a "porter" tokenizer.
567 static void fts5PorterDelete(Fts5Tokenizer
*pTok
){
569 PorterTokenizer
*p
= (PorterTokenizer
*)pTok
;
571 p
->tokenizer
.xDelete(p
->pTokenizer
);
578 ** Create a "porter" tokenizer.
580 static int fts5PorterCreate(
582 const char **azArg
, int nArg
,
583 Fts5Tokenizer
**ppOut
585 fts5_api
*pApi
= (fts5_api
*)pCtx
;
587 PorterTokenizer
*pRet
;
589 const char *zBase
= "unicode61";
595 pRet
= (PorterTokenizer
*)sqlite3_malloc(sizeof(PorterTokenizer
));
597 memset(pRet
, 0, sizeof(PorterTokenizer
));
598 rc
= pApi
->xFindTokenizer(pApi
, zBase
, &pUserdata
, &pRet
->tokenizer
);
603 int nArg2
= (nArg
>0 ? nArg
-1 : 0);
604 const char **azArg2
= (nArg2
? &azArg
[1] : 0);
605 rc
= pRet
->tokenizer
.xCreate(pUserdata
, azArg2
, nArg2
, &pRet
->pTokenizer
);
609 fts5PorterDelete((Fts5Tokenizer
*)pRet
);
612 *ppOut
= (Fts5Tokenizer
*)pRet
;
/*
** Context passed to fts5PorterCb(): the caller's token callback and its
** context pointer, plus the scratch buffer used while stemming.
*/
typedef struct PorterContext PorterContext;
struct PorterContext {
  void *pCtx;                     /* Context pointer to pass to xToken */
  int (*xToken)(void*, int, const char*, int, int, int);
  char *aBuf;                     /* Buffer in which tokens are stemmed */
};
/*
** A single suffix-replacement rule: if a word ends in zSuffix and the
** remaining stem satisfies xCond (or xCond is NULL), the suffix is
** replaced by zOutput.
*/
typedef struct PorterRule PorterRule;
struct PorterRule {
  const char *zSuffix;            /* Suffix to match */
  int nSuffix;                    /* Length of zSuffix in bytes */
  int (*xCond)(char *zStem, int nStem);   /* Condition on stem, or NULL */
  const char *zOutput;            /* Replacement text */
  int nOutput;                    /* Length of zOutput in bytes */
};
/*
** Apply the first rule in array aRule[] (terminated by an entry with
** zSuffix==0) whose suffix matches the end of the nBuf byte word in aBuf.
** If that rule's condition holds, the suffix is replaced in place, *pnBuf
** is updated and the index of the rule is returned. Otherwise -1 is
** returned and the buffer is left unchanged.
**
** No caller of this function is visible in this file - stemming is done
** by the fts5PorterStepN() functions - so it is compiled out to avoid an
** unused-function warning.
*/
#if 0
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int ret = -1;
  int nBuf = *pnBuf;
  PorterRule *p;

  for(p=aRule; p->zSuffix; p++){
    assert( strlen(p->zSuffix)==p->nSuffix );
    assert( strlen(p->zOutput)==p->nOutput );
    if( nBuf<p->nSuffix ) continue;
    if( 0==memcmp(&aBuf[nBuf - p->nSuffix], p->zSuffix, p->nSuffix) ) break;
  }

  if( p->zSuffix ){
    int nStem = nBuf - p->nSuffix;
    if( p->xCond==0 || p->xCond(aBuf, nStem) ){
      memcpy(&aBuf[nStem], p->zOutput, p->nOutput);
      *pnBuf = nStem + p->nOutput;
      ret = p - aRule;
    }
  }

  return ret;
}
#endif
/*
** Return true if c is one of the lower-case vowels a, e, i, o, u - or if
** c is 'y' and bYIsVowel is true. In the Porter algorithm 'y' counts as
** a vowel only when it follows a consonant, so callers pass "previous
** character was a consonant" as bYIsVowel.
*/
static int fts5PorterIsVowel(char c, int bYIsVowel){
  return (
      c=='a' || c=='e' || c=='i' || c=='o' || c=='u' || (bYIsVowel && c=='y')
  );
}
/*
** Scan the nStem byte string zStem for the first vowel and then for the
** first consonant that follows it. If both are found, return the offset
** of the byte following that consonant. Otherwise return 0.
**
** bPrevCons is true if the character preceding zStem[0] is to be treated
** as a consonant (this affects whether a leading 'y' is a vowel).
*/
static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
  int i;
  int bCons = bPrevCons;

  /* Scan for a vowel */
  for(i=0; i<nStem; i++){
    if( 0==(bCons = !fts5PorterIsVowel(zStem[i], bCons)) ) break;
  }

  /* Scan for a consonant */
  for(i++; i<nStem; i++){
    if( (bCons = !fts5PorterIsVowel(zStem[i], bCons)) ) return i+1;
  }
  return 0;
}
/* porter rule condition: (m > 0)
**
** True if the stem contains at least one vowel followed later by a
** consonant. */
static int fts5Porter_MGt0(char *zStem, int nStem){
  return !!fts5PorterGobbleVC(zStem, nStem, 0);
}
/* porter rule condition: (m > 1)
**
** True if two successive vowel-consonant sequences can be consumed from
** the stem. */
static int fts5Porter_MGt1(char *zStem, int nStem){
  int n;
  n = fts5PorterGobbleVC(zStem, nStem, 0);
  if( n && fts5PorterGobbleVC(&zStem[n], nStem-n, 1) ){
    return 1;
  }
  return 0;
}
/* porter rule condition: (m = 1)
**
** True if exactly one vowel-consonant sequence can be consumed from the
** stem: the first scan succeeds and a second scan of the remainder
** fails. */
static int fts5Porter_MEq1(char *zStem, int nStem){
  int n;
  n = fts5PorterGobbleVC(zStem, nStem, 0);
  if( n && 0==fts5PorterGobbleVC(&zStem[n], nStem-n, 1) ){
    return 1;
  }
  return 0;
}
/* porter rule condition: (*o)
**
** True if the stem ends consonant-vowel-consonant and the final consonant
** is not 'w', 'x' or 'y'. An empty stem never matches.
**
** Only the three most recent consonant flags matter, so the history mask
** is truncated to 3 bits on every step. This keeps the shift well-defined
** however long the stem is (an unbounded signed "mask << 1" overflows
** once more than 31 characters have been processed). */
static int fts5Porter_Ostar(char *zStem, int nStem){
  if( nStem<=0 ) return 0;
  if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
    return 0;
  }else{
    int i;
    unsigned int mask = 0;        /* Bit set for each consonant, LSB newest */
    int bCons = 0;
    for(i=0; i<nStem; i++){
      bCons = !fts5PorterIsVowel(zStem[i], bCons);
      assert( bCons==0 || bCons==1 );
      mask = ((mask << 1) + (unsigned int)bCons) & 0x0007;
    }
    return (mask==0x0005);
  }
}
/* porter rule condition: (m > 1 and (*S or *T))
**
** True if the stem ends in 's' or 't' and satisfies (m > 1). The stem
** must be at least one byte long. */
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  assert( nStem>0 );
  return (zStem[nStem-1]=='s' || zStem[nStem-1]=='t')
      && fts5Porter_MGt1(zStem, nStem);
}
/* porter rule condition: (*v*)
**
** True if the stem contains a vowel. A 'y' counts as a vowel anywhere
** except in the first position. */
static int fts5Porter_Vowel(char *zStem, int nStem){
  int i;
  for(i=0; i<nStem; i++){
    if( fts5PorterIsVowel(zStem[i], i>0) ){
      return 1;
    }
  }
  return 0;
}
/**************************************************************************
***************************************************************************
** GENERATED CODE STARTS HERE (mkportersteps.tcl)
*/

/*
** Porter step 4: remove one of a fixed list of suffixes if the remaining
** stem satisfies (m > 1) - or, for "ion", (m > 1) and ends in 's' or 't'.
** Rules are selected by the second-to-last character of the word. Only
** the first suffix that matches within a case is considered.
**
** *pnBuf is the length of the word in aBuf and is updated if a suffix is
** removed. The caller must ensure *pnBuf>=2. Always returns 0.
*/
static int fts5PorterStep4(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'e':
      if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'l':
      if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt1(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'o':
      if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 's':
      if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 't':
      if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'z':
      if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

  }
  return ret;
}
/*
** Porter step 1b, second part: after "ed" or "ing" has been removed,
** restore a trailing 'e' to words that now end in "at", "bl" or "iz".
**
** *pnBuf is the length of the word in aBuf and is updated if the word is
** modified. The caller must ensure *pnBuf>=2 and that aBuf has room for
** one more byte. Returns 1 if a rule was applied, 0 otherwise.
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ate", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'b':
      if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ble", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ize", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

  }
  return ret;
}
/*
** Porter step 2: map one of a fixed list of double suffixes to a shorter
** form (e.g. "ational" -> "ate") if the remaining stem satisfies (m > 0).
** Rules are selected by the second-to-last character of the word. Only
** the first suffix that matches within a case is considered.
**
** *pnBuf is the length of the word in aBuf and is updated if the word is
** modified. The caller must ensure *pnBuf>=2. Always returns 0.
*/
static int fts5PorterStep2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ate", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "tion", 4);
          *pnBuf = nBuf - 6 + 4;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ence", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ance", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }
      break;

    case 'e':
      if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ize", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'g':
      if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "log", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'l':
      if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ble", 3);
          *pnBuf = nBuf - 3 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "al", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ent", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "e", 1);
          *pnBuf = nBuf - 3 + 1;
        }
      }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ous", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }
      break;

    case 'o':
      if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ize", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ate", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ate", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 's':
      if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ive", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ful", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ous", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ive", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "ble", 3);
          *pnBuf = nBuf - 6 + 3;
        }
      }
      break;

  }
  return ret;
}
/*
** Porter step 3: shorten or remove one of a fixed list of suffixes (e.g.
** "icate" -> "ic", "ness" -> "") if the remaining stem satisfies (m > 0).
** Rules are selected by the second-to-last character of the word. Only
** the first suffix that matches within a case is considered.
**
** *pnBuf is the length of the word in aBuf and is updated if the word is
** modified. The caller must ensure *pnBuf>=2. Always returns 0.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ic", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }
      break;

    case 's':
      if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }
      break;

    case 'z':
      if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

  }
  return ret;
}
/*
** Porter step 1b, first part: "eed" -> "ee" if the stem satisfies (m > 0);
** otherwise remove "ed" or "ing" if the remaining stem contains a vowel.
**
** *pnBuf is the length of the word in aBuf and is updated if the word is
** modified. The caller must ensure *pnBuf>=2. Returns 1 if "ed" or "ing"
** was removed (in which case the caller goes on to apply the second part
** of step 1b), or 0 otherwise - including when "eed" was rewritten.
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  switch( aBuf[nBuf-2] ){

    case 'e':
      if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ee", 2);
          *pnBuf = nBuf - 3 + 2;
        }
      }else if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_Vowel(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
          ret = 1;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_Vowel(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
          ret = 1;
        }
      }
      break;

  }
  return ret;
}

/*
** GENERATED CODE ENDS HERE (mkportersteps.tcl)
***************************************************************************
**************************************************************************/
/*
** Porter step 1a: plural removal.
**
**   "sses" -> "ss"        "ies" -> "i"
**   "ss"   -> "ss"        "s"   -> ""
**
** *pnBuf is the length of the word in aBuf; only the length is modified.
** The caller must ensure *pnBuf>=2.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;
  if( aBuf[nBuf-1]=='s' ){
    if( aBuf[nBuf-2]=='e' ){
      if( (nBuf>4 && aBuf[nBuf-4]=='s' && aBuf[nBuf-3]=='s')
       || (nBuf>3 && aBuf[nBuf-3]=='i' )
      ){
        *pnBuf = nBuf - 2;
      }else{
        *pnBuf = nBuf - 1;
      }
    }
    else if( aBuf[nBuf-2]!='s' ){
      *pnBuf = nBuf - 1;
    }
  }
}
1182 static int fts5PorterCb(
1190 PorterContext
*p
= (PorterContext
*)pCtx
;
1195 if( nToken
>FTS5_PORTER_MAX_TOKEN
|| nToken
<3 ) goto pass_through
;
1198 memcpy(aBuf
, pToken
, nBuf
);
1201 fts5PorterStep1A(aBuf
, &nBuf
);
1202 if( fts5PorterStep1B(aBuf
, &nBuf
) ){
1203 if( fts5PorterStep1B2(aBuf
, &nBuf
)==0 ){
1204 char c
= aBuf
[nBuf
-1];
1205 if( fts5PorterIsVowel(c
, 0)==0
1206 && c
!='l' && c
!='s' && c
!='z' && c
==aBuf
[nBuf
-2]
1209 }else if( fts5Porter_MEq1(aBuf
, nBuf
) && fts5Porter_Ostar(aBuf
, nBuf
) ){
1216 if( aBuf
[nBuf
-1]=='y' && fts5Porter_Vowel(aBuf
, nBuf
-1) ){
1220 /* Steps 2 through 4. */
1221 fts5PorterStep2(aBuf
, &nBuf
);
1222 fts5PorterStep3(aBuf
, &nBuf
);
1223 fts5PorterStep4(aBuf
, &nBuf
);
1227 if( aBuf
[nBuf
-1]=='e' ){
1228 if( fts5Porter_MGt1(aBuf
, nBuf
-1)
1229 || (fts5Porter_MEq1(aBuf
, nBuf
-1) && !fts5Porter_Ostar(aBuf
, nBuf
-1))
1236 if( nBuf
>1 && aBuf
[nBuf
-1]=='l'
1237 && aBuf
[nBuf
-2]=='l' && fts5Porter_MGt1(aBuf
, nBuf
-1)
1242 return p
->xToken(p
->pCtx
, tflags
, aBuf
, nBuf
, iStart
, iEnd
);
1245 return p
->xToken(p
->pCtx
, tflags
, pToken
, nToken
, iStart
, iEnd
);
1249 ** Tokenize using the porter tokenizer.
1251 static int fts5PorterTokenize(
1252 Fts5Tokenizer
*pTokenizer
,
1255 const char *pText
, int nText
,
1256 int (*xToken
)(void*, int, const char*, int nToken
, int iStart
, int iEnd
)
1258 PorterTokenizer
*p
= (PorterTokenizer
*)pTokenizer
;
1260 sCtx
.xToken
= xToken
;
1262 sCtx
.aBuf
= p
->aBuf
;
1263 return p
->tokenizer
.xTokenize(
1264 p
->pTokenizer
, (void*)&sCtx
, flags
, pText
, nText
, fts5PorterCb
/**************************************************************************
** Start of trigram implementation.
*/
typedef struct TrigramTokenizer TrigramTokenizer;
struct TrigramTokenizer {
  int bFold;                      /* True to fold to lower-case */
  int iFoldParam;                 /* Parameter to pass to Fts5UnicodeFold() */
};
1278 ** Free a trigram tokenizer.
1280 static void fts5TriDelete(Fts5Tokenizer
*p
){
1285 ** Allocate a trigram tokenizer.
1287 static int fts5TriCreate(
1291 Fts5Tokenizer
**ppOut
1294 TrigramTokenizer
*pNew
= (TrigramTokenizer
*)sqlite3_malloc(sizeof(*pNew
));
1295 UNUSED_PARAM(pUnused
);
1301 pNew
->iFoldParam
= 0;
1302 for(i
=0; rc
==SQLITE_OK
&& i
<nArg
-1; i
+=2){
1303 const char *zArg
= azArg
[i
+1];
1304 if( 0==sqlite3_stricmp(azArg
[i
], "case_sensitive") ){
1305 if( (zArg
[0]!='0' && zArg
[0]!='1') || zArg
[1] ){
1308 pNew
->bFold
= (zArg
[0]=='0');
1310 }else if( 0==sqlite3_stricmp(azArg
[i
], "remove_diacritics") ){
1311 if( (zArg
[0]!='0' && zArg
[0]!='1' && zArg
[0]!='2') || zArg
[1] ){
1314 pNew
->iFoldParam
= (zArg
[0]!='0') ? 2 : 0;
1320 if( i
<nArg
&& rc
==SQLITE_OK
) rc
= SQLITE_ERROR
;
1322 if( pNew
->iFoldParam
!=0 && pNew
->bFold
==0 ){
1326 if( rc
!=SQLITE_OK
){
1327 fts5TriDelete((Fts5Tokenizer
*)pNew
);
1331 *ppOut
= (Fts5Tokenizer
*)pNew
;
1336 ** Trigram tokenizer tokenize routine.
1338 static int fts5TriTokenize(
1339 Fts5Tokenizer
*pTok
,
1342 const char *pText
, int nText
,
1343 int (*xToken
)(void*, int, const char*, int, int, int)
1345 TrigramTokenizer
*p
= (TrigramTokenizer
*)pTok
;
1350 const unsigned char *zIn
= (const unsigned char*)pText
;
1351 const unsigned char *zEof
= &zIn
[nText
];
1353 int aStart
[3]; /* Input offset of each character in aBuf[] */
1355 UNUSED_PARAM(unusedFlags
);
1357 /* Populate aBuf[] with the characters for the first trigram. */
1358 for(ii
=0; ii
<3; ii
++){
1360 aStart
[ii
] = zIn
- (const unsigned char*)pText
;
1361 READ_UTF8(zIn
, zEof
, iCode
);
1362 if( iCode
==0 ) return SQLITE_OK
;
1363 if( p
->bFold
) iCode
= sqlite3Fts5UnicodeFold(iCode
, p
->iFoldParam
);
1365 WRITE_UTF8(zOut
, iCode
);
1368 /* At the start of each iteration of this loop:
1370 ** aBuf: Contains 3 characters. The 3 characters of the next trigram.
1371 ** zOut: Points to the byte following the last character in aBuf.
1372 ** aStart[3]: Contains the byte offset in the input text corresponding
1373 ** to the start of each of the three characters in the buffer.
1375 assert( zIn
<=zEof
);
1377 int iNext
; /* Start of character following current tri */
1380 /* Read characters from the input up until the first non-diacritic */
1382 iNext
= zIn
- (const unsigned char*)pText
;
1383 READ_UTF8(zIn
, zEof
, iCode
);
1384 if( iCode
==0 ) break;
1385 if( p
->bFold
) iCode
= sqlite3Fts5UnicodeFold(iCode
, p
->iFoldParam
);
1388 /* Pass the current trigram back to fts5 */
1389 rc
= xToken(pCtx
, 0, aBuf
, zOut
-aBuf
, aStart
[0], iNext
);
1390 if( iCode
==0 || rc
!=SQLITE_OK
) break;
1392 /* Remove the first character from buffer aBuf[]. Append the character
1393 ** with codepoint iCode. */
1396 memmove(aBuf
, z1
, zOut
- z1
);
1397 zOut
-= (z1
- aBuf
);
1398 WRITE_UTF8(zOut
, iCode
);
1400 /* Update the aStart[] array */
1401 aStart
[0] = aStart
[1];
1402 aStart
[1] = aStart
[2];
1410 ** Argument xCreate is a pointer to a constructor function for a tokenizer.
1411 ** pTok is a tokenizer previously created using the same method. This function
1412 ** returns one of FTS5_PATTERN_NONE, FTS5_PATTERN_LIKE or FTS5_PATTERN_GLOB
1413 ** indicating the style of pattern matching that the tokenizer can support.
1414 ** In practice, this is:
1416 ** "trigram" tokenizer, case_sensitive=1 - FTS5_PATTERN_GLOB
1417 ** "trigram" tokenizer, case_sensitive=0 (the default) - FTS5_PATTERN_LIKE
1418 ** all other tokenizers - FTS5_PATTERN_NONE
1420 int sqlite3Fts5TokenizerPattern(
1421 int (*xCreate
)(void*, const char**, int, Fts5Tokenizer
**),
1424 if( xCreate
==fts5TriCreate
){
1425 TrigramTokenizer
*p
= (TrigramTokenizer
*)pTok
;
1426 if( p
->iFoldParam
==0 ){
1427 return p
->bFold
? FTS5_PATTERN_LIKE
: FTS5_PATTERN_GLOB
;
1430 return FTS5_PATTERN_NONE
;
1434 ** Register all built-in tokenizers with FTS5.
1436 int sqlite3Fts5TokenizerInit(fts5_api
*pApi
){
1437 struct BuiltinTokenizer
{
1441 { "unicode61", {fts5UnicodeCreate
, fts5UnicodeDelete
, fts5UnicodeTokenize
}},
1442 { "ascii", {fts5AsciiCreate
, fts5AsciiDelete
, fts5AsciiTokenize
}},
1443 { "porter", {fts5PorterCreate
, fts5PorterDelete
, fts5PorterTokenize
}},
1444 { "trigram", {fts5TriCreate
, fts5TriDelete
, fts5TriTokenize
}},
1447 int rc
= SQLITE_OK
; /* Return code */
1448 int i
; /* To iterate through builtin functions */
1450 for(i
=0; rc
==SQLITE_OK
&& i
<ArraySize(aBuiltin
); i
++){
1451 rc
= pApi
->xCreateTokenizer(pApi
,