ext/fts5/fts5_tokenize.c

   1 /*
   2 ** 2014 May 31
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 ******************************************************************************
  12 */
  13
  14
  15 #include "fts5Int.h"
  16
  17 /**************************************************************************
  18 ** Start of ascii tokenizer implementation.
  19 */
  20
  21 /*
  22 ** For tokenizers with no "unicode" modifier, the set of token characters
  23 ** is the same as the set of ASCII range alphanumeric characters.
  24 */
  25 static unsigned char aAsciiTokenChar[128] = {
  26   0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  27   0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  28   0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  29   1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  30   0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  31   1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  32   0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  33   1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
  34 };
  35
  36 typedef struct AsciiTokenizer AsciiTokenizer;
  37 struct AsciiTokenizer {
  38   unsigned char aTokenChar[128];
  39 };
  40
  41 static void fts5AsciiAddExceptions(
  42   AsciiTokenizer *p,
  43   const char *zArg,
  44   int bTokenChars
  45 ){
  46   int i;
  47   for(i=0; zArg[i]; i++){
  48     if( (zArg[i] & 0x80)==0 ){
  49       p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars;
  50     }
  51   }
  52 }
  53
  54 /*
  55 ** Delete a "ascii" tokenizer.
  56 */
  57 static void fts5AsciiDelete(Fts5Tokenizer *p){
  58   sqlite3_free(p);
  59 }
  60
  61 /*
  62 ** Create an "ascii" tokenizer.
  63 */
  64 static int fts5AsciiCreate(
  65   void *pUnused,
  66   const char **azArg, int nArg,
  67   Fts5Tokenizer **ppOut
  68 ){
  69   int rc = SQLITE_OK;
  70   AsciiTokenizer *p = 0;
  71   UNUSED_PARAM(pUnused);
  72   if( nArg%2 ){
  73     rc = SQLITE_ERROR;
  74   }else{
  75     p = sqlite3_malloc(sizeof(AsciiTokenizer));
  76     if( p==0 ){
  77       rc = SQLITE_NOMEM;
  78     }else{
  79       int i;
  80       memset(p, 0, sizeof(AsciiTokenizer));
  81       memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
  82       for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
  83         const char *zArg = azArg[i+1];
  84         if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
  85           fts5AsciiAddExceptions(p, zArg, 1);
  86         }else
  87         if( 0==sqlite3_stricmp(azArg[i], "separators") ){
  88           fts5AsciiAddExceptions(p, zArg, 0);
  89         }else{
  90           rc = SQLITE_ERROR;
  91         }
  92       }
  93       if( rc!=SQLITE_OK ){
  94         fts5AsciiDelete((Fts5Tokenizer*)p);
  95         p = 0;
  96       }
  97     }
  98   }
  99
 100   *ppOut = (Fts5Tokenizer*)p;
 101   return rc;
 102 }
 103
 104
 105 static void asciiFold(char *aOut, const char *aIn, int nByte){
 106   int i;
 107   for(i=0; i<nByte; i++){
 108     char c = aIn[i];
 109     if( c>='A' && c<='Z' ) c += 32;
 110     aOut[i] = c;
 111   }
 112 }
 113
 114 /*
 115 ** Tokenize some text using the ascii tokenizer.
 116 */
 117 static int fts5AsciiTokenize(
 118   Fts5Tokenizer *pTokenizer,
 119   void *pCtx,
 120   int iUnused,
 121   const char *pText, int nText,
 122   int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
 123 ){
 124   AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
 125   int rc = SQLITE_OK;
 126   int ie;
 127   int is = 0;
 128
 129   char aFold[64];
 130   int nFold = sizeof(aFold);
 131   char *pFold = aFold;
 132   unsigned char *a = p->aTokenChar;
 133
 134   UNUSED_PARAM(iUnused);
 135
 136   while( is<nText && rc==SQLITE_OK ){
 137     int nByte;
 138
 139     /* Skip any leading divider characters. */
 140     while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
 141       is++;
 142     }
 143     if( is==nText ) break;
 144
 145     /* Count the token characters */
 146     ie = is+1;
 147     while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
 148       ie++;
 149     }
 150
 151     /* Fold to lower case */
 152     nByte = ie-is;
 153     if( nByte>nFold ){
 154       if( pFold!=aFold ) sqlite3_free(pFold);
 155       pFold = sqlite3_malloc64((sqlite3_int64)nByte*2);
 156       if( pFold==0 ){
 157         rc = SQLITE_NOMEM;
 158         break;
 159       }
 160       nFold = nByte*2;
 161     }
 162     asciiFold(pFold, &pText[is], nByte);
 163
 164     /* Invoke the token callback */
 165     rc = xToken(pCtx, 0, pFold, nByte, is, ie);
 166     is = ie+1;
 167   }
 168
 169   if( pFold!=aFold ) sqlite3_free(pFold);
 170   if( rc==SQLITE_DONE ) rc = SQLITE_OK;
 171   return rc;
 172 }
 173
 174 /**************************************************************************
 175 ** Start of unicode61 tokenizer implementation.
 176 */
 177
 178
 179 /*
 180 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
 181 ** from the sqlite3 source file utf.c. If this file is compiled as part
 182 ** of the amalgamation, they are not required.
 183 */
 184 #ifndef SQLITE_AMALGAMATION
 185
 186 static const unsigned char sqlite3Utf8Trans1[] = {
 187   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
 188   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
 189   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
 190   0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
 191   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
 192   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
 193   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
 194   0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
 195 };
 196
/*
** READ_UTF8(zIn, zTerm, c): decode one UTF-8 character.
**
**   zIn    Pointer to the next input byte. It is advanced past every byte
**          consumed.
**   zTerm  Pointer one past the last input byte. Continuation bytes are
**          only consumed while zIn!=zTerm; the first byte is read
**          unconditionally, so the caller must ensure zIn!=zTerm on entry.
**   c      Integer lvalue that receives the decoded codepoint.
**
** A first byte below 0xC0 is stored in c unchanged. For a lead byte of
** 0xC0 or greater, all following continuation bytes (10xxxxxx) are
** accumulated. If the result is less than 0x80, lies in the range
** 0xD800..0xDFFF, or is 0xFFFE or 0xFFFF, it is replaced by 0xFFFD.
**
** The macro expands to several statements without enclosing braces and
** evaluates its arguments more than once, so it must be used as a
** stand-alone statement with side-effect-free arguments.
*/
#define READ_UTF8(zIn, zTerm, c)                           \
  c = *(zIn++);                                            \
  if( c>=0xc0 ){                                           \
    c = sqlite3Utf8Trans1[c-0xc0];                         \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
      c = (c<<6) + (0x3f & *(zIn++));                      \
    }                                                      \
    if( c<0x80                                             \
        || (c&0xFFFFF800)==0xD800                          \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
  }
 208
 209
/*
** WRITE_UTF8(zOut, c): append codepoint c to the buffer at zOut, encoded
** as UTF-8, and advance zOut past the bytes written.
**
** One byte is written for c<0x80, two for c<0x800, three for c<0x10000
** and four otherwise. No bounds checking is done; the caller must ensure
** that at least four bytes are available at zOut. Both arguments are
** evaluated more than once.
*/
#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (unsigned char)(c&0xFF);                 \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);     \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
  else if( c<0x10000 ){                                \
    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);    \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }else{                                               \
    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);  \
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);  \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
}
 229
 230 #endif /* ifndef SQLITE_AMALGAMATION */
 231
/*
** FTS5_SKIP_UTF8(zIn): advance pointer zIn past one UTF-8 character
** without decoding it. One byte is always consumed; if that byte is 0xC0
** or greater, all immediately following continuation bytes (10xxxxxx) are
** consumed as well.
**
** There is no end-of-buffer argument: the scan stops only at a byte that
** is not a continuation byte.
** NOTE(review): callers presumably guarantee such a byte (e.g. a nul
** terminator) follows the text - confirm at each call site.
*/
#define FTS5_SKIP_UTF8(zIn) {                               \
  if( ((unsigned char)(*(zIn++)))>=0xc0 ){                              \
    while( (((unsigned char)*zIn) & 0xc0)==0x80 ){ zIn++; }             \
  }                                                    \
}
 237
 238 typedef struct Unicode61Tokenizer Unicode61Tokenizer;
 239 struct Unicode61Tokenizer {
 240   unsigned char aTokenChar[128];  /* ASCII range token characters */
 241   char *aFold;                    /* Buffer to fold text into */
 242   int nFold;                      /* Size of aFold[] in bytes */
 243   int eRemoveDiacritic;           /* True if remove_diacritics=1 is set */
 244   int nException;
 245   int *aiException;
 246
 247   unsigned char aCategory[32];    /* True for token char categories */
 248 };
 249
/* Values for eRemoveDiacritic (must match internals of fts5_unicode2.c).
** fts5UnicodeCreate() derives the value directly from the digit supplied
** for the "remove_diacritics" option ('0', '1' or '2') and uses SIMPLE
** when the option is absent. The value is passed through to
** sqlite3Fts5UnicodeFold() when folding non-ASCII token characters. */
#define FTS5_REMOVE_DIACRITICS_NONE    0
#define FTS5_REMOVE_DIACRITICS_SIMPLE  1
#define FTS5_REMOVE_DIACRITICS_COMPLEX 2
 254
 255 static int fts5UnicodeAddExceptions(
 256   Unicode61Tokenizer *p,          /* Tokenizer object */
 257   const char *z,                  /* Characters to treat as exceptions */
 258   int bTokenChars                 /* 1 for 'tokenchars', 0 for 'separators' */
 259 ){
 260   int rc = SQLITE_OK;
 261   int n = (int)strlen(z);
 262   int *aNew;
 263
 264   if( n>0 ){
 265     aNew = (int*)sqlite3_realloc64(p->aiException,
 266                                    (n+p->nException)*sizeof(int));
 267     if( aNew ){
 268       int nNew = p->nException;
 269       const unsigned char *zCsr = (const unsigned char*)z;
 270       const unsigned char *zTerm = (const unsigned char*)&z[n];
 271       while( zCsr<zTerm ){
 272         u32 iCode;
 273         int bToken;
 274         READ_UTF8(zCsr, zTerm, iCode);
 275         if( iCode<128 ){
 276           p->aTokenChar[iCode] = (unsigned char)bTokenChars;
 277         }else{
 278           bToken = p->aCategory[sqlite3Fts5UnicodeCategory(iCode)];
 279           assert( (bToken==0 || bToken==1) );
 280           assert( (bTokenChars==0 || bTokenChars==1) );
 281           if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
 282             int i;
 283             for(i=0; i<nNew; i++){
 284               if( (u32)aNew[i]>iCode ) break;
 285             }
 286             memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
 287             aNew[i] = iCode;
 288             nNew++;
 289           }
 290         }
 291       }
 292       p->aiException = aNew;
 293       p->nException = nNew;
 294     }else{
 295       rc = SQLITE_NOMEM;
 296     }
 297   }
 298
 299   return rc;
 300 }
 301
 302 /*
 303 ** Return true if the p->aiException[] array contains the value iCode.
 304 */
 305 static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
 306   if( p->nException>0 ){
 307     int *a = p->aiException;
 308     int iLo = 0;
 309     int iHi = p->nException-1;
 310
 311     while( iHi>=iLo ){
 312       int iTest = (iHi + iLo) / 2;
 313       if( iCode==a[iTest] ){
 314         return 1;
 315       }else if( iCode>a[iTest] ){
 316         iLo = iTest+1;
 317       }else{
 318         iHi = iTest-1;
 319       }
 320     }
 321   }
 322
 323   return 0;
 324 }
 325
 326 /*
 327 ** Delete a "unicode61" tokenizer.
 328 */
 329 static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
 330   if( pTok ){
 331     Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
 332     sqlite3_free(p->aiException);
 333     sqlite3_free(p->aFold);
 334     sqlite3_free(p);
 335   }
 336   return;
 337 }
 338
 339 static int unicodeSetCategories(Unicode61Tokenizer *p, const char *zCat){
 340   const char *z = zCat;
 341
 342   while( *z ){
 343     while( *z==' ' || *z=='\t' ) z++;
 344     if( *z && sqlite3Fts5UnicodeCatParse(z, p->aCategory) ){
 345       return SQLITE_ERROR;
 346     }
 347     while( *z!=' ' && *z!='\t' && *z!='\0' ) z++;
 348   }
 349
 350   sqlite3Fts5UnicodeAscii(p->aCategory, p->aTokenChar);
 351   return SQLITE_OK;
 352 }
 353
 354 /*
 355 ** Create a "unicode61" tokenizer.
 356 */
 357 static int fts5UnicodeCreate(
 358   void *pUnused,
 359   const char **azArg, int nArg,
 360   Fts5Tokenizer **ppOut
 361 ){
 362   int rc = SQLITE_OK;             /* Return code */
 363   Unicode61Tokenizer *p = 0;      /* New tokenizer object */
 364
 365   UNUSED_PARAM(pUnused);
 366
 367   if( nArg%2 ){
 368     rc = SQLITE_ERROR;
 369   }else{
 370     p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
 371     if( p ){
 372       const char *zCat = "L* N* Co";
 373       int i;
 374       memset(p, 0, sizeof(Unicode61Tokenizer));
 375
 376       p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
 377       p->nFold = 64;
 378       p->aFold = sqlite3_malloc64(p->nFold * sizeof(char));
 379       if( p->aFold==0 ){
 380         rc = SQLITE_NOMEM;
 381       }
 382
 383       /* Search for a "categories" argument */
 384       for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
 385         if( 0==sqlite3_stricmp(azArg[i], "categories") ){
 386           zCat = azArg[i+1];
 387         }
 388       }
 389
 390       if( rc==SQLITE_OK ){
 391         rc = unicodeSetCategories(p, zCat);
 392       }
 393
 394       for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
 395         const char *zArg = azArg[i+1];
 396         if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
 397           if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
 398             rc = SQLITE_ERROR;
 399           }else{
 400             p->eRemoveDiacritic = (zArg[0] - '0');
 401             assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
 402                  || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
 403                  || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
 404             );
 405           }
 406         }else
 407         if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
 408           rc = fts5UnicodeAddExceptions(p, zArg, 1);
 409         }else
 410         if( 0==sqlite3_stricmp(azArg[i], "separators") ){
 411           rc = fts5UnicodeAddExceptions(p, zArg, 0);
 412         }else
 413         if( 0==sqlite3_stricmp(azArg[i], "categories") ){
 414           /* no-op */
 415         }else{
 416           rc = SQLITE_ERROR;
 417         }
 418       }
 419
 420     }else{
 421       rc = SQLITE_NOMEM;
 422     }
 423     if( rc!=SQLITE_OK ){
 424       fts5UnicodeDelete((Fts5Tokenizer*)p);
 425       p = 0;
 426     }
 427     *ppOut = (Fts5Tokenizer*)p;
 428   }
 429   return rc;
 430 }
 431
 432 /*
 433 ** Return true if, for the purposes of tokenizing with the tokenizer
 434 ** passed as the first argument, codepoint iCode is considered a token
 435 ** character (not a separator).
 436 */
 437 static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
 438   return (
 439     p->aCategory[sqlite3Fts5UnicodeCategory((u32)iCode)]
 440     ^ fts5UnicodeIsException(p, iCode)
 441   );
 442 }
 443
/*
** Tokenize some text using the unicode61 tokenizer.
**
** The nText bytes of UTF-8 at pText are split into tokens. ASCII bytes are
** classified using p->aTokenChar[], other characters are decoded with
** READ_UTF8 and classified using fts5UnicodeIsAlnum(). Token characters
** are case-folded (and non-ASCII ones passed through
** sqlite3Fts5UnicodeFold() with p->eRemoveDiacritic) into p->aFold, which
** is grown as required. Each folded token is passed to xToken() along with
** the byte offsets within pText of its first byte and of the byte
** following its last.
**
** Returns SQLITE_OK, SQLITE_NOMEM, or the first value other than SQLITE_OK
** returned by xToken() (SQLITE_DONE from the callback is reported as
** SQLITE_OK).
*/
static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  int iUnused,
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  int rc = SQLITE_OK;
  unsigned char *a = p->aTokenChar;

  unsigned char *zTerm = (unsigned char*)&pText[nText];
  unsigned char *zCsr = (unsigned char *)pText;

  /* Output buffer. pEnd marks the point, 6 bytes before the end of the
  ** buffer, beyond which the buffer is grown before another character is
  ** appended. */
  char *aFold = p->aFold;
  int nFold = p->nFold;
  const char *pEnd = &aFold[nFold-6];

  UNUSED_PARAM(iUnused);

  /* Each iteration of this loop gobbles up a contiguous run of separators,
  ** then the next token.  */
  while( rc==SQLITE_OK ){
    u32 iCode;                    /* non-ASCII codepoint read from input */
    char *zOut = aFold;           /* Next byte of aFold[] to write */
    int is;                       /* Byte offset of start of token */
    int ie;                       /* Byte offset one past end of token */

    /* Skip any separator characters. This loop is only ever left via a
    ** goto: either to tokenize_done when the input is exhausted, or into
    ** the body of the loop below once the first character of a token has
    ** been found. Jumping into the middle of that loop lets the character
    ** that has already been classified (and, if non-ASCII, decoded into
    ** iCode) be folded without being read a second time. */
    while( 1 ){
      if( zCsr>=zTerm ) goto tokenize_done;
      if( *zCsr & 0x80 ) {
        /* A character outside of the ascii range. Skip past it if it is
        ** a separator character. Or break out of the loop if it is not. */
        is = zCsr - (unsigned char*)pText;
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p, iCode) ){
          goto non_ascii_tokenchar;
        }
      }else{
        if( a[*zCsr] ){
          is = zCsr - (unsigned char*)pText;
          goto ascii_tokenchar;
        }
        zCsr++;
      }
    }

    /* Run through the tokenchars. Fold them into the output buffer along
    ** the way.  */
    while( zCsr<zTerm ){

      /* Grow the output buffer so that there is sufficient space to fit the
      ** largest possible utf-8 character.  The old buffer is freed only
      ** after the new one has been allocated and filled, so p->aFold
      ** remains valid if the allocation fails. */
      if( zOut>pEnd ){
        aFold = sqlite3_malloc64((sqlite3_int64)nFold*2);
        if( aFold==0 ){
          rc = SQLITE_NOMEM;
          goto tokenize_done;
        }
        zOut = &aFold[zOut - p->aFold];
        memcpy(aFold, p->aFold, nFold);
        sqlite3_free(p->aFold);
        p->aFold = aFold;
        p->nFold = nFold = nFold*2;
        pEnd = &aFold[nFold-6];
      }

      if( *zCsr & 0x80 ){
        /* A non-ascii-range character. Fold it into the output buffer if
        ** it is a token character, or break out of the loop if it is not.
        ** A diacritic appearing inside a token does not end the token,
        ** even if it is not itself a token character. */
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
 non_ascii_tokenchar:
          /* Nothing is written if the fold function returns zero. */
          iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
          if( iCode ) WRITE_UTF8(zOut, iCode);
        }else{
          break;
        }
      }else if( a[*zCsr]==0 ){
        /* An ascii-range separator character. End of token. */
        break;
      }else{
 ascii_tokenchar:
        if( *zCsr>='A' && *zCsr<='Z' ){
          *zOut++ = *zCsr + 32;
        }else{
          *zOut++ = *zCsr;
        }
        zCsr++;
      }
      /* Both labels above fall through to this point, so ie is always set
      ** at least once before it is passed to xToken() below. */
      ie = zCsr - (unsigned char*)pText;
    }

    /* Invoke the token callback */
    rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
  }

 tokenize_done:
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}
 547
 548 /**************************************************************************
 549 ** Start of porter stemmer implementation.
 550 */
 551
/* Any tokens larger than this (in bytes) are passed through without
** stemming. The value also determines the size of the PorterTokenizer.aBuf
** working buffer, which is this many bytes plus 64. */
#define FTS5_PORTER_MAX_TOKEN 64
 555
 556 typedef struct PorterTokenizer PorterTokenizer;
 557 struct PorterTokenizer {
 558   fts5_tokenizer tokenizer;       /* Parent tokenizer module */
 559   Fts5Tokenizer *pTokenizer;      /* Parent tokenizer instance */
 560   char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
 561 };
 562
 563 /*
 564 ** Delete a "porter" tokenizer.
 565 */
 566 static void fts5PorterDelete(Fts5Tokenizer *pTok){
 567   if( pTok ){
 568     PorterTokenizer *p = (PorterTokenizer*)pTok;
 569     if( p->pTokenizer ){
 570       p->tokenizer.xDelete(p->pTokenizer);
 571     }
 572     sqlite3_free(p);
 573   }
 574 }
 575
 576 /*
 577 ** Create a "porter" tokenizer.
 578 */
 579 static int fts5PorterCreate(
 580   void *pCtx,
 581   const char **azArg, int nArg,
 582   Fts5Tokenizer **ppOut
 583 ){
 584   fts5_api *pApi = (fts5_api*)pCtx;
 585   int rc = SQLITE_OK;
 586   PorterTokenizer *pRet;
 587   void *pUserdata = 0;
 588   const char *zBase = "unicode61";
 589
 590   if( nArg>0 ){
 591     zBase = azArg[0];
 592   }
 593
 594   pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
 595   if( pRet ){
 596     memset(pRet, 0, sizeof(PorterTokenizer));
 597     rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer);
 598   }else{
 599     rc = SQLITE_NOMEM;
 600   }
 601   if( rc==SQLITE_OK ){
 602     int nArg2 = (nArg>0 ? nArg-1 : 0);
 603     const char **azArg2 = (nArg2 ? &azArg[1] : 0);
 604     rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer);
 605   }
 606
 607   if( rc!=SQLITE_OK ){
 608     fts5PorterDelete((Fts5Tokenizer*)pRet);
 609     pRet = 0;
 610   }
 611   *ppOut = (Fts5Tokenizer*)pRet;
 612   return rc;
 613 }
 614
 615 typedef struct PorterContext PorterContext;
 616 struct PorterContext {
 617   void *pCtx;
 618   int (*xToken)(void*, int, const char*, int, int, int);
 619   char *aBuf;
 620 };
 621
 622 typedef struct PorterRule PorterRule;
 623 struct PorterRule {
 624   const char *zSuffix;
 625   int nSuffix;
 626   int (*xCond)(char *zStem, int nStem);
 627   const char *zOutput;
 628   int nOutput;
 629 };
 630
#if 0
/*
** NOTE: this function is compiled out by the enclosing "#if 0".
**
** Apply the first rule in aRule[] whose suffix matches the end of the
** nBuf-byte buffer aBuf (nBuf is read from *pnBuf). The array is
** terminated by an entry with a NULL zSuffix. If the matching rule has no
** condition, or its condition accepts the stem that remains once the
** suffix is removed, the suffix is overwritten with the rule's output and
** *pnBuf is updated to the new length.
**
** Returns the index within aRule[] of the rule that was applied, or -1 if
** no suffix matched or the matching rule's condition was not met. Only
** the first matching rule is considered.
*/
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int ret = -1;
  int nBuf = *pnBuf;
  PorterRule *p;

  /* Find the first rule whose suffix matches the end of the buffer */
  for(p=aRule; p->zSuffix; p++){
    assert( strlen(p->zSuffix)==p->nSuffix );
    assert( strlen(p->zOutput)==p->nOutput );
    if( nBuf<p->nSuffix ) continue;
    if( 0==memcmp(&aBuf[nBuf - p->nSuffix], p->zSuffix, p->nSuffix) ) break;
  }

  /* p->zSuffix is NULL here if the loop reached the terminator entry */
  if( p->zSuffix ){
    int nStem = nBuf - p->nSuffix;
    if( p->xCond==0 || p->xCond(aBuf, nStem) ){
      memcpy(&aBuf[nStem], p->zOutput, p->nOutput);
      *pnBuf = nStem + p->nOutput;
      ret = p - aRule;
    }
  }

  return ret;
}
#endif
 656
 657 static int fts5PorterIsVowel(char c, int bYIsVowel){
 658   return (
 659       c=='a' || c=='e' || c=='i' || c=='o' || c=='u' || (bYIsVowel && c=='y')
 660   );
 661 }
 662
 663 static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
 664   int i;
 665   int bCons = bPrevCons;
 666
 667   /* Scan for a vowel */
 668   for(i=0; i<nStem; i++){
 669     if( 0==(bCons = !fts5PorterIsVowel(zStem[i], bCons)) ) break;
 670   }
 671
 672   /* Scan for a consonent */
 673   for(i++; i<nStem; i++){
 674     if( (bCons = !fts5PorterIsVowel(zStem[i], bCons)) ) return i+1;
 675   }
 676   return 0;
 677 }
 678
 679 /* porter rule condition: (m > 0) */
 680 static int fts5Porter_MGt0(char *zStem, int nStem){
 681   return !!fts5PorterGobbleVC(zStem, nStem, 0);
 682 }
 683
 684 /* porter rule condition: (m > 1) */
 685 static int fts5Porter_MGt1(char *zStem, int nStem){
 686   int n;
 687   n = fts5PorterGobbleVC(zStem, nStem, 0);
 688   if( n && fts5PorterGobbleVC(&zStem[n], nStem-n, 1) ){
 689     return 1;
 690   }
 691   return 0;
 692 }
 693
 694 /* porter rule condition: (m = 1) */
 695 static int fts5Porter_MEq1(char *zStem, int nStem){
 696   int n;
 697   n = fts5PorterGobbleVC(zStem, nStem, 0);
 698   if( n && 0==fts5PorterGobbleVC(&zStem[n], nStem-n, 1) ){
 699     return 1;
 700   }
 701   return 0;
 702 }
 703
 704 /* porter rule condition: (*o) */
 705 static int fts5Porter_Ostar(char *zStem, int nStem){
 706   if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
 707     return 0;
 708   }else{
 709     int i;
 710     int mask = 0;
 711     int bCons = 0;
 712     for(i=0; i<nStem; i++){
 713       bCons = !fts5PorterIsVowel(zStem[i], bCons);
 714       assert( bCons==0 || bCons==1 );
 715       mask = (mask << 1) + bCons;
 716     }
 717     return ((mask & 0x0007)==0x0005);
 718   }
 719 }
 720
 721 /* porter rule condition: (m > 1 and (*S or *T)) */
 722 static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
 723   assert( nStem>0 );
 724   return (zStem[nStem-1]=='s' || zStem[nStem-1]=='t')
 725       && fts5Porter_MGt1(zStem, nStem);
 726 }
 727
 728 /* porter rule condition: (*v*) */
 729 static int fts5Porter_Vowel(char *zStem, int nStem){
 730   int i;
 731   for(i=0; i<nStem; i++){
 732     if( fts5PorterIsVowel(zStem[i], i>0) ){
 733       return 1;
 734     }
 735   }
 736   return 0;
 737 }
 738
 739
 740 /**************************************************************************
 741 ***************************************************************************
 742 ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
 743 */
 744
/*
** Porter stemmer step 4 (generated code - see the banner above).
**
** aBuf holds a word of *pnBuf bytes. If the word ends with one of the
** suffixes handled below and the stem left after removing that suffix
** satisfies the rule's condition - (m > 1), or (m > 1 and (*S or *T)) for
** "ion" - the suffix is removed by reducing *pnBuf. aBuf itself is never
** written.
**
** The switch dispatches on the second-to-last character of the word; each
** case tests only suffixes whose second-to-last character is that letter.
** Within a case the suffixes form an else-if chain, so the first suffix
** that matches is the only one considered: if its condition fails, the
** word is left unchanged even if a later suffix in the chain would also
** match.
**
** The return value is always 0, as ret is never modified.
**
** NOTE(review): aBuf[nBuf-2] is read before any length check, so the
** caller must pass a word of at least two bytes - confirm at the call
** site.
*/
static int fts5PorterStep4(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'e':
      if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'l':
      if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'n':
      /* Longer suffixes are tested before the shorter suffixes they end
      ** with ("ement" before "ment" before "ent"). */
      if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt1(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'o':
      /* "ion" is the only suffix with the extra (*S or *T) requirement */
      if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 's':
      if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 't':
      if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'z':
      if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

  }
  return ret;
}
 877
 878
/*
** Porter stemmer step 1b, second part (generated code).
**
** aBuf holds a word of *pnBuf bytes. If the word is longer than two bytes
** and ends in "at", "bl" or "iz", an 'e' is appended (giving "ate", "ble"
** or "ize"), *pnBuf is increased by one and 1 is returned. Otherwise the
** word is left unchanged and 0 is returned.
**
** The switch dispatches on the second-to-last character of the word.
**
** NOTE(review): a successful match writes one byte past the current end of
** the word, and aBuf[nBuf-2] is read before any length check. The caller
** must therefore provide at least one spare byte in aBuf and a word of at
** least two bytes - confirm at the call site.
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ate", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'b':
      if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ble", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ize", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

  }
  return ret;
}
 911
 912
 913 static int fts5PorterStep2(char *aBuf, int *pnBuf){
 914   int ret = 0;
 915   int nBuf = *pnBuf;
 916   switch( aBuf[nBuf-2] ){
 917
 918     case 'a':
 919       if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
 920         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
 921           memcpy(&aBuf[nBuf-7], "ate", 3);
 922           *pnBuf = nBuf - 7 + 3;
 923         }
 924       }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
 925         if( fts5Porter_MGt0(aBuf, nBuf-6) ){
 926           memcpy(&aBuf[nBuf-6], "tion", 4);
 927           *pnBuf = nBuf - 6 + 4;
 928         }
 929       }
 930       break;
 931
 932     case 'c':
 933       if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
 934         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 935           memcpy(&aBuf[nBuf-4], "ence", 4);
 936           *pnBuf = nBuf - 4 + 4;
 937         }
 938       }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
 939         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 940           memcpy(&aBuf[nBuf-4], "ance", 4);
 941           *pnBuf = nBuf - 4 + 4;
 942         }
 943       }
 944       break;
 945
 946     case 'e':
 947       if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
 948         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 949           memcpy(&aBuf[nBuf-4], "ize", 3);
 950           *pnBuf = nBuf - 4 + 3;
 951         }
 952       }
 953       break;
 954
 955     case 'g':
 956       if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
 957         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 958           memcpy(&aBuf[nBuf-4], "log", 3);
 959           *pnBuf = nBuf - 4 + 3;
 960         }
 961       }
 962       break;
 963
 964     case 'l':
 965       if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
 966         if( fts5Porter_MGt0(aBuf, nBuf-3) ){
 967           memcpy(&aBuf[nBuf-3], "ble", 3);
 968           *pnBuf = nBuf - 3 + 3;
 969         }
 970       }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
 971         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 972           memcpy(&aBuf[nBuf-4], "al", 2);
 973           *pnBuf = nBuf - 4 + 2;
 974         }
 975       }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
 976         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
 977           memcpy(&aBuf[nBuf-5], "ent", 3);
 978           *pnBuf = nBuf - 5 + 3;
 979         }
 980       }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
 981         if( fts5Porter_MGt0(aBuf, nBuf-3) ){
 982           memcpy(&aBuf[nBuf-3], "e", 1);
 983           *pnBuf = nBuf - 3 + 1;
 984         }
 985       }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
 986         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
 987           memcpy(&aBuf[nBuf-5], "ous", 3);
 988           *pnBuf = nBuf - 5 + 3;
 989         }
 990       }
 991       break;
 992
 993     case 'o':
 994       if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
 995         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
 996           memcpy(&aBuf[nBuf-7], "ize", 3);
 997           *pnBuf = nBuf - 7 + 3;
 998         }
 999       }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
1000         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1001           memcpy(&aBuf[nBuf-5], "ate", 3);
1002           *pnBuf = nBuf - 5 + 3;
1003         }
1004       }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
1005         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
1006           memcpy(&aBuf[nBuf-4], "ate", 3);
1007           *pnBuf = nBuf - 4 + 3;
1008         }
1009       }
1010       break;
1011
1012     case 's':
1013       if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
1014         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1015           memcpy(&aBuf[nBuf-5], "al", 2);
1016           *pnBuf = nBuf - 5 + 2;
1017         }
1018       }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
1019         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1020           memcpy(&aBuf[nBuf-7], "ive", 3);
1021           *pnBuf = nBuf - 7 + 3;
1022         }
1023       }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
1024         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1025           memcpy(&aBuf[nBuf-7], "ful", 3);
1026           *pnBuf = nBuf - 7 + 3;
1027         }
1028       }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
1029         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1030           memcpy(&aBuf[nBuf-7], "ous", 3);
1031           *pnBuf = nBuf - 7 + 3;
1032         }
1033       }
1034       break;
1035
1036     case 't':
1037       if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
1038         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1039           memcpy(&aBuf[nBuf-5], "al", 2);
1040           *pnBuf = nBuf - 5 + 2;
1041         }
1042       }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
1043         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1044           memcpy(&aBuf[nBuf-5], "ive", 3);
1045           *pnBuf = nBuf - 5 + 3;
1046         }
1047       }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
1048         if( fts5Porter_MGt0(aBuf, nBuf-6) ){
1049           memcpy(&aBuf[nBuf-6], "ble", 3);
1050           *pnBuf = nBuf - 6 + 3;
1051         }
1052       }
1053       break;
1054
1055   }
1056   return ret;
1057 }
1058
1059
/*
** Apply the "step 3" suffix rewrites to the nBuf-byte word in aBuf[],
** where nBuf is the value of *pnBuf on entry.
**
** The rule to try is selected by the second-to-last byte of the word.
** A rule fires only if the word is strictly longer than the suffix, ends
** with the suffix, and fts5Porter_MGt0() returns non-zero for the stem
** that precedes the suffix:
**
**     "ical"  -> "ic"          "ness"  -> ""
**     "icate" -> "ic"          "iciti" -> "ic"
**     "ful"   -> ""            "ative" -> ""
**     "alize" -> "al"
**
** When a rule fires, aBuf[] is rewritten in place and *pnBuf is set to
** the new length. Otherwise neither is modified.
**
** This function always returns 0.
**
** NOTE(review): this function sits inside the section marked as generated
** by mkportersteps.tcl. The nBuf<2 guard below presumably needs to be
** mirrored in the generator - confirm before regenerating.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* The switch below inspects aBuf[nBuf-2]. The caller in this file may
  ** pass a word that earlier steps have reduced to a single byte, in which
  ** case that index would be -1. No rule can match a word that short (the
  ** shortest suffix requires nBuf>3), so returning early changes nothing
  ** other than avoiding the out-of-bounds read. */
  if( nBuf<2 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ic", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }
      break;

    case 's':
      if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }
      break;

    case 'z':
      if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

  }
  return ret;
}
1124
1125
/*
** Apply the "step 1b" suffix rules to the word in aBuf[], whose length
** on entry is *pnBuf. The rule tried depends on the second-to-last byte:
**
**     "eed" -> "ee"   if fts5Porter_MGt0() accepts the stem. Returns 0.
**     "ed"  -> ""     if fts5Porter_Vowel() accepts the stem. Returns 1.
**     "ing" -> ""     if fts5Porter_Vowel() accepts the stem. Returns 1.
**
** A word that ends in "eed" is never considered for the "ed" rule, even
** if the "eed" rule did not fire. In all other cases the word is left
** untouched and 0 is returned.
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  const int n = *pnBuf;
  const char cKey = aBuf[n-2];

  if( cKey=='e' ){
    if( n>3 && 0==memcmp("eed", &aBuf[n-3], 3) ){
      if( fts5Porter_MGt0(aBuf, n-3) ){
        memcpy(&aBuf[n-3], "ee", 2);
        *pnBuf = n - 1;
      }
      return 0;
    }
    if( n>2
     && 0==memcmp("ed", &aBuf[n-2], 2)
     && fts5Porter_Vowel(aBuf, n-2)
    ){
      *pnBuf = n - 2;
      return 1;
    }
    return 0;
  }

  if( cKey=='n'
   && n>3
   && 0==memcmp("ing", &aBuf[n-3], 3)
   && fts5Porter_Vowel(aBuf, n-3)
  ){
    *pnBuf = n - 3;
    return 1;
  }

  return 0;
}
1157
1158 /*
1159 ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
1160 ***************************************************************************
1161 **************************************************************************/
1162
/*
** Porter step 1a: strip a plural-style suffix from the word in aBuf[],
** whose length on entry is *pnBuf. Only *pnBuf is updated; the bytes of
** aBuf[] are not modified.
**
**     ...sses, ...ies   ->  drop the final two bytes
**     other ...es       ->  drop the final byte
**     ...ss             ->  unchanged
**     other ...s        ->  drop the final byte
**
** The function reads aBuf[nBuf-1] and, for words ending in 's',
** aBuf[nBuf-2], so the caller must supply a word at least that long.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  const int n = *pnBuf;
  char cPrev;

  if( aBuf[n-1]!='s' ) return;

  cPrev = aBuf[n-2];
  if( cPrev=='s' ) return;

  if( cPrev=='e' ){
    int bSses = (n>4 && aBuf[n-4]=='s' && aBuf[n-3]=='s');
    int bIes = (n>3 && aBuf[n-3]=='i');
    *pnBuf = n - ((bSses || bIes) ? 2 : 1);
    return;
  }

  *pnBuf = n - 1;
}
1180
1181 static int fts5PorterCb(
1182   void *pCtx,
1183   int tflags,
1184   const char *pToken,
1185   int nToken,
1186   int iStart,
1187   int iEnd
1188 ){
1189   PorterContext *p = (PorterContext*)pCtx;
1190
1191   char *aBuf;
1192   int nBuf;
1193
1194   if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
1195   aBuf = p->aBuf;
1196   nBuf = nToken;
1197   memcpy(aBuf, pToken, nBuf);
1198
1199   /* Step 1. */
1200   fts5PorterStep1A(aBuf, &nBuf);
1201   if( fts5PorterStep1B(aBuf, &nBuf) ){
1202     if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
1203       char c = aBuf[nBuf-1];
1204       if( fts5PorterIsVowel(c, 0)==0
1205        && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
1206       ){
1207         nBuf--;
1208       }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
1209         aBuf[nBuf++] = 'e';
1210       }
1211     }
1212   }
1213
1214   /* Step 1C. */
1215   if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
1216     aBuf[nBuf-1] = 'i';
1217   }
1218
1219   /* Steps 2 through 4. */
1220   fts5PorterStep2(aBuf, &nBuf);
1221   fts5PorterStep3(aBuf, &nBuf);
1222   fts5PorterStep4(aBuf, &nBuf);
1223
1224   /* Step 5a. */
1225   assert( nBuf>0 );
1226   if( aBuf[nBuf-1]=='e' ){
1227     if( fts5Porter_MGt1(aBuf, nBuf-1)
1228      || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
1229     ){
1230       nBuf--;
1231     }
1232   }
1233
1234   /* Step 5b. */
1235   if( nBuf>1 && aBuf[nBuf-1]=='l'
1236    && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1)
1237   ){
1238     nBuf--;
1239   }
1240
1241   return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
1242
1243  pass_through:
1244   return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
1245 }
1246
1247 /*
1248 ** Tokenize using the porter tokenizer.
1249 */
1250 static int fts5PorterTokenize(
1251   Fts5Tokenizer *pTokenizer,
1252   void *pCtx,
1253   int flags,
1254   const char *pText, int nText,
1255   int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
1256 ){
1257   PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
1258   PorterContext sCtx;
1259   sCtx.xToken = xToken;
1260   sCtx.pCtx = pCtx;
1261   sCtx.aBuf = p->aBuf;
1262   return p->tokenizer.xTokenize(
1263       p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
1264   );
1265 }
1266
1267 /**************************************************************************
1268 ** Start of trigram implementation.
1269 */
/*
** State of a trigram tokenizer instance, as configured by fts5TriCreate().
*/
typedef struct TrigramTokenizer {
  int bFold;                      /* Non-zero to case-fold each character */
  int iFoldParam;                 /* 2nd argument for sqlite3Fts5UnicodeFold() */
} TrigramTokenizer;
1275
1276 /*
1277 ** Free a trigram tokenizer.
1278 */
1279 static void fts5TriDelete(Fts5Tokenizer *p){
1280   sqlite3_free(p);
1281 }
1282
1283 /*
1284 ** Allocate a trigram tokenizer.
1285 */
1286 static int fts5TriCreate(
1287   void *pUnused,
1288   const char **azArg,
1289   int nArg,
1290   Fts5Tokenizer **ppOut
1291 ){
1292   int rc = SQLITE_OK;
1293   TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew));
1294   UNUSED_PARAM(pUnused);
1295   if( pNew==0 ){
1296     rc = SQLITE_NOMEM;
1297   }else{
1298     int i;
1299     pNew->bFold = 1;
1300     pNew->iFoldParam = 0;
1301     for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
1302       const char *zArg = azArg[i+1];
1303       if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){
1304         if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
1305           rc = SQLITE_ERROR;
1306         }else{
1307           pNew->bFold = (zArg[0]=='0');
1308         }
1309       }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
1310         if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
1311           rc = SQLITE_ERROR;
1312         }else{
1313           pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0;
1314         }
1315       }else{
1316         rc = SQLITE_ERROR;
1317       }
1318     }
1319
1320     if( pNew->iFoldParam!=0 && pNew->bFold==0 ){
1321       rc = SQLITE_ERROR;
1322     }
1323
1324     if( rc!=SQLITE_OK ){
1325       fts5TriDelete((Fts5Tokenizer*)pNew);
1326       pNew = 0;
1327     }
1328   }
1329   *ppOut = (Fts5Tokenizer*)pNew;
1330   return rc;
1331 }
1332
1333 /*
1334 ** Trigram tokenizer tokenize routine.
1335 */
1336 static int fts5TriTokenize(
1337   Fts5Tokenizer *pTok,
1338   void *pCtx,
1339   int unusedFlags,
1340   const char *pText, int nText,
1341   int (*xToken)(void*, int, const char*, int, int, int)
1342 ){
1343   TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1344   int rc = SQLITE_OK;
1345   char aBuf[32];
1346   char *zOut = aBuf;
1347   int ii;
1348   const unsigned char *zIn = (const unsigned char*)pText;
1349   const unsigned char *zEof = &zIn[nText];
1350   u32 iCode;
1351   int aStart[3];                  /* Input offset of each character in aBuf[] */
1352
1353   UNUSED_PARAM(unusedFlags);
1354
1355   /* Populate aBuf[] with the characters for the first trigram. */
1356   for(ii=0; ii<3; ii++){
1357     do {
1358       aStart[ii] = zIn - (const unsigned char*)pText;
1359       READ_UTF8(zIn, zEof, iCode);
1360       if( iCode==0 ) return SQLITE_OK;
1361       if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
1362     }while( iCode==0 );
1363     WRITE_UTF8(zOut, iCode);
1364   }
1365
1366   /* At the start of each iteration of this loop:
1367   **
1368   **  aBuf:      Contains 3 characters. The 3 characters of the next trigram.
1369   **  zOut:      Points to the byte following the last character in aBuf.
1370   **  aStart[3]: Contains the byte offset in the input text corresponding
1371   **             to the start of each of the three characters in the buffer.
1372   */
1373   assert( zIn<=zEof );
1374   while( 1 ){
1375     int iNext;                    /* Start of character following current tri */
1376     const char *z1;
1377
1378     /* Read characters from the input up until the first non-diacritic */
1379     do {
1380       iNext = zIn - (const unsigned char*)pText;
1381       READ_UTF8(zIn, zEof, iCode);
1382       if( iCode==0 ) break;
1383       if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
1384     }while( iCode==0 );
1385
1386     /* Pass the current trigram back to fts5 */
1387     rc = xToken(pCtx, 0, aBuf, zOut-aBuf, aStart[0], iNext);
1388     if( iCode==0 || rc!=SQLITE_OK ) break;
1389
1390     /* Remove the first character from buffer aBuf[]. Append the character
1391     ** with codepoint iCode.  */
1392     z1 = aBuf;
1393     FTS5_SKIP_UTF8(z1);
1394     memmove(aBuf, z1, zOut - z1);
1395     zOut -= (z1 - aBuf);
1396     WRITE_UTF8(zOut, iCode);
1397
1398     /* Update the aStart[] array */
1399     aStart[0] = aStart[1];
1400     aStart[1] = aStart[2];
1401     aStart[2] = iNext;
1402   }
1403
1404   return rc;
1405 }
1406
1407 /*
1408 ** Argument xCreate is a pointer to a constructor function for a tokenizer.
1409 ** pTok is a tokenizer previously created using the same method. This function
1410 ** returns one of FTS5_PATTERN_NONE, FTS5_PATTERN_LIKE or FTS5_PATTERN_GLOB
1411 ** indicating the style of pattern matching that the tokenizer can support.
1412 ** In practice, this is:
1413 **
1414 **     "trigram" tokenizer, case_sensitive=1 - FTS5_PATTERN_GLOB
1415 **     "trigram" tokenizer, case_sensitive=0 (the default) - FTS5_PATTERN_LIKE
1416 **     all other tokenizers - FTS5_PATTERN_NONE
1417 */
1418 int sqlite3Fts5TokenizerPattern(
1419     int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
1420     Fts5Tokenizer *pTok
1421 ){
1422   if( xCreate==fts5TriCreate ){
1423     TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1424     if( p->iFoldParam==0 ){
1425       return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB;
1426     }
1427   }
1428   return FTS5_PATTERN_NONE;
1429 }
1430
1431 /*
1432 ** Register all built-in tokenizers with FTS5.
1433 */
1434 int sqlite3Fts5TokenizerInit(fts5_api *pApi){
1435   struct BuiltinTokenizer {
1436     const char *zName;
1437     fts5_tokenizer x;
1438   } aBuiltin[] = {
1439     { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
1440     { "ascii",     {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
1441     { "porter",    {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
1442     { "trigram",   {fts5TriCreate, fts5TriDelete, fts5TriTokenize}},
1443   };
1444
1445   int rc = SQLITE_OK;             /* Return code */
1446   int i;                          /* To iterate through builtin functions */
1447
1448   for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
1449     rc = pApi->xCreateTokenizer(pApi,
1450         aBuiltin[i].zName,
1451         (void*)pApi,
1452         &aBuiltin[i].x,
1453         0
1454     );
1455   }
1456
1457   return rc;
1458 }