ext/fts5/fts5_tokenize.c

   1 /*
   2 ** 2014 May 31
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 ******************************************************************************
  12 */
  13
  14
  15 #include "fts5Int.h"
  16
  17 /**************************************************************************
  18 ** Start of ascii tokenizer implementation.
  19 */
  20
/*
** For tokenizers with no "unicode" modifier, the set of token characters
** is the same as the set of ASCII range alphanumeric characters.
**
** The table is indexed by ASCII code (0x00..0x7F). An entry is 1 for the
** digits '0'..'9' and the letters 'A'..'Z' and 'a'..'z', and 0 for every
** other code. fts5AsciiCreate() copies this table into each new tokenizer
** before applying any "tokenchars" or "separators" options.
*/
static unsigned char aAsciiTokenChar[128] = {
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
};
  35
/*
** An "ascii" tokenizer object. aTokenChar[] is indexed by ASCII code
** (0..127): a non-zero entry marks a token character, a zero entry marks
** a separator.
*/
typedef struct AsciiTokenizer AsciiTokenizer;
struct AsciiTokenizer {
  unsigned char aTokenChar[128];
};
  40
  41 static void fts5AsciiAddExceptions(
  42   AsciiTokenizer *p,
  43   const char *zArg,
  44   int bTokenChars
  45 ){
  46   int i;
  47   for(i=0; zArg[i]; i++){
  48     if( (zArg[i] & 0x80)==0 ){
  49       p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars;
  50     }
  51   }
  52 }
  53
/*
** Delete an "ascii" tokenizer.
**
** The object is released with sqlite3_free(); it is the single allocation
** made by fts5AsciiCreate().
*/
static void fts5AsciiDelete(Fts5Tokenizer *p){
  sqlite3_free(p);
}
  60
  61 /*
  62 ** Create an "ascii" tokenizer.
  63 */
  64 static int fts5AsciiCreate(
  65   void *pUnused,
  66   const char **azArg, int nArg,
  67   Fts5Tokenizer **ppOut
  68 ){
  69   int rc = SQLITE_OK;
  70   AsciiTokenizer *p = 0;
  71   UNUSED_PARAM(pUnused);
  72   if( nArg%2 ){
  73     rc = SQLITE_ERROR;
  74   }else{
  75     p = sqlite3_malloc(sizeof(AsciiTokenizer));
  76     if( p==0 ){
  77       rc = SQLITE_NOMEM;
  78     }else{
  79       int i;
  80       memset(p, 0, sizeof(AsciiTokenizer));
  81       memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
  82       for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
  83         const char *zArg = azArg[i+1];
  84         if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
  85           fts5AsciiAddExceptions(p, zArg, 1);
  86         }else
  87         if( 0==sqlite3_stricmp(azArg[i], "separators") ){
  88           fts5AsciiAddExceptions(p, zArg, 0);
  89         }else{
  90           rc = SQLITE_ERROR;
  91         }
  92       }
  93       if( rc!=SQLITE_OK ){
  94         fts5AsciiDelete((Fts5Tokenizer*)p);
  95         p = 0;
  96       }
  97     }
  98   }
  99
 100   *ppOut = (Fts5Tokenizer*)p;
 101   return rc;
 102 }
 103
 104
 105 static void asciiFold(char *aOut, const char *aIn, int nByte){
 106   int i;
 107   for(i=0; i<nByte; i++){
 108     char c = aIn[i];
 109     if( c>='A' && c<='Z' ) c += 32;
 110     aOut[i] = c;
 111   }
 112 }
 113
 114 /*
 115 ** Tokenize some text using the ascii tokenizer.
 116 */
 117 static int fts5AsciiTokenize(
 118   Fts5Tokenizer *pTokenizer,
 119   void *pCtx,
 120   int iUnused,
 121   const char *pText, int nText,
 122   int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
 123 ){
 124   AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
 125   int rc = SQLITE_OK;
 126   int ie;
 127   int is = 0;
 128
 129   char aFold[64];
 130   int nFold = sizeof(aFold);
 131   char *pFold = aFold;
 132   unsigned char *a = p->aTokenChar;
 133
 134   UNUSED_PARAM(iUnused);
 135
 136   while( is<nText && rc==SQLITE_OK ){
 137     int nByte;
 138
 139     /* Skip any leading divider characters. */
 140     while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
 141       is++;
 142     }
 143     if( is==nText ) break;
 144
 145     /* Count the token characters */
 146     ie = is+1;
 147     while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
 148       ie++;
 149     }
 150
 151     /* Fold to lower case */
 152     nByte = ie-is;
 153     if( nByte>nFold ){
 154       if( pFold!=aFold ) sqlite3_free(pFold);
 155       pFold = sqlite3_malloc64((sqlite3_int64)nByte*2);
 156       if( pFold==0 ){
 157         rc = SQLITE_NOMEM;
 158         break;
 159       }
 160       nFold = nByte*2;
 161     }
 162     asciiFold(pFold, &pText[is], nByte);
 163
 164     /* Invoke the token callback */
 165     rc = xToken(pCtx, 0, pFold, nByte, is, ie);
 166     is = ie+1;
 167   }
 168
 169   if( pFold!=aFold ) sqlite3_free(pFold);
 170   if( rc==SQLITE_DONE ) rc = SQLITE_OK;
 171   return rc;
 172 }
 173
 174 /**************************************************************************
 175 ** Start of unicode61 tokenizer implementation.
 176 */
 177
 178
/*
** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
** from the sqlite3 source file utf.c. If this file is compiled as part
** of the amalgamation, they are not required.
**
** sqlite3Utf8Trans1[] is indexed by (lead byte - 0xC0) and yields the
** initial value that READ_UTF8 accumulates continuation bytes into.
**
** READ_UTF8(zIn, zTerm, c) reads one character starting at zIn, stores
** its codepoint in c and advances zIn past the bytes consumed. It never
** reads at or beyond zTerm once past the first byte. For a lead byte of
** 0xC0 or greater, every following continuation byte (10xxxxxx) is folded
** in; if the result is less than 0x80, is in the surrogate range
** 0xD800..0xDFFF, or is 0xFFFE or 0xFFFF, then c is set to 0xFFFD instead.
** The macro expands to more than one statement, so it must not be used as
** the unbraced body of an if/else/loop.
**
** WRITE_UTF8(zOut, c) appends the 1 to 4 byte UTF-8 encoding of codepoint
** c at zOut and advances zOut past the bytes written.
*/
#ifndef SQLITE_AMALGAMATION

static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

#define READ_UTF8(zIn, zTerm, c)                           \
  c = *(zIn++);                                            \
  if( c>=0xc0 ){                                           \
    c = sqlite3Utf8Trans1[c-0xc0];                         \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
      c = (c<<6) + (0x3f & *(zIn++));                      \
    }                                                      \
    if( c<0x80                                             \
        || (c&0xFFFFF800)==0xD800                          \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
  }


#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (unsigned char)(c&0xFF);                 \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);     \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
  else if( c<0x10000 ){                                \
    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);    \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }else{                                               \
    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);  \
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);  \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
}

#endif /* ifndef SQLITE_AMALGAMATION */
 231
/*
** A "unicode61" tokenizer object.
**
** ASCII-range characters are classified using aTokenChar[]. Characters
** outside the ASCII range are classified by Unicode category using
** aCategory[], with the result inverted for any codepoint present in
** aiException[] (see fts5UnicodeIsAlnum()).
*/
typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int eRemoveDiacritic;           /* True if remove_diacritics=1 is set */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Exception codepoints, kept in ascending
                                  ** order by fts5UnicodeAddExceptions() */

  unsigned char aCategory[32];    /* True for token char categories */
};

/* Values for eRemoveDiacritic (must match internals of fts5_unicode2.c) */
#define FTS5_REMOVE_DIACRITICS_NONE    0
#define FTS5_REMOVE_DIACRITICS_SIMPLE  1
#define FTS5_REMOVE_DIACRITICS_COMPLEX 2
 248
 249 static int fts5UnicodeAddExceptions(
 250   Unicode61Tokenizer *p,          /* Tokenizer object */
 251   const char *z,                  /* Characters to treat as exceptions */
 252   int bTokenChars                 /* 1 for 'tokenchars', 0 for 'separators' */
 253 ){
 254   int rc = SQLITE_OK;
 255   int n = (int)strlen(z);
 256   int *aNew;
 257
 258   if( n>0 ){
 259     aNew = (int*)sqlite3_realloc64(p->aiException,
 260                                    (n+p->nException)*sizeof(int));
 261     if( aNew ){
 262       int nNew = p->nException;
 263       const unsigned char *zCsr = (const unsigned char*)z;
 264       const unsigned char *zTerm = (const unsigned char*)&z[n];
 265       while( zCsr<zTerm ){
 266         u32 iCode;
 267         int bToken;
 268         READ_UTF8(zCsr, zTerm, iCode);
 269         if( iCode<128 ){
 270           p->aTokenChar[iCode] = (unsigned char)bTokenChars;
 271         }else{
 272           bToken = p->aCategory[sqlite3Fts5UnicodeCategory(iCode)];
 273           assert( (bToken==0 || bToken==1) );
 274           assert( (bTokenChars==0 || bTokenChars==1) );
 275           if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
 276             int i;
 277             for(i=0; i<nNew; i++){
 278               if( (u32)aNew[i]>iCode ) break;
 279             }
 280             memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
 281             aNew[i] = iCode;
 282             nNew++;
 283           }
 284         }
 285       }
 286       p->aiException = aNew;
 287       p->nException = nNew;
 288     }else{
 289       rc = SQLITE_NOMEM;
 290     }
 291   }
 292
 293   return rc;
 294 }
 295
 296 /*
 297 ** Return true if the p->aiException[] array contains the value iCode.
 298 */
 299 static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
 300   if( p->nException>0 ){
 301     int *a = p->aiException;
 302     int iLo = 0;
 303     int iHi = p->nException-1;
 304
 305     while( iHi>=iLo ){
 306       int iTest = (iHi + iLo) / 2;
 307       if( iCode==a[iTest] ){
 308         return 1;
 309       }else if( iCode>a[iTest] ){
 310         iLo = iTest+1;
 311       }else{
 312         iHi = iTest-1;
 313       }
 314     }
 315   }
 316
 317   return 0;
 318 }
 319
 320 /*
 321 ** Delete a "unicode61" tokenizer.
 322 */
 323 static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
 324   if( pTok ){
 325     Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
 326     sqlite3_free(p->aiException);
 327     sqlite3_free(p->aFold);
 328     sqlite3_free(p);
 329   }
 330   return;
 331 }
 332
 333 static int unicodeSetCategories(Unicode61Tokenizer *p, const char *zCat){
 334   const char *z = zCat;
 335
 336   while( *z ){
 337     while( *z==' ' || *z=='\t' ) z++;
 338     if( *z && sqlite3Fts5UnicodeCatParse(z, p->aCategory) ){
 339       return SQLITE_ERROR;
 340     }
 341     while( *z!=' ' && *z!='\t' && *z!='\0' ) z++;
 342   }
 343
 344   sqlite3Fts5UnicodeAscii(p->aCategory, p->aTokenChar);
 345   return SQLITE_OK;
 346 }
 347
 348 /*
 349 ** Create a "unicode61" tokenizer.
 350 */
 351 static int fts5UnicodeCreate(
 352   void *pUnused,
 353   const char **azArg, int nArg,
 354   Fts5Tokenizer **ppOut
 355 ){
 356   int rc = SQLITE_OK;             /* Return code */
 357   Unicode61Tokenizer *p = 0;      /* New tokenizer object */
 358
 359   UNUSED_PARAM(pUnused);
 360
 361   if( nArg%2 ){
 362     rc = SQLITE_ERROR;
 363   }else{
 364     p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
 365     if( p ){
 366       const char *zCat = "L* N* Co";
 367       int i;
 368       memset(p, 0, sizeof(Unicode61Tokenizer));
 369
 370       p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
 371       p->nFold = 64;
 372       p->aFold = sqlite3_malloc64(p->nFold * sizeof(char));
 373       if( p->aFold==0 ){
 374         rc = SQLITE_NOMEM;
 375       }
 376
 377       /* Search for a "categories" argument */
 378       for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
 379         if( 0==sqlite3_stricmp(azArg[i], "categories") ){
 380           zCat = azArg[i+1];
 381         }
 382       }
 383
 384       if( rc==SQLITE_OK ){
 385         rc = unicodeSetCategories(p, zCat);
 386       }
 387
 388       for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
 389         const char *zArg = azArg[i+1];
 390         if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
 391           if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
 392             rc = SQLITE_ERROR;
 393           }else{
 394             p->eRemoveDiacritic = (zArg[0] - '0');
 395             assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
 396                  || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
 397                  || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
 398             );
 399           }
 400         }else
 401         if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
 402           rc = fts5UnicodeAddExceptions(p, zArg, 1);
 403         }else
 404         if( 0==sqlite3_stricmp(azArg[i], "separators") ){
 405           rc = fts5UnicodeAddExceptions(p, zArg, 0);
 406         }else
 407         if( 0==sqlite3_stricmp(azArg[i], "categories") ){
 408           /* no-op */
 409         }else{
 410           rc = SQLITE_ERROR;
 411         }
 412       }
 413
 414     }else{
 415       rc = SQLITE_NOMEM;
 416     }
 417     if( rc!=SQLITE_OK ){
 418       fts5UnicodeDelete((Fts5Tokenizer*)p);
 419       p = 0;
 420     }
 421     *ppOut = (Fts5Tokenizer*)p;
 422   }
 423   return rc;
 424 }
 425
 426 /*
 427 ** Return true if, for the purposes of tokenizing with the tokenizer
 428 ** passed as the first argument, codepoint iCode is considered a token
 429 ** character (not a separator).
 430 */
 431 static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
 432   return (
 433     p->aCategory[sqlite3Fts5UnicodeCategory((u32)iCode)]
 434     ^ fts5UnicodeIsException(p, iCode)
 435   );
 436 }
 437
/*
** Tokenize some text using the unicode61 tokenizer.
**
** The nText bytes at pText are scanned as UTF-8. For each token found,
** xToken() is invoked with the folded token text (as produced by
** sqlite3Fts5UnicodeFold() for non-ASCII characters, and by mapping
** 'A'..'Z' to lower case for ASCII), its size in bytes, and the byte
** offsets within pText of the start of the token and of the end of its
** final character.
**
** The folded text is accumulated in p->aFold, which is enlarged as
** required and retained by the tokenizer object between calls.
**
** Returns SQLITE_NOMEM if the fold buffer cannot be enlarged. Otherwise
** returns the first value other than SQLITE_OK returned by xToken(), with
** SQLITE_DONE converted to SQLITE_OK, or SQLITE_OK once all input has been
** consumed.
*/
static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  int iUnused,
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  int rc = SQLITE_OK;
  unsigned char *a = p->aTokenChar;

  unsigned char *zTerm = (unsigned char*)&pText[nText];
  unsigned char *zCsr = (unsigned char *)pText;

  /* Output buffer. pEnd marks the point past which the buffer is grown
  ** before another character is folded into it, leaving 6 bytes of
  ** headroom for the next write. */
  char *aFold = p->aFold;
  int nFold = p->nFold;
  const char *pEnd = &aFold[nFold-6];

  UNUSED_PARAM(iUnused);

  /* Each iteration of this loop gobbles up a contiguous run of separators,
  ** then the next token.  */
  while( rc==SQLITE_OK ){
    u32 iCode;                    /* non-ASCII codepoint read from input */
    char *zOut = aFold;           /* Write cursor within aFold[] */
    int is;                       /* Byte offset of start of token */
    int ie;                       /* Byte offset of end of token */

    /* Skip any separator characters. This loop is only ever left via a
    ** goto: to tokenize_done at end of input, or into the body of the
    ** token loop below when the first character of a token is found. */
    while( 1 ){
      if( zCsr>=zTerm ) goto tokenize_done;
      if( *zCsr & 0x80 ) {
        /* A character outside of the ascii range. Skip past it if it is
        ** a separator character. Or break out of the loop if it is not. */
        is = zCsr - (unsigned char*)pText;
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p, iCode) ){
          goto non_ascii_tokenchar;
        }
      }else{
        if( a[*zCsr] ){
          is = zCsr - (unsigned char*)pText;
          goto ascii_tokenchar;
        }
        zCsr++;
      }
    }

    /* Run through the tokenchars. Fold them into the output buffer along
    ** the way.  */
    while( zCsr<zTerm ){

      /* Grow the output buffer so that there is sufficient space to fit the
      ** largest possible utf-8 character.  */
      if( zOut>pEnd ){
        aFold = sqlite3_malloc64((sqlite3_int64)nFold*2);
        if( aFold==0 ){
          rc = SQLITE_NOMEM;
          goto tokenize_done;
        }
        /* Re-base the write cursor onto the new buffer, then copy the
        ** existing contents across and release the old buffer. */
        zOut = &aFold[zOut - p->aFold];
        memcpy(aFold, p->aFold, nFold);
        sqlite3_free(p->aFold);
        p->aFold = aFold;
        p->nFold = nFold = nFold*2;
        pEnd = &aFold[nFold-6];
      }

      if( *zCsr & 0x80 ){
        /* A non-ascii-range character. Fold it into the output buffer if
        ** it is a token character, or break out of the loop if it is not.
        ** Within a token, diacritics are also accepted. */
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
 non_ascii_tokenchar:
          iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
          /* A zero result from the fold means nothing is written. */
          if( iCode ) WRITE_UTF8(zOut, iCode);
        }else{
          break;
        }
      }else if( a[*zCsr]==0 ){
        /* An ascii-range separator character. End of token. */
        break;
      }else{
 ascii_tokenchar:
        if( *zCsr>='A' && *zCsr<='Z' ){
          *zOut++ = *zCsr + 32;
        }else{
          *zOut++ = *zCsr;
        }
        zCsr++;
      }
      /* Only reached after a token character has been consumed, so ie
      ** always addresses the end of the last character of the token. */
      ie = zCsr - (unsigned char*)pText;
    }

    /* Invoke the token callback */
    rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
  }

 tokenize_done:
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}
 541
 542 /**************************************************************************
 543 ** Start of porter stemmer implementation.
 544 */
 545
/* Any tokens larger than this (in bytes) are passed through without
** stemming. */
#define FTS5_PORTER_MAX_TOKEN 64

/*
** A "porter" tokenizer object. It wraps a parent tokenizer: "tokenizer"
** holds the parent's method table and pTokenizer the parent instance
** created by fts5PorterCreate().
*/
typedef struct PorterTokenizer PorterTokenizer;
struct PorterTokenizer {
  fts5_tokenizer tokenizer;       /* Parent tokenizer module */
  Fts5Tokenizer *pTokenizer;      /* Parent tokenizer instance */
  /* NOTE(review): presumably the scratch buffer in which tokens are
  ** stemmed - its users are outside this part of the file; confirm. */
  char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
};
 556
 557 /*
 558 ** Delete a "porter" tokenizer.
 559 */
 560 static void fts5PorterDelete(Fts5Tokenizer *pTok){
 561   if( pTok ){
 562     PorterTokenizer *p = (PorterTokenizer*)pTok;
 563     if( p->pTokenizer ){
 564       p->tokenizer.xDelete(p->pTokenizer);
 565     }
 566     sqlite3_free(p);
 567   }
 568 }
 569
 570 /*
 571 ** Create a "porter" tokenizer.
 572 */
 573 static int fts5PorterCreate(
 574   void *pCtx,
 575   const char **azArg, int nArg,
 576   Fts5Tokenizer **ppOut
 577 ){
 578   fts5_api *pApi = (fts5_api*)pCtx;
 579   int rc = SQLITE_OK;
 580   PorterTokenizer *pRet;
 581   void *pUserdata = 0;
 582   const char *zBase = "unicode61";
 583
 584   if( nArg>0 ){
 585     zBase = azArg[0];
 586   }
 587
 588   pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
 589   if( pRet ){
 590     memset(pRet, 0, sizeof(PorterTokenizer));
 591     rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer);
 592   }else{
 593     rc = SQLITE_NOMEM;
 594   }
 595   if( rc==SQLITE_OK ){
 596     int nArg2 = (nArg>0 ? nArg-1 : 0);
 597     const char **azArg2 = (nArg2 ? &azArg[1] : 0);
 598     rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer);
 599   }
 600
 601   if( rc!=SQLITE_OK ){
 602     fts5PorterDelete((Fts5Tokenizer*)pRet);
 603     pRet = 0;
 604   }
 605   *ppOut = (Fts5Tokenizer*)pRet;
 606   return rc;
 607 }
 608
/*
** NOTE(review): the users of this structure are outside this part of the
** file. Judging by the field types it bundles a token callback and its
** context pointer together with a character buffer - presumably so that
** tokens from the parent tokenizer can be stemmed before being forwarded.
** Confirm against the porter xTokenize implementation.
*/
typedef struct PorterContext PorterContext;
struct PorterContext {
  void *pCtx;
  int (*xToken)(void*, int, const char*, int, int, int);
  char *aBuf;
};
 615
/*
** A single suffix-rewriting rule, as consumed by the (currently disabled)
** fts5PorterApply() function: if a token ends with zSuffix and xCond is
** either NULL or returns true for the stem, the suffix is replaced by
** zOutput. An array of rules is terminated by an entry with a NULL
** zSuffix.
*/
typedef struct PorterRule PorterRule;
struct PorterRule {
  const char *zSuffix;                  /* Suffix to match */
  int nSuffix;                          /* Size of zSuffix in bytes */
  int (*xCond)(char *zStem, int nStem); /* Condition on stem, or NULL */
  const char *zOutput;                  /* Replacement for the suffix */
  int nOutput;                          /* Size of zOutput in bytes */
};
 624
/*
** This function is compiled out (#if 0).
**
** Apply the first rule in the NULL-zSuffix-terminated array aRule[] whose
** suffix matches the end of the *pnBuf byte token in aBuf. If that rule
** has no condition, or its condition holds for the stem, the suffix is
** replaced in place by the rule's output, *pnBuf is updated and the index
** of the rule is returned. Otherwise -1 is returned and the token is left
** unchanged.
*/
#if 0
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int ret = -1;
  int nBuf = *pnBuf;
  PorterRule *p;

  /* Find the first rule whose suffix matches the end of the token. */
  for(p=aRule; p->zSuffix; p++){
    assert( strlen(p->zSuffix)==p->nSuffix );
    assert( strlen(p->zOutput)==p->nOutput );
    if( nBuf<p->nSuffix ) continue;
    if( 0==memcmp(&aBuf[nBuf - p->nSuffix], p->zSuffix, p->nSuffix) ) break;
  }

  if( p->zSuffix ){
    int nStem = nBuf - p->nSuffix;
    if( p->xCond==0 || p->xCond(aBuf, nStem) ){
      memcpy(&aBuf[nStem], p->zOutput, p->nOutput);
      *pnBuf = nStem + p->nOutput;
      ret = p - aRule;
    }
  }

  return ret;
}
#endif
 650
 651 static int fts5PorterIsVowel(char c, int bYIsVowel){
 652   return (
 653       c=='a' || c=='e' || c=='i' || c=='o' || c=='u' || (bYIsVowel && c=='y')
 654   );
 655 }
 656
 657 static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
 658   int i;
 659   int bCons = bPrevCons;
 660
 661   /* Scan for a vowel */
 662   for(i=0; i<nStem; i++){
 663     if( 0==(bCons = !fts5PorterIsVowel(zStem[i], bCons)) ) break;
 664   }
 665
 666   /* Scan for a consonent */
 667   for(i++; i<nStem; i++){
 668     if( (bCons = !fts5PorterIsVowel(zStem[i], bCons)) ) return i+1;
 669   }
 670   return 0;
 671 }
 672
 673 /* porter rule condition: (m > 0) */
 674 static int fts5Porter_MGt0(char *zStem, int nStem){
 675   return !!fts5PorterGobbleVC(zStem, nStem, 0);
 676 }
 677
 678 /* porter rule condition: (m > 1) */
 679 static int fts5Porter_MGt1(char *zStem, int nStem){
 680   int n;
 681   n = fts5PorterGobbleVC(zStem, nStem, 0);
 682   if( n && fts5PorterGobbleVC(&zStem[n], nStem-n, 1) ){
 683     return 1;
 684   }
 685   return 0;
 686 }
 687
 688 /* porter rule condition: (m = 1) */
 689 static int fts5Porter_MEq1(char *zStem, int nStem){
 690   int n;
 691   n = fts5PorterGobbleVC(zStem, nStem, 0);
 692   if( n && 0==fts5PorterGobbleVC(&zStem[n], nStem-n, 1) ){
 693     return 1;
 694   }
 695   return 0;
 696 }
 697
 698 /* porter rule condition: (*o) */
 699 static int fts5Porter_Ostar(char *zStem, int nStem){
 700   if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
 701     return 0;
 702   }else{
 703     int i;
 704     int mask = 0;
 705     int bCons = 0;
 706     for(i=0; i<nStem; i++){
 707       bCons = !fts5PorterIsVowel(zStem[i], bCons);
 708       assert( bCons==0 || bCons==1 );
 709       mask = (mask << 1) + bCons;
 710     }
 711     return ((mask & 0x0007)==0x0005);
 712   }
 713 }
 714
 715 /* porter rule condition: (m > 1 and (*S or *T)) */
 716 static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
 717   assert( nStem>0 );
 718   return (zStem[nStem-1]=='s' || zStem[nStem-1]=='t')
 719       && fts5Porter_MGt1(zStem, nStem);
 720 }
 721
 722 /* porter rule condition: (*v*) */
 723 static int fts5Porter_Vowel(char *zStem, int nStem){
 724   int i;
 725   for(i=0; i<nStem; i++){
 726     if( fts5PorterIsVowel(zStem[i], i>0) ){
 727       return 1;
 728     }
 729   }
 730   return 0;
 731 }
 732
 733
 734 /**************************************************************************
 735 ***************************************************************************
 736 ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
 737 */
 738
 739 static int fts5PorterStep4(char *aBuf, int *pnBuf){
 740   int ret = 0;
 741   int nBuf = *pnBuf;
 742   switch( aBuf[nBuf-2] ){
 743
 744     case 'a':
 745       if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
 746         if( fts5Porter_MGt1(aBuf, nBuf-2) ){
 747           *pnBuf = nBuf - 2;
 748         }
 749       }
 750       break;
 751
 752     case 'c':
 753       if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
 754         if( fts5Porter_MGt1(aBuf, nBuf-4) ){
 755           *pnBuf = nBuf - 4;
 756         }
 757       }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
 758         if( fts5Porter_MGt1(aBuf, nBuf-4) ){
 759           *pnBuf = nBuf - 4;
 760         }
 761       }
 762       break;
 763
 764     case 'e':
 765       if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
 766         if( fts5Porter_MGt1(aBuf, nBuf-2) ){
 767           *pnBuf = nBuf - 2;
 768         }
 769       }
 770       break;
 771
 772     case 'i':
 773       if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
 774         if( fts5Porter_MGt1(aBuf, nBuf-2) ){
 775           *pnBuf = nBuf - 2;
 776         }
 777       }
 778       break;
 779
 780     case 'l':
 781       if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
 782         if( fts5Porter_MGt1(aBuf, nBuf-4) ){
 783           *pnBuf = nBuf - 4;
 784         }
 785       }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
 786         if( fts5Porter_MGt1(aBuf, nBuf-4) ){
 787           *pnBuf = nBuf - 4;
 788         }
 789       }
 790       break;
 791
 792     case 'n':
 793       if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
 794         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
 795           *pnBuf = nBuf - 3;
 796         }
 797       }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
 798         if( fts5Porter_MGt1(aBuf, nBuf-5) ){
 799           *pnBuf = nBuf - 5;
 800         }
 801       }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
 802         if( fts5Porter_MGt1(aBuf, nBuf-4) ){
 803           *pnBuf = nBuf - 4;
 804         }
 805       }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
 806         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
 807           *pnBuf = nBuf - 3;
 808         }
 809       }
 810       break;
 811
 812     case 'o':
 813       if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
 814         if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
 815           *pnBuf = nBuf - 3;
 816         }
 817       }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
 818         if( fts5Porter_MGt1(aBuf, nBuf-2) ){
 819           *pnBuf = nBuf - 2;
 820         }
 821       }
 822       break;
 823
 824     case 's':
 825       if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
 826         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
 827           *pnBuf = nBuf - 3;
 828         }
 829       }
 830       break;
 831
 832     case 't':
 833       if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
 834         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
 835           *pnBuf = nBuf - 3;
 836         }
 837       }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
 838         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
 839           *pnBuf = nBuf - 3;
 840         }
 841       }
 842       break;
 843
 844     case 'u':
 845       if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
 846         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
 847           *pnBuf = nBuf - 3;
 848         }
 849       }
 850       break;
 851
 852     case 'v':
 853       if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
 854         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
 855           *pnBuf = nBuf - 3;
 856         }
 857       }
 858       break;
 859
 860     case 'z':
 861       if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
 862         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
 863           *pnBuf = nBuf - 3;
 864         }
 865       }
 866       break;
 867
 868   }
 869   return ret;
 870 }
 871
 872
 873 static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
 874   int ret = 0;
 875   int nBuf = *pnBuf;
 876   switch( aBuf[nBuf-2] ){
 877
 878     case 'a':
 879       if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
 880         memcpy(&aBuf[nBuf-2], "ate", 3);
 881         *pnBuf = nBuf - 2 + 3;
 882         ret = 1;
 883       }
 884       break;
 885
 886     case 'b':
 887       if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
 888         memcpy(&aBuf[nBuf-2], "ble", 3);
 889         *pnBuf = nBuf - 2 + 3;
 890         ret = 1;
 891       }
 892       break;
 893
 894     case 'i':
 895       if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
 896         memcpy(&aBuf[nBuf-2], "ize", 3);
 897         *pnBuf = nBuf - 2 + 3;
 898         ret = 1;
 899       }
 900       break;
 901
 902   }
 903   return ret;
 904 }
 905
 906
 907 static int fts5PorterStep2(char *aBuf, int *pnBuf){
 908   int ret = 0;
 909   int nBuf = *pnBuf;
 910   switch( aBuf[nBuf-2] ){
 911
 912     case 'a':
 913       if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
 914         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
 915           memcpy(&aBuf[nBuf-7], "ate", 3);
 916           *pnBuf = nBuf - 7 + 3;
 917         }
 918       }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
 919         if( fts5Porter_MGt0(aBuf, nBuf-6) ){
 920           memcpy(&aBuf[nBuf-6], "tion", 4);
 921           *pnBuf = nBuf - 6 + 4;
 922         }
 923       }
 924       break;
 925
 926     case 'c':
 927       if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
 928         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 929           memcpy(&aBuf[nBuf-4], "ence", 4);
 930           *pnBuf = nBuf - 4 + 4;
 931         }
 932       }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
 933         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 934           memcpy(&aBuf[nBuf-4], "ance", 4);
 935           *pnBuf = nBuf - 4 + 4;
 936         }
 937       }
 938       break;
 939
 940     case 'e':
 941       if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
 942         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 943           memcpy(&aBuf[nBuf-4], "ize", 3);
 944           *pnBuf = nBuf - 4 + 3;
 945         }
 946       }
 947       break;
 948
 949     case 'g':
 950       if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
 951         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 952           memcpy(&aBuf[nBuf-4], "log", 3);
 953           *pnBuf = nBuf - 4 + 3;
 954         }
 955       }
 956       break;
 957
 958     case 'l':
 959       if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
 960         if( fts5Porter_MGt0(aBuf, nBuf-3) ){
 961           memcpy(&aBuf[nBuf-3], "ble", 3);
 962           *pnBuf = nBuf - 3 + 3;
 963         }
 964       }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
 965         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 966           memcpy(&aBuf[nBuf-4], "al", 2);
 967           *pnBuf = nBuf - 4 + 2;
 968         }
 969       }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
 970         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
 971           memcpy(&aBuf[nBuf-5], "ent", 3);
 972           *pnBuf = nBuf - 5 + 3;
 973         }
 974       }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
 975         if( fts5Porter_MGt0(aBuf, nBuf-3) ){
 976           memcpy(&aBuf[nBuf-3], "e", 1);
 977           *pnBuf = nBuf - 3 + 1;
 978         }
 979       }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
 980         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
 981           memcpy(&aBuf[nBuf-5], "ous", 3);
 982           *pnBuf = nBuf - 5 + 3;
 983         }
 984       }
 985       break;
 986
 987     case 'o':
 988       if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
 989         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
 990           memcpy(&aBuf[nBuf-7], "ize", 3);
 991           *pnBuf = nBuf - 7 + 3;
 992         }
 993       }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
 994         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
 995           memcpy(&aBuf[nBuf-5], "ate", 3);
 996           *pnBuf = nBuf - 5 + 3;
 997         }
 998       }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
 999         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
1000           memcpy(&aBuf[nBuf-4], "ate", 3);
1001           *pnBuf = nBuf - 4 + 3;
1002         }
1003       }
1004       break;
1005
1006     case 's':
1007       if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
1008         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1009           memcpy(&aBuf[nBuf-5], "al", 2);
1010           *pnBuf = nBuf - 5 + 2;
1011         }
1012       }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
1013         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1014           memcpy(&aBuf[nBuf-7], "ive", 3);
1015           *pnBuf = nBuf - 7 + 3;
1016         }
1017       }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
1018         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1019           memcpy(&aBuf[nBuf-7], "ful", 3);
1020           *pnBuf = nBuf - 7 + 3;
1021         }
1022       }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
1023         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1024           memcpy(&aBuf[nBuf-7], "ous", 3);
1025           *pnBuf = nBuf - 7 + 3;
1026         }
1027       }
1028       break;
1029
1030     case 't':
1031       if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
1032         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1033           memcpy(&aBuf[nBuf-5], "al", 2);
1034           *pnBuf = nBuf - 5 + 2;
1035         }
1036       }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
1037         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1038           memcpy(&aBuf[nBuf-5], "ive", 3);
1039           *pnBuf = nBuf - 5 + 3;
1040         }
1041       }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
1042         if( fts5Porter_MGt0(aBuf, nBuf-6) ){
1043           memcpy(&aBuf[nBuf-6], "ble", 3);
1044           *pnBuf = nBuf - 6 + 3;
1045         }
1046       }
1047       break;
1048
1049   }
1050   return ret;
1051 }
1052
1053
/*
** Suffix-rewriting step 3 of the stemmer.
**
** aBuf holds the word being stemmed and *pnBuf its current length in
** bytes. If the word ends in one of the suffixes below, and
** fts5Porter_MGt0() accepts the stem that remains once the suffix is
** removed, the suffix is rewritten in place and *pnBuf is updated:
**
**     ical  -> ic        ness  -> (removed)
**     icate -> ic        iciti -> ic
**     ful   -> (removed) ative -> (removed)
**     alize -> al
**
** At most one rule is applied. The return value is always 0.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* The shortest suffix ("ful") needs nBuf>3, so nothing can match a
  ** shorter word. Returning early also avoids evaluating aBuf[nBuf-2]
  ** with a negative index when only a single byte of the word is left. */
  if( nBuf<4 ) return ret;

  /* Dispatch on the second-to-last byte so that at most a couple of
  ** memcmp() calls are made per word. */
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ic", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }
      break;

    case 's':
      if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }
      break;

    case 'z':
      if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

  }
  return ret;
}
1118
1119
/*
** Step 1B of the stemmer. aBuf holds the word and *pnBuf its length.
**
**   "eed" -> "ee"   if fts5Porter_MGt0() accepts the stem before "eed"
**   "ed"  -> ""     if fts5Porter_Vowel() accepts the stem before "ed"
**   "ing" -> ""     if fts5Porter_Vowel() accepts the stem before "ing"
**
** A word ending in "eed" is never considered for the "ed" rule, even
** if the "eed" rule ends up not being applied.
**
** Returns 1 if "ed" or "ing" was removed, or 0 otherwise (including
** when "eed" was rewritten).
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  const int n = *pnBuf;
  const char cKey = aBuf[n-2];    /* Second-to-last byte selects the rule */
  int bStripped = 0;

  if( cKey=='e' ){
    if( n>3 && memcmp(&aBuf[n-3], "eed", 3)==0 ){
      if( fts5Porter_MGt0(aBuf, n-3) ){
        memcpy(&aBuf[n-3], "ee", 2);
        *pnBuf = n - 1;
      }
    }else if( n>2 && memcmp(&aBuf[n-2], "ed", 2)==0 ){
      if( fts5Porter_Vowel(aBuf, n-2) ){
        *pnBuf = n - 2;
        bStripped = 1;
      }
    }
  }else if( cKey=='n' ){
    if( n>3 && memcmp(&aBuf[n-3], "ing", 3)==0
     && fts5Porter_Vowel(aBuf, n-3)
    ){
      *pnBuf = n - 3;
      bStripped = 1;
    }
  }

  return bStripped;
}
1151
1152 /*
1153 ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
1154 ***************************************************************************
1155 **************************************************************************/
1156
/*
** Step 1A of the stemmer: plural handling. aBuf holds the word and
** *pnBuf its length in bytes; only *pnBuf is modified.
**
**   "...sses" -> "...ss"    (drop 2 bytes; requires length > 4)
**   "...ies"  -> "...i"     (drop 2 bytes; requires length > 3)
**   "...es"   -> "...e"     (drop 1 byte; any other word ending in "es")
**   "...ss"   -> unchanged
**   "...s"    -> "..."      (drop 1 byte)
**
** The caller must ensure that the word is at least 2 bytes long.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  const int n = *pnBuf;
  char cPrev;

  if( aBuf[n-1]!='s' ) return;    /* Not a candidate plural */

  cPrev = aBuf[n-2];
  if( cPrev=='s' ) return;        /* "ss" is left alone */

  if( cPrev=='e' ){
    int bSses = (n>4 && aBuf[n-4]=='s' && aBuf[n-3]=='s');
    if( bSses || (n>3 && aBuf[n-3]=='i') ){
      *pnBuf = n - 2;
      return;
    }
  }
  *pnBuf = n - 1;
}
1174
1175 static int fts5PorterCb(
1176   void *pCtx,
1177   int tflags,
1178   const char *pToken,
1179   int nToken,
1180   int iStart,
1181   int iEnd
1182 ){
1183   PorterContext *p = (PorterContext*)pCtx;
1184
1185   char *aBuf;
1186   int nBuf;
1187
1188   if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
1189   aBuf = p->aBuf;
1190   nBuf = nToken;
1191   memcpy(aBuf, pToken, nBuf);
1192
1193   /* Step 1. */
1194   fts5PorterStep1A(aBuf, &nBuf);
1195   if( fts5PorterStep1B(aBuf, &nBuf) ){
1196     if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
1197       char c = aBuf[nBuf-1];
1198       if( fts5PorterIsVowel(c, 0)==0
1199        && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
1200       ){
1201         nBuf--;
1202       }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
1203         aBuf[nBuf++] = 'e';
1204       }
1205     }
1206   }
1207
1208   /* Step 1C. */
1209   if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
1210     aBuf[nBuf-1] = 'i';
1211   }
1212
1213   /* Steps 2 through 4. */
1214   fts5PorterStep2(aBuf, &nBuf);
1215   fts5PorterStep3(aBuf, &nBuf);
1216   fts5PorterStep4(aBuf, &nBuf);
1217
1218   /* Step 5a. */
1219   assert( nBuf>0 );
1220   if( aBuf[nBuf-1]=='e' ){
1221     if( fts5Porter_MGt1(aBuf, nBuf-1)
1222      || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
1223     ){
1224       nBuf--;
1225     }
1226   }
1227
1228   /* Step 5b. */
1229   if( nBuf>1 && aBuf[nBuf-1]=='l'
1230    && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1)
1231   ){
1232     nBuf--;
1233   }
1234
1235   return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
1236
1237  pass_through:
1238   return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
1239 }
1240
1241 /*
1242 ** Tokenize using the porter tokenizer.
1243 */
1244 static int fts5PorterTokenize(
1245   Fts5Tokenizer *pTokenizer,
1246   void *pCtx,
1247   int flags,
1248   const char *pText, int nText,
1249   int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
1250 ){
1251   PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
1252   PorterContext sCtx;
1253   sCtx.xToken = xToken;
1254   sCtx.pCtx = pCtx;
1255   sCtx.aBuf = p->aBuf;
1256   return p->tokenizer.xTokenize(
1257       p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
1258   );
1259 }
1260
1261 /**************************************************************************
1262 ** Start of trigram implementation.
1263 */
/*
** State of a single trigram tokenizer instance.
*/
typedef struct TrigramTokenizer {
  int bFold;                      /* True to fold to lower-case */
} TrigramTokenizer;
1268
1269 /*
1270 ** Free a trigram tokenizer.
1271 */
1272 static void fts5TriDelete(Fts5Tokenizer *p){
1273   sqlite3_free(p);
1274 }
1275
1276 /*
1277 ** Allocate a trigram tokenizer.
1278 */
1279 static int fts5TriCreate(
1280   void *pUnused,
1281   const char **azArg,
1282   int nArg,
1283   Fts5Tokenizer **ppOut
1284 ){
1285   int rc = SQLITE_OK;
1286   TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew));
1287   UNUSED_PARAM(pUnused);
1288   if( pNew==0 ){
1289     rc = SQLITE_NOMEM;
1290   }else{
1291     int i;
1292     pNew->bFold = 1;
1293     for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
1294       const char *zArg = azArg[i+1];
1295       if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){
1296         if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
1297           rc = SQLITE_ERROR;
1298         }else{
1299           pNew->bFold = (zArg[0]=='0');
1300         }
1301       }else{
1302         rc = SQLITE_ERROR;
1303       }
1304     }
1305     if( rc!=SQLITE_OK ){
1306       fts5TriDelete((Fts5Tokenizer*)pNew);
1307       pNew = 0;
1308     }
1309   }
1310   *ppOut = (Fts5Tokenizer*)pNew;
1311   return rc;
1312 }
1313
1314 /*
1315 ** Trigram tokenizer tokenize routine.
1316 */
1317 static int fts5TriTokenize(
1318   Fts5Tokenizer *pTok,
1319   void *pCtx,
1320   int unusedFlags,
1321   const char *pText, int nText,
1322   int (*xToken)(void*, int, const char*, int, int, int)
1323 ){
1324   TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1325   int rc = SQLITE_OK;
1326   char aBuf[32];
1327   const unsigned char *zIn = (const unsigned char*)pText;
1328   const unsigned char *zEof = &zIn[nText];
1329   u32 iCode;
1330
1331   UNUSED_PARAM(unusedFlags);
1332   while( 1 ){
1333     char *zOut = aBuf;
1334     int iStart = zIn - (const unsigned char*)pText;
1335     const unsigned char *zNext;
1336
1337     READ_UTF8(zIn, zEof, iCode);
1338     if( iCode==0 ) break;
1339     zNext = zIn;
1340     if( zIn<zEof ){
1341       if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
1342       WRITE_UTF8(zOut, iCode);
1343       READ_UTF8(zIn, zEof, iCode);
1344       if( iCode==0 ) break;
1345     }else{
1346       break;
1347     }
1348     if( zIn<zEof ){
1349       if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
1350       WRITE_UTF8(zOut, iCode);
1351       READ_UTF8(zIn, zEof, iCode);
1352       if( iCode==0 ) break;
1353       if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
1354       WRITE_UTF8(zOut, iCode);
1355     }else{
1356       break;
1357     }
1358     rc = xToken(pCtx, 0, aBuf, zOut-aBuf, iStart, iStart + zOut-aBuf);
1359     if( rc!=SQLITE_OK ) break;
1360     zIn = zNext;
1361   }
1362
1363   return rc;
1364 }
1365
1366 /*
1367 ** Argument xCreate is a pointer to a constructor function for a tokenizer.
1368 ** pTok is a tokenizer previously created using the same method. This function
1369 ** returns one of FTS5_PATTERN_NONE, FTS5_PATTERN_LIKE or FTS5_PATTERN_GLOB
1370 ** indicating the style of pattern matching that the tokenizer can support.
1371 ** In practice, this is:
1372 **
1373 **     "trigram" tokenizer, case_sensitive=1 - FTS5_PATTERN_GLOB
1374 **     "trigram" tokenizer, case_sensitive=0 (the default) - FTS5_PATTERN_LIKE
1375 **     all other tokenizers - FTS5_PATTERN_NONE
1376 */
1377 int sqlite3Fts5TokenizerPattern(
1378     int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
1379     Fts5Tokenizer *pTok
1380 ){
1381   if( xCreate==fts5TriCreate ){
1382     TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1383     return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB;
1384   }
1385   return FTS5_PATTERN_NONE;
1386 }
1387
1388 /*
1389 ** Register all built-in tokenizers with FTS5.
1390 */
1391 int sqlite3Fts5TokenizerInit(fts5_api *pApi){
1392   struct BuiltinTokenizer {
1393     const char *zName;
1394     fts5_tokenizer x;
1395   } aBuiltin[] = {
1396     { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
1397     { "ascii",     {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
1398     { "porter",    {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
1399     { "trigram",   {fts5TriCreate, fts5TriDelete, fts5TriTokenize}},
1400   };
1401
1402   int rc = SQLITE_OK;             /* Return code */
1403   int i;                          /* To iterate through builtin functions */
1404
1405   for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
1406     rc = pApi->xCreateTokenizer(pApi,
1407         aBuiltin[i].zName,
1408         (void*)pApi,
1409         &aBuiltin[i].x,
1410         0
1411     );
1412   }
1413
1414   return rc;
1415 }