ext/fts5/fts5_tokenize.c

   1 /*
   2 ** 2014 May 31
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 ******************************************************************************
  12 */
  13
  14
  15 #include "fts5Int.h"
  16
  17 /**************************************************************************
  18 ** Start of ascii tokenizer implementation.
  19 */
  20
/*
** For tokenizers with no "unicode" modifier, the set of token characters
** is the same as the set of ASCII range alphanumeric characters.
**
** The table is indexed by byte value (0x00..0x7F). An entry is 1 if the
** byte is a token character by default ('0'..'9', 'A'..'Z', 'a'..'z') and
** 0 if it is a separator. fts5AsciiCreate() copies this table into each
** new tokenizer object before applying any "tokenchars" or "separators"
** options.
*/
static unsigned char aAsciiTokenChar[128] = {
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
};
  35
  36 typedef struct AsciiTokenizer AsciiTokenizer;
  37 struct AsciiTokenizer {
  38   unsigned char aTokenChar[128];
  39 };
  40
  41 static void fts5AsciiAddExceptions(
  42   AsciiTokenizer *p,
  43   const char *zArg,
  44   int bTokenChars
  45 ){
  46   int i;
  47   for(i=0; zArg[i]; i++){
  48     if( (zArg[i] & 0x80)==0 ){
  49       p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars;
  50     }
  51   }
  52 }
  53
  54 /*
  55 ** Delete a "ascii" tokenizer.
  56 */
  57 static void fts5AsciiDelete(Fts5Tokenizer *p){
  58   sqlite3_free(p);
  59 }
  60
  61 /*
  62 ** Create an "ascii" tokenizer.
  63 */
  64 static int fts5AsciiCreate(
  65   void *pUnused,
  66   const char **azArg, int nArg,
  67   Fts5Tokenizer **ppOut
  68 ){
  69   int rc = SQLITE_OK;
  70   AsciiTokenizer *p = 0;
  71   UNUSED_PARAM(pUnused);
  72   if( nArg%2 ){
  73     rc = SQLITE_ERROR;
  74   }else{
  75     p = sqlite3_malloc(sizeof(AsciiTokenizer));
  76     if( p==0 ){
  77       rc = SQLITE_NOMEM;
  78     }else{
  79       int i;
  80       memset(p, 0, sizeof(AsciiTokenizer));
  81       memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
  82       for(i=0; rc==SQLITE_OK && i<nArg-1; i+=2){
  83         const char *zArg = azArg[i+1];
  84         if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
  85           fts5AsciiAddExceptions(p, zArg, 1);
  86         }else
  87         if( 0==sqlite3_stricmp(azArg[i], "separators") ){
  88           fts5AsciiAddExceptions(p, zArg, 0);
  89         }else{
  90           rc = SQLITE_ERROR;
  91         }
  92       }
  93       if( rc==SQLITE_OK && i<nArg ) rc = SQLITE_ERROR;
  94       if( rc!=SQLITE_OK ){
  95         fts5AsciiDelete((Fts5Tokenizer*)p);
  96         p = 0;
  97       }
  98     }
  99   }
 100
 101   *ppOut = (Fts5Tokenizer*)p;
 102   return rc;
 103 }
 104
 105
 106 static void asciiFold(char *aOut, const char *aIn, int nByte){
 107   int i;
 108   for(i=0; i<nByte; i++){
 109     char c = aIn[i];
 110     if( c>='A' && c<='Z' ) c += 32;
 111     aOut[i] = c;
 112   }
 113 }
 114
 115 /*
 116 ** Tokenize some text using the ascii tokenizer.
 117 */
 118 static int fts5AsciiTokenize(
 119   Fts5Tokenizer *pTokenizer,
 120   void *pCtx,
 121   int iUnused,
 122   const char *pText, int nText,
 123   int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
 124 ){
 125   AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
 126   int rc = SQLITE_OK;
 127   int ie;
 128   int is = 0;
 129
 130   char aFold[64];
 131   int nFold = sizeof(aFold);
 132   char *pFold = aFold;
 133   unsigned char *a = p->aTokenChar;
 134
 135   UNUSED_PARAM(iUnused);
 136
 137   while( is<nText && rc==SQLITE_OK ){
 138     int nByte;
 139
 140     /* Skip any leading divider characters. */
 141     while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
 142       is++;
 143     }
 144     if( is==nText ) break;
 145
 146     /* Count the token characters */
 147     ie = is+1;
 148     while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
 149       ie++;
 150     }
 151
 152     /* Fold to lower case */
 153     nByte = ie-is;
 154     if( nByte>nFold ){
 155       if( pFold!=aFold ) sqlite3_free(pFold);
 156       pFold = sqlite3_malloc64((sqlite3_int64)nByte*2);
 157       if( pFold==0 ){
 158         rc = SQLITE_NOMEM;
 159         break;
 160       }
 161       nFold = nByte*2;
 162     }
 163     asciiFold(pFold, &pText[is], nByte);
 164
 165     /* Invoke the token callback */
 166     rc = xToken(pCtx, 0, pFold, nByte, is, ie);
 167     is = ie+1;
 168   }
 169
 170   if( pFold!=aFold ) sqlite3_free(pFold);
 171   if( rc==SQLITE_DONE ) rc = SQLITE_OK;
 172   return rc;
 173 }
 174
 175 /**************************************************************************
 176 ** Start of unicode61 tokenizer implementation.
 177 */
 178
 179
 180 /*
 181 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
 182 ** from the sqlite3 source file utf.c. If this file is compiled as part
 183 ** of the amalgamation, they are not required.
 184 */
 185 #ifndef SQLITE_AMALGAMATION
 186
/*
** Lookup table used by READ_UTF8 for the first byte of a multi-byte
** sequence. It is indexed by (first byte - 0xC0) and yields the payload
** bits carried by that byte: the low 5 bits for 0xC0..0xDF, the low 4
** bits for 0xE0..0xEF, the low 3 bits for 0xF0..0xF7, and progressively
** fewer bits for the remaining values (0xFE and 0xFF map to zero).
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
 197
/*
** Read one character from the UTF-8 text at zIn into c, advancing zIn
** past the bytes consumed. zTerm points just past the end of the input;
** continuation bytes are never read at or beyond zTerm.
**
** A first byte below 0xC0 is returned as is. Otherwise the value is
** assembled from the first byte (via sqlite3Utf8Trans1[]) and every
** following continuation byte (10xxxxxx). If the result is less than
** 0x80, lies in the surrogate range 0xD800..0xDFFF, or has 0xFFFE or
** 0xFFFF as its value, it is replaced by 0xFFFD.
**
** This macro expands to more than one statement and evaluates its
** arguments several times, so it must be used as a stand-alone statement
** with simple lvalue arguments.
*/
#define READ_UTF8(zIn, zTerm, c)                           \
  c = *(zIn++);                                            \
  if( c>=0xc0 ){                                           \
    c = sqlite3Utf8Trans1[c-0xc0];                         \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
      c = (c<<6) + (0x3f & *(zIn++));                      \
    }                                                      \
    if( c<0x80                                             \
        || (c&0xFFFFF800)==0xD800                          \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
  }
 209
 210
/*
** Append the UTF-8 encoding of codepoint c to the buffer at zOut,
** advancing zOut past the bytes written. Between one and four bytes are
** written: one if c<0x80, two if c<0x800, three if c<0x10000 and four
** otherwise. No bounds checking is performed, so the caller must ensure
** that at least four bytes are available at zOut. Both arguments are
** evaluated more than once.
*/
#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (unsigned char)(c&0xFF);                 \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);     \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
  else if( c<0x10000 ){                                \
    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);    \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }else{                                               \
    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);  \
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);  \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);   \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);        \
  }                                                    \
}
 230
 231 #endif /* ifndef SQLITE_AMALGAMATION */
 232
/*
** Advance pointer zIn past a single UTF-8 character without decoding it.
** One byte is always consumed; if that byte is 0xC0 or greater, all
** immediately following continuation bytes (10xxxxxx) are consumed too.
**
** There is no end-of-buffer check: the scan stops only at the first byte
** that is not a continuation byte, so the input must be terminated by
** such a byte (for example a nul terminator).
*/
#define FTS5_SKIP_UTF8(zIn) {                               \
  if( ((unsigned char)(*(zIn++)))>=0xc0 ){                              \
    while( (((unsigned char)*zIn) & 0xc0)==0x80 ){ zIn++; }             \
  }                                                    \
}
 238
/*
** A unicode61 tokenizer object.
**
** ASCII-range bytes are classified directly using aTokenChar[]. Codepoints
** outside the ASCII range are classified by looking up their unicode
** category in aCategory[]; codepoints listed in aiException[] have that
** default classification inverted (see fts5UnicodeIsAlnum()).
*/
typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int eRemoveDiacritic;           /* One of the FTS5_REMOVE_DIACRITICS_* values */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Exception codepoints, in ascending order */

  unsigned char aCategory[32];    /* True for token char categories */
};

/* Values for eRemoveDiacritic (must match internals of fts5_unicode2.c) */
#define FTS5_REMOVE_DIACRITICS_NONE    0
#define FTS5_REMOVE_DIACRITICS_SIMPLE  1
#define FTS5_REMOVE_DIACRITICS_COMPLEX 2
 255
 256 static int fts5UnicodeAddExceptions(
 257   Unicode61Tokenizer *p,          /* Tokenizer object */
 258   const char *z,                  /* Characters to treat as exceptions */
 259   int bTokenChars                 /* 1 for 'tokenchars', 0 for 'separators' */
 260 ){
 261   int rc = SQLITE_OK;
 262   int n = (int)strlen(z);
 263   int *aNew;
 264
 265   if( n>0 ){
 266     aNew = (int*)sqlite3_realloc64(p->aiException,
 267                                    (n+p->nException)*sizeof(int));
 268     if( aNew ){
 269       int nNew = p->nException;
 270       const unsigned char *zCsr = (const unsigned char*)z;
 271       const unsigned char *zTerm = (const unsigned char*)&z[n];
 272       while( zCsr<zTerm ){
 273         u32 iCode;
 274         int bToken;
 275         READ_UTF8(zCsr, zTerm, iCode);
 276         if( iCode<128 ){
 277           p->aTokenChar[iCode] = (unsigned char)bTokenChars;
 278         }else{
 279           bToken = p->aCategory[sqlite3Fts5UnicodeCategory(iCode)];
 280           assert( (bToken==0 || bToken==1) );
 281           assert( (bTokenChars==0 || bTokenChars==1) );
 282           if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
 283             int i;
 284             for(i=0; i<nNew; i++){
 285               if( (u32)aNew[i]>iCode ) break;
 286             }
 287             memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
 288             aNew[i] = iCode;
 289             nNew++;
 290           }
 291         }
 292       }
 293       p->aiException = aNew;
 294       p->nException = nNew;
 295     }else{
 296       rc = SQLITE_NOMEM;
 297     }
 298   }
 299
 300   return rc;
 301 }
 302
 303 /*
 304 ** Return true if the p->aiException[] array contains the value iCode.
 305 */
 306 static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
 307   if( p->nException>0 ){
 308     int *a = p->aiException;
 309     int iLo = 0;
 310     int iHi = p->nException-1;
 311
 312     while( iHi>=iLo ){
 313       int iTest = (iHi + iLo) / 2;
 314       if( iCode==a[iTest] ){
 315         return 1;
 316       }else if( iCode>a[iTest] ){
 317         iLo = iTest+1;
 318       }else{
 319         iHi = iTest-1;
 320       }
 321     }
 322   }
 323
 324   return 0;
 325 }
 326
 327 /*
 328 ** Delete a "unicode61" tokenizer.
 329 */
 330 static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
 331   if( pTok ){
 332     Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
 333     sqlite3_free(p->aiException);
 334     sqlite3_free(p->aFold);
 335     sqlite3_free(p);
 336   }
 337   return;
 338 }
 339
 340 static int unicodeSetCategories(Unicode61Tokenizer *p, const char *zCat){
 341   const char *z = zCat;
 342
 343   while( *z ){
 344     while( *z==' ' || *z=='\t' ) z++;
 345     if( *z && sqlite3Fts5UnicodeCatParse(z, p->aCategory) ){
 346       return SQLITE_ERROR;
 347     }
 348     while( *z!=' ' && *z!='\t' && *z!='\0' ) z++;
 349   }
 350
 351   sqlite3Fts5UnicodeAscii(p->aCategory, p->aTokenChar);
 352   return SQLITE_OK;
 353 }
 354
 355 /*
 356 ** Create a "unicode61" tokenizer.
 357 */
 358 static int fts5UnicodeCreate(
 359   void *pUnused,
 360   const char **azArg, int nArg,
 361   Fts5Tokenizer **ppOut
 362 ){
 363   int rc = SQLITE_OK;             /* Return code */
 364   Unicode61Tokenizer *p = 0;      /* New tokenizer object */
 365
 366   UNUSED_PARAM(pUnused);
 367
 368   if( nArg%2 ){
 369     rc = SQLITE_ERROR;
 370   }else{
 371     p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
 372     if( p ){
 373       const char *zCat = "L* N* Co";
 374       int i;
 375       memset(p, 0, sizeof(Unicode61Tokenizer));
 376
 377       p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
 378       p->nFold = 64;
 379       p->aFold = sqlite3_malloc64(p->nFold * sizeof(char));
 380       if( p->aFold==0 ){
 381         rc = SQLITE_NOMEM;
 382       }
 383
 384       /* Search for a "categories" argument */
 385       for(i=0; rc==SQLITE_OK && i<nArg-1; i+=2){
 386         if( 0==sqlite3_stricmp(azArg[i], "categories") ){
 387           zCat = azArg[i+1];
 388         }
 389       }
 390       if( rc==SQLITE_OK ){
 391         rc = unicodeSetCategories(p, zCat);
 392       }
 393
 394       for(i=0; rc==SQLITE_OK && i<nArg-1; i+=2){
 395         const char *zArg = azArg[i+1];
 396         if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
 397           if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
 398             rc = SQLITE_ERROR;
 399           }else{
 400             p->eRemoveDiacritic = (zArg[0] - '0');
 401             assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
 402                  || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
 403                  || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
 404             );
 405           }
 406         }else
 407         if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
 408           rc = fts5UnicodeAddExceptions(p, zArg, 1);
 409         }else
 410         if( 0==sqlite3_stricmp(azArg[i], "separators") ){
 411           rc = fts5UnicodeAddExceptions(p, zArg, 0);
 412         }else
 413         if( 0==sqlite3_stricmp(azArg[i], "categories") ){
 414           /* no-op */
 415         }else{
 416           rc = SQLITE_ERROR;
 417         }
 418       }
 419       if( i<nArg && rc==SQLITE_OK ) rc = SQLITE_ERROR;
 420
 421     }else{
 422       rc = SQLITE_NOMEM;
 423     }
 424     if( rc!=SQLITE_OK ){
 425       fts5UnicodeDelete((Fts5Tokenizer*)p);
 426       p = 0;
 427     }
 428     *ppOut = (Fts5Tokenizer*)p;
 429   }
 430   return rc;
 431 }
 432
 433 /*
 434 ** Return true if, for the purposes of tokenizing with the tokenizer
 435 ** passed as the first argument, codepoint iCode is considered a token
 436 ** character (not a separator).
 437 */
 438 static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
 439   return (
 440     p->aCategory[sqlite3Fts5UnicodeCategory((u32)iCode)]
 441     ^ fts5UnicodeIsException(p, iCode)
 442   );
 443 }
 444
/*
** Tokenize some text using the unicode61 tokenizer.
**
** pText/nText identify the input buffer. For each token found, xToken()
** is invoked with pCtx, a flags value of 0, the folded token text and its
** size in bytes, and the byte offsets (relative to pText) at which the
** token starts and ends within the input. The folded text is built in the
** tokenizer's own aFold[] buffer, which is enlarged as required.
**
** Tokenization stops as soon as xToken() returns anything other than
** SQLITE_OK. The return value is SQLITE_OK on success (an SQLITE_DONE
** returned by the callback is converted to SQLITE_OK), SQLITE_NOMEM if
** the fold buffer cannot be enlarged, or else the error code returned by
** the callback.
*/
static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  int iUnused,
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  int rc = SQLITE_OK;
  unsigned char *a = p->aTokenChar;

  unsigned char *zTerm = (unsigned char*)&pText[nText];
  unsigned char *zCsr = (unsigned char *)pText;

  /* Output buffer */
  char *aFold = p->aFold;
  int nFold = p->nFold;
  /* Once zOut passes pEnd, fewer than 6 bytes remain in the buffer and it
  ** must be enlarged before another character is written. */
  const char *pEnd = &aFold[nFold-6];

  UNUSED_PARAM(iUnused);

  /* Each iteration of this loop gobbles up a contiguous run of separators,
  ** then the next token.  */
  while( rc==SQLITE_OK ){
    u32 iCode;                    /* non-ASCII codepoint read from input */
    char *zOut = aFold;
    int is;
    int ie;

    /* Skip any separator characters. This loop is only ever exited via
    ** one of the goto statements: either the input is exhausted, or the
    ** first character of a token has been found. In the latter case "is"
    ** has been set to the offset of that character and control jumps
    ** directly into the body of the token loop below. */
    while( 1 ){
      if( zCsr>=zTerm ) goto tokenize_done;
      if( *zCsr & 0x80 ) {
        /* A character outside of the ascii range. Skip past it if it is
        ** a separator character. Or break out of the loop if it is not. */
        is = zCsr - (unsigned char*)pText;
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p, iCode) ){
          goto non_ascii_tokenchar;
        }
      }else{
        if( a[*zCsr] ){
          is = zCsr - (unsigned char*)pText;
          goto ascii_tokenchar;
        }
        zCsr++;
      }
    }

    /* Run through the tokenchars. Fold them into the output buffer along
    ** the way.  */
    while( zCsr<zTerm ){

      /* Grow the output buffer so that there is sufficient space to fit the
      ** largest possible utf-8 character.  */
      if( zOut>pEnd ){
        aFold = sqlite3_malloc64((sqlite3_int64)nFold*2);
        if( aFold==0 ){
          rc = SQLITE_NOMEM;
          goto tokenize_done;
        }
        /* Re-base zOut onto the new buffer before the old one is freed. */
        zOut = &aFold[zOut - p->aFold];
        memcpy(aFold, p->aFold, nFold);
        sqlite3_free(p->aFold);
        p->aFold = aFold;
        p->nFold = nFold = nFold*2;
        pEnd = &aFold[nFold-6];
      }

      if( *zCsr & 0x80 ){
        /* An non-ascii-range character. Fold it into the output buffer if
        ** it is a token character, or break out of the loop if it is not. */
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
 non_ascii_tokenchar:
          /* Entry point from the separator loop: iCode has already been
          ** decoded and zCsr points just past it. Nothing is written if the
          ** folded value is zero. */
          iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
          if( iCode ) WRITE_UTF8(zOut, iCode);
        }else{
          break;
        }
      }else if( a[*zCsr]==0 ){
        /* An ascii-range separator character. End of token. */
        break;
      }else{
 ascii_tokenchar:
        /* Entry point from the separator loop: zCsr points at an ascii
        ** token character. Upper case letters are folded to lower case. */
        if( *zCsr>='A' && *zCsr<='Z' ){
          *zOut++ = *zCsr + 32;
        }else{
          *zOut++ = *zCsr;
        }
        zCsr++;
      }
      /* Offset just past the most recently consumed token character. */
      ie = zCsr - (unsigned char*)pText;
    }

    /* Invoke the token callback */
    rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
  }

 tokenize_done:
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}
 548
 549 /**************************************************************************
 550 ** Start of porter stemmer implementation.
 551 */
 552
/* Any tokens larger than this (in bytes) are passed through without
** stemming. */
#define FTS5_PORTER_MAX_TOKEN 64

/*
** A porter tokenizer object. The porter tokenizer wraps a "parent"
** tokenizer, which is looked up by name and instantiated by
** fts5PorterCreate().
*/
typedef struct PorterTokenizer PorterTokenizer;
struct PorterTokenizer {
  fts5_tokenizer tokenizer;       /* Parent tokenizer module */
  Fts5Tokenizer *pTokenizer;      /* Parent tokenizer instance */
  /* Scratch buffer with 64 bytes of slack beyond the largest stemmable
  ** token. NOTE(review): presumably holds the token while it is being
  ** stemmed; the code that uses it is not visible here - confirm. */
  char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
};
 563
 564 /*
 565 ** Delete a "porter" tokenizer.
 566 */
 567 static void fts5PorterDelete(Fts5Tokenizer *pTok){
 568   if( pTok ){
 569     PorterTokenizer *p = (PorterTokenizer*)pTok;
 570     if( p->pTokenizer ){
 571       p->tokenizer.xDelete(p->pTokenizer);
 572     }
 573     sqlite3_free(p);
 574   }
 575 }
 576
 577 /*
 578 ** Create a "porter" tokenizer.
 579 */
 580 static int fts5PorterCreate(
 581   void *pCtx,
 582   const char **azArg, int nArg,
 583   Fts5Tokenizer **ppOut
 584 ){
 585   fts5_api *pApi = (fts5_api*)pCtx;
 586   int rc = SQLITE_OK;
 587   PorterTokenizer *pRet;
 588   void *pUserdata = 0;
 589   const char *zBase = "unicode61";
 590
 591   if( nArg>0 ){
 592     zBase = azArg[0];
 593   }
 594
 595   pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
 596   if( pRet ){
 597     memset(pRet, 0, sizeof(PorterTokenizer));
 598     rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer);
 599   }else{
 600     rc = SQLITE_NOMEM;
 601   }
 602   if( rc==SQLITE_OK ){
 603     int nArg2 = (nArg>0 ? nArg-1 : 0);
 604     const char **azArg2 = (nArg2 ? &azArg[1] : 0);
 605     rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer);
 606   }
 607
 608   if( rc!=SQLITE_OK ){
 609     fts5PorterDelete((Fts5Tokenizer*)pRet);
 610     pRet = 0;
 611   }
 612   *ppOut = (Fts5Tokenizer*)pRet;
 613   return rc;
 614 }
 615
/*
** Bundles a token callback together with its context pointer and a
** character buffer.
** NOTE(review): presumably this is the context handed to the parent
** tokenizer while the porter tokenizer runs, with aBuf pointing at
** PorterTokenizer.aBuf; the code that uses it is not visible here -
** confirm.
*/
typedef struct PorterContext PorterContext;
struct PorterContext {
  void *pCtx;                     /* Context pointer to pass to xToken */
  int (*xToken)(void*, int, const char*, int, int, int);  /* Token callback */
  char *aBuf;                     /* Character buffer */
};
 622
/*
** A single suffix-rewriting rule, as consumed by fts5PorterApply(). A
** rule matches if the buffer ends with zSuffix. If it matches and xCond
** is either NULL or returns non-zero for the stem (the buffer with the
** suffix removed), the suffix is replaced by zOutput. An array of rules
** is terminated by an entry with zSuffix set to NULL.
*/
typedef struct PorterRule PorterRule;
struct PorterRule {
  const char *zSuffix;            /* Suffix to match */
  int nSuffix;                    /* Length of zSuffix in bytes */
  int (*xCond)(char *zStem, int nStem);   /* Condition on the stem, or NULL */
  const char *zOutput;            /* Replacement for the suffix */
  int nOutput;                    /* Length of zOutput in bytes */
};
 631
/*
** This function is compiled out by the "#if 0" below.
**
** Apply the first rule in the NULL-terminated array aRule[] whose suffix
** matches the end of the *pnBuf byte buffer aBuf[]. If that rule's
** condition is satisfied (or it has no condition), the suffix is replaced
** by the rule's output text, *pnBuf is updated to the new length and the
** index of the rule within aRule[] is returned. Otherwise the buffer is
** left unmodified and -1 is returned. Rules that follow the first
** matching suffix are never considered.
*/
#if 0
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int ret = -1;
  int nBuf = *pnBuf;
  PorterRule *p;

  /* Find the first rule whose suffix matches the end of the buffer. */
  for(p=aRule; p->zSuffix; p++){
    assert( strlen(p->zSuffix)==p->nSuffix );
    assert( strlen(p->zOutput)==p->nOutput );
    if( nBuf<p->nSuffix ) continue;
    if( 0==memcmp(&aBuf[nBuf - p->nSuffix], p->zSuffix, p->nSuffix) ) break;
  }

  if( p->zSuffix ){
    int nStem = nBuf - p->nSuffix;
    if( p->xCond==0 || p->xCond(aBuf, nStem) ){
      memcpy(&aBuf[nStem], p->zOutput, p->nOutput);
      *pnBuf = nStem + p->nOutput;
      ret = p - aRule;
    }
  }

  return ret;
}
#endif
 657
 658 static int fts5PorterIsVowel(char c, int bYIsVowel){
 659   return (
 660       c=='a' || c=='e' || c=='i' || c=='o' || c=='u' || (bYIsVowel && c=='y')
 661   );
 662 }
 663
 664 static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
 665   int i;
 666   int bCons = bPrevCons;
 667
 668   /* Scan for a vowel */
 669   for(i=0; i<nStem; i++){
 670     if( 0==(bCons = !fts5PorterIsVowel(zStem[i], bCons)) ) break;
 671   }
 672
 673   /* Scan for a consonent */
 674   for(i++; i<nStem; i++){
 675     if( (bCons = !fts5PorterIsVowel(zStem[i], bCons)) ) return i+1;
 676   }
 677   return 0;
 678 }
 679
 680 /* porter rule condition: (m > 0) */
 681 static int fts5Porter_MGt0(char *zStem, int nStem){
 682   return !!fts5PorterGobbleVC(zStem, nStem, 0);
 683 }
 684
 685 /* porter rule condition: (m > 1) */
 686 static int fts5Porter_MGt1(char *zStem, int nStem){
 687   int n;
 688   n = fts5PorterGobbleVC(zStem, nStem, 0);
 689   if( n && fts5PorterGobbleVC(&zStem[n], nStem-n, 1) ){
 690     return 1;
 691   }
 692   return 0;
 693 }
 694
 695 /* porter rule condition: (m = 1) */
 696 static int fts5Porter_MEq1(char *zStem, int nStem){
 697   int n;
 698   n = fts5PorterGobbleVC(zStem, nStem, 0);
 699   if( n && 0==fts5PorterGobbleVC(&zStem[n], nStem-n, 1) ){
 700     return 1;
 701   }
 702   return 0;
 703 }
 704
 705 /* porter rule condition: (*o) */
 706 static int fts5Porter_Ostar(char *zStem, int nStem){
 707   if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
 708     return 0;
 709   }else{
 710     int i;
 711     int mask = 0;
 712     int bCons = 0;
 713     for(i=0; i<nStem; i++){
 714       bCons = !fts5PorterIsVowel(zStem[i], bCons);
 715       assert( bCons==0 || bCons==1 );
 716       mask = (mask << 1) + bCons;
 717     }
 718     return ((mask & 0x0007)==0x0005);
 719   }
 720 }
 721
 722 /* porter rule condition: (m > 1 and (*S or *T)) */
 723 static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
 724   assert( nStem>0 );
 725   return (zStem[nStem-1]=='s' || zStem[nStem-1]=='t')
 726       && fts5Porter_MGt1(zStem, nStem);
 727 }
 728
 729 /* porter rule condition: (*v*) */
 730 static int fts5Porter_Vowel(char *zStem, int nStem){
 731   int i;
 732   for(i=0; i<nStem; i++){
 733     if( fts5PorterIsVowel(zStem[i], i>0) ){
 734       return 1;
 735     }
 736   }
 737   return 0;
 738 }
 739
 740
 741 /**************************************************************************
 742 ***************************************************************************
 743 ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
 744 */
 745
/*
** Step 4 of the porter stemmer: suffix removal.
**
** aBuf[] contains the *pnBuf byte token. If the token ends with one of
** the suffixes handled below, is strictly longer than that suffix, and
** the stem (the token with the suffix removed) satisfies the rule's
** condition, the suffix is removed by reducing *pnBuf. The contents of
** aBuf[] are never modified. The condition is (m > 1) for every suffix
** except "ion", which requires (m > 1 and (*S or *T)).
**
** Candidate suffixes are selected by the second to last character of the
** token. Within each case, only the first suffix that matches is
** considered, even if its condition turns out not to be satisfied.
**
** aBuf[nBuf-2] is read unconditionally, so the token must be at least
** two bytes in size. The return value is always 0, as variable ret is
** never modified.
*/
static int fts5PorterStep4(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'e':
      if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'l':
      if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'n':
      /* "ement" is tested before "ment", and "ment" before "ent", so that
      ** the longest matching suffix is the one selected. */
      if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt1(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'o':
      /* "ion" is removed only if the stem also ends in 's' or 't'. */
      if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 's':
      if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 't':
      if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'z':
      if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

  }
  return ret;
}
 878
 879
 880 static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
 881   int ret = 0;
 882   int nBuf = *pnBuf;
 883   switch( aBuf[nBuf-2] ){
 884
 885     case 'a':
 886       if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
 887         memcpy(&aBuf[nBuf-2], "ate", 3);
 888         *pnBuf = nBuf - 2 + 3;
 889         ret = 1;
 890       }
 891       break;
 892
 893     case 'b':
 894       if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
 895         memcpy(&aBuf[nBuf-2], "ble", 3);
 896         *pnBuf = nBuf - 2 + 3;
 897         ret = 1;
 898       }
 899       break;
 900
 901     case 'i':
 902       if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
 903         memcpy(&aBuf[nBuf-2], "ize", 3);
 904         *pnBuf = nBuf - 2 + 3;
 905         ret = 1;
 906       }
 907       break;
 908
 909   }
 910   return ret;
 911 }
 912
 913
 914 static int fts5PorterStep2(char *aBuf, int *pnBuf){
 915   int ret = 0;
 916   int nBuf = *pnBuf;
 917   switch( aBuf[nBuf-2] ){
 918
 919     case 'a':
 920       if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
 921         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
 922           memcpy(&aBuf[nBuf-7], "ate", 3);
 923           *pnBuf = nBuf - 7 + 3;
 924         }
 925       }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
 926         if( fts5Porter_MGt0(aBuf, nBuf-6) ){
 927           memcpy(&aBuf[nBuf-6], "tion", 4);
 928           *pnBuf = nBuf - 6 + 4;
 929         }
 930       }
 931       break;
 932
 933     case 'c':
 934       if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
 935         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 936           memcpy(&aBuf[nBuf-4], "ence", 4);
 937           *pnBuf = nBuf - 4 + 4;
 938         }
 939       }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
 940         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 941           memcpy(&aBuf[nBuf-4], "ance", 4);
 942           *pnBuf = nBuf - 4 + 4;
 943         }
 944       }
 945       break;
 946
 947     case 'e':
 948       if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
 949         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 950           memcpy(&aBuf[nBuf-4], "ize", 3);
 951           *pnBuf = nBuf - 4 + 3;
 952         }
 953       }
 954       break;
 955
 956     case 'g':
 957       if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
 958         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 959           memcpy(&aBuf[nBuf-4], "log", 3);
 960           *pnBuf = nBuf - 4 + 3;
 961         }
 962       }
 963       break;
 964
 965     case 'l':
 966       if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
 967         if( fts5Porter_MGt0(aBuf, nBuf-3) ){
 968           memcpy(&aBuf[nBuf-3], "ble", 3);
 969           *pnBuf = nBuf - 3 + 3;
 970         }
 971       }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
 972         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
 973           memcpy(&aBuf[nBuf-4], "al", 2);
 974           *pnBuf = nBuf - 4 + 2;
 975         }
 976       }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
 977         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
 978           memcpy(&aBuf[nBuf-5], "ent", 3);
 979           *pnBuf = nBuf - 5 + 3;
 980         }
 981       }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
 982         if( fts5Porter_MGt0(aBuf, nBuf-3) ){
 983           memcpy(&aBuf[nBuf-3], "e", 1);
 984           *pnBuf = nBuf - 3 + 1;
 985         }
 986       }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
 987         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
 988           memcpy(&aBuf[nBuf-5], "ous", 3);
 989           *pnBuf = nBuf - 5 + 3;
 990         }
 991       }
 992       break;
 993
 994     case 'o':
 995       if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
 996         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
 997           memcpy(&aBuf[nBuf-7], "ize", 3);
 998           *pnBuf = nBuf - 7 + 3;
 999         }
1000       }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
1001         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1002           memcpy(&aBuf[nBuf-5], "ate", 3);
1003           *pnBuf = nBuf - 5 + 3;
1004         }
1005       }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
1006         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
1007           memcpy(&aBuf[nBuf-4], "ate", 3);
1008           *pnBuf = nBuf - 4 + 3;
1009         }
1010       }
1011       break;
1012
1013     case 's':
1014       if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
1015         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1016           memcpy(&aBuf[nBuf-5], "al", 2);
1017           *pnBuf = nBuf - 5 + 2;
1018         }
1019       }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
1020         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1021           memcpy(&aBuf[nBuf-7], "ive", 3);
1022           *pnBuf = nBuf - 7 + 3;
1023         }
1024       }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
1025         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1026           memcpy(&aBuf[nBuf-7], "ful", 3);
1027           *pnBuf = nBuf - 7 + 3;
1028         }
1029       }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
1030         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1031           memcpy(&aBuf[nBuf-7], "ous", 3);
1032           *pnBuf = nBuf - 7 + 3;
1033         }
1034       }
1035       break;
1036
1037     case 't':
1038       if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
1039         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1040           memcpy(&aBuf[nBuf-5], "al", 2);
1041           *pnBuf = nBuf - 5 + 2;
1042         }
1043       }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
1044         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1045           memcpy(&aBuf[nBuf-5], "ive", 3);
1046           *pnBuf = nBuf - 5 + 3;
1047         }
1048       }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
1049         if( fts5Porter_MGt0(aBuf, nBuf-6) ){
1050           memcpy(&aBuf[nBuf-6], "ble", 3);
1051           *pnBuf = nBuf - 6 + 3;
1052         }
1053       }
1054       break;
1055
1056   }
1057   return ret;
1058 }
1059
1060
/*
** Porter stemmer step 3.
**
** aBuf[] holds a word that is (*pnBuf) bytes long. If the word ends in one
** of the suffixes below, and fts5Porter_MGt0() returns true for the stem
** that precedes the suffix, the suffix is rewritten in place and the new
** length is stored in *pnBuf:
**
**     ical  -> ic          ness  -> (removed)
**     icate -> ic          ful   -> (removed)
**     iciti -> ic          ative -> (removed)
**     alize -> al
**
** At most one rule is applied. The return value is always 0.
**
** NOTE(review): this function appears to lie inside the region of the file
** generated by mkportersteps.tcl. If so, the length guard below should be
** added to the generator as well - confirm before regenerating.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;

  /* Every rule below requires nBuf>3, so a shorter word can never be
  ** modified. Returning here also guarantees that the switch expression
  ** never reads aBuf[nBuf-2] with nBuf<2, which would be an access
  ** before the start of the buffer. */
  if( nBuf<4 ) return ret;

  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ic", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }
      break;

    case 's':
      if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }
      break;

    case 'z':
      if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

  }
  return ret;
}
1125
1126
/*
** Porter stemmer step 1b.
**
** aBuf[] holds a word that is (*pnBuf) bytes long. One of the following
** rules may be applied, with the new length stored in *pnBuf:
**
**     eed -> ee    if fts5Porter_MGt0() is true for the stem before "eed"
**     ed  -> ""    if fts5Porter_Vowel() is true for the stem before "ed"
**     ing -> ""    if fts5Porter_Vowel() is true for the stem before "ing"
**
** A word longer than 3 bytes that ends in "eed" is never tested against
** the "ed" rule, even if the "eed" rule left it unchanged.
**
** Return 1 if an "ed" or "ing" suffix was removed, or 0 otherwise
** (including when "eed" was rewritten to "ee").
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  const int nWord = *pnBuf;
  const char cPenult = aBuf[nWord-2];

  if( cPenult=='e' ){
    if( nWord>3 && memcmp(&aBuf[nWord-3], "eed", 3)==0 ){
      if( fts5Porter_MGt0(aBuf, nWord-3) ){
        memcpy(&aBuf[nWord-3], "ee", 2);
        *pnBuf = nWord - 1;
      }
      return 0;
    }
    if( nWord>2
     && memcmp(&aBuf[nWord-2], "ed", 2)==0
     && fts5Porter_Vowel(aBuf, nWord-2)
    ){
      *pnBuf = nWord - 2;
      return 1;
    }
    return 0;
  }

  if( cPenult=='n'
   && nWord>3
   && memcmp(&aBuf[nWord-3], "ing", 3)==0
   && fts5Porter_Vowel(aBuf, nWord-3)
  ){
    *pnBuf = nWord - 3;
    return 1;
  }

  return 0;
}
1158
1159 /*
1160 ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
1161 ***************************************************************************
1162 **************************************************************************/
1163
/*
** Porter stemmer step 1a.
**
** aBuf[] holds a word that is (*pnBuf) bytes long. If the word ends in
** 's', *pnBuf may be reduced according to the following rules. The
** contents of aBuf[] are never modified.
**
**     ...sses  -> drop 2 bytes   (only if the word is longer than 4 bytes)
**     ...ies   -> drop 2 bytes   (only if the word is longer than 3 bytes)
**     ...es    -> drop 1 byte    (any other word ending in "es")
**     ...ss    -> unchanged
**     ...s     -> drop 1 byte
**
** The caller must ensure that the word is at least 2 bytes long.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  const int nWord = *pnBuf;
  int nDrop;

  if( aBuf[nWord-1]!='s' ) return;

  switch( aBuf[nWord-2] ){
    case 's':
      /* Word ends in "ss" - leave it alone. */
      return;

    case 'e':
      if( nWord>4 && aBuf[nWord-3]=='s' && aBuf[nWord-4]=='s' ){
        nDrop = 2;                /* "sses" -> "ss" */
      }else if( nWord>3 && aBuf[nWord-3]=='i' ){
        nDrop = 2;                /* "ies" -> "i" */
      }else{
        nDrop = 1;                /* "es" -> "e" */
      }
      break;

    default:
      nDrop = 1;                  /* plain trailing "s" */
      break;
  }

  *pnBuf = nWord - nDrop;
}
1181
1182 static int fts5PorterCb(
1183   void *pCtx,
1184   int tflags,
1185   const char *pToken,
1186   int nToken,
1187   int iStart,
1188   int iEnd
1189 ){
1190   PorterContext *p = (PorterContext*)pCtx;
1191
1192   char *aBuf;
1193   int nBuf;
1194
1195   if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
1196   aBuf = p->aBuf;
1197   nBuf = nToken;
1198   memcpy(aBuf, pToken, nBuf);
1199
1200   /* Step 1. */
1201   fts5PorterStep1A(aBuf, &nBuf);
1202   if( fts5PorterStep1B(aBuf, &nBuf) ){
1203     if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
1204       char c = aBuf[nBuf-1];
1205       if( fts5PorterIsVowel(c, 0)==0
1206        && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
1207       ){
1208         nBuf--;
1209       }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
1210         aBuf[nBuf++] = 'e';
1211       }
1212     }
1213   }
1214
1215   /* Step 1C. */
1216   if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
1217     aBuf[nBuf-1] = 'i';
1218   }
1219
1220   /* Steps 2 through 4. */
1221   fts5PorterStep2(aBuf, &nBuf);
1222   fts5PorterStep3(aBuf, &nBuf);
1223   fts5PorterStep4(aBuf, &nBuf);
1224
1225   /* Step 5a. */
1226   assert( nBuf>0 );
1227   if( aBuf[nBuf-1]=='e' ){
1228     if( fts5Porter_MGt1(aBuf, nBuf-1)
1229      || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
1230     ){
1231       nBuf--;
1232     }
1233   }
1234
1235   /* Step 5b. */
1236   if( nBuf>1 && aBuf[nBuf-1]=='l'
1237    && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1)
1238   ){
1239     nBuf--;
1240   }
1241
1242   return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
1243
1244  pass_through:
1245   return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
1246 }
1247
1248 /*
1249 ** Tokenize using the porter tokenizer.
1250 */
1251 static int fts5PorterTokenize(
1252   Fts5Tokenizer *pTokenizer,
1253   void *pCtx,
1254   int flags,
1255   const char *pText, int nText,
1256   int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
1257 ){
1258   PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
1259   PorterContext sCtx;
1260   sCtx.xToken = xToken;
1261   sCtx.pCtx = pCtx;
1262   sCtx.aBuf = p->aBuf;
1263   return p->tokenizer.xTokenize(
1264       p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
1265   );
1266 }
1267
1268 /**************************************************************************
1269 ** Start of trigram implementation.
1270 */
/*
** Configuration of a single "trigram" tokenizer instance. Objects of this
** type are allocated and populated by fts5TriCreate() and handed to the
** rest of fts5 cast to (Fts5Tokenizer*).
**
** bFold:
**   Defaults to 1. Cleared by the option "case_sensitive 1". When set,
**   each input character is passed through sqlite3Fts5UnicodeFold()
**   before being added to a trigram.
**
** iFoldParam:
**   Defaults to 0. Set to 2 by a "remove_diacritics" option with a value
**   of "1" or "2". fts5TriCreate() rejects a non-zero value when bFold
**   is 0.
*/
typedef struct TrigramTokenizer TrigramTokenizer;
struct TrigramTokenizer {
  int bFold;                      /* True to fold to lower-case */
  int iFoldParam;                 /* Parameter to pass to Fts5UnicodeFold() */
};
1276
/*
** Free a trigram tokenizer.
**
** Argument p is a TrigramTokenizer object, cast to (Fts5Tokenizer*), that
** was allocated by fts5TriCreate(). The object is a single allocation with
** no sub-allocations, so one call to sqlite3_free() releases it.
*/
static void fts5TriDelete(Fts5Tokenizer *p){
  sqlite3_free(p);
}
1283
1284 /*
1285 ** Allocate a trigram tokenizer.
1286 */
1287 static int fts5TriCreate(
1288   void *pUnused,
1289   const char **azArg,
1290   int nArg,
1291   Fts5Tokenizer **ppOut
1292 ){
1293   int rc = SQLITE_OK;
1294   TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew));
1295   UNUSED_PARAM(pUnused);
1296   if( pNew==0 ){
1297     rc = SQLITE_NOMEM;
1298   }else{
1299     int i;
1300     pNew->bFold = 1;
1301     pNew->iFoldParam = 0;
1302     for(i=0; rc==SQLITE_OK && i<nArg-1; i+=2){
1303       const char *zArg = azArg[i+1];
1304       if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){
1305         if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
1306           rc = SQLITE_ERROR;
1307         }else{
1308           pNew->bFold = (zArg[0]=='0');
1309         }
1310       }else if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
1311         if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
1312           rc = SQLITE_ERROR;
1313         }else{
1314           pNew->iFoldParam = (zArg[0]!='0') ? 2 : 0;
1315         }
1316       }else{
1317         rc = SQLITE_ERROR;
1318       }
1319     }
1320     if( i<nArg && rc==SQLITE_OK ) rc = SQLITE_ERROR;
1321
1322     if( pNew->iFoldParam!=0 && pNew->bFold==0 ){
1323       rc = SQLITE_ERROR;
1324     }
1325
1326     if( rc!=SQLITE_OK ){
1327       fts5TriDelete((Fts5Tokenizer*)pNew);
1328       pNew = 0;
1329     }
1330   }
1331   *ppOut = (Fts5Tokenizer*)pNew;
1332   return rc;
1333 }
1334
1335 /*
1336 ** Trigram tokenizer tokenize routine.
1337 */
1338 static int fts5TriTokenize(
1339   Fts5Tokenizer *pTok,
1340   void *pCtx,
1341   int unusedFlags,
1342   const char *pText, int nText,
1343   int (*xToken)(void*, int, const char*, int, int, int)
1344 ){
1345   TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1346   int rc = SQLITE_OK;
1347   char aBuf[32];
1348   char *zOut = aBuf;
1349   int ii;
1350   const unsigned char *zIn = (const unsigned char*)pText;
1351   const unsigned char *zEof = &zIn[nText];
1352   u32 iCode;
1353   int aStart[3];                  /* Input offset of each character in aBuf[] */
1354
1355   UNUSED_PARAM(unusedFlags);
1356
1357   /* Populate aBuf[] with the characters for the first trigram. */
1358   for(ii=0; ii<3; ii++){
1359     do {
1360       aStart[ii] = zIn - (const unsigned char*)pText;
1361       READ_UTF8(zIn, zEof, iCode);
1362       if( iCode==0 ) return SQLITE_OK;
1363       if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
1364     }while( iCode==0 );
1365     WRITE_UTF8(zOut, iCode);
1366   }
1367
1368   /* At the start of each iteration of this loop:
1369   **
1370   **  aBuf:      Contains 3 characters. The 3 characters of the next trigram.
1371   **  zOut:      Points to the byte following the last character in aBuf.
1372   **  aStart[3]: Contains the byte offset in the input text corresponding
1373   **             to the start of each of the three characters in the buffer.
1374   */
1375   assert( zIn<=zEof );
1376   while( 1 ){
1377     int iNext;                    /* Start of character following current tri */
1378     const char *z1;
1379
1380     /* Read characters from the input up until the first non-diacritic */
1381     do {
1382       iNext = zIn - (const unsigned char*)pText;
1383       READ_UTF8(zIn, zEof, iCode);
1384       if( iCode==0 ) break;
1385       if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, p->iFoldParam);
1386     }while( iCode==0 );
1387
1388     /* Pass the current trigram back to fts5 */
1389     rc = xToken(pCtx, 0, aBuf, zOut-aBuf, aStart[0], iNext);
1390     if( iCode==0 || rc!=SQLITE_OK ) break;
1391
1392     /* Remove the first character from buffer aBuf[]. Append the character
1393     ** with codepoint iCode.  */
1394     z1 = aBuf;
1395     FTS5_SKIP_UTF8(z1);
1396     memmove(aBuf, z1, zOut - z1);
1397     zOut -= (z1 - aBuf);
1398     WRITE_UTF8(zOut, iCode);
1399
1400     /* Update the aStart[] array */
1401     aStart[0] = aStart[1];
1402     aStart[1] = aStart[2];
1403     aStart[2] = iNext;
1404   }
1405
1406   return rc;
1407 }
1408
1409 /*
1410 ** Argument xCreate is a pointer to a constructor function for a tokenizer.
1411 ** pTok is a tokenizer previously created using the same method. This function
1412 ** returns one of FTS5_PATTERN_NONE, FTS5_PATTERN_LIKE or FTS5_PATTERN_GLOB
1413 ** indicating the style of pattern matching that the tokenizer can support.
1414 ** In practice, this is:
1415 **
1416 **     "trigram" tokenizer, case_sensitive=1 - FTS5_PATTERN_GLOB
1417 **     "trigram" tokenizer, case_sensitive=0 (the default) - FTS5_PATTERN_LIKE
1418 **     all other tokenizers - FTS5_PATTERN_NONE
1419 */
1420 int sqlite3Fts5TokenizerPattern(
1421     int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
1422     Fts5Tokenizer *pTok
1423 ){
1424   if( xCreate==fts5TriCreate ){
1425     TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1426     if( p->iFoldParam==0 ){
1427       return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB;
1428     }
1429   }
1430   return FTS5_PATTERN_NONE;
1431 }
1432
1433 /*
1434 ** Register all built-in tokenizers with FTS5.
1435 */
1436 int sqlite3Fts5TokenizerInit(fts5_api *pApi){
1437   struct BuiltinTokenizer {
1438     const char *zName;
1439     fts5_tokenizer x;
1440   } aBuiltin[] = {
1441     { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
1442     { "ascii",     {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
1443     { "porter",    {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
1444     { "trigram",   {fts5TriCreate, fts5TriDelete, fts5TriTokenize}},
1445   };
1446
1447   int rc = SQLITE_OK;             /* Return code */
1448   int i;                          /* To iterate through builtin functions */
1449
1450   for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
1451     rc = pApi->xCreateTokenizer(pApi,
1452         aBuiltin[i].zName,
1453         (void*)pApi,
1454         &aBuiltin[i].x,
1455         0
1456     );
1457   }
1458
1459   return rc;
1460 }