Merge pull request #37 from developernotes/prerelease
[sqlcipher.git] / src / test_spellfix.c
blob5a221e0b1b043a8af8708567e8d4a1dd28e4ef1b
1 /*
2 ** 2012 April 10
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 *************************************************************************
13 ** This module implements a VIRTUAL TABLE that can be used to search
14 ** a large vocabulary for close matches. For example, this virtual
15 ** table can be used to suggest corrections to misspelled words. Or,
16 ** it could be used with FTS4 to do full-text search using potentially
17 ** misspelled words.
19 ** Create an instance of the virtual table this way:
21 ** CREATE VIRTUAL TABLE demo USING spellfix1;
23 ** The "spellfix1" term is the name of this module. The "demo" is the
24 ** name of the virtual table you will be creating. The table is initially
25 ** empty. You have to populate it with your vocabulary. Suppose you
26 ** have a list of words in a table named "big_vocabulary". Then do this:
28 ** INSERT INTO demo(word) SELECT word FROM big_vocabulary;
30 ** If you intend to use this virtual table in cooperation with an FTS4
31 ** table (for spelling correctly of search terms) then you can extract
32 ** the vocabulary using an fts3aux table:
34 ** INSERT INTO demo(word) SELECT term FROM search_aux WHERE col='*';
36 ** You can also provide the virtual table with a "rank" for each word.
37 ** The "rank" is an estimate of how common the word is. Larger numbers
38 ** mean the word is more common. If you omit the rank when populating
39 ** the table, then a rank of 1 is assumed. But if you have rank
40 ** information, you can supply it and the virtual table will show a
41 ** slight preference for selecting more commonly used terms. To
42 ** populate the rank from an fts4aux table "search_aux" do something
43 ** like this:
45 ** INSERT INTO demo(word,rank)
46 ** SELECT term, documents FROM search_aux WHERE col='*';
48 ** To query the virtual table, include a MATCH operator in the WHERE
49 ** clause. For example:
51 ** SELECT word FROM demo WHERE word MATCH 'kennasaw';
53 ** Using a dataset of American place names (derived from
54 ** http://geonames.usgs.gov/domestic/download_data.htm) the query above
55 ** returns 20 results beginning with:
57 ** kennesaw
58 ** kenosha
59 ** kenesaw
60 ** kenaga
61 ** keanak
63 ** If you append the character '*' to the end of the pattern, then
64 ** a prefix search is performed. For example:
66 ** SELECT word FROM demo WHERE word MATCH 'kennes*';
68 ** Yields 20 results beginning with:
70 ** kennesaw
71 ** kennestone
72 ** kenneson
73 ** kenneys
74 ** keanes
75 ** keenes
77 ** The virtual table actually has a unique rowid with five columns plus three
78 ** extra hidden columns. The columns are as follows:
80 ** rowid A unique integer number associated with each
81 ** vocabulary item in the table. This can be used
82 ** as a foreign key on other tables in the database.
84 ** word The text of the word that matches the pattern.
85 ** Both word and pattern can contains unicode characters
86 ** and can be mixed case.
88 ** rank This is the rank of the word, as specified in the
89 ** original INSERT statement.
91 ** distance This is an edit distance or Levensthein distance going
92 ** from the pattern to the word.
94 ** langid This is the language-id of the word. All queries are
95 ** against a single language-id, which defaults to 0.
96 ** For any given query this value is the same on all rows.
98 ** score The score is a combination of rank and distance. The
99 ** idea is that a lower score is better. The virtual table
100 ** attempts to find words with the lowest score and
101 ** by default (unless overridden by ORDER BY) returns
102 ** results in order of increasing score.
104 ** top (HIDDEN) For any query, this value is the same on all
105 ** rows. It is an integer which is the maximum number of
106 ** rows that will be output. The actually number of rows
107 ** output might be less than this number, but it will never
108 ** be greater. The default value for top is 20, but that
109 ** can be changed for each query by including a term of
110 ** the form "top=N" in the WHERE clause of the query.
112 ** scope (HIDDEN) For any query, this value is the same on all
113 ** rows. The scope is a measure of how widely the virtual
114 ** table looks for matching words. Smaller values of
115 ** scope cause a broader search. The scope is normally
116 ** choosen automatically and is capped at 4. Applications
117 ** can change the scope by including a term of the form
118 ** "scope=N" in the WHERE clause of the query. Increasing
119 ** the scope will make the query run faster, but will reduce
120 ** the possible corrections.
122 ** srchcnt (HIDDEN) For any query, this value is the same on all
123 ** rows. This value is an integer which is the number of
124 ** of words examined using the edit-distance algorithm to
125 ** find the top matches that are ultimately displayed. This
126 ** value is for diagnostic use only.
128 ** soundslike (HIDDEN) When inserting vocabulary entries, this field
129 ** can be set to an spelling that matches what the word
130 ** sounds like. See the DEALING WITH UNUSUAL AND DIFFICULT
131 ** SPELLINGS section below for details.
133 ** When inserting into or updating the virtual table, only the rowid, word,
134 ** rank, and langid may be changes. Any attempt to set or modify the values
135 ** of distance, score, top, scope, or srchcnt is silently ignored.
137 ** ALGORITHM
139 ** A shadow table named "%_vocab" (where the % is replaced by the name of
140 ** the virtual table; Ex: "demo_vocab" for the "demo" virtual table) is
141 ** constructed with these columns:
143 ** id The unique id (INTEGER PRIMARY KEY)
145 ** rank The rank of word.
147 ** langid The language id for this entry.
149 ** word The original UTF8 text of the vocabulary word
151 ** k1 The word transliterated into lower-case ASCII.
152 ** There is a standard table of mappings from non-ASCII
153 ** characters into ASCII. Examples: "æ" -> "ae",
154 ** "þ" -> "th", "ß" -> "ss", "á" -> "a", ... The
155 ** accessory function spellfix1_translit(X) will do
156 ** the non-ASCII to ASCII mapping. The built-in lower(X)
157 ** function will convert to lower-case. Thus:
158 ** k1 = lower(spellfix1_translit(word)).
160 ** k2 This field holds a phonetic code derived from k1. Letters
161 ** that have similar sounds are mapped into the same symbol.
162 ** For example, all vowels and vowel clusters become the
163 ** single symbol "A". And the letters "p", "b", "f", and
164 ** "v" all become "B". All nasal sounds are represented
165 ** as "N". And so forth. The mapping is base on
166 ** ideas found in Soundex, Metaphone, and other
167 ** long-standing phonetic matching systems. This key can
168 ** be generated by the function spellfix1_charclass(X).
169 ** Hence: k2 = spellfix1_charclass(k1)
171 ** There is also a function for computing the Wagner edit distance or the
172 ** Levenshtein distance between a pattern and a word. This function
173 ** is exposed as spellfix1_editdist(X,Y). The edit distance function
174 ** returns the "cost" of converting X into Y. Some transformations
175 ** cost more than others. Changing one vowel into a different vowel,
176 ** for example is relatively cheap, as is doubling a constant, or
177 ** omitting the second character of a double-constant. Other transformations
178 ** or more expensive. The idea is that the edit distance function returns
179 ** a low cost of words that are similar and a higher cost for words
180 ** that are futher apart. In this implementation, the maximum cost
181 ** of any single-character edit (delete, insert, or substitute) is 100,
182 ** with lower costs for some edits (such as transforming vowels).
184 ** The "score" for a comparison is the edit distance between the pattern
185 ** and the word, adjusted down by the base-2 logorithm of the word rank.
186 ** For example, a match with distance 100 but rank 1000 would have a
187 ** score of 122 (= 100 - log2(1000) + 32) where as a match with distance
188 ** 100 with a rank of 1 would have a score of 131 (100 - log2(1) + 32).
189 ** (NB: The constant 32 is added to each score to keep it from going
190 ** negative in case the edit distance is zero.) In this way, frequently
191 ** used words get a slightly lower cost which tends to move them toward
192 ** the top of the list of alternative spellings.
194 ** A straightforward implementation of a spelling corrector would be
195 ** to compare the search term against every word in the vocabulary
196 ** and select the 20 with the lowest scores. However, there will
197 ** typically be hundreds of thousands or millions of words in the
198 ** vocabulary, and so this approach is not fast enough.
200 ** Suppose the term that is being spell-corrected is X. To limit
201 ** the search space, X is converted to a k2-like key using the
202 ** equivalent of:
204 ** key = spellfix1_charclass(lower(spellfix1_translit(X)))
206 ** This key is then limited to "scope" characters. The default scope
207 ** value is 4, but an alternative scope can be specified using the
208 ** "scope=N" term in the WHERE clause. After the key has been truncated,
209 ** the edit distance is run against every term in the vocabulary that
210 ** has a k2 value that begins with the abbreviated key.
212 ** For example, suppose the input word is "Paskagula". The phonetic
213 ** key is "BACACALA" which is then truncated to 4 characters "BACA".
214 ** The edit distance is then run on the 4980 entries (out of
215 ** 272,597 entries total) of the vocabulary whose k2 values begin with
216 ** BACA, yielding "Pascagoula" as the best match.
218 ** Only terms of the vocabulary with a matching langid are searched.
219 ** Hence, the same table can contain entries from multiple languages
220 ** and only the requested language will be used. The default langid
221 ** is 0.
223 ** DEALING WITH UNUSUAL AND DIFFICULT SPELLINGS
225 ** The algorithm above works quite well for most cases, but there are
226 ** exceptions. These exceptions can be dealt with by making additional
227 ** entries in the virtual table using the "soundslike" column.
229 ** For example, many words of Greek origin begin with letters "ps" where
230 ** the "p" is silent. Ex: psalm, pseudonym, psoriasis, psyche. In
231 ** another example, many Scottish surnames can be spelled with an
232 ** initial "Mac" or "Mc". Thus, "MacKay" and "McKay" are both pronounced
233 ** the same.
235 ** Accommodation can be made for words that are not spelled as they
236 ** sound by making additional entries into the virtual table for the
237 ** same word, but adding an alternative spelling in the "soundslike"
238 ** column. For example, the canonical entry for "psalm" would be this:
240 ** INSERT INTO demo(word) VALUES('psalm');
242 ** To enhance the ability to correct the spelling of "salm" into
243 ** "psalm", make an addition entry like this:
245 ** INSERT INTO demo(word,soundslike) VALUES('psalm','salm');
247 ** It is ok to make multiple entries for the same word as long as
248 ** each entry has a different soundslike value. Note that if no
249 ** soundslike value is specified, the soundslike defaults to the word
250 ** itself.
252 ** Listed below are some cases where it might make sense to add additional
253 ** soundslike entries. The specific entries will depend on the application
254 ** and the target language.
256 ** * Silent "p" in words beginning with "ps": psalm, psyche
258 ** * Silent "p" in words beginning with "pn": pneumonia, pneumatic
260 ** * Silent "p" in words beginning with "pt": pterodactyl, ptolemaic
262 ** * Silent "d" in words beginning with "dj": djinn, Djikarta
264 ** * Silent "k" in words beginning with "kn": knight, Knuthson
266 ** * Silent "g" in words beginning with "gn": gnarly, gnome, gnat
268 ** * "Mac" versus "Mc" beginning Scottish surnames
270 ** * "Tch" sounds in Slavic words: Tchaikovsky vs. Chaykovsky
272 ** * The letter "j" pronounced like "h" in Spanish: LaJolla
274 ** * Words beginning with "wr" versus "r": write vs. rite
276 ** * Miscellanous problem words such as "debt", "tsetse",
277 ** "Nguyen", "Van Nuyes".
279 #if SQLITE_CORE
280 # include "sqliteInt.h"
281 #else
282 # include <string.h>
283 # include <stdio.h>
284 # include <stdlib.h>
285 # include "sqlite3ext.h"
286 SQLITE_EXTENSION_INIT1
287 #endif /* !SQLITE_CORE */
290 ** Character classes for ASCII characters:
292 ** 0 '' Silent letters: H W
293 ** 1 'A' Any vowel: A E I O U (Y)
294 ** 2 'B' A bilabeal stop or fricative: B F P V
295 ** 3 'C' Other fricatives or back stops: C G J K Q S X Z
296 ** 4 'D' Alveolar stops: D T
297 ** 5 'H' Letter H at the beginning of a word
298 ** 6 'L' Glides: L R
299 ** 7 'M' Nasals: M N
300 ** 8 'W' Letter W at the beginning of a word
301 ** 9 'Y' Letter Y at the beginning of a word.
302 ** 10 '9' A digit: 0 1 2 3 4 5 6 7 8 9
303 ** 11 ' ' White space
304 ** 12 '?' Other.
306 #define CCLASS_SILENT 0
307 #define CCLASS_VOWEL 1
308 #define CCLASS_B 2
309 #define CCLASS_C 3
310 #define CCLASS_D 4
311 #define CCLASS_H 5
312 #define CCLASS_L 6
313 #define CCLASS_M 7
314 #define CCLASS_W 8
315 #define CCLASS_Y 9
316 #define CCLASS_DIGIT 10
317 #define CCLASS_SPACE 11
318 #define CCLASS_OTHER 12
321 ** The following table gives the character class for non-initial ASCII
322 ** characters.
324 static const unsigned char midClass[] = {
325 /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */
326 /* 0x */ 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 12, 11, 12, 12, 12,
327 /* 1x */ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
328 /* 2x */ 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
329 /* 3x */ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12,
330 /* 4x */ 12, 1, 2, 3, 4, 1, 2, 3, 0, 1, 3, 3, 6, 7, 7, 1,
331 /* 5x */ 2, 3, 6, 3, 4, 1, 2, 0, 3, 1, 3, 12, 12, 12, 12, 12,
332 /* 6x */ 12, 1, 2, 3, 4, 1, 2, 3, 0, 1, 3, 3, 6, 7, 7, 1,
333 /* 7x */ 2, 3, 6, 3, 4, 1, 2, 0, 3, 1, 3, 12, 12, 12, 12, 12,
337 ** This tables gives the character class for ASCII characters that form the
338 ** initial character of a word. The only difference from midClass is with
339 ** the letters H, W, and Y.
341 static const unsigned char initClass[] = {
342 /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */
343 /* 0x */ 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 12, 11, 12, 12, 12,
344 /* 1x */ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
345 /* 2x */ 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
346 /* 3x */ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12,
347 /* 4x */ 12, 1, 2, 3, 4, 1, 2, 3, 5, 1, 3, 3, 6, 7, 7, 1,
348 /* 5x */ 2, 3, 6, 3, 4, 1, 2, 8, 3, 9, 3, 12, 12, 12, 12, 12,
349 /* 6x */ 12, 1, 2, 3, 4, 1, 2, 3, 5, 1, 3, 3, 6, 7, 7, 1,
350 /* 7x */ 2, 3, 6, 3, 4, 1, 2, 8, 3, 9, 3, 12, 12, 12, 12, 12,
354 ** Mapping from the character class number (0-12) to a symbol for each
355 ** character class. Note that initClass[] can be used to map the class
356 ** symbol back into the class number.
358 static const unsigned char className[] = ".ABCDHLMWY9 ?";
361 ** Generate a string of character classes corresponding to the
362 ** ASCII characters in the input string zIn. If the input is not
363 ** ASCII then the behavior is undefined.
365 ** Space to hold the result is obtained from sqlite3_malloc()
367 ** Return NULL if memory allocation fails.
369 static unsigned char *characterClassString(const unsigned char *zIn, int nIn){
370 unsigned char *zOut = sqlite3_malloc( nIn + 1 );
371 int i;
372 int nOut = 0;
373 char cPrev = 0x77;
374 const unsigned char *aClass = initClass;
376 if( zOut==0 ) return 0;
377 for(i=0; i<nIn; i++){
378 unsigned char c = zIn[i];
379 c = aClass[c&0x7f];
380 if( c==CCLASS_OTHER && cPrev!=CCLASS_DIGIT ) continue;
381 cPrev = c;
382 if( c==CCLASS_SILENT ) continue;
383 if( c==CCLASS_SPACE ) continue;
384 aClass = midClass;
385 c = className[c];
386 if( c!=zOut[nOut-1] ) zOut[nOut++] = c;
388 zOut[nOut] = 0;
389 return zOut;
393 ** This is an SQL function wrapper around characterClassString(). See
394 ** the description of characterClassString() for additional information.
396 static void characterClassSqlFunc(
397 sqlite3_context *context,
398 int argc,
399 sqlite3_value **argv
401 const unsigned char *zIn;
402 unsigned char *zOut;
404 zIn = sqlite3_value_text(argv[0]);
405 if( zIn==0 ) return;
406 zOut = characterClassString(zIn, sqlite3_value_bytes(argv[0]));
407 if( zOut==0 ){
408 sqlite3_result_error_nomem(context);
409 }else{
410 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
415 ** Return the character class number for a character given its
416 ** context.
418 static char characterClass(char cPrev, char c){
419 return cPrev==0 ? initClass[c&0x7f] : midClass[c&0x7f];
423 ** Return the cost of inserting or deleting character c immediately
424 ** following character cPrev. If cPrev==0, that means c is the first
425 ** character of the word.
427 static int insertOrDeleteCost(char cPrev, char c){
428 char classC = characterClass(cPrev, c);
429 char classCprev;
431 if( classC==CCLASS_SILENT ){
432 /* Insert or delete "silent" characters such as H or W */
433 return 1;
435 if( cPrev==c ){
436 /* Repeated characters, or miss a repeat */
437 return 10;
439 classCprev = characterClass(cPrev, cPrev);
440 if( classC==classCprev ){
441 if( classC==CCLASS_VOWEL ){
442 /* Remove or add a new vowel to a vowel cluster */
443 return 15;
444 }else{
445 /* Remove or add a consonant not in the same class */
446 return 50;
450 /* any other character insertion or deletion */
451 return 100;
455 ** Divide the insertion cost by this factor when appending to the
456 ** end of the word.
458 #define FINAL_INS_COST_DIV 4
461 ** Return the cost of substituting cTo in place of cFrom assuming
462 ** the previous character is cPrev. If cPrev==0 then cTo is the first
463 ** character of the word.
465 static int substituteCost(char cPrev, char cFrom, char cTo){
466 char classFrom, classTo;
467 if( cFrom==cTo ){
468 /* Exact match */
469 return 0;
471 if( cFrom==(cTo^0x20) && ((cTo>='A' && cTo<='Z') || (cTo>='a' && cTo<='z')) ){
472 /* differ only in case */
473 return 0;
475 classFrom = characterClass(cPrev, cFrom);
476 classTo = characterClass(cPrev, cTo);
477 if( classFrom==classTo ){
478 /* Same character class */
479 return classFrom=='A' ? 25 : 40;
481 if( classFrom>=CCLASS_B && classFrom<=CCLASS_Y
482 && classTo>=CCLASS_B && classTo<=CCLASS_Y ){
483 /* Convert from one consonant to another, but in a different class */
484 return 75;
486 /* Any other subsitution */
487 return 100;
491 ** Given two strings zA and zB which are pure ASCII, return the cost
492 ** of transforming zA into zB. If zA ends with '*' assume that it is
493 ** a prefix of zB and give only minimal penalty for extra characters
494 ** on the end of zB.
496 ** Smaller numbers mean a closer match.
498 ** Negative values indicate an error:
499 ** -1 One of the inputs is NULL
500 ** -2 Non-ASCII characters on input
501 ** -3 Unable to allocate memory
503 static int editdist(const char *zA, const char *zB){
504 int nA, nB; /* Number of characters in zA[] and zB[] */
505 int xA, xB; /* Loop counters for zA[] and zB[] */
506 char cA, cB; /* Current character of zA and zB */
507 char cAprev, cBprev; /* Previous character of zA and zB */
508 int d; /* North-west cost value */
509 int dc = 0; /* North-west character value */
510 int res; /* Final result */
511 int *m; /* The cost matrix */
512 char *cx; /* Corresponding character values */
513 int *toFree = 0; /* Malloced space */
514 int mStack[60+15]; /* Stack space to use if not too much is needed */
516 /* Early out if either input is NULL */
517 if( zA==0 || zB==0 ) return -1;
519 /* Skip any common prefix */
520 while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; }
521 if( zA[0]==0 && zB[0]==0 ) return 0;
523 #if 0
524 printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
525 #endif
527 /* Verify input strings and measure their lengths */
528 for(nA=0; zA[nA]; nA++){
529 if( zA[nA]>127 ) return -2;
531 for(nB=0; zB[nB]; nB++){
532 if( zB[nB]>127 ) return -2;
535 /* Special processing if either string is empty */
536 if( nA==0 ){
537 cBprev = dc;
538 for(xB=res=0; (cB = zB[xB])!=0; xB++){
539 res += insertOrDeleteCost(cBprev, cB)/FINAL_INS_COST_DIV;
540 cBprev = cB;
542 return res;
544 if( nB==0 ){
545 cAprev = dc;
546 for(xA=res=0; (cA = zA[xA])!=0; xA++){
547 res += insertOrDeleteCost(cAprev, cA);
548 cAprev = cA;
550 return res;
553 /* A is a prefix of B */
554 if( zA[0]=='*' && zA[1]==0 ) return 0;
556 /* Allocate and initialize the Wagner matrix */
557 if( nB<(sizeof(mStack)*4)/(sizeof(mStack[0])*5) ){
558 m = mStack;
559 }else{
560 m = toFree = sqlite3_malloc( (nB+1)*5*sizeof(m[0])/4 );
561 if( m==0 ) return -3;
563 cx = (char*)&m[nB+1];
565 /* Compute the Wagner edit distance */
566 m[0] = 0;
567 cx[0] = dc;
568 cBprev = dc;
569 for(xB=1; xB<=nB; xB++){
570 cB = zB[xB-1];
571 cx[xB] = cB;
572 m[xB] = m[xB-1] + insertOrDeleteCost(cBprev, cB);
573 cBprev = cB;
575 cAprev = dc;
576 for(xA=1; xA<=nA; xA++){
577 int lastA = (xA==nA);
578 cA = zA[xA-1];
579 if( cA=='*' && lastA ) break;
580 d = m[0];
581 dc = cx[0];
582 m[0] = d + insertOrDeleteCost(cAprev, cA);
583 cBprev = 0;
584 for(xB=1; xB<=nB; xB++){
585 int totalCost, insCost, delCost, subCost, ncx;
586 cB = zB[xB-1];
588 /* Cost to insert cB */
589 insCost = insertOrDeleteCost(cx[xB-1], cB);
590 if( lastA ) insCost /= FINAL_INS_COST_DIV;
592 /* Cost to delete cA */
593 delCost = insertOrDeleteCost(cx[xB], cA);
595 /* Cost to substitute cA->cB */
596 subCost = substituteCost(cx[xB-1], cA, cB);
598 /* Best cost */
599 totalCost = insCost + m[xB-1];
600 ncx = cB;
601 if( (delCost + m[xB])<totalCost ){
602 totalCost = delCost + m[xB];
603 ncx = cA;
605 if( (subCost + d)<totalCost ){
606 totalCost = subCost + d;
609 #if 0
610 printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c"
611 " ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n",
612 xA, xB, d, m[xB], m[xB-1], dc?dc:' ', cA, cB,
613 insCost, delCost, subCost, totalCost, ncx?ncx:' ');
614 #endif
616 /* Update the matrix */
617 d = m[xB];
618 dc = cx[xB];
619 m[xB] = totalCost;
620 cx[xB] = ncx;
621 cBprev = cB;
623 cAprev = cA;
626 /* Free the wagner matrix and return the result */
627 if( cA=='*' && nB>nA ){
628 res = m[nA];
629 for(xB=nA+1; xB<=nB; xB++){
630 if( m[xB]<res ) res = m[xB];
632 }else{
633 res = m[nB];
635 sqlite3_free(toFree);
636 return res;
640 ** Function: editdist(A,B)
642 ** Return the cost of transforming string A into string B. Both strings
643 ** must be pure ASCII text. If A ends with '*' then it is assumed to be
644 ** a prefix of B and extra characters on the end of B have minimal additional
645 ** cost.
647 static void editdistSqlFunc(
648 sqlite3_context *context,
649 int argc,
650 sqlite3_value **argv
652 int res = editdist((const char*)sqlite3_value_text(argv[0]),
653 (const char*)sqlite3_value_text(argv[1]));
654 if( res<0 ){
655 if( res==(-3) ){
656 sqlite3_result_error_nomem(context);
657 }else if( res==(-2) ){
658 sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
659 }else{
660 sqlite3_result_error(context, "NULL input to editdist()", -1);
662 }else{
663 sqlite3_result_int(context, res);
667 #if !SQLITE_CORE
669 ** This lookup table is used to help decode the first byte of
670 ** a multi-byte UTF8 character.
672 static const unsigned char sqlite3Utf8Trans1[] = {
673 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
674 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
675 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
676 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
677 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
678 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
679 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
680 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
682 #endif
685 ** Return the value of the first UTF-8 character in the string.
687 static int utf8Read(const unsigned char *z, int n, int *pSize){
688 int c, i;
690 if( n==0 ){
691 c = i = 0;
692 }else{
693 c = z[0];
694 i = 1;
695 if( c>=0xc0 ){
696 c = sqlite3Utf8Trans1[c-0xc0];
697 while( i<n && (z[i] & 0xc0)==0x80 ){
698 c = (c<<6) + (0x3f & z[i++]);
702 *pSize = i;
703 return c;
707 ** Table of translations from unicode characters into ASCII.
709 static const struct {
710 unsigned short int cFrom;
711 unsigned char cTo0, cTo1;
712 } translit[] = {
713 { 0x00A0, 0x20, 0x00 }, /*   to */
714 { 0x00B5, 0x75, 0x00 }, /* µ to u */
715 { 0x00C0, 0x41, 0x00 }, /* À to A */
716 { 0x00C1, 0x41, 0x00 }, /* Á to A */
717 { 0x00C2, 0x41, 0x00 }, /* Â to A */
718 { 0x00C3, 0x41, 0x00 }, /* Ã to A */
719 { 0x00C4, 0x41, 0x65 }, /* Ä to Ae */
720 { 0x00C5, 0x41, 0x61 }, /* Å to Aa */
721 { 0x00C6, 0x41, 0x45 }, /* Æ to AE */
722 { 0x00C7, 0x43, 0x00 }, /* Ç to C */
723 { 0x00C8, 0x45, 0x00 }, /* È to E */
724 { 0x00C9, 0x45, 0x00 }, /* É to E */
725 { 0x00CA, 0x45, 0x00 }, /* Ê to E */
726 { 0x00CB, 0x45, 0x00 }, /* Ë to E */
727 { 0x00CC, 0x49, 0x00 }, /* Ì to I */
728 { 0x00CD, 0x49, 0x00 }, /* Í to I */
729 { 0x00CE, 0x49, 0x00 }, /* Î to I */
730 { 0x00CF, 0x49, 0x00 }, /* Ï to I */
731 { 0x00D0, 0x44, 0x00 }, /* Ð to D */
732 { 0x00D1, 0x4E, 0x00 }, /* Ñ to N */
733 { 0x00D2, 0x4F, 0x00 }, /* Ò to O */
734 { 0x00D3, 0x4F, 0x00 }, /* Ó to O */
735 { 0x00D4, 0x4F, 0x00 }, /* Ô to O */
736 { 0x00D5, 0x4F, 0x00 }, /* Õ to O */
737 { 0x00D6, 0x4F, 0x65 }, /* Ö to Oe */
738 { 0x00D7, 0x78, 0x00 }, /* × to x */
739 { 0x00D8, 0x4F, 0x00 }, /* Ø to O */
740 { 0x00D9, 0x55, 0x00 }, /* Ù to U */
741 { 0x00DA, 0x55, 0x00 }, /* Ú to U */
742 { 0x00DB, 0x55, 0x00 }, /* Û to U */
743 { 0x00DC, 0x55, 0x65 }, /* Ü to Ue */
744 { 0x00DD, 0x59, 0x00 }, /* Ý to Y */
745 { 0x00DE, 0x54, 0x68 }, /* Þ to Th */
746 { 0x00DF, 0x73, 0x73 }, /* ß to ss */
747 { 0x00E0, 0x61, 0x00 }, /* à to a */
748 { 0x00E1, 0x61, 0x00 }, /* á to a */
749 { 0x00E2, 0x61, 0x00 }, /* â to a */
750 { 0x00E3, 0x61, 0x00 }, /* ã to a */
751 { 0x00E4, 0x61, 0x65 }, /* ä to ae */
752 { 0x00E5, 0x61, 0x61 }, /* å to aa */
753 { 0x00E6, 0x61, 0x65 }, /* æ to ae */
754 { 0x00E7, 0x63, 0x00 }, /* ç to c */
755 { 0x00E8, 0x65, 0x00 }, /* è to e */
756 { 0x00E9, 0x65, 0x00 }, /* é to e */
757 { 0x00EA, 0x65, 0x00 }, /* ê to e */
758 { 0x00EB, 0x65, 0x00 }, /* ë to e */
759 { 0x00EC, 0x69, 0x00 }, /* ì to i */
760 { 0x00ED, 0x69, 0x00 }, /* í to i */
761 { 0x00EE, 0x69, 0x00 }, /* î to i */
762 { 0x00EF, 0x69, 0x00 }, /* ï to i */
763 { 0x00F0, 0x64, 0x00 }, /* ð to d */
764 { 0x00F1, 0x6E, 0x00 }, /* ñ to n */
765 { 0x00F2, 0x6F, 0x00 }, /* ò to o */
766 { 0x00F3, 0x6F, 0x00 }, /* ó to o */
767 { 0x00F4, 0x6F, 0x00 }, /* ô to o */
768 { 0x00F5, 0x6F, 0x00 }, /* õ to o */
769 { 0x00F6, 0x6F, 0x65 }, /* ö to oe */
770 { 0x00F7, 0x3A, 0x00 }, /* ÷ to : */
771 { 0x00F8, 0x6F, 0x00 }, /* ø to o */
772 { 0x00F9, 0x75, 0x00 }, /* ù to u */
773 { 0x00FA, 0x75, 0x00 }, /* ú to u */
774 { 0x00FB, 0x75, 0x00 }, /* û to u */
775 { 0x00FC, 0x75, 0x65 }, /* ü to ue */
776 { 0x00FD, 0x79, 0x00 }, /* ý to y */
777 { 0x00FE, 0x74, 0x68 }, /* þ to th */
778 { 0x00FF, 0x79, 0x00 }, /* ÿ to y */
779 { 0x0100, 0x41, 0x00 }, /* Ā to A */
780 { 0x0101, 0x61, 0x00 }, /* ā to a */
781 { 0x0102, 0x41, 0x00 }, /* Ă to A */
782 { 0x0103, 0x61, 0x00 }, /* ă to a */
783 { 0x0104, 0x41, 0x00 }, /* Ą to A */
784 { 0x0105, 0x61, 0x00 }, /* ą to a */
785 { 0x0106, 0x43, 0x00 }, /* Ć to C */
786 { 0x0107, 0x63, 0x00 }, /* ć to c */
787 { 0x0108, 0x43, 0x68 }, /* Ĉ to Ch */
788 { 0x0109, 0x63, 0x68 }, /* ĉ to ch */
789 { 0x010A, 0x43, 0x00 }, /* Ċ to C */
790 { 0x010B, 0x63, 0x00 }, /* ċ to c */
791 { 0x010C, 0x43, 0x00 }, /* Č to C */
792 { 0x010D, 0x63, 0x00 }, /* č to c */
793 { 0x010E, 0x44, 0x00 }, /* Ď to D */
794 { 0x010F, 0x64, 0x00 }, /* ď to d */
795 { 0x0110, 0x44, 0x00 }, /* Đ to D */
796 { 0x0111, 0x64, 0x00 }, /* đ to d */
797 { 0x0112, 0x45, 0x00 }, /* Ē to E */
798 { 0x0113, 0x65, 0x00 }, /* ē to e */
799 { 0x0114, 0x45, 0x00 }, /* Ĕ to E */
800 { 0x0115, 0x65, 0x00 }, /* ĕ to e */
801 { 0x0116, 0x45, 0x00 }, /* Ė to E */
802 { 0x0117, 0x65, 0x00 }, /* ė to e */
803 { 0x0118, 0x45, 0x00 }, /* Ę to E */
804 { 0x0119, 0x65, 0x00 }, /* ę to e */
805 { 0x011A, 0x45, 0x00 }, /* Ě to E */
806 { 0x011B, 0x65, 0x00 }, /* ě to e */
807 { 0x011C, 0x47, 0x68 }, /* Ĝ to Gh */
808 { 0x011D, 0x67, 0x68 }, /* ĝ to gh */
809 { 0x011E, 0x47, 0x00 }, /* Ğ to G */
810 { 0x011F, 0x67, 0x00 }, /* ğ to g */
811 { 0x0120, 0x47, 0x00 }, /* Ġ to G */
812 { 0x0121, 0x67, 0x00 }, /* ġ to g */
813 { 0x0122, 0x47, 0x00 }, /* Ģ to G */
814 { 0x0123, 0x67, 0x00 }, /* ģ to g */
815 { 0x0124, 0x48, 0x68 }, /* Ĥ to Hh */
816 { 0x0125, 0x68, 0x68 }, /* ĥ to hh */
817 { 0x0126, 0x48, 0x00 }, /* Ħ to H */
818 { 0x0127, 0x68, 0x00 }, /* ħ to h */
819 { 0x0128, 0x49, 0x00 }, /* Ĩ to I */
820 { 0x0129, 0x69, 0x00 }, /* ĩ to i */
821 { 0x012A, 0x49, 0x00 }, /* Ī to I */
822 { 0x012B, 0x69, 0x00 }, /* ī to i */
823 { 0x012C, 0x49, 0x00 }, /* Ĭ to I */
824 { 0x012D, 0x69, 0x00 }, /* ĭ to i */
825 { 0x012E, 0x49, 0x00 }, /* Į to I */
826 { 0x012F, 0x69, 0x00 }, /* į to i */
827 { 0x0130, 0x49, 0x00 }, /* İ to I */
828 { 0x0131, 0x69, 0x00 }, /* ı to i */
829 { 0x0132, 0x49, 0x4A }, /* IJ to IJ */
830 { 0x0133, 0x69, 0x6A }, /* ij to ij */
831 { 0x0134, 0x4A, 0x68 }, /* Ĵ to Jh */
832 { 0x0135, 0x6A, 0x68 }, /* ĵ to jh */
833 { 0x0136, 0x4B, 0x00 }, /* Ķ to K */
834 { 0x0137, 0x6B, 0x00 }, /* ķ to k */
835 { 0x0138, 0x6B, 0x00 }, /* ĸ to k */
836 { 0x0139, 0x4C, 0x00 }, /* Ĺ to L */
837 { 0x013A, 0x6C, 0x00 }, /* ĺ to l */
838 { 0x013B, 0x4C, 0x00 }, /* Ļ to L */
839 { 0x013C, 0x6C, 0x00 }, /* ļ to l */
840 { 0x013D, 0x4C, 0x00 }, /* Ľ to L */
841 { 0x013E, 0x6C, 0x00 }, /* ľ to l */
842 { 0x013F, 0x4C, 0x2E }, /* Ŀ to L. */
843 { 0x0140, 0x6C, 0x2E }, /* ŀ to l. */
844 { 0x0141, 0x4C, 0x00 }, /* Ł to L */
845 { 0x0142, 0x6C, 0x00 }, /* ł to l */
846 { 0x0143, 0x4E, 0x00 }, /* Ń to N */
847 { 0x0144, 0x6E, 0x00 }, /* ń to n */
848 { 0x0145, 0x4E, 0x00 }, /* Ņ to N */
849 { 0x0146, 0x6E, 0x00 }, /* ņ to n */
850 { 0x0147, 0x4E, 0x00 }, /* Ň to N */
851 { 0x0148, 0x6E, 0x00 }, /* ň to n */
852 { 0x0149, 0x27, 0x6E }, /* ʼn to 'n */
853 { 0x014A, 0x4E, 0x47 }, /* Ŋ to NG */
854 { 0x014B, 0x6E, 0x67 }, /* ŋ to ng */
855 { 0x014C, 0x4F, 0x00 }, /* Ō to O */
856 { 0x014D, 0x6F, 0x00 }, /* ō to o */
857 { 0x014E, 0x4F, 0x00 }, /* Ŏ to O */
858 { 0x014F, 0x6F, 0x00 }, /* ŏ to o */
859 { 0x0150, 0x4F, 0x00 }, /* Ő to O */
860 { 0x0151, 0x6F, 0x00 }, /* ő to o */
861 { 0x0152, 0x4F, 0x45 }, /* Œ to OE */
862 { 0x0153, 0x6F, 0x65 }, /* œ to oe */
863 { 0x0154, 0x52, 0x00 }, /* Ŕ to R */
864 { 0x0155, 0x72, 0x00 }, /* ŕ to r */
865 { 0x0156, 0x52, 0x00 }, /* Ŗ to R */
866 { 0x0157, 0x72, 0x00 }, /* ŗ to r */
867 { 0x0158, 0x52, 0x00 }, /* Ř to R */
868 { 0x0159, 0x72, 0x00 }, /* ř to r */
869 { 0x015A, 0x53, 0x00 }, /* Ś to S */
870 { 0x015B, 0x73, 0x00 }, /* ś to s */
871 { 0x015C, 0x53, 0x68 }, /* Ŝ to Sh */
872 { 0x015D, 0x73, 0x68 }, /* ŝ to sh */
873 { 0x015E, 0x53, 0x00 }, /* Ş to S */
874 { 0x015F, 0x73, 0x00 }, /* ş to s */
875 { 0x0160, 0x53, 0x00 }, /* Š to S */
876 { 0x0161, 0x73, 0x00 }, /* š to s */
877 { 0x0162, 0x54, 0x00 }, /* Ţ to T */
878 { 0x0163, 0x74, 0x00 }, /* ţ to t */
879 { 0x0164, 0x54, 0x00 }, /* Ť to T */
880 { 0x0165, 0x74, 0x00 }, /* ť to t */
881 { 0x0166, 0x54, 0x00 }, /* Ŧ to T */
882 { 0x0167, 0x74, 0x00 }, /* ŧ to t */
883 { 0x0168, 0x55, 0x00 }, /* Ũ to U */
884 { 0x0169, 0x75, 0x00 }, /* ũ to u */
885 { 0x016A, 0x55, 0x00 }, /* Ū to U */
886 { 0x016B, 0x75, 0x00 }, /* ū to u */
887 { 0x016C, 0x55, 0x00 }, /* Ŭ to U */
888 { 0x016D, 0x75, 0x00 }, /* ŭ to u */
889 { 0x016E, 0x55, 0x00 }, /* Ů to U */
890 { 0x016F, 0x75, 0x00 }, /* ů to u */
891 { 0x0170, 0x55, 0x00 }, /* Ű to U */
892 { 0x0171, 0x75, 0x00 }, /* ű to u */
893 { 0x0172, 0x55, 0x00 }, /* Ų to U */
894 { 0x0173, 0x75, 0x00 }, /* ų to u */
895 { 0x0174, 0x57, 0x00 }, /* Ŵ to W */
896 { 0x0175, 0x77, 0x00 }, /* ŵ to w */
897 { 0x0176, 0x59, 0x00 }, /* Ŷ to Y */
898 { 0x0177, 0x79, 0x00 }, /* ŷ to y */
899 { 0x0178, 0x59, 0x00 }, /* Ÿ to Y */
900 { 0x0179, 0x5A, 0x00 }, /* Ź to Z */
901 { 0x017A, 0x7A, 0x00 }, /* ź to z */
902 { 0x017B, 0x5A, 0x00 }, /* Ż to Z */
903 { 0x017C, 0x7A, 0x00 }, /* ż to z */
904 { 0x017D, 0x5A, 0x00 }, /* Ž to Z */
905 { 0x017E, 0x7A, 0x00 }, /* ž to z */
906 { 0x017F, 0x73, 0x00 }, /* ſ to s */
907 { 0x0192, 0x66, 0x00 }, /* ƒ to f */
908 { 0x0218, 0x53, 0x00 }, /* Ș to S */
909 { 0x0219, 0x73, 0x00 }, /* ș to s */
910 { 0x021A, 0x54, 0x00 }, /* Ț to T */
911 { 0x021B, 0x74, 0x00 }, /* ț to t */
912 { 0x0386, 0x41, 0x00 }, /* Ά to A */
913 { 0x0388, 0x45, 0x00 }, /* Έ to E */
914 { 0x0389, 0x49, 0x00 }, /* Ή to I */
915 { 0x038A, 0x49, 0x00 }, /* Ί to I */
916 { 0x038C, 0x4f, 0x00 }, /* Ό to O */
917 { 0x038E, 0x59, 0x00 }, /* Ύ to Y */
918 { 0x038F, 0x4f, 0x00 }, /* Ώ to O */
919 { 0x0390, 0x69, 0x00 }, /* ΐ to i */
920 { 0x0391, 0x41, 0x00 }, /* Α to A */
921 { 0x0392, 0x42, 0x00 }, /* Β to B */
922 { 0x0393, 0x47, 0x00 }, /* Γ to G */
923 { 0x0394, 0x44, 0x00 }, /* Δ to D */
924 { 0x0395, 0x45, 0x00 }, /* Ε to E */
925 { 0x0396, 0x5a, 0x00 }, /* Ζ to Z */
926 { 0x0397, 0x49, 0x00 }, /* Η to I */
927 { 0x0398, 0x54, 0x68 }, /* Θ to Th */
928 { 0x0399, 0x49, 0x00 }, /* Ι to I */
929 { 0x039A, 0x4b, 0x00 }, /* Κ to K */
930 { 0x039B, 0x4c, 0x00 }, /* Λ to L */
931 { 0x039C, 0x4d, 0x00 }, /* Μ to M */
932 { 0x039D, 0x4e, 0x00 }, /* Ν to N */
933 { 0x039E, 0x58, 0x00 }, /* Ξ to X */
934 { 0x039F, 0x4f, 0x00 }, /* Ο to O */
935 { 0x03A0, 0x50, 0x00 }, /* Π to P */
936 { 0x03A1, 0x52, 0x00 }, /* Ρ to R */
937 { 0x03A3, 0x53, 0x00 }, /* Σ to S */
938 { 0x03A4, 0x54, 0x00 }, /* Τ to T */
939 { 0x03A5, 0x59, 0x00 }, /* Υ to Y */
940 { 0x03A6, 0x46, 0x00 }, /* Φ to F */
941 { 0x03A7, 0x43, 0x68 }, /* Χ to Ch */
942 { 0x03A8, 0x50, 0x73 }, /* Ψ to Ps */
943 { 0x03A9, 0x4f, 0x00 }, /* Ω to O */
944 { 0x03AA, 0x49, 0x00 }, /* Ϊ to I */
945 { 0x03AB, 0x59, 0x00 }, /* Ϋ to Y */
946 { 0x03AC, 0x61, 0x00 }, /* ά to a */
947 { 0x03AD, 0x65, 0x00 }, /* έ to e */
948 { 0x03AE, 0x69, 0x00 }, /* ή to i */
949 { 0x03AF, 0x69, 0x00 }, /* ί to i */
950 { 0x03B1, 0x61, 0x00 }, /* α to a */
951 { 0x03B2, 0x62, 0x00 }, /* β to b */
952 { 0x03B3, 0x67, 0x00 }, /* γ to g */
953 { 0x03B4, 0x64, 0x00 }, /* δ to d */
954 { 0x03B5, 0x65, 0x00 }, /* ε to e */
955 { 0x03B6, 0x7a, 0x00 }, /* ζ to z */
956 { 0x03B7, 0x69, 0x00 }, /* η to i */
957 { 0x03B8, 0x74, 0x68 }, /* θ to th */
958 { 0x03B9, 0x69, 0x00 }, /* ι to i */
959 { 0x03BA, 0x6b, 0x00 }, /* κ to k */
960 { 0x03BB, 0x6c, 0x00 }, /* λ to l */
961 { 0x03BC, 0x6d, 0x00 }, /* μ to m */
962 { 0x03BD, 0x6e, 0x00 }, /* ν to n */
963 { 0x03BE, 0x78, 0x00 }, /* ξ to x */
964 { 0x03BF, 0x6f, 0x00 }, /* ο to o */
965 { 0x03C0, 0x70, 0x00 }, /* π to p */
966 { 0x03C1, 0x72, 0x00 }, /* ρ to r */
967 { 0x03C3, 0x73, 0x00 }, /* σ to s */
968 { 0x03C4, 0x74, 0x00 }, /* τ to t */
969 { 0x03C5, 0x79, 0x00 }, /* υ to y */
970 { 0x03C6, 0x66, 0x00 }, /* φ to f */
971 { 0x03C7, 0x63, 0x68 }, /* χ to ch */
972 { 0x03C8, 0x70, 0x73 }, /* ψ to ps */
973 { 0x03C9, 0x6f, 0x00 }, /* ω to o */
974 { 0x03CA, 0x69, 0x00 }, /* ϊ to i */
975 { 0x03CB, 0x79, 0x00 }, /* ϋ to y */
976 { 0x03CC, 0x6f, 0x00 }, /* ό to o */
977 { 0x03CD, 0x79, 0x00 }, /* ύ to y */
978 { 0x03CE, 0x69, 0x00 }, /* ώ to i */
979 { 0x0400, 0x45, 0x00 }, /* Ѐ to E */
980 { 0x0401, 0x45, 0x00 }, /* Ё to E */
981 { 0x0402, 0x44, 0x00 }, /* Ђ to D */
982 { 0x0403, 0x47, 0x00 }, /* Ѓ to G */
983 { 0x0404, 0x45, 0x00 }, /* Є to E */
984 { 0x0405, 0x5a, 0x00 }, /* Ѕ to Z */
985 { 0x0406, 0x49, 0x00 }, /* І to I */
986 { 0x0407, 0x49, 0x00 }, /* Ї to I */
987 { 0x0408, 0x4a, 0x00 }, /* Ј to J */
988 { 0x0409, 0x49, 0x00 }, /* Љ to I */
989 { 0x040A, 0x4e, 0x00 }, /* Њ to N */
990 { 0x040B, 0x44, 0x00 }, /* Ћ to D */
991 { 0x040C, 0x4b, 0x00 }, /* Ќ to K */
992 { 0x040D, 0x49, 0x00 }, /* Ѝ to I */
993 { 0x040E, 0x55, 0x00 }, /* Ў to U */
994 { 0x040F, 0x44, 0x00 }, /* Џ to D */
995 { 0x0410, 0x41, 0x00 }, /* А to A */
996 { 0x0411, 0x42, 0x00 }, /* Б to B */
997 { 0x0412, 0x56, 0x00 }, /* В to V */
998 { 0x0413, 0x47, 0x00 }, /* Г to G */
999 { 0x0414, 0x44, 0x00 }, /* Д to D */
1000 { 0x0415, 0x45, 0x00 }, /* Е to E */
1001 { 0x0416, 0x5a, 0x68 }, /* Ж to Zh */
1002 { 0x0417, 0x5a, 0x00 }, /* З to Z */
1003 { 0x0418, 0x49, 0x00 }, /* И to I */
1004 { 0x0419, 0x49, 0x00 }, /* Й to I */
1005 { 0x041A, 0x4b, 0x00 }, /* К to K */
1006 { 0x041B, 0x4c, 0x00 }, /* Л to L */
1007 { 0x041C, 0x4d, 0x00 }, /* М to M */
1008 { 0x041D, 0x4e, 0x00 }, /* Н to N */
1009 { 0x041E, 0x4f, 0x00 }, /* О to O */
1010 { 0x041F, 0x50, 0x00 }, /* П to P */
1011 { 0x0420, 0x52, 0x00 }, /* Р to R */
1012 { 0x0421, 0x53, 0x00 }, /* С to S */
1013 { 0x0422, 0x54, 0x00 }, /* Т to T */
1014 { 0x0423, 0x55, 0x00 }, /* У to U */
1015 { 0x0424, 0x46, 0x00 }, /* Ф to F */
1016 { 0x0425, 0x4b, 0x68 }, /* Х to Kh */
1017 { 0x0426, 0x54, 0x63 }, /* Ц to Tc */
1018 { 0x0427, 0x43, 0x68 }, /* Ч to Ch */
1019 { 0x0428, 0x53, 0x68 }, /* Ш to Sh */
1020 { 0x0429, 0x53, 0x68 }, /* Щ to Shch */
1021 { 0x042B, 0x59, 0x00 }, /* Ы to Y */
1022 { 0x042D, 0x45, 0x00 }, /* Э to E */
1023 { 0x042E, 0x49, 0x75 }, /* Ю to Iu */
1024 { 0x042F, 0x49, 0x61 }, /* Я to Ia */
1025 { 0x0430, 0x61, 0x00 }, /* а to a */
1026 { 0x0431, 0x62, 0x00 }, /* б to b */
1027 { 0x0432, 0x76, 0x00 }, /* в to v */
1028 { 0x0433, 0x67, 0x00 }, /* г to g */
1029 { 0x0434, 0x64, 0x00 }, /* д to d */
1030 { 0x0435, 0x65, 0x00 }, /* е to e */
1031 { 0x0436, 0x7a, 0x68 }, /* ж to zh */
1032 { 0x0437, 0x7a, 0x00 }, /* з to z */
1033 { 0x0438, 0x69, 0x00 }, /* и to i */
1034 { 0x0439, 0x69, 0x00 }, /* й to i */
1035 { 0x043A, 0x6b, 0x00 }, /* к to k */
1036 { 0x043B, 0x6c, 0x00 }, /* л to l */
1037 { 0x043C, 0x6d, 0x00 }, /* м to m */
1038 { 0x043D, 0x6e, 0x00 }, /* н to n */
1039 { 0x043E, 0x6f, 0x00 }, /* о to o */
1040 { 0x043F, 0x70, 0x00 }, /* п to p */
1041 { 0x0440, 0x72, 0x00 }, /* р to r */
1042 { 0x0441, 0x73, 0x00 }, /* с to s */
1043 { 0x0442, 0x74, 0x00 }, /* т to t */
1044 { 0x0443, 0x75, 0x00 }, /* у to u */
1045 { 0x0444, 0x66, 0x00 }, /* ф to f */
1046 { 0x0445, 0x6b, 0x68 }, /* х to kh */
1047 { 0x0446, 0x74, 0x63 }, /* ц to tc */
1048 { 0x0447, 0x63, 0x68 }, /* ч to ch */
1049 { 0x0448, 0x73, 0x68 }, /* ш to sh */
1050 { 0x0449, 0x73, 0x68 }, /* щ to shch */
1051 { 0x044B, 0x79, 0x00 }, /* ы to y */
1052 { 0x044D, 0x65, 0x00 }, /* э to e */
1053 { 0x044E, 0x69, 0x75 }, /* ю to iu */
1054 { 0x044F, 0x69, 0x61 }, /* я to ia */
1055 { 0x0450, 0x65, 0x00 }, /* ѐ to e */
1056 { 0x0451, 0x65, 0x00 }, /* ё to e */
1057 { 0x0452, 0x64, 0x00 }, /* ђ to d */
1058 { 0x0453, 0x67, 0x00 }, /* ѓ to g */
1059 { 0x0454, 0x65, 0x00 }, /* є to e */
1060 { 0x0455, 0x7a, 0x00 }, /* ѕ to z */
1061 { 0x0456, 0x69, 0x00 }, /* і to i */
1062 { 0x0457, 0x69, 0x00 }, /* ї to i */
1063 { 0x0458, 0x6a, 0x00 }, /* ј to j */
1064 { 0x0459, 0x69, 0x00 }, /* љ to i */
1065 { 0x045A, 0x6e, 0x00 }, /* њ to n */
1066 { 0x045B, 0x64, 0x00 }, /* ћ to d */
1067 { 0x045C, 0x6b, 0x00 }, /* ќ to k */
1068 { 0x045D, 0x69, 0x00 }, /* ѝ to i */
1069 { 0x045E, 0x75, 0x00 }, /* ў to u */
1070 { 0x045F, 0x64, 0x00 }, /* џ to d */
1071 { 0x1E02, 0x42, 0x00 }, /* Ḃ to B */
1072 { 0x1E03, 0x62, 0x00 }, /* ḃ to b */
1073 { 0x1E0A, 0x44, 0x00 }, /* Ḋ to D */
1074 { 0x1E0B, 0x64, 0x00 }, /* ḋ to d */
1075 { 0x1E1E, 0x46, 0x00 }, /* Ḟ to F */
1076 { 0x1E1F, 0x66, 0x00 }, /* ḟ to f */
1077 { 0x1E40, 0x4D, 0x00 }, /* Ṁ to M */
1078 { 0x1E41, 0x6D, 0x00 }, /* ṁ to m */
1079 { 0x1E56, 0x50, 0x00 }, /* Ṗ to P */
1080 { 0x1E57, 0x70, 0x00 }, /* ṗ to p */
1081 { 0x1E60, 0x53, 0x00 }, /* Ṡ to S */
1082 { 0x1E61, 0x73, 0x00 }, /* ṡ to s */
1083 { 0x1E6A, 0x54, 0x00 }, /* Ṫ to T */
1084 { 0x1E6B, 0x74, 0x00 }, /* ṫ to t */
1085 { 0x1E80, 0x57, 0x00 }, /* Ẁ to W */
1086 { 0x1E81, 0x77, 0x00 }, /* ẁ to w */
1087 { 0x1E82, 0x57, 0x00 }, /* Ẃ to W */
1088 { 0x1E83, 0x77, 0x00 }, /* ẃ to w */
1089 { 0x1E84, 0x57, 0x00 }, /* Ẅ to W */
1090 { 0x1E85, 0x77, 0x00 }, /* ẅ to w */
1091 { 0x1EF2, 0x59, 0x00 }, /* Ỳ to Y */
1092 { 0x1EF3, 0x79, 0x00 }, /* ỳ to y */
1093 { 0xFB00, 0x66, 0x66 }, /* ff to ff */
1094 { 0xFB01, 0x66, 0x69 }, /* fi to fi */
1095 { 0xFB02, 0x66, 0x6C }, /* fl to fl */
1096 { 0xFB05, 0x73, 0x74 }, /* ſt to st */
1097 { 0xFB06, 0x73, 0x74 }, /* st to st */
1101 ** Convert the input string from UTF-8 into pure ASCII by converting
1102 ** all non-ASCII characters to some combination of characters in the
1103 ** ASCII subset.
1105 ** The returned string might contain more characters than the input.
1107 ** Space to hold the returned string comes from sqlite3_malloc() and
1108 ** should be freed by the caller.
1110 static unsigned char *transliterate(const unsigned char *zIn, int nIn){
1111 unsigned char *zOut = sqlite3_malloc( nIn*4 + 1 );
1112 int i, c, sz, nOut;
1113 if( zOut==0 ) return 0;
1114 i = nOut = 0;
1115 while( i<nIn ){
1116 c = utf8Read(zIn, nIn, &sz);
1117 zIn += sz;
1118 nIn -= sz;
1119 if( c<=127 ){
1120 zOut[nOut++] = c;
1121 }else{
1122 int xTop, xBtm, x;
1123 xTop = sizeof(translit)/sizeof(translit[0]) - 1;
1124 xBtm = 0;
1125 while( xTop>=xBtm ){
1126 x = (xTop + xBtm)/2;
1127 if( translit[x].cFrom==c ){
1128 zOut[nOut++] = translit[x].cTo0;
1129 if( translit[x].cTo1 ){
1130 zOut[nOut++] = translit[x].cTo1;
1131 /* Add an extra "ch" after the "sh" for Щ and щ */
1132 if( c==0x0429 || c== 0x0449 ){
1133 zOut[nOut++] = 'c';
1134 zOut[nOut++] = 'h';
1137 c = 0;
1138 break;
1139 }else if( translit[x].cFrom>c ){
1140 xTop = x-1;
1141 }else{
1142 xBtm = x+1;
1145 if( c ) zOut[nOut++] = '?';
1148 zOut[nOut] = 0;
1149 return zOut;
1153 ** spellfix1_translit(X)
1155 ** Convert a string that contains non-ASCII Roman characters into
1156 ** pure ASCII.
1158 static void transliterateSqlFunc(
1159 sqlite3_context *context,
1160 int argc,
1161 sqlite3_value **argv
1163 const unsigned char *zIn = sqlite3_value_text(argv[0]);
1164 int nIn = sqlite3_value_bytes(argv[0]);
1165 unsigned char *zOut = transliterate(zIn, nIn);
1166 if( zOut==0 ){
1167 sqlite3_result_error_nomem(context);
1168 }else{
1169 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
1174 ** spellfix1_scriptcode(X)
1176 ** Try to determine the dominant script used by the word X and return
1177 ** its ISO 15924 numeric code.
1179 ** The current implementation only understands the following scripts:
1181 ** 215 (Latin)
1182 ** 220 (Cyrillic)
1183 ** 200 (Greek)
1185 ** This routine will return 998 if the input X contains characters from
1186 ** two or more of the above scripts or 999 if X contains no characters
1187 ** from any of the above scripts.
1189 static void scriptCodeSqlFunc(
1190 sqlite3_context *context,
1191 int argc,
1192 sqlite3_value **argv
1194 const unsigned char *zIn = sqlite3_value_text(argv[0]);
1195 int nIn = sqlite3_value_bytes(argv[0]);
1196 int c, sz;
1197 int scriptMask = 0;
1198 int res;
1199 # define SCRIPT_LATIN 0x0001
1200 # define SCRIPT_CYRILLIC 0x0002
1201 # define SCRIPT_GREEK 0x0004
1203 while( nIn>0 ){
1204 c = utf8Read(zIn, nIn, &sz);
1205 zIn += sz;
1206 nIn -= sz;
1207 if( c<0x02af ){
1208 scriptMask |= SCRIPT_LATIN;
1209 }else if( c>=0x0400 && c<=0x04ff ){
1210 scriptMask |= SCRIPT_CYRILLIC;
1211 }else if( c>=0x0386 && c<=0x03ce ){
1212 scriptMask |= SCRIPT_GREEK;
1215 switch( scriptMask ){
1216 case 0: res = 999; break;
1217 case SCRIPT_LATIN: res = 215; break;
1218 case SCRIPT_CYRILLIC: res = 220; break;
1219 case SCRIPT_GREEK: res = 200; break;
1220 default: res = 998; break;
1222 sqlite3_result_int(context, res);
1225 /*****************************************************************************
1226 ** Fuzzy-search virtual table
1227 *****************************************************************************/
1229 typedef struct spellfix1_vtab spellfix1_vtab;
1230 typedef struct spellfix1_cursor spellfix1_cursor;
1232 /* Fuzzy-search virtual table object */
1233 struct spellfix1_vtab {
1234 sqlite3_vtab base; /* Base class - must be first */
1235 sqlite3 *db; /* Database connection */
1236 char *zDbName; /* Name of database holding this table */
1237 char *zTableName; /* Name of the virtual table */
1240 /* Fuzzy-search cursor object */
1241 struct spellfix1_cursor {
1242 sqlite3_vtab_cursor base; /* Base class - must be first */
1243 spellfix1_vtab *pVTab; /* The table to which this cursor belongs */
1244 int nRow; /* Number of rows of content */
1245 int nAlloc; /* Number of allocated rows */
1246 int iRow; /* Current row of content */
1247 int iLang; /* Value of the lang= constraint */
1248 int iTop; /* Value of the top= constraint */
1249 int iScope; /* Value of the scope= constraint */
1250 int nSearch; /* Number of vocabulary items checked */
1251 struct spellfix1_row { /* For each row of content */
1252 sqlite3_int64 iRowid; /* Rowid for this row */
1253 char *zWord; /* Text for this row */
1254 int iRank; /* Rank for this row */
1255 int iDistance; /* Distance from pattern for this row */
1256 int iScore; /* Score for sorting */
1257 } *a;
1261 ** Construct one or more SQL statements from the format string given
1262 ** and then evaluate those statements. The success code is written
1263 ** into *pRc.
1265 ** If *pRc is initially non-zero then this routine is a no-op.
1267 static void spellfix1DbExec(
1268 int *pRc, /* Success code */
1269 sqlite3 *db, /* Database in which to run SQL */
1270 const char *zFormat, /* Format string for SQL */
1271 ... /* Arguments to the format string */
1273 va_list ap;
1274 char *zSql;
1275 if( *pRc ) return;
1276 va_start(ap, zFormat);
1277 zSql = sqlite3_vmprintf(zFormat, ap);
1278 va_end(ap);
1279 if( zSql==0 ){
1280 *pRc = SQLITE_NOMEM;
1281 }else{
1282 *pRc = sqlite3_exec(db, zSql, 0, 0, 0);
1283 sqlite3_free(zSql);
1288 ** xDisconnect/xDestroy method for the fuzzy-search module.
1290 static int spellfix1Uninit(int isDestroy, sqlite3_vtab *pVTab){
1291 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
1292 int rc = SQLITE_OK;
1293 if( isDestroy ){
1294 sqlite3 *db = p->db;
1295 spellfix1DbExec(&rc, db, "DROP TABLE IF EXISTS \"%w\".\"%w_vocab\"",
1296 p->zDbName, p->zTableName);
1298 if( rc==SQLITE_OK ){
1299 sqlite3_free(p->zTableName);
1300 sqlite3_free(p);
1302 return rc;
1304 static int spellfix1Disconnect(sqlite3_vtab *pVTab){
1305 return spellfix1Uninit(0, pVTab);
1307 static int spellfix1Destroy(sqlite3_vtab *pVTab){
1308 return spellfix1Uninit(1, pVTab);
1312 ** xConnect/xCreate method for the spellfix1 module. Arguments are:
1314 ** argv[0] -> module name ("spellfix1")
1315 ** argv[1] -> database name
1316 ** argv[2] -> table name
1317 ** argv[3].. -> optional arguments (currently ignored)
1319 static int spellfix1Init(
1320 int isCreate,
1321 sqlite3 *db,
1322 void *pAux,
1323 int argc, const char *const*argv,
1324 sqlite3_vtab **ppVTab,
1325 char **pzErr
1327 spellfix1_vtab *pNew = 0;
1328 const char *zModule = argv[0];
1329 const char *zDbName = argv[1];
1330 const char *zTableName = argv[2];
1331 int nDbName;
1332 int rc = SQLITE_OK;
1334 if( argc<3 ){
1335 *pzErr = sqlite3_mprintf(
1336 "%s: wrong number of CREATE VIRTUAL TABLE arguments", argv[0]
1338 rc = SQLITE_ERROR;
1339 }else{
1340 nDbName = strlen(zDbName);
1341 pNew = sqlite3_malloc( sizeof(*pNew) + nDbName + 1);
1342 if( pNew==0 ){
1343 rc = SQLITE_NOMEM;
1344 }else{
1345 memset(pNew, 0, sizeof(*pNew));
1346 pNew->zDbName = (char*)&pNew[1];
1347 memcpy(pNew->zDbName, zDbName, nDbName+1);
1348 pNew->zTableName = sqlite3_mprintf("%s", zTableName);
1349 pNew->db = db;
1350 if( pNew->zTableName==0 ){
1351 rc = SQLITE_NOMEM;
1352 }else{
1353 rc = sqlite3_declare_vtab(db,
1354 "CREATE TABLE x(word,rank,distance,langid,"
1355 "score,top HIDDEN,scope HIDDEN,srchcnt HIDDEN,"
1356 "soundslike HIDDEN)"
1359 if( rc==SQLITE_OK && isCreate ){
1360 sqlite3_uint64 r;
1361 spellfix1DbExec(&rc, db,
1362 "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
1363 " id INTEGER PRIMARY KEY,\n"
1364 " rank INT,\n"
1365 " langid INT,\n"
1366 " word TEXT,\n"
1367 " k1 TEXT,\n"
1368 " k2 TEXT\n"
1369 ");\n",
1370 zDbName, zTableName
1372 sqlite3_randomness(sizeof(r), &r);
1373 spellfix1DbExec(&rc, db,
1374 "CREATE INDEX IF NOT EXISTS \"%w\".\"%w_index_%llx\" "
1375 "ON \"%w_vocab\"(langid,k2);",
1376 zDbName, zModule, r, zTableName
1382 *ppVTab = (sqlite3_vtab *)pNew;
1383 return rc;
1387 ** The xConnect and xCreate methods
1389 static int spellfix1Connect(
1390 sqlite3 *db,
1391 void *pAux,
1392 int argc, const char *const*argv,
1393 sqlite3_vtab **ppVTab,
1394 char **pzErr
1396 return spellfix1Init(0, db, pAux, argc, argv, ppVTab, pzErr);
1398 static int spellfix1Create(
1399 sqlite3 *db,
1400 void *pAux,
1401 int argc, const char *const*argv,
1402 sqlite3_vtab **ppVTab,
1403 char **pzErr
1405 return spellfix1Init(1, db, pAux, argc, argv, ppVTab, pzErr);
1409 ** Reset a cursor so that it contains zero rows of content but holds
1410 ** space for N rows.
1412 static void spellfix1ResetCursor(spellfix1_cursor *pCur, int N){
1413 int i;
1414 for(i=0; i<pCur->nRow; i++){
1415 sqlite3_free(pCur->a[i].zWord);
1417 pCur->a = sqlite3_realloc(pCur->a, sizeof(pCur->a[0])*N);
1418 pCur->nAlloc = N;
1419 pCur->nRow = 0;
1420 pCur->iRow = 0;
1421 pCur->nSearch = 0;
1425 ** Close a fuzzy-search cursor.
1427 static int spellfix1Close(sqlite3_vtab_cursor *cur){
1428 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
1429 spellfix1ResetCursor(pCur, 0);
1430 sqlite3_free(pCur);
1431 return SQLITE_OK;
1435 ** Search for terms of these forms:
1437 ** (A) word MATCH $str
1438 ** (B) langid == $langid
1439 ** (C) top = $top
1440 ** (D) scope = $scope
1442 ** The plan number is a bit mask formed with these bits:
1444 ** 0x01 (A) is found
1445 ** 0x02 (B) is found
1446 ** 0x04 (C) is found
1447 ** 0x08 (D) is found
1449 ** filter.argv[*] values contains $str, $langid, $top, and $scope,
1450 ** if specified and in that order.
1452 static int spellfix1BestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){
1453 int iPlan = 0;
1454 int iLangTerm = -1;
1455 int iTopTerm = -1;
1456 int iScopeTerm = -1;
1457 int i;
1458 const struct sqlite3_index_constraint *pConstraint;
1459 pConstraint = pIdxInfo->aConstraint;
1460 for(i=0; i<pIdxInfo->nConstraint; i++, pConstraint++){
1461 if( pConstraint->usable==0 ) continue;
1463 /* Terms of the form: word MATCH $str */
1464 if( (iPlan & 1)==0
1465 && pConstraint->iColumn==0
1466 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH
1468 iPlan |= 1;
1469 pIdxInfo->aConstraintUsage[i].argvIndex = 1;
1470 pIdxInfo->aConstraintUsage[i].omit = 1;
1473 /* Terms of the form: langid = $langid */
1474 if( (iPlan & 2)==0
1475 && pConstraint->iColumn==3
1476 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
1478 iPlan |= 2;
1479 iLangTerm = i;
1482 /* Terms of the form: top = $top */
1483 if( (iPlan & 4)==0
1484 && pConstraint->iColumn==5
1485 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
1487 iPlan |= 4;
1488 iTopTerm = i;
1491 /* Terms of the form: scope = $scope */
1492 if( (iPlan & 8)==0
1493 && pConstraint->iColumn==6
1494 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
1496 iPlan |= 8;
1497 iScopeTerm = i;
1500 if( iPlan&1 ){
1501 int idx = 2;
1502 pIdxInfo->idxNum = iPlan;
1503 if( pIdxInfo->nOrderBy==1
1504 && pIdxInfo->aOrderBy[0].iColumn==4
1505 && pIdxInfo->aOrderBy[0].desc==0
1507 pIdxInfo->orderByConsumed = 1; /* Default order by iScore */
1509 if( iPlan&2 ){
1510 pIdxInfo->aConstraintUsage[iLangTerm].argvIndex = idx++;
1511 pIdxInfo->aConstraintUsage[iLangTerm].omit = 1;
1513 if( iPlan&4 ){
1514 pIdxInfo->aConstraintUsage[iTopTerm].argvIndex = idx++;
1515 pIdxInfo->aConstraintUsage[iTopTerm].omit = 1;
1517 if( iPlan&8 ){
1518 pIdxInfo->aConstraintUsage[iScopeTerm].argvIndex = idx++;
1519 pIdxInfo->aConstraintUsage[iScopeTerm].omit = 1;
1521 pIdxInfo->estimatedCost = (double)10000;
1522 }else{
1523 pIdxInfo->idxNum = 0;
1524 pIdxInfo->estimatedCost = (double)10000000;
1526 return SQLITE_OK;
1530 ** Open a new fuzzy-search cursor.
1532 static int spellfix1Open(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
1533 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
1534 spellfix1_cursor *pCur;
1535 pCur = sqlite3_malloc( sizeof(*pCur) );
1536 if( pCur==0 ) return SQLITE_NOMEM;
1537 memset(pCur, 0, sizeof(*pCur));
1538 pCur->pVTab = p;
1539 *ppCursor = &pCur->base;
1540 return SQLITE_OK;
1544 ** Adjust a distance measurement by the words rank in order to show
1545 ** preference to common words.
1547 static int spellfix1Score(int iDistance, int iRank){
1548 int iLog2;
1549 for(iLog2=0; iRank>0; iLog2++, iRank>>=1){}
1550 return iDistance + 32 - iLog2;
1554 ** Compare two spellfix1_row objects for sorting purposes in qsort() such
1555 ** that they sort in order of increasing distance.
1557 static int spellfix1RowCompare(const void *A, const void *B){
1558 const struct spellfix1_row *a = (const struct spellfix1_row*)A;
1559 const struct spellfix1_row *b = (const struct spellfix1_row*)B;
1560 return a->iScore - b->iScore;
1564 ** This version of the xFilter method work if the MATCH term is present
1565 ** and we are doing a scan.
1567 static int spellfix1FilterForMatch(
1568 spellfix1_cursor *pCur,
1569 int idxNum,
1570 int argc,
1571 sqlite3_value **argv
1573 const unsigned char *zPatternIn;
1574 char *zPattern;
1575 int nPattern;
1576 char *zClass;
1577 int nClass;
1578 int iLimit = 20;
1579 int iScope = 4;
1580 int iLang = 0;
1581 char *zSql;
1582 int rc;
1583 sqlite3_stmt *pStmt;
1584 int idx = 1;
1585 spellfix1_vtab *p = pCur->pVTab;
1587 if( idxNum&2 ){
1588 iLang = sqlite3_value_int(argv[idx++]);
1590 if( idxNum&4 ){
1591 iLimit = sqlite3_value_int(argv[idx++]);
1592 if( iLimit<1 ) iLimit = 1;
1594 if( idxNum&8 ){
1595 iScope = sqlite3_value_int(argv[idx++]);
1596 if( iScope<1 ) iScope = 1;
1598 spellfix1ResetCursor(pCur, iLimit);
1599 zPatternIn = sqlite3_value_text(argv[0]);
1600 if( zPatternIn==0 ) return SQLITE_OK;
1601 zPattern = (char*)transliterate(zPatternIn, sqlite3_value_bytes(argv[0]));
1602 if( zPattern==0 ) return SQLITE_NOMEM;
1603 nPattern = strlen(zPattern);
1604 if( zPattern[nPattern-1]=='*' ) nPattern--;
1605 if( nPattern<iScope ) iScope = nPattern;
1606 zClass = (char*)characterClassString((unsigned char*)zPattern,
1607 strlen(zPattern));
1608 nClass = strlen(zClass);
1609 if( nClass>iScope ){
1610 zClass[iScope] = 0;
1611 nClass = iScope;
1613 zSql = sqlite3_mprintf(
1614 "SELECT id, word, rank, k1"
1615 " FROM \"%w\".\"%w_vocab\""
1616 " WHERE langid=%d AND k2 GLOB '%q*'",
1617 p->zDbName, p->zTableName, iLang, zClass
1619 rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0);
1620 sqlite3_free(zSql);
1621 if( rc==SQLITE_OK ){
1622 const char *zK1;
1623 int iDist;
1624 int iRank;
1625 int iScore;
1626 int iWorst = 999999999;
1627 int idx;
1628 int idxWorst;
1629 int i;
1631 while( sqlite3_step(pStmt)==SQLITE_ROW ){
1632 zK1 = (const char*)sqlite3_column_text(pStmt, 3);
1633 if( zK1==0 ) continue;
1634 pCur->nSearch++;
1635 iRank = sqlite3_column_int(pStmt, 2);
1636 iDist = editdist(zPattern, zK1);
1637 iScore = spellfix1Score(iDist,iRank);
1638 if( pCur->nRow<pCur->nAlloc ){
1639 idx = pCur->nRow;
1640 }else if( iScore<iWorst ){
1641 idx = idxWorst;
1642 sqlite3_free(pCur->a[idx].zWord);
1643 }else{
1644 continue;
1646 pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1));
1647 pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0);
1648 pCur->a[idx].iRank = iRank;
1649 pCur->a[idx].iDistance = iDist;
1650 pCur->a[idx].iScore = iScore;
1651 if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
1652 if( pCur->nRow==pCur->nAlloc ){
1653 iWorst = pCur->a[0].iScore;
1654 idxWorst = 0;
1655 for(i=1; i<pCur->nRow; i++){
1656 iScore = pCur->a[i].iScore;
1657 if( iWorst<iScore ){
1658 iWorst = iScore;
1659 idxWorst = i;
1665 qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare);
1666 pCur->iTop = iLimit;
1667 pCur->iScope = iScope;
1668 sqlite3_finalize(pStmt);
1669 sqlite3_free(zPattern);
1670 sqlite3_free(zClass);
1671 return SQLITE_OK;
1675 ** This version of xFilter handles a full-table scan case
1677 static int spellfix1FilterForFullScan(
1678 spellfix1_cursor *pCur,
1679 int idxNum,
1680 int argc,
1681 sqlite3_value **argv
1683 spellfix1ResetCursor(pCur, 0);
1684 return SQLITE_OK;
1689 ** Called to "rewind" a cursor back to the beginning so that
1690 ** it starts its output over again. Always called at least once
1691 ** prior to any spellfix1Column, spellfix1Rowid, or spellfix1Eof call.
1693 static int spellfix1Filter(
1694 sqlite3_vtab_cursor *cur,
1695 int idxNum, const char *idxStr,
1696 int argc, sqlite3_value **argv
1698 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
1699 int rc;
1700 if( idxNum & 1 ){
1701 rc = spellfix1FilterForMatch(pCur, idxNum, argc, argv);
1702 }else{
1703 rc = spellfix1FilterForFullScan(pCur, idxNum, argc, argv);
1705 return rc;
1710 ** Advance a cursor to its next row of output
1712 static int spellfix1Next(sqlite3_vtab_cursor *cur){
1713 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
1714 if( pCur->iRow < pCur->nRow ) pCur->iRow++;
1715 return SQLITE_OK;
1719 ** Return TRUE if we are at the end-of-file
1721 static int spellfix1Eof(sqlite3_vtab_cursor *cur){
1722 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
1723 return pCur->iRow>=pCur->nRow;
1727 ** Return columns from the current row.
1729 static int spellfix1Column(sqlite3_vtab_cursor *cur, sqlite3_context *ctx, int i){
1730 spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
1731 switch( i ){
1732 case 0: {
1733 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zWord, -1, SQLITE_STATIC);
1734 break;
1736 case 1: {
1737 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iRank);
1738 break;
1740 case 2: {
1741 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iDistance);
1742 break;
1744 case 3: {
1745 sqlite3_result_int(ctx, pCur->iLang);
1746 break;
1748 case 4: {
1749 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
1750 break;
1752 case 5: {
1753 sqlite3_result_int(ctx, pCur->iTop);
1754 break;
1756 case 6: {
1757 sqlite3_result_int(ctx, pCur->iScope);
1758 break;
1760 case 7: {
1761 sqlite3_result_int(ctx, pCur->nSearch);
1762 break;
1764 default: {
1765 sqlite3_result_null(ctx);
1766 break;
1769 return SQLITE_OK;
1773 ** The rowid.
1775 static int spellfix1Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
1776 spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
1777 *pRowid = pCur->a[pCur->iRow].iRowid;
1778 return SQLITE_OK;
1782 ** The xUpdate() method.
1784 static int spellfix1Update(
1785 sqlite3_vtab *pVTab,
1786 int argc,
1787 sqlite3_value **argv,
1788 sqlite_int64 *pRowid
1790 int rc = SQLITE_OK;
1791 sqlite3_int64 rowid, newRowid;
1792 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
1793 sqlite3 *db = p->db;
1795 if( argc==1 ){
1796 /* A delete operation on the rowid given by argv[0] */
1797 rowid = *pRowid = sqlite3_value_int64(argv[0]);
1798 spellfix1DbExec(&rc, db, "DELETE FROM \"%w\".\"%w_vocab\" "
1799 " WHERE id=%lld",
1800 p->zDbName, p->zTableName, rowid);
1801 }else{
1802 const unsigned char *zWord = sqlite3_value_text(argv[2]);
1803 int nWord = sqlite3_value_bytes(argv[2]);
1804 int iLang = sqlite3_value_int(argv[5]);
1805 int iRank = sqlite3_value_int(argv[3]);
1806 const unsigned char *zSoundslike = sqlite3_value_text(argv[10]);
1807 int nSoundslike = sqlite3_value_bytes(argv[10]);
1808 char *zK1, *zK2;
1809 int i;
1810 char c;
1812 if( zWord==0 ){
1813 pVTab->zErrMsg = sqlite3_mprintf("%w.word may not be NULL",
1814 p->zTableName);
1815 return SQLITE_CONSTRAINT;
1817 if( iRank<1 ) iRank = 1;
1818 if( zSoundslike ){
1819 zK1 = (char*)transliterate(zSoundslike, nSoundslike);
1820 }else{
1821 zK1 = (char*)transliterate(zWord, nWord);
1823 if( zK1==0 ) return SQLITE_NOMEM;
1824 for(i=0; (c = zK1[i])!=0; i++){
1825 if( c>='A' && c<='Z' ) zK1[i] += 'a' - 'A';
1827 zK2 = (char*)characterClassString((const unsigned char*)zK1, i);
1828 if( zK2==0 ){
1829 sqlite3_free(zK1);
1830 return SQLITE_NOMEM;
1832 if( sqlite3_value_type(argv[0])==SQLITE_NULL ){
1833 spellfix1DbExec(&rc, db,
1834 "INSERT INTO \"%w\".\"%w_vocab\"(rank,langid,word,k1,k2) "
1835 "VALUES(%d,%d,%Q,%Q,%Q)",
1836 p->zDbName, p->zTableName,
1837 iRank, iLang, zWord, zK1, zK2
1839 *pRowid = sqlite3_last_insert_rowid(db);
1840 }else{
1841 rowid = sqlite3_value_int64(argv[0]);
1842 newRowid = *pRowid = sqlite3_value_int64(argv[1]);
1843 spellfix1DbExec(&rc, db,
1844 "UPDATE \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, lang=%d,"
1845 " word=%Q, rank=%d, k1=%Q, k2=%Q WHERE id=%lld",
1846 p->zDbName, p->zTableName, newRowid, iRank, iLang,
1847 zWord, zK1, zK2, rowid
1850 sqlite3_free(zK1);
1851 sqlite3_free(zK2);
1853 return rc;
1857 ** Rename the spellfix1 table.
1859 static int spellfix1Rename(sqlite3_vtab *pVTab, const char *zNew){
1860 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
1861 sqlite3 *db = p->db;
1862 int rc = SQLITE_OK;
1863 char *zNewName = sqlite3_mprintf("%s", zNew);
1864 if( zNewName==0 ){
1865 return SQLITE_NOMEM;
1867 spellfix1DbExec(&rc, db,
1868 "ALTER TABLE \"%w\".\"%w_vocab\" RENAME TO \"%w_vocab\"",
1869 p->zDbName, p->zTableName, zNewName
1871 if( rc==SQLITE_OK ){
1872 sqlite3_free(p->zTableName);
1873 p->zTableName = zNewName;
1875 return rc;
1880 ** A virtual table module that provides fuzzy search.
1882 static sqlite3_module spellfix1Module = {
1883 0, /* iVersion */
1884 spellfix1Create, /* xCreate - handle CREATE VIRTUAL TABLE */
1885 spellfix1Connect, /* xConnect - reconnected to an existing table */
1886 spellfix1BestIndex, /* xBestIndex - figure out how to do a query */
1887 spellfix1Disconnect, /* xDisconnect - close a connection */
1888 spellfix1Destroy, /* xDestroy - handle DROP TABLE */
1889 spellfix1Open, /* xOpen - open a cursor */
1890 spellfix1Close, /* xClose - close a cursor */
1891 spellfix1Filter, /* xFilter - configure scan constraints */
1892 spellfix1Next, /* xNext - advance a cursor */
1893 spellfix1Eof, /* xEof - check for end of scan */
1894 spellfix1Column, /* xColumn - read data */
1895 spellfix1Rowid, /* xRowid - read data */
1896 spellfix1Update, /* xUpdate */
1897 0, /* xBegin */
1898 0, /* xSync */
1899 0, /* xCommit */
1900 0, /* xRollback */
1901 0, /* xFindMethod */
1902 spellfix1Rename, /* xRename */
1906 ** Register the various functions and the virtual table.
1908 static int spellfix1Register(sqlite3 *db){
1909 int nErr = 0;
1910 int i;
1911 nErr += sqlite3_create_function(db, "spellfix1_translit", 1, SQLITE_UTF8, 0,
1912 transliterateSqlFunc, 0, 0);
1913 nErr += sqlite3_create_function(db, "spellfix1_editdist", 2, SQLITE_UTF8, 0,
1914 editdistSqlFunc, 0, 0);
1915 nErr += sqlite3_create_function(db, "spellfix1_charclass", 1, SQLITE_UTF8, 0,
1916 characterClassSqlFunc, 0, 0);
1917 nErr += sqlite3_create_function(db, "spellfix1_scriptcode", 1, SQLITE_UTF8, 0,
1918 scriptCodeSqlFunc, 0, 0);
1919 nErr += sqlite3_create_module(db, "spellfix1", &spellfix1Module, 0);
1921 /* Verify sanity of the translit[] table */
1922 for(i=0; i<sizeof(translit)/sizeof(translit[0])-1; i++){
1923 assert( translit[i].cFrom<translit[i+1].cFrom );
1926 return nErr ? SQLITE_ERROR : SQLITE_OK;
1929 #if SQLITE_CORE || defined(SQLITE_TEST)
1931 ** Register the spellfix1 virtual table and its associated functions.
1933 int sqlite3Spellfix1Register(sqlite3 *db){
1934 return spellfix1Register(db);
1936 #endif
1939 #if !SQLITE_CORE
1941 ** Extension load function.
1943 int sqlite3_extension_init(
1944 sqlite3 *db,
1945 char **pzErrMsg,
1946 const sqlite3_api_routines *pApi
1948 SQLITE_EXTENSION_INIT2(pApi);
1949 return spellfix1Register(db);
1951 #endif /* !SQLITE_CORE */