4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 *************************************************************************
13 ** This module implements the spellfix1 VIRTUAL TABLE that can be used
14 ** to search a large vocabulary for close matches. See separate
15 ** documentation (http://www.sqlite.org/spellfix1.html) for details.
17 #include "sqlite3ext.h"
18 SQLITE_EXTENSION_INIT1
20 #ifndef SQLITE_AMALGAMATION
27 typedef unsigned char u8
;
28 typedef unsigned short u16
;
32 #ifndef SQLITE_OMIT_VIRTUALTABLE
35 ** Character classes for ASCII characters:
37 ** 0 '' Silent letters: H W
38 ** 1 'A' Any vowel: A E I O U (Y)
39 ** 2 'B' A bilabeal stop or fricative: B F P V W
40 ** 3 'C' Other fricatives or back stops: C G J K Q S X Z
41 ** 4 'D' Alveolar stops: D T
42 ** 5 'H' Letter H at the beginning of a word
46 ** 9 'Y' Letter Y at the beginning of a word.
47 ** 10 '9' Digits: 0 1 2 3 4 5 6 7 8 9
51 #define CCLASS_SILENT 0
52 #define CCLASS_VOWEL 1
61 #define CCLASS_DIGIT 10
62 #define CCLASS_SPACE 11
63 #define CCLASS_OTHER 12
66 ** The following table gives the character class for non-initial ASCII
69 static const unsigned char midClass
[] = {
70 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
71 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
72 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
73 /* */ CCLASS_SPACE
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
74 /* */ CCLASS_SPACE
, /* */ CCLASS_SPACE
, /* */ CCLASS_OTHER
,
75 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
76 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
77 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
78 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
79 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
80 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_SPACE
,
81 /* ! */ CCLASS_OTHER
, /* " */ CCLASS_OTHER
, /* # */ CCLASS_OTHER
,
82 /* $ */ CCLASS_OTHER
, /* % */ CCLASS_OTHER
, /* & */ CCLASS_OTHER
,
83 /* ' */ CCLASS_SILENT
, /* ( */ CCLASS_OTHER
, /* ) */ CCLASS_OTHER
,
84 /* * */ CCLASS_OTHER
, /* + */ CCLASS_OTHER
, /* , */ CCLASS_OTHER
,
85 /* - */ CCLASS_OTHER
, /* . */ CCLASS_OTHER
, /* / */ CCLASS_OTHER
,
86 /* 0 */ CCLASS_DIGIT
, /* 1 */ CCLASS_DIGIT
, /* 2 */ CCLASS_DIGIT
,
87 /* 3 */ CCLASS_DIGIT
, /* 4 */ CCLASS_DIGIT
, /* 5 */ CCLASS_DIGIT
,
88 /* 6 */ CCLASS_DIGIT
, /* 7 */ CCLASS_DIGIT
, /* 8 */ CCLASS_DIGIT
,
89 /* 9 */ CCLASS_DIGIT
, /* : */ CCLASS_OTHER
, /* ; */ CCLASS_OTHER
,
90 /* < */ CCLASS_OTHER
, /* = */ CCLASS_OTHER
, /* > */ CCLASS_OTHER
,
91 /* ? */ CCLASS_OTHER
, /* @ */ CCLASS_OTHER
, /* A */ CCLASS_VOWEL
,
92 /* B */ CCLASS_B
, /* C */ CCLASS_C
, /* D */ CCLASS_D
,
93 /* E */ CCLASS_VOWEL
, /* F */ CCLASS_B
, /* G */ CCLASS_C
,
94 /* H */ CCLASS_SILENT
, /* I */ CCLASS_VOWEL
, /* J */ CCLASS_C
,
95 /* K */ CCLASS_C
, /* L */ CCLASS_L
, /* M */ CCLASS_M
,
96 /* N */ CCLASS_M
, /* O */ CCLASS_VOWEL
, /* P */ CCLASS_B
,
97 /* Q */ CCLASS_C
, /* R */ CCLASS_R
, /* S */ CCLASS_C
,
98 /* T */ CCLASS_D
, /* U */ CCLASS_VOWEL
, /* V */ CCLASS_B
,
99 /* W */ CCLASS_B
, /* X */ CCLASS_C
, /* Y */ CCLASS_VOWEL
,
100 /* Z */ CCLASS_C
, /* [ */ CCLASS_OTHER
, /* \ */ CCLASS_OTHER
,
101 /* ] */ CCLASS_OTHER
, /* ^ */ CCLASS_OTHER
, /* _ */ CCLASS_OTHER
,
102 /* ` */ CCLASS_OTHER
, /* a */ CCLASS_VOWEL
, /* b */ CCLASS_B
,
103 /* c */ CCLASS_C
, /* d */ CCLASS_D
, /* e */ CCLASS_VOWEL
,
104 /* f */ CCLASS_B
, /* g */ CCLASS_C
, /* h */ CCLASS_SILENT
,
105 /* i */ CCLASS_VOWEL
, /* j */ CCLASS_C
, /* k */ CCLASS_C
,
106 /* l */ CCLASS_L
, /* m */ CCLASS_M
, /* n */ CCLASS_M
,
107 /* o */ CCLASS_VOWEL
, /* p */ CCLASS_B
, /* q */ CCLASS_C
,
108 /* r */ CCLASS_R
, /* s */ CCLASS_C
, /* t */ CCLASS_D
,
109 /* u */ CCLASS_VOWEL
, /* v */ CCLASS_B
, /* w */ CCLASS_B
,
110 /* x */ CCLASS_C
, /* y */ CCLASS_VOWEL
, /* z */ CCLASS_C
,
111 /* { */ CCLASS_OTHER
, /* | */ CCLASS_OTHER
, /* } */ CCLASS_OTHER
,
112 /* ~ */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
115 ** This tables gives the character class for ASCII characters that form the
116 ** initial character of a word. The only difference from midClass is with
117 ** the letters H, W, and Y.
119 static const unsigned char initClass
[] = {
120 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
121 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
122 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
123 /* */ CCLASS_SPACE
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
124 /* */ CCLASS_SPACE
, /* */ CCLASS_SPACE
, /* */ CCLASS_OTHER
,
125 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
126 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
127 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
128 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
129 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
130 /* */ CCLASS_OTHER
, /* */ CCLASS_OTHER
, /* */ CCLASS_SPACE
,
131 /* ! */ CCLASS_OTHER
, /* " */ CCLASS_OTHER
, /* # */ CCLASS_OTHER
,
132 /* $ */ CCLASS_OTHER
, /* % */ CCLASS_OTHER
, /* & */ CCLASS_OTHER
,
133 /* ' */ CCLASS_OTHER
, /* ( */ CCLASS_OTHER
, /* ) */ CCLASS_OTHER
,
134 /* * */ CCLASS_OTHER
, /* + */ CCLASS_OTHER
, /* , */ CCLASS_OTHER
,
135 /* - */ CCLASS_OTHER
, /* . */ CCLASS_OTHER
, /* / */ CCLASS_OTHER
,
136 /* 0 */ CCLASS_DIGIT
, /* 1 */ CCLASS_DIGIT
, /* 2 */ CCLASS_DIGIT
,
137 /* 3 */ CCLASS_DIGIT
, /* 4 */ CCLASS_DIGIT
, /* 5 */ CCLASS_DIGIT
,
138 /* 6 */ CCLASS_DIGIT
, /* 7 */ CCLASS_DIGIT
, /* 8 */ CCLASS_DIGIT
,
139 /* 9 */ CCLASS_DIGIT
, /* : */ CCLASS_OTHER
, /* ; */ CCLASS_OTHER
,
140 /* < */ CCLASS_OTHER
, /* = */ CCLASS_OTHER
, /* > */ CCLASS_OTHER
,
141 /* ? */ CCLASS_OTHER
, /* @ */ CCLASS_OTHER
, /* A */ CCLASS_VOWEL
,
142 /* B */ CCLASS_B
, /* C */ CCLASS_C
, /* D */ CCLASS_D
,
143 /* E */ CCLASS_VOWEL
, /* F */ CCLASS_B
, /* G */ CCLASS_C
,
144 /* H */ CCLASS_SILENT
, /* I */ CCLASS_VOWEL
, /* J */ CCLASS_C
,
145 /* K */ CCLASS_C
, /* L */ CCLASS_L
, /* M */ CCLASS_M
,
146 /* N */ CCLASS_M
, /* O */ CCLASS_VOWEL
, /* P */ CCLASS_B
,
147 /* Q */ CCLASS_C
, /* R */ CCLASS_R
, /* S */ CCLASS_C
,
148 /* T */ CCLASS_D
, /* U */ CCLASS_VOWEL
, /* V */ CCLASS_B
,
149 /* W */ CCLASS_B
, /* X */ CCLASS_C
, /* Y */ CCLASS_Y
,
150 /* Z */ CCLASS_C
, /* [ */ CCLASS_OTHER
, /* \ */ CCLASS_OTHER
,
151 /* ] */ CCLASS_OTHER
, /* ^ */ CCLASS_OTHER
, /* _ */ CCLASS_OTHER
,
152 /* ` */ CCLASS_OTHER
, /* a */ CCLASS_VOWEL
, /* b */ CCLASS_B
,
153 /* c */ CCLASS_C
, /* d */ CCLASS_D
, /* e */ CCLASS_VOWEL
,
154 /* f */ CCLASS_B
, /* g */ CCLASS_C
, /* h */ CCLASS_SILENT
,
155 /* i */ CCLASS_VOWEL
, /* j */ CCLASS_C
, /* k */ CCLASS_C
,
156 /* l */ CCLASS_L
, /* m */ CCLASS_M
, /* n */ CCLASS_M
,
157 /* o */ CCLASS_VOWEL
, /* p */ CCLASS_B
, /* q */ CCLASS_C
,
158 /* r */ CCLASS_R
, /* s */ CCLASS_C
, /* t */ CCLASS_D
,
159 /* u */ CCLASS_VOWEL
, /* v */ CCLASS_B
, /* w */ CCLASS_B
,
160 /* x */ CCLASS_C
, /* y */ CCLASS_Y
, /* z */ CCLASS_C
,
161 /* { */ CCLASS_OTHER
, /* | */ CCLASS_OTHER
, /* } */ CCLASS_OTHER
,
162 /* ~ */ CCLASS_OTHER
, /* */ CCLASS_OTHER
,
166 ** Mapping from the character class number (0-13) to a symbol for each
167 ** character class. Note that initClass[] can be used to map the class
168 ** symbol back into the class number.
170 static const unsigned char className
[] = ".ABCDHLRMY9 ?";
173 ** Generate a "phonetic hash" from a string of ASCII characters
176 ** * Map characters by character class as defined above.
177 ** * Omit double-letters
178 ** * Omit vowels beside R and L
179 ** * Omit T when followed by CH
180 ** * Omit W when followed by R
181 ** * Omit D when followed by J or G
182 ** * Omit K in KN or G in GN at the beginning of a word
184 ** Space to hold the result is obtained from sqlite3_malloc()
186 ** Return NULL if memory allocation fails.
188 static unsigned char *phoneticHash(const unsigned char *zIn
, int nIn
){
189 unsigned char *zOut
= sqlite3_malloc( nIn
+ 1 );
194 const unsigned char *aClass
= initClass
;
196 if( zOut
==0 ) return 0;
201 if( zIn
[1]=='n' ){ zIn
++; nIn
--; }
206 for(i
=0; i
<nIn
; i
++){
207 unsigned char c
= zIn
[i
];
209 if( c
=='w' && zIn
[i
+1]=='r' ) continue;
210 if( c
=='d' && (zIn
[i
+1]=='j' || zIn
[i
+1]=='g') ) continue;
212 if( c
=='t' && zIn
[i
+1]=='c' && zIn
[i
+2]=='h' ) continue;
216 if( c
==CCLASS_SPACE
) continue;
217 if( c
==CCLASS_OTHER
&& cPrev
!=CCLASS_DIGIT
) continue;
219 if( c
==CCLASS_VOWEL
&& (cPrevX
==CCLASS_R
|| cPrevX
==CCLASS_L
) ){
220 continue; /* No vowels beside L or R */
222 if( (c
==CCLASS_R
|| c
==CCLASS_L
) && cPrevX
==CCLASS_VOWEL
){
223 nOut
--; /* No vowels beside L or R */
226 if( c
==CCLASS_SILENT
) continue;
230 if( nOut
==0 || c
!=zOut
[nOut
-1] ) zOut
[nOut
++] = c
;
237 ** This is an SQL function wrapper around phoneticHash(). See
238 ** the description of phoneticHash() for additional information.
240 static void phoneticHashSqlFunc(
241 sqlite3_context
*context
,
245 const unsigned char *zIn
;
248 zIn
= sqlite3_value_text(argv
[0]);
250 zOut
= phoneticHash(zIn
, sqlite3_value_bytes(argv
[0]));
252 sqlite3_result_error_nomem(context
);
254 sqlite3_result_text(context
, (char*)zOut
, -1, sqlite3_free
);
259 ** Return the character class number for a character given its
262 static char characterClass(char cPrev
, char c
){
263 return cPrev
==0 ? initClass
[c
&0x7f] : midClass
[c
&0x7f];
267 ** Return the cost of inserting or deleting character c immediately
268 ** following character cPrev. If cPrev==0, that means c is the first
269 ** character of the word.
271 static int insertOrDeleteCost(char cPrev
, char c
, char cNext
){
272 char classC
= characterClass(cPrev
, c
);
275 if( classC
==CCLASS_SILENT
){
276 /* Insert or delete "silent" characters such as H or W */
280 /* Repeated characters, or miss a repeat */
283 if( classC
==CCLASS_VOWEL
&& (cPrev
=='r' || cNext
=='r') ){
284 return 20; /* Insert a vowel before or after 'r' */
286 classCprev
= characterClass(cPrev
, cPrev
);
287 if( classC
==classCprev
){
288 if( classC
==CCLASS_VOWEL
){
289 /* Remove or add a new vowel to a vowel cluster */
292 /* Remove or add a consonant not in the same class */
297 /* any other character insertion or deletion */
302 ** Divide the insertion cost by this factor when appending to the
305 #define FINAL_INS_COST_DIV 4
308 ** Return the cost of substituting cTo in place of cFrom assuming
309 ** the previous character is cPrev. If cPrev==0 then cTo is the first
310 ** character of the word.
312 static int substituteCost(char cPrev
, char cFrom
, char cTo
){
313 char classFrom
, classTo
;
318 if( cFrom
==(cTo
^0x20) && ((cTo
>='A' && cTo
<='Z') || (cTo
>='a' && cTo
<='z')) ){
319 /* differ only in case */
322 classFrom
= characterClass(cPrev
, cFrom
);
323 classTo
= characterClass(cPrev
, cTo
);
324 if( classFrom
==classTo
){
325 /* Same character class */
328 if( classFrom
>=CCLASS_B
&& classFrom
<=CCLASS_Y
329 && classTo
>=CCLASS_B
&& classTo
<=CCLASS_Y
){
330 /* Convert from one consonant to another, but in a different class */
333 /* Any other subsitution */
338 ** Given two strings zA and zB which are pure ASCII, return the cost
339 ** of transforming zA into zB. If zA ends with '*' assume that it is
340 ** a prefix of zB and give only minimal penalty for extra characters
343 ** Smaller numbers mean a closer match.
345 ** Negative values indicate an error:
346 ** -1 One of the inputs is NULL
347 ** -2 Non-ASCII characters on input
348 ** -3 Unable to allocate memory
350 ** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
351 ** of zB that matched the pattern in zA. If zA does not end with a '*',
352 ** then this value is always the number of bytes in zB (i.e. strlen(zB)).
353 ** If zA does end in a '*', then it is the number of bytes in the prefix
354 ** of zB that was deemed to match zA.
356 static int editdist1(const char *zA
, const char *zB
, int *pnMatch
){
357 int nA
, nB
; /* Number of characters in zA[] and zB[] */
358 int xA
, xB
; /* Loop counters for zA[] and zB[] */
359 char cA
, cB
; /* Current character of zA and zB */
360 char cAprev
, cBprev
; /* Previous character of zA and zB */
361 char cAnext
, cBnext
; /* Next character in zA and zB */
362 int d
; /* North-west cost value */
363 int dc
= 0; /* North-west character value */
364 int res
; /* Final result */
365 int *m
; /* The cost matrix */
366 char *cx
; /* Corresponding character values */
367 int *toFree
= 0; /* Malloced space */
368 int mStack
[60+15]; /* Stack space to use if not too much is needed */
371 /* Early out if either input is NULL */
372 if( zA
==0 || zB
==0 ) return -1;
374 /* Skip any common prefix */
375 while( zA
[0] && zA
[0]==zB
[0] ){ dc
= zA
[0]; zA
++; zB
++; nMatch
++; }
376 if( pnMatch
) *pnMatch
= nMatch
;
377 if( zA
[0]==0 && zB
[0]==0 ) return 0;
380 printf("A=\"%s\" B=\"%s\" dc=%c\n", zA
, zB
, dc
?dc
:' ');
383 /* Verify input strings and measure their lengths */
384 for(nA
=0; zA
[nA
]; nA
++){
385 if( zA
[nA
]&0x80 ) return -2;
387 for(nB
=0; zB
[nB
]; nB
++){
388 if( zB
[nB
]&0x80 ) return -2;
391 /* Special processing if either string is empty */
394 for(xB
=res
=0; (cB
= zB
[xB
])!=0; xB
++){
395 res
+= insertOrDeleteCost(cBprev
, cB
, zB
[xB
+1])/FINAL_INS_COST_DIV
;
402 for(xA
=res
=0; (cA
= zA
[xA
])!=0; xA
++){
403 res
+= insertOrDeleteCost(cAprev
, cA
, zA
[xA
+1]);
409 /* A is a prefix of B */
410 if( zA
[0]=='*' && zA
[1]==0 ) return 0;
412 /* Allocate and initialize the Wagner matrix */
413 if( nB
<(sizeof(mStack
)*4)/(sizeof(mStack
[0])*5) ){
416 m
= toFree
= sqlite3_malloc( (nB
+1)*5*sizeof(m
[0])/4 );
417 if( m
==0 ) return -3;
419 cx
= (char*)&m
[nB
+1];
421 /* Compute the Wagner edit distance */
425 for(xB
=1; xB
<=nB
; xB
++){
429 m
[xB
] = m
[xB
-1] + insertOrDeleteCost(cBprev
, cB
, cBnext
);
433 for(xA
=1; xA
<=nA
; xA
++){
434 int lastA
= (xA
==nA
);
437 if( cA
=='*' && lastA
) break;
440 m
[0] = d
+ insertOrDeleteCost(cAprev
, cA
, cAnext
);
442 for(xB
=1; xB
<=nB
; xB
++){
443 int totalCost
, insCost
, delCost
, subCost
, ncx
;
447 /* Cost to insert cB */
448 insCost
= insertOrDeleteCost(cx
[xB
-1], cB
, cBnext
);
449 if( lastA
) insCost
/= FINAL_INS_COST_DIV
;
451 /* Cost to delete cA */
452 delCost
= insertOrDeleteCost(cx
[xB
], cA
, cBnext
);
454 /* Cost to substitute cA->cB */
455 subCost
= substituteCost(cx
[xB
-1], cA
, cB
);
458 totalCost
= insCost
+ m
[xB
-1];
460 if( (delCost
+ m
[xB
])<totalCost
){
461 totalCost
= delCost
+ m
[xB
];
464 if( (subCost
+ d
)<totalCost
){
465 totalCost
= subCost
+ d
;
469 printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c"
470 " ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n",
471 xA
, xB
, d
, m
[xB
], m
[xB
-1], dc
?dc
:' ', cA
, cB
,
472 insCost
, delCost
, subCost
, totalCost
, ncx
?ncx
:' ');
475 /* Update the matrix */
485 /* Free the wagner matrix and return the result */
488 for(xB
=1; xB
<=nB
; xB
++){
491 if( pnMatch
) *pnMatch
= xB
+nMatch
;
496 /* In the current implementation, pnMatch is always NULL if zA does
498 assert( pnMatch
==0 );
500 sqlite3_free(toFree
);
505 ** Function: editdist(A,B)
507 ** Return the cost of transforming string A into string B. Both strings
508 ** must be pure ASCII text. If A ends with '*' then it is assumed to be
509 ** a prefix of B and extra characters on the end of B have minimal additional
512 static void editdistSqlFunc(
513 sqlite3_context
*context
,
518 (const char*)sqlite3_value_text(argv
[0]),
519 (const char*)sqlite3_value_text(argv
[1]),
523 sqlite3_result_error_nomem(context
);
524 }else if( res
==(-2) ){
525 sqlite3_result_error(context
, "non-ASCII input to editdist()", -1);
527 sqlite3_result_error(context
, "NULL input to editdist()", -1);
530 sqlite3_result_int(context
, res
);
534 /* End of the fixed-cost edit distance implementation
535 ******************************************************************************
536 *****************************************************************************
537 ** Begin: Configurable cost unicode edit distance routines
539 /* Forward declaration of structures */
540 typedef struct EditDist3Cost EditDist3Cost
;
541 typedef struct EditDist3Config EditDist3Config
;
542 typedef struct EditDist3Point EditDist3Point
;
543 typedef struct EditDist3From EditDist3From
;
544 typedef struct EditDist3FromString EditDist3FromString
;
545 typedef struct EditDist3To EditDist3To
;
546 typedef struct EditDist3ToString EditDist3ToString
;
547 typedef struct EditDist3Lang EditDist3Lang
;
551 ** An entry in the edit cost table
553 struct EditDist3Cost
{
554 EditDist3Cost
*pNext
; /* Next cost element */
555 u8 nFrom
; /* Number of bytes in aFrom */
556 u8 nTo
; /* Number of bytes in aTo */
557 u16 iCost
; /* Cost of this transformation */
558 char a
[4] ; /* FROM string followed by TO string */
559 /* Additional TO and FROM string bytes appended as necessary */
563 ** Edit costs for a particular language ID
565 struct EditDist3Lang
{
566 int iLang
; /* Language ID */
567 int iInsCost
; /* Default insertion cost */
568 int iDelCost
; /* Default deletion cost */
569 int iSubCost
; /* Default substitution cost */
570 EditDist3Cost
*pCost
; /* Costs */
575 ** The default EditDist3Lang object, with default costs.
577 static const EditDist3Lang editDist3Lang
= { 0, 100, 100, 150, 0 };
580 ** Complete configuration
582 struct EditDist3Config
{
583 int nLang
; /* Number of language IDs. Size of a[] */
584 EditDist3Lang
*a
; /* One for each distinct language ID */
588 ** Extra information about each character in the FROM string.
590 struct EditDist3From
{
591 int nSubst
; /* Number of substitution cost entries */
592 int nDel
; /* Number of deletion cost entries */
593 int nByte
; /* Number of bytes in this character */
594 EditDist3Cost
**apSubst
; /* Array of substitution costs for this element */
595 EditDist3Cost
**apDel
; /* Array of deletion cost entries */
599 ** A precompiled FROM string.
601 ** In the common case we expect the FROM string to be reused multiple times.
602 ** In other words, the common case will be to measure the edit distance
603 ** from a single origin string to multiple target strings.
605 struct EditDist3FromString
{
606 char *z
; /* The complete text of the FROM string */
607 int n
; /* Number of characters in the FROM string */
608 int isPrefix
; /* True if ends with '*' character */
609 EditDist3From
*a
; /* Extra info about each char of the FROM string */
613 ** Extra information about each character in the TO string.
616 int nIns
; /* Number of insertion cost entries */
617 int nByte
; /* Number of bytes in this character */
618 EditDist3Cost
**apIns
; /* Array of deletion cost entries */
622 ** A precompiled FROM string
624 struct EditDist3ToString
{
625 char *z
; /* The complete text of the TO string */
626 int n
; /* Number of characters in the TO string */
627 EditDist3To
*a
; /* Extra info about each char of the TO string */
631 ** Clear or delete an instance of the object that records all edit-distance
634 static void editDist3ConfigClear(EditDist3Config
*p
){
637 for(i
=0; i
<p
->nLang
; i
++){
638 EditDist3Cost
*pCost
, *pNext
;
639 pCost
= p
->a
[i
].pCost
;
641 pNext
= pCost
->pNext
;
647 memset(p
, 0, sizeof(*p
));
649 static void editDist3ConfigDelete(void *pIn
){
650 EditDist3Config
*p
= (EditDist3Config
*)pIn
;
651 editDist3ConfigClear(p
);
656 ** Load all edit-distance weights from a table.
658 static int editDist3ConfigLoad(
659 EditDist3Config
*p
, /* The edit distance configuration to load */
660 sqlite3
*db
, /* Load from this database */
661 const char *zTable
/* Name of the table from which to load */
666 int iLangPrev
= -9999;
667 EditDist3Lang
*pLang
= 0;
669 zSql
= sqlite3_mprintf("SELECT iLang, cFrom, cTo, iCost"
670 " FROM \"%w\" WHERE iLang>=0 ORDER BY iLang", zTable
);
671 if( zSql
==0 ) return SQLITE_NOMEM
;
672 rc
= sqlite3_prepare(db
, zSql
, -1, &pStmt
, 0);
675 editDist3ConfigClear(p
);
676 while( sqlite3_step(pStmt
)==SQLITE_ROW
){
677 int iLang
= sqlite3_column_int(pStmt
, 0);
678 const char *zFrom
= (const char*)sqlite3_column_text(pStmt
, 1);
679 int nFrom
= zFrom
? sqlite3_column_bytes(pStmt
, 1) : 0;
680 const char *zTo
= (const char*)sqlite3_column_text(pStmt
, 2);
681 int nTo
= zTo
? sqlite3_column_bytes(pStmt
, 2) : 0;
682 int iCost
= sqlite3_column_int(pStmt
, 3);
684 assert( zFrom
!=0 || nFrom
==0 );
685 assert( zTo
!=0 || nTo
==0 );
686 if( nFrom
>100 || nTo
>100 ) continue;
687 if( iCost
<0 ) continue;
688 if( pLang
==0 || iLang
!=iLangPrev
){
690 pNew
= sqlite3_realloc(p
->a
, (p
->nLang
+1)*sizeof(p
->a
[0]));
691 if( pNew
==0 ){ rc
= SQLITE_NOMEM
; break; }
693 pLang
= &p
->a
[p
->nLang
];
695 pLang
->iLang
= iLang
;
696 pLang
->iInsCost
= 100;
697 pLang
->iDelCost
= 100;
698 pLang
->iSubCost
= 150;
702 if( nFrom
==1 && zFrom
[0]=='?' && nTo
==0 ){
703 pLang
->iDelCost
= iCost
;
704 }else if( nFrom
==0 && nTo
==1 && zTo
[0]=='?' ){
705 pLang
->iInsCost
= iCost
;
706 }else if( nFrom
==1 && nTo
==1 && zFrom
[0]=='?' && zTo
[0]=='?' ){
707 pLang
->iSubCost
= iCost
;
709 EditDist3Cost
*pCost
;
710 int nExtra
= nFrom
+ nTo
- 4;
711 if( nExtra
<0 ) nExtra
= 0;
712 pCost
= sqlite3_malloc( sizeof(*pCost
) + nExtra
);
713 if( pCost
==0 ){ rc
= SQLITE_NOMEM
; break; }
714 pCost
->nFrom
= nFrom
;
716 pCost
->iCost
= iCost
;
717 memcpy(pCost
->a
, zFrom
, nFrom
);
718 memcpy(pCost
->a
+ nFrom
, zTo
, nTo
);
719 pCost
->pNext
= pLang
->pCost
;
720 pLang
->pCost
= pCost
;
723 rc2
= sqlite3_finalize(pStmt
);
724 if( rc
==SQLITE_OK
) rc
= rc2
;
729 ** Return the length (in bytes) of a utf-8 character. Or return a maximum
732 static int utf8Len(unsigned char c
, int N
){
735 if( (c
&0xe0)==0xc0 ){
737 }else if( (c
&0xf0)==0xe0 ){
748 ** Return TRUE (non-zero) if the To side of the given cost matches
751 static int matchTo(EditDist3Cost
*p
, const char *z
, int n
){
752 if( p
->nTo
>n
) return 0;
753 if( strncmp(p
->a
+p
->nFrom
, z
, p
->nTo
)!=0 ) return 0;
758 ** Return TRUE (non-zero) if the From side of the given cost matches
761 static int matchFrom(EditDist3Cost
*p
, const char *z
, int n
){
762 assert( p
->nFrom
<=n
);
763 if( strncmp(p
->a
, z
, p
->nFrom
)!=0 ) return 0;
768 ** Return TRUE (non-zero) of the next FROM character and the next TO
769 ** character are the same.
771 static int matchFromTo(
772 EditDist3FromString
*pStr
, /* Left hand string */
773 int n1
, /* Index of comparison character on the left */
774 const char *z2
, /* Right-handl comparison character */
775 int n2
/* Bytes remaining in z2[] */
777 int b1
= pStr
->a
[n1
].nByte
;
778 if( b1
>n2
) return 0;
779 if( memcmp(pStr
->z
+n1
, z2
, b1
)!=0 ) return 0;
784 ** Delete an EditDist3FromString objecct
786 static void editDist3FromStringDelete(EditDist3FromString
*p
){
789 for(i
=0; i
<p
->n
; i
++){
790 sqlite3_free(p
->a
[i
].apDel
);
791 sqlite3_free(p
->a
[i
].apSubst
);
798 ** Create a EditDist3FromString object.
800 static EditDist3FromString
*editDist3FromStringNew(
801 const EditDist3Lang
*pLang
,
805 EditDist3FromString
*pStr
;
810 if( n
<0 ) n
= (int)strlen(z
);
811 pStr
= sqlite3_malloc( sizeof(*pStr
) + sizeof(pStr
->a
[0])*n
+ n
+ 1 );
812 if( pStr
==0 ) return 0;
813 pStr
->a
= (EditDist3From
*)&pStr
[1];
814 memset(pStr
->a
, 0, sizeof(pStr
->a
[0])*n
);
816 pStr
->z
= (char*)&pStr
->a
[n
];
817 memcpy(pStr
->z
, z
, n
+1);
818 if( n
&& z
[n
-1]=='*' ){
828 EditDist3From
*pFrom
= &pStr
->a
[i
];
829 memset(pFrom
, 0, sizeof(*pFrom
));
830 pFrom
->nByte
= utf8Len((unsigned char)z
[i
], n
-i
);
831 for(p
=pLang
->pCost
; p
; p
=p
->pNext
){
832 EditDist3Cost
**apNew
;
833 if( i
+p
->nFrom
>n
) continue;
834 if( matchFrom(p
, z
+i
, n
-i
)==0 ) continue;
836 apNew
= sqlite3_realloc(pFrom
->apDel
,
837 sizeof(*apNew
)*(pFrom
->nDel
+1));
838 if( apNew
==0 ) break;
839 pFrom
->apDel
= apNew
;
840 apNew
[pFrom
->nDel
++] = p
;
842 apNew
= sqlite3_realloc(pFrom
->apSubst
,
843 sizeof(*apNew
)*(pFrom
->nSubst
+1));
844 if( apNew
==0 ) break;
845 pFrom
->apSubst
= apNew
;
846 apNew
[pFrom
->nSubst
++] = p
;
850 editDist3FromStringDelete(pStr
);
859 ** Update entry m[i] such that it is the minimum of its current value
862 ** If the iCost is 1,000,000 or greater, then consider the cost to be
863 ** infinite and skip the update.
865 static void updateCost(
873 unsigned int b
= m
[j
] + iCost
;
874 if( b
<m
[i
] ) m
[i
] = b
;
878 /* Compute the edit distance between two strings.
880 ** If an error occurs, return a negative number which is the error code.
882 ** If pnMatch is not NULL, then *pnMatch is set to the number of characters
883 ** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does
884 ** not contain the pattern for a prefix-search, then this is always the number
885 ** of characters in z2. If pFrom does contain a prefix search pattern, then
886 ** it is the number of characters in the prefix of z2 that was deemed to
889 static int editDist3Core(
890 EditDist3FromString
*pFrom
, /* The FROM string */
891 const char *z2
, /* The TO string */
892 int n2
, /* Length of the TO string */
893 const EditDist3Lang
*pLang
, /* Edit weights for a particular language ID */
894 int *pnMatch
/* OUT: Characters in matched prefix */
899 EditDist3FromString f
= *pFrom
;
906 /* allocate the Wagner matrix and the aTo[] array for the TO string */
909 m
= sqlite3_malloc( n
*sizeof(m
[0]) + sizeof(a2
[0])*n2
);
910 if( m
==0 ) return -1; /* Out of memory */
911 a2
= (EditDist3To
*)&m
[n
];
912 memset(a2
, 0, sizeof(a2
[0])*n2
);
914 /* Fill in the a1[] matrix for all characters of the TO string */
915 for(i2
=0; i2
<n2
; i2
++){
916 a2
[i2
].nByte
= utf8Len((unsigned char)z2
[i2
], n2
-i2
);
917 for(p
=pLang
->pCost
; p
; p
=p
->pNext
){
918 EditDist3Cost
**apNew
;
919 if( p
->nFrom
>0 ) continue;
920 if( i2
+p
->nTo
>n2
) continue;
921 if( matchTo(p
, z2
+i2
, n2
-i2
)==0 ) continue;
923 apNew
= sqlite3_realloc(a2
[i2
].apIns
, sizeof(*apNew
)*a2
[i2
].nIns
);
925 res
= -1; /* Out of memory */
928 a2
[i2
].apIns
= apNew
;
929 a2
[i2
].apIns
[a2
[i2
].nIns
-1] = p
;
933 /* Prepare to compute the minimum edit distance */
935 memset(m
, 0x01, (n2
+1)*szRow
*sizeof(m
[0]));
938 /* First fill in the top-row of the matrix with FROM deletion costs */
939 for(i1
=0; i1
<f
.n
; i1
+= b1
){
941 updateCost(m
, i1
+b1
, i1
, pLang
->iDelCost
);
942 for(k
=0; k
<f
.a
[i1
].nDel
; k
++){
943 p
= f
.a
[i1
].apDel
[k
];
944 updateCost(m
, i1
+p
->nFrom
, i1
, p
->iCost
);
948 /* Fill in all subsequent rows, top-to-bottom, left-to-right */
949 for(i2
=0; i2
<n2
; i2
+= b2
){
950 int rx
; /* Starting index for current row */
951 int rxp
; /* Starting index for previous row */
955 updateCost(m
, rx
, rxp
, pLang
->iInsCost
);
956 for(k
=0; k
<a2
[i2
].nIns
; k
++){
958 updateCost(m
, szRow
*(i2
+p
->nTo
), rxp
, p
->iCost
);
960 for(i1
=0; i1
<f
.n
; i1
+=b1
){
961 int cx
; /* Index of current cell */
962 int cxp
; /* Index of cell immediately to the left */
963 int cxd
; /* Index of cell to the left and one row above */
964 int cxu
; /* Index of cell immediately above */
970 updateCost(m
, cx
, cxp
, pLang
->iDelCost
);
971 for(k
=0; k
<f
.a
[i1
].nDel
; k
++){
972 p
= f
.a
[i1
].apDel
[k
];
973 updateCost(m
, cxp
+p
->nFrom
, cxp
, p
->iCost
);
975 updateCost(m
, cx
, cxu
, pLang
->iInsCost
);
976 if( matchFromTo(&f
, i1
, z2
+i2
, n2
-i2
) ){
977 updateCost(m
, cx
, cxd
, 0);
979 updateCost(m
, cx
, cxd
, pLang
->iSubCost
);
980 for(k
=0; k
<f
.a
[i1
].nSubst
; k
++){
981 p
= f
.a
[i1
].apSubst
[k
];
982 if( matchTo(p
, z2
+i2
, n2
-i2
) ){
983 updateCost(m
, cxd
+p
->nFrom
+szRow
*p
->nTo
, cxd
, p
->iCost
);
989 #if 0 /* Enable for debugging */
991 for(i1
=0; i1
<f
.n
; i1
++) printf(" %c-%2x", f
.z
[i1
], f
.z
[i1
]&0xff);
993 for(i1
=0; i1
<szRow
; i1
++){
995 if( v
>9999 ) printf(" ****");
996 else printf(" %4d", v
);
999 for(i2
=0; i2
<n2
; i2
++){
1000 printf("%c-%02x:", z2
[i2
], z2
[i2
]&0xff);
1001 for(i1
=0; i1
<szRow
; i1
++){
1002 int v
= m
[(i2
+1)*szRow
+i1
];
1003 if( v
>9999 ) printf(" ****");
1004 else printf(" %4d", v
);
1010 /* Free memory allocations and return the result */
1011 res
= (int)m
[szRow
*(n2
+1)-1];
1014 for(i2
=1; i2
<=n2
; i2
++){
1015 int b
= m
[szRow
*i2
-1];
1025 if( (z2
[k
] & 0xc0)==0x80 ) nExtra
++;
1027 *pnMatch
= n
- nExtra
;
1031 for(i2
=0; i2
<n2
; i2
++) sqlite3_free(a2
[i2
].apIns
);
1037 ** Get an appropriate EditDist3Lang object.
1039 static const EditDist3Lang
*editDist3FindLang(
1040 EditDist3Config
*pConfig
,
1044 for(i
=0; i
<pConfig
->nLang
; i
++){
1045 if( pConfig
->a
[i
].iLang
==iLang
) return &pConfig
->a
[i
];
1047 return &editDist3Lang
;
1051 ** Function: editdist3(A,B,iLang)
1052 ** editdist3(tablename)
1054 ** Return the cost of transforming string A into string B using edit
1055 ** weights for iLang.
1057 ** The second form loads edit weights into memory from a table.
1059 static void editDist3SqlFunc(
1060 sqlite3_context
*context
,
1062 sqlite3_value
**argv
1064 EditDist3Config
*pConfig
= (EditDist3Config
*)sqlite3_user_data(context
);
1065 sqlite3
*db
= sqlite3_context_db_handle(context
);
1068 const char *zTable
= (const char*)sqlite3_value_text(argv
[0]);
1069 rc
= editDist3ConfigLoad(pConfig
, db
, zTable
);
1070 if( rc
) sqlite3_result_error_code(context
, rc
);
1072 const char *zA
= (const char*)sqlite3_value_text(argv
[0]);
1073 const char *zB
= (const char*)sqlite3_value_text(argv
[1]);
1074 int nA
= sqlite3_value_bytes(argv
[0]);
1075 int nB
= sqlite3_value_bytes(argv
[1]);
1076 int iLang
= argc
==3 ? sqlite3_value_int(argv
[2]) : 0;
1077 const EditDist3Lang
*pLang
= editDist3FindLang(pConfig
, iLang
);
1078 EditDist3FromString
*pFrom
;
1081 pFrom
= editDist3FromStringNew(pLang
, zA
, nA
);
1083 sqlite3_result_error_nomem(context
);
1086 dist
= editDist3Core(pFrom
, zB
, nB
, pLang
, 0);
1087 editDist3FromStringDelete(pFrom
);
1089 sqlite3_result_error_nomem(context
);
1091 sqlite3_result_int(context
, dist
);
1097 ** Register the editDist3 function with SQLite
1099 static int editDist3Install(sqlite3
*db
){
1101 EditDist3Config
*pConfig
= sqlite3_malloc( sizeof(*pConfig
) );
1102 if( pConfig
==0 ) return SQLITE_NOMEM
;
1103 memset(pConfig
, 0, sizeof(*pConfig
));
1104 rc
= sqlite3_create_function_v2(db
, "editdist3",
1105 2, SQLITE_UTF8
, pConfig
, editDist3SqlFunc
, 0, 0, 0);
1106 if( rc
==SQLITE_OK
){
1107 rc
= sqlite3_create_function_v2(db
, "editdist3",
1108 3, SQLITE_UTF8
, pConfig
, editDist3SqlFunc
, 0, 0, 0);
1110 if( rc
==SQLITE_OK
){
1111 rc
= sqlite3_create_function_v2(db
, "editdist3",
1112 1, SQLITE_UTF8
, pConfig
, editDist3SqlFunc
, 0, 0,
1113 editDist3ConfigDelete
);
1115 sqlite3_free(pConfig
);
1119 /* End configurable cost unicode edit distance routines
1120 ******************************************************************************
1121 ******************************************************************************
1122 ** Begin transliterate unicode-to-ascii implementation
1125 #if !SQLITE_AMALGAMATION
1127 ** This lookup table is used to help decode the first byte of
1128 ** a multi-byte UTF8 character.
1130 static const unsigned char sqlite3Utf8Trans1
[] = {
1131 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1132 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1133 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
1134 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
1135 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1136 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1137 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1138 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
1143 ** Return the value of the first UTF-8 character in the string.
1145 static int utf8Read(const unsigned char *z
, int n
, int *pSize
){
1148 /* All callers to this routine (in the current implementation)
1149 ** always have n>0. */
1156 c
= sqlite3Utf8Trans1
[c
-0xc0];
1157 while( i
<n
&& (z
[i
] & 0xc0)==0x80 ){
1158 c
= (c
<<6) + (0x3f & z
[i
++]);
1167 ** Return the number of characters in the utf-8 string in the nIn byte
1168 ** buffer pointed to by zIn.
1170 static int utf8Charlen(const char *zIn
, int nIn
){
1173 for(i
=0; i
<nIn
; nChar
++){
1175 utf8Read((const unsigned char *)&zIn
[i
], nIn
-i
, &sz
);
1182 ** Table of translations from unicode characters into ASCII.
1184 static const struct {
1185 unsigned short int cFrom
;
1186 unsigned char cTo0
, cTo1
;
1188 { 0x00A0, 0x20, 0x00 }, /* to */
1189 { 0x00B5, 0x75, 0x00 }, /* µ to u */
1190 { 0x00C0, 0x41, 0x00 }, /* À to A */
1191 { 0x00C1, 0x41, 0x00 }, /* Á to A */
1192 { 0x00C2, 0x41, 0x00 }, /* Â to A */
1193 { 0x00C3, 0x41, 0x00 }, /* Ã to A */
1194 { 0x00C4, 0x41, 0x65 }, /* Ä to Ae */
1195 { 0x00C5, 0x41, 0x61 }, /* Å to Aa */
1196 { 0x00C6, 0x41, 0x45 }, /* Æ to AE */
1197 { 0x00C7, 0x43, 0x00 }, /* Ç to C */
1198 { 0x00C8, 0x45, 0x00 }, /* È to E */
1199 { 0x00C9, 0x45, 0x00 }, /* É to E */
1200 { 0x00CA, 0x45, 0x00 }, /* Ê to E */
1201 { 0x00CB, 0x45, 0x00 }, /* Ë to E */
1202 { 0x00CC, 0x49, 0x00 }, /* Ì to I */
1203 { 0x00CD, 0x49, 0x00 }, /* Í to I */
1204 { 0x00CE, 0x49, 0x00 }, /* Î to I */
1205 { 0x00CF, 0x49, 0x00 }, /* Ï to I */
1206 { 0x00D0, 0x44, 0x00 }, /* Ð to D */
1207 { 0x00D1, 0x4E, 0x00 }, /* Ñ to N */
1208 { 0x00D2, 0x4F, 0x00 }, /* Ò to O */
1209 { 0x00D3, 0x4F, 0x00 }, /* Ó to O */
1210 { 0x00D4, 0x4F, 0x00 }, /* Ô to O */
1211 { 0x00D5, 0x4F, 0x00 }, /* Õ to O */
1212 { 0x00D6, 0x4F, 0x65 }, /* Ö to Oe */
1213 { 0x00D7, 0x78, 0x00 }, /* × to x */
1214 { 0x00D8, 0x4F, 0x00 }, /* Ø to O */
1215 { 0x00D9, 0x55, 0x00 }, /* Ù to U */
1216 { 0x00DA, 0x55, 0x00 }, /* Ú to U */
1217 { 0x00DB, 0x55, 0x00 }, /* Û to U */
1218 { 0x00DC, 0x55, 0x65 }, /* Ü to Ue */
1219 { 0x00DD, 0x59, 0x00 }, /* Ý to Y */
1220 { 0x00DE, 0x54, 0x68 }, /* Þ to Th */
1221 { 0x00DF, 0x73, 0x73 }, /* ß to ss */
1222 { 0x00E0, 0x61, 0x00 }, /* à to a */
1223 { 0x00E1, 0x61, 0x00 }, /* á to a */
1224 { 0x00E2, 0x61, 0x00 }, /* â to a */
1225 { 0x00E3, 0x61, 0x00 }, /* ã to a */
1226 { 0x00E4, 0x61, 0x65 }, /* ä to ae */
1227 { 0x00E5, 0x61, 0x61 }, /* å to aa */
1228 { 0x00E6, 0x61, 0x65 }, /* æ to ae */
1229 { 0x00E7, 0x63, 0x00 }, /* ç to c */
1230 { 0x00E8, 0x65, 0x00 }, /* è to e */
1231 { 0x00E9, 0x65, 0x00 }, /* é to e */
1232 { 0x00EA, 0x65, 0x00 }, /* ê to e */
1233 { 0x00EB, 0x65, 0x00 }, /* ë to e */
1234 { 0x00EC, 0x69, 0x00 }, /* ì to i */
1235 { 0x00ED, 0x69, 0x00 }, /* í to i */
1236 { 0x00EE, 0x69, 0x00 }, /* î to i */
1237 { 0x00EF, 0x69, 0x00 }, /* ï to i */
1238 { 0x00F0, 0x64, 0x00 }, /* ð to d */
1239 { 0x00F1, 0x6E, 0x00 }, /* ñ to n */
1240 { 0x00F2, 0x6F, 0x00 }, /* ò to o */
1241 { 0x00F3, 0x6F, 0x00 }, /* ó to o */
1242 { 0x00F4, 0x6F, 0x00 }, /* ô to o */
1243 { 0x00F5, 0x6F, 0x00 }, /* õ to o */
1244 { 0x00F6, 0x6F, 0x65 }, /* ö to oe */
1245 { 0x00F7, 0x3A, 0x00 }, /* ÷ to : */
1246 { 0x00F8, 0x6F, 0x00 }, /* ø to o */
1247 { 0x00F9, 0x75, 0x00 }, /* ù to u */
1248 { 0x00FA, 0x75, 0x00 }, /* ú to u */
1249 { 0x00FB, 0x75, 0x00 }, /* û to u */
1250 { 0x00FC, 0x75, 0x65 }, /* ü to ue */
1251 { 0x00FD, 0x79, 0x00 }, /* ý to y */
1252 { 0x00FE, 0x74, 0x68 }, /* þ to th */
1253 { 0x00FF, 0x79, 0x00 }, /* ÿ to y */
1254 { 0x0100, 0x41, 0x00 }, /* Ā to A */
1255 { 0x0101, 0x61, 0x00 }, /* ā to a */
1256 { 0x0102, 0x41, 0x00 }, /* Ă to A */
1257 { 0x0103, 0x61, 0x00 }, /* ă to a */
1258 { 0x0104, 0x41, 0x00 }, /* Ą to A */
1259 { 0x0105, 0x61, 0x00 }, /* ą to a */
1260 { 0x0106, 0x43, 0x00 }, /* Ć to C */
1261 { 0x0107, 0x63, 0x00 }, /* ć to c */
1262 { 0x0108, 0x43, 0x68 }, /* Ĉ to Ch */
1263 { 0x0109, 0x63, 0x68 }, /* ĉ to ch */
1264 { 0x010A, 0x43, 0x00 }, /* Ċ to C */
1265 { 0x010B, 0x63, 0x00 }, /* ċ to c */
1266 { 0x010C, 0x43, 0x00 }, /* Č to C */
1267 { 0x010D, 0x63, 0x00 }, /* č to c */
1268 { 0x010E, 0x44, 0x00 }, /* Ď to D */
1269 { 0x010F, 0x64, 0x00 }, /* ď to d */
1270 { 0x0110, 0x44, 0x00 }, /* Đ to D */
1271 { 0x0111, 0x64, 0x00 }, /* đ to d */
1272 { 0x0112, 0x45, 0x00 }, /* Ē to E */
1273 { 0x0113, 0x65, 0x00 }, /* ē to e */
1274 { 0x0114, 0x45, 0x00 }, /* Ĕ to E */
1275 { 0x0115, 0x65, 0x00 }, /* ĕ to e */
1276 { 0x0116, 0x45, 0x00 }, /* Ė to E */
1277 { 0x0117, 0x65, 0x00 }, /* ė to e */
1278 { 0x0118, 0x45, 0x00 }, /* Ę to E */
1279 { 0x0119, 0x65, 0x00 }, /* ę to e */
1280 { 0x011A, 0x45, 0x00 }, /* Ě to E */
1281 { 0x011B, 0x65, 0x00 }, /* ě to e */
1282 { 0x011C, 0x47, 0x68 }, /* Ĝ to Gh */
1283 { 0x011D, 0x67, 0x68 }, /* ĝ to gh */
1284 { 0x011E, 0x47, 0x00 }, /* Ğ to G */
1285 { 0x011F, 0x67, 0x00 }, /* ğ to g */
1286 { 0x0120, 0x47, 0x00 }, /* Ġ to G */
1287 { 0x0121, 0x67, 0x00 }, /* ġ to g */
1288 { 0x0122, 0x47, 0x00 }, /* Ģ to G */
1289 { 0x0123, 0x67, 0x00 }, /* ģ to g */
1290 { 0x0124, 0x48, 0x68 }, /* Ĥ to Hh */
1291 { 0x0125, 0x68, 0x68 }, /* ĥ to hh */
1292 { 0x0126, 0x48, 0x00 }, /* Ħ to H */
1293 { 0x0127, 0x68, 0x00 }, /* ħ to h */
1294 { 0x0128, 0x49, 0x00 }, /* Ĩ to I */
1295 { 0x0129, 0x69, 0x00 }, /* ĩ to i */
1296 { 0x012A, 0x49, 0x00 }, /* Ī to I */
1297 { 0x012B, 0x69, 0x00 }, /* ī to i */
1298 { 0x012C, 0x49, 0x00 }, /* Ĭ to I */
1299 { 0x012D, 0x69, 0x00 }, /* ĭ to i */
1300 { 0x012E, 0x49, 0x00 }, /* Į to I */
1301 { 0x012F, 0x69, 0x00 }, /* į to i */
1302 { 0x0130, 0x49, 0x00 }, /* İ to I */
1303 { 0x0131, 0x69, 0x00 }, /* ı to i */
1304 { 0x0132, 0x49, 0x4A }, /* IJ to IJ */
1305 { 0x0133, 0x69, 0x6A }, /* ij to ij */
1306 { 0x0134, 0x4A, 0x68 }, /* Ĵ to Jh */
1307 { 0x0135, 0x6A, 0x68 }, /* ĵ to jh */
1308 { 0x0136, 0x4B, 0x00 }, /* Ķ to K */
1309 { 0x0137, 0x6B, 0x00 }, /* ķ to k */
1310 { 0x0138, 0x6B, 0x00 }, /* ĸ to k */
1311 { 0x0139, 0x4C, 0x00 }, /* Ĺ to L */
1312 { 0x013A, 0x6C, 0x00 }, /* ĺ to l */
1313 { 0x013B, 0x4C, 0x00 }, /* Ļ to L */
1314 { 0x013C, 0x6C, 0x00 }, /* ļ to l */
1315 { 0x013D, 0x4C, 0x00 }, /* Ľ to L */
1316 { 0x013E, 0x6C, 0x00 }, /* ľ to l */
1317 { 0x013F, 0x4C, 0x2E }, /* Ŀ to L. */
1318 { 0x0140, 0x6C, 0x2E }, /* ŀ to l. */
1319 { 0x0141, 0x4C, 0x00 }, /* Ł to L */
1320 { 0x0142, 0x6C, 0x00 }, /* ł to l */
1321 { 0x0143, 0x4E, 0x00 }, /* Ń to N */
1322 { 0x0144, 0x6E, 0x00 }, /* ń to n */
1323 { 0x0145, 0x4E, 0x00 }, /* Ņ to N */
1324 { 0x0146, 0x6E, 0x00 }, /* ņ to n */
1325 { 0x0147, 0x4E, 0x00 }, /* Ň to N */
1326 { 0x0148, 0x6E, 0x00 }, /* ň to n */
1327 { 0x0149, 0x27, 0x6E }, /* ʼn to 'n */
1328 { 0x014A, 0x4E, 0x47 }, /* Ŋ to NG */
1329 { 0x014B, 0x6E, 0x67 }, /* ŋ to ng */
1330 { 0x014C, 0x4F, 0x00 }, /* Ō to O */
1331 { 0x014D, 0x6F, 0x00 }, /* ō to o */
1332 { 0x014E, 0x4F, 0x00 }, /* Ŏ to O */
1333 { 0x014F, 0x6F, 0x00 }, /* ŏ to o */
1334 { 0x0150, 0x4F, 0x00 }, /* Ő to O */
1335 { 0x0151, 0x6F, 0x00 }, /* ő to o */
1336 { 0x0152, 0x4F, 0x45 }, /* Œ to OE */
1337 { 0x0153, 0x6F, 0x65 }, /* œ to oe */
1338 { 0x0154, 0x52, 0x00 }, /* Ŕ to R */
1339 { 0x0155, 0x72, 0x00 }, /* ŕ to r */
1340 { 0x0156, 0x52, 0x00 }, /* Ŗ to R */
1341 { 0x0157, 0x72, 0x00 }, /* ŗ to r */
1342 { 0x0158, 0x52, 0x00 }, /* Ř to R */
1343 { 0x0159, 0x72, 0x00 }, /* ř to r */
1344 { 0x015A, 0x53, 0x00 }, /* Ś to S */
1345 { 0x015B, 0x73, 0x00 }, /* ś to s */
1346 { 0x015C, 0x53, 0x68 }, /* Ŝ to Sh */
1347 { 0x015D, 0x73, 0x68 }, /* ŝ to sh */
1348 { 0x015E, 0x53, 0x00 }, /* Ş to S */
1349 { 0x015F, 0x73, 0x00 }, /* ş to s */
1350 { 0x0160, 0x53, 0x00 }, /* Š to S */
1351 { 0x0161, 0x73, 0x00 }, /* š to s */
1352 { 0x0162, 0x54, 0x00 }, /* Ţ to T */
1353 { 0x0163, 0x74, 0x00 }, /* ţ to t */
1354 { 0x0164, 0x54, 0x00 }, /* Ť to T */
1355 { 0x0165, 0x74, 0x00 }, /* ť to t */
1356 { 0x0166, 0x54, 0x00 }, /* Ŧ to T */
1357 { 0x0167, 0x74, 0x00 }, /* ŧ to t */
1358 { 0x0168, 0x55, 0x00 }, /* Ũ to U */
1359 { 0x0169, 0x75, 0x00 }, /* ũ to u */
1360 { 0x016A, 0x55, 0x00 }, /* Ū to U */
1361 { 0x016B, 0x75, 0x00 }, /* ū to u */
1362 { 0x016C, 0x55, 0x00 }, /* Ŭ to U */
1363 { 0x016D, 0x75, 0x00 }, /* ŭ to u */
1364 { 0x016E, 0x55, 0x00 }, /* Ů to U */
1365 { 0x016F, 0x75, 0x00 }, /* ů to u */
1366 { 0x0170, 0x55, 0x00 }, /* Ű to U */
1367 { 0x0171, 0x75, 0x00 }, /* ű to u */
1368 { 0x0172, 0x55, 0x00 }, /* Ų to U */
1369 { 0x0173, 0x75, 0x00 }, /* ų to u */
1370 { 0x0174, 0x57, 0x00 }, /* Ŵ to W */
1371 { 0x0175, 0x77, 0x00 }, /* ŵ to w */
1372 { 0x0176, 0x59, 0x00 }, /* Ŷ to Y */
1373 { 0x0177, 0x79, 0x00 }, /* ŷ to y */
1374 { 0x0178, 0x59, 0x00 }, /* Ÿ to Y */
1375 { 0x0179, 0x5A, 0x00 }, /* Ź to Z */
1376 { 0x017A, 0x7A, 0x00 }, /* ź to z */
1377 { 0x017B, 0x5A, 0x00 }, /* Ż to Z */
1378 { 0x017C, 0x7A, 0x00 }, /* ż to z */
1379 { 0x017D, 0x5A, 0x00 }, /* Ž to Z */
1380 { 0x017E, 0x7A, 0x00 }, /* ž to z */
1381 { 0x017F, 0x73, 0x00 }, /* ſ to s */
1382 { 0x0192, 0x66, 0x00 }, /* ƒ to f */
1383 { 0x0218, 0x53, 0x00 }, /* Ș to S */
1384 { 0x0219, 0x73, 0x00 }, /* ș to s */
1385 { 0x021A, 0x54, 0x00 }, /* Ț to T */
1386 { 0x021B, 0x74, 0x00 }, /* ț to t */
1387 { 0x0386, 0x41, 0x00 }, /* Ά to A */
1388 { 0x0388, 0x45, 0x00 }, /* Έ to E */
1389 { 0x0389, 0x49, 0x00 }, /* Ή to I */
1390 { 0x038A, 0x49, 0x00 }, /* Ί to I */
1391 { 0x038C, 0x4f, 0x00 }, /* Ό to O */
1392 { 0x038E, 0x59, 0x00 }, /* Ύ to Y */
1393 { 0x038F, 0x4f, 0x00 }, /* Ώ to O */
1394 { 0x0390, 0x69, 0x00 }, /* ΐ to i */
1395 { 0x0391, 0x41, 0x00 }, /* Α to A */
1396 { 0x0392, 0x42, 0x00 }, /* Β to B */
1397 { 0x0393, 0x47, 0x00 }, /* Γ to G */
1398 { 0x0394, 0x44, 0x00 }, /* Δ to D */
1399 { 0x0395, 0x45, 0x00 }, /* Ε to E */
1400 { 0x0396, 0x5a, 0x00 }, /* Ζ to Z */
1401 { 0x0397, 0x49, 0x00 }, /* Η to I */
1402 { 0x0398, 0x54, 0x68 }, /* Θ to Th */
1403 { 0x0399, 0x49, 0x00 }, /* Ι to I */
1404 { 0x039A, 0x4b, 0x00 }, /* Κ to K */
1405 { 0x039B, 0x4c, 0x00 }, /* Λ to L */
1406 { 0x039C, 0x4d, 0x00 }, /* Μ to M */
1407 { 0x039D, 0x4e, 0x00 }, /* Ν to N */
1408 { 0x039E, 0x58, 0x00 }, /* Ξ to X */
1409 { 0x039F, 0x4f, 0x00 }, /* Ο to O */
1410 { 0x03A0, 0x50, 0x00 }, /* Π to P */
1411 { 0x03A1, 0x52, 0x00 }, /* Ρ to R */
1412 { 0x03A3, 0x53, 0x00 }, /* Σ to S */
1413 { 0x03A4, 0x54, 0x00 }, /* Τ to T */
1414 { 0x03A5, 0x59, 0x00 }, /* Υ to Y */
1415 { 0x03A6, 0x46, 0x00 }, /* Φ to F */
1416 { 0x03A7, 0x43, 0x68 }, /* Χ to Ch */
1417 { 0x03A8, 0x50, 0x73 }, /* Ψ to Ps */
1418 { 0x03A9, 0x4f, 0x00 }, /* Ω to O */
1419 { 0x03AA, 0x49, 0x00 }, /* Ϊ to I */
1420 { 0x03AB, 0x59, 0x00 }, /* Ϋ to Y */
1421 { 0x03AC, 0x61, 0x00 }, /* ά to a */
1422 { 0x03AD, 0x65, 0x00 }, /* έ to e */
1423 { 0x03AE, 0x69, 0x00 }, /* ή to i */
1424 { 0x03AF, 0x69, 0x00 }, /* ί to i */
1425 { 0x03B1, 0x61, 0x00 }, /* α to a */
1426 { 0x03B2, 0x62, 0x00 }, /* β to b */
1427 { 0x03B3, 0x67, 0x00 }, /* γ to g */
1428 { 0x03B4, 0x64, 0x00 }, /* δ to d */
1429 { 0x03B5, 0x65, 0x00 }, /* ε to e */
1430 { 0x03B6, 0x7a, 0x00 }, /* ζ to z */
1431 { 0x03B7, 0x69, 0x00 }, /* η to i */
1432 { 0x03B8, 0x74, 0x68 }, /* θ to th */
1433 { 0x03B9, 0x69, 0x00 }, /* ι to i */
1434 { 0x03BA, 0x6b, 0x00 }, /* κ to k */
1435 { 0x03BB, 0x6c, 0x00 }, /* λ to l */
1436 { 0x03BC, 0x6d, 0x00 }, /* μ to m */
1437 { 0x03BD, 0x6e, 0x00 }, /* ν to n */
1438 { 0x03BE, 0x78, 0x00 }, /* ξ to x */
1439 { 0x03BF, 0x6f, 0x00 }, /* ο to o */
1440 { 0x03C0, 0x70, 0x00 }, /* π to p */
1441 { 0x03C1, 0x72, 0x00 }, /* ρ to r */
1442 { 0x03C3, 0x73, 0x00 }, /* σ to s */
1443 { 0x03C4, 0x74, 0x00 }, /* τ to t */
1444 { 0x03C5, 0x79, 0x00 }, /* υ to y */
1445 { 0x03C6, 0x66, 0x00 }, /* φ to f */
1446 { 0x03C7, 0x63, 0x68 }, /* χ to ch */
1447 { 0x03C8, 0x70, 0x73 }, /* ψ to ps */
1448 { 0x03C9, 0x6f, 0x00 }, /* ω to o */
1449 { 0x03CA, 0x69, 0x00 }, /* ϊ to i */
1450 { 0x03CB, 0x79, 0x00 }, /* ϋ to y */
1451 { 0x03CC, 0x6f, 0x00 }, /* ό to o */
1452 { 0x03CD, 0x79, 0x00 }, /* ύ to y */
1453 { 0x03CE, 0x69, 0x00 }, /* ώ to i */
1454 { 0x0400, 0x45, 0x00 }, /* Ѐ to E */
1455 { 0x0401, 0x45, 0x00 }, /* Ё to E */
1456 { 0x0402, 0x44, 0x00 }, /* Ђ to D */
1457 { 0x0403, 0x47, 0x00 }, /* Ѓ to G */
1458 { 0x0404, 0x45, 0x00 }, /* Є to E */
1459 { 0x0405, 0x5a, 0x00 }, /* Ѕ to Z */
1460 { 0x0406, 0x49, 0x00 }, /* І to I */
1461 { 0x0407, 0x49, 0x00 }, /* Ї to I */
1462 { 0x0408, 0x4a, 0x00 }, /* Ј to J */
1463 { 0x0409, 0x49, 0x00 }, /* Љ to I */
1464 { 0x040A, 0x4e, 0x00 }, /* Њ to N */
1465 { 0x040B, 0x44, 0x00 }, /* Ћ to D */
1466 { 0x040C, 0x4b, 0x00 }, /* Ќ to K */
1467 { 0x040D, 0x49, 0x00 }, /* Ѝ to I */
1468 { 0x040E, 0x55, 0x00 }, /* Ў to U */
1469 { 0x040F, 0x44, 0x00 }, /* Џ to D */
1470 { 0x0410, 0x41, 0x00 }, /* А to A */
1471 { 0x0411, 0x42, 0x00 }, /* Б to B */
1472 { 0x0412, 0x56, 0x00 }, /* В to V */
1473 { 0x0413, 0x47, 0x00 }, /* Г to G */
1474 { 0x0414, 0x44, 0x00 }, /* Д to D */
1475 { 0x0415, 0x45, 0x00 }, /* Е to E */
1476 { 0x0416, 0x5a, 0x68 }, /* Ж to Zh */
1477 { 0x0417, 0x5a, 0x00 }, /* З to Z */
1478 { 0x0418, 0x49, 0x00 }, /* И to I */
1479 { 0x0419, 0x49, 0x00 }, /* Й to I */
1480 { 0x041A, 0x4b, 0x00 }, /* К to K */
1481 { 0x041B, 0x4c, 0x00 }, /* Л to L */
1482 { 0x041C, 0x4d, 0x00 }, /* М to M */
1483 { 0x041D, 0x4e, 0x00 }, /* Н to N */
1484 { 0x041E, 0x4f, 0x00 }, /* О to O */
1485 { 0x041F, 0x50, 0x00 }, /* П to P */
1486 { 0x0420, 0x52, 0x00 }, /* Р to R */
1487 { 0x0421, 0x53, 0x00 }, /* С to S */
1488 { 0x0422, 0x54, 0x00 }, /* Т to T */
1489 { 0x0423, 0x55, 0x00 }, /* У to U */
1490 { 0x0424, 0x46, 0x00 }, /* Ф to F */
1491 { 0x0425, 0x4b, 0x68 }, /* Х to Kh */
1492 { 0x0426, 0x54, 0x63 }, /* Ц to Tc */
1493 { 0x0427, 0x43, 0x68 }, /* Ч to Ch */
1494 { 0x0428, 0x53, 0x68 }, /* Ш to Sh */
1495 { 0x0429, 0x53, 0x68 }, /* Щ to Shch */
1496 { 0x042A, 0x61, 0x00 }, /* to A */
1497 { 0x042B, 0x59, 0x00 }, /* Ы to Y */
1498 { 0x042C, 0x59, 0x00 }, /* to Y */
1499 { 0x042D, 0x45, 0x00 }, /* Э to E */
1500 { 0x042E, 0x49, 0x75 }, /* Ю to Iu */
1501 { 0x042F, 0x49, 0x61 }, /* Я to Ia */
1502 { 0x0430, 0x61, 0x00 }, /* а to a */
1503 { 0x0431, 0x62, 0x00 }, /* б to b */
1504 { 0x0432, 0x76, 0x00 }, /* в to v */
1505 { 0x0433, 0x67, 0x00 }, /* г to g */
1506 { 0x0434, 0x64, 0x00 }, /* д to d */
1507 { 0x0435, 0x65, 0x00 }, /* е to e */
1508 { 0x0436, 0x7a, 0x68 }, /* ж to zh */
1509 { 0x0437, 0x7a, 0x00 }, /* з to z */
1510 { 0x0438, 0x69, 0x00 }, /* и to i */
1511 { 0x0439, 0x69, 0x00 }, /* й to i */
1512 { 0x043A, 0x6b, 0x00 }, /* к to k */
1513 { 0x043B, 0x6c, 0x00 }, /* л to l */
1514 { 0x043C, 0x6d, 0x00 }, /* м to m */
1515 { 0x043D, 0x6e, 0x00 }, /* н to n */
1516 { 0x043E, 0x6f, 0x00 }, /* о to o */
1517 { 0x043F, 0x70, 0x00 }, /* п to p */
1518 { 0x0440, 0x72, 0x00 }, /* р to r */
1519 { 0x0441, 0x73, 0x00 }, /* с to s */
1520 { 0x0442, 0x74, 0x00 }, /* т to t */
1521 { 0x0443, 0x75, 0x00 }, /* у to u */
1522 { 0x0444, 0x66, 0x00 }, /* ф to f */
1523 { 0x0445, 0x6b, 0x68 }, /* х to kh */
1524 { 0x0446, 0x74, 0x63 }, /* ц to tc */
1525 { 0x0447, 0x63, 0x68 }, /* ч to ch */
1526 { 0x0448, 0x73, 0x68 }, /* ш to sh */
1527 { 0x0449, 0x73, 0x68 }, /* щ to shch */
1528 { 0x044A, 0x61, 0x00 }, /* to a */
1529 { 0x044B, 0x79, 0x00 }, /* ы to y */
1530 { 0x044C, 0x79, 0x00 }, /* to y */
1531 { 0x044D, 0x65, 0x00 }, /* э to e */
1532 { 0x044E, 0x69, 0x75 }, /* ю to iu */
1533 { 0x044F, 0x69, 0x61 }, /* я to ia */
1534 { 0x0450, 0x65, 0x00 }, /* ѐ to e */
1535 { 0x0451, 0x65, 0x00 }, /* ё to e */
1536 { 0x0452, 0x64, 0x00 }, /* ђ to d */
1537 { 0x0453, 0x67, 0x00 }, /* ѓ to g */
1538 { 0x0454, 0x65, 0x00 }, /* є to e */
1539 { 0x0455, 0x7a, 0x00 }, /* ѕ to z */
1540 { 0x0456, 0x69, 0x00 }, /* і to i */
1541 { 0x0457, 0x69, 0x00 }, /* ї to i */
1542 { 0x0458, 0x6a, 0x00 }, /* ј to j */
1543 { 0x0459, 0x69, 0x00 }, /* љ to i */
1544 { 0x045A, 0x6e, 0x00 }, /* њ to n */
1545 { 0x045B, 0x64, 0x00 }, /* ћ to d */
1546 { 0x045C, 0x6b, 0x00 }, /* ќ to k */
1547 { 0x045D, 0x69, 0x00 }, /* ѝ to i */
1548 { 0x045E, 0x75, 0x00 }, /* ў to u */
1549 { 0x045F, 0x64, 0x00 }, /* џ to d */
1550 { 0x1E02, 0x42, 0x00 }, /* Ḃ to B */
1551 { 0x1E03, 0x62, 0x00 }, /* ḃ to b */
1552 { 0x1E0A, 0x44, 0x00 }, /* Ḋ to D */
1553 { 0x1E0B, 0x64, 0x00 }, /* ḋ to d */
1554 { 0x1E1E, 0x46, 0x00 }, /* Ḟ to F */
1555 { 0x1E1F, 0x66, 0x00 }, /* ḟ to f */
1556 { 0x1E40, 0x4D, 0x00 }, /* Ṁ to M */
1557 { 0x1E41, 0x6D, 0x00 }, /* ṁ to m */
1558 { 0x1E56, 0x50, 0x00 }, /* Ṗ to P */
1559 { 0x1E57, 0x70, 0x00 }, /* ṗ to p */
1560 { 0x1E60, 0x53, 0x00 }, /* Ṡ to S */
1561 { 0x1E61, 0x73, 0x00 }, /* ṡ to s */
1562 { 0x1E6A, 0x54, 0x00 }, /* Ṫ to T */
1563 { 0x1E6B, 0x74, 0x00 }, /* ṫ to t */
1564 { 0x1E80, 0x57, 0x00 }, /* Ẁ to W */
1565 { 0x1E81, 0x77, 0x00 }, /* ẁ to w */
1566 { 0x1E82, 0x57, 0x00 }, /* Ẃ to W */
1567 { 0x1E83, 0x77, 0x00 }, /* ẃ to w */
1568 { 0x1E84, 0x57, 0x00 }, /* Ẅ to W */
1569 { 0x1E85, 0x77, 0x00 }, /* ẅ to w */
1570 { 0x1EF2, 0x59, 0x00 }, /* Ỳ to Y */
1571 { 0x1EF3, 0x79, 0x00 }, /* ỳ to y */
1572 { 0xFB00, 0x66, 0x66 }, /* ff to ff */
1573 { 0xFB01, 0x66, 0x69 }, /* fi to fi */
1574 { 0xFB02, 0x66, 0x6C }, /* fl to fl */
1575 { 0xFB05, 0x73, 0x74 }, /* ſt to st */
1576 { 0xFB06, 0x73, 0x74 }, /* st to st */
1580 ** Convert the input string from UTF-8 into pure ASCII by converting
1581 ** all non-ASCII characters to some combination of characters in the
1584 ** The returned string might contain more characters than the input.
1586 ** Space to hold the returned string comes from sqlite3_malloc() and
1587 ** should be freed by the caller.
1589 static unsigned char *transliterate(const unsigned char *zIn
, int nIn
){
1590 unsigned char *zOut
= sqlite3_malloc( nIn
*4 + 1 );
1592 if( zOut
==0 ) return 0;
1595 c
= utf8Read(zIn
, nIn
, &sz
);
1602 xTop
= sizeof(translit
)/sizeof(translit
[0]) - 1;
1604 while( xTop
>=xBtm
){
1605 x
= (xTop
+ xBtm
)/2;
1606 if( translit
[x
].cFrom
==c
){
1607 zOut
[nOut
++] = translit
[x
].cTo0
;
1608 if( translit
[x
].cTo1
){
1609 zOut
[nOut
++] = translit
[x
].cTo1
;
1610 /* Add an extra "ch" after the "sh" for Щ and щ */
1611 if( c
==0x0429 || c
== 0x0449 ){
1618 }else if( translit
[x
].cFrom
>c
){
1624 if( c
) zOut
[nOut
++] = '?';
1632 ** Return the number of characters in the shortest prefix of the input
1633 ** string that transliterates to an ASCII string nTrans bytes or longer.
1634 ** Or, if the transliteration of the input string is less than nTrans
1635 ** bytes in size, return the number of characters in the input string.
1637 static int translen_to_charlen(const char *zIn
, int nIn
, int nTrans
){
1642 for(nChar
=0; i
<nIn
&& nOut
<nTrans
; nChar
++){
1643 c
= utf8Read((const unsigned char *)&zIn
[i
], nIn
-i
, &sz
);
1649 xTop
= sizeof(translit
)/sizeof(translit
[0]) - 1;
1651 while( xTop
>=xBtm
){
1652 x
= (xTop
+ xBtm
)/2;
1653 if( translit
[x
].cFrom
==c
){
1654 if( translit
[x
].cTo1
) nOut
++;
1655 if( c
==0x0429 || c
== 0x0449 ) nOut
+= 2;
1657 }else if( translit
[x
].cFrom
>c
){
1671 ** spellfix1_translit(X)
1673 ** Convert a string that contains non-ASCII Roman characters into
1676 static void transliterateSqlFunc(
1677 sqlite3_context
*context
,
1679 sqlite3_value
**argv
1681 const unsigned char *zIn
= sqlite3_value_text(argv
[0]);
1682 int nIn
= sqlite3_value_bytes(argv
[0]);
1683 unsigned char *zOut
= transliterate(zIn
, nIn
);
1685 sqlite3_result_error_nomem(context
);
1687 sqlite3_result_text(context
, (char*)zOut
, -1, sqlite3_free
);
1692 ** spellfix1_scriptcode(X)
1694 ** Try to determine the dominant script used by the word X and return
1695 ** its ISO 15924 numeric code.
1697 ** The current implementation only understands the following scripts:
1703 ** This routine will return 998 if the input X contains characters from
1704 ** two or more of the above scripts or 999 if X contains no characters
1705 ** from any of the above scripts.
1707 static void scriptCodeSqlFunc(
1708 sqlite3_context
*context
,
1710 sqlite3_value
**argv
1712 const unsigned char *zIn
= sqlite3_value_text(argv
[0]);
1713 int nIn
= sqlite3_value_bytes(argv
[0]);
1717 # define SCRIPT_LATIN 0x0001
1718 # define SCRIPT_CYRILLIC 0x0002
1719 # define SCRIPT_GREEK 0x0004
1722 c
= utf8Read(zIn
, nIn
, &sz
);
1726 scriptMask
|= SCRIPT_LATIN
;
1727 }else if( c
>=0x0400 && c
<=0x04ff ){
1728 scriptMask
|= SCRIPT_CYRILLIC
;
1729 }else if( c
>=0x0386 && c
<=0x03ce ){
1730 scriptMask
|= SCRIPT_GREEK
;
1733 switch( scriptMask
){
1734 case 0: res
= 999; break;
1735 case SCRIPT_LATIN
: res
= 215; break;
1736 case SCRIPT_CYRILLIC
: res
= 220; break;
1737 case SCRIPT_GREEK
: res
= 200; break;
1738 default: res
= 998; break;
1740 sqlite3_result_int(context
, res
);
1743 /* End transliterate
1744 ******************************************************************************
1745 ******************************************************************************
1746 ** Begin spellfix1 virtual table.
1749 /* Maximum length of a phonehash used for querying the shadow table */
1750 #define SPELLFIX_MX_HASH 8
1752 /* Maximum number of hash strings to examine per query */
1753 #define SPELLFIX_MX_RUN 1
1755 typedef struct spellfix1_vtab spellfix1_vtab
;
1756 typedef struct spellfix1_cursor spellfix1_cursor
;
1758 /* Fuzzy-search virtual table object */
1759 struct spellfix1_vtab
{
1760 sqlite3_vtab base
; /* Base class - must be first */
1761 sqlite3
*db
; /* Database connection */
1762 char *zDbName
; /* Name of database holding this table */
1763 char *zTableName
; /* Name of the virtual table */
1764 char *zCostTable
; /* Table holding edit-distance cost numbers */
1765 EditDist3Config
*pConfig3
; /* Parsed edit distance costs */
1768 /* Fuzzy-search cursor object */
1769 struct spellfix1_cursor
{
1770 sqlite3_vtab_cursor base
; /* Base class - must be first */
1771 spellfix1_vtab
*pVTab
; /* The table to which this cursor belongs */
1772 char *zPattern
; /* rhs of MATCH clause */
1773 int nRow
; /* Number of rows of content */
1774 int nAlloc
; /* Number of allocated rows */
1775 int iRow
; /* Current row of content */
1776 int iLang
; /* Value of the langid= constraint */
1777 int iTop
; /* Value of the top= constraint */
1778 int iScope
; /* Value of the scope= constraint */
1779 int nSearch
; /* Number of vocabulary items checked */
1780 sqlite3_stmt
*pFullScan
; /* Shadow query for a full table scan */
1781 struct spellfix1_row
{ /* For each row of content */
1782 sqlite3_int64 iRowid
; /* Rowid for this row */
1783 char *zWord
; /* Text for this row */
1784 int iRank
; /* Rank for this row */
1785 int iDistance
; /* Distance from pattern for this row */
1786 int iScore
; /* Score for sorting */
1787 int iMatchlen
; /* Value of matchlen column (or -1) */
1788 char zHash
[SPELLFIX_MX_HASH
]; /* the phonehash used for this match */
1793 ** Construct one or more SQL statements from the format string given
1794 ** and then evaluate those statements. The success code is written
1797 ** If *pRc is initially non-zero then this routine is a no-op.
1799 static void spellfix1DbExec(
1800 int *pRc
, /* Success code */
1801 sqlite3
*db
, /* Database in which to run SQL */
1802 const char *zFormat
, /* Format string for SQL */
1803 ... /* Arguments to the format string */
1808 va_start(ap
, zFormat
);
1809 zSql
= sqlite3_vmprintf(zFormat
, ap
);
1812 *pRc
= SQLITE_NOMEM
;
1814 *pRc
= sqlite3_exec(db
, zSql
, 0, 0, 0);
1820 ** xDisconnect/xDestroy method for the fuzzy-search module.
1822 static int spellfix1Uninit(int isDestroy
, sqlite3_vtab
*pVTab
){
1823 spellfix1_vtab
*p
= (spellfix1_vtab
*)pVTab
;
1826 sqlite3
*db
= p
->db
;
1827 spellfix1DbExec(&rc
, db
, "DROP TABLE IF EXISTS \"%w\".\"%w_vocab\"",
1828 p
->zDbName
, p
->zTableName
);
1830 if( rc
==SQLITE_OK
){
1831 sqlite3_free(p
->zTableName
);
1832 editDist3ConfigDelete(p
->pConfig3
);
1833 sqlite3_free(p
->zCostTable
);
1838 static int spellfix1Disconnect(sqlite3_vtab
*pVTab
){
1839 return spellfix1Uninit(0, pVTab
);
1841 static int spellfix1Destroy(sqlite3_vtab
*pVTab
){
1842 return spellfix1Uninit(1, pVTab
);
1846 ** Make a copy of a string. Remove leading and trailing whitespace
1849 static char *spellfix1Dequote(const char *zIn
){
1853 while( isspace(zIn
[0]) ) zIn
++;
1854 zOut
= sqlite3_mprintf("%s", zIn
);
1855 if( zOut
==0 ) return 0;
1856 i
= (int)strlen(zOut
);
1857 #if 0 /* The parser will never leave spaces at the end */
1858 while( i
>0 && isspace(zOut
[i
-1]) ){ i
--; }
1862 if( c
=='\'' || c
=='"' ){
1863 for(i
=1, j
=0; ALWAYS(zOut
[i
]); i
++){
1864 zOut
[j
++] = zOut
[i
];
1880 ** xConnect/xCreate method for the spellfix1 module. Arguments are:
1882 ** argv[0] -> module name ("spellfix1")
1883 ** argv[1] -> database name
1884 ** argv[2] -> table name
1885 ** argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter)
1887 static int spellfix1Init(
1891 int argc
, const char *const*argv
,
1892 sqlite3_vtab
**ppVTab
,
1895 spellfix1_vtab
*pNew
= 0;
1896 /* const char *zModule = argv[0]; // not used */
1897 const char *zDbName
= argv
[1];
1898 const char *zTableName
= argv
[2];
1903 nDbName
= (int)strlen(zDbName
);
1904 pNew
= sqlite3_malloc( sizeof(*pNew
) + nDbName
+ 1);
1908 memset(pNew
, 0, sizeof(*pNew
));
1909 pNew
->zDbName
= (char*)&pNew
[1];
1910 memcpy(pNew
->zDbName
, zDbName
, nDbName
+1);
1911 pNew
->zTableName
= sqlite3_mprintf("%s", zTableName
);
1913 if( pNew
->zTableName
==0 ){
1916 rc
= sqlite3_declare_vtab(db
,
1917 "CREATE TABLE x(word,rank,distance,langid, "
1918 "score, matchlen, phonehash HIDDEN, "
1919 "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
1920 "soundslike HIDDEN, command HIDDEN)"
1922 #define SPELLFIX_COL_WORD 0
1923 #define SPELLFIX_COL_RANK 1
1924 #define SPELLFIX_COL_DISTANCE 2
1925 #define SPELLFIX_COL_LANGID 3
1926 #define SPELLFIX_COL_SCORE 4
1927 #define SPELLFIX_COL_MATCHLEN 5
1928 #define SPELLFIX_COL_PHONEHASH 6
1929 #define SPELLFIX_COL_TOP 7
1930 #define SPELLFIX_COL_SCOPE 8
1931 #define SPELLFIX_COL_SRCHCNT 9
1932 #define SPELLFIX_COL_SOUNDSLIKE 10
1933 #define SPELLFIX_COL_COMMAND 11
1935 if( rc
==SQLITE_OK
&& isCreate
){
1936 spellfix1DbExec(&rc
, db
,
1937 "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
1938 " id INTEGER PRIMARY KEY,\n"
1947 spellfix1DbExec(&rc
, db
,
1948 "CREATE INDEX IF NOT EXISTS \"%w\".\"%w_vocab_index_langid_k2\" "
1949 "ON \"%w_vocab\"(langid,k2);",
1950 zDbName
, zTableName
, zTableName
1953 for(i
=3; rc
==SQLITE_OK
&& i
<argc
; i
++){
1954 if( strncmp(argv
[i
],"edit_cost_table=",16)==0 && pNew
->zCostTable
==0 ){
1955 pNew
->zCostTable
= spellfix1Dequote(&argv
[i
][16]);
1956 if( pNew
->zCostTable
==0 ) rc
= SQLITE_NOMEM
;
1959 *pzErr
= sqlite3_mprintf("bad argument to spellfix1(): \"%s\"", argv
[i
]);
1966 spellfix1Uninit(0, &pNew
->base
);
1968 *ppVTab
= (sqlite3_vtab
*)pNew
;
1974 ** The xConnect and xCreate methods
1976 static int spellfix1Connect(
1979 int argc
, const char *const*argv
,
1980 sqlite3_vtab
**ppVTab
,
1983 return spellfix1Init(0, db
, pAux
, argc
, argv
, ppVTab
, pzErr
);
1985 static int spellfix1Create(
1988 int argc
, const char *const*argv
,
1989 sqlite3_vtab
**ppVTab
,
1992 return spellfix1Init(1, db
, pAux
, argc
, argv
, ppVTab
, pzErr
);
1996 ** Clear all of the content from a cursor.
1998 static void spellfix1ResetCursor(spellfix1_cursor
*pCur
){
2000 for(i
=0; i
<pCur
->nRow
; i
++){
2001 sqlite3_free(pCur
->a
[i
].zWord
);
2006 if( pCur
->pFullScan
){
2007 sqlite3_finalize(pCur
->pFullScan
);
2008 pCur
->pFullScan
= 0;
2013 ** Resize the cursor to hold up to N rows of content
2015 static void spellfix1ResizeCursor(spellfix1_cursor
*pCur
, int N
){
2016 struct spellfix1_row
*aNew
;
2017 assert( N
>=pCur
->nRow
);
2018 aNew
= sqlite3_realloc(pCur
->a
, sizeof(pCur
->a
[0])*N
);
2019 if( aNew
==0 && N
>0 ){
2020 spellfix1ResetCursor(pCur
);
2021 sqlite3_free(pCur
->a
);
2032 ** Close a fuzzy-search cursor.
2034 static int spellfix1Close(sqlite3_vtab_cursor
*cur
){
2035 spellfix1_cursor
*pCur
= (spellfix1_cursor
*)cur
;
2036 spellfix1ResetCursor(pCur
);
2037 spellfix1ResizeCursor(pCur
, 0);
2038 sqlite3_free(pCur
->zPattern
);
2044 ** Search for terms of these forms:
2046 ** (A) word MATCH $str
2047 ** (B) langid == $langid
2049 ** (D) scope = $scope
2050 ** (E) distance < $distance
2051 ** (F) distance <= $distance
2052 ** (G) rowid = $rowid
2054 ** The plan number is a bit mask formed with these bits:
2056 ** 0x01 (A) is found
2057 ** 0x02 (B) is found
2058 ** 0x04 (C) is found
2059 ** 0x08 (D) is found
2060 ** 0x10 (E) is found
2061 ** 0x20 (F) is found
2062 ** 0x40 (G) is found
2064 ** filter.argv[*] values contains $str, $langid, $top, $scope and $rowid
2065 ** if specified and in that order.
2067 static int spellfix1BestIndex(sqlite3_vtab
*tab
, sqlite3_index_info
*pIdxInfo
){
2071 int iScopeTerm
= -1;
2073 int iRowidTerm
= -1;
2075 const struct sqlite3_index_constraint
*pConstraint
;
2076 pConstraint
= pIdxInfo
->aConstraint
;
2077 for(i
=0; i
<pIdxInfo
->nConstraint
; i
++, pConstraint
++){
2078 if( pConstraint
->usable
==0 ) continue;
2080 /* Terms of the form: word MATCH $str */
2082 && pConstraint
->iColumn
==SPELLFIX_COL_WORD
2083 && pConstraint
->op
==SQLITE_INDEX_CONSTRAINT_MATCH
2086 pIdxInfo
->aConstraintUsage
[i
].argvIndex
= 1;
2087 pIdxInfo
->aConstraintUsage
[i
].omit
= 1;
2090 /* Terms of the form: langid = $langid */
2092 && pConstraint
->iColumn
==SPELLFIX_COL_LANGID
2093 && pConstraint
->op
==SQLITE_INDEX_CONSTRAINT_EQ
2099 /* Terms of the form: top = $top */
2101 && pConstraint
->iColumn
==SPELLFIX_COL_TOP
2102 && pConstraint
->op
==SQLITE_INDEX_CONSTRAINT_EQ
2108 /* Terms of the form: scope = $scope */
2110 && pConstraint
->iColumn
==SPELLFIX_COL_SCOPE
2111 && pConstraint
->op
==SQLITE_INDEX_CONSTRAINT_EQ
2117 /* Terms of the form: distance < $dist or distance <= $dist */
2118 if( (iPlan
& (16|32))==0
2119 && pConstraint
->iColumn
==SPELLFIX_COL_DISTANCE
2120 && (pConstraint
->op
==SQLITE_INDEX_CONSTRAINT_LT
2121 || pConstraint
->op
==SQLITE_INDEX_CONSTRAINT_LE
)
2123 iPlan
|= pConstraint
->op
==SQLITE_INDEX_CONSTRAINT_LT
? 16 : 32;
2127 /* Terms of the form: distance < $dist or distance <= $dist */
2129 && pConstraint
->iColumn
<0
2130 && pConstraint
->op
==SQLITE_INDEX_CONSTRAINT_EQ
2138 pIdxInfo
->idxNum
= iPlan
;
2139 if( pIdxInfo
->nOrderBy
==1
2140 && pIdxInfo
->aOrderBy
[0].iColumn
==SPELLFIX_COL_SCORE
2141 && pIdxInfo
->aOrderBy
[0].desc
==0
2143 pIdxInfo
->orderByConsumed
= 1; /* Default order by iScore */
2146 pIdxInfo
->aConstraintUsage
[iLangTerm
].argvIndex
= idx
++;
2147 pIdxInfo
->aConstraintUsage
[iLangTerm
].omit
= 1;
2150 pIdxInfo
->aConstraintUsage
[iTopTerm
].argvIndex
= idx
++;
2151 pIdxInfo
->aConstraintUsage
[iTopTerm
].omit
= 1;
2154 pIdxInfo
->aConstraintUsage
[iScopeTerm
].argvIndex
= idx
++;
2155 pIdxInfo
->aConstraintUsage
[iScopeTerm
].omit
= 1;
2157 if( iPlan
&(16|32) ){
2158 pIdxInfo
->aConstraintUsage
[iDistTerm
].argvIndex
= idx
++;
2159 pIdxInfo
->aConstraintUsage
[iDistTerm
].omit
= 1;
2161 pIdxInfo
->estimatedCost
= 1e5
;
2162 }else if( (iPlan
& 64) ){
2163 pIdxInfo
->idxNum
= 64;
2164 pIdxInfo
->aConstraintUsage
[iRowidTerm
].argvIndex
= 1;
2165 pIdxInfo
->aConstraintUsage
[iRowidTerm
].omit
= 1;
2166 pIdxInfo
->estimatedCost
= 5;
2168 pIdxInfo
->idxNum
= 0;
2169 pIdxInfo
->estimatedCost
= 1e50
;
2175 ** Open a new fuzzy-search cursor.
2177 static int spellfix1Open(sqlite3_vtab
*pVTab
, sqlite3_vtab_cursor
**ppCursor
){
2178 spellfix1_vtab
*p
= (spellfix1_vtab
*)pVTab
;
2179 spellfix1_cursor
*pCur
;
2180 pCur
= sqlite3_malloc( sizeof(*pCur
) );
2181 if( pCur
==0 ) return SQLITE_NOMEM
;
2182 memset(pCur
, 0, sizeof(*pCur
));
2184 *ppCursor
= &pCur
->base
;
2189 ** Adjust a distance measurement by the words rank in order to show
2190 ** preference to common words.
2192 static int spellfix1Score(int iDistance
, int iRank
){
2194 for(iLog2
=0; iRank
>0; iLog2
++, iRank
>>=1){}
2195 return iDistance
+ 32 - iLog2
;
2199 ** Compare two spellfix1_row objects for sorting purposes in qsort() such
2200 ** that they sort in order of increasing distance.
2202 static int spellfix1RowCompare(const void *A
, const void *B
){
2203 const struct spellfix1_row
*a
= (const struct spellfix1_row
*)A
;
2204 const struct spellfix1_row
*b
= (const struct spellfix1_row
*)B
;
2205 return a
->iScore
- b
->iScore
;
2209 ** A structure used to pass information from spellfix1FilterForMatch()
2210 ** into spellfix1RunQuery().
2212 typedef struct MatchQuery
{
2213 spellfix1_cursor
*pCur
; /* The cursor being queried */
2214 sqlite3_stmt
*pStmt
; /* shadow table query statment */
2215 char zHash
[SPELLFIX_MX_HASH
]; /* The current phonehash for zPattern */
2216 const char *zPattern
; /* Transliterated input string */
2217 int nPattern
; /* Length of zPattern */
2218 EditDist3FromString
*pMatchStr3
; /* Original unicode string */
2219 EditDist3Config
*pConfig3
; /* Edit-distance cost coefficients */
2220 const EditDist3Lang
*pLang
; /* The selected language coefficients */
2221 int iLang
; /* The language id */
2222 int iScope
; /* Default scope */
2223 int iMaxDist
; /* Maximum allowed edit distance, or -1 */
2224 int rc
; /* Error code */
2225 int nRun
; /* Number of prior runs for the same zPattern */
2226 char azPrior
[SPELLFIX_MX_RUN
][SPELLFIX_MX_HASH
]; /* Prior hashes */
2230 ** Run a query looking for the best matches against zPattern using
2231 ** zHash as the character class seed hash.
2233 static void spellfix1RunQuery(MatchQuery
*p
, const char *zQuery
, int nQuery
){
2243 int iScope
= p
->iScope
;
2244 spellfix1_cursor
*pCur
= p
->pCur
;
2245 sqlite3_stmt
*pStmt
= p
->pStmt
;
2246 char zHash1
[SPELLFIX_MX_HASH
];
2247 char zHash2
[SPELLFIX_MX_HASH
];
2252 if( pCur
->a
==0 || p
->rc
) return; /* Prior memory allocation failure */
2253 zClass
= (char*)phoneticHash((unsigned char*)zQuery
, nQuery
);
2255 p
->rc
= SQLITE_NOMEM
;
2258 nClass
= (int)strlen(zClass
);
2259 if( nClass
>SPELLFIX_MX_HASH
-2 ){
2260 nClass
= SPELLFIX_MX_HASH
-2;
2263 if( nClass
<=iScope
){
2270 memcpy(zHash1
, zClass
, iScope
);
2271 sqlite3_free(zClass
);
2273 memcpy(zHash2
, zHash1
, iScope
);
2274 zHash2
[iScope
] = 'Z';
2275 zHash2
[iScope
+1] = 0;
2276 #if SPELLFIX_MX_RUN>1
2277 for(i
=0; i
<p
->nRun
; i
++){
2278 if( strcmp(p
->azPrior
[i
], zHash1
)==0 ) return;
2281 assert( p
->nRun
<SPELLFIX_MX_RUN
);
2282 memcpy(p
->azPrior
[p
->nRun
++], zHash1
, iScope
+1);
2283 if( sqlite3_bind_text(pStmt
, 1, zHash1
, -1, SQLITE_STATIC
)==SQLITE_NOMEM
2284 || sqlite3_bind_text(pStmt
, 2, zHash2
, -1, SQLITE_STATIC
)==SQLITE_NOMEM
2286 p
->rc
= SQLITE_NOMEM
;
2289 #if SPELLFIX_MX_RUN>1
2290 for(i
=0; i
<pCur
->nRow
; i
++){
2291 if( pCur
->a
[i
].iScore
>iWorst
){
2292 iWorst
= pCur
->a
[i
].iScore
;
2297 while( sqlite3_step(pStmt
)==SQLITE_ROW
){
2299 iRank
= sqlite3_column_int(pStmt
, 2);
2300 if( p
->pMatchStr3
){
2301 int nWord
= sqlite3_column_bytes(pStmt
, 1);
2302 zWord
= (const char*)sqlite3_column_text(pStmt
, 1);
2303 iDist
= editDist3Core(p
->pMatchStr3
, zWord
, nWord
, p
->pLang
, &iMatchlen
);
2305 zK1
= (const char*)sqlite3_column_text(pStmt
, 3);
2306 if( zK1
==0 ) continue;
2307 iDist
= editdist1(p
->zPattern
, zK1
, 0);
2310 p
->rc
= SQLITE_NOMEM
;
2314 iScore
= spellfix1Score(iDist
,iRank
);
2315 if( p
->iMaxDist
>=0 ){
2316 if( iDist
>p
->iMaxDist
) continue;
2317 if( pCur
->nRow
>=pCur
->nAlloc
-1 ){
2318 spellfix1ResizeCursor(pCur
, pCur
->nAlloc
*2 + 10);
2319 if( pCur
->a
==0 ) break;
2322 }else if( pCur
->nRow
<pCur
->nAlloc
){
2324 }else if( iScore
<iWorst
){
2326 sqlite3_free(pCur
->a
[idx
].zWord
);
2330 pCur
->a
[idx
].zWord
= sqlite3_mprintf("%s", sqlite3_column_text(pStmt
, 1));
2331 if( pCur
->a
[idx
].zWord
==0 ){
2332 p
->rc
= SQLITE_NOMEM
;
2335 pCur
->a
[idx
].iRowid
= sqlite3_column_int64(pStmt
, 0);
2336 pCur
->a
[idx
].iRank
= iRank
;
2337 pCur
->a
[idx
].iDistance
= iDist
;
2338 pCur
->a
[idx
].iScore
= iScore
;
2339 pCur
->a
[idx
].iMatchlen
= iMatchlen
;
2340 memcpy(pCur
->a
[idx
].zHash
, zHash1
, iScope
+1);
2341 if( pCur
->nRow
<pCur
->nAlloc
) pCur
->nRow
++;
2342 if( pCur
->nRow
==pCur
->nAlloc
){
2343 iWorst
= pCur
->a
[0].iScore
;
2345 for(i
=1; i
<pCur
->nRow
; i
++){
2346 iScore
= pCur
->a
[i
].iScore
;
2347 if( iWorst
<iScore
){
2354 rc
= sqlite3_reset(pStmt
);
2355 if( rc
) p
->rc
= rc
;
2359 ** This version of the xFilter method work if the MATCH term is present
2360 ** and we are doing a scan.
2362 static int spellfix1FilterForMatch(
2363 spellfix1_cursor
*pCur
,
2366 sqlite3_value
**argv
2368 const unsigned char *zMatchThis
; /* RHS of the MATCH operator */
2369 EditDist3FromString
*pMatchStr3
= 0; /* zMatchThis as an editdist string */
2370 char *zPattern
; /* Transliteration of zMatchThis */
2371 int nPattern
; /* Length of zPattern */
2372 int iLimit
= 20; /* Max number of rows of output */
2373 int iScope
= 3; /* Use this many characters of zClass */
2374 int iLang
= 0; /* Language code */
2375 char *zSql
; /* SQL of shadow table query */
2376 sqlite3_stmt
*pStmt
= 0; /* Shadow table query */
2377 int rc
; /* Result code */
2378 int idx
= 1; /* Next available filter parameter */
2379 spellfix1_vtab
*p
= pCur
->pVTab
; /* The virtual table that owns pCur */
2380 MatchQuery x
; /* For passing info to RunQuery() */
2382 /* Load the cost table if we have not already done so */
2383 if( p
->zCostTable
!=0 && p
->pConfig3
==0 ){
2384 p
->pConfig3
= sqlite3_malloc( sizeof(p
->pConfig3
[0]) );
2385 if( p
->pConfig3
==0 ) return SQLITE_NOMEM
;
2386 memset(p
->pConfig3
, 0, sizeof(p
->pConfig3
[0]));
2387 rc
= editDist3ConfigLoad(p
->pConfig3
, p
->db
, p
->zCostTable
);
2390 memset(&x
, 0, sizeof(x
));
2391 x
.iScope
= 3; /* Default scope if none specified by "WHERE scope=N" */
2392 x
.iMaxDist
= -1; /* Maximum allowed edit distance */
2395 iLang
= sqlite3_value_int(argv
[idx
++]);
2398 iLimit
= sqlite3_value_int(argv
[idx
++]);
2399 if( iLimit
<1 ) iLimit
= 1;
2402 x
.iScope
= sqlite3_value_int(argv
[idx
++]);
2403 if( x
.iScope
<1 ) x
.iScope
= 1;
2404 if( x
.iScope
>SPELLFIX_MX_HASH
-2 ) x
.iScope
= SPELLFIX_MX_HASH
-2;
2406 if( idxNum
&(16|32) ){
2407 x
.iMaxDist
= sqlite3_value_int(argv
[idx
++]);
2408 if( idxNum
&16 ) x
.iMaxDist
--;
2409 if( x
.iMaxDist
<0 ) x
.iMaxDist
= 0;
2411 spellfix1ResetCursor(pCur
);
2412 spellfix1ResizeCursor(pCur
, iLimit
);
2413 zMatchThis
= sqlite3_value_text(argv
[0]);
2414 if( zMatchThis
==0 ) return SQLITE_OK
;
2416 x
.pLang
= editDist3FindLang(p
->pConfig3
, iLang
);
2417 pMatchStr3
= editDist3FromStringNew(x
.pLang
, (const char*)zMatchThis
, -1);
2418 if( pMatchStr3
==0 ){
2419 x
.rc
= SQLITE_NOMEM
;
2425 zPattern
= (char*)transliterate(zMatchThis
, sqlite3_value_bytes(argv
[0]));
2426 sqlite3_free(pCur
->zPattern
);
2427 pCur
->zPattern
= zPattern
;
2429 x
.rc
= SQLITE_NOMEM
;
2432 nPattern
= (int)strlen(zPattern
);
2433 if( zPattern
[nPattern
-1]=='*' ) nPattern
--;
2434 zSql
= sqlite3_mprintf(
2435 "SELECT id, word, rank, k1"
2436 " FROM \"%w\".\"%w_vocab\""
2437 " WHERE langid=%d AND k2>=?1 AND k2<?2",
2438 p
->zDbName
, p
->zTableName
, iLang
2441 x
.rc
= SQLITE_NOMEM
;
2445 rc
= sqlite3_prepare_v2(p
->db
, zSql
, -1, &pStmt
, 0);
2447 pCur
->iLang
= iLang
;
2450 x
.zPattern
= zPattern
;
2451 x
.nPattern
= nPattern
;
2452 x
.pMatchStr3
= pMatchStr3
;
2455 x
.pConfig3
= p
->pConfig3
;
2456 if( x
.rc
==SQLITE_OK
){
2457 spellfix1RunQuery(&x
, zPattern
, nPattern
);
2461 qsort(pCur
->a
, pCur
->nRow
, sizeof(pCur
->a
[0]), spellfix1RowCompare
);
2462 pCur
->iTop
= iLimit
;
2463 pCur
->iScope
= iScope
;
2465 x
.rc
= SQLITE_NOMEM
;
2469 sqlite3_finalize(pStmt
);
2470 editDist3FromStringDelete(pMatchStr3
);
2475 ** This version of xFilter handles a full-table scan case
2477 static int spellfix1FilterForFullScan(
2478 spellfix1_cursor
*pCur
,
2481 sqlite3_value
**argv
2485 spellfix1_vtab
*pVTab
= pCur
->pVTab
;
2486 spellfix1ResetCursor(pCur
);
2487 assert( idxNum
==0 || idxNum
==64 );
2488 zSql
= sqlite3_mprintf(
2489 "SELECT word, rank, NULL, langid, id FROM \"%w\".\"%w_vocab\"%s",
2490 pVTab
->zDbName
, pVTab
->zTableName
,
2491 ((idxNum
& 64) ? " WHERE rowid=?" : "")
2493 if( zSql
==0 ) return SQLITE_NOMEM
;
2494 rc
= sqlite3_prepare_v2(pVTab
->db
, zSql
, -1, &pCur
->pFullScan
, 0);
2496 if( rc
==SQLITE_OK
&& (idxNum
& 64) ){
2498 rc
= sqlite3_bind_value(pCur
->pFullScan
, 1, argv
[0]);
2500 pCur
->nRow
= pCur
->iRow
= 0;
2501 if( rc
==SQLITE_OK
){
2502 rc
= sqlite3_step(pCur
->pFullScan
);
2503 if( rc
==SQLITE_ROW
){ pCur
->iRow
= -1; rc
= SQLITE_OK
; }
2504 if( rc
==SQLITE_DONE
){ rc
= SQLITE_OK
; }
2513 ** Called to "rewind" a cursor back to the beginning so that
2514 ** it starts its output over again. Always called at least once
2515 ** prior to any spellfix1Column, spellfix1Rowid, or spellfix1Eof call.
2517 static int spellfix1Filter(
2518 sqlite3_vtab_cursor
*cur
,
2519 int idxNum
, const char *idxStr
,
2520 int argc
, sqlite3_value
**argv
2522 spellfix1_cursor
*pCur
= (spellfix1_cursor
*)cur
;
2525 rc
= spellfix1FilterForMatch(pCur
, idxNum
, argc
, argv
);
2527 rc
= spellfix1FilterForFullScan(pCur
, idxNum
, argc
, argv
);
2534 ** Advance a cursor to its next row of output
2536 static int spellfix1Next(sqlite3_vtab_cursor
*cur
){
2537 spellfix1_cursor
*pCur
= (spellfix1_cursor
*)cur
;
2539 if( pCur
->iRow
< pCur
->nRow
){
2540 if( pCur
->pFullScan
){
2541 rc
= sqlite3_step(pCur
->pFullScan
);
2542 if( rc
!=SQLITE_ROW
) pCur
->iRow
= pCur
->nRow
;
2543 if( rc
==SQLITE_ROW
|| rc
==SQLITE_DONE
) rc
= SQLITE_OK
;
2552 ** Return TRUE if we are at the end-of-file
2554 static int spellfix1Eof(sqlite3_vtab_cursor
*cur
){
2555 spellfix1_cursor
*pCur
= (spellfix1_cursor
*)cur
;
2556 return pCur
->iRow
>=pCur
->nRow
;
2560 ** Return columns from the current row.
2562 static int spellfix1Column(
2563 sqlite3_vtab_cursor
*cur
,
2564 sqlite3_context
*ctx
,
2567 spellfix1_cursor
*pCur
= (spellfix1_cursor
*)cur
;
2568 if( pCur
->pFullScan
){
2569 if( i
<=SPELLFIX_COL_LANGID
){
2570 sqlite3_result_value(ctx
, sqlite3_column_value(pCur
->pFullScan
, i
));
2572 sqlite3_result_null(ctx
);
2577 case SPELLFIX_COL_WORD
: {
2578 sqlite3_result_text(ctx
, pCur
->a
[pCur
->iRow
].zWord
, -1, SQLITE_STATIC
);
2581 case SPELLFIX_COL_RANK
: {
2582 sqlite3_result_int(ctx
, pCur
->a
[pCur
->iRow
].iRank
);
2585 case SPELLFIX_COL_DISTANCE
: {
2586 sqlite3_result_int(ctx
, pCur
->a
[pCur
->iRow
].iDistance
);
2589 case SPELLFIX_COL_LANGID
: {
2590 sqlite3_result_int(ctx
, pCur
->iLang
);
2593 case SPELLFIX_COL_SCORE
: {
2594 sqlite3_result_int(ctx
, pCur
->a
[pCur
->iRow
].iScore
);
2597 case SPELLFIX_COL_MATCHLEN
: {
2598 int iMatchlen
= pCur
->a
[pCur
->iRow
].iMatchlen
;
2600 int nPattern
= (int)strlen(pCur
->zPattern
);
2601 char *zWord
= pCur
->a
[pCur
->iRow
].zWord
;
2602 int nWord
= (int)strlen(zWord
);
2604 if( nPattern
>0 && pCur
->zPattern
[nPattern
-1]=='*' ){
2607 zTranslit
= (char *)transliterate((unsigned char *)zWord
, nWord
);
2608 if( !zTranslit
) return SQLITE_NOMEM
;
2609 res
= editdist1(pCur
->zPattern
, zTranslit
, &iMatchlen
);
2610 sqlite3_free(zTranslit
);
2611 if( res
<0 ) return SQLITE_NOMEM
;
2612 iMatchlen
= translen_to_charlen(zWord
, nWord
, iMatchlen
);
2614 iMatchlen
= utf8Charlen(zWord
, nWord
);
2618 sqlite3_result_int(ctx
, iMatchlen
);
2621 case SPELLFIX_COL_PHONEHASH
: {
2622 sqlite3_result_text(ctx
, pCur
->a
[pCur
->iRow
].zHash
, -1, SQLITE_STATIC
);
2625 case SPELLFIX_COL_TOP
: {
2626 sqlite3_result_int(ctx
, pCur
->iTop
);
2629 case SPELLFIX_COL_SCOPE
: {
2630 sqlite3_result_int(ctx
, pCur
->iScope
);
2633 case SPELLFIX_COL_SRCHCNT
: {
2634 sqlite3_result_int(ctx
, pCur
->nSearch
);
2638 sqlite3_result_null(ctx
);
2648 static int spellfix1Rowid(sqlite3_vtab_cursor
*cur
, sqlite_int64
*pRowid
){
2649 spellfix1_cursor
*pCur
= (spellfix1_cursor
*)cur
;
2650 if( pCur
->pFullScan
){
2651 *pRowid
= sqlite3_column_int64(pCur
->pFullScan
, 4);
2653 *pRowid
= pCur
->a
[pCur
->iRow
].iRowid
;
2659 ** The xUpdate() method.
2661 static int spellfix1Update(
2662 sqlite3_vtab
*pVTab
,
2664 sqlite3_value
**argv
,
2665 sqlite_int64
*pRowid
2668 sqlite3_int64 rowid
, newRowid
;
2669 spellfix1_vtab
*p
= (spellfix1_vtab
*)pVTab
;
2670 sqlite3
*db
= p
->db
;
2673 /* A delete operation on the rowid given by argv[0] */
2674 rowid
= *pRowid
= sqlite3_value_int64(argv
[0]);
2675 spellfix1DbExec(&rc
, db
, "DELETE FROM \"%w\".\"%w_vocab\" "
2677 p
->zDbName
, p
->zTableName
, rowid
);
2679 const unsigned char *zWord
= sqlite3_value_text(argv
[SPELLFIX_COL_WORD
+2]);
2680 int nWord
= sqlite3_value_bytes(argv
[SPELLFIX_COL_WORD
+2]);
2681 int iLang
= sqlite3_value_int(argv
[SPELLFIX_COL_LANGID
+2]);
2682 int iRank
= sqlite3_value_int(argv
[SPELLFIX_COL_RANK
+2]);
2683 const unsigned char *zSoundslike
=
2684 sqlite3_value_text(argv
[SPELLFIX_COL_SOUNDSLIKE
+2]);
2685 int nSoundslike
= sqlite3_value_bytes(argv
[SPELLFIX_COL_SOUNDSLIKE
+2]);
2691 /* Inserts of the form: INSERT INTO table(command) VALUES('xyzzy');
2692 ** cause zWord to be NULL, so we look at the "command" column to see
2693 ** what special actions to take */
2695 (const char*)sqlite3_value_text(argv
[SPELLFIX_COL_COMMAND
+2]);
2697 pVTab
->zErrMsg
= sqlite3_mprintf("NOT NULL constraint failed: %s.word",
2699 return SQLITE_CONSTRAINT_NOTNULL
;
2701 if( strcmp(zCmd
,"reset")==0 ){
2702 /* Reset the edit cost table (if there is one). */
2703 editDist3ConfigDelete(p
->pConfig3
);
2707 if( strncmp(zCmd
,"edit_cost_table=",16)==0 ){
2708 editDist3ConfigDelete(p
->pConfig3
);
2710 sqlite3_free(p
->zCostTable
);
2711 p
->zCostTable
= spellfix1Dequote(zCmd
+16);
2712 if( p
->zCostTable
==0 ) return SQLITE_NOMEM
;
2713 if( p
->zCostTable
[0]==0 || sqlite3_stricmp(p
->zCostTable
,"null")==0 ){
2714 sqlite3_free(p
->zCostTable
);
2719 pVTab
->zErrMsg
= sqlite3_mprintf("unknown value for %s.command: \"%w\"",
2720 p
->zTableName
, zCmd
);
2721 return SQLITE_ERROR
;
2723 if( iRank
<1 ) iRank
= 1;
2725 zK1
= (char*)transliterate(zSoundslike
, nSoundslike
);
2727 zK1
= (char*)transliterate(zWord
, nWord
);
2729 if( zK1
==0 ) return SQLITE_NOMEM
;
2730 for(i
=0; (c
= zK1
[i
])!=0; i
++){
2731 if( c
>='A' && c
<='Z' ) zK1
[i
] += 'a' - 'A';
2733 zK2
= (char*)phoneticHash((const unsigned char*)zK1
, i
);
2736 return SQLITE_NOMEM
;
2738 if( sqlite3_value_type(argv
[0])==SQLITE_NULL
){
2739 if( sqlite3_value_type(argv
[1])==SQLITE_NULL
){
2740 spellfix1DbExec(&rc
, db
,
2741 "INSERT INTO \"%w\".\"%w_vocab\"(rank,langid,word,k1,k2) "
2742 "VALUES(%d,%d,%Q,%Q,%Q)",
2743 p
->zDbName
, p
->zTableName
,
2744 iRank
, iLang
, zWord
, zK1
, zK2
2747 newRowid
= sqlite3_value_int64(argv
[1]);
2748 spellfix1DbExec(&rc
, db
,
2749 "INSERT INTO \"%w\".\"%w_vocab\"(id,rank,langid,word,k1,k2) "
2750 "VALUES(%lld,%d,%d,%Q,%Q,%Q)",
2751 p
->zDbName
, p
->zTableName
,
2752 newRowid
, iRank
, iLang
, zWord
, zK1
, zK2
2755 *pRowid
= sqlite3_last_insert_rowid(db
);
2757 rowid
= sqlite3_value_int64(argv
[0]);
2758 newRowid
= *pRowid
= sqlite3_value_int64(argv
[1]);
2759 spellfix1DbExec(&rc
, db
,
2760 "UPDATE \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, langid=%d,"
2761 " word=%Q, k1=%Q, k2=%Q WHERE id=%lld",
2762 p
->zDbName
, p
->zTableName
, newRowid
, iRank
, iLang
,
2763 zWord
, zK1
, zK2
, rowid
2773 ** Rename the spellfix1 table.
2775 static int spellfix1Rename(sqlite3_vtab
*pVTab
, const char *zNew
){
2776 spellfix1_vtab
*p
= (spellfix1_vtab
*)pVTab
;
2777 sqlite3
*db
= p
->db
;
2779 char *zNewName
= sqlite3_mprintf("%s", zNew
);
2781 return SQLITE_NOMEM
;
2783 spellfix1DbExec(&rc
, db
,
2784 "ALTER TABLE \"%w\".\"%w_vocab\" RENAME TO \"%w_vocab\"",
2785 p
->zDbName
, p
->zTableName
, zNewName
2787 if( rc
==SQLITE_OK
){
2788 sqlite3_free(p
->zTableName
);
2789 p
->zTableName
= zNewName
;
2791 sqlite3_free(zNewName
);
2798 ** A virtual table module that provides fuzzy search.
2800 static sqlite3_module spellfix1Module
= {
2802 spellfix1Create
, /* xCreate - handle CREATE VIRTUAL TABLE */
2803 spellfix1Connect
, /* xConnect - reconnected to an existing table */
2804 spellfix1BestIndex
, /* xBestIndex - figure out how to do a query */
2805 spellfix1Disconnect
, /* xDisconnect - close a connection */
2806 spellfix1Destroy
, /* xDestroy - handle DROP TABLE */
2807 spellfix1Open
, /* xOpen - open a cursor */
2808 spellfix1Close
, /* xClose - close a cursor */
2809 spellfix1Filter
, /* xFilter - configure scan constraints */
2810 spellfix1Next
, /* xNext - advance a cursor */
2811 spellfix1Eof
, /* xEof - check for end of scan */
2812 spellfix1Column
, /* xColumn - read data */
2813 spellfix1Rowid
, /* xRowid - read data */
2814 spellfix1Update
, /* xUpdate */
2819 0, /* xFindMethod */
2820 spellfix1Rename
, /* xRename */
2824 ** Register the various functions and the virtual table.
2826 static int spellfix1Register(sqlite3
*db
){
2829 rc
= sqlite3_create_function(db
, "spellfix1_translit", 1, SQLITE_UTF8
, 0,
2830 transliterateSqlFunc
, 0, 0);
2831 if( rc
==SQLITE_OK
){
2832 rc
= sqlite3_create_function(db
, "spellfix1_editdist", 2, SQLITE_UTF8
, 0,
2833 editdistSqlFunc
, 0, 0);
2835 if( rc
==SQLITE_OK
){
2836 rc
= sqlite3_create_function(db
, "spellfix1_phonehash", 1, SQLITE_UTF8
, 0,
2837 phoneticHashSqlFunc
, 0, 0);
2839 if( rc
==SQLITE_OK
){
2840 rc
= sqlite3_create_function(db
, "spellfix1_scriptcode", 1, SQLITE_UTF8
, 0,
2841 scriptCodeSqlFunc
, 0, 0);
2843 if( rc
==SQLITE_OK
){
2844 rc
= sqlite3_create_module(db
, "spellfix1", &spellfix1Module
, 0);
2846 if( rc
==SQLITE_OK
){
2847 rc
= editDist3Install(db
);
2850 /* Verify sanity of the translit[] table */
2851 for(i
=0; i
<sizeof(translit
)/sizeof(translit
[0])-1; i
++){
2852 assert( translit
[i
].cFrom
<translit
[i
+1].cFrom
);
2858 #endif /* SQLITE_OMIT_VIRTUALTABLE */
2861 ** Extension load function.
2864 __declspec(dllexport
)
2866 int sqlite3_spellfix_init(
2869 const sqlite3_api_routines
*pApi
2871 SQLITE_EXTENSION_INIT2(pApi
);
2872 #ifndef SQLITE_OMIT_VIRTUALTABLE
2873 return spellfix1Register(db
);