Snapshot of upstream SQLite 3.46.1
[sqlcipher.git] / ext / fts3 / fts3_unicode2.c
blobc510162496dd95c8d21c0b588b8f247114503aca
/*
** 2012-05-25
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
*/

/*
** DO NOT EDIT THIS MACHINE GENERATED FILE.
*/
18 #ifndef SQLITE_DISABLE_FTS3_UNICODE
19 #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)
21 #include <assert.h>
/*
** Return true if the argument corresponds to a unicode codepoint
** classified as either a letter or a number. Otherwise false.
**
** Codepoints below 128 are answered from a 128-bit bitmap. Codepoints
** below (1<<22) are answered by a binary search of a sorted table of
** "not alphanumeric" ranges. Any larger value is reported as alphanumeric.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3FtsUnicodeIsalnum(int c){
  /* Each unsigned integer in the following array corresponds to a contiguous
  ** range of unicode codepoints that are not either letters or numbers (i.e.
  ** codepoints for which this function should return 0).
  **
  ** The most significant 22 bits in each 32-bit value contain the first
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value
  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint
  ** C. It is not possible to represent a range larger than 1023 codepoints
  ** using this format.
  */
  static const unsigned int aEntry[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
    0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
    0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
    0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
    0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
    0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
    0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
    0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
    0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
    0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
    0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
    0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
    0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
    0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
    0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
    0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
    0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
    0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
    0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
    0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
    0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
    0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
    0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
    0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
    0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
    0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
    0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
    0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
    0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
    0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
    0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
    0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
    0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
    0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
    0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
    0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
    0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
    0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
    0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
    0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
    0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
    0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
    0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
    0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
    0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
    0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
    0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
    0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
    0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
    0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
    0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
    0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
    0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
    0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
    0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
    0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
    0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
    0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
    0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
    0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
    0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
    0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
    0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
    0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
    0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
    0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
    0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
    0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
    0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
    0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
    0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
    0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
    0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
    0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
    0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
    0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
    0x380400F0,
  };

  /* One bit per codepoint 0..127. A set bit marks a codepoint that is
  ** NOT a letter or a number. Word (c>>5), bit (c&0x1F). */
  static const unsigned int aAscii[4] = {
    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
  };

  if( (unsigned int)c<128 ){
    return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
  }else if( (unsigned int)c<(1<<22) ){
    /* Setting the low 10 bits of the key makes it compare >= every table
    ** entry whose starting codepoint is <= c, whatever that entry's size. */
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes = 0;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;

    /* Binary search for the last entry that is <= key. */
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( aEntry[0]<key );
    assert( key>=aEntry[iRes] );

    /* c is alphanumeric iff it lies at or beyond the end of that range. */
    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;
}
/*
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
** E"). The results of passing a codepoint that corresponds to an
** uppercase letter are undefined.
**
** Some table entries are flagged with the high bit (HIBIT) of their
** replacement character. Those mappings are applied only when bComplex
** is non-zero; when bComplex is zero such codepoints are returned
** unchanged.
**
** A codepoint not covered by any table range is returned unchanged.
*/
static int remove_diacritic(int c, int bComplex){
  /* Sorted table of codepoint ranges. For each entry, (value>>3) is the
  ** first codepoint of the range and (value&0x07) is the distance from the
  ** first to the last codepoint of the range. aChar[i] holds the replacement
  ** for range aDia[i].
  **
  ** Both tables are read-only, so they are given static storage rather
  ** than being re-initialised on the stack for every call. */
  static const unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995,
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286,
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732,
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336,
     3456,  3696,  3712,  3728,  3744,  3766,  3832,  3896,
     3912,  3928,  3944,  3968,  4008,  4040,  4056,  4106,
     4138,  4170,  4202,  4234,  4266,  4296,  4312,  4344,
     4408,  4424,  4442,  4472,  4488,  4504,  6148,  6198,
     6264,  6280,  6360,  6429,  6505,  6529, 61448, 61468,
    61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704,
    61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914,
    61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218,
    62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554,
    62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766,
    62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118,
    63182, 63242, 63274, 63310, 63368, 63390,
  };
#define HIBIT ((unsigned char)0x80)
  static const unsigned char aChar[] = {
    '\0',      'a',       'c',       'e',       'i',       'n',
    'o',       'u',       'y',       'y',       'a',       'c',
    'd',       'e',       'e',       'g',       'h',       'i',
    'j',       'k',       'l',       'n',       'o',       'r',
    's',       't',       'u',       'u',       'w',       'y',
    'z',       'o',       'u',       'a',       'i',       'o',
    'u',       'u'|HIBIT, 'a'|HIBIT, 'g',       'k',       'o',
    'o'|HIBIT, 'j',       'g',       'n',       'a'|HIBIT, 'a',
    'e',       'i',       'o',       'r',       'u',       's',
    't',       'h',       'a',       'e',       'o'|HIBIT, 'o',
    'o'|HIBIT, 'y',       '\0',      '\0',      '\0',      '\0',
    '\0',      '\0',      '\0',      '\0',      'a',       'b',
    'c'|HIBIT, 'd',       'd',       'e'|HIBIT, 'e',       'e'|HIBIT,
    'f',       'g',       'h',       'h',       'i',       'i'|HIBIT,
    'k',       'l',       'l'|HIBIT, 'l',       'm',       'n',
    'o'|HIBIT, 'p',       'r',       'r'|HIBIT, 'r',       's',
    's'|HIBIT, 't',       'u',       'u'|HIBIT, 'v',       'w',
    'w',       'x',       'y',       'z',       'h',       't',
    'w',       'y',       'a',       'a'|HIBIT, 'a'|HIBIT, 'a'|HIBIT,
    'e',       'e'|HIBIT, 'e'|HIBIT, 'i',       'o',       'o'|HIBIT,
    'o'|HIBIT, 'o'|HIBIT, 'u',       'u'|HIBIT, 'u'|HIBIT, 'y',
  };

  /* Setting the low 3 bits makes the key compare >= every entry whose
  ** first codepoint is <= c. */
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;

  /* Binary search for the last entry that is <= key. */
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );

  /* HIBIT-flagged mappings are only applied in "complex" mode. */
  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;

  /* Past the end of the range: leave c alone. Otherwise return the
  ** replacement with the HIBIT flag masked off. */
  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
}
/*
** Return true if the argument interpreted as a unicode codepoint
** is a diacritical modifier character.
**
** Only codepoints 768..817 can be reported as diacritics. The answer is
** read from two bitmasks: bit (c-768) of mask0 for 768..799 and bit
** (c-800) of mask1 for 800..817. The return value is zero for "not a
** diacritic" and the (non-zero) selected mask bit otherwise, so callers
** must test it for truth rather than compare it against 1.
*/
int sqlite3FtsUnicodeIsdiacritic(int c){
  unsigned int mask0 = 0x08029FDF;
  unsigned int mask1 = 0x000361F8;
  if( c<768 || c>817 ) return 0;
  return (c < 768+32) ?
      (mask0 & ((unsigned int)1 << (c-768))) :
      (mask1 & ((unsigned int)1 << (c-768-32)));
}
/*
** Interpret the argument as a unicode codepoint. If the codepoint
** is an upper case character that has a lower case equivalent,
** return the codepoint corresponding to the lower case version.
** Otherwise, return a copy of the argument.
**
** If eRemoveDiacritic is non-zero and c is in the range 128..65535, the
** folded codepoint is additionally passed through remove_diacritic(),
** with its bComplex argument set when eRemoveDiacritic==2. Codepoints
** outside that range are never passed to remove_diacritic().
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3FtsUnicodeFold(int c, int eRemoveDiacritic){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
  ** every second codepoint in the range, starting with codepoint C.
  **
  ** The 7 most significant bits in flags are an index into the aiOff[]
  ** array. If a specific codepoint C does require folding, then its lower
  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
  **
  ** The contents of this array are generated by parsing the CaseFolding.txt
  ** file distributed as part of the "Unicode Character Database". See
  ** http://www.unicode.org for details.
  */
  static const struct TableEntry {
    unsigned short iCode;
    unsigned char flags;
    unsigned char nRange;
  } aEntry[] = {
    {65, 14, 26},          {181, 64, 1},          {192, 14, 23},
    {216, 14, 7},          {256, 1, 48},          {306, 1, 6},
    {313, 1, 16},          {330, 1, 46},          {376, 116, 1},
    {377, 1, 6},           {383, 104, 1},         {385, 50, 1},
    {386, 1, 4},           {390, 44, 1},          {391, 0, 1},
    {393, 42, 2},          {395, 0, 1},           {398, 32, 1},
    {399, 38, 1},          {400, 40, 1},          {401, 0, 1},
    {403, 42, 1},          {404, 46, 1},          {406, 52, 1},
    {407, 48, 1},          {408, 0, 1},           {412, 52, 1},
    {413, 54, 1},          {415, 56, 1},          {416, 1, 6},
    {422, 60, 1},          {423, 0, 1},           {425, 60, 1},
    {428, 0, 1},           {430, 60, 1},          {431, 0, 1},
    {433, 58, 2},          {435, 1, 4},           {439, 62, 1},
    {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
    {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
    {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
    {497, 2, 1},           {498, 1, 4},           {502, 122, 1},
    {503, 134, 1},         {504, 1, 40},          {544, 110, 1},
    {546, 1, 18},          {570, 70, 1},          {571, 0, 1},
    {573, 108, 1},         {574, 68, 1},          {577, 0, 1},
    {579, 106, 1},         {580, 28, 1},          {581, 30, 1},
    {582, 1, 10},          {837, 36, 1},          {880, 1, 4},
    {886, 0, 1},           {902, 18, 1},          {904, 16, 3},
    {908, 26, 1},          {910, 24, 2},          {913, 14, 17},
    {931, 14, 9},          {962, 0, 1},           {975, 4, 1},
    {976, 140, 1},         {977, 142, 1},         {981, 146, 1},
    {982, 144, 1},         {984, 1, 24},          {1008, 136, 1},
    {1009, 138, 1},        {1012, 130, 1},        {1013, 128, 1},
    {1015, 0, 1},          {1017, 152, 1},        {1018, 0, 1},
    {1021, 110, 3},        {1024, 34, 16},        {1040, 14, 32},
    {1120, 1, 34},         {1162, 1, 54},         {1216, 6, 1},
    {1217, 1, 14},         {1232, 1, 88},         {1329, 22, 38},
    {4256, 66, 38},        {4295, 66, 1},         {4301, 66, 1},
    {7680, 1, 150},        {7835, 132, 1},        {7838, 96, 1},
    {7840, 1, 96},         {7944, 150, 8},        {7960, 150, 6},
    {7976, 150, 8},        {7992, 150, 8},        {8008, 150, 6},
    {8025, 151, 8},        {8040, 150, 8},        {8072, 150, 8},
    {8088, 150, 8},        {8104, 150, 8},        {8120, 150, 2},
    {8122, 126, 2},        {8124, 148, 1},        {8126, 100, 1},
    {8136, 124, 4},        {8140, 148, 1},        {8152, 150, 2},
    {8154, 120, 2},        {8168, 150, 2},        {8170, 118, 2},
    {8172, 152, 1},        {8184, 112, 2},        {8186, 114, 2},
    {8188, 148, 1},        {8486, 98, 1},         {8490, 92, 1},
    {8491, 94, 1},         {8498, 12, 1},         {8544, 8, 16},
    {8579, 0, 1},          {9398, 10, 26},        {11264, 22, 47},
    {11360, 0, 1},         {11362, 88, 1},        {11363, 102, 1},
    {11364, 90, 1},        {11367, 1, 6},         {11373, 84, 1},
    {11374, 86, 1},        {11375, 80, 1},        {11376, 82, 1},
    {11378, 0, 1},         {11381, 0, 1},         {11390, 78, 2},
    {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
    {42560, 1, 46},        {42624, 1, 24},        {42786, 1, 14},
    {42802, 1, 62},        {42873, 1, 4},         {42877, 76, 1},
    {42878, 1, 10},        {42891, 0, 1},         {42893, 74, 1},
    {42896, 1, 4},         {42912, 1, 10},        {42922, 72, 1},
    {65313, 14, 26},
  };
  static const unsigned short aiOff[] = {
   1,     2,     8,     15,    16,    26,    28,    32,
   37,    38,    40,    48,    63,    64,    69,    71,
   79,    80,    116,   202,   203,   205,   206,   207,
   209,   210,   211,   213,   214,   217,   218,   219,
   775,   7264,  10792, 10795, 23228, 23256, 30204, 54721,
   54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274,
   57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406,
   65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462,
   65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511,
   65514, 65521, 65527, 65528, 65529,
  };

  int ret = c;

  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    /* ASCII: only 'A'..'Z' change. */
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
  }else if( c<65536 ){
    const struct TableEntry *p;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

    /* Binary search for the last rule whose iCode is <= c. */
    assert( c>aEntry[0].iCode );
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);
      if( cmp>=0 ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }

    assert( iRes>=0 && c>=aEntry[iRes].iCode );
    p = &aEntry[iRes];

    /* Apply the rule if c lies inside its range and, for "every second
    ** codepoint" rules (low flag bit set), c has the same parity as iCode. */
    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
      assert( ret>0 );
    }

    if( eRemoveDiacritic ){
      ret = remove_diacritic(ret, eRemoveDiacritic==2);
    }
  }

  else if( c>=66560 && c<66600 ){
    /* The only range above 65535 that is folded: a fixed offset of 40. */
    ret = c + 40;
  }

  return ret;
}
382 #endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */
383 #endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */