track 3.7.13
[sqlcipher.git] / ext / fts3 / fts3_unicode2.c
blob226d5ee419d7337f6ebee50951221993b6254b67
1 /*
2 ** 2012 May 25
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
15 ** DO NOT EDIT THIS MACHINE GENERATED FILE.
18 #if defined(SQLITE_ENABLE_FTS4_UNICODE61)
19 #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)
21 #include <assert.h>
/*
** Return true if the argument corresponds to a unicode codepoint
** classified as either a letter or a number. Otherwise false.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3FtsUnicodeIsalnum(int c){
  /* Each unsigned integer in the following array corresponds to a contiguous
  ** range of unicode codepoints that are not either letters or numbers (i.e.
  ** codepoints for which this function should return 0).
  **
  ** The most significant 22 bits in each 32-bit value contain the first
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value
  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint
  ** C. It is not possible to represent a range larger than 1023 codepoints
  ** using this format.
  */
  static const unsigned int aEntry[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
    0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
    0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
    0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
    0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
    0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
    0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
    0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
    0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
    0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
    0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
    0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
    0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
    0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
    0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
    0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
    0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
    0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
    0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
    0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
    0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
    0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
    0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
    0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
    0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
    0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
    0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
    0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
    0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
    0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
    0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
    0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
    0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
    0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
    0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
    0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
    0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
    0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
    0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
    0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
    0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
    0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
    0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
    0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
    0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
    0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
    0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
    0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
    0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
    0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
    0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
    0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
    0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
    0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
    0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
    0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
    0x037FFC02, 0x03E3FC01, 0x03EC7801, 0x03ECA401, 0x03EEC810,
    0x03F4F802, 0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023,
    0x03F95013, 0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807,
    0x03FCEC06, 0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405,
    0x04040003, 0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E,
    0x040E7C01, 0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01,
    0x04280403, 0x04281402, 0x04283004, 0x0428E003, 0x0428FC01,
    0x04294009, 0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016,
    0x04420003, 0x0442C012, 0x04440003, 0x04449C0E, 0x04450004,
    0x04460003, 0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004,
    0x05BD442E, 0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5,
    0x07480046, 0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01,
    0x075C5401, 0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401,
    0x075EA401, 0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064,
    0x07C2800F, 0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F,
    0x07C4C03C, 0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009,
    0x07C94002, 0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014,
    0x07CE8025, 0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001,
    0x07D108B6, 0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018,
    0x07D7EC46, 0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401,
    0x38008060, 0x380400F0, 0x3C000001, 0x3FFFF401, 0x40000001,
    0x43FFF401,
  };

  /* One bit per ASCII codepoint (bit c%32 of word c/32). A set bit means
  ** the codepoint is NOT a letter or a number. */
  static const unsigned int aAscii[4] = {
    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
  };

  /* Negative values would index before the start of aAscii[]. */
  assert( c>=0 );

  if( c<128 ){
    /* The shift must be done on an unsigned value: (c & 0x1F) can be 31,
    ** and shifting a signed 1 into the sign bit is undefined behaviour. */
    return ( (aAscii[c >> 5] & (1u << (c & 0x001F)))==0 );
  }else if( c<(1<<22) ){
    /* Setting the low 10 bits of the key guarantees that it compares
    ** greater than or equal to any entry whose range starts at c. */
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes = 0;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;

    /* Binary search for the last entry that is less than or equal to key. */
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( aEntry[0]<key );
    assert( key>=aEntry[iRes] );

    /* c is alphanumeric iff it lies beyond the end of the range found. */
    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;
}
/*
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
** E"). The results of passing a codepoint that corresponds to an
** uppercase letter are undefined.
**
** Any codepoint not covered by the tables below is returned unchanged.
*/
static int remove_diacritic(int c){
  /* Each entry encodes a run of codepoints that share one replacement:
  ** the upper 13 bits hold the first codepoint of the run and the low
  ** 3 bits hold the number of additional codepoints following it.
  ** aChar[i] is the replacement for the run described by aDia[i].
  **
  ** Both tables are read-only, so they are static const rather than
  ** automatic arrays that would be re-initialized on every call. */
  static const unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995,
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286,
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732,
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336,
     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928,
     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234,
     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504,
     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529,
    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
    62924, 63050, 63082, 63274, 63390,
  };
  static const char aChar[] = {
    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',
    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',
    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',
    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',
    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',
    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',
    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',
    'e',  'i',  'o',  'u',  'y',
  };

  /* Setting the low 3 bits makes the key compare greater than or equal
  ** to any entry whose run starts at c. */
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;

  /* Binary search for the last entry that is less than or equal to key. */
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );

  /* If c lies past the end of the run, it has no replacement. */
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
}
/*
** Return true (non-zero) if the argument interpreted as a unicode
** codepoint is a diacritical modifier character. Only codepoints in
** the range 768..817 inclusive can be reported as diacritics; anything
** outside that range yields 0.
*/
int sqlite3FtsUnicodeIsdiacritic(int c){
  /* Bit N of mask0 covers codepoint 768+N; bit N of mask1 covers
  ** codepoint 800+N. A set bit marks a diacritic. */
  unsigned int mask0 = 0x08029FDF;
  unsigned int mask1 = 0x000361F8;
  if( c<768 || c>817 ) return 0;

  /* Shift an unsigned 1: the shift count reaches 31 for c==799, and
  ** shifting a signed 1 into the sign bit is undefined behaviour. */
  return (c < 768+32) ?
      (int)(mask0 & (1u << (c-768))) :
      (int)(mask1 & (1u << (c-768-32)));
}
/*
** Treat c as a unicode codepoint and return its case-folded form: if c
** is an upper case character with a lower case equivalent, the lower
** case codepoint is returned; otherwise c itself is returned.
**
** For codepoints in the range 128..65535, when bRemoveDiacritic is
** non-zero the folded value is additionally passed through
** remove_diacritic() before being returned.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
  /* Every element of aRule[] is a folding rule covering nRange
  ** consecutive codepoints, the first of which is iCode.
  **
  ** When the least significant bit of flags is clear, all nRange
  ** codepoints are folded. When it is set, only every second codepoint
  ** is folded, beginning with iCode itself.
  **
  ** The upper 7 bits of flags select an element of aDelta[]. A codepoint
  ** C that is subject to the rule folds to
  ** ((C + aDelta[flags>>1]) & 0xFFFF).
  **
  ** The table contents are generated by parsing the CaseFolding.txt
  ** file distributed as part of the "Unicode Character Database". See
  ** http://www.unicode.org for details.
  */
  static const struct TableEntry {
    unsigned short iCode;
    unsigned char flags;
    unsigned char nRange;
  } aRule[] = {
    {65, 14, 26},          {181, 64, 1},          {192, 14, 23},
    {216, 14, 7},          {256, 1, 48},          {306, 1, 6},
    {313, 1, 16},          {330, 1, 46},          {376, 116, 1},
    {377, 1, 6},           {383, 104, 1},         {385, 50, 1},
    {386, 1, 4},           {390, 44, 1},          {391, 0, 1},
    {393, 42, 2},          {395, 0, 1},           {398, 32, 1},
    {399, 38, 1},          {400, 40, 1},          {401, 0, 1},
    {403, 42, 1},          {404, 46, 1},          {406, 52, 1},
    {407, 48, 1},          {408, 0, 1},           {412, 52, 1},
    {413, 54, 1},          {415, 56, 1},          {416, 1, 6},
    {422, 60, 1},          {423, 0, 1},           {425, 60, 1},
    {428, 0, 1},           {430, 60, 1},          {431, 0, 1},
    {433, 58, 2},          {435, 1, 4},           {439, 62, 1},
    {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
    {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
    {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
    {497, 2, 1},           {498, 1, 4},           {502, 122, 1},
    {503, 134, 1},         {504, 1, 40},          {544, 110, 1},
    {546, 1, 18},          {570, 70, 1},          {571, 0, 1},
    {573, 108, 1},         {574, 68, 1},          {577, 0, 1},
    {579, 106, 1},         {580, 28, 1},          {581, 30, 1},
    {582, 1, 10},          {837, 36, 1},          {880, 1, 4},
    {886, 0, 1},           {902, 18, 1},          {904, 16, 3},
    {908, 26, 1},          {910, 24, 2},          {913, 14, 17},
    {931, 14, 9},          {962, 0, 1},           {975, 4, 1},
    {976, 140, 1},         {977, 142, 1},         {981, 146, 1},
    {982, 144, 1},         {984, 1, 24},          {1008, 136, 1},
    {1009, 138, 1},        {1012, 130, 1},        {1013, 128, 1},
    {1015, 0, 1},          {1017, 152, 1},        {1018, 0, 1},
    {1021, 110, 3},        {1024, 34, 16},        {1040, 14, 32},
    {1120, 1, 34},         {1162, 1, 54},         {1216, 6, 1},
    {1217, 1, 14},         {1232, 1, 88},         {1329, 22, 38},
    {4256, 66, 38},        {4295, 66, 1},         {4301, 66, 1},
    {7680, 1, 150},        {7835, 132, 1},        {7838, 96, 1},
    {7840, 1, 96},         {7944, 150, 8},        {7960, 150, 6},
    {7976, 150, 8},        {7992, 150, 8},        {8008, 150, 6},
    {8025, 151, 8},        {8040, 150, 8},        {8072, 150, 8},
    {8088, 150, 8},        {8104, 150, 8},        {8120, 150, 2},
    {8122, 126, 2},        {8124, 148, 1},        {8126, 100, 1},
    {8136, 124, 4},        {8140, 148, 1},        {8152, 150, 2},
    {8154, 120, 2},        {8168, 150, 2},        {8170, 118, 2},
    {8172, 152, 1},        {8184, 112, 2},        {8186, 114, 2},
    {8188, 148, 1},        {8486, 98, 1},         {8490, 92, 1},
    {8491, 94, 1},         {8498, 12, 1},         {8544, 8, 16},
    {8579, 0, 1},          {9398, 10, 26},        {11264, 22, 47},
    {11360, 0, 1},         {11362, 88, 1},        {11363, 102, 1},
    {11364, 90, 1},        {11367, 1, 6},         {11373, 84, 1},
    {11374, 86, 1},        {11375, 80, 1},        {11376, 82, 1},
    {11378, 0, 1},         {11381, 0, 1},         {11390, 78, 2},
    {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
    {42560, 1, 46},        {42624, 1, 24},        {42786, 1, 14},
    {42802, 1, 62},        {42873, 1, 4},         {42877, 76, 1},
    {42878, 1, 10},        {42891, 0, 1},         {42893, 74, 1},
    {42896, 1, 4},         {42912, 1, 10},        {42922, 72, 1},
    {65313, 14, 26},
  };
  static const unsigned short aDelta[] = {
     1,     2,     8,     15,    16,    26,    28,    32,
     37,    38,    40,    48,    63,    64,    69,    71,
     79,    80,    116,   202,   203,   205,   206,   207,
     209,   210,   211,   213,   214,   217,   218,   219,
     775,   7264,  10792, 10795, 23228, 23256, 30204, 54721,
     54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274,
     57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406,
     65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462,
     65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511,
     65514, 65521, 65527, 65528, 65529,
  };

  int folded = c;
  int nRule = (int)(sizeof(aRule)/sizeof(aRule[0]));
  int lo;
  int hi;
  int iRule;

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  /* ASCII: only 'A'..'Z' change. Diacritic removal is not applied here. */
  if( c<128 ){
    if( c>='A' && c<='Z' ){
      folded = c + ('a' - 'A');
    }
    return folded;
  }

  /* Beyond the 16-bit range only codepoints 66560..66599 are folded,
  ** by adding a fixed offset of 40. */
  if( c>=65536 ){
    if( c>=66560 && c<66600 ){
      folded = c + 40;
    }
    return folded;
  }

  /* Upper-bound search: after the loop, lo is the number of rules whose
  ** iCode is less than or equal to c, so the candidate rule is lo-1
  ** (or -1 if no rule starts at or before c). */
  lo = 0;
  hi = nRule;
  while( lo<hi ){
    int mid = lo + (hi - lo) / 2;
    if( (c - aRule[mid].iCode)>=0 ){
      lo = mid + 1;
    }else{
      hi = mid;
    }
  }
  iRule = lo - 1;
  assert( iRule<0 || c>=aRule[iRule].iCode );

  if( iRule>=0 ){
    const struct TableEntry *pRule = &aRule[iRule];
    int bInRange = c<(pRule->iCode + pRule->nRange);
    /* Non-zero when the rule applies to alternate codepoints only and
    ** c has the opposite parity to the first codepoint of the rule. */
    int bSkipped = 0x01 & pRule->flags & (pRule->iCode ^ c);
    if( bInRange && bSkipped==0 ){
      folded = (c + (aDelta[pRule->flags>>1])) & 0x0000FFFF;
      assert( folded>0 );
    }
  }

  if( bRemoveDiacritic ){
    folded = remove_diacritic(folded);
  }
  return folded;
}
365 #endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */
366 #endif /* defined(SQLITE_ENABLE_FTS4_UNICODE61) */