Merge remote-tracking branch 'redux/master' into sh4-pool
[tamarin-stm.git] / eval / eval-unicode.cpp
blobba4edebf1d0ca2fff9d1695fe4b16342474f64d3
1 /* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
2 /* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
3 /* ***** BEGIN LICENSE BLOCK *****
4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
16 * The Original Code is [Open Source Virtual Machine.].
18 * The Initial Developer of the Original Code is
19 * Adobe System Incorporated.
20 * Portions created by the Initial Developer are Copyright (C) 2008
21 * the Initial Developer. All Rights Reserved.
23 * Contributor(s):
24 * Adobe AS3 Team
26 * Alternatively, the contents of this file may be used under the terms of
27 * either the GNU General Public License Version 2 or later (the "GPL"), or
28 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29 * in which case the provisions of the GPL or the LGPL are applicable instead
30 * of those above. If you wish to allow use of your version of this file only
31 * under the terms of either the GPL or the LGPL, and not to allow others to
32 * use your version of this file under the terms of the MPL, indicate your
33 * decision by deleting the provisions above and replace them with the notice
34 * and other provisions required by the GPL or the LGPL. If you do not delete
35 * the provisions above, a recipient may use your version of this file under
36 * the terms of any one of the MPL, the GPL or the LGPL.
38 * ***** END LICENSE BLOCK ***** */
40 #include "avmplus.h"
42 #ifdef VMCFG_EVAL
44 #include "eval.h"
/* Rudimentary Unicode support - enough to handle identifier lexing.  These tables
 * are slightly optimized for space, but more could be done.  In particular, delta
 * coding can be useful because most deltas - both in character ranges and between
 * ranges - fit in a single byte.  Delta coding might shrink the tables by slightly
 * less than a factor of 2.  As it is, the total size of these tables is about 2KB.
 *
 * Speed is not a big issue here because almost no programs have identifiers that
 * contain characters outside the ASCII range, and ASCII is handled specially
 * outside this file using faster code paths (see code in eval-lex.cpp).
 *
 * The tables are generated from the Unicode data file by generate-unicode-tables.c
 * in this directory.
 */
59 namespace avmplus
61 namespace RTC
// A contiguous run of 16-bit character codes.  Both end points belong to the
// run: the lookup code tests membership as lo <= c && c <= hi.
struct Range {
    uint16_t lo;    // first character code in the run (inclusive)
    uint16_t hi;    // last character code in the run (inclusive)
};
68 struct UnicodeTable {
69 uint32_t nranges;
70 Range const * ranges;
71 uint32_t nsingletons;
72 uint16_t const * singletons;
75 // UnicodeLetter (Lu Ll Lt Lm Lo Nl)
76 static const Range unicodeLetter_ranges[] = {
77 {0x0041, 0x005A},
78 {0x0061, 0x007A},
79 {0x00C0, 0x00D6},
80 {0x00D8, 0x00F6},
81 {0x00F8, 0x02C1},
82 {0x02C6, 0x02D1},
83 {0x02E0, 0x02E4},
84 {0x0370, 0x0374},
85 {0x0376, 0x0377},
86 {0x037A, 0x037D},
87 {0x0388, 0x038A},
88 {0x038E, 0x03A1},
89 {0x03A3, 0x03F5},
90 {0x03F7, 0x0481},
91 {0x048A, 0x0523},
92 {0x0531, 0x0556},
93 {0x0561, 0x0587},
94 {0x05D0, 0x05EA},
95 {0x05F0, 0x05F2},
96 {0x0621, 0x064A},
97 {0x066E, 0x066F},
98 {0x0671, 0x06D3},
99 {0x06E5, 0x06E6},
100 {0x06EE, 0x06EF},
101 {0x06FA, 0x06FC},
102 {0x0712, 0x072F},
103 {0x074D, 0x07A5},
104 {0x07CA, 0x07EA},
105 {0x07F4, 0x07F5},
106 {0x0904, 0x0939},
107 {0x0958, 0x0961},
108 {0x0971, 0x0972},
109 {0x097B, 0x097F},
110 {0x0985, 0x098C},
111 {0x098F, 0x0990},
112 {0x0993, 0x09A8},
113 {0x09AA, 0x09B0},
114 {0x09B6, 0x09B9},
115 {0x09DC, 0x09DD},
116 {0x09DF, 0x09E1},
117 {0x09F0, 0x09F1},
118 {0x0A05, 0x0A0A},
119 {0x0A0F, 0x0A10},
120 {0x0A13, 0x0A28},
121 {0x0A2A, 0x0A30},
122 {0x0A32, 0x0A33},
123 {0x0A35, 0x0A36},
124 {0x0A38, 0x0A39},
125 {0x0A59, 0x0A5C},
126 {0x0A72, 0x0A74},
127 {0x0A85, 0x0A8D},
128 {0x0A8F, 0x0A91},
129 {0x0A93, 0x0AA8},
130 {0x0AAA, 0x0AB0},
131 {0x0AB2, 0x0AB3},
132 {0x0AB5, 0x0AB9},
133 {0x0AE0, 0x0AE1},
134 {0x0B05, 0x0B0C},
135 {0x0B0F, 0x0B10},
136 {0x0B13, 0x0B28},
137 {0x0B2A, 0x0B30},
138 {0x0B32, 0x0B33},
139 {0x0B35, 0x0B39},
140 {0x0B5C, 0x0B5D},
141 {0x0B5F, 0x0B61},
142 {0x0B85, 0x0B8A},
143 {0x0B8E, 0x0B90},
144 {0x0B92, 0x0B95},
145 {0x0B99, 0x0B9A},
146 {0x0B9E, 0x0B9F},
147 {0x0BA3, 0x0BA4},
148 {0x0BA8, 0x0BAA},
149 {0x0BAE, 0x0BB9},
150 {0x0C05, 0x0C0C},
151 {0x0C0E, 0x0C10},
152 {0x0C12, 0x0C28},
153 {0x0C2A, 0x0C33},
154 {0x0C35, 0x0C39},
155 {0x0C58, 0x0C59},
156 {0x0C60, 0x0C61},
157 {0x0C85, 0x0C8C},
158 {0x0C8E, 0x0C90},
159 {0x0C92, 0x0CA8},
160 {0x0CAA, 0x0CB3},
161 {0x0CB5, 0x0CB9},
162 {0x0CE0, 0x0CE1},
163 {0x0D05, 0x0D0C},
164 {0x0D0E, 0x0D10},
165 {0x0D12, 0x0D28},
166 {0x0D2A, 0x0D39},
167 {0x0D60, 0x0D61},
168 {0x0D7A, 0x0D7F},
169 {0x0D85, 0x0D96},
170 {0x0D9A, 0x0DB1},
171 {0x0DB3, 0x0DBB},
172 {0x0DC0, 0x0DC6},
173 {0x0E01, 0x0E30},
174 {0x0E32, 0x0E33},
175 {0x0E40, 0x0E46},
176 {0x0E81, 0x0E82},
177 {0x0E87, 0x0E88},
178 {0x0E94, 0x0E97},
179 {0x0E99, 0x0E9F},
180 {0x0EA1, 0x0EA3},
181 {0x0EAA, 0x0EAB},
182 {0x0EAD, 0x0EB0},
183 {0x0EB2, 0x0EB3},
184 {0x0EC0, 0x0EC4},
185 {0x0EDC, 0x0EDD},
186 {0x0F40, 0x0F47},
187 {0x0F49, 0x0F6C},
188 {0x0F88, 0x0F8B},
189 {0x1000, 0x102A},
190 {0x1050, 0x1055},
191 {0x105A, 0x105D},
192 {0x1065, 0x1066},
193 {0x106E, 0x1070},
194 {0x1075, 0x1081},
195 {0x10A0, 0x10C5},
196 {0x10D0, 0x10FA},
197 {0x1100, 0x1159},
198 {0x115F, 0x11A2},
199 {0x11A8, 0x11F9},
200 {0x1200, 0x1248},
201 {0x124A, 0x124D},
202 {0x1250, 0x1256},
203 {0x125A, 0x125D},
204 {0x1260, 0x1288},
205 {0x128A, 0x128D},
206 {0x1290, 0x12B0},
207 {0x12B2, 0x12B5},
208 {0x12B8, 0x12BE},
209 {0x12C2, 0x12C5},
210 {0x12C8, 0x12D6},
211 {0x12D8, 0x1310},
212 {0x1312, 0x1315},
213 {0x1318, 0x135A},
214 {0x1380, 0x138F},
215 {0x13A0, 0x13F4},
216 {0x1401, 0x166C},
217 {0x166F, 0x1676},
218 {0x1681, 0x169A},
219 {0x16A0, 0x16EA},
220 {0x16EE, 0x16F0},
221 {0x1700, 0x170C},
222 {0x170E, 0x1711},
223 {0x1720, 0x1731},
224 {0x1740, 0x1751},
225 {0x1760, 0x176C},
226 {0x176E, 0x1770},
227 {0x1780, 0x17B3},
228 {0x1820, 0x1877},
229 {0x1880, 0x18A8},
230 {0x1900, 0x191C},
231 {0x1950, 0x196D},
232 {0x1970, 0x1974},
233 {0x1980, 0x19A9},
234 {0x19C1, 0x19C7},
235 {0x1A00, 0x1A16},
236 {0x1B05, 0x1B33},
237 {0x1B45, 0x1B4B},
238 {0x1B83, 0x1BA0},
239 {0x1BAE, 0x1BAF},
240 {0x1C00, 0x1C23},
241 {0x1C4D, 0x1C4F},
242 {0x1C5A, 0x1C7D},
243 {0x1D00, 0x1DBF},
244 {0x1E00, 0x1F15},
245 {0x1F18, 0x1F1D},
246 {0x1F20, 0x1F45},
247 {0x1F48, 0x1F4D},
248 {0x1F50, 0x1F57},
249 {0x1F5F, 0x1F7D},
250 {0x1F80, 0x1FB4},
251 {0x1FB6, 0x1FBC},
252 {0x1FC2, 0x1FC4},
253 {0x1FC6, 0x1FCC},
254 {0x1FD0, 0x1FD3},
255 {0x1FD6, 0x1FDB},
256 {0x1FE0, 0x1FEC},
257 {0x1FF2, 0x1FF4},
258 {0x1FF6, 0x1FFC},
259 {0x2090, 0x2094},
260 {0x210A, 0x2113},
261 {0x2119, 0x211D},
262 {0x212A, 0x212D},
263 {0x212F, 0x2139},
264 {0x213C, 0x213F},
265 {0x2145, 0x2149},
266 {0x2160, 0x2188},
267 {0x2C00, 0x2C2E},
268 {0x2C30, 0x2C5E},
269 {0x2C60, 0x2C6F},
270 {0x2C71, 0x2C7D},
271 {0x2C80, 0x2CE4},
272 {0x2D00, 0x2D25},
273 {0x2D30, 0x2D65},
274 {0x2D80, 0x2D96},
275 {0x2DA0, 0x2DA6},
276 {0x2DA8, 0x2DAE},
277 {0x2DB0, 0x2DB6},
278 {0x2DB8, 0x2DBE},
279 {0x2DC0, 0x2DC6},
280 {0x2DC8, 0x2DCE},
281 {0x2DD0, 0x2DD6},
282 {0x2DD8, 0x2DDE},
283 {0x3005, 0x3007},
284 {0x3021, 0x3029},
285 {0x3031, 0x3035},
286 {0x3038, 0x303C},
287 {0x3041, 0x3096},
288 {0x309D, 0x309F},
289 {0x30A1, 0x30FA},
290 {0x30FC, 0x30FF},
291 {0x3105, 0x312D},
292 {0x3131, 0x318E},
293 {0x31A0, 0x31B7},
294 {0x31F0, 0x31FF},
295 {0xA000, 0xA48C},
296 {0xA500, 0xA60C},
297 {0xA610, 0xA61F},
298 {0xA62A, 0xA62B},
299 {0xA640, 0xA65F},
300 {0xA662, 0xA66E},
301 {0xA67F, 0xA697},
302 {0xA717, 0xA71F},
303 {0xA722, 0xA788},
304 {0xA78B, 0xA78C},
305 {0xA7FB, 0xA801},
306 {0xA803, 0xA805},
307 {0xA807, 0xA80A},
308 {0xA80C, 0xA822},
309 {0xA840, 0xA873},
310 {0xA882, 0xA8B3},
311 {0xA90A, 0xA925},
312 {0xA930, 0xA946},
313 {0xAA00, 0xAA28},
314 {0xAA40, 0xAA42},
315 {0xAA44, 0xAA4B},
316 {0xF900, 0xFA2D},
317 {0xFA30, 0xFA6A},
318 {0xFA70, 0xFAD9},
319 {0xFB00, 0xFB06},
320 {0xFB13, 0xFB17},
321 {0xFB1F, 0xFB28},
322 {0xFB2A, 0xFB36},
323 {0xFB38, 0xFB3C},
324 {0xFB40, 0xFB41},
325 {0xFB43, 0xFB44},
326 {0xFB46, 0xFBB1},
327 {0xFBD3, 0xFD3D},
328 {0xFD50, 0xFD8F},
329 {0xFD92, 0xFDC7},
330 {0xFDF0, 0xFDFB},
331 {0xFE70, 0xFE74},
332 {0xFE76, 0xFEFC},
333 {0xFF21, 0xFF3A},
334 {0xFF41, 0xFF5A},
335 {0xFF66, 0xFFBE},
336 {0xFFC2, 0xFFC7},
337 {0xFFCA, 0xFFCF},
338 {0xFFD2, 0xFFD7},
339 {0xFFDA, 0xFFDC},
342 static const uint16_t unicodeLetter_singletons[] = {
343 0x00AA,
344 0x00B5,
345 0x00BA,
346 0x02EC,
347 0x02EE,
348 0x0386,
349 0x038C,
350 0x0559,
351 0x06D5,
352 0x06FF,
353 0x0710,
354 0x07B1,
355 0x07FA,
356 0x093D,
357 0x0950,
358 0x09B2,
359 0x09BD,
360 0x09CE,
361 0x0A5E,
362 0x0ABD,
363 0x0AD0,
364 0x0B3D,
365 0x0B71,
366 0x0B83,
367 0x0B9C,
368 0x0BD0,
369 0x0C3D,
370 0x0CBD,
371 0x0CDE,
372 0x0D3D,
373 0x0DBD,
374 0x0E84,
375 0x0E8A,
376 0x0E8D,
377 0x0EA5,
378 0x0EA7,
379 0x0EBD,
380 0x0EC6,
381 0x0F00,
382 0x103F,
383 0x1061,
384 0x108E,
385 0x10FC,
386 0x1258,
387 0x12C0,
388 0x17D7,
389 0x17DC,
390 0x18AA,
391 0x1F59,
392 0x1F5B,
393 0x1F5D,
394 0x1FBE,
395 0x2071,
396 0x207F,
397 0x2102,
398 0x2107,
399 0x2115,
400 0x2124,
401 0x2126,
402 0x2128,
403 0x214E,
404 0x2D6F,
405 0x2E2F,
406 0x3400,
407 0x4DB5,
408 0x4E00,
409 0x9FC3,
410 0xAC00,
411 0xD7A3,
412 0xFB1D,
413 0xFB3E,
416 static const UnicodeTable unicodeLetter = {
417 263,
418 unicodeLetter_ranges,
420 unicodeLetter_singletons
423 // UnicodeCombiningMark (Mn, Mc)
424 // UnicodeDigit (Nd)
425 // UnicodeConnectorPunctuation (Pc)
426 static const Range identifier_subsequent_ranges[] = {
427 {0x0030, 0x0039},
428 {0x0300, 0x036F},
429 {0x0483, 0x0487},
430 {0x0591, 0x05BD},
431 {0x05C1, 0x05C2},
432 {0x05C4, 0x05C5},
433 {0x0610, 0x061A},
434 {0x064B, 0x065E},
435 {0x0660, 0x0669},
436 {0x06D6, 0x06DC},
437 {0x06DF, 0x06E4},
438 {0x06E7, 0x06E8},
439 {0x06EA, 0x06ED},
440 {0x06F0, 0x06F9},
441 {0x0730, 0x074A},
442 {0x07A6, 0x07B0},
443 {0x07C0, 0x07C9},
444 {0x07EB, 0x07F3},
445 {0x0901, 0x0903},
446 {0x093E, 0x094D},
447 {0x0951, 0x0954},
448 {0x0962, 0x0963},
449 {0x0966, 0x096F},
450 {0x0981, 0x0983},
451 {0x09BE, 0x09C4},
452 {0x09C7, 0x09C8},
453 {0x09CB, 0x09CD},
454 {0x09E2, 0x09E3},
455 {0x09E6, 0x09EF},
456 {0x0A01, 0x0A03},
457 {0x0A3E, 0x0A42},
458 {0x0A47, 0x0A48},
459 {0x0A4B, 0x0A4D},
460 {0x0A66, 0x0A71},
461 {0x0A81, 0x0A83},
462 {0x0ABE, 0x0AC5},
463 {0x0AC7, 0x0AC9},
464 {0x0ACB, 0x0ACD},
465 {0x0AE2, 0x0AE3},
466 {0x0AE6, 0x0AEF},
467 {0x0B01, 0x0B03},
468 {0x0B3E, 0x0B44},
469 {0x0B47, 0x0B48},
470 {0x0B4B, 0x0B4D},
471 {0x0B56, 0x0B57},
472 {0x0B62, 0x0B63},
473 {0x0B66, 0x0B6F},
474 {0x0BBE, 0x0BC2},
475 {0x0BC6, 0x0BC8},
476 {0x0BCA, 0x0BCD},
477 {0x0BE6, 0x0BEF},
478 {0x0C01, 0x0C03},
479 {0x0C3E, 0x0C44},
480 {0x0C46, 0x0C48},
481 {0x0C4A, 0x0C4D},
482 {0x0C55, 0x0C56},
483 {0x0C62, 0x0C63},
484 {0x0C66, 0x0C6F},
485 {0x0C82, 0x0C83},
486 {0x0CBE, 0x0CC4},
487 {0x0CC6, 0x0CC8},
488 {0x0CCA, 0x0CCD},
489 {0x0CD5, 0x0CD6},
490 {0x0CE2, 0x0CE3},
491 {0x0CE6, 0x0CEF},
492 {0x0D02, 0x0D03},
493 {0x0D3E, 0x0D44},
494 {0x0D46, 0x0D48},
495 {0x0D4A, 0x0D4D},
496 {0x0D62, 0x0D63},
497 {0x0D66, 0x0D6F},
498 {0x0D82, 0x0D83},
499 {0x0DCF, 0x0DD4},
500 {0x0DD8, 0x0DDF},
501 {0x0DF2, 0x0DF3},
502 {0x0E34, 0x0E3A},
503 {0x0E47, 0x0E4E},
504 {0x0E50, 0x0E59},
505 {0x0EB4, 0x0EB9},
506 {0x0EBB, 0x0EBC},
507 {0x0EC8, 0x0ECD},
508 {0x0ED0, 0x0ED9},
509 {0x0F18, 0x0F19},
510 {0x0F20, 0x0F29},
511 {0x0F3E, 0x0F3F},
512 {0x0F71, 0x0F84},
513 {0x0F86, 0x0F87},
514 {0x0F90, 0x0F97},
515 {0x0F99, 0x0FBC},
516 {0x102B, 0x103E},
517 {0x1040, 0x1049},
518 {0x1056, 0x1059},
519 {0x105E, 0x1060},
520 {0x1062, 0x1064},
521 {0x1067, 0x106D},
522 {0x1071, 0x1074},
523 {0x1082, 0x108D},
524 {0x108F, 0x1099},
525 {0x1712, 0x1714},
526 {0x1732, 0x1734},
527 {0x1752, 0x1753},
528 {0x1772, 0x1773},
529 {0x17B6, 0x17D3},
530 {0x17E0, 0x17E9},
531 {0x180B, 0x180D},
532 {0x1810, 0x1819},
533 {0x1920, 0x192B},
534 {0x1930, 0x193B},
535 {0x1946, 0x194F},
536 {0x19B0, 0x19C0},
537 {0x19C8, 0x19C9},
538 {0x19D0, 0x19D9},
539 {0x1A17, 0x1A1B},
540 {0x1B00, 0x1B04},
541 {0x1B34, 0x1B44},
542 {0x1B50, 0x1B59},
543 {0x1B6B, 0x1B73},
544 {0x1B80, 0x1B82},
545 {0x1BA1, 0x1BAA},
546 {0x1BB0, 0x1BB9},
547 {0x1C24, 0x1C37},
548 {0x1C40, 0x1C49},
549 {0x1C50, 0x1C59},
550 {0x1DC0, 0x1DE6},
551 {0x1DFE, 0x1DFF},
552 {0x203F, 0x2040},
553 {0x20D0, 0x20DC},
554 {0x20E5, 0x20F0},
555 {0x2DE0, 0x2DFF},
556 {0x302A, 0x302F},
557 {0x3099, 0x309A},
558 {0xA620, 0xA629},
559 {0xA67C, 0xA67D},
560 {0xA823, 0xA827},
561 {0xA880, 0xA881},
562 {0xA8B4, 0xA8C4},
563 {0xA8D0, 0xA8D9},
564 {0xA900, 0xA909},
565 {0xA926, 0xA92D},
566 {0xA947, 0xA953},
567 {0xAA29, 0xAA36},
568 {0xAA4C, 0xAA4D},
569 {0xAA50, 0xAA59},
570 {0xFE00, 0xFE0F},
571 {0xFE20, 0xFE26},
572 {0xFE33, 0xFE34},
573 {0xFE4D, 0xFE4F},
574 {0xFF10, 0xFF19},
// Isolated identifier-subsequent characters that are not part of any run in
// the ranges table.  Must stay sorted in ascending order; the table is
// binary-searched.
static const uint16_t identifier_subsequent_singletons[] = {
    0x005F,
    0x05BF,
    0x05C7,
    0x0670,
    0x0711,
    0x093C,
    0x09BC,
    0x09D7,
    0x0A3C,
    0x0A51,
    0x0A75,
    0x0ABC,
    0x0B3C,
    0x0B82,
    0x0BD7,
    0x0CBC,
    0x0D57,
    0x0DCA,
    0x0DD6,
    0x0E31,
    0x0EB1,
    0x0F35,
    0x0F37,
    0x0F39,
    0x0FC6,
    0x135F,
    0x17DD,
    0x18A9,
    0x2054,
    0x20E1,
    0xA66F,
    0xA802,
    0xA806,
    0xA80B,
    0xAA43,
    0xFB1E,
    0xFF3F,
};
617 static const UnicodeTable identifier_subsequent = {
618 148,
619 identifier_subsequent_ranges,
621 identifier_subsequent_singletons
624 static bool unicodeLookup(const UnicodeTable* tbl, wchar c)
626 int32_t lo = 0;
627 int32_t hi = tbl->nranges-1;
629 while (lo <= hi) {
630 int32_t mid = (lo + hi) / 2;
631 if (tbl->ranges[mid].lo <= c && c <= tbl->ranges[mid].hi)
632 return true;
633 if (c < tbl->ranges[mid].lo)
634 hi = mid-1;
635 else
636 lo = mid+1;
639 lo = 0;
640 hi = tbl->nsingletons-1;
641 while (lo <= hi) {
642 int32_t mid = (lo + hi) / 2;
643 if (tbl->singletons[mid] == c)
644 return true;
645 if (c < tbl->singletons[mid])
646 hi = mid-1;
647 else
648 lo = mid+1;
651 return false;
654 bool isNonASCIIIdentifierStart(wchar c)
656 return unicodeLookup(&unicodeLetter, c);
659 bool isNonASCIIIdentifierSubsequent(wchar c)
661 return unicodeLookup(&unicodeLetter, c) ||
662 unicodeLookup(&identifier_subsequent, c);
665 bool isUnicodeLetter(wchar c)
667 return unicodeLookup(&unicodeLetter, c);
670 bool isUnicodeDigit(wchar c)
672 // FIXME: not quite right, we want a proper lookup table for unicodeDigit
673 return c >= '0' && c <= '9';
678 #endif // VMCFG_EVAL