# Implement Array.splice method
# [vadmium-streams.git] / javascript / __init__.py
# blob 7c09de0e3029902fe2c05c4cab63a491dc64e66a
from io import StringIO
import re
import math

# Binary (and assignment) operators, grouped from tightest-binding to
# loosest-binding, per the ECMAScript expression grammar.
OP_PRECEDENCE = (
    ('*', '/', '%'), ('+', '-'),
    ('<<', '>>', '>>>'),
    ('<', '>', '<=', '>=', 'instanceof', 'in'),
    ('==', '!=', '===', '!=='),
    ('&',), ('^',), ('|',), ('&&',), ('||',),
    ('=', '*=', '/=', '%=', '+=', '-=',
        '<<=', '>>=', '>>>=', '&=', '^=', '|='),
    (',',),
)
# Rebind to a mapping of operator -> numeric precedence.  Group 0 maps to
# 0, later (looser) groups to increasingly negative numbers, so a higher
# value always means tighter binding.
OP_PRECEDENCE = {op: -prec
    for [prec, ops] in enumerate(OP_PRECEDENCE)
    for op in ops
}
# Unary operators bind tighter than every binary operator above (all <= 0)
UNARY_PRECEDENCE = +1
# Identifier-like tokens that may not be used as plain identifiers
# (ECMAScript 3 keywords, future-reserved words, and literal keywords).
RESERVED_WORDS = {
    # Keywords:
    'break', 'else', 'new', 'var', 'case', 'finally', 'return',
    'void', 'catch', 'for', 'switch', 'while', 'continue',
    'function', 'this', 'with', 'default', 'if', 'throw', 'delete',
    'in', 'try', 'do', 'instanceof', 'typeof',
    # Future-reserved words:
    'abstract', 'enum', 'int', 'short', 'boolean', 'export',
    'interface', 'static', 'byte', 'extends', 'long', 'super',
    'char', 'final', 'native', 'synchronized', 'class', 'float',
    'package', 'throws', 'const', 'goto', 'private', 'transient',
    'debugger', 'implements', 'protected', 'volatile', 'double',
    'import', 'public',
    # Literals:
    'null', 'true', 'false',
}
# "$", "_", plus L and Nl categories from UnicodeData-3.0.1.txt
# NOTE: only the first fragment is a raw string; the rest deliberately use
# "\uXXXX" escapes so the actual characters end up in the character class.
IDENTIFIER_START = r'$_' \
    'A-Za-z\u00AA\u00B5\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8' \
    '\u02BB-\u02C1\u02D0\u02D1\u02E0-\u02E4\u02EE\u037A\u0386' \
    '\u0388-\u0481\u048C-\u0559\u0561-\u0587\u05D0-\u05F2\u0621-\u064A' \
    '\u0671-\u06D3\u06D5\u06E5-\u06E6\u06FA-\u06FC\u0710\u0712-\u072C' \
    '\u0780-\u07A5\u0905-\u0939\u093D\u0950\u0958-\u0961\u0985-\u09B9' \
    '\u09DC-\u09E1\u09F0-\u09F1\u0A05-\u0A39\u0A59-\u0A5E\u0A72-\u0A74' \
    '\u0A85-\u0AB9\u0ABD\u0AD0-\u0AE0\u0B05-\u0B39\u0B3D\u0B5C-\u0B61' \
    '\u0B85-\u0BB9\u0C05-\u0C39\u0C60-\u0C61\u0C85-\u0CB9\u0CDE-\u0CE1' \
    '\u0D05-\u0D39\u0D60-\u0D61\u0D85-\u0DC6\u0E01-\u0E30\u0E32-\u0E33' \
    '\u0E40-\u0E46\u0E81-\u0EB0\u0EB2-\u0EB3\u0EBD-\u0EC6\u0EDC-\u0F00' \
    '\u0F40-\u0F6A\u0F88-\u0F8B\u1000-\u102A\u1050-\u1055\u10A0-\u10F6' \
    '\u1100-\u135A\u13A0-\u166C\u166F-\u1676\u1681-\u169A\u16A0-\u16EA' \
    '\u1780-\u17B3\u1820-\u18A8\u1E00-\u1FBC\u1FBE\u1FC2-\u1FCC' \
    '\u1FD0-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FFC\u207F\u2102\u2107' \
    '\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D' \
    '\u212F-\u2131\u2133-\u2139\u2160-\u2183\u3005-\u3007\u3021-\u3029' \
    '\u3031-\u3035\u3038-\u303A\u3041-\u3094\u309D-\u30FA\u30FC-\u318E' \
    '\u31A0-\u31B7\u3400-\uA48C\uAC00-\uD7A3\uF900-\uFB1D\uFB1F-\uFB28' \
    '\uFB2A-\uFD3D\uFD50-\uFDFB\uFE70-\uFEFC\uFF21-\uFF3A\uFF41-\uFF5A' \
    '\uFF66-\uFFDC'

# "$" plus L, Nl, Mn, Mc, Nd and Pc (includes "_") categories
IDENTIFIER_PART = r'$\w' \
    '\u00AA\u00B5\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8' \
    '\u02BB-\u02C1\u02D0-\u02D1\u02E0-\u02E4\u02EE-\u0362\u037A\u0386' \
    '\u0388-\u0481\u0483-\u0486\u048C-\u0559\u0561-\u0587\u0591-\u05BD' \
    '\u05BF\u05C1-\u05C2\u05C4-\u05F2\u0621-\u0669\u0670-\u06D3' \
    '\u06D5-\u06DC\u06DF-\u06E8\u06EA-\u06FC\u0710-\u0963\u0966-\u096F' \
    '\u0981-\u09F1\u0A02-\u0B6F\u0B82-\u0BEF\u0C01-\u0DF3\u0E01-\u0E3A' \
    '\u0E40-\u0E4E\u0E50-\u0E59\u0E81-\u0F00\u0F18-\u0F19\u0F20-\u0F29' \
    '\u0F35\u0F37\u0F39\u0F3E-\u0F84\u0F86-\u0FBC\u0FC6\u1000-\u1049' \
    '\u1050-\u10F6\u1100-\u135A\u1369-\u1371\u13A0-\u166C\u166F-\u1676' \
    '\u1681-\u169A\u16A0-\u16EA\u1780-\u17D3\u17E0-\u17E9\u1810-\u1FBC' \
    '\u1FBE\u1FC2-\u1FCC\u1FD0-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FFC' \
    '\u203F-\u2040\u207F\u20D0-\u20DC\u20E1\u2102\u2107\u210A-\u2113' \
    '\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2131' \
    '\u2133-\u2139\u2160-\u2183\u3005-\u3007\u3021-\u302F\u3031-\u3035' \
    '\u3038-\u303A\u3041-\u309A\u309D-\u318E\u31A0-\u31B7\u3400-\uA48C' \
    '\uAC00-\uD7A3\uF900-\uFB28\uFB2A-\uFD3D\uFD50-\uFE23\uFE33-\uFE34' \
    '\uFE4D-\uFE4F\uFE70-\uFEFC\uFF10-\uFF19\uFF21-\uFF3A\uFF3F' \
    '\uFF41-\uFF5A\uFF65-\uFFDC'
# Rebind as an alternation: either one character from the class above, or
# a "\uXXXX" Unicode escape sequence
IDENTIFIER_PART = fr'[{IDENTIFIER_PART}]|\\u[\da-fA-F]{{4}}'

# A complete identifier name: one start character (or escape) followed by
# any number of part characters (or escapes)
IDENTIFIER_NAMES = re.compile(fr'''
    ([{IDENTIFIER_START}] | \\u[\da-fA-F]{{4}})
    ({IDENTIFIER_PART})*
    ''', re.VERBOSE | re.ASCII)
is_identifier = IDENTIFIER_NAMES.fullmatch
def parse(reader):
    """Parse JavaScript source from *reader* and yield top-level statements.

    *reader* needs only a ``read(size)`` method.  This is a generator: the
    first statement is forced eagerly, so a program with no statements
    raises StopIteration from ``next(stmts)`` immediately.
    """
    parser = Parser()
    parser.read = reader.read

    # Build a str.translate() table that deletes Unicode format (Cf)
    # characters, which the lexer must ignore; see Parser.fill()
    parser.REMOVE_Cf = dict()
    for [first, last] in Cf_RANGES:
        for c in range(first, last + 1):
            parser.REMOVE_Cf[c] = None

    parser.buffer = ''
    parser.pos = 0
    parser.eof = False

    stmts = parser.parse_block(func_defs=True)
    yield next(stmts)  # Ensure at least one statement
    yield from stmts
    # The whole input must have been consumed
    assert parser.token is None
# Inclusive (first, last) code point ranges of the Cf (format) category,
# stripped from the input before tokenizing.
Cf_RANGES = (  # From UnicodeData-2.1.9.txt
    (0x200C, 0x200F),
    (0x202A, 0x202E),
    (0x206A, 0x206F),
    (0xFEFF, 0xFEFF),
)
class Parser:
    """Recursive-descent parser/tokenizer for ECMAScript 3 source.

    Instances are set up externally (see parse()): the caller attaches
    "read" (a read(size) callable), "REMOVE_Cf" (a str.translate table),
    "buffer", "pos" and "eof" before calling parse_block().

    The current token lives in self.token and is one of:
    None (end of input), a float (number literal), a str (string
    literal), or a tuple ('identifier'|'reserved word'|'punctuator', text)
    or ('regex', pattern, flags).
    """

    def parse_block(self, func_defs=False):
        """Yield (labels, statement) pairs until "}" or end of input.

        Function declarations are accepted only when func_defs is true
        (i.e. at program or function-body top level).
        """
        self.read_token(self.parse_regex)
        while self.token not in {None, ('punctuator', '}')}:
            if self.token == ('reserved word', 'function'):
                assert func_defs
                stmt = self.parse_function()
                assert stmt[1] is not None  # Declarations must be named
                yield ((), stmt)
                self.read_token(self.parse_regex)
            else:
                yield self.parse_statement()

    def read_token(self, slash=None):
        """Scan the next token into self.token.

        *slash* chooses the handler for a "/" character (parse_div or
        parse_regex), which depends on grammatical context; it is stored
        as self.handle_slash so the dispatch below finds it by the regex
        group name.  Also sets self.new_line when a line terminator was
        skipped (needed for automatic semicolon insertion).
        """
        self.handle_slash = slash
        self.new_line = False
        while True:
            match = Parser.TOKENS.search(self.buffer, self.pos)
            if not match:
                if self.eof:
                    self.token = None
                    break
                self.buffer = str()
                self.fill()
                continue
            self.pos = match.end()
            if self.pos == len(self.buffer) and not self.eof:
                # The match touches the end of the buffer, so the token
                # might continue past it; refill and rescan
                self.buffer = self.buffer[match.start():]
                self.fill()
                continue
            # Dispatch on the regex group name, e.g. handle_number
            handle = getattr(self, f'handle_{match.lastgroup}')
            self.token = handle(match.group())
            if self.token is not None:
                break

    def parse_statement(self):
        """Parse one statement; return (labels, statement-tuple)."""
        labels = set()
        while True:
            if self.token == ('punctuator', '{'):
                stmt = tuple(self.parse_block())
                assert self.token == ('punctuator', '}')
                self.read_token(self.parse_regex)
                return (labels, ('block', stmt))
            elif self.token == ('reserved word', 'return'):
                self.read_token(self.parse_regex)
                stmt = ('return', ('undefined',))
                # "return" with a value must be on the same line (ASI)
                if not self.new_line and self.token not in \
                        {('punctuator', ';'), ('punctuator', '}'), None}:
                    stmt = ('return', self.parse_expr())
            elif self.token == ('reserved word', 'if'):
                self.read_token()
                assert self.token == ('punctuator', '(')
                self.read_token(self.parse_regex)
                expr = self.parse_expr()
                assert self.token == ('punctuator', ')')
                self.read_token(self.parse_regex)
                if_stmt = self.parse_statement()
                if self.token == ('reserved word', 'else'):
                    self.read_token(self.parse_regex)
                    else_stmt = self.parse_statement()
                else:
                    # Missing "else" becomes an empty block
                    else_stmt = (frozenset(), ('block', ()))
                return (labels, ('if', expr, if_stmt, else_stmt))
            elif self.token == ('reserved word', 'var'):
                stmt = self.parse_var()
            elif self.token == ('reserved word', 'try'):
                block = self.parse_curly_block()
                if self.token == ('reserved word', 'catch'):
                    self.read_token()
                    assert self.token == ('punctuator', '(')
                    self.read_token()
                    [type, id] = self.token
                    assert type == 'identifier'
                    self.read_token()
                    assert self.token == ('punctuator', ')')
                    catch = self.parse_curly_block()
                else:
                    id = None
                    catch = ()
                if self.token == ('reserved word', 'finally'):
                    finally_block = self.parse_curly_block()
                else:
                    # Without "finally" there must be a "catch" clause
                    assert catch != ()
                    finally_block = []
                return (labels, ('try', block, id, catch, finally_block))
            elif self.token == ('reserved word', 'for'):
                self.read_token()
                assert self.token == ('punctuator', '(')
                self.read_token(self.parse_regex)
                # "in" is ambiguous here: it could start a for-in loop, so
                # it is disallowed as an operator in the init expression
                if self.token == ('reserved word', 'var'):
                    init = self.parse_var(disallow='in')
                elif self.token == ('punctuator', ';'):
                    init = ('undefined',)
                else:
                    init = self.parse_expr(disallow='in')
                if self.token == ('punctuator', ';'):
                    # Classic for(init; test; inc) loop
                    self.read_token(self.parse_regex)
                    if self.token == ('punctuator', ';'):
                        test = True
                    else:
                        test = self.parse_expr()
                        assert self.token == ('punctuator', ';')
                    self.read_token(self.parse_regex)
                    if self.token == ('punctuator', ')'):
                        inc = ('undefined',)
                    else:
                        inc = self.parse_expr()
                    stmt = ('for', init, test, inc)
                else:
                    # for(x in obj) loop
                    assert self.token == ('reserved word', 'in')
                    self.read_token(self.parse_regex)
                    expr = self.parse_expr()
                    stmt = ('for in', init, expr)
                assert self.token == ('punctuator', ')')
                self.read_token(self.parse_regex)
                body = self.parse_statement()
                return (labels, (*stmt, body))
            elif self.token == ('reserved word', 'throw'):
                self.read_token(self.parse_regex)
                # No line break allowed between "throw" and its operand
                assert not self.new_line
                stmt = ('throw', self.parse_expr())
            elif self.token in {('reserved word', 'break'),
                    ('reserved word', 'continue')}:
                stmt = self.token[1]
                self.read_token(self.parse_regex)
                id = None
                # Optional label, which must be on the same line (ASI)
                if not self.new_line and self.token not in \
                        {('punctuator', ';'), ('punctuator', '}'), None}:
                    id = self.parse_expr()
                    assert isinstance(id, tuple) and id[0] == 'identifier'
                    id = id[1]
                stmt = (stmt, id)
            elif self.token == ('reserved word', 'do'):
                self.read_token(self.parse_regex)
                stmt = self.parse_statement()
                assert self.token == ('reserved word', 'while')
                self.read_token()
                assert self.token == ('punctuator', '(')
                self.read_token(self.parse_regex)
                stmt = ('do', stmt, self.parse_expr())
                assert self.token == ('punctuator', ')')
                self.read_token(self.parse_regex)
            elif self.token == ('reserved word', 'switch'):
                self.read_token()
                assert self.token == ('punctuator', '(')
                self.read_token(self.parse_regex)
                expr = self.parse_expr()
                assert self.token == ('punctuator', ')')
                self.read_token()
                assert self.token == ('punctuator', '{')
                default = False
                cases = list()
                self.read_token()
                while self.token != ('punctuator', '}'):
                    if self.token == ('reserved word', 'case'):
                        self.read_token(self.parse_regex)
                        case = self.parse_expr()
                    else:
                        assert self.token == ('reserved word', 'default')
                        assert not default  # At most one "default" clause
                        case = ('undefined',)
                        self.read_token()
                        default = True
                    assert self.token == ('punctuator', ':')
                    self.read_token()
                    stmts = list()
                    while self.token not in {
                            ('reserved word', 'case'),
                            ('reserved word', 'default'),
                            ('punctuator', '}'),
                            }:
                        stmts.append(self.parse_statement())
                    cases.append((case, stmts))
                self.read_token(self.parse_regex)
                return (labels, ('switch', expr, cases))
            elif self.token == ('punctuator', ';'):
                stmt = ('block', ())  # Empty statement
            else:
                assert self.token != ('reserved word', 'function')
                stmt = self.parse_expr()
                if self.token == ('punctuator', ':'):
                    # "name:" label prefix; loop back for the statement
                    assert isinstance(stmt, tuple) and stmt[0] == 'identifier'
                    assert stmt[1] not in labels
                    labels.add(stmt[1])
                    self.read_token(self.parse_regex)
                    continue
            break
        if self.token == ('punctuator', ';'):
            self.read_token(self.parse_regex)
        else:
            # Automatic semicolon insertion: only valid at a line break,
            # before "}", or at end of input
            assert self.new_line or self.token in {('punctuator', '}'), None}
        return (labels, stmt)

    def parse_var(self, **kw):
        """Parse a "var" declaration list; return ('var', [(name, value)])."""
        vars = list()
        while True:
            self.read_token()
            [type, id] = self.token
            assert type == 'identifier'
            self.read_token()
            if self.token == ('punctuator', '='):
                self.read_token(self.parse_regex)
                value = self.parse_expr(self.COMMA_PRECEDENCE, **kw)
            else:
                value = ('undefined',)
            vars.append((id, value))
            if self.token != ('punctuator', ','):
                break
        return ('var', vars)

    def parse_curly_block(self):
        """Parse a mandatory "{ ... }" block; return its statement list."""
        self.read_token()
        assert self.token == ('punctuator', '{')
        block = list(self.parse_block())
        assert self.token == ('punctuator', '}')
        self.read_token(self.parse_regex)
        return block

    def parse_expr(self, precedence=-math.inf, *,
            disallow=None, disallow_call=False):
        """Parse an expression using precedence climbing.

        Only operators binding tighter than *precedence* are consumed.
        *disallow* names one operator to refuse (used for "in" inside a
        for-loop head); *disallow_call* stops "(" from being treated as a
        call (used for the operand of "new").
        """
        if self.token == ('reserved word', 'new'):
            self.read_token(self.parse_regex)
            # The constructor expression itself may not contain a call
            expr = self.parse_expr(precedence=+math.inf, disallow_call=True)
            if self.token == ('punctuator', '('):
                expr = ('new', expr, self.parse_args())
                self.read_token(self.parse_div)
            else:
                expr = ('new', expr, ())
        elif self.token in {
                ('reserved word', 'delete'), ('reserved word', 'void'),
                ('reserved word', 'typeof'),
                ('punctuator', '++'), ('punctuator', '--'),
                ('punctuator', '+'), ('punctuator', '-'),
                ('punctuator', '~'), ('punctuator', '!'),
                }:
            expr = f'prefix {self.token[1]}'
            self.read_token(self.parse_regex)
            expr = (expr, self.parse_expr(precedence=UNARY_PRECEDENCE))
        else:
            # Primary expressions
            if self.token == ('punctuator', '('):
                self.read_token(self.parse_regex)
                expr = self.parse_expr()
                assert self.token == ('punctuator', ')')
            elif self.token == ('punctuator', '['):
                # Array literal; elisions become ('undefined',)
                expr = list()
                while True:
                    self.read_token(self.parse_regex)
                    if self.token == ('punctuator', ']'):
                        break
                    if self.token == ('punctuator', ','):
                        expr.append(('undefined',))
                        continue
                    expr.append(self.parse_expr(self.COMMA_PRECEDENCE))
                    if self.token != ('punctuator', ','):
                        assert self.token == ('punctuator', ']')
                        break
            elif self.token == ('reserved word', 'function'):
                expr = self.parse_function()
            elif self.token == ('punctuator', '{'):
                # Object literal
                expr = list()
                self.read_token()
                if self.token != ('punctuator', '}'):
                    while True:
                        name = self.token
                        # Property names may be numbers, strings,
                        # identifiers or reserved words
                        if not isinstance(name, (float, str)):
                            assert isinstance(name, tuple)
                            [token, name] = name
                            assert token in {'identifier', 'reserved word'}
                        self.read_token()
                        assert self.token == ('punctuator', ':')
                        self.read_token(self.parse_regex)
                        value = self.parse_expr(self.COMMA_PRECEDENCE)
                        expr.append((name, value))
                        if self.token != ('punctuator', ','):
                            break
                        self.read_token()
                assert self.token == ('punctuator', '}')
                expr = ('object', expr)
            elif isinstance(self.token, (float, str)) \
                    or self.token[0] in {'identifier', 'regex'}:
                expr = self.token
            else:
                LITERALS = {
                    ('reserved word', 'null'): None,
                    ('reserved word', 'true'): True,
                    ('reserved word', 'false'): False,
                    ('reserved word', 'this'): ('this',),
                }
                expr = LITERALS[self.token]
            self.read_token(self.parse_div)
        # Postfix and binary operators
        while True:
            if self.token == ('punctuator', '.'):
                self.read_token()
                [type, member] = self.token
                assert type in {'identifier', 'reserved word'}
                expr = ('property', expr, member)
            elif self.token == ('punctuator', '(') and not disallow_call:
                expr = ('call', expr, self.parse_args())
            elif self.token == ('punctuator', '['):
                self.read_token(self.parse_regex)
                expr = ('property', expr, self.parse_expr())
                assert self.token == ('punctuator', ']')
            elif self.token == ('punctuator', '?'):
                # Conditional operator, same precedence level as assignment
                if precedence > self.ASSIGN_PRECEDENCE:
                    return expr
                self.read_token(self.parse_regex)
                a = self.parse_expr(self.COMMA_PRECEDENCE)
                assert self.token == ('punctuator', ':')
                self.read_token(self.parse_regex)
                b = self.parse_expr(self.COMMA_PRECEDENCE, disallow=disallow)
                expr = ('?', expr, a, b)
                continue
            elif not self.new_line \
                    and self.token in \
                        {('punctuator', '++'), ('punctuator', '--')} \
                    and precedence <= UNARY_PRECEDENCE:
                # Postfix ++/-- must follow on the same line (ASI)
                expr = (f'postfix {self.token[1]}', expr)
            elif self.token is None \
                    or self.token[0] not in {'punctuator', 'reserved word'} \
                    or self.token[1] not in OP_PRECEDENCE \
                    or self.token[1] == disallow:
                return expr
            else:
                op = self.token[1]
                # Assignments are right-associative; others are left-associative
                rhs_precedence = OP_PRECEDENCE[op]
                if rhs_precedence < precedence or rhs_precedence \
                        == precedence != self.ASSIGN_PRECEDENCE:
                    return expr
                self.read_token(self.parse_regex)
                rhs = self.parse_expr(rhs_precedence, disallow=disallow)
                expr = (op, expr, rhs)
                continue
            self.read_token(self.parse_div)

    def parse_args(self):
        """Parse a "( arg, ... )" list after its "("; return the list.

        Leaves the closing ")" as the current token.
        """
        args = list()
        self.read_token(self.parse_regex)
        if self.token != ('punctuator', ')'):
            while True:
                args.append(self.parse_expr(self.COMMA_PRECEDENCE))
                if self.token == ('punctuator', ')'):
                    break
                assert self.token == ('punctuator', ',')
                self.read_token(self.parse_regex)
        return args

    ASSIGN_PRECEDENCE = OP_PRECEDENCE['=']
    COMMA_PRECEDENCE = OP_PRECEDENCE[',']

    def parse_function(self):
        """Parse a function expression or declaration after "function".

        Returns ('function', name-or-None, [param names], [body stmts]).
        """
        self.read_token()
        if self.token[0] == 'identifier':
            name = self.token[1]
            self.read_token()
        else:
            name = None
        assert self.token == ('punctuator', '(')

        params = list()
        self.read_token()
        if self.token != ('punctuator', ')'):
            while True:
                assert self.token[0] == 'identifier'
                params.append(self.token[1])
                self.read_token()
                if self.token == ('punctuator', ')'):
                    break
                assert self.token == ('punctuator', ',')
                self.read_token()

        self.read_token()
        assert self.token == ('punctuator', '{')
        body = list(self.parse_block(func_defs=True))
        assert self.token == ('punctuator', '}')
        return ('function', name, params, body)

    BUFSIZE = 0x10000

    IDENTIFIER_START_CHAR = re.compile(fr'[{IDENTIFIER_START}]|\\')
    LINE_TERMINATORS = '\n\r\u2028\u2029'
    # Master token pattern; each alternative's group name selects the
    # handle_* method called by read_token()
    TOKENS = re.compile(
        fr'(?P<line_terminator> [{LINE_TERMINATORS}])'
        r'| (?P<singleline_comment> //)'
        r'| (?P<multiline_comment> /\*)'
        fr'| (?P<identifier> {IDENTIFIER_START_CHAR.pattern})'
        r'| (?P<number> [\d.])'
        r'| (?P<punctuator>'
            r'\+\+ | -- | && | \|\|'
            r'| (<< | >>>? | [=!]= | [-<>+*%&^|])=?'
            r'| [][{}().;,!~?:=])'
        r'| (?P<slash> /)'
        '| (?P<string> ["\'])'
        # Anything else other than whitespace:
        '| (?P<illegal_codepoint> [^\t\v\f \xA0\u2000-\u200B\u3000])',
        re.DOTALL | re.VERBOSE | re.ASCII)

    def handle_line_terminator(self, token):
        # Returning None makes read_token() keep scanning
        self.new_line = True

    LINE_TERMINATOR_PATTERN \
        = re.compile(fr'[{LINE_TERMINATORS}]')

    def handle_singleline_comment(self, start):
        # Skip to the end of the line; the terminator itself is left in
        # the buffer so new_line gets set by the next scan
        self.read_matches(fr'[^{self.LINE_TERMINATORS}]', 1)

    def handle_multiline_comment(self, start):
        while True:
            end = self.buffer.find('*/', self.pos)
            if end >= 0:
                break
            if self.LINE_TERMINATOR_PATTERN.search(self.buffer, self.pos):
                self.new_line = True
            if self.eof:
                raise EOFError('Unterminated multi-line comment')
            # Keep the last character in case it is the "*" of "*/"
            self.buffer = self.buffer[max(len(self.buffer) - 1, self.pos):]
            self.fill()
        last = self.pos
        self.pos = end + 2
        # A comment spanning a line break counts as a line break (ASI)
        if self.LINE_TERMINATOR_PATTERN.search(self.buffer, last, end):
            self.new_line = True

    def handle_identifier(self, token):
        token += self.read_matches(IDENTIFIER_PART, 6, re.ASCII)
        if token in RESERVED_WORDS:
            return ('reserved word', token)
        # Escapes may not be used to smuggle in a reserved word
        token = self.UNICODE_ESCAPES.sub(self.decode_unicode, token)
        assert is_identifier(token)
        assert token not in RESERVED_WORDS
        return ('identifier', token)

    UNICODE_ESCAPES = re.compile(r'\\u(.{4})', re.ASCII)

    @staticmethod
    def decode_unicode(match):
        return chr(int(match.group(1), 16))

    def handle_number(self, token):
        """Scan a numeric literal (or a lone "." punctuator)."""
        number = None
        if token == '.':
            token += self.read_matches(r'\d', 1, re.ASCII)
            if token == '.':
                # No digits followed: it was the member-access operator
                return ('punctuator', '.')
        else:
            if token == '0':
                if self.read_match(*self.X):
                    # Hex literal
                    number = self.read_matches(r'[\da-fA-F]', 1, re.ASCII)
                    assert number > ''
                    number = int(number, 16)
                    # Values beyond the float range round to infinity
                    if number >= 2**1024 - 2**(1024 - 54):
                        number = math.inf
            else:  # Nonzero digit
                token += self.read_matches(r'\d', 1, re.ASCII)
            if number is None and self.read_match(*self.POINT):
                token += '.' + self.read_matches(r'\d', 1, re.ASCII)
        if number is None:
            prefix = self.read_match(*self.EXPONENT_PREFIX)
            if prefix:
                exp = self.read_matches(r'\d', 1, re.ASCII)
                assert exp > ''
                token += prefix.group() + exp
            number = token
        # A number may not run directly into an identifier, e.g. "3in"
        assert not self.read_match(self.IDENTIFIER_START_CHAR, 1)
        return float(number)

    X = (re.compile(r'[xX]'), 1)
    POINT = (re.compile(r'\.'), 1)
    EXPONENT_PREFIX = (re.compile(r'[eE][-+]?'), 2)

    def handle_punctuator(self, token):
        return ('punctuator', token)

    def parse_div(self, token):
        # "/" in operator position: division or divide-assign
        if self.read_match(*self.EQUALS):
            return ('punctuator', '/=')
        return ('punctuator', '/')

    EQUALS = (re.compile(r'='), 1)

    def parse_regex(self, token):
        """Scan a regular expression literal after its leading "/"."""
        self.pos -= len(token) - 1
        pattern = list()
        while True:
            pattern.append(self.read_matches(
                fr'[^{self.LINE_TERMINATORS}{self.BACKSLASH}/[]'
                fr'|\\[^{self.LINE_TERMINATORS}]', 2))
            end = self.read_match(*self.SLASH_OR_CLASS)
            if end.group() != '[':
                break
            # "/" loses its special meaning inside a character class
            pattern.extend(('[', self.read_matches(
                fr'[^{self.LINE_TERMINATORS}\]{self.BACKSLASH}]'
                fr'|\\[^{self.LINE_TERMINATORS}]', 2), ']'))
            end = self.read_match(*self.CLASS_END)
            assert end
        flags = self.read_matches(IDENTIFIER_PART, 6, re.ASCII)
        return ('regex', ''.join(pattern), flags)

    SLASH_OR_CLASS = (re.compile(r'[/[]'), 1)
    CLASS_END = (re.compile(r']'), 1)

    def handle_string(self, quote):
        """Scan a string literal after its opening quote; return its value."""
        s = self.read_matches(
            fr'[^{quote}{self.BACKSLASH}{self.LINE_TERMINATORS}]'
            fr'|\\[^1-9xu{self.LINE_TERMINATORS}]'
            r'|\\x[\da-fA-F]{2}'
            r'|\\u[\da-fA-F]{4}', 6, re.ASCII)
        end = self.read_match(re.compile(quote), 1)
        assert end
        # "\0" followed by a digit would be an octal escape; reject it
        assert not self.INVALID_NUL.search(s)
        return self.ESCAPE_SEQUENCES.sub(self.decode_escape, s)

    # Backslash as an octal escape, safe to embed in a character class
    BACKSLASH = '\\{:03o}'.format(ord('\\'))
    INVALID_NUL = re.compile(r'\\0\d', re.ASCII)
    ESCAPE_SEQUENCES = re.compile(r'\\(x.{2}|u.{4}|.)', re.ASCII)

    @staticmethod
    def decode_escape(match):
        match = match.group(1)
        if match.startswith(('x', 'u')):
            c = chr(int(match[1:], 16))
        else:
            escapes = {
                'n': '\n', 'b': '\b', 'f': '\f', 'r': '\r', 't': '\t',
                'v': '\v', '0': '\x00'}
            # Any other escaped character stands for itself
            c = escapes.get(match, match)
        return c

    def handle_illegal_codepoint(self, codepoint):
        raise ValueError(f'Illegal codepoint: {codepoint!r}')

    def read_matches(self, pattern, itemlen, flags=0):
        """Consume and return a run of matches of *pattern*.

        *itemlen* is the longest possible single match, used to decide
        when the buffer must be refilled before scanning can stop.
        """
        pattern = re.compile(r'(?:{})*'.format(pattern), flags)
        result = StringIO()
        while True:
            match = pattern.match(self.buffer, self.pos)
            result.write(match.group())
            self.pos = match.end()
            if len(self.buffer) - self.pos >= itemlen or self.eof:
                break
            self.buffer = self.buffer[self.pos:]
            self.fill()
        return result.getvalue()

    def read_match(self, pattern, maxlen):
        """Consume and return a single match of up to *maxlen* characters.

        Returns None (and consumes nothing) when the pattern fails.
        """
        while self.pos + maxlen > len(self.buffer) and not self.eof:
            self.buffer = self.buffer[self.pos:]
            self.fill()
        match = pattern.match(self.buffer, self.pos)
        if match:
            self.pos = match.end()
        return match

    def fill(self):
        """Top up self.buffer from self.read, stripping Cf characters."""
        size = self.BUFSIZE - len(self.buffer)
        new = self.read(size)
        # A short read signals end of input
        self.eof = len(new) < size
        self.buffer += new.translate(self.REMOVE_Cf)
        self.pos = 0