from io import StringIO
import math
import re

# Binary operators, tightest-binding row first
# (the shift-operator and comma rows are restored from lookups below):
OP_PRECEDENCE = (
    ('*', '/', '%'), ('+', '-'),
    ('<<', '>>', '>>>'),
    ('<', '>', '<=', '>=', 'instanceof', 'in'),
    ('==', '!=', '===', '!=='),
    ('&',), ('^',), ('|',), ('&&',), ('||',),
    ('=', '*=', '/=', '%=', '+=', '-=',
        '<<=', '>>=', '>>>=', '&=', '^=', '|='),
    (',',),
)
OP_PRECEDENCE = {op: -prec
    for [prec, set] in enumerate(OP_PRECEDENCE)
    for op in set}

# Unary operators bind tighter than any binary operator above (whose
# precedences are 0, -1, -2, ...), so any positive value works here;
# the original constant is not preserved and 1 is assumed.
UNARY_PRECEDENCE = 1
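
# The table above maps each binary operator to a non-positive precedence:
# row 0 ('*', '/', '%') is highest at 0 and each later row is more negative.
# With the rows as restored above, for example:
#   OP_PRECEDENCE['*'] == 0
#   OP_PRECEDENCE['+'] == -1
#   OP_PRECEDENCE[','] is the most negative (binds loosest)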

RESERVED_WORDS = {
    'break', 'else', 'new', 'var', 'case', 'finally', 'return',
    'void', 'catch', 'for', 'switch', 'while', 'continue',
    'function', 'this', 'with', 'default', 'if', 'throw', 'delete',
    'in', 'try', 'do', 'instanceof', 'typeof',
    # Future-reserved words:
    'abstract', 'enum', 'int', 'short', 'boolean', 'export',
    'interface', 'static', 'byte', 'extends', 'long', 'super',
    'char', 'final', 'native', 'synchronized', 'class', 'float',
    'package', 'throws', 'const', 'goto', 'private', 'transient',
    'debugger', 'implements', 'protected', 'volatile', 'double',
    'import', 'public',
    # Literals that are also reserved:
    'null', 'true', 'false',
}
38 # "$", "_", plus L and Nl categories from UnicodeData-3.0.1.txt
39 IDENTIFIER_START
= r
'$_' \
40 'A-Za-z\u00AA\u00B5\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8' \
41 '\u02BB-\u02C1\u02D0\u02D1\u02E0-\u02E4\u02EE\u037A\u0386' \
42 '\u0388-\u0481\u048C-\u0559\u0561-\u0587\u05D0-\u05F2\u0621-\u064A' \
43 '\u0671-\u06D3\u06D5\u06E5-\u06E6\u06FA-\u06FC\u0710\u0712-\u072C' \
44 '\u0780-\u07A5\u0905-\u0939\u093D\u0950\u0958-\u0961\u0985-\u09B9' \
45 '\u09DC-\u09E1\u09F0-\u09F1\u0A05-\u0A39\u0A59-\u0A5E\u0A72-\u0A74' \
46 '\u0A85-\u0AB9\u0ABD\u0AD0-\u0AE0\u0B05-\u0B39\u0B3D\u0B5C-\u0B61' \
47 '\u0B85-\u0BB9\u0C05-\u0C39\u0C60-\u0C61\u0C85-\u0CB9\u0CDE-\u0CE1' \
48 '\u0D05-\u0D39\u0D60-\u0D61\u0D85-\u0DC6\u0E01-\u0E30\u0E32-\u0E33' \
49 '\u0E40-\u0E46\u0E81-\u0EB0\u0EB2-\u0EB3\u0EBD-\u0EC6\u0EDC-\u0F00' \
50 '\u0F40-\u0F6A\u0F88-\u0F8B\u1000-\u102A\u1050-\u1055\u10A0-\u10F6' \
51 '\u1100-\u135A\u13A0-\u166C\u166F-\u1676\u1681-\u169A\u16A0-\u16EA' \
52 '\u1780-\u17B3\u1820-\u18A8\u1E00-\u1FBC\u1FBE\u1FC2-\u1FCC' \
53 '\u1FD0-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FFC\u207F\u2102\u2107' \
54 '\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D' \
55 '\u212F-\u2131\u2133-\u2139\u2160-\u2183\u3005-\u3007\u3021-\u3029' \
56 '\u3031-\u3035\u3038-\u303A\u3041-\u3094\u309D-\u30FA\u30FC-\u318E' \
57 '\u31A0-\u31B7\u3400-\uA48C\uAC00-\uD7A3\uF900-\uFB1D\uFB1F-\uFB28' \
58 '\uFB2A-\uFD3D\uFD50-\uFDFB\uFE70-\uFEFC\uFF21-\uFF3A\uFF41-\uFF5A' \
61 # "$" plus L, Nl, Mn, Mc, Nd and Pc (includes "_") categories
62 IDENTIFIER_PART
= r
'$\w' \
63 '\u00AA\u00B5\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8' \
64 '\u02BB-\u02C1\u02D0-\u02D1\u02E0-\u02E4\u02EE-\u0362\u037A\u0386' \
65 '\u0388-\u0481\u0483-\u0486\u048C-\u0559\u0561-\u0587\u0591-\u05BD' \
66 '\u05BF\u05C1-\u05C2\u05C4-\u05F2\u0621-\u0669\u0670-\u06D3' \
67 '\u06D5-\u06DC\u06DF-\u06E8\u06EA-\u06FC\u0710-\u0963\u0966-\u096F' \
68 '\u0981-\u09F1\u0A02-\u0B6F\u0B82-\u0BEF\u0C01-\u0DF3\u0E01-\u0E3A' \
69 '\u0E40-\u0E4E\u0E50-\u0E59\u0E81-\u0F00\u0F18-\u0F19\u0F20-\u0F29' \
70 '\u0F35\u0F37\u0F39\u0F3E-\u0F84\u0F86-\u0FBC\u0FC6\u1000-\u1049' \
71 '\u1050-\u10F6\u1100-\u135A\u1369-\u1371\u13A0-\u166C\u166F-\u1676' \
72 '\u1681-\u169A\u16A0-\u16EA\u1780-\u17D3\u17E0-\u17E9\u1810-\u1FBC' \
73 '\u1FBE\u1FC2-\u1FCC\u1FD0-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FFC' \
74 '\u203F-\u2040\u207F\u20D0-\u20DC\u20E1\u2102\u2107\u210A-\u2113' \
75 '\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2131' \
76 '\u2133-\u2139\u2160-\u2183\u3005-\u3007\u3021-\u302F\u3031-\u3035' \
77 '\u3038-\u303A\u3041-\u309A\u309D-\u318E\u31A0-\u31B7\u3400-\uA48C' \
78 '\uAC00-\uD7A3\uF900-\uFB28\uFB2A-\uFD3D\uFD50-\uFE23\uFE33-\uFE34' \
79 '\uFE4D-\uFE4F\uFE70-\uFEFC\uFF10-\uFF19\uFF21-\uFF3A\uFF3F' \
80 '\uFF41-\uFF5A\uFF65-\uFFDC'
IDENTIFIER_PART = fr'[{IDENTIFIER_PART}]|\\u[\da-fA-F]{{4}}'

IDENTIFIER_NAMES = re.compile(fr'''
    ([{IDENTIFIER_START}] | \\u[\da-fA-F]{{4}})
    (?: {IDENTIFIER_PART} )*
''', re.VERBOSE | re.ASCII)
is_identifier = IDENTIFIER_NAMES.fullmatch
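
# is_identifier() validates names after \uXXXX escapes are decoded;
# a doctest-style sketch:
#   >>> bool(is_identifier('foo$_1'))
#   True
#   >>> bool(is_identifier('1foo'))
#   False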


def parse(reader):  # wrapper reconstructed; the original name is not preserved
    parser = Parser()
    parser.read = reader.read
    # Build a str.translate() table that deletes every Cf character.
    parser.REMOVE_Cf = dict()
    for [first, last] in Cf_RANGES:
        for c in range(first, last + 1):
            parser.REMOVE_Cf[c] = None
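
    # str.translate() drops code points that map to None, so the table above
    # silently strips format-control characters from the input, e.g.:
    #   'a\u200Db'.translate({0x200D: None}) == 'ab'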

    stmts = parser.parse_block(func_defs=True)
    yield next(stmts)  # Ensure at least one statement
    yield from stmts
    assert parser.token is None
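

# A minimal usage sketch (wrapper name and statement tags as reconstructed
# above; the AST shapes follow the tuples built in Parser):
#   >>> from io import StringIO
#   >>> for stmt in parse(StringIO('var x = 1 + 2;')):
#   ...     print(stmt)
#   (frozenset(), ('var', [('x', ('+', 1.0, 2.0))]))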


Cf_RANGES = (  # From UnicodeData-2.1.9.txt
    # (The (first, last) code point pairs are not preserved in this fragment.)
)


class Parser:
    # Reconstructed initial scanner state; refilled by fill_buffer() below.
    buffer = ''
    pos = 0
    eof = False

    def parse_block(self, func_defs=False):
        self.read_token(self.parse_regex)
        while self.token not in {None, ('punctuator', '}')}:
            if self.token == ('reserved word', 'function'):
                assert func_defs  # declarations only where the caller allows
                stmt = self.parse_function()
                assert stmt[1] is not None  # declarations must be named
                yield (frozenset(), stmt)
                self.read_token(self.parse_regex)
            else:
                yield self.parse_statement()

    def read_token(self, slash=None):
        self.handle_slash = slash  # how to treat "/": division or regex
        self.new_line = False
        while True:
            match = Parser.TOKENS.search(self.buffer, self.pos)
            if match is None:  # nothing but whitespace left in the buffer
                if self.eof:
                    self.token = None
                    return
                self.buffer = ''
                self.pos = 0
                self.fill_buffer()
                continue
            self.pos = match.end()
            if self.pos == len(self.buffer) and not self.eof:
                # The match may be truncated at the end of the buffer;
                # refill and rescan from its start.
                self.buffer = self.buffer[match.start():]
                self.pos = 0
                self.fill_buffer()
                continue
            handle = getattr(self, f'handle_{match.lastgroup}')
            self.token = handle(match.group())
            if self.token is not None:
                return

    def parse_statement(self):
        labels = frozenset()
        while True:  # loop to accumulate any statement labels
            if self.token == ('punctuator', '{'):
                stmt = tuple(self.parse_block())
                assert self.token == ('punctuator', '}')
                self.read_token(self.parse_regex)
                return (labels, ('block', stmt))
            elif self.token == ('reserved word', 'return'):
                self.read_token(self.parse_regex)
                stmt = ('return', ('undefined',))
                if not self.new_line and self.token not in \
                        {('punctuator', ';'), ('punctuator', '}'), None}:
                    stmt = ('return', self.parse_expr())
            elif self.token == ('reserved word', 'if'):
                self.read_token(self.parse_regex)
                assert self.token == ('punctuator', '(')
                self.read_token(self.parse_regex)
                expr = self.parse_expr()
                assert self.token == ('punctuator', ')')
                self.read_token(self.parse_regex)
                if_stmt = self.parse_statement()
                if self.token == ('reserved word', 'else'):
                    self.read_token(self.parse_regex)
                    else_stmt = self.parse_statement()
                else:
                    else_stmt = (frozenset(), ('block', ()))
                return (labels, ('if', expr, if_stmt, else_stmt))
            elif self.token == ('reserved word', 'var'):
                stmt = self.parse_var()
            elif self.token == ('reserved word', 'try'):
                block = self.parse_curly_block()
                if self.token == ('reserved word', 'catch'):
                    self.read_token(self.parse_regex)
                    assert self.token == ('punctuator', '(')
                    self.read_token(self.parse_regex)
                    [type, id] = self.token
                    assert type == 'identifier'
                    self.read_token(self.parse_regex)
                    assert self.token == ('punctuator', ')')
                    catch = self.parse_curly_block()
                else:
                    id = None
                    catch = None
                if self.token == ('reserved word', 'finally'):
                    finally_block = self.parse_curly_block()
                else:
                    finally_block = None
                return (labels, ('try', block, id, catch, finally_block))
            elif self.token == ('reserved word', 'for'):
                self.read_token(self.parse_regex)
                assert self.token == ('punctuator', '(')
                self.read_token(self.parse_regex)
                if self.token == ('reserved word', 'var'):
                    init = self.parse_var(disallow='in')
                elif self.token == ('punctuator', ';'):
                    init = ('undefined',)
                else:
                    init = self.parse_expr(disallow='in')
                if self.token == ('punctuator', ';'):
                    self.read_token(self.parse_regex)
                    if self.token == ('punctuator', ';'):
                        test = True  # empty condition: loop forever
                    else:
                        test = self.parse_expr()
                        assert self.token == ('punctuator', ';')
                    self.read_token(self.parse_regex)
                    if self.token == ('punctuator', ')'):
                        inc = ('undefined',)
                    else:
                        inc = self.parse_expr()
                    stmt = ('for', init, test, inc)
                else:
                    assert self.token == ('reserved word', 'in')
                    self.read_token(self.parse_regex)
                    expr = self.parse_expr()
                    stmt = ('for in', init, expr)
                assert self.token == ('punctuator', ')')
                self.read_token(self.parse_regex)
                body = self.parse_statement()
                return (labels, (*stmt, body))
            elif self.token == ('reserved word', 'throw'):
                self.read_token(self.parse_regex)
                assert not self.new_line  # no line break before the operand
                stmt = ('throw', self.parse_expr())
            elif self.token in {('reserved word', 'break'),
                    ('reserved word', 'continue')}:
                word = self.token[1]
                self.read_token(self.parse_regex)
                stmt = (word, None)
                if not self.new_line and self.token not in \
                        {('punctuator', ';'), ('punctuator', '}'), None}:
                    id = self.parse_expr()
                    assert isinstance(id, tuple) and id[0] == 'identifier'
                    stmt = (word, id[1])
            elif self.token == ('reserved word', 'do'):
                self.read_token(self.parse_regex)
                stmt = self.parse_statement()
                assert self.token == ('reserved word', 'while')
                self.read_token(self.parse_regex)
                assert self.token == ('punctuator', '(')
                self.read_token(self.parse_regex)
                stmt = ('do', stmt, self.parse_expr())
                assert self.token == ('punctuator', ')')
                self.read_token(self.parse_regex)
            elif self.token == ('reserved word', 'switch'):
                self.read_token(self.parse_regex)
                assert self.token == ('punctuator', '(')
                self.read_token(self.parse_regex)
                expr = self.parse_expr()
                assert self.token == ('punctuator', ')')
                self.read_token(self.parse_regex)
                assert self.token == ('punctuator', '{')
                self.read_token(self.parse_regex)
                cases = list()
                while self.token != ('punctuator', '}'):
                    if self.token == ('reserved word', 'case'):
                        self.read_token(self.parse_regex)
                        case = self.parse_expr()
                    else:
                        assert self.token == ('reserved word', 'default')
                        self.read_token(self.parse_regex)
                        case = ('undefined',)
                    assert self.token == ('punctuator', ':')
                    self.read_token(self.parse_regex)
                    stmts = list()
                    while self.token not in {
                            ('reserved word', 'case'),
                            ('reserved word', 'default'),
                            ('punctuator', '}'), None}:
                        stmts.append(self.parse_statement())
                    cases.append((case, stmts))
                self.read_token(self.parse_regex)
                return (labels, ('switch', expr, cases))
            elif self.token == ('punctuator', ';'):
                stmt = ('block', ())  # empty statement
            else:
                assert self.token != ('reserved word', 'function')
                stmt = self.parse_expr()
                if self.token == ('punctuator', ':'):
                    # Labelled statement: record the label and go around again
                    assert isinstance(stmt, tuple) and stmt[0] == 'identifier'
                    assert stmt[1] not in labels
                    labels |= {stmt[1]}
                    self.read_token(self.parse_regex)
                    continue
            if self.token == ('punctuator', ';'):
                self.read_token(self.parse_regex)
            else:  # automatic semicolon insertion
                assert self.new_line or self.token in {('punctuator', '}'), None}
            return (labels, stmt)

    def parse_var(self, **kw):
        vars = list()
        while True:
            self.read_token(self.parse_regex)
            [type, id] = self.token
            assert type == 'identifier'
            self.read_token(self.parse_div)
            if self.token == ('punctuator', '='):
                self.read_token(self.parse_regex)
                value = self.parse_expr(self.COMMA_PRECEDENCE, **kw)
            else:
                value = ('undefined',)
            vars.append((id, value))
            if self.token != ('punctuator', ','):
                return ('var', vars)

    def parse_curly_block(self):
        self.read_token(self.parse_regex)
        assert self.token == ('punctuator', '{')
        block = list(self.parse_block())
        assert self.token == ('punctuator', '}')
        self.read_token(self.parse_regex)
        return block

    def parse_expr(self, precedence=-math.inf, *,
            disallow=None, disallow_call=False):
        if self.token == ('reserved word', 'new'):
            self.read_token(self.parse_regex)
            expr = self.parse_expr(precedence=+math.inf, disallow_call=True)
            if self.token == ('punctuator', '('):
                expr = ('new', expr, self.parse_args())
                self.read_token(self.parse_div)
            else:
                expr = ('new', expr, ())
        elif self.token in {
                ('reserved word', 'delete'), ('reserved word', 'void'),
                ('reserved word', 'typeof'),
                ('punctuator', '++'), ('punctuator', '--'),
                ('punctuator', '+'), ('punctuator', '-'),
                ('punctuator', '~'), ('punctuator', '!')}:
            expr = f'prefix {self.token[1]}'
            self.read_token(self.parse_regex)
            expr = (expr, self.parse_expr(precedence=UNARY_PRECEDENCE))
        else:
            if self.token == ('punctuator', '('):
                self.read_token(self.parse_regex)
                expr = self.parse_expr()
                assert self.token == ('punctuator', ')')
            elif self.token == ('punctuator', '['):
                expr = list()  # a Python list represents an array literal
                while True:
                    self.read_token(self.parse_regex)
                    if self.token == ('punctuator', ']'):
                        break
                    if self.token == ('punctuator', ','):
                        expr.append(('undefined',))  # elision
                        continue
                    expr.append(self.parse_expr(self.COMMA_PRECEDENCE))
                    if self.token != ('punctuator', ','):
                        assert self.token == ('punctuator', ']')
                        break
            elif self.token == ('reserved word', 'function'):
                expr = self.parse_function()
            elif self.token == ('punctuator', '{'):
                expr = list()
                self.read_token(self.parse_regex)
                if self.token != ('punctuator', '}'):
                    while True:
                        name = self.token
                        if not isinstance(name, (float, str)):
                            assert isinstance(name, tuple)
                            [token, name] = name
                            assert token in {'identifier', 'reserved word'}
                        self.read_token(self.parse_div)
                        assert self.token == ('punctuator', ':')
                        self.read_token(self.parse_regex)
                        value = self.parse_expr(self.COMMA_PRECEDENCE)
                        expr.append((name, value))
                        if self.token != ('punctuator', ','):
                            break
                        self.read_token(self.parse_regex)
                assert self.token == ('punctuator', '}')
                expr = ('object', expr)
            elif isinstance(self.token, (float, str)) \
                    or self.token[0] in {'identifier', 'regex'}:
                expr = self.token
            else:
                LITERALS = {
                    ('reserved word', 'null'): None,
                    ('reserved word', 'true'): True,
                    ('reserved word', 'false'): False,
                    ('reserved word', 'this'): ('this',),
                }
                expr = LITERALS[self.token]
            self.read_token(self.parse_div)
        while True:
            if self.token == ('punctuator', '.'):
                self.read_token(self.parse_regex)
                [type, member] = self.token
                assert type in {'identifier', 'reserved word'}
                expr = ('property', expr, member)
            elif self.token == ('punctuator', '(') and not disallow_call:
                expr = ('call', expr, self.parse_args())
            elif self.token == ('punctuator', '['):
                self.read_token(self.parse_regex)
                expr = ('property', expr, self.parse_expr())
                assert self.token == ('punctuator', ']')
            elif self.token == ('punctuator', '?'):
                if precedence > self.ASSIGN_PRECEDENCE:
                    break
                self.read_token(self.parse_regex)
                a = self.parse_expr(self.COMMA_PRECEDENCE)
                assert self.token == ('punctuator', ':')
                self.read_token(self.parse_regex)
                b = self.parse_expr(self.COMMA_PRECEDENCE, disallow=disallow)
                expr = ('?', expr, a, b)
                continue
            elif not self.new_line \
                    and self.token in \
                    {('punctuator', '++'), ('punctuator', '--')} \
                    and precedence <= UNARY_PRECEDENCE:
                expr = (f'postfix {self.token[1]}', expr)
            elif self.token is None \
                    or self.token[0] not in {'punctuator', 'reserved word'} \
                    or self.token[1] not in OP_PRECEDENCE \
                    or self.token[1] == disallow:
                break
            else:
                op = self.token[1]
                # Assignments are right-associative; others are left-associative
                rhs_precedence = OP_PRECEDENCE[op]
                if rhs_precedence < precedence or rhs_precedence \
                        == precedence != self.ASSIGN_PRECEDENCE:
                    break
                self.read_token(self.parse_regex)
                rhs = self.parse_expr(rhs_precedence, disallow=disallow)
                expr = (op, expr, rhs)
                continue
            self.read_token(self.parse_div)
        return expr
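
    # parse_expr() is a precedence climber over OP_PRECEDENCE; given the
    # tuple shapes built above, 'a = b || c + 1' should come out as:
    #   ('=', ('identifier', 'a'),
    #         ('||', ('identifier', 'b'),
    #                ('+', ('identifier', 'c'), 1.0)))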

    def parse_args(self):
        args = list()
        self.read_token(self.parse_regex)
        if self.token != ('punctuator', ')'):
            while True:
                args.append(self.parse_expr(self.COMMA_PRECEDENCE))
                if self.token == ('punctuator', ')'):
                    break
                assert self.token == ('punctuator', ',')
                self.read_token(self.parse_regex)
        return args

    ASSIGN_PRECEDENCE = OP_PRECEDENCE['=']
    COMMA_PRECEDENCE = OP_PRECEDENCE[',']

    def parse_function(self):
        self.read_token(self.parse_regex)
        if self.token[0] == 'identifier':
            name = self.token[1]
            self.read_token(self.parse_regex)
        else:
            name = None
        assert self.token == ('punctuator', '(')
        self.read_token(self.parse_regex)
        params = list()
        if self.token != ('punctuator', ')'):
            while True:
                assert self.token[0] == 'identifier'
                params.append(self.token[1])
                self.read_token(self.parse_regex)
                if self.token == ('punctuator', ')'):
                    break
                assert self.token == ('punctuator', ',')
                self.read_token(self.parse_regex)
        self.read_token(self.parse_regex)
        assert self.token == ('punctuator', '{')
        body = list(self.parse_block(func_defs=True))
        assert self.token == ('punctuator', '}')
        return ('function', name, params, body)

    IDENTIFIER_START_CHAR = re.compile(fr'[{IDENTIFIER_START}]|\\')
    LINE_TERMINATORS = '\n\r\u2028\u2029'

    TOKENS = re.compile(
        fr'(?P<line_terminator> [{LINE_TERMINATORS}])'
        r'| (?P<singleline_comment> //)'
        r'| (?P<multiline_comment> /\*)'
        fr'| (?P<identifier> {IDENTIFIER_START_CHAR.pattern})'
        r'| (?P<number> [\d.])'
        r'| (?P<punctuator>'
        r'\+\+ | -- | && | \|\|'
        r'| (?:<< | >>>? | [=!]= | [-<>+*%&^|])=?'
        r'| [][{}().;,!~?:=])'
        r'| (?P<slash> /)'
        '| (?P<string> ["\'])'
        # Anything else other than whitespace:
        '| (?P<illegal_codepoint> [^\t\v\f \xA0\u2000-\u200B\u3000])',
        re.DOTALL | re.VERBOSE | re.ASCII)

    def handle_line_terminator(self, token):
        self.new_line = True
        # Returning None makes read_token() skip this and keep scanning

    LINE_TERMINATOR_PATTERN = re.compile(fr'[{LINE_TERMINATORS}]')

    def handle_singleline_comment(self, start):
        self.read_matches(fr'[^{self.LINE_TERMINATORS}]', 1)

    def handle_multiline_comment(self, start):
        last = self.pos
        while True:
            end = self.buffer.find('*/', self.pos)
            if end >= 0:
                break
            if self.LINE_TERMINATOR_PATTERN.search(self.buffer, self.pos):
                self.new_line = True
            if self.eof:
                raise EOFError('Unterminated multi-line comment')
            # Keep the final character in case it is the "*" of a split "*/"
            self.buffer = self.buffer[max(len(self.buffer) - 1, self.pos):]
            self.pos = 0
            last = 0
            self.fill_buffer()
        self.pos = end + 2
        if self.LINE_TERMINATOR_PATTERN.search(self.buffer, last, end):
            self.new_line = True

    def handle_identifier(self, token):
        token += self.read_matches(IDENTIFIER_PART, 6, re.ASCII)
        if token in RESERVED_WORDS:
            return ('reserved word', token)
        token = self.UNICODE_ESCAPES.sub(self.decode_unicode, token)
        assert is_identifier(token)
        assert token not in RESERVED_WORDS  # no smuggling via \u escapes
        return ('identifier', token)

    UNICODE_ESCAPES = re.compile(r'\\u(.{4})', re.ASCII)

    @staticmethod
    def decode_unicode(match):
        return chr(int(match.group(1), 16))

    def handle_number(self, token):
        number = None
        if token == '.':
            token += self.read_matches(r'\d', 1, re.ASCII)
            if token == '.':  # no digits followed: just a punctuator
                return ('punctuator', '.')
        elif token == '0':
            if self.read_match(*self.X):
                number = self.read_matches(r'[\da-fA-F]', 1, re.ASCII)
                number = int(number, 16)
                if number >= 2**1024 - 2**(1024 - 54):
                    number = math.inf  # overflows the double range
        else:  # Nonzero digit
            token += self.read_matches(r'\d', 1, re.ASCII)
        if number is None and self.read_match(*self.POINT):
            token += '.' + self.read_matches(r'\d', 1, re.ASCII)
        prefix = self.read_match(*self.EXPONENT_PREFIX)
        if prefix:
            exp = self.read_matches(r'\d', 1, re.ASCII)
            assert exp
            token += prefix.group() + exp
        assert not self.read_match(self.IDENTIFIER_START_CHAR, 1)
        if number is None:
            number = float(token)
        return float(number)

    X = (re.compile(r'[xX]'), 1)
    POINT = (re.compile(r'\.'), 1)
    EXPONENT_PREFIX = (re.compile(r'[eE][-+]?'), 2)

    def handle_punctuator(self, token):
        return ('punctuator', token)

    def parse_div(self, token):
        # "/" seen where division is legal (after an operand)
        if self.read_match(*self.EQUALS):
            return ('punctuator', '/=')
        return ('punctuator', '/')

    EQUALS = (re.compile(r'='), 1)
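
    # Whether "/" starts a division or a regex literal depends on parser
    # context: read_token() is passed parse_div after an operand and
    # parse_regex where an operand is expected, and the (?P<slash> /) group
    # (as restored above) dispatches to whichever method was stashed in
    # self.handle_slash. So in "a / b" the slash is division, while in
    # "x = /b/g" it scans a regex literal.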

    def parse_regex(self, token):
        # "/" seen where an operand is expected: scan a regex literal
        self.pos -= len(token) - 1
        pattern = list()
        while True:
            pattern.append(self.read_matches(
                fr'[^{self.LINE_TERMINATORS}{self.BACKSLASH}/[]'
                fr'|\\[^{self.LINE_TERMINATORS}]', 2))
            end = self.read_match(*self.SLASH_OR_CLASS)
            if end.group() != '[':
                break
            pattern.extend(('[', self.read_matches(
                fr'[^{self.LINE_TERMINATORS}\]{self.BACKSLASH}]'
                fr'|\\[^{self.LINE_TERMINATORS}]', 2), ']'))
            end = self.read_match(*self.CLASS_END)
            assert end
        flags = self.read_matches(IDENTIFIER_PART, 6, re.ASCII)
        return ('regex', ''.join(pattern), flags)

    SLASH_OR_CLASS = (re.compile(r'[/[]'), 1)
    CLASS_END = (re.compile(r']'), 1)

    def handle_string(self, quote):
        s = self.read_matches(
            fr'[^{quote}{self.BACKSLASH}{self.LINE_TERMINATORS}]'
            fr'|\\[^1-9xu{self.LINE_TERMINATORS}]'
            r'|\\x[\da-fA-F]{2}'
            r'|\\u[\da-fA-F]{4}', 6, re.ASCII)
        end = self.read_match(re.compile(quote), 1)
        assert end
        assert not self.INVALID_NUL.search(s)
        return self.ESCAPE_SEQUENCES.sub(self.decode_escape, s)

    BACKSLASH = '\\{:03o}'.format(ord('\\'))
    INVALID_NUL = re.compile(r'\\0\d', re.ASCII)
    ESCAPE_SEQUENCES = re.compile(r'\\(x.{2}|u.{4}|.)', re.ASCII)

    @staticmethod
    def decode_escape(match):
        match = match.group(1)
        if match.startswith(('x', 'u')):
            c = chr(int(match[1:], 16))
        else:
            escapes = {
                'n': '\n', 'b': '\b', 'f': '\f', 'r': '\r', 't': '\t',
                'v': '\v', '0': '\x00'}
            c = escapes.get(match, match)
        return c

    def handle_illegal_codepoint(self, codepoint):
        raise ValueError(f'Illegal codepoint: {codepoint!r}')

    def read_matches(self, pattern, itemlen, flags=0):
        pattern = re.compile(r'(?:{})*'.format(pattern), flags)
        result = StringIO()
        while True:
            match = pattern.match(self.buffer, self.pos)
            result.write(match.group())
            self.pos = match.end()
            if len(self.buffer) - self.pos >= itemlen or self.eof:
                break
            self.buffer = self.buffer[self.pos:]
            self.pos = 0
            self.fill_buffer()
        return result.getvalue()

    def read_match(self, pattern, maxlen):
        while self.pos + maxlen > len(self.buffer) and not self.eof:
            self.buffer = self.buffer[self.pos:]
            self.pos = 0
            self.fill_buffer()
        match = pattern.match(self.buffer, self.pos)
        if match:
            self.pos = match.end()
        return match

    BUFSIZE = 2 ** 13  # assumed value; the original constant is not preserved

    def fill_buffer(self):  # method name reconstructed for this fragment
        size = self.BUFSIZE - len(self.buffer)
        new = self.read(size)
        self.eof = len(new) < size
        self.buffer += new.translate(self.REMOVE_Cf)