1 # Copyright (C) 2006-2009, Parrot Foundation.
6 Perl6Regex - compiler and parser for Perl 6 regex
10 =item C<compile_perl6regex(PMC source, PMC adverbs :slurpy :named)>
12 Return the result of compiling C<source> according to Perl 6
13 regex syntax and the associated C<adverbs>. Normally this
14 function is obtained using C<compreg 'PGE::Perl6Regex'> instead
15 of calling it directly.
17 Returns the compiled regular expression. If a C<target>
18 named parameter is supplied, then it will return the parse tree
19 (target='parse'), the expression tree (target='exp'),
20 or the resulting PIR code (target='PIR').
24 .namespace [ 'PGE';'Perl6Regex' ]
26 .sub 'compile_perl6regex'
28 .param pmc args :slurpy
29 .param pmc adverbs :slurpy :named
31 unless null adverbs goto set_adverbs
35 $I0 = exists adverbs['grammar']
36 if $I0 goto with_grammar
37 unless args goto adverb_grammar_1
39 adverbs['grammar'] = $S0
42 adverbs['grammar'] = 'PGE::Grammar'
44 $I0 = exists adverbs['name']
46 unless args goto with_name
50 $I0 = exists adverbs['lang']
52 adverbs['lang'] = 'PIR'
54 $I0 = exists adverbs['ignorecase']
55 if $I0 goto with_ignorecase
57 adverbs['ignorecase'] = $I0
59 $I0 = exists adverbs['sigspace']
60 if $I0 goto with_sigspace
61 $I0 = exists adverbs['s']
63 $I0 = exists adverbs['words']
64 if $I0 goto with_words
66 adverbs['sigspace'] = $I0
70 adverbs['sigspace'] = $I0
73 $I0 = adverbs['words']
74 adverbs['sigspace'] = $I0
78 target = adverbs['target']
79 target = downcase target
81 ## If we're passed the results of a previous parse, use it.
83 $I0 = isa source, ['PGE';'Match']
84 if $I0 == 0 goto parse
86 if null $P0 goto parse
87 $I0 = isa $P0, ['PGE';'Exp']
88 if $I0 == 0 goto parse
93 ## Let's parse the source as a regex
94 $P0 = get_global 'regex'
95 match = $P0(source, adverbs :flat :named)
96 if source == '' goto err_null
97 if target != 'parse' goto check
101 unless match goto check_1
104 if $S0 == $S1 goto analyze
114 pad['lexscope'] = $P0
115 exp = exp.'perl6exp'(pad)
116 if null exp goto err_null
117 .tailcall exp.'compile'(adverbs :flat :named)
121 'parse_error'(match, $I0, 'Null pattern illegal')
125 =item C<regex(PMC mob, PMC adverbs :slurpy :named)>
127 Parses a regex according to Perl 6 regex syntax, and returns
128 the corresponding parse tree.
134 .param pmc adverbs :slurpy :named
136 .local string stop, tighter
137 .local pmc stopstack, optable, match
139 stopstack = get_global '@!stopstack'
140 optable = get_global '$optable'
142 stop = adverbs['stop']
143 tighter = adverbs['tighter']
145 match = optable.'parse'(mob, 'stop'=>stop, 'tighter'=>tighter)
153 Parse and calculate various Perl 6 string escapes, such as \n, \r,
154 \x, \o, and \c. For the latter escapes, also handle the bracketed
155 forms and other special forms.
157 Note that this function is used directly by PCT::Grammar and Rakudo,
158 and someday may be refactored to a different location.
164 .local int rpos, lpos
166 lpos = find_not_cclass .CCLASS_WHITESPACE, s, 0, rpos
168 unless rpos > lpos goto rtrim_done
170 $I0 = is_cclass .CCLASS_WHITESPACE, s, rpos
171 if $I0 goto rtrim_loop
175 $S0 = substr s, lpos, $I0
182 .param pmc adverbs :slurpy :named
183 .local string target, backchar, literal
184 .local int pos, lastpos
185 $P0 = get_hll_global ['PGE'], '$!MATCH'
186 (mob, pos, target) = $P0.'new'(mob, adverbs :flat :named)
187 lastpos = length target
188 if pos >= lastpos goto fail
189 $S0 = substr target, pos, 1
191 if $S0 != "\\" goto fail
192 if pos >= lastpos goto fail
193 backchar = substr target, pos, 1
195 backchar = downcase backchar
196 $I0 = index "\\0abefnrtxco", backchar
198 if $I0 >= 9 goto scan_xco
199 literal = substr "\\\0\a\b\e\f\n\r\t", $I0, 1
202 ## Handle \x, \c, and \o escapes. Start by converting
203 ## backchar into the appropriate radix, then loop through
204 ## the characters that follow to compute the decimal value
205 ## of codepoints, and concatenate the codepoints into a
207 .local int base, decnum, isbracketed
208 base = index ' o c x', backchar
210 $S0 = substr target, pos, 1
211 isbracketed = iseq $S0, '['
213 ## Handle the case of \cC (control escape).
214 if base != 10 goto scan_xco_char
215 if isbracketed goto scan_xco_char
216 $I0 = is_cclass .CCLASS_NUMERIC, $S0, 0
217 if $I0 goto scan_xco_char
226 # inside brackets, skip leading ws
227 unless isbracketed goto scan_xco_char_ws
228 pos = find_not_cclass .CCLASS_WHITESPACE, target, pos, lastpos
230 if base != 10 goto scan_xco_char_digits
231 unless isbracketed goto scan_xco_char_digits
232 $I0 = is_cclass .CCLASS_NUMERIC, target, pos
233 if $I0 goto scan_xco_char_digits
234 ## look up character by name
236 namepos = index target, ']', pos
237 if namepos < 0 goto err_missing_bracket
238 $I0 = index target, ',', pos
239 if $I0 < 0 goto have_namepos
240 if namepos < $I0 goto have_namepos
244 $S0 = substr target, pos, $I0
246 $P0 = new 'CodeString'
247 decnum = $P0.'charname_to_ord'($S0)
248 if decnum < 0 goto err_unicode_name
250 goto scan_xco_char_end
251 scan_xco_char_digits:
252 $S0 = substr target, pos, 1
253 $I0 = index "0123456789abcdef0123456789ABCDEF", $S0
254 if $I0 < 0 goto scan_xco_char_end
256 if $I0 >= base goto scan_xco_char_end
260 goto scan_xco_char_digits
264 unless isbracketed goto scan_xco_end
265 pos = find_not_cclass .CCLASS_WHITESPACE, target, pos, lastpos
266 $S0 = substr target, pos, 1
267 if $S0 == ']' goto scan_xco_end
268 if $S0 == '' goto err_missing_bracket
269 if $S0 != ',' goto err_digit
281 $S0 = concat "Unrecognized character name ", $S0
282 'parse_error'(mob, pos, $S0)
284 'parse_error'(mob, pos, "Missing close bracket for \\x[...], \\o[...], or \\c[...]")
286 'parse_error'(mob, pos, "Invalid digit in \\x[...], \\o[...], or \\c[...]")
292 Initializes the Perl6Regex parser and other data structures
293 needed for compiling regexes.
297 .include 'cclass.pasm'
299 .namespace [ 'PGE';'Perl6Regex' ]
301 .sub '__onload' :load
303 p6meta = new 'P6metaclass'
304 p6meta.'new_class'('PGE::Exp::WS', 'parent'=>'PGE::Exp::Subrule')
305 p6meta.'new_class'('PGE::Exp::Alias', 'parent'=>'PGE::Exp')
308 optable = new ['PGE';'OPTable']
309 set_global '$optable', optable
311 $P0 = get_global 'parse_term'
312 optable.'newtok'('term:', 'precedence'=>'=', 'nows'=>1, 'parsed'=>$P0)
314 $P0 = get_global 'parse_term_ws'
315 optable.'newtok'('term:#', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
317 $P0 = get_global 'parse_term_backslash'
318 optable.'newtok'("term:\\", 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
320 optable.'newtok'('term:^', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Anchor')
321 optable.'newtok'('term:^^', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Anchor')
322 optable.'newtok'('term:$$', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Anchor')
323 optable.'newtok'('term:\b', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Anchor')
324 optable.'newtok'('term:\B', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Anchor')
325 optable.'newtok'('term:<<', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Anchor')
326 optable.'newtok'('term:>>', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Anchor')
327 optable.'newtok'('term:<?>', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Anchor')
328 optable.'newtok'('term:<!>', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Anchor')
329 optable.'newtok'(unicode:"term:\xab", 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Anchor')
330 optable.'newtok'(unicode:"term:\xbb", 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Anchor')
332 optable.'newtok'('term:.', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::CCShortcut')
333 optable.'newtok'('term:\d', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::CCShortcut')
334 optable.'newtok'('term:\D', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::CCShortcut')
335 optable.'newtok'('term:\s', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::CCShortcut')
336 optable.'newtok'('term:\S', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::CCShortcut')
337 optable.'newtok'('term:\w', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::CCShortcut')
338 optable.'newtok'('term:\W', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::CCShortcut')
339 optable.'newtok'('term:\N', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::CCShortcut')
340 optable.'newtok'('term:\n', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Newline')
342 $P0 = get_global 'parse_dollar'
343 optable.'newtok'('term:$', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
345 $P0 = get_global 'parse_subrule'
346 optable.'newtok'('term:<', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
347 optable.'newtok'('term:<?', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
348 optable.'newtok'('term:<!', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
349 optable.'newtok'('term:<.', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
351 $P0 = get_global 'parse_enumcharclass'
352 optable.'newtok'('term:<[', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
353 optable.'newtok'('term:<+', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
354 optable.'newtok'('term:<-', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
355 optable.'newtok'('term:<![', 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
357 $P0 = get_global 'parse_quoted_literal'
358 optable.'newtok'("term:'", 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
360 $P0 = get_global 'parse_goal'
361 optable.'newtok'('term:~', 'equiv'=>'term:', 'parsed'=>$P0)
363 optable.'newtok'('term:::', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Cut')
364 optable.'newtok'('term::::', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Cut')
365 optable.'newtok'('term:<cut>', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Cut')
366 optable.'newtok'('term:<commit>', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Cut')
368 $P0 = get_global 'parse_closure'
369 optable.'newtok'("term:{{", 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
370 optable.'newtok'("term:<?{{", 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
371 optable.'newtok'("term:<!{{", 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
373 $P0 = get_global 'parse_action'
374 optable.'newtok'("term:{*}", 'equiv'=>'term:', 'nows'=>1, 'parsed'=>$P0)
377 optable.'newtok'('circumfix:[ ]', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::Group')
378 optable.'newtok'('circumfix:( )', 'equiv'=>'term:', 'nows'=>1, 'match'=>'PGE::Exp::CGroup')
380 $P0 = get_global 'parse_quant'
381 optable.'newtok'('postfix:*', 'looser'=>'term:', 'parsed'=>$P0)
382 optable.'newtok'('postfix:+', 'equiv'=>'postfix:*', 'parsed'=>$P0)
383 optable.'newtok'('postfix:?', 'equiv'=>'postfix:*', 'parsed'=>$P0)
384 optable.'newtok'('postfix::', 'equiv'=>'postfix:*', 'parsed'=>$P0)
385 optable.'newtok'('postfix:**', 'equiv'=>'postfix:*', 'parsed'=>$P0)
386 $P0 = get_global 'parse_quant_error'
387 optable.'newtok'('term:*', 'equiv'=>'term:', 'parsed'=>$P0)
388 optable.'newtok'('term:+', 'equiv'=>'term:', 'parsed'=>$P0)
389 optable.'newtok'('term:?', 'equiv'=>'term:', 'parsed'=>$P0)
391 optable.'newtok'('infix:', 'looser'=>'postfix:*', 'assoc'=>'list', 'nows'=>1, 'match'=>'PGE::Exp::Concat')
392 optable.'newtok'('infix:&', 'looser'=>'infix:', 'nows'=>1, 'match'=>'PGE::Exp::Conj')
393 optable.'newtok'('infix:|', 'looser'=>'infix:&', 'nows'=>1, 'match'=>'PGE::Exp::Alt')
394 optable.'newtok'('prefix:|', 'equiv'=>'infix:|', 'nows'=>1, 'match'=>'PGE::Exp::Alt')
395 optable.'newtok'('infix:||', 'equiv'=>'infix:|', 'nows'=>1, 'match'=>'PGE::Exp::Alt')
396 optable.'newtok'('prefix:||', 'equiv'=>'infix:|', 'nows'=>1, 'match'=>'PGE::Exp::Alt')
398 optable.'newtok'('infix::=', 'tighter'=>'infix:', 'assoc'=>'right', 'match'=>'PGE::Exp::Alias')
399 optable.'newtok'('infix:=', 'tighter'=>'infix:', 'assoc'=>'right', 'match'=>'PGE::Exp::Alias')
401 $P0 = get_global 'parse_modifier'
402 optable.'newtok'('prefix::', 'looser'=>'infix:|', 'parsed'=>$P0)
404 optable.'newtok'('close:}', 'precedence'=>'<', 'nows'=>1)
408 set_global '%esclist', esclist
413 esclist['v'] = unicode:"\x0a\x0b\x0c\x0d\x85\u2028\u2029"
414 esclist['h'] = unicode:"\x09\x20\xa0\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u202f\u205f\u3000"
415 esclist['n'] = unicode:"\x0a\x0d\x0c\x85\u2028\u2029"
416 # See http://www.unicode.org/Public/UNIDATA/PropList.txt for above
418 # Create and store closure preprocessors in %closure_pp
420 set_hll_global ['PGE';'Perl6Regex'], '%closure_pp', $P0
421 $P1 = get_hll_global ['PGE';'Perl6Regex'], 'PIR_closure'
424 # Create an array for holding stop tokens
425 $P0 = new 'ResizablePMCArray'
426 set_hll_global ['PGE';'Perl6Regex'], '@!stopstack', $P0
428 $P0 = get_global 'compile_perl6regex'
429 compreg 'PGE::Perl6Regex', $P0
434 =item C<parse_term(PMC mob [, PMC adverbs :slurpy :named])>
436 Parses literal strings and whitespace.
437 Return a failed match if the stoptoken is found.
443 .param pmc adverbs :slurpy :named
446 .local int pos, lastpos
447 $P0 = getattribute mob, '$.target'
449 $P0 = getattribute mob, '$.pos'
451 lastpos = length target
454 $P0 = get_hll_global ['PGE';'Perl6Regex'], '@!stopstack'
457 $I0 = is_cclass .CCLASS_WHITESPACE, target, pos
460 if $I0 == 0 goto not_stop
461 $S0 = substr target, pos, $I0
462 if $S0 == stop goto end_noterm
464 ## find length of word character sequence
466 $I0 = find_not_cclass .CCLASS_WORD, target, pos, lastpos
469 ## if we didn't find any, return no term
470 if litlen == 0 goto end_noterm
472 ## for multi-char unquoted literals, leave the last character
473 ## in case it's quantified (it gets processed as a subsequent term)
474 if litlen < 2 goto term_literal
477 $S0 = substr target, pos, litlen
479 mob = mob.'new'(mob, 'grammar'=>'PGE::Exp::Literal')
485 .tailcall 'parse_term_ws'(mob)
488 $S0 = substr target, pos, 1
489 if $S0 == ':' goto err_cut
490 (mob) = mob.'new'(mob, 'grammar'=>'PGE::Exp::Literal')
493 'parse_error'(mob, pos, 'Quantifier follows nothing in regex')
498 =item C<parse_term_backslash(mob [, adverbs :slurpy :named])>
500 Parses terms beginning with backslash.
504 .sub 'parse_term_backslash'
506 .param pmc adverbs :slurpy :named
509 .local int pos, lastpos, isnegated
510 $P0 = getattribute mob, '$.target'
512 $P0 = getattribute mob, '$.pos'
514 lastpos = length target
517 .local string backchar, charlist
518 ## get whatever follows the backslash
519 backchar = substr target, pos, 1
523 ## if it's not a word character, it's a quoted metachar
524 $I0 = is_cclass .CCLASS_WORD, backchar, 0
525 unless $I0 goto term_literal
527 ## if it's a word character, it may be negated
528 isnegated = is_cclass .CCLASS_UPPERCASE, backchar, 0
529 ## $S0 = downcase charlist
532 backchar = downcase $S0
534 ## if it's \x, \c, or \o, parse as string escape
535 $I0 = index 'xco', backchar
536 if $I0 < 0 goto meta_esclist
539 $P0 = 'p6escapes'(mob, 'pos' => $I0)
540 unless $P0 goto err_xcoparse
542 charlist = $P0.'ast'()
543 unless isnegated goto term_literal
544 $I0 = length charlist
545 if $I0 > 1 goto err_negated_brackets
549 $P0 = get_global '%esclist'
550 $I0 = exists $P0[backchar]
551 unless $I0 goto err_reserved_metachar
552 charlist = $P0[backchar]
553 if isnegated goto term_charlist
554 $I0 = length charlist
555 if $I0 > 1 goto term_charlist
558 mob = mob.'new'(mob, 'grammar'=>'PGE::Exp::Literal')
559 mob.'!make'(charlist)
564 mob = mob.'new'(mob, 'grammar'=>'PGE::Exp::EnumCharList')
565 mob.'!make'(charlist)
566 mob['isnegated'] = isnegated
571 parse_error(mob, pos, 'Unable to parse \x, \c, or \o value')
572 err_negated_brackets:
574 parse_error(mob, pos, 'Cannot use comma in \\X[...] or \\O[...]')
575 err_reserved_metachar:
576 parse_error(mob, pos, 'Alphanumeric metacharacters are reserved')
580 =item C<parse_term_ws(PMC mob)>
582 Parses a whitespace term.
589 .local int pos, lastpos
590 (mob, pos, target) = mob.'new'(mob, 'grammar'=>'PGE::Exp::WS')
591 lastpos = length target
594 ## scan for the next non-whitespace character
595 pos = find_not_cclass .CCLASS_WHITESPACE, target, pos, lastpos
596 $S0 = substr target, pos, 1
597 if $S0 != '#' goto end
598 ## we have a #-comment, determine its closing delimiter
600 .local string closedelim
602 $S0 = substr target, pos, 1
603 $I0 = index '<[{(', $S0
604 if $I0 < 0 goto term_ws_loop_1
605 closedelim = substr '>]})', $I0, 1
607 $I0 = index target, closedelim, pos
609 if pos > 0 goto term_ws_loop
617 =item C<parse_quant(PMC mob)>
619 Parses a quantifier, such as *, +, ?, :, and all of their wondrous
628 .local int pos, lastpos
630 (mob, pos, target) = mob.'new'(mob, 'grammar'=>'PGE::Exp::Quant')
631 lastpos = length target
633 .local int min, max, suffixpos, sepws
637 sepws = is_cclass .CCLASS_WHITESPACE, target, pos
638 suffixpos = find_not_cclass .CCLASS_WHITESPACE, target, pos, lastpos
640 if key == '**' goto quant_suffix
641 if key == ':' goto quant_cut
642 if key == '+' goto quant_max
643 ## quantifier is '?' or '*'
646 if key == '?' goto quant_suffix
647 ## quantifier is '+' or '*'
652 # The postfix:<:> operator may bring us here when it's really a
653 # term:<::> term. So, we check for that here and fail this match
654 # if we really have a cut term.
655 if key != ':' goto quant_suffix
656 $S0 = substr target, pos, 1
657 if $S0 == ':' goto end
658 mob['backtrack'] = PGE_BACKTRACK_NONE
661 suffix = substr target, suffixpos, 2
662 if suffix == ':?' goto quant_eager
663 if suffix == ':!' goto quant_greedy
665 suffix = substr target, suffixpos, 1
666 if suffix == '?' goto quant_eager
667 if suffix == '!' goto quant_greedy
668 if suffix != ':' goto quant
670 mob['backtrack'] = PGE_BACKTRACK_NONE
671 goto quant_skip_suffix
673 mob['backtrack'] = PGE_BACKTRACK_EAGER
674 goto quant_skip_suffix
676 mob['backtrack'] = PGE_BACKTRACK_GREEDY
679 pos = suffixpos + $I0
682 if key != '**' goto quant_set
684 $I0 = is_cclass .CCLASS_WHITESPACE, target, pos
686 pos = find_not_cclass .CCLASS_WHITESPACE, target, pos, lastpos
688 isconst = is_cclass .CCLASS_NUMERIC, target, pos
689 if isconst goto brace_skip
690 $S0 = substr target, pos, 1
691 if $S0 != "{" goto parse_repetition_controller
694 $I1 = find_not_cclass .CCLASS_NUMERIC, target, pos, lastpos
695 if $I1 <= pos goto err_closure
696 $S0 = substr target, pos
700 $S0 = substr target, pos, 2
701 if $S0 != '..' goto quant_closure_end
704 $S0 = substr target, pos, 1
705 if $S0 != '*' goto quant_range_end
707 goto quant_closure_end
709 $I1 = find_not_cclass .CCLASS_NUMERIC, target, pos, lastpos
710 if $I1 <= pos goto err_closure
711 $S0 = substr target, pos
715 if isconst goto brace_skip2
716 $S0 = substr target, pos, 1
717 if $S0 != "}" goto err_closure
720 suffixpos = find_not_cclass .CCLASS_WHITESPACE, target, pos, lastpos
729 parse_repetition_controller:
730 .local pmc regex, repetition_controller
732 regex = get_global 'regex'
733 #parse everything down to concatenation precedence
734 repetition_controller = regex(mob, 'tighter'=>'infix:')
735 unless repetition_controller goto err_repetition_controller
737 #update pos to after the matched
738 pos = repetition_controller.'to'()
739 repetition_controller = repetition_controller['expr']
741 # if there's surrounding ws, then add WS nodes
742 unless sepws goto sepws_done
743 $P0 = mob.'new'(mob, 'grammar'=>'PGE::Exp::Concat')
745 $P1 = mob.'new'(mob, 'grammar'=>'PGE::Exp::WS')
748 push $P0, repetition_controller
749 $P1 = mob.'new'(mob, 'grammar'=>'PGE::Exp::WS')
752 repetition_controller = $P0
755 #save the matched in the mob as sep
756 mob['sep'] = repetition_controller
758 #force the match to be 1..Inf
762 #move position to after the matched
766 err_repetition_controller:
767 'parse_error'(mob, pos, "Error in repetition controller")
769 'parse_error'(mob, pos, "Error in closure quantifier")
773 =item C<parse_quant_error(mob)>
775 Throw an exception for quantifiers in term position.
779 .sub 'parse_quant_error'
783 'parse_error'(mob, pos, "Quantifier follows nothing in regex")
787 =item C<parse_dollar(PMC mob)>
789 Parse things that begin with a dollar sign, such as scalars,
790 anchors, and match subscripts.
797 .local int pos, lastpos
799 $P0 = getattribute mob, '$.target'
801 $P0 = getattribute mob, '$.pos'
803 lastpos = length target
804 $S0 = substr target, pos, 1
805 if $S0 == '<' goto name
806 $I0 = find_not_cclass .CCLASS_NUMERIC, target, pos, lastpos
807 if $I0 > pos goto numeric
808 $I0 = find_not_cclass .CCLASS_WORD, target, pos, lastpos
809 if $I0 > pos goto scalar
812 mob = mob.'new'(mob, 'grammar'=>'PGE::Exp::Anchor')
817 mob = mob.'new'(mob, 'grammar'=>'PGE::Exp::Scalar')
820 cname = substr target, pos, $I1
821 cname = concat '"', cname
822 cname = concat cname, '"'
828 mob = mob.'new'(mob, 'grammar'=>'PGE::Exp::Scalar')
830 cname = substr target, pos, $I1
837 mob = mob.'new'(mob, 'grammar'=>'PGE::Exp::Scalar')
838 $I0 = index target, ">", pos
839 if $I0 < pos goto err_close
842 cname = substr target, pos, $I1
844 cname = concat '"', cname
845 cname = concat cname, '"'
852 parse_error(mob, pos, "Missing close '>' in scalar")
857 =item C<parse_subname(STR target, INT pos)>
859 Scan C<target> starting at C<pos> looking for a subrule name
860 (following Perl 6's identifier syntax). Returns any subrule
861 name found, and the ending position of the name.
869 .local int startpos, targetlen
871 targetlen = length target
875 $I1 = find_not_cclass .CCLASS_WORD, target, $I0, targetlen
876 if $I1 == $I0 goto end
878 $S0 = substr target, pos, 2
879 if $S0 != '::' goto end
884 $S0 = substr target, startpos, $I0
889 =item C<parse_subrule(PMC mob)>
891 Parses a subrule token.
899 .local int pos, lastpos
903 (mob, pos, target) = mob.'new'(mob, 'grammar'=>'PGE::Exp::Subrule')
904 lastpos = length target
906 ## default to non-capturing rule
910 ## see what type of subrule this is
911 if key == '<.' goto scan_subname
912 if key == '<?' goto zerowidth
913 if key == '<!' goto negated
915 ## capturing subrule, get its name/alias
917 .local string subname, cname
918 (subname, pos) = 'parse_subname'(target, pos)
920 $S0 = substr target, pos, 1
921 unless $S0 == '=' goto subrule_arg
922 ## aliased subrule, skip the '=' and get the real name
929 mob['iszerowidth'] = 1
932 (subname, pos) = 'parse_subname'(target, pos)
935 mob['subname'] = subname
936 $S0 = substr target, pos, 1
937 if $S0 == ':' goto subrule_text_arg
938 if $S0 != ' ' goto subrule_end
943 regex = get_global 'regex'
944 $P1 = regex(mob, 'stop'=>'>')
953 pos = find_not_cclass .CCLASS_WHITESPACE, target, $I0, lastpos
954 if pos == $I0 goto end
955 if pos >= lastpos goto end
956 .local string textarg, closedelim
959 $S0 = substr target, pos, 1
960 if $S0 == '"' goto subrule_text_quote
961 if $S0 != "'" goto subrule_text_loop
966 if pos >= lastpos goto end
967 $S0 = substr target, pos, 1
968 if $S0 == closedelim goto subrule_text_end
969 if $S0 != "\\" goto subrule_text_add
971 $S0 = substr target, pos, 1
972 if $S0 == closedelim goto subrule_text_add
973 if $S0 == "\\" goto subrule_text_add
978 goto subrule_text_loop
981 if closedelim == '>' goto subrule_end
983 pos = find_not_cclass .CCLASS_WHITESPACE, target, pos, lastpos
985 $S0 = substr target, pos, 1
986 if $S0 != '>' goto end
989 mob['iscapture'] = iscapture
990 unless iscapture goto end
992 $S0 = concat '"', $S0
993 $S0 = concat $S0, '"'
1000 =item C<parse_enumcharclass(PMC mob)>
1002 Extract an enumerated character list.
1006 .sub 'parse_enumcharclass'
1008 .param pmc adverbs :slurpy :named
1009 .local string target
1012 .local int pos, lastpos
1014 $P0 = getattribute mob, '$.target'
1017 lastpos = length target
1020 ## handle the case of <[, <+[, <-[, and <![ as the token
1021 ## by converting to <, <+, <-, or <!
1022 $S0 = substr op, -1, 1
1023 if $S0 != '[' goto parse_loop
1028 pos = find_not_cclass .CCLASS_WHITESPACE, target, pos, lastpos
1029 if pos >= lastpos goto err_close
1030 $S0 = substr target, pos, 1
1031 if $S0 != '[' goto subrule
1035 .local string charlist
1041 ## skip leading whitespace and get next character
1042 pos = find_not_cclass .CCLASS_WHITESPACE, target, pos, lastpos
1043 if pos >= lastpos goto err_close
1044 $S0 = substr target, pos, 1
1045 if $S0 == ']' goto enum_close
1046 if $S0 == '-' goto err_hyphen
1047 if $S0 == '.' goto enum_dotrange
1048 if $S0 != "\\" goto enum_addchar
1051 ## get escaped character
1052 $S0 = substr target, pos, 1
1053 ## handle metas such as \n, \t, \r, etc.
1054 $I0 = index 'nrtfae0xco', $S0
1055 if $I0 == -1 goto enum_addchar
1056 if $I0 >= 7 goto enum_xco
1057 $S0 = substr "\n\r\t\f\a\e\0", $I0, 1
1061 $P0 = 'p6escapes'(mob, 'pos'=>$I0)
1068 if isrange goto enum_addrange
1072 ## check if we have a .. range marker
1073 if isrange goto enum_addrange
1074 $S1 = substr target, pos, 2
1075 if $S1 != '..' goto enum_addchar
1080 ## add character range to charlist
1082 $I2 = ord charlist, -1
1086 if $I2 > $I0 goto enum_loop
1089 goto enum_addrange_1
1092 ## create a node for the charlist
1093 term = mob.'new'(mob, 'grammar'=>'PGE::Exp::EnumCharList')
1095 term.'!make'(charlist)
1100 .local string subname
1101 (subname, pos) = 'parse_subname'(target, $I0)
1102 if pos == $I0 goto err
1103 term = mob.'new'(mob, 'grammar'=>'PGE::Exp::Subrule')
1106 term['subname'] = subname
1107 term['iscapture'] = 0
1110 ## find out what operator preceded this term
1111 if op == '+' goto combine_plus
1112 if op == '-' goto combine_minus
1113 if op == '<' goto combine_init
1114 if op == '<+' goto combine_init
1115 ## token was '<-' or '<!'
1116 term['isnegated'] = 1
1117 term['iszerowidth'] = 1
1118 if op == '<!' goto combine_init
1119 ## token is '<-', we need to match a char by concat dot
1120 $P0 = mob.'new'(mob, 'grammar'=>'PGE::Exp::CCShortcut')
1123 mob = mob.'new'(mob, 'grammar'=>'PGE::Exp::Concat')
1134 ## <a+b> ==> <a> | <b>
1135 $P0 = mob.'new'(mob, 'grammar'=>'PGE::Exp::Alt')
1143 ## <a-b> ==> <!b> <a>
1144 term['isnegated'] = 1
1145 term['iszerowidth'] = 1
1146 $P0 = mob.'new'(mob, 'grammar'=>'PGE::Exp::Concat')
1154 pos = find_not_cclass .CCLASS_WHITESPACE, target, pos, lastpos
1155 if pos >= lastpos goto err_close
1157 op = substr target, pos, 1
1159 if op == '+' goto parse_loop
1160 if op == '-' goto parse_loop
1161 if op != '>' goto err
1166 parse_error(mob, pos, "Error parsing enumerated character class")
1169 parse_error(mob, pos, "Unescaped '-' in charlist (use '..' or '\\-')")
1172 parse_error(mob, pos, "Missing close '>' or ']>' in enumerated character class")
1178 =item C<parse_quoted_literal>
1180 Parses '...' literals.
1184 .sub 'parse_quoted_literal'
1186 .local int pos, lastpos
1187 .local string target
1188 (mob, pos, target) = mob.'new'(mob, 'grammar'=>'PGE::Exp::Literal')
1189 lastpos = length target
1194 if pos > lastpos goto literal_error
1195 $S0 = substr target, pos, 1
1196 if $S0 == "'" goto literal_end
1197 if $S0 != "\\" goto literal_add
1199 $S0 = substr target, pos, 1
1210 parse_error(mob, pos, "No closing ' in quoted literal")
1223 .local int pos, lastpos
1224 .local string target
1225 (mob, pos, target) = mob.'new'(mob, 'grammar'=>'PGE::Exp::Concat')
1226 lastpos = length target
1227 ## skip any leading whitespace before goal
1228 pos = find_not_cclass .CCLASS_WHITESPACE, target, pos, lastpos
1229 .local pmc regex, goal, expr, alt, failsub
1230 regex = get_global 'regex'
1231 ## parse the goal, down to concatenation precedence
1233 goal = regex(mob, 'tighter'=>'infix:')
1234 unless goal goto fail_goal
1237 ## skip any leading whitespace before expression
1238 pos = find_not_cclass .CCLASS_WHITESPACE, target, pos, lastpos
1239 ## parse the goal, down to concatenation precedence
1241 expr = regex(mob, 'tighter'=>'infix:')
1242 unless expr goto fail_expr
1246 failsub = mob.'new'(mob, 'grammar'=>'PGE::Exp::Subrule')
1248 failsub['subname'] = 'FAILGOAL'
1250 failsub['arg'] = $S0
1251 alt = mob.'new'(mob, 'grammar'=>'PGE::Exp::Alt')
1259 'parse_error'(mob, pos, 'Unable to parse goal after ~')
1261 'parse_error'(mob, pos, 'Unable to parse expression after ~')
1265 =item C<parse_modifier>
1271 .sub 'parse_modifier'
1273 .local int pos, lastpos
1274 .local string target, value
1277 (mob, pos, target) = mob.'new'(mob, 'grammar'=>'PGE::Exp::Modifier')
1278 lastpos = length target
1281 pos = find_not_cclass .CCLASS_NUMERIC, target, pos, lastpos
1282 if pos == $I0 goto name
1284 value = substr target, $I0, $I1
1287 pos = find_not_cclass .CCLASS_WORD, target, pos, lastpos
1289 if $I1 == 0 goto fail
1290 $S0 = substr target, $I0, $I1
1293 $S0 = substr target, pos, 1
1294 if $S0 != '(' goto end
1296 pos = index target, ')', pos
1298 $S0 = substr target, $I0, $I1
1302 ### XXX pos = find_not_cclass .CCLASS_WHITESPACE, target, pos, lastpos
1310 .sub 'parse_closure'
1314 .local string target
1316 (mob, pos, target) = mob.'new'(mob, 'grammar'=>'PGE::Exp::Closure')
1319 $S0 = substr target, pos, 1
1320 if $S0 != "{" goto body
1326 close = repeat "}", len
1327 if key == '<?{{' goto assert_pos
1328 if key == '<!{{' goto assert_neg
1331 mob['isnegated'] = 1
1333 mob['iszerowidth'] = 1
1337 $I0 = index target, close, pos
1338 if $I0 < pos goto err_noclose
1340 $S1 = substr target, pos, $I1
1346 parse_error(mob, pos, "Missing closing braces for closure")
1353 .local string target
1354 .local int pos, keypos
1355 (mob, pos, target) = mob.'new'(mob, 'grammar' => 'PGE::Exp::Action')
1356 keypos = index target, '#= ', pos
1357 if keypos < 0 goto end
1358 $I0 = find_cclass .CCLASS_NEWLINE, target, pos, keypos
1359 if $I0 < keypos goto end
1360 .local string actionkey
1363 actionkey = substr target, keypos, $I0
1364 actionkey = 'trim'(actionkey)
1365 mob['actionkey'] = actionkey
1375 .param string message
1376 $P0 = getattribute mob, '$.pos'
1378 $P0 = new 'Exception'
1379 $S0 = 'perl6regex parse error: '
1381 $S0 .= ' at offset '
1385 $P1 = getattribute mob, '$.target'
1387 $S1 = substr $S1, pos, 1
1398 .namespace [ 'PGE';'Exp' ]
1400 .sub 'perl6exp' :method
1406 .namespace [ 'PGE';'Exp';'Literal' ]
1408 .sub 'perl6exp' :method
1410 $I0 = pad['ignorecase']
1411 self['ignorecase'] = $I0
1416 .namespace [ 'PGE';'Exp';'Concat' ]
1418 .sub 'perl6exp' :method
1421 .local pmc array, exp
1423 array = self.'list'()
1428 if i >= n goto iter_end
1431 exp = exp.'perl6exp'(pad)
1432 if null exp goto iter_loop
1446 .namespace [ 'PGE';'Exp';'Quant' ]
1448 .sub 'perl6exp' :method
1451 $I0 = exists self['backtrack']
1452 if $I0 goto backtrack_done
1453 self['backtrack'] = PGE_BACKTRACK_GREEDY
1454 $I0 = pad['ratchet']
1455 if $I0 == 0 goto backtrack_done
1456 self['backtrack'] = PGE_BACKTRACK_NONE
1459 .local pmc exp0, sep
1461 isarray = pad['isarray']
1464 $I0 = isa exp0, ['PGE';'Exp';'WS']
1465 if $I0 goto err_parse_quant
1467 exp0 = exp0.'perl6exp'(pad)
1470 if null sep goto sep_done
1471 sep = sep.'perl6exp'(pad)
1474 pad['isarray'] = isarray
1477 $P0 = get_hll_global ['PGE';'Perl6Regex'], 'parse_quant_error'
1483 .namespace [ 'PGE';'Exp';'Group' ]
1485 .sub 'perl6exp' :method
1489 $I0 = self['isquant']
1490 if $I0 goto backtrack_done
1491 $I0 = exists self['backtrack']
1492 if $I0 goto backtrack_done
1493 $I0 = pad['ratchet']
1494 if $I0 == 0 goto backtrack_done
1495 self['backtrack'] = PGE_BACKTRACK_NONE
1499 exp0 = exp0.'perl6exp'(pad)
1505 .namespace [ 'PGE';'Exp';'CGroup' ]
1507 .sub 'perl6exp' :method
1511 $I0 = self['isquant']
1512 if $I0 goto backtrack_done
1513 $I0 = exists self['backtrack']
1514 if $I0 goto backtrack_done
1515 $I0 = pad['ratchet']
1516 if $I0 == 0 goto backtrack_done
1517 self['backtrack'] = PGE_BACKTRACK_NONE
1520 self['iscapture'] = 1
1521 $I0 = exists self['isscope']
1522 if $I0 goto set_cname
1526 $I0 = exists self['cname']
1527 if $I0 goto set_subpats
1528 $I0 = pad['subpats']
1533 cname = self['cname']
1534 $S0 = substr cname, 0, 1
1535 if $S0 == '"' goto set_lexicals
1538 pad['subpats'] = $I0
1544 lexscope = pad['lexscope']
1545 $I0 = exists lexscope[cname]
1546 if $I0 == 0 goto set_lexicals_1
1547 $P0 = lexscope[cname]
1551 lexscope[cname] = self
1554 padarray = pad['isarray']
1556 self['isarray'] = isarray
1557 $I0 = self['isscope']
1558 if $I0 == 0 goto unscoped
1562 subpats = pad['subpats']
1566 pad['lexscope'] = $P0
1568 exp = exp.'perl6exp'(pad)
1570 pad['lexscope'] = lexscope
1571 pad['isarray'] = padarray
1572 pad['subpats'] = subpats
1577 exp = exp.'perl6exp'(pad)
1584 .namespace [ 'PGE';'Exp';'Subrule' ]
1586 .sub 'perl6exp' :method
1589 $I0 = self['isquant']
1590 if $I0 goto backtrack_done
1591 $I0 = exists self['backtrack']
1592 if $I0 goto backtrack_done
1593 $I0 = pad['ratchet']
1594 if $I0 == 0 goto backtrack_done
1595 self['backtrack'] = PGE_BACKTRACK_NONE
1598 .local int iscapture, isarray
1600 iscapture = self['iscapture']
1601 if iscapture == 0 goto end
1603 cname = self['cname']
1604 isarray = pad['isarray']
1605 lexscope = pad['lexscope']
1606 $I0 = exists lexscope[cname]
1607 if $I0 == 0 goto lexscope_1
1608 $P0 = lexscope[cname]
1612 lexscope[cname] = self
1613 self['isarray'] = isarray
1615 $S0 = substr cname, 0, 1
1616 if $S0 == '"' goto end
1619 pad['subpats'] = $I0
1627 .namespace [ 'PGE';'Exp';'WS' ]
1629 .sub 'perl6exp' :method
1632 $I0 = pad['sigspace']
1637 self['subname'] = 'ws'
1638 self['iscapture'] = 0
1639 $I0 = pad['ratchet']
1640 unless $I0 goto end_1
1641 self['backtrack'] = PGE_BACKTRACK_NONE
1647 .namespace [ 'PGE';'Exp';'Alt' ]
1649 .sub 'perl6exp' :method
1652 .local pmc exp0, exp1
1656 ## if we only have one operand (prefix:|),
1657 ## reduce and return it.
1658 $I0 = defined self[1]
1659 if $I0 goto with_rhs
1660 .tailcall exp0.'perl6exp'(pad)
1663 ## if lhs is whitespace, then this is a prefix-alt and
1664 ## we ignore it (by simply returning its rhs)
1665 $I0 = isa exp0, ['PGE';'Exp';'WS']
1666 if $I0 == 0 goto with_lhs
1667 .tailcall exp1.'perl6exp'(pad)
1670 .local pmc lexscope, savescope, it
1671 lexscope = pad['lexscope']
1672 savescope = new 'Hash'
1675 unless it goto iter_end
1678 savescope[$P1] = $P2
1681 $I0 = pad['subpats']
1682 exp0 = exp0.'perl6exp'(pad)
1685 $I1 = pad['subpats']
1686 pad['subpats'] = $I0
1687 pad['lexscope'] = savescope
1688 exp1 = exp1.'perl6exp'(pad)
1690 $I0 = pad['subpats']
1691 if $I0 >= $I1 goto end
1692 pad['subpats'] = $I1
1698 .namespace [ 'PGE';'Exp';'Alias' ]
1700 .sub 'perl6exp' :method
1703 .local pmc exp0, exp1
1706 $I0 = isa exp0, ['PGE';'Exp';'Scalar']
1707 unless $I0 goto err_no_lvalue
1709 cname = exp0['cname']
1712 ## If we're aliasing a capture group or a quantified capture
1713 ## group, then we just move the alias name to that group.
1714 ## Otherwise, we need to create a capture group for this
1715 ## alias and return that.
1717 $I0 = isa exp1, ['PGE';'Exp';'CGroup']
1718 if $I0 == 1 goto make_alias
1719 $I0 = isa exp1, ['PGE';'Exp';'Subrule']
1720 if $I0 == 1 goto make_alias
1721 $I0 = isa exp1, ['PGE';'Exp';'Quant']
1722 if $I0 == 0 goto add_cgroup
1724 $I0 = isa $P0, ['PGE';'Exp';'CGroup']
1725 if $I0 == 0 goto add_cgroup
1726 $P0['cname'] = cname
1731 cexp = self.'new'(self, 'grammar'=>'PGE::Exp::CGroup')
1738 cexp['iscapture'] = 1
1739 cexp['cname'] = cname
1740 cexp = cexp.'perl6exp'(pad)
1744 exp1['cname'] = cname
1745 exp1['iscapture'] = 1
1747 exp1 = exp1.'perl6exp'(pad)
1751 $P0 = get_hll_global ['PGE';'Perl6Regex'], 'parse_error'
1753 $P0(self, $I0, 'LHS of alias must be lvalue')
1757 .namespace [ 'PGE';'Exp';'Modifier' ]
1759 .sub 'perl6exp' :method
1764 value = self.'ast'()
1765 if key == 'words' goto sigspace
1766 if key == 's' goto sigspace
1767 if key == 'w' goto sigspace
1768 if key == 'i' goto ignorecase
1780 exp = exp.'perl6exp'(pad)
1786 .namespace [ 'PGE';'Exp';'Conj' ]
1788 .sub 'perl6exp' :method
1791 $P0 = $P0.'perl6exp'(pad)
1794 $P1 = $P1.'perl6exp'(pad)
1800 .namespace [ 'PGE';'Exp';'Closure' ]
1802 .sub 'perl6exp' :method
1805 .local pmc closure_pp
1806 .local pmc closure_fn
1809 # see if we need to do any pre-processing of the closure
1810 closure_pp = get_hll_global ['PGE';'Perl6Regex'], '%closure_pp'
1811 $I0 = defined closure_pp[lang]
1812 if $I0 == 0 goto end
1813 closure_fn = closure_pp[lang]
1815 $S1 = closure_fn($S1)
1827 =item C<PIR_closure(string code)>
1829 This helper function helps with :lang(PIR) closures in rules
1830 by adding a ".sub" wrapper around the code if one isn't
1836 .namespace [ 'PGE';'Perl6Regex' ]
1840 $I0 = index code, '.sub'
1841 if $I0 >= 0 goto end
1842 code = concat ".sub anon :anon\n.param pmc match\n", code
1849 .namespace [ 'PGE';'Exp';'Action' ]
1851 .sub 'perl6exp' :method
1854 self['actionname'] = $S0
1859 .namespace [ 'PGE';'Exp';'Cut' ]
1861 .sub 'perl6exp' :method
1864 if $S0 == ':::' goto cut_rule
1865 if $S0 == '<commit>' goto cut_match
1866 self['cutmark'] = PGE_CUT_GROUP
1869 self['cutmark'] = PGE_CUT_RULE
1872 self['cutmark'] = PGE_CUT_MATCH
1880 # vim: expandtab shiftwidth=4 ft=pir: