1 //Written in the D programming language
3 Regular expression pattern parser.
5 module std
.regex
.internal
.parser
;
7 import std
.regex
.internal
.ir
;
8 import std
.range
.primitives
, std
.uni
, std
.meta
,
9 std
.traits
, std
.typecons
, std
.exception
;
10 static import std
.ascii
;
12 // package relevant info from parser into a regex object
13 auto makeRegex(S
, CG
)(Parser
!(S
, CG
) p
)
15 import std
.regex
.internal
.backtracking
: BacktrackingMatcher
;
16 import std
.regex
.internal
.thompson
: ThompsonMatcher
;
17 import std
.algorithm
.searching
: canFind
;
18 alias Char
= BasicElementOf
!S
;
26 maxCounterDepth
= g
.counterDepth
;
28 charsets
= g
.charsets
;
29 matchers
= g
.matchers
;
30 backrefed
= g
.backrefed
;
31 re
.pattern
= p
.origin
.idup
;
33 // check if we have backreferences, if so - use backtracking
34 if (__ctfe
) factory
= null; // allows us to use the awful enum re = regex(...);
35 else if (re
.backrefed
.canFind
!"a != 0")
36 factory
= new RuntimeFactory
!(BacktrackingMatcher
, Char
);
38 factory
= new RuntimeFactory
!(ThompsonMatcher
, Char
);
39 debug(std_regex_parser
)
43 //@@@BUG@@@ (not reduced)
44 //somehow just using validate _collides_ with std.utf.validate (!)
45 version (assert) re
.validateRe();
50 // helper for unittest
51 auto makeRegex(S
)(S arg
)
54 return makeRegex(Parser
!(S
, CodeGen
)(arg
, ""));
59 import std
.algorithm
.comparison
: equal
;
60 auto re
= makeRegex(`(?P<name>\w+) = (?P<var>\d+)`);
61 auto nc
= re
.namedCaptures
;
62 static assert(isRandomAccessRange
!(typeof(nc
)));
64 assert(nc
.length
== 2);
65 assert(nc
.equal(["name", "var"]));
66 assert(nc
[0] == "name");
67 assert(nc
[1..$].equal(["var"]));
69 re
= makeRegex(`(\w+) (?P<named>\w+) (\w+)`);
70 nc
= re
.namedCaptures
;
71 assert(nc
.length
== 1);
72 assert(nc
[0] == "named");
73 assert(nc
.front
== "named");
74 assert(nc
.back
== "named");
76 re
= makeRegex(`(\w+) (\w+)`);
77 nc
= re
.namedCaptures
;
80 re
= makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`);
81 nc
= re
.namedCaptures
;
85 assert(nc
.equal(cp
[1..$]));
87 assert(nc
.equal(cp
[1 .. $ - 1]));
91 @trusted void reverseBytecode()(Bytecode
[] code
)
93 Bytecode
[] rev
= new Bytecode
[code
.length
];
94 uint revPc
= cast(uint) rev
.length
;
95 Stack
!(Tuple
!(uint, uint, uint)) stack
;
97 uint end
= cast(uint) code
.length
;
100 for (uint pc
= start
; pc
< end
; )
102 immutable len
= code
[pc
].length
;
103 if (code
[pc
].code
== IR
.GotoEndOr
)
104 break; //pick next alternation branch
107 rev
[revPc
- len
.. revPc
] = code
[pc
.. pc
+ len
];
111 else if (code
[pc
].isStart || code
[pc
].isEnd
)
113 //skip over other embedded lookbehinds they are reversed
114 if (code
[pc
].code
== IR
.LookbehindStart
115 || code
[pc
].code
== IR
.NeglookbehindStart
)
117 immutable blockLen
= len
+ code
[pc
].data
118 + code
[pc
].pairedLength
;
119 rev
[revPc
- blockLen
.. revPc
] = code
[pc
.. pc
+ blockLen
];
124 immutable second
= code
[pc
].indexOfPair(pc
);
125 immutable secLen
= code
[second
].length
;
126 rev
[revPc
- secLen
.. revPc
] = code
[second
.. second
+ secLen
];
128 if (code
[pc
].code
== IR
.OrStart
)
130 //we pass len bytes forward, but secLen in reverse
131 immutable revStart
= revPc
- (second
+ len
- secLen
- pc
);
133 uint i
= pc
+ IRL
!(IR
.OrStart
);
134 while (code
[i
].code
== IR
.Option
)
136 if (code
[i
- 1].code
!= IR
.OrStart
)
138 assert(code
[i
- 1].code
== IR
.GotoEndOr
);
139 rev
[r
- 1] = code
[i
- 1];
142 auto newStart
= i
+ IRL
!(IR
.Option
);
143 auto newEnd
= newStart
+ code
[i
].data
;
144 auto newRpc
= r
+ code
[i
].data
+ IRL
!(IR
.Option
);
145 if (code
[newEnd
].code
!= IR
.OrEnd
)
149 stack
.push(tuple(newStart
, newEnd
, newRpc
));
150 r
+= code
[i
].data
+ IRL
!(IR
.Option
);
151 i
+= code
[i
].data
+ IRL
!(IR
.Option
);
155 assert(code
[pc
].code
== IR
.OrEnd
);
163 start
= stack
.top
[0];
165 revPc
= stack
.top
[2];
173 Bytecode
[] ir
; // resulting bytecode
174 Stack
!(uint) fixupStack
; // stack of opened start instructions
175 NamedGroup
[] dict
; // maps name -> user group number
176 Stack
!(uint) groupStack
; // stack of current number of group
177 uint nesting
= 0; // group nesting level and repetitions step
178 uint lookaroundNest
= 0; // nesting of lookaround
179 uint counterDepth
= 0; // current depth of nested counted repetitions
180 CodepointSet
[] charsets
; // sets for char classes
181 const(CharMatcher
)[] matchers
; // matchers for char classes
182 uint[] backrefed
; // bitarray for groups refered by backref
183 uint ngroup
; // final number of groups (of all patterns)
185 void start(uint length
)
188 ir
.reserve((length
*5+2)/4);
190 groupStack
.push(1);//0 - whole match
193 //mark referenced groups for latter processing
194 void markBackref(uint n
)
196 if (n
/32 >= backrefed
.length
)
197 backrefed
.length
= n
/32 + 1;
198 backrefed
[n
/ 32] |
= 1 << (n
& 31);
201 bool isOpenGroup(uint n
)
203 import std
.algorithm
.searching
: canFind
;
204 // walk the fixup stack and see if there are groups labeled 'n'
205 // fixup '0' is reserved for alternations
206 return fixupStack
.data
[1..$].
207 canFind
!(fix
=> ir
[fix
].code
== IR
.GroupStart
&& ir
[fix
].data
== n
)();
210 void put(Bytecode code
)
212 enforce(ir
.length
< maxCompiledLength
,
213 "maximum compiled pattern length is exceeded");
217 void putRaw(uint number
)
219 enforce(ir
.length
< maxCompiledLength
,
220 "maximum compiled pattern length is exceeded");
221 ir
~= Bytecode
.fromRaw(number
);
224 //try to generate optimal IR code for this CodepointSet
225 @trusted void charsetToIr(CodepointSet set
)
226 {//@@@BUG@@@ writeln is @system
227 uint chars
= cast(uint) set
.length
;
228 if (chars
< Bytecode
.maxSequence
)
233 put(Bytecode(IR
.Char
, set
.byCodepoint
.front
));
236 throw new RegexException("empty CodepointSet not allowed");
238 foreach (ch
; set
.byCodepoint
)
239 put(Bytecode(IR
.OrChar
, ch
, chars
));
244 import std
.algorithm
.searching
: countUntil
;
245 const ivals
= set
.byInterval
;
246 immutable n
= charsets
.countUntil(set
);
249 if (ivals
.length
*2 > maxCharsetUsed
)
250 put(Bytecode(IR
.Trie
, cast(uint) n
));
252 put(Bytecode(IR
.CodepointSet
, cast(uint) n
));
255 if (ivals
.length
*2 > maxCharsetUsed
)
257 auto t
= getMatcher(set
);
258 put(Bytecode(IR
.Trie
, cast(uint) matchers
.length
));
260 debug(std_regex_allocation
) writeln("Trie generated");
264 put(Bytecode(IR
.CodepointSet
, cast(uint) charsets
.length
));
265 matchers
~= CharMatcher
.init
;
268 assert(charsets
.length
== matchers
.length
);
276 put(Bytecode(IR
.Nop
, 0));
283 immutable nglob
= groupStack
.top
++;
284 enforce(groupStack
.top
<= maxGroupNumber
, "limit on number of submatches is exceeded");
285 put(Bytecode(IR
.GroupStart
, nglob
));
288 void genNamedGroup(string name
)
290 import std
.array
: insertInPlace
;
291 import std
.range
: assumeSorted
;
294 immutable nglob
= groupStack
.top
++;
295 enforce(groupStack
.top
<= maxGroupNumber
, "limit on submatches is exceeded");
296 auto t
= NamedGroup(name
, nglob
);
297 auto d
= assumeSorted
!"a.name < b.name"(dict
);
298 immutable ind
= d
.lowerBound(t
).length
;
299 insertInPlace(dict
, ind
, t
);
300 put(Bytecode(IR
.GroupStart
, nglob
));
303 //generate code for start of lookaround: (?= (?! (?<= (?<!
304 void genLookaround(IR opcode
)
308 put(Bytecode(opcode
, 0));
309 put(Bytecode
.fromRaw(0));
310 put(Bytecode
.fromRaw(0));
313 enforce(lookaroundNest
<= maxLookaroundDepth
,
314 "maximum lookaround depth is exceeded");
317 void endPattern(uint num
)
319 import std
.algorithm
.comparison
: max
;
320 put(Bytecode(IR
.End
, num
));
321 ngroup
= max(ngroup
, groupStack
.top
);
322 groupStack
.top
= 1; // reset group counter
325 //fixup lookaround with start at offset fix and append a proper *-End opcode
326 void fixLookaround(uint fix
)
329 ir
[fix
] = Bytecode(ir
[fix
].code
,
330 cast(uint) ir
.length
- fix
- IRL
!(IR
.LookaheadStart
));
331 auto g
= groupStack
.pop();
332 assert(!groupStack
.empty
);
333 ir
[fix
+1] = Bytecode
.fromRaw(groupStack
.top
);
334 //groups are cumulative across lookarounds
335 ir
[fix
+2] = Bytecode
.fromRaw(groupStack
.top
+g
);
337 if (ir
[fix
].code
== IR
.LookbehindStart || ir
[fix
].code
== IR
.NeglookbehindStart
)
339 reverseBytecode(ir
[fix
+ IRL
!(IR
.LookbehindStart
) .. $]);
344 // repetition of {1,1}
345 void fixRepetition(uint offset
)
347 import std
.algorithm
.mutation
: copy
;
348 immutable replace
= ir
[offset
].code
== IR
.Nop
;
351 copy(ir
[offset
+ 1 .. $], ir
[offset
.. $ - 1]);
356 // repetition of {x,y}
357 void fixRepetition(uint offset
, uint min
, uint max
, bool greedy
)
359 static import std
.algorithm
.comparison
;
360 import std
.algorithm
.mutation
: copy
;
361 import std
.array
: insertInPlace
;
362 immutable replace
= ir
[offset
].code
== IR
.Nop
;
363 immutable len
= cast(uint) ir
.length
- offset
- replace
;
366 if (min
!= 1 || max
!= 1)
368 Bytecode op
= Bytecode(greedy ? IR
.RepeatStart
: IR
.RepeatQStart
, len
);
372 insertInPlace(ir
, offset
, op
);
373 put(Bytecode(greedy ? IR
.RepeatEnd
: IR
.RepeatQEnd
, len
));
374 put(Bytecode
.init
); //hotspot
378 counterDepth
= std
.algorithm
.comparison
.max(counterDepth
, nesting
+1);
381 else if (min
) //&& max is infinite
385 Bytecode op
= Bytecode(greedy ? IR
.RepeatStart
: IR
.RepeatQStart
, len
);
389 insertInPlace(ir
, offset
, op
);
390 offset
+= 1;//so it still points to the repeated block
391 put(Bytecode(greedy ? IR
.RepeatEnd
: IR
.RepeatQEnd
, len
));
392 put(Bytecode
.init
); //hotspot
396 counterDepth
= std
.algorithm
.comparison
.max(counterDepth
, nesting
+1);
400 copy(ir
[offset
+1 .. $], ir
[offset
.. $-1]);
403 put(Bytecode(greedy ? IR
.InfiniteStart
: IR
.InfiniteQStart
, len
));
404 enforce(ir
.length
+ len
< maxCompiledLength
, "maximum compiled pattern length is exceeded");
405 ir
~= ir
[offset
.. offset
+len
];
406 //IR.InfinteX is always a hotspot
407 put(Bytecode(greedy ? IR
.InfiniteEnd
: IR
.InfiniteQEnd
, len
));
408 put(Bytecode
.init
); //merge index
412 Bytecode op
= Bytecode(greedy ? IR
.InfiniteStart
: IR
.InfiniteQStart
, len
);
416 insertInPlace(ir
, offset
, op
);
417 //IR.InfinteX is always a hotspot
418 put(Bytecode(greedy ? IR
.InfiniteEnd
: IR
.InfiniteQEnd
, len
));
419 put(Bytecode
.init
); //merge index
423 void fixAlternation()
425 import std
.array
: insertInPlace
;
426 uint fix
= fixupStack
.top
;
427 if (ir
.length
> fix
&& ir
[fix
].code
== IR
.Option
)
429 ir
[fix
] = Bytecode(ir
[fix
].code
, cast(uint) ir
.length
- fix
);
430 put(Bytecode(IR
.GotoEndOr
, 0));
431 fixupStack
.top
= cast(uint) ir
.length
; //replace latest fixup for Option
432 put(Bytecode(IR
.Option
, 0));
437 if (fixupStack
.length
== 1)
438 {//only root entry, effectively no fixup
439 len
= cast(uint) ir
.length
+ IRL
!(IR
.GotoEndOr
);
443 {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length
444 len
= cast(uint) ir
.length
- fix
- (ir
[fix
].length
- 1);
445 orStart
= fix
+ ir
[fix
].length
;
447 insertInPlace(ir
, orStart
, Bytecode(IR
.OrStart
, 0), Bytecode(IR
.Option
, len
));
448 assert(ir
[orStart
].code
== IR
.OrStart
);
449 put(Bytecode(IR
.GotoEndOr
, 0));
450 fixupStack
.push(orStart
); //fixup for StartOR
451 fixupStack
.push(cast(uint) ir
.length
); //for second Option
452 put(Bytecode(IR
.Option
, 0));
455 // finalizes IR.Option, fix points to the first option of sequence
456 void finishAlternation(uint fix
)
458 enforce(ir
[fix
].code
== IR
.Option
, "no matching ')'");
459 ir
[fix
] = Bytecode(ir
[fix
].code
, cast(uint) ir
.length
- fix
- IRL
!(IR
.OrStart
));
460 fix
= fixupStack
.pop();
461 enforce(ir
[fix
].code
== IR
.OrStart
, "no matching ')'");
462 ir
[fix
] = Bytecode(IR
.OrStart
, cast(uint) ir
.length
- fix
- IRL
!(IR
.OrStart
));
463 put(Bytecode(IR
.OrEnd
, cast(uint) ir
.length
- fix
- IRL
!(IR
.OrStart
)));
464 uint pc
= fix
+ IRL
!(IR
.OrStart
);
465 while (ir
[pc
].code
== IR
.Option
)
467 pc
= pc
+ ir
[pc
].data
;
468 if (ir
[pc
].code
!= IR
.GotoEndOr
)
470 ir
[pc
] = Bytecode(IR
.GotoEndOr
, cast(uint)(ir
.length
- pc
- IRL
!(IR
.OrEnd
)));
471 pc
+= IRL
!(IR
.GotoEndOr
);
473 put(Bytecode
.fromRaw(0));
476 // returns: (flag - repetition possible?, fixup of the start of this "group")
477 Tuple
!(bool, uint) onClose()
480 uint fix
= popFixup();
481 switch (ir
[fix
].code
)
484 put(Bytecode(IR
.GroupEnd
, ir
[fix
].data
));
485 return tuple(true, fix
);
486 case IR
.LookaheadStart
, IR
.NeglookaheadStart
, IR
.LookbehindStart
, IR
.NeglookbehindStart
:
487 assert(lookaroundNest
);
489 return tuple(false, 0u);
490 case IR
.Option
: //| xxx )
491 //two fixups: last option + full OR
492 finishAlternation(fix
);
494 switch (ir
[fix
].code
)
498 put(Bytecode(IR
.GroupEnd
, ir
[fix
].data
));
499 return tuple(true, fix
);
500 case IR
.LookaheadStart
, IR
.NeglookaheadStart
, IR
.LookbehindStart
, IR
.NeglookbehindStart
:
501 assert(lookaroundNest
);
504 return tuple(false, 0u);
507 return tuple(true, fix
);
510 return tuple(true, fix
);
514 uint popFixup(){ return fixupStack
.pop(); }
516 void pushFixup(uint val
){ return fixupStack
.push(val
); }
518 @property uint topFixup(){ return fixupStack
.top
; }
520 @property size_t
fixupLength(){ return fixupStack
.data
.length
; }
522 @property uint length(){ return cast(uint) ir
.length
; }
526 enum maxGroupNumber
= 2^^
19;
527 enum maxLookaroundDepth
= 16;
528 // *Bytecode.sizeof, i.e. 1Mb of bytecode alone
529 enum maxCompiledLength
= 2^^
18;
530 // amounts to up to 4 Mb of auxilary table for matching
531 enum maxCumulativeRepetitionLength
= 2^^
20;
532 // marker to indicate infinite repetition
535 struct Parser(R
, Generator
)
536 if (isForwardRange
!R
&& is(ElementType
!R
: dchar))
540 R pat
, origin
; //keep full pattern for pretty printing error messages
541 uint re_flags
= 0; //global flags e.g. multiline + internal ones
544 @trusted this(S
)(R pattern
, S flags
)
547 pat
= origin
= pattern
;
548 //reserve slightly more then avg as sampled from unittests
550 front
= ' ';//a safe default for freeform parsing
552 g
.start(cast(uint) pat
.length
);
559 error(e
.msg
);//also adds pattern location
579 while (!empty
&& isWhite(front
)) _popFront();
585 if (re_flags
& RegexOption
.freeform
) skipSpace();
588 auto save(){ return this; }
590 //parsing number with basic overflow check
594 while (std
.ascii
.isDigit(front
))
596 if (r
>= (uint.max
/10))
597 error("Overflow in decimal number");
598 r
= 10*r
+ cast(uint)(front
-'0');
606 @trusted void parseFlags(S
)(S flags
)
607 {//@@@BUG@@@ text is @system
608 import std
.conv
: text
;
609 foreach (ch
; flags
)//flags are ASCII anyway
615 foreach (i
, op
; __traits(allMembers
, RegexOption
))
617 case RegexOptionNames
[i
]:
618 if (re_flags
& mixin("RegexOption."~op
))
619 throw new RegexException(text("redundant flag specified: ",ch
));
620 re_flags |
= mixin("RegexOption."~op
);
624 throw new RegexException(text("unknown regex flag '",ch
,"'"));
629 //parse and store IR for regex pattern
630 @trusted void parseRegex()
632 uint fix
;//fixup pointer
636 debug(std_regex_parser
)
637 __ctfe ||
writeln("*LR*\nSource: ", pat
, "\nStack: ",fixupStack
.data
);
651 enforce(!empty
, "Unexpected end of pattern");
664 g
.genLookaround(IR
.LookaheadStart
);
668 g
.genLookaround(IR
.NeglookaheadStart
);
673 enforce(front
== '<', "Expected '<' in named group");
676 if (empty ||
!(isAlpha(front
) || front
== '_'))
677 error("Expected alpha starting a named group");
680 while (!empty
&& (isAlpha(front
) ||
681 front
== '_' || std
.ascii
.isDigit(front
)))
686 enforce(front
== '>', "Expected '>' closing named group");
688 g
.genNamedGroup(name
);
693 g
.genLookaround(IR
.LookbehindStart
);
694 else if (front
== '!')
695 g
.genLookaround(IR
.NeglookbehindStart
);
697 error("'!' or '=' expected after '<'");
701 uint enableFlags
, disableFlags
;
709 enableFlags |
= RegexOption
.singleline
;
711 disableFlags |
= RegexOption
.singleline
;
715 enableFlags |
= RegexOption
.freeform
;
717 disableFlags |
= RegexOption
.freeform
;
721 enableFlags |
= RegexOption
.casefold
;
723 disableFlags |
= RegexOption
.casefold
;
727 enableFlags |
= RegexOption
.multiline
;
729 disableFlags |
= RegexOption
.multiline
;
733 error(" unexpected second '-' in flags");
737 error(" 's', 'x', 'i', 'm' or '-' expected after '(?' ");
740 }while (front
!= ')');
742 re_flags |
= enableFlags
;
743 re_flags
&= ~disableFlags
;
752 enforce(g
.nesting
, "Unmatched ')'");
754 auto pair
= g
.onClose();
756 parseQuantifier(pair
[1]);
762 default://no groups or whatever
763 immutable start
= g
.length
;
765 parseQuantifier(start
);
769 if (g
.fixupLength
!= 1)
772 g
.finishAlternation(fix
);
773 enforce(g
.fixupLength
== 1, "no matching ')'");
778 //parse and store IR for atom-quantifier pair
779 @trusted void parseQuantifier(uint offset
)
782 return g
.fixRepetition(offset
);
800 enforce(!empty
, "Unexpected end of regex pattern");
801 enforce(std
.ascii
.isDigit(front
), "First number required in repetition");
802 min
= parseDecimal();
805 else if (front
== ',')
808 if (std
.ascii
.isDigit(front
))
809 max
= parseDecimal();
810 else if (front
== '}')
813 error("Unexpected symbol in regex pattern");
815 enforce(front
== '}', "Unmatched '{' in regex pattern");
818 error("Unexpected symbol in regex pattern");
819 enforce(min
<= max
, "Illegal {n,m} quantifier");
822 g
.fixRepetition(offset
);
826 //check only if we managed to get new symbol
828 if (!empty
&& front
== '?')
833 g
.fixRepetition(offset
, min
, max
, greedy
);
836 //parse and store IR for atom
843 case '*', '?', '+', '|', '{', '}':
844 return error("'*', '+', '?', '{', '}' not allowed in atom");
846 if (re_flags
& RegexOption
.singleline
)
847 g
.put(Bytecode(IR
.Any
, 0));
851 g
.charsetToIr(set
.add('\n','\n'+1).add('\r', '\r'+1).inverted
);
860 enforce(!empty
, "Unfinished escape sequence");
864 if (re_flags
& RegexOption
.multiline
)
865 g
.put(Bytecode(IR
.Bol
, 0));
867 g
.put(Bytecode(IR
.Bof
, 0));
871 if (re_flags
& RegexOption
.multiline
)
872 g
.put(Bytecode(IR
.Eol
, 0));
874 g
.put(Bytecode(IR
.Eof
, 0));
878 if (re_flags
& RegexOption
.casefold
)
880 auto range
= simpleCaseFoldings(front
);
881 assert(range
.length
<= 5);
882 if (range
.length
== 1)
883 g
.put(Bytecode(IR
.Char
, range
.front
));
886 g
.put(Bytecode(IR
.OrChar
, v
, cast(uint) range
.length
));
889 g
.put(Bytecode(IR
.Char
, front
));
894 //parse and store IR for CodepointSet
897 const save
= re_flags
;
898 re_flags
&= ~RegexOption
.freeform
; // stop ignoring whitespace if we did
899 bool casefold
= cast(bool)(re_flags
& RegexOption
.casefold
);
900 g
.charsetToIr(unicode
.parseSet(this, casefold
));
902 // Last next() in parseCharset is executed w/o freeform flag
903 if (re_flags
& RegexOption
.freeform
) skipSpace();
906 //parse and generate IR for escape stand alone escape sequence
907 @trusted void parseEscape()
908 {//accesses array of appender
909 import std
.algorithm
.iteration
: sum
;
912 case 'f': popFront(); g
.put(Bytecode(IR
.Char
, '\f')); break;
913 case 'n': popFront(); g
.put(Bytecode(IR
.Char
, '\n')); break;
914 case 'r': popFront(); g
.put(Bytecode(IR
.Char
, '\r')); break;
915 case 't': popFront(); g
.put(Bytecode(IR
.Char
, '\t')); break;
916 case 'v': popFront(); g
.put(Bytecode(IR
.Char
, '\v')); break;
920 g
.charsetToIr(unicode
.Nd
);
924 g
.charsetToIr(unicode
.Nd
.inverted
);
926 case 'b': popFront(); g
.put(Bytecode(IR
.Wordboundary
, 0)); break;
927 case 'B': popFront(); g
.put(Bytecode(IR
.Notwordboundary
, 0)); break;
930 g
.charsetToIr(unicode
.White_Space
);
934 g
.charsetToIr(unicode
.White_Space
.inverted
);
938 g
.charsetToIr(wordCharacter
);
942 g
.charsetToIr(wordCharacter
.inverted
);
945 bool casefold
= cast(bool)(re_flags
& RegexOption
.casefold
);
946 auto set
= unicode
.parsePropertySpec(this, front
== 'P', casefold
);
950 immutable code
= parseUniHex(pat
, 2);
952 g
.put(Bytecode(IR
.Char
,code
));
955 immutable code
= parseUniHex(pat
, front
== 'u' ?
4 : 8);
957 g
.put(Bytecode(IR
.Char
, code
));
959 case 'c': //control codes
960 Bytecode code
= Bytecode(IR
.Char
, unicode
.parseControlCode(this));
966 g
.put(Bytecode(IR
.Char
, 0));//NUL character
968 case '1': .. case '9':
969 uint nref
= cast(uint) front
- '0';
970 immutable maxBackref
= sum(g
.groupStack
.data
);
971 enforce(nref
< maxBackref
, "Backref to unseen group");
972 //perl's disambiguation rule i.e.
973 //get next digit only if there is such group number
975 while (nref
< maxBackref
&& !empty
&& std
.ascii
.isDigit(front
))
977 nref
= nref
* 10 + front
- '0';
980 if (nref
>= maxBackref
)
982 enforce(!g
.isOpenGroup(nref
), "Backref to open group");
983 uint localLimit
= maxBackref
- g
.groupStack
.top
;
984 if (nref
>= localLimit
)
986 g
.put(Bytecode(IR
.Backref
, nref
-localLimit
));
987 g
.ir
[$-1].setLocalRef();
990 g
.put(Bytecode(IR
.Backref
, nref
));
994 if (front
== '\\' && !pat
.empty
)
996 if (pat
.front
>= privateUseStart
&& pat
.front
<= privateUseEnd
)
997 enforce(false, "invalid escape sequence");
999 if (front
>= privateUseStart
&& front
<= privateUseEnd
)
1001 g
.endPattern(front
- privateUseStart
+ 1);
1004 auto op
= Bytecode(IR
.Char
, front
);
1011 @trusted void error(string msg
)
1013 import std
.conv
: text
;
1015 app
~= "\nPattern with error: `";
1016 app
~= origin
[0..$-pat
.length
].text
;
1017 app
~= "` <--HERE-- `";
1020 throw new RegexException(app
);
1023 alias Char
= BasicElementOf
!R
;
1027 return makeRegex(this);
1032 Postproces the IR, then optimize.
1034 @trusted void postprocess(Char
)(ref Regex
!Char zis
)
1035 {//@@@BUG@@@ write is @system
1038 struct FixedStack(T
)
1042 //this(T[] storage){ arr = storage; _top = -1; }
1043 @property ref T
top(){ assert(!empty
); return arr
[_top
]; }
1044 void push(T x
){ arr
[++_top
] = x
; }
1045 T
pop() { assert(!empty
); return arr
[_top
--]; }
1046 @property bool empty(){ return _top
== -1; }
1048 auto counterRange
= FixedStack
!uint(new uint[maxCounterDepth
+1], -1);
1049 counterRange
.push(1);
1051 for (uint i
= 0; i
< ir
.length
; i
+= ir
[i
].length
)
1055 assert(i
+ 1 < ir
.length
,
1056 "unexpected end of IR while looking for hotspot");
1057 ir
[i
+1] = Bytecode
.fromRaw(hotspotTableSize
);
1058 hotspotTableSize
+= counterRange
.top
;
1062 case IR
.RepeatStart
, IR
.RepeatQStart
:
1063 uint repEnd
= cast(uint)(i
+ ir
[i
].data
+ IRL
!(IR
.RepeatStart
));
1064 assert(ir
[repEnd
].code
== ir
[i
].paired
.code
);
1065 immutable max
= ir
[repEnd
+ 4].raw
;
1066 ir
[repEnd
+2].raw
= counterRange
.top
;
1067 ir
[repEnd
+3].raw
*= counterRange
.top
;
1068 ir
[repEnd
+4].raw
*= counterRange
.top
;
1069 ulong cntRange
= cast(ulong)(max
)*counterRange
.top
;
1070 cumRange
+= cntRange
;
1071 enforce(cumRange
< maxCumulativeRepetitionLength
,
1072 "repetition length limit is exceeded");
1073 counterRange
.push(cast(uint) cntRange
+ counterRange
.top
);
1074 threadCount
+= counterRange
.top
;
1076 case IR
.RepeatEnd
, IR
.RepeatQEnd
:
1077 threadCount
+= counterRange
.top
;
1081 if (isBackref(ir
[i
].data
))
1082 ir
[i
].setBackrefence();
1083 threadCount
+= counterRange
.top
;
1086 if (isBackref(ir
[i
].data
))
1087 ir
[i
].setBackrefence();
1088 threadCount
+= counterRange
.top
;
1091 threadCount
+= counterRange
.top
;
1095 if (!(flags
& RegexInfo
.oneShot
))
1096 kickstart
= Kickstart
!Char(zis
, new uint[](256));
1097 debug(std_regex_allocation
) writefln("IR processed, max threads: %d", threadCount
);
1102 void fixupBytecode()(Bytecode
[] ir
)
1106 with(IR
) for (uint i
=0; i
<ir
.length
; i
+= ir
[i
].length
)
1108 if (ir
[i
].isStart || ir
[i
].code
== Option
)
1110 else if (ir
[i
].code
== OrEnd
)
1112 // Alternatives need more care
1113 auto j
= fixups
.pop(); // last Option
1114 ir
[j
].data
= i
- j
- ir
[j
].length
;
1115 j
= fixups
.pop(); // OrStart
1116 ir
[j
].data
= i
- j
- ir
[j
].length
;
1117 ir
[i
].data
= ir
[j
].data
;
1119 // fixup all GotoEndOrs
1120 j
= j
+ IRL
!(OrStart
);
1121 assert(ir
[j
].code
== Option
);
1124 auto next
= j
+ ir
[j
].data
+ IRL
!(Option
);
1125 if (ir
[next
].code
== IR
.OrEnd
)
1127 ir
[next
- IRL
!(GotoEndOr
)].data
= i
- next
;
1131 else if (ir
[i
].code
== GotoEndOr
)
1133 auto j
= fixups
.pop(); // Option
1134 ir
[j
].data
= i
- j
+ IRL
!(GotoEndOr
)- IRL
!(Option
); // to the next option
1136 else if (ir
[i
].isEnd
)
1138 auto j
= fixups
.pop();
1139 ir
[i
].data
= i
- j
- ir
[j
].length
;
1140 ir
[j
].data
= ir
[i
].data
;
1143 assert(fixups
.empty
);
1146 void optimize(Char
)(ref Regex
!Char zis
)
1148 import std
.array
: insertInPlace
;
1149 CodepointSet
nextSet(uint idx
)
1154 for (uint i
= idx
; i
< ir
.length
; i
+= ir
[i
].length
)
1159 set
.add(ir
[i
].data
, ir
[i
].data
+1);
1162 case Trie
, CodepointSet
:
1163 set
= zis
.charsets
[ir
[i
].data
];
1165 case GroupStart
,GroupEnd
:
1174 with(zis
) with(IR
) for (uint i
= 0; i
< ir
.length
; i
+= ir
[i
].length
)
1176 if (ir
[i
].code
== InfiniteEnd
)
1178 auto set
= nextSet(i
+IRL
!(InfiniteEnd
));
1179 if (!set
.empty
&& set
.length
< 10_000)
1181 ir
[i
] = Bytecode(InfiniteBloomEnd
, ir
[i
].data
);
1182 ir
[i
- ir
[i
].data
- IRL
!(InfiniteStart
)] =
1183 Bytecode(InfiniteBloomStart
, ir
[i
].data
);
1184 ir
.insertInPlace(i
+IRL
!(InfiniteEnd
),
1185 Bytecode
.fromRaw(cast(uint) zis
.filters
.length
));
1186 zis
.filters
~= BitTable(set
);
1193 //IR code validator - proper nesting, illegal instructions, etc.
1194 @trusted void validateRe(Char
)(ref Regex
!Char zis
)
1195 {//@@@BUG@@@ text is @system
1196 import std
.conv
: text
;
1199 for (uint pc
= 0; pc
< ir
.length
; pc
+= ir
[pc
].length
)
1201 if (ir
[pc
].isStart || ir
[pc
].isEnd
)
1203 immutable dest
= ir
[pc
].indexOfPair(pc
);
1204 assert(dest
< ir
.length
, text("Wrong length in opcode at pc=",
1205 pc
, " ", dest
, " vs ", ir
.length
));
1206 assert(ir
[dest
].paired
== ir
[pc
],
1207 text("Wrong pairing of opcodes at pc=", pc
, "and pc=", dest
));
1209 else if (ir
[pc
].isAtom
)
1214 assert(0, text("Unknown type of instruction at pc=", pc
));