d: Merge upstream dmd 4d1bfcf14, druntime 9ba9a6ae, phobos c0cc5e917.
[official-gcc.git] / libphobos / src / std / regex / internal / parser.d
blob448bb99a9a132fd2a7c37cae59abec446b316a40
1 //Written in the D programming language
2 /*
3 Regular expression pattern parser.
4 */
5 module std.regex.internal.parser;
7 import std.regex.internal.ir;
8 import std.range.primitives, std.uni, std.meta,
9 std.traits, std.typecons, std.exception;
10 static import std.ascii;
12 // package relevant info from parser into a regex object
13 auto makeRegex(S, CG)(Parser!(S, CG) p)
15 import std.regex.internal.backtracking : BacktrackingMatcher;
16 import std.regex.internal.thompson : ThompsonMatcher;
17 import std.algorithm.searching : canFind;
18 alias Char = BasicElementOf!S;
19 Regex!Char re;
20 auto g = p.g;
21 with(re)
23 ir = g.ir;
24 dict = g.dict;
25 ngroup = g.ngroup;
26 maxCounterDepth = g.counterDepth;
27 flags = p.re_flags;
28 charsets = g.charsets;
29 matchers = g.matchers;
30 backrefed = g.backrefed;
31 re.pattern = p.origin.idup;
32 re.postprocess();
33 // check if we have backreferences, if so - use backtracking
34 if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...);
35 else if (re.backrefed.canFind!"a != 0")
36 factory = new RuntimeFactory!(BacktrackingMatcher, Char);
37 else
38 factory = new RuntimeFactory!(ThompsonMatcher, Char);
39 debug(std_regex_parser)
41 __ctfe || print();
43 //@@@BUG@@@ (not reduced)
44 //somehow just using validate _collides_ with std.utf.validate (!)
45 version (assert) re.validateRe();
47 return re;
50 // helper for unittest
51 auto makeRegex(S)(S arg)
52 if (isSomeString!S)
54 return makeRegex(Parser!(S, CodeGen)(arg, ""));
57 @system unittest
59 import std.algorithm.comparison : equal;
60 auto re = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`);
61 auto nc = re.namedCaptures;
62 static assert(isRandomAccessRange!(typeof(nc)));
63 assert(!nc.empty);
64 assert(nc.length == 2);
65 assert(nc.equal(["name", "var"]));
66 assert(nc[0] == "name");
67 assert(nc[1..$].equal(["var"]));
69 re = makeRegex(`(\w+) (?P<named>\w+) (\w+)`);
70 nc = re.namedCaptures;
71 assert(nc.length == 1);
72 assert(nc[0] == "named");
73 assert(nc.front == "named");
74 assert(nc.back == "named");
76 re = makeRegex(`(\w+) (\w+)`);
77 nc = re.namedCaptures;
78 assert(nc.empty);
80 re = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`);
81 nc = re.namedCaptures;
82 auto cp = nc.save;
83 assert(nc.equal(cp));
84 nc.popFront();
85 assert(nc.equal(cp[1..$]));
86 nc.popBack();
87 assert(nc.equal(cp[1 .. $ - 1]));
91 @trusted void reverseBytecode()(Bytecode[] code)
93 Bytecode[] rev = new Bytecode[code.length];
94 uint revPc = cast(uint) rev.length;
95 Stack!(Tuple!(uint, uint, uint)) stack;
96 uint start = 0;
97 uint end = cast(uint) code.length;
98 for (;;)
100 for (uint pc = start; pc < end; )
102 immutable len = code[pc].length;
103 if (code[pc].code == IR.GotoEndOr)
104 break; //pick next alternation branch
105 if (code[pc].isAtom)
107 rev[revPc - len .. revPc] = code[pc .. pc + len];
108 revPc -= len;
109 pc += len;
111 else if (code[pc].isStart || code[pc].isEnd)
113 //skip over other embedded lookbehinds they are reversed
114 if (code[pc].code == IR.LookbehindStart
115 || code[pc].code == IR.NeglookbehindStart)
117 immutable blockLen = len + code[pc].data
118 + code[pc].pairedLength;
119 rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen];
120 pc += blockLen;
121 revPc -= blockLen;
122 continue;
124 immutable second = code[pc].indexOfPair(pc);
125 immutable secLen = code[second].length;
126 rev[revPc - secLen .. revPc] = code[second .. second + secLen];
127 revPc -= secLen;
128 if (code[pc].code == IR.OrStart)
130 //we pass len bytes forward, but secLen in reverse
131 immutable revStart = revPc - (second + len - secLen - pc);
132 uint r = revStart;
133 uint i = pc + IRL!(IR.OrStart);
134 while (code[i].code == IR.Option)
136 if (code[i - 1].code != IR.OrStart)
138 assert(code[i - 1].code == IR.GotoEndOr);
139 rev[r - 1] = code[i - 1];
141 rev[r] = code[i];
142 auto newStart = i + IRL!(IR.Option);
143 auto newEnd = newStart + code[i].data;
144 auto newRpc = r + code[i].data + IRL!(IR.Option);
145 if (code[newEnd].code != IR.OrEnd)
147 newRpc--;
149 stack.push(tuple(newStart, newEnd, newRpc));
150 r += code[i].data + IRL!(IR.Option);
151 i += code[i].data + IRL!(IR.Option);
153 pc = i;
154 revPc = revStart;
155 assert(code[pc].code == IR.OrEnd);
157 else
158 pc += len;
161 if (stack.empty)
162 break;
163 start = stack.top[0];
164 end = stack.top[1];
165 revPc = stack.top[2];
166 stack.pop();
168 code[] = rev[];
171 struct CodeGen
173 Bytecode[] ir; // resulting bytecode
174 Stack!(uint) fixupStack; // stack of opened start instructions
175 NamedGroup[] dict; // maps name -> user group number
176 Stack!(uint) groupStack; // stack of current number of group
177 uint nesting = 0; // group nesting level and repetitions step
178 uint lookaroundNest = 0; // nesting of lookaround
179 uint counterDepth = 0; // current depth of nested counted repetitions
180 CodepointSet[] charsets; // sets for char classes
181 const(CharMatcher)[] matchers; // matchers for char classes
182 uint[] backrefed; // bitarray for groups refered by backref
183 uint ngroup; // final number of groups (of all patterns)
185 void start(uint length)
187 if (!__ctfe)
188 ir.reserve((length*5+2)/4);
189 fixupStack.push(0);
190 groupStack.push(1);//0 - whole match
193 //mark referenced groups for latter processing
194 void markBackref(uint n)
196 if (n/32 >= backrefed.length)
197 backrefed.length = n/32 + 1;
198 backrefed[n / 32] |= 1 << (n & 31);
201 bool isOpenGroup(uint n)
203 import std.algorithm.searching : canFind;
204 // walk the fixup stack and see if there are groups labeled 'n'
205 // fixup '0' is reserved for alternations
206 return fixupStack.data[1..$].
207 canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)();
210 void put(Bytecode code)
212 enforce(ir.length < maxCompiledLength,
213 "maximum compiled pattern length is exceeded");
214 ir ~= code;
217 void putRaw(uint number)
219 enforce(ir.length < maxCompiledLength,
220 "maximum compiled pattern length is exceeded");
221 ir ~= Bytecode.fromRaw(number);
224 //try to generate optimal IR code for this CodepointSet
225 @trusted void charsetToIr(CodepointSet set)
226 {//@@@BUG@@@ writeln is @system
227 uint chars = cast(uint) set.length;
228 if (chars < Bytecode.maxSequence)
230 switch (chars)
232 case 1:
233 put(Bytecode(IR.Char, set.byCodepoint.front));
234 break;
235 case 0:
236 throw new RegexException("empty CodepointSet not allowed");
237 default:
238 foreach (ch; set.byCodepoint)
239 put(Bytecode(IR.OrChar, ch, chars));
242 else
244 import std.algorithm.searching : countUntil;
245 const ivals = set.byInterval;
246 immutable n = charsets.countUntil(set);
247 if (n >= 0)
249 if (ivals.length*2 > maxCharsetUsed)
250 put(Bytecode(IR.Trie, cast(uint) n));
251 else
252 put(Bytecode(IR.CodepointSet, cast(uint) n));
253 return;
255 if (ivals.length*2 > maxCharsetUsed)
257 auto t = getMatcher(set);
258 put(Bytecode(IR.Trie, cast(uint) matchers.length));
259 matchers ~= t;
260 debug(std_regex_allocation) writeln("Trie generated");
262 else
264 put(Bytecode(IR.CodepointSet, cast(uint) charsets.length));
265 matchers ~= CharMatcher.init;
267 charsets ~= set;
268 assert(charsets.length == matchers.length);
272 void genLogicGroup()
274 nesting++;
275 pushFixup(length);
276 put(Bytecode(IR.Nop, 0));
279 void genGroup()
281 nesting++;
282 pushFixup(length);
283 immutable nglob = groupStack.top++;
284 enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded");
285 put(Bytecode(IR.GroupStart, nglob));
288 void genNamedGroup(string name)
290 import std.array : insertInPlace;
291 import std.range : assumeSorted;
292 nesting++;
293 pushFixup(length);
294 immutable nglob = groupStack.top++;
295 enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded");
296 auto t = NamedGroup(name, nglob);
297 auto d = assumeSorted!"a.name < b.name"(dict);
298 immutable ind = d.lowerBound(t).length;
299 insertInPlace(dict, ind, t);
300 put(Bytecode(IR.GroupStart, nglob));
303 //generate code for start of lookaround: (?= (?! (?<= (?<!
304 void genLookaround(IR opcode)
306 nesting++;
307 pushFixup(length);
308 put(Bytecode(opcode, 0));
309 put(Bytecode.fromRaw(0));
310 put(Bytecode.fromRaw(0));
311 groupStack.push(0);
312 lookaroundNest++;
313 enforce(lookaroundNest <= maxLookaroundDepth,
314 "maximum lookaround depth is exceeded");
317 void endPattern(uint num)
319 import std.algorithm.comparison : max;
320 put(Bytecode(IR.End, num));
321 ngroup = max(ngroup, groupStack.top);
322 groupStack.top = 1; // reset group counter
325 //fixup lookaround with start at offset fix and append a proper *-End opcode
326 void fixLookaround(uint fix)
328 lookaroundNest--;
329 ir[fix] = Bytecode(ir[fix].code,
330 cast(uint) ir.length - fix - IRL!(IR.LookaheadStart));
331 auto g = groupStack.pop();
332 assert(!groupStack.empty);
333 ir[fix+1] = Bytecode.fromRaw(groupStack.top);
334 //groups are cumulative across lookarounds
335 ir[fix+2] = Bytecode.fromRaw(groupStack.top+g);
336 groupStack.top += g;
337 if (ir[fix].code == IR.LookbehindStart || ir[fix].code == IR.NeglookbehindStart)
339 reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]);
341 put(ir[fix].paired);
344 // repetition of {1,1}
345 void fixRepetition(uint offset)
347 import std.algorithm.mutation : copy;
348 immutable replace = ir[offset].code == IR.Nop;
349 if (replace)
351 copy(ir[offset + 1 .. $], ir[offset .. $ - 1]);
352 ir.length -= 1;
356 // repetition of {x,y}
357 void fixRepetition(uint offset, uint min, uint max, bool greedy)
359 static import std.algorithm.comparison;
360 import std.algorithm.mutation : copy;
361 import std.array : insertInPlace;
362 immutable replace = ir[offset].code == IR.Nop;
363 immutable len = cast(uint) ir.length - offset - replace;
364 if (max != infinite)
366 if (min != 1 || max != 1)
368 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
369 if (replace)
370 ir[offset] = op;
371 else
372 insertInPlace(ir, offset, op);
373 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
374 put(Bytecode.init); //hotspot
375 putRaw(1);
376 putRaw(min);
377 putRaw(max);
378 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
381 else if (min) //&& max is infinite
383 if (min != 1)
385 Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
386 if (replace)
387 ir[offset] = op;
388 else
389 insertInPlace(ir, offset, op);
390 offset += 1;//so it still points to the repeated block
391 put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
392 put(Bytecode.init); //hotspot
393 putRaw(1);
394 putRaw(min);
395 putRaw(min);
396 counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
398 else if (replace)
400 copy(ir[offset+1 .. $], ir[offset .. $-1]);
401 ir.length -= 1;
403 put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len));
404 enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded");
405 ir ~= ir[offset .. offset+len];
406 //IR.InfinteX is always a hotspot
407 put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
408 put(Bytecode.init); //merge index
410 else//vanila {0,inf}
412 Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len);
413 if (replace)
414 ir[offset] = op;
415 else
416 insertInPlace(ir, offset, op);
417 //IR.InfinteX is always a hotspot
418 put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
419 put(Bytecode.init); //merge index
423 void fixAlternation()
425 import std.array : insertInPlace;
426 uint fix = fixupStack.top;
427 if (ir.length > fix && ir[fix].code == IR.Option)
429 ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix);
430 put(Bytecode(IR.GotoEndOr, 0));
431 fixupStack.top = cast(uint) ir.length; //replace latest fixup for Option
432 put(Bytecode(IR.Option, 0));
433 return;
435 uint len, orStart;
436 //start a new option
437 if (fixupStack.length == 1)
438 {//only root entry, effectively no fixup
439 len = cast(uint) ir.length + IRL!(IR.GotoEndOr);
440 orStart = 0;
442 else
443 {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length
444 len = cast(uint) ir.length - fix - (ir[fix].length - 1);
445 orStart = fix + ir[fix].length;
447 insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len));
448 assert(ir[orStart].code == IR.OrStart);
449 put(Bytecode(IR.GotoEndOr, 0));
450 fixupStack.push(orStart); //fixup for StartOR
451 fixupStack.push(cast(uint) ir.length); //for second Option
452 put(Bytecode(IR.Option, 0));
455 // finalizes IR.Option, fix points to the first option of sequence
456 void finishAlternation(uint fix)
458 enforce(ir[fix].code == IR.Option, "no matching ')'");
459 ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix - IRL!(IR.OrStart));
460 fix = fixupStack.pop();
461 enforce(ir[fix].code == IR.OrStart, "no matching ')'");
462 ir[fix] = Bytecode(IR.OrStart, cast(uint) ir.length - fix - IRL!(IR.OrStart));
463 put(Bytecode(IR.OrEnd, cast(uint) ir.length - fix - IRL!(IR.OrStart)));
464 uint pc = fix + IRL!(IR.OrStart);
465 while (ir[pc].code == IR.Option)
467 pc = pc + ir[pc].data;
468 if (ir[pc].code != IR.GotoEndOr)
469 break;
470 ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd)));
471 pc += IRL!(IR.GotoEndOr);
473 put(Bytecode.fromRaw(0));
476 // returns: (flag - repetition possible?, fixup of the start of this "group")
477 Tuple!(bool, uint) onClose()
479 nesting--;
480 uint fix = popFixup();
481 switch (ir[fix].code)
483 case IR.GroupStart:
484 put(Bytecode(IR.GroupEnd, ir[fix].data));
485 return tuple(true, fix);
486 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
487 assert(lookaroundNest);
488 fixLookaround(fix);
489 return tuple(false, 0u);
490 case IR.Option: //| xxx )
491 //two fixups: last option + full OR
492 finishAlternation(fix);
493 fix = topFixup;
494 switch (ir[fix].code)
496 case IR.GroupStart:
497 popFixup();
498 put(Bytecode(IR.GroupEnd, ir[fix].data));
499 return tuple(true, fix);
500 case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
501 assert(lookaroundNest);
502 fix = popFixup();
503 fixLookaround(fix);
504 return tuple(false, 0u);
505 default://(?:xxx)
506 popFixup();
507 return tuple(true, fix);
509 default://(?:xxx)
510 return tuple(true, fix);
514 uint popFixup(){ return fixupStack.pop(); }
516 void pushFixup(uint val){ return fixupStack.push(val); }
518 @property uint topFixup(){ return fixupStack.top; }
520 @property size_t fixupLength(){ return fixupStack.data.length; }
522 @property uint length(){ return cast(uint) ir.length; }
525 // safety limits
526 enum maxGroupNumber = 2^^19;
527 enum maxLookaroundDepth = 16;
528 // *Bytecode.sizeof, i.e. 1Mb of bytecode alone
529 enum maxCompiledLength = 2^^18;
530 // amounts to up to 4 Mb of auxilary table for matching
531 enum maxCumulativeRepetitionLength = 2^^20;
532 // marker to indicate infinite repetition
533 enum infinite = ~0u;
535 struct Parser(R, Generator)
536 if (isForwardRange!R && is(ElementType!R : dchar))
538 dchar front;
539 bool empty;
540 R pat, origin; //keep full pattern for pretty printing error messages
541 uint re_flags = 0; //global flags e.g. multiline + internal ones
542 Generator g;
544 @trusted this(S)(R pattern, S flags)
545 if (isSomeString!S)
547 pat = origin = pattern;
548 //reserve slightly more then avg as sampled from unittests
549 parseFlags(flags);
550 front = ' ';//a safe default for freeform parsing
551 popFront();
552 g.start(cast(uint) pat.length);
555 parseRegex();
557 catch (Exception e)
559 error(e.msg);//also adds pattern location
561 g.endPattern(1);
564 void _popFront()
566 if (pat.empty)
568 empty = true;
570 else
572 front = pat.front;
573 pat.popFront();
577 void skipSpace()
579 while (!empty && isWhite(front)) _popFront();
582 void popFront()
584 _popFront();
585 if (re_flags & RegexOption.freeform) skipSpace();
588 auto save(){ return this; }
590 //parsing number with basic overflow check
591 uint parseDecimal()
593 uint r = 0;
594 while (std.ascii.isDigit(front))
596 if (r >= (uint.max/10))
597 error("Overflow in decimal number");
598 r = 10*r + cast(uint)(front-'0');
599 popFront();
600 if (empty) break;
602 return r;
606 @trusted void parseFlags(S)(S flags)
607 {//@@@BUG@@@ text is @system
608 import std.conv : text;
609 foreach (ch; flags)//flags are ASCII anyway
611 L_FlagSwitch:
612 switch (ch)
615 foreach (i, op; __traits(allMembers, RegexOption))
617 case RegexOptionNames[i]:
618 if (re_flags & mixin("RegexOption."~op))
619 throw new RegexException(text("redundant flag specified: ",ch));
620 re_flags |= mixin("RegexOption."~op);
621 break L_FlagSwitch;
623 default:
624 throw new RegexException(text("unknown regex flag '",ch,"'"));
629 //parse and store IR for regex pattern
630 @trusted void parseRegex()
632 uint fix;//fixup pointer
634 while (!empty)
636 debug(std_regex_parser)
637 __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data);
638 switch (front)
640 case '(':
641 popFront();
642 if (front == '?')
644 popFront();
645 switch (front)
647 case '#':
648 for (;;)
650 popFront();
651 enforce(!empty, "Unexpected end of pattern");
652 if (front == ')')
654 popFront();
655 break;
658 break;
659 case ':':
660 g.genLogicGroup();
661 popFront();
662 break;
663 case '=':
664 g.genLookaround(IR.LookaheadStart);
665 popFront();
666 break;
667 case '!':
668 g.genLookaround(IR.NeglookaheadStart);
669 popFront();
670 break;
671 case 'P':
672 popFront();
673 enforce(front == '<', "Expected '<' in named group");
674 string name;
675 popFront();
676 if (empty || !(isAlpha(front) || front == '_'))
677 error("Expected alpha starting a named group");
678 name ~= front;
679 popFront();
680 while (!empty && (isAlpha(front) ||
681 front == '_' || std.ascii.isDigit(front)))
683 name ~= front;
684 popFront();
686 enforce(front == '>', "Expected '>' closing named group");
687 popFront();
688 g.genNamedGroup(name);
689 break;
690 case '<':
691 popFront();
692 if (front == '=')
693 g.genLookaround(IR.LookbehindStart);
694 else if (front == '!')
695 g.genLookaround(IR.NeglookbehindStart);
696 else
697 error("'!' or '=' expected after '<'");
698 popFront();
699 break;
700 default:
701 uint enableFlags, disableFlags;
702 bool enable = true;
705 switch (front)
707 case 's':
708 if (enable)
709 enableFlags |= RegexOption.singleline;
710 else
711 disableFlags |= RegexOption.singleline;
712 break;
713 case 'x':
714 if (enable)
715 enableFlags |= RegexOption.freeform;
716 else
717 disableFlags |= RegexOption.freeform;
718 break;
719 case 'i':
720 if (enable)
721 enableFlags |= RegexOption.casefold;
722 else
723 disableFlags |= RegexOption.casefold;
724 break;
725 case 'm':
726 if (enable)
727 enableFlags |= RegexOption.multiline;
728 else
729 disableFlags |= RegexOption.multiline;
730 break;
731 case '-':
732 if (!enable)
733 error(" unexpected second '-' in flags");
734 enable = false;
735 break;
736 default:
737 error(" 's', 'x', 'i', 'm' or '-' expected after '(?' ");
739 popFront();
740 }while (front != ')');
741 popFront();
742 re_flags |= enableFlags;
743 re_flags &= ~disableFlags;
746 else
748 g.genGroup();
750 break;
751 case ')':
752 enforce(g.nesting, "Unmatched ')'");
753 popFront();
754 auto pair = g.onClose();
755 if (pair[0])
756 parseQuantifier(pair[1]);
757 break;
758 case '|':
759 popFront();
760 g.fixAlternation();
761 break;
762 default://no groups or whatever
763 immutable start = g.length;
764 parseAtom();
765 parseQuantifier(start);
769 if (g.fixupLength != 1)
771 fix = g.popFixup();
772 g.finishAlternation(fix);
773 enforce(g.fixupLength == 1, "no matching ')'");
778 //parse and store IR for atom-quantifier pair
779 @trusted void parseQuantifier(uint offset)
780 {//copy is @system
781 if (empty)
782 return g.fixRepetition(offset);
783 uint min, max;
784 switch (front)
786 case '*':
787 min = 0;
788 max = infinite;
789 break;
790 case '?':
791 min = 0;
792 max = 1;
793 break;
794 case '+':
795 min = 1;
796 max = infinite;
797 break;
798 case '{':
799 popFront();
800 enforce(!empty, "Unexpected end of regex pattern");
801 enforce(std.ascii.isDigit(front), "First number required in repetition");
802 min = parseDecimal();
803 if (front == '}')
804 max = min;
805 else if (front == ',')
807 popFront();
808 if (std.ascii.isDigit(front))
809 max = parseDecimal();
810 else if (front == '}')
811 max = infinite;
812 else
813 error("Unexpected symbol in regex pattern");
814 skipSpace();
815 enforce(front == '}', "Unmatched '{' in regex pattern");
817 else
818 error("Unexpected symbol in regex pattern");
819 enforce(min <= max, "Illegal {n,m} quantifier");
820 break;
821 default:
822 g.fixRepetition(offset);
823 return;
825 bool greedy = true;
826 //check only if we managed to get new symbol
827 popFront();
828 if (!empty && front == '?')
830 greedy = false;
831 popFront();
833 g.fixRepetition(offset, min, max, greedy);
836 //parse and store IR for atom
837 void parseAtom()
839 if (empty)
840 return;
841 switch (front)
843 case '*', '?', '+', '|', '{', '}':
844 return error("'*', '+', '?', '{', '}' not allowed in atom");
845 case '.':
846 if (re_flags & RegexOption.singleline)
847 g.put(Bytecode(IR.Any, 0));
848 else
850 CodepointSet set;
851 g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted);
853 popFront();
854 break;
855 case '[':
856 parseCharset();
857 break;
858 case '\\':
859 _popFront();
860 enforce(!empty, "Unfinished escape sequence");
861 parseEscape();
862 break;
863 case '^':
864 if (re_flags & RegexOption.multiline)
865 g.put(Bytecode(IR.Bol, 0));
866 else
867 g.put(Bytecode(IR.Bof, 0));
868 popFront();
869 break;
870 case '$':
871 if (re_flags & RegexOption.multiline)
872 g.put(Bytecode(IR.Eol, 0));
873 else
874 g.put(Bytecode(IR.Eof, 0));
875 popFront();
876 break;
877 default:
878 if (re_flags & RegexOption.casefold)
880 auto range = simpleCaseFoldings(front);
881 assert(range.length <= 5);
882 if (range.length == 1)
883 g.put(Bytecode(IR.Char, range.front));
884 else
885 foreach (v; range)
886 g.put(Bytecode(IR.OrChar, v, cast(uint) range.length));
888 else
889 g.put(Bytecode(IR.Char, front));
890 popFront();
894 //parse and store IR for CodepointSet
895 void parseCharset()
897 const save = re_flags;
898 re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did
899 bool casefold = cast(bool)(re_flags & RegexOption.casefold);
900 g.charsetToIr(unicode.parseSet(this, casefold));
901 re_flags = save;
902 // Last next() in parseCharset is executed w/o freeform flag
903 if (re_flags & RegexOption.freeform) skipSpace();
906 //parse and generate IR for escape stand alone escape sequence
907 @trusted void parseEscape()
908 {//accesses array of appender
909 import std.algorithm.iteration : sum;
910 switch (front)
912 case 'f': popFront(); g.put(Bytecode(IR.Char, '\f')); break;
913 case 'n': popFront(); g.put(Bytecode(IR.Char, '\n')); break;
914 case 'r': popFront(); g.put(Bytecode(IR.Char, '\r')); break;
915 case 't': popFront(); g.put(Bytecode(IR.Char, '\t')); break;
916 case 'v': popFront(); g.put(Bytecode(IR.Char, '\v')); break;
918 case 'd':
919 popFront();
920 g.charsetToIr(unicode.Nd);
921 break;
922 case 'D':
923 popFront();
924 g.charsetToIr(unicode.Nd.inverted);
925 break;
926 case 'b': popFront(); g.put(Bytecode(IR.Wordboundary, 0)); break;
927 case 'B': popFront(); g.put(Bytecode(IR.Notwordboundary, 0)); break;
928 case 's':
929 popFront();
930 g.charsetToIr(unicode.White_Space);
931 break;
932 case 'S':
933 popFront();
934 g.charsetToIr(unicode.White_Space.inverted);
935 break;
936 case 'w':
937 popFront();
938 g.charsetToIr(wordCharacter);
939 break;
940 case 'W':
941 popFront();
942 g.charsetToIr(wordCharacter.inverted);
943 break;
944 case 'p': case 'P':
945 bool casefold = cast(bool)(re_flags & RegexOption.casefold);
946 auto set = unicode.parsePropertySpec(this, front == 'P', casefold);
947 g.charsetToIr(set);
948 break;
949 case 'x':
950 immutable code = parseUniHex(pat, 2);
951 popFront();
952 g.put(Bytecode(IR.Char,code));
953 break;
954 case 'u': case 'U':
955 immutable code = parseUniHex(pat, front == 'u' ? 4 : 8);
956 popFront();
957 g.put(Bytecode(IR.Char, code));
958 break;
959 case 'c': //control codes
960 Bytecode code = Bytecode(IR.Char, unicode.parseControlCode(this));
961 popFront();
962 g.put(code);
963 break;
964 case '0':
965 popFront();
966 g.put(Bytecode(IR.Char, 0));//NUL character
967 break;
968 case '1': .. case '9':
969 uint nref = cast(uint) front - '0';
970 immutable maxBackref = sum(g.groupStack.data);
971 enforce(nref < maxBackref, "Backref to unseen group");
972 //perl's disambiguation rule i.e.
973 //get next digit only if there is such group number
974 popFront();
975 while (nref < maxBackref && !empty && std.ascii.isDigit(front))
977 nref = nref * 10 + front - '0';
978 popFront();
980 if (nref >= maxBackref)
981 nref /= 10;
982 enforce(!g.isOpenGroup(nref), "Backref to open group");
983 uint localLimit = maxBackref - g.groupStack.top;
984 if (nref >= localLimit)
986 g.put(Bytecode(IR.Backref, nref-localLimit));
987 g.ir[$-1].setLocalRef();
989 else
990 g.put(Bytecode(IR.Backref, nref));
991 g.markBackref(nref);
992 break;
993 default:
994 if (front == '\\' && !pat.empty)
996 if (pat.front >= privateUseStart && pat.front <= privateUseEnd)
997 enforce(false, "invalid escape sequence");
999 if (front >= privateUseStart && front <= privateUseEnd)
1001 g.endPattern(front - privateUseStart + 1);
1002 break;
1004 auto op = Bytecode(IR.Char, front);
1005 popFront();
1006 g.put(op);
1011 @trusted void error(string msg)
1013 import std.array : appender;
1014 import std.format.write : formattedWrite;
1015 auto app = appender!string();
1016 formattedWrite(app, "%s\nPattern with error: `%s` <--HERE-- `%s`",
1017 msg, origin[0..$-pat.length], pat);
1018 throw new RegexException(app.data);
1021 alias Char = BasicElementOf!R;
1023 @property program()
1025 return makeRegex(this);
1030 Postproces the IR, then optimize.
1032 @trusted void postprocess(Char)(ref Regex!Char zis)
1033 {//@@@BUG@@@ write is @system
1034 with(zis)
1036 struct FixedStack(T)
1038 T[] arr;
1039 uint _top;
1040 //this(T[] storage){ arr = storage; _top = -1; }
1041 @property ref T top(){ assert(!empty); return arr[_top]; }
1042 void push(T x){ arr[++_top] = x; }
1043 T pop() { assert(!empty); return arr[_top--]; }
1044 @property bool empty(){ return _top == -1; }
1046 auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1);
1047 counterRange.push(1);
1048 ulong cumRange = 0;
1049 for (uint i = 0; i < ir.length; i += ir[i].length)
1051 if (ir[i].hotspot)
1053 assert(i + 1 < ir.length,
1054 "unexpected end of IR while looking for hotspot");
1055 ir[i+1] = Bytecode.fromRaw(hotspotTableSize);
1056 hotspotTableSize += counterRange.top;
1058 switch (ir[i].code)
1060 case IR.RepeatStart, IR.RepeatQStart:
1061 uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart));
1062 assert(ir[repEnd].code == ir[i].paired.code);
1063 immutable max = ir[repEnd + 4].raw;
1064 ir[repEnd+2].raw = counterRange.top;
1065 ir[repEnd+3].raw *= counterRange.top;
1066 ir[repEnd+4].raw *= counterRange.top;
1067 ulong cntRange = cast(ulong)(max)*counterRange.top;
1068 cumRange += cntRange;
1069 enforce(cumRange < maxCumulativeRepetitionLength,
1070 "repetition length limit is exceeded");
1071 counterRange.push(cast(uint) cntRange + counterRange.top);
1072 threadCount += counterRange.top;
1073 break;
1074 case IR.RepeatEnd, IR.RepeatQEnd:
1075 threadCount += counterRange.top;
1076 counterRange.pop();
1077 break;
1078 case IR.GroupStart:
1079 if (isBackref(ir[i].data))
1080 ir[i].setBackrefence();
1081 threadCount += counterRange.top;
1082 break;
1083 case IR.GroupEnd:
1084 if (isBackref(ir[i].data))
1085 ir[i].setBackrefence();
1086 threadCount += counterRange.top;
1087 break;
1088 default:
1089 threadCount += counterRange.top;
1092 checkIfOneShot();
1093 if (!(flags & RegexInfo.oneShot))
1094 kickstart = Kickstart!Char(zis, new uint[](256));
1095 debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount);
1096 optimize(zis);
1100 void fixupBytecode()(Bytecode[] ir)
1102 Stack!uint fixups;
1104 with(IR) for (uint i=0; i<ir.length; i+= ir[i].length)
1106 if (ir[i].isStart || ir[i].code == Option)
1107 fixups.push(i);
1108 else if (ir[i].code == OrEnd)
1110 // Alternatives need more care
1111 auto j = fixups.pop(); // last Option
1112 ir[j].data = i - j - ir[j].length;
1113 j = fixups.pop(); // OrStart
1114 ir[j].data = i - j - ir[j].length;
1115 ir[i].data = ir[j].data;
1117 // fixup all GotoEndOrs
1118 j = j + IRL!(OrStart);
1119 assert(ir[j].code == Option);
1120 for (;;)
1122 auto next = j + ir[j].data + IRL!(Option);
1123 if (ir[next].code == IR.OrEnd)
1124 break;
1125 ir[next - IRL!(GotoEndOr)].data = i - next;
1126 j = next;
1129 else if (ir[i].code == GotoEndOr)
1131 auto j = fixups.pop(); // Option
1132 ir[j].data = i - j + IRL!(GotoEndOr)- IRL!(Option); // to the next option
1134 else if (ir[i].isEnd)
1136 auto j = fixups.pop();
1137 ir[i].data = i - j - ir[j].length;
1138 ir[j].data = ir[i].data;
1141 assert(fixups.empty);
1144 void optimize(Char)(ref Regex!Char zis)
1146 import std.array : insertInPlace;
1147 CodepointSet nextSet(uint idx)
1149 CodepointSet set;
1150 with(zis) with(IR)
1151 Outer:
1152 for (uint i = idx; i < ir.length; i += ir[i].length)
1154 switch (ir[i].code)
1156 case Char:
1157 set.add(ir[i].data, ir[i].data+1);
1158 goto default;
1159 //TODO: OrChar
1160 case Trie, CodepointSet:
1161 set = zis.charsets[ir[i].data];
1162 goto default;
1163 case GroupStart,GroupEnd:
1164 break;
1165 default:
1166 break Outer;
1169 return set;
1172 with(zis) with(IR) for (uint i = 0; i < ir.length; i += ir[i].length)
1174 if (ir[i].code == InfiniteEnd)
1176 auto set = nextSet(i+IRL!(InfiniteEnd));
1177 if (!set.empty && set.length < 10_000)
1179 ir[i] = Bytecode(InfiniteBloomEnd, ir[i].data);
1180 ir[i - ir[i].data - IRL!(InfiniteStart)] =
1181 Bytecode(InfiniteBloomStart, ir[i].data);
1182 ir.insertInPlace(i+IRL!(InfiniteEnd),
1183 Bytecode.fromRaw(cast(uint) zis.filters.length));
1184 zis.filters ~= BitTable(set);
1185 fixupBytecode(ir);
1191 //IR code validator - proper nesting, illegal instructions, etc.
1192 @trusted void validateRe(Char)(ref Regex!Char zis)
1193 {//@@@BUG@@@ text is @system
1194 import std.conv : text;
1195 with(zis)
1197 for (uint pc = 0; pc < ir.length; pc += ir[pc].length)
1199 if (ir[pc].isStart || ir[pc].isEnd)
1201 immutable dest = ir[pc].indexOfPair(pc);
1202 assert(dest < ir.length, text("Wrong length in opcode at pc=",
1203 pc, " ", dest, " vs ", ir.length));
1204 assert(ir[dest].paired == ir[pc],
1205 text("Wrong pairing of opcodes at pc=", pc, "and pc=", dest));
1207 else if (ir[pc].isAtom)
1211 else
1212 assert(0, text("Unknown type of instruction at pc=", pc));