1 //Written in the D programming language
3 Implementation of Thompson NFA std.regex engine.
4 Key point is evaluation of all possible threads (state) at each step
5 in a breadth-first manner, thereby geting some nice properties:
6 - looking at each character only once
7 - merging of equivalent threads, that gives matching process linear time complexity
9 module std
.regex
.internal
.thompson
;
13 import std
.range
.primitives
;
14 import std
.regex
.internal
.ir
;
17 struct Thread(DataIndex
)
19 Thread
* next
; //intrusive linked list
21 uint counter
; //loop counter
22 uint uopCounter
; //counts micro operations inside one macro instruction (e.g. BackRef)
23 Group
!DataIndex
[1] matches
;
26 //head-tail singly-linked list
27 struct ThreadList(DataIndex
)
29 Thread
!DataIndex
* tip
= null, toe
= null;
30 //add new thread to the start of list
31 void insertFront(Thread
!DataIndex
* t
)
44 //add new thread to the end of list
45 void insertBack(Thread
!DataIndex
* t
)
56 //move head element out of list
57 Thread
!DataIndex
* fetch()
66 //non-destructive iteration of ThreadList
69 const(Thread
!DataIndex
)* ct
;
70 this(ThreadList tlist
){ ct
= tlist
.tip
; }
71 @property bool empty(){ return ct
is null; }
72 @property const(Thread
!DataIndex
)* front(){ return ct
; }
79 @property bool empty()
85 return ThreadRange(this);
89 template ThompsonOps(E
, S
, bool withInput
:true)
92 static bool op(IR code
:IR
.End
)(E e
, S
* state
)
96 finish(t
, matches
, re
.ir
[t
.pc
].data
);
97 //fix endpoint of the whole match
98 matches
[0].end
= index
;
100 //cut off low priority threads
103 debug(std_regex_matcher
) writeln("Finished thread ", matches
);
104 return false; // no more state to eval
108 static bool op(IR code
:IR
.Wordboundary
)(E e
, S
* state
)
114 //at start & end of input
115 if (atStart
&& wordMatcher
[front
])
117 t
.pc
+= IRL
!(IR
.Wordboundary
);
120 else if (atEnd
&& s
.loopBack(index
).nextChar(back
, bi
)
121 && wordMatcher
[back
])
123 t
.pc
+= IRL
!(IR
.Wordboundary
);
126 else if (s
.loopBack(index
).nextChar(back
, bi
))
128 bool af
= wordMatcher
[front
];
129 bool ab
= wordMatcher
[back
];
132 t
.pc
+= IRL
!(IR
.Wordboundary
);
140 static bool op(IR code
:IR
.Notwordboundary
)(E e
, S
* state
)
146 //at start & end of input
147 if (atStart
&& wordMatcher
[front
])
151 else if (atEnd
&& s
.loopBack(index
).nextChar(back
, bi
)
152 && wordMatcher
[back
])
156 else if (s
.loopBack(index
).nextChar(back
, bi
))
158 bool af
= wordMatcher
[front
];
159 bool ab
= wordMatcher
[back
] != 0;
165 t
.pc
+= IRL
!(IR
.Notwordboundary
);
170 static bool op(IR code
:IR
.Bof
)(E e
, S
* state
)
176 t
.pc
+= IRL
!(IR
.Bof
);
186 static bool op(IR code
:IR
.Bol
)(E e
, S
* state
)
193 ||
(s
.loopBack(index
).nextChar(back
,bi
)
194 && startOfLine(back
, front
== '\n')))
196 t
.pc
+= IRL
!(IR
.Bol
);
206 static bool op(IR code
:IR
.Eof
)(E e
, S
* state
)
212 t
.pc
+= IRL
!(IR
.Eol
);
222 static bool op(IR code
:IR
.Eol
)(E e
, S
* state
)
228 //no matching inside \r\n
229 if (atEnd ||
(endOfLine(front
, s
.loopBack(index
).nextChar(back
, bi
)
232 t
.pc
+= IRL
!(IR
.Eol
);
243 static bool op(IR code
:IR
.InfiniteStart
)(E e
, S
* state
)
246 t
.pc
+= re
.ir
[t
.pc
].data
+ IRL
!(IR
.InfiniteStart
);
247 return op
!(IR
.InfiniteEnd
)(e
,state
);
250 static bool op(IR code
:IR
.InfiniteBloomStart
)(E e
, S
* state
)
253 t
.pc
+= re
.ir
[t
.pc
].data
+ IRL
!(IR
.InfiniteBloomStart
);
254 return op
!(IR
.InfiniteBloomEnd
)(e
,state
);
257 static bool op(IR code
:IR
.InfiniteQStart
)(E e
, S
* state
)
260 t
.pc
+= re
.ir
[t
.pc
].data
+ IRL
!(IR
.InfiniteQStart
);
261 return op
!(IR
.InfiniteQEnd
)(e
,state
);
264 static bool op(IR code
:IR
.RepeatStart
)(E e
, S
* state
)
267 t
.pc
+= re
.ir
[t
.pc
].data
+ IRL
!(IR
.RepeatStart
);
268 return op
!(IR
.RepeatEnd
)(e
,state
);
271 static bool op(IR code
:IR
.RepeatQStart
)(E e
, S
* state
)
274 t
.pc
+= re
.ir
[t
.pc
].data
+ IRL
!(IR
.RepeatQStart
);
275 return op
!(IR
.RepeatQEnd
)(e
,state
);
278 static bool op(IR code
)(E e
, S
* state
)
279 if (code
== IR
.RepeatEnd || code
== IR
.RepeatQEnd
)
283 //len, step, min, max
284 uint len
= re
.ir
[t
.pc
].data
;
285 uint step
= re
.ir
[t
.pc
+2].raw
;
286 uint min
= re
.ir
[t
.pc
+3].raw
;
293 if (merge
[re
.ir
[t
.pc
+ 1].raw
+t
.counter
] < genCounter
)
295 debug(std_regex_matcher
) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
296 t
.pc
, index
, genCounter
, merge
[re
.ir
[t
.pc
+ 1].raw
+t
.counter
] );
297 merge
[re
.ir
[t
.pc
+ 1].raw
+t
.counter
] = genCounter
;
301 debug(std_regex_matcher
)
302 writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
303 t
.pc
, index
, genCounter
, merge
[re
.ir
[t
.pc
+ 1].raw
+t
.counter
] );
306 uint max
= re
.ir
[t
.pc
+4].raw
;
309 if (re
.ir
[t
.pc
].code
== IR
.RepeatEnd
)
311 //queue out-of-loop thread
312 worklist
.insertFront(fork(t
, t
.pc
+ IRL
!(IR
.RepeatEnd
), t
.counter
% step
));
318 //queue into-loop thread
319 worklist
.insertFront(fork(t
, t
.pc
- len
, t
.counter
+ step
));
321 t
.pc
+= IRL
!(IR
.RepeatEnd
);
327 t
.pc
+= IRL
!(IR
.RepeatEnd
);
333 static bool op(IR code
)(E e
, S
* state
)
334 if (code
== IR
.InfiniteEnd || code
== IR
.InfiniteQEnd
)
338 if (merge
[re
.ir
[t
.pc
+ 1].raw
+t
.counter
] < genCounter
)
340 debug(std_regex_matcher
) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
341 t
.pc
, index
, genCounter
, merge
[re
.ir
[t
.pc
+ 1].raw
+t
.counter
] );
342 merge
[re
.ir
[t
.pc
+ 1].raw
+t
.counter
] = genCounter
;
346 debug(std_regex_matcher
) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
347 t
.pc
, index
, genCounter
, merge
[re
.ir
[t
.pc
+ 1].raw
+t
.counter
] );
350 uint len
= re
.ir
[t
.pc
].data
;
351 uint pc1
, pc2
; //branches to take in priority order
352 if (re
.ir
[t
.pc
].code
== IR
.InfiniteEnd
)
355 pc2
= t
.pc
+ IRL
!(IR
.InfiniteEnd
);
359 pc1
= t
.pc
+ IRL
!(IR
.InfiniteEnd
);
362 worklist
.insertFront(fork(t
, pc2
, t
.counter
));
368 static bool op(IR code
)(E e
, S
* state
)
369 if (code
== IR
.InfiniteBloomEnd
)
373 if (merge
[re
.ir
[t
.pc
+ 1].raw
+t
.counter
] < genCounter
)
375 debug(std_regex_matcher
) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
376 t
.pc
, index
, genCounter
, merge
[re
.ir
[t
.pc
+ 1].raw
+t
.counter
] );
377 merge
[re
.ir
[t
.pc
+ 1].raw
+t
.counter
] = genCounter
;
381 debug(std_regex_matcher
) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
382 t
.pc
, index
, genCounter
, merge
[re
.ir
[t
.pc
+ 1].raw
+t
.counter
] );
385 uint len
= re
.ir
[t
.pc
].data
;
386 uint pc1
, pc2
; //branches to take in priority order
388 pc2
= t
.pc
+ IRL
!(IR
.InfiniteBloomEnd
);
389 uint filterIndex
= re
.ir
[t
.pc
+ 2].raw
;
390 if (re
.filters
[filterIndex
][front
])
391 worklist
.insertFront(fork(t
, pc2
, t
.counter
));
397 static bool op(IR code
:IR
.OrEnd
)(E e
, S
* state
)
401 if (merge
[re
.ir
[t
.pc
+ 1].raw
+t
.counter
] < genCounter
)
403 debug(std_regex_matcher
) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s",
404 t
.pc
, s
[index
.. s
.lastIndex
], genCounter
, merge
[re
.ir
[t
.pc
+ 1].raw
+ t
.counter
] );
405 merge
[re
.ir
[t
.pc
+ 1].raw
+t
.counter
] = genCounter
;
406 t
.pc
+= IRL
!(IR
.OrEnd
);
410 debug(std_regex_matcher
) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s",
411 t
.pc
, s
[index
.. s
.lastIndex
], genCounter
, merge
[re
.ir
[t
.pc
+ 1].raw
+ t
.counter
] );
418 static bool op(IR code
:IR
.OrStart
)(E e
, S
* state
)
422 t
.pc
+= IRL
!(IR
.OrStart
);
423 return op
!(IR
.Option
)(e
,state
);
427 static bool op(IR code
:IR
.Option
)(E e
, S
* state
)
431 uint next
= t
.pc
+ re
.ir
[t
.pc
].data
+ IRL
!(IR
.Option
);
433 if (re
.ir
[next
].code
== IR
.Option
)
435 worklist
.insertFront(fork(t
, next
, t
.counter
));
437 t
.pc
+= IRL
!(IR
.Option
);
442 static bool op(IR code
:IR
.GotoEndOr
)(E e
, S
* state
)
446 t
.pc
= t
.pc
+ re
.ir
[t
.pc
].data
+ IRL
!(IR
.GotoEndOr
);
447 return op
!(IR
.OrEnd
)(e
, state
);
451 static bool op(IR code
:IR
.GroupStart
)(E e
, S
* state
)
455 uint n
= re
.ir
[t
.pc
].data
;
456 t
.matches
.ptr
[n
].begin
= index
;
457 t
.pc
+= IRL
!(IR
.GroupStart
);
461 static bool op(IR code
:IR
.GroupEnd
)(E e
, S
* state
)
465 uint n
= re
.ir
[t
.pc
].data
;
466 t
.matches
.ptr
[n
].end
= index
;
467 t
.pc
+= IRL
!(IR
.GroupEnd
);
472 static bool op(IR code
:IR
.Backref
)(E e
, S
* state
)
476 uint n
= re
.ir
[t
.pc
].data
;
477 Group
!DataIndex
* source
= re
.ir
[t
.pc
].localRef ? t
.matches
.ptr
: backrefed
.ptr
;
479 if (source
[n
].begin
== source
[n
].end
)//zero-width Backref!
481 t
.pc
+= IRL
!(IR
.Backref
);
486 size_t idx
= source
[n
].begin
+ t
.uopCounter
;
487 size_t end
= source
[n
].end
;
488 if (s
[idx
.. end
].front
== front
)
490 import std
.utf
: stride
;
492 t
.uopCounter
+= stride(s
[idx
.. end
], 0);
493 if (t
.uopCounter
+ source
[n
].begin
== source
[n
].end
)
495 t
.pc
+= IRL
!(IR
.Backref
);
502 t
= worklist
.fetch();
509 static bool op(IR code
)(E e
, S
* state
)
510 if (code
== IR
.LookbehindStart || code
== IR
.NeglookbehindStart
)
514 uint len
= re
.ir
[t
.pc
].data
;
515 uint ms
= re
.ir
[t
.pc
+ 1].raw
, me
= re
.ir
[t
.pc
+ 2].raw
;
516 uint end
= t
.pc
+ len
+ IRL
!(IR
.LookbehindEnd
) + IRL
!(IR
.LookbehindStart
);
517 bool positive
= re
.ir
[t
.pc
].code
== IR
.LookbehindStart
;
518 static if (Stream
.isLoopback
)
519 auto matcher
= fwdMatcher(t
.pc
, end
, me
- ms
, subCounters
.get(t
.pc
, 0));
521 auto matcher
= bwdMatcher(t
.pc
, end
, me
- ms
, subCounters
.get(t
.pc
, 0));
522 matcher
.backrefed
= backrefed
.empty ? t
.matches
: backrefed
;
524 auto mRes
= matcher
.matchOneShot(t
.matches
.ptr
[ms
.. me
], IRL
!(IR
.LookbehindStart
));
525 freelist
= matcher
.freelist
;
526 subCounters
[t
.pc
] = matcher
.genCounter
;
527 if ((mRes
!= 0 ) ^ positive
)
536 static bool op(IR code
)(E e
, S
* state
)
537 if (code
== IR
.LookaheadStart || code
== IR
.NeglookaheadStart
)
542 uint len
= re
.ir
[t
.pc
].data
;
543 uint ms
= re
.ir
[t
.pc
+1].raw
, me
= re
.ir
[t
.pc
+2].raw
;
544 uint end
= t
.pc
+len
+IRL
!(IR
.LookaheadEnd
)+IRL
!(IR
.LookaheadStart
);
545 bool positive
= re
.ir
[t
.pc
].code
== IR
.LookaheadStart
;
546 static if (Stream
.isLoopback
)
547 auto matcher
= bwdMatcher(t
.pc
, end
, me
- ms
, subCounters
.get(t
.pc
, 0));
549 auto matcher
= fwdMatcher(t
.pc
, end
, me
- ms
, subCounters
.get(t
.pc
, 0));
550 matcher
.backrefed
= backrefed
.empty ? t
.matches
: backrefed
;
551 auto mRes
= matcher
.matchOneShot(t
.matches
.ptr
[ms
.. me
], IRL
!(IR
.LookaheadStart
));
552 freelist
= matcher
.freelist
;
553 subCounters
[t
.pc
] = matcher
.genCounter
;
556 if ((mRes
!= 0) ^ positive
)
565 static bool op(IR code
)(E e
, S
* state
)
566 if (code
== IR
.LookaheadEnd || code
== IR
.NeglookaheadEnd ||
567 code
== IR
.LookbehindEnd || code
== IR
.NeglookbehindEnd
)
571 finish(t
, matches
.ptr
[0 .. re
.ngroup
], re
.ir
[t
.pc
].data
);
573 //cut off low priority threads
576 return false; // no more state
580 static bool op(IR code
:IR
.Nop
)(E e
, S
* state
)
582 with(state
) t
.pc
+= IRL
!(IR
.Nop
);
586 static bool op(IR code
:IR
.OrChar
)(E e
, S
* state
)
590 uint len
= re
.ir
[t
.pc
].sequence
;
591 uint end
= t
.pc
+ len
;
592 static assert(IRL
!(IR
.OrChar
) == 1);
593 for (; t
.pc
< end
; t
.pc
++)
594 if (re
.ir
[t
.pc
].data
== front
)
603 t
= worklist
.fetch();
608 static bool op(IR code
:IR
.Char
)(E e
, S
* state
)
612 if (front
== re
.ir
[t
.pc
].data
)
614 t
.pc
+= IRL
!(IR
.Char
);
619 t
= worklist
.fetch();
624 static bool op(IR code
:IR
.Any
)(E e
, S
* state
)
628 t
.pc
+= IRL
!(IR
.Any
);
630 t
= worklist
.fetch();
635 static bool op(IR code
:IR
.CodepointSet
)(E e
, S
* state
)
639 if (re
.charsets
[re
.ir
[t
.pc
].data
].scanFor(front
))
641 t
.pc
+= IRL
!(IR
.CodepointSet
);
648 t
= worklist
.fetch();
653 static bool op(IR code
:IR
.Trie
)(E e
, S
* state
)
657 if (re
.matchers
[re
.ir
[t
.pc
].data
][front
])
659 t
.pc
+= IRL
!(IR
.Trie
);
666 t
= worklist
.fetch();
673 template ThompsonOps(E
,S
, bool withInput
:false)
676 // can't match these without input
677 static bool op(IR code
)(E e
, S
* state
)
678 if (code
== IR
.Char || code
== IR
.OrChar || code
== IR
.CodepointSet
679 || code
== IR
.Trie || code
== IR
.Char || code
== IR
.Any
)
681 return state
.popState(e
);
684 // special case of zero-width backref
685 static bool op(IR code
:IR
.Backref
)(E e
, S
* state
)
689 uint n
= re
.ir
[t
.pc
].data
;
690 Group
!DataIndex
* source
= re
.ir
[t
.pc
].localRef ? t
.matches
.ptr
: backrefed
.ptr
;
692 if (source
[n
].begin
== source
[n
].end
)//zero-width Backref!
694 t
.pc
+= IRL
!(IR
.Backref
);
702 // forward all control flow to normal versions
703 static bool op(IR code
)(E e
, S
* state
)
704 if (code
!= IR
.Char
&& code
!= IR
.OrChar
&& code
!= IR
.CodepointSet
705 && code
!= IR
.Trie
&& code
!= IR
.Char
&& code
!= IR
.Any
&& code
!= IR
.Backref
)
707 return ThompsonOps
!(E
,S
,true).op
!code(e
,state
);
712 Thomspon matcher does all matching in lockstep,
713 never looking at the same char twice
715 @trusted class ThompsonMatcher(Char
, StreamType
= Input
!Char
): Matcher
!Char
716 if (is(Char
: dchar))
718 alias DataIndex
= Stream
.DataIndex
;
719 alias Stream
= StreamType
;
720 alias OpFunc
= bool function(ThompsonMatcher
, State
*) pure;
721 alias BackMatcher
= ThompsonMatcher
!(Char
, BackLooper
!(Stream
));
722 alias OpBackFunc
= bool function(BackMatcher
, BackMatcher
.State
*) pure;
723 Thread
!DataIndex
* freelist
;
724 ThreadList
!DataIndex clist
, nlist
;
726 Group
!DataIndex
[] backrefed
;
727 const Regex
!Char re
; //regex program
731 DataIndex genCounter
; //merge trace counter, goes up on every dchar
732 size_t
[size_t
] subCounters
; //a table of gen counter per sub-engine: PC -> counter
733 OpFunc
[] opCacheTrue
; // pointers to Op!(IR.xyz) for each bytecode
734 OpFunc
[] opCacheFalse
; // ditto
735 OpBackFunc
[] opCacheBackTrue
; // ditto
736 OpBackFunc
[] opCacheBackFalse
; // ditto
747 ThreadList
!DataIndex worklist
;
748 Group
!DataIndex
[] matches
;
750 bool popState(E
)(E e
)
755 t
= worklist
.fetch();
762 enum kicked
= __traits(hasMember
, Stream
, "search");
764 static size_t
getThreadSize(const ref Regex
!Char re
)
767 ?
(Thread
!DataIndex
).sizeof
+ (re
.ngroup
-1)*(Group
!DataIndex
).sizeof
768 : (Thread
!DataIndex
).sizeof
- (Group
!DataIndex
).sizeof
;
771 static size_t
initialMemory(const ref Regex
!Char re
)
773 return getThreadSize(re
)*re
.threadCount
+ re
.hotspotTableSize
*size_t
.sizeof
774 +4*OpFunc
.sizeof
*re
.ir
.length
;
777 //true if it's start of input
778 @property bool atStart(){ return index
== 0; }
780 //true if it's end of input
781 @property bool atEnd(){ return index
== s
.lastIndex
&& s
.atEnd
; }
783 override @property ref size_t
refCount() @safe { return _refCount
; }
785 override @property ref const(Regex
!Char
) pattern() @safe { return re
; }
789 if (!s
.nextChar(front
, index
))
802 if (!s
.search(re
.kickstart
, front
, index
))
811 void initExternalMemory(void[] memory
)
813 threadSize
= getThreadSize(re
);
814 prepareFreeList(re
.threadCount
, memory
);
815 if (re
.hotspotTableSize
)
817 merge
= arrayInChunk
!(DataIndex
)(re
.hotspotTableSize
, memory
);
820 opCacheTrue
= arrayInChunk
!(OpFunc
)(re
.ir
.length
, memory
);
821 opCacheFalse
= arrayInChunk
!(OpFunc
)(re
.ir
.length
, memory
);
822 opCacheBackTrue
= arrayInChunk
!(OpBackFunc
)(re
.ir
.length
, memory
);
823 opCacheBackFalse
= arrayInChunk
!(OpBackFunc
)(re
.ir
.length
, memory
);
825 for (uint pc
= 0; pc
<re
.ir
.length
; pc
+= re
.ir
[pc
].length
)
828 switch (re
.ir
[pc
].code
)
830 foreach (e
; __traits(allMembers
, IR
))
832 mixin(`case IR.`~e
~`:
833 opCacheTrue[pc] = &Ops!(true).op!(IR.`~e
~`);
834 opCacheBackTrue[pc] = &BackOps!(true).op!(IR.`~e
~`);
835 opCacheFalse[pc] = &Ops!(false).op!(IR.`~e
~`);
836 opCacheBackFalse[pc] = &BackOps!(false).op!(IR.`~e
~`);
841 assert(0, "Unrecognized instruction "~re
.ir
[pc
].mnemonic
);
846 override Matcher
!Char
rearm(in Char
[] data
)
854 this()(ref const Regex
!Char program
, Stream stream
, void[] memory
)
856 // We are emplace'd to malloced memory w/o blitting T.init over it\
857 // make sure we initialize all fields explicitly
865 initExternalMemory(memory
);
869 this(ThompsonMatcher matcher
, size_t lo
, size_t hi
, uint nGroup
, Stream stream
)
872 subCounters
= matcher
.subCounters
;
874 auto code
= matcher
.re
.ir
[lo
.. hi
];
875 re
= matcher
.re
.withCode(code
).withNGroup(nGroup
);
876 threadSize
= matcher
.threadSize
;
877 merge
= matcher
.merge
;
878 freelist
= matcher
.freelist
;
879 opCacheTrue
= matcher
.opCacheTrue
[lo
.. hi
];
880 opCacheBackTrue
= matcher
.opCacheBackTrue
[lo
.. hi
];
881 opCacheFalse
= matcher
.opCacheFalse
[lo
.. hi
];
882 opCacheBackFalse
= matcher
.opCacheBackFalse
[lo
.. hi
];
883 front
= matcher
.front
;
884 index
= matcher
.index
;
887 this(BackMatcher matcher
, size_t lo
, size_t hi
, uint nGroup
, Stream stream
)
890 subCounters
= matcher
.subCounters
;
892 auto code
= matcher
.re
.ir
[lo
.. hi
];
893 re
= matcher
.re
.withCode(code
).withNGroup(nGroup
);
894 threadSize
= matcher
.threadSize
;
895 merge
= matcher
.merge
;
896 freelist
= matcher
.freelist
;
897 opCacheTrue
= matcher
.opCacheBackTrue
[lo
.. hi
];
898 opCacheBackTrue
= matcher
.opCacheTrue
[lo
.. hi
];
899 opCacheFalse
= matcher
.opCacheBackFalse
[lo
.. hi
];
900 opCacheBackFalse
= matcher
.opCacheFalse
[lo
.. hi
];
901 front
= matcher
.front
;
902 index
= matcher
.index
;
905 auto fwdMatcher()(size_t lo
, size_t hi
, uint nGroup
, size_t counter
)
907 auto m
= new ThompsonMatcher
!(Char
, Stream
)(this, lo
, hi
, nGroup
, s
);
908 m
.genCounter
= counter
;
912 auto bwdMatcher()(size_t lo
, size_t hi
, uint nGroup
, size_t counter
)
914 alias BackLooper
= typeof(s
.loopBack(index
));
915 auto m
= new ThompsonMatcher
!(Char
, BackLooper
)(this, lo
, hi
, nGroup
, s
.loopBack(index
));
916 m
.genCounter
= counter
;
921 override void dupTo(Matcher
!Char engine
, void[] memory
)
923 auto thompson
= cast(ThompsonMatcher
) engine
;
925 thompson
.subCounters
= null;
926 thompson
.front
= front
;
927 thompson
.index
= index
;
928 thompson
.matched
= matched
;
929 thompson
.exhausted
= exhausted
;
930 thompson
.initExternalMemory(memory
);
933 override int match(Group
!DataIndex
[] matches
)
935 debug(std_regex_matcher
)
936 writeln("------------------------------------------");
941 if (re
.flags
& RegexInfo
.oneShot
)
945 return matchOneShot(matches
);
948 if (!re
.kickstart
.empty
)
949 return matchImpl
!(true)(matches
);
950 return matchImpl
!(false)(matches
);
953 //match the input and fill matches
954 int matchImpl(bool withSearch
)(Group
!DataIndex
[] matches
)
956 if (!matched
&& clist
.empty
)
958 static if (withSearch
)
963 else//char in question is fetched in prev call to match
968 state
.matches
= matches
;
970 if (!atEnd
)//if no char
974 debug(std_regex_matcher
)
976 writefln("Threaded matching threads at %s", s
[index
.. s
.lastIndex
]);
980 writef("pc=%s ",t
.pc
);
985 for (state
.t
= clist
.fetch(); state
.t
; state
.t
= clist
.fetch())
989 //if we already have match no need to push the engine
992 state
.t
= createStart(index
);
993 eval
!true(&state
);//new thread staring at this position
995 else if (nlist
.empty
)
997 debug(std_regex_matcher
) writeln("Stopped matching before consuming full input");
998 break;//not a partial match for sure
1001 nlist
= (ThreadList
!DataIndex
).init
;
1002 if (clist
.tip
is null)
1004 static if (withSearch
)
1017 if (!atEnd
) return false;
1023 genCounter
++; //increment also on each end
1024 debug(std_regex_matcher
) writefln("Threaded matching threads at end");
1025 //try out all zero-width posibilities
1026 for (state
.t
= clist
.fetch(); state
.t
; state
.t
= clist
.fetch())
1032 state
.t
= createStart(index
);
1033 eval
!false(&state
);//new thread starting at end of input
1036 {//in case NFA found match along the way
1037 //and last possible longer alternative ultimately failed
1038 s
.reset(matches
[0].end
);//reset to last successful match
1039 next();//and reload front character
1040 //--- here the exact state of stream was restored ---
1041 exhausted
= atEnd ||
!(re
.flags
& RegexOption
.global
);
1042 //+ empty match advances the input
1043 if (!exhausted
&& matches
[0].begin
== matches
[0].end
)
1050 handle succesful threads
1052 void finish(const(Thread
!DataIndex
)* t
, Group
!DataIndex
[] matches
, int code
)
1054 matches
.ptr
[0 .. re
.ngroup
] = t
.matches
.ptr
[0 .. re
.ngroup
];
1055 debug(std_regex_matcher
)
1057 writef("FOUND pc=%s prog_len=%s",
1058 t
.pc
, re
.ir
.length
);
1060 writefln(": %s..%s", matches
[0].begin
, matches
[0].end
);
1061 foreach (v
; matches
)
1062 writefln("%d .. %d", v
.begin
, v
.end
);
1067 alias Ops(bool withInput
) = ThompsonOps
!(ThompsonMatcher
, State
, withInput
);
1068 alias BackOps(bool withInput
) = ThompsonOps
!(BackMatcher
, BackMatcher
.State
, withInput
);
1071 match thread against codepoint, cutting trough all 0-width instructions
1072 and taking care of control flow, then add it to nlist
1074 void eval(bool withInput
)(State
* state
)
1076 debug(std_regex_matcher
) writeln("---- Evaluating thread");
1077 static if (withInput
)
1078 while (opCacheTrue
.ptr
[state
.t
.pc
](this, state
)){}
1080 while (opCacheFalse
.ptr
[state
.t
.pc
](this, state
)){}
1082 enum uint RestartPc
= uint.max
;
1083 //match the input, evaluating IR without searching
1084 int matchOneShot(Group
!DataIndex
[] matches
, uint startPc
= 0)
1086 debug(std_regex_matcher
)
1088 writefln("---------------single shot match ----------------- ");
1090 alias evalFn
= eval
;
1091 assert(clist
== (ThreadList
!DataIndex
).init || startPc
== RestartPc
); // incorrect after a partial match
1092 assert(nlist
== (ThreadList
!DataIndex
).init || startPc
== RestartPc
);
1094 state
.matches
= matches
;
1095 if (!atEnd
)//if no char
1097 debug(std_regex_matcher
)
1099 writefln("-- Threaded matching threads at %s", s
[index
.. s
.lastIndex
]);
1101 if (startPc
!= RestartPc
)
1103 state
.t
= createStart(index
, startPc
);
1105 evalFn
!true(&state
);
1109 debug(std_regex_matcher
) writeln("\n-- Started iteration of main cycle");
1111 debug(std_regex_matcher
)
1113 foreach (t
; clist
[])
1118 for (state
.t
= clist
.fetch(); state
.t
; state
.t
= clist
.fetch())
1120 evalFn
!true(&state
);
1124 debug(std_regex_matcher
) writeln("Stopped matching before consuming full input");
1125 break;//not a partial match for sure
1128 nlist
= (ThreadList
!DataIndex
).init
;
1131 debug(std_regex_matcher
) writeln("-- Ended iteration of main cycle\n");
1134 genCounter
++; //increment also on each end
1135 debug(std_regex_matcher
) writefln("-- Matching threads at end");
1136 //try out all zero-width posibilities
1137 for (state
.t
= clist
.fetch(); state
.t
; state
.t
= clist
.fetch())
1139 evalFn
!false(&state
);
1143 state
.t
= createStart(index
, startPc
);
1144 evalFn
!false(&state
);
1149 //get a dirty recycled Thread
1150 Thread
!DataIndex
* allocate()
1152 assert(freelist
, "not enough preallocated memory");
1153 Thread
!DataIndex
* t
= freelist
;
1154 freelist
= freelist
.next
;
1158 //link memory into a free list of Threads
1159 void prepareFreeList(size_t size
, ref void[] memory
)
1161 void[] mem
= memory
[0 .. threadSize
*size
];
1162 memory
= memory
[threadSize
* size
.. $];
1163 freelist
= cast(Thread
!DataIndex
*)&mem
[0];
1165 for (i
= threadSize
; i
< threadSize
*size
; i
+= threadSize
)
1166 (cast(Thread
!DataIndex
*)&mem
[i
-threadSize
]).next
= cast(Thread
!DataIndex
*)&mem
[i
];
1167 (cast(Thread
!DataIndex
*)&mem
[i
-threadSize
]).next
= null;
1171 void recycle(Thread
!DataIndex
* t
)
1177 //dispose list of threads
1178 void recycle(ref ThreadList
!DataIndex list
)
1182 // just put this head-tail list in front of freelist
1183 list
.toe
.next
= freelist
;
1184 freelist
= list
.tip
;
1189 //creates a copy of master thread with given pc
1190 Thread
!DataIndex
* fork(Thread
!DataIndex
* master
, uint pc
, uint counter
)
1192 auto t
= allocate();
1193 t
.matches
.ptr
[0 .. re
.ngroup
] = master
.matches
.ptr
[0 .. re
.ngroup
];
1195 t
.counter
= counter
;
1200 //creates a start thread
1201 Thread
!DataIndex
* createStart(DataIndex index
, uint pc
= 0)
1203 auto t
= allocate();
1204 t
.matches
.ptr
[0 .. re
.ngroup
] = (Group
!DataIndex
).init
;
1205 t
.matches
[0].begin
= index
;