1 ! Copyright (C) 2008 Doug Coleman.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs grouping kernel regexp.backend
4 locals math namespaces regexp.parser sequences fry quotations
5 math.order math.ranges vectors unicode.categories regexp.utils
6 regexp.transition-tables words sets regexp.classes unicode.case.private ;
7 ! This uses unicode.case.private for ch>upper and ch>lower
8 ! but case-insensitive matching should be done by case-folding everything
9 ! before processing starts
12 ERROR: feature-is-broken feature ;
15 : negated? ( -- ? ) negation-mode get 0 or odd? ;
20 SINGLETON: lookahead-on INSTANCE: lookahead-on traversal-flag
21 SINGLETON: lookahead-off INSTANCE: lookahead-off traversal-flag
22 SINGLETON: lookbehind-on INSTANCE: lookbehind-on traversal-flag
23 SINGLETON: lookbehind-off INSTANCE: lookbehind-off traversal-flag
24 SINGLETON: capture-group-on INSTANCE: capture-group-on traversal-flag
25 SINGLETON: capture-group-off INSTANCE: capture-group-off traversal-flag
26 SINGLETON: front-anchor INSTANCE: front-anchor traversal-flag
27 SINGLETON: back-anchor INSTANCE: back-anchor traversal-flag
28 SINGLETON: word-boundary INSTANCE: word-boundary traversal-flag
30 : options ( -- obj ) current-regexp get options>> ;
32 : option? ( obj -- ? ) options key? ;
34 : option-on ( obj -- ) options conjoin ;
36 : option-off ( obj -- ) options delete-at ;
38 : next-state ( regexp -- state )
39 [ state>> ] [ [ 1+ ] change-state drop ] bi ;
41 : set-start-state ( regexp -- )
45 [ nfa-table>> ] [ pop first ] bi* >>start-state drop
48 GENERIC: nfa-node ( node -- )
50 :: add-simple-entry ( obj class -- )
51 [let* | regexp [ current-regexp get ]
52 s0 [ regexp next-state ]
53 s1 [ regexp next-state ]
54 stack [ regexp stack>> ]
55 table [ regexp nfa-table>> ] |
57 s0 f obj class make-transition table add-transition
58 s0 s1 <default-transition> table add-transition
60 s0 s1 obj class make-transition table add-transition
62 s0 s1 2array stack push
63 t s1 table final-states>> set-at ] ;
65 : add-traversal-flag ( flag -- )
67 current-regexp get nfa-traversal-flags>> push-at ;
69 :: concatenate-nodes ( -- )
70 [let* | regexp [ current-regexp get ]
71 stack [ regexp stack>> ]
72 table [ regexp nfa-table>> ]
73 s2 [ stack peek first ]
74 s3 [ stack pop second ]
75 s0 [ stack peek first ]
76 s1 [ stack pop second ] |
77 s1 s2 eps <literal-transition> table add-transition
78 s1 table final-states>> delete-at
79 s0 s3 2array stack push ] ;
81 :: alternate-nodes ( -- )
82 [let* | regexp [ current-regexp get ]
83 stack [ regexp stack>> ]
84 table [ regexp nfa-table>> ]
85 s2 [ stack peek first ]
86 s3 [ stack pop second ]
87 s0 [ stack peek first ]
88 s1 [ stack pop second ]
89 s4 [ regexp next-state ]
90 s5 [ regexp next-state ] |
91 s4 s0 eps <literal-transition> table add-transition
92 s4 s2 eps <literal-transition> table add-transition
93 s1 s5 eps <literal-transition> table add-transition
94 s3 s5 eps <literal-transition> table add-transition
95 s1 table final-states>> delete-at
96 s3 table final-states>> delete-at
97 t s5 table final-states>> set-at
98 s4 s5 2array stack push ] ;
100 M: kleene-star nfa-node ( node -- )
102 [let* | regexp [ current-regexp get ]
103 stack [ regexp stack>> ]
104 s0 [ stack peek first ]
105 s1 [ stack pop second ]
106 s2 [ regexp next-state ]
107 s3 [ regexp next-state ]
108 table [ regexp nfa-table>> ] |
109 s1 table final-states>> delete-at
110 t s3 table final-states>> set-at
111 s1 s0 eps <literal-transition> table add-transition
112 s2 s0 eps <literal-transition> table add-transition
113 s2 s3 eps <literal-transition> table add-transition
114 s1 s3 eps <literal-transition> table add-transition
115 s2 s3 2array stack push ] ;
117 M: concatenation nfa-node ( node -- )
119 reversed-regexp option? [ <reversed> ] when
120 [ [ nfa-node ] each ]
121 [ length 1- [ concatenate-nodes ] times ] bi ;
123 M: alternation nfa-node ( node -- )
125 [ [ nfa-node ] each ]
126 [ length 1- [ alternate-nodes ] times ] bi ;
128 M: constant nfa-node ( node -- )
129 case-insensitive option? [
130 dup char>> [ ch>lower ] [ ch>upper ] bi
133 char>> literal-transition add-simple-entry
135 [ literal-transition add-simple-entry ] bi@
139 char>> literal-transition add-simple-entry
142 M: epsilon nfa-node ( node -- )
143 drop eps literal-transition add-simple-entry ;
145 M: word nfa-node ( node -- ) class-transition add-simple-entry ;
147 M: any-char nfa-node ( node -- )
148 [ dotall option? ] dip any-char-no-nl ?
149 class-transition add-simple-entry ;
151 ! M: beginning-of-text nfa-node ( node -- ) ;
153 M: beginning-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
155 M: end-of-line nfa-node ( node -- ) class-transition add-simple-entry ;
157 : choose-letter-class ( node -- node' )
158 case-insensitive option? Letter-class rot ? ;
160 M: letter-class nfa-node ( node -- )
161 choose-letter-class class-transition add-simple-entry ;
163 M: LETTER-class nfa-node ( node -- )
164 choose-letter-class class-transition add-simple-entry ;
166 M: character-class-range nfa-node ( node -- )
167 case-insensitive option? [
168 ! This should be implemented for Unicode by case-folding
169 ! the input and all strings in the regexp.
170 dup [ from>> ] [ to>> ] bi
171 2dup [ Letter? ] bi@ and [
173 [ [ ch>lower ] bi@ character-class-range boa ]
174 [ [ ch>upper ] bi@ character-class-range boa ] 2bi
175 [ class-transition add-simple-entry ] bi@
179 class-transition add-simple-entry
182 class-transition add-simple-entry
185 M: capture-group nfa-node ( node -- )
186 "capture-groups" feature-is-broken
187 eps literal-transition add-simple-entry
188 capture-group-on add-traversal-flag
190 eps literal-transition add-simple-entry
191 capture-group-off add-traversal-flag
192 2 [ concatenate-nodes ] times ;
195 M: non-capture-group nfa-node ( node -- )
198 M: reluctant-kleene-star nfa-node ( node -- )
199 term>> <kleene-star> nfa-node ;
201 M: negation nfa-node ( node -- )
206 M: lookahead nfa-node ( node -- )
207 "lookahead" feature-is-broken
208 eps literal-transition add-simple-entry
209 lookahead-on add-traversal-flag
211 eps literal-transition add-simple-entry
212 lookahead-off add-traversal-flag
213 2 [ concatenate-nodes ] times ;
215 M: lookbehind nfa-node ( node -- )
216 "lookbehind" feature-is-broken
217 eps literal-transition add-simple-entry
218 lookbehind-on add-traversal-flag
220 eps literal-transition add-simple-entry
221 lookbehind-off add-traversal-flag
222 2 [ concatenate-nodes ] times ;
224 M: option nfa-node ( node -- )
225 [ option>> ] [ on?>> ] bi [ option-on ] [ option-off ] if
226 eps literal-transition add-simple-entry ;
228 : construct-nfa ( regexp -- )
232 [ current-regexp set ]
233 [ parse-tree>> nfa-node ]
234 [ set-start-state ] tri