1 ! Copyright (C) 2008 Doug Coleman.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: accessors arrays assocs combinators io io.streams.string
4 kernel math math.parser namespaces sets
5 quotations sequences splitting vectors math.order
6 strings regexp.backend regexp.utils
7 unicode.case unicode.categories words locals regexp.classes ;
10 FROM: math.ranges => [a,b] ;
12 TUPLE: concatenation seq ; INSTANCE: concatenation node
13 TUPLE: alternation seq ; INSTANCE: alternation node
14 TUPLE: kleene-star term ; INSTANCE: kleene-star node
17 TUPLE: possessive-question term ; INSTANCE: possessive-question node
18 TUPLE: possessive-kleene-star term ; INSTANCE: possessive-kleene-star node
21 TUPLE: reluctant-question term ; INSTANCE: reluctant-question node
22 TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node
24 TUPLE: negation term ; INSTANCE: negation node
25 TUPLE: constant char ; INSTANCE: constant node
26 TUPLE: range from to ; INSTANCE: range node
28 MIXIN: parentheses-group
29 TUPLE: lookahead term ; INSTANCE: lookahead node
30 INSTANCE: lookahead parentheses-group
31 TUPLE: lookbehind term ; INSTANCE: lookbehind node
32 INSTANCE: lookbehind parentheses-group
33 TUPLE: capture-group term ; INSTANCE: capture-group node
34 INSTANCE: capture-group parentheses-group
35 TUPLE: non-capture-group term ; INSTANCE: non-capture-group node
36 INSTANCE: non-capture-group parentheses-group
37 TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group
38 INSTANCE: independent-group parentheses-group
39 TUPLE: comment-group term ; INSTANCE: comment-group node
40 INSTANCE: comment-group parentheses-group
42 SINGLETON: epsilon INSTANCE: epsilon node
44 TUPLE: option option on? ; INSTANCE: option node
46 SINGLETONS: unix-lines dotall multiline comments case-insensitive
47 unicode-case reversed-regexp ;
49 SINGLETONS: beginning-of-character-class end-of-character-class
50 left-parenthesis pipe caret dash ;
52 : push1 ( obj -- ) input-stream get stream>> push ;
53 : peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
54 : pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
55 : drop1 ( -- ) read1 drop ;
57 : stack ( -- obj ) current-regexp get stack>> ;
58 : change-whole-stack ( quot -- )
60 [ stack>> swap call ] keep (>>stack) ; inline
61 : push-stack ( obj -- ) stack push ;
62 : pop-stack ( -- obj ) stack pop ;
63 : cut-out ( vector n -- vector' vector ) cut rest ;
64 ERROR: cut-stack-error ;
65 : cut-stack ( obj vector -- vector' vector )
66 [ nip ] [ last-index ] 2bi [ cut-stack-error ] unless* cut-out swap ;
68 : <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ;
69 : <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ;
70 : <possessive-question> ( obj -- kleene ) possessive-question boa ;
71 : <reluctant-question> ( obj -- kleene ) reluctant-question boa ;
73 : <negation> ( obj -- negation ) negation boa ;
74 : <concatenation> ( seq -- concatenation )
75 >vector [ epsilon ] [ concatenation boa ] if-empty ;
76 : <alternation> ( seq -- alternation ) >vector alternation boa ;
77 : <capture-group> ( obj -- capture-group ) capture-group boa ;
78 : <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
79 : <constant> ( obj -- constant ) constant boa ;
81 : first|concatenation ( seq -- first/concatenation )
82 dup length 1 = [ first ] [ <concatenation> ] if ;
84 : first|alternation ( seq -- first/alternation )
85 dup length 1 = [ first ] [ <alternation> ] if ;
87 : <character-class-range> ( from to -- obj )
89 [ character-class-range boa ] [ 2drop unmatchable-class ] if ;
91 ERROR: unmatched-parentheses ;
93 ERROR: unknown-regexp-option option ;
95 : ch>option ( ch -- singleton )
97 { CHAR: i [ case-insensitive ] }
98 { CHAR: d [ unix-lines ] }
99 { CHAR: m [ multiline ] }
100 { CHAR: n [ multiline ] }
101 { CHAR: r [ reversed-regexp ] }
102 { CHAR: s [ dotall ] }
103 { CHAR: u [ unicode-case ] }
104 { CHAR: x [ comments ] }
105 [ unknown-regexp-option ]
108 : option>ch ( option -- string )
110 { case-insensitive [ CHAR: i ] }
111 { multiline [ CHAR: m ] }
112 { reversed-regexp [ CHAR: r ] }
113 { dotall [ CHAR: s ] }
114 [ unknown-regexp-option ]
117 : toggle-option ( ch ? -- )
118 [ ch>option ] dip option boa push-stack ;
120 : (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
122 : parse-options ( string -- )
123 "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;
125 ERROR: bad-special-group string ;
127 DEFER: (parse-regexp)
128 : nested-parse-regexp ( token ? -- )
129 [ push-stack (parse-regexp) pop-stack ] dip
130 [ <negation> ] when pop-stack new swap >>term push-stack ;
132 ! non-capturing groups
133 : (parse-special-group) ( -- )
135 { [ dup CHAR: # = ] ! comment
136 [ drop comment-group f nested-parse-regexp pop-stack drop ] }
138 [ drop non-capture-group f nested-parse-regexp ] }
140 [ drop lookahead f nested-parse-regexp ] }
142 [ drop lookahead t nested-parse-regexp ] }
144 [ drop non-capture-group f nested-parse-regexp ] }
145 { [ dup CHAR: < = peek1 CHAR: = = and ]
146 [ drop drop1 lookbehind f nested-parse-regexp ] }
147 { [ dup CHAR: < = peek1 CHAR: ! = and ]
148 [ drop drop1 lookbehind t nested-parse-regexp ] }
153 { CHAR: : [ parse-options non-capture-group f nested-parse-regexp ] }
154 { CHAR: ) [ parse-options ] }
155 [ drop bad-special-group ]
160 : handle-left-parenthesis ( -- )
162 [ drop1 (parse-special-group) ]
163 [ capture-group f nested-parse-regexp ] if ;
165 : handle-dot ( -- ) any-char push-stack ;
166 : handle-pipe ( -- ) pipe push-stack ;
167 : (handle-star) ( obj -- kleene-star )
169 { CHAR: + [ drop1 <possessive-kleene-star> ] }
170 { CHAR: ? [ drop1 <reluctant-kleene-star> ] }
171 [ drop <kleene-star> ]
173 : handle-star ( -- ) stack pop (handle-star) push-stack ;
174 : handle-question ( -- )
176 { CHAR: + [ drop1 <possessive-question> ] }
177 { CHAR: ? [ drop1 <reluctant-question> ] }
178 [ drop epsilon 2array <alternation> ]
181 stack pop dup (handle-star)
182 2array <concatenation> push-stack ;
184 ERROR: unmatched-brace ;
185 : parse-repetition ( -- start finish ? )
186 "}" read-until [ unmatched-brace ] unless
187 [ "," split1 [ string>number ] bi@ ]
188 [ CHAR: , swap index >boolean ] bi ;
190 : replicate/concatenate ( n obj -- obj' )
191 over zero? [ 2drop epsilon ]
192 [ <repetition> first|concatenation ] if ;
195 stack pop replicate/concatenate push-stack ;
197 : at-least-n ( n -- )
199 [ replicate/concatenate ] keep
200 <kleene-star> 2array <concatenation> push-stack ;
205 [ replicate/concatenate ] curry map <alternation> push-stack ;
207 : from-m-to-n ( m n -- )
210 [ replicate/concatenate ] curry map
211 <alternation> push-stack ;
213 ERROR: invalid-range a b ;
215 : handle-left-brace ( -- )
217 [ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip
219 2dup and [ from-m-to-n ]
220 [ [ nip at-most-n ] [ at-least-n ] if* ] if
221 ] [ drop 0 max exactly-n ] if ;
223 : handle-front-anchor ( -- ) beginning-of-line push-stack ;
224 : handle-back-anchor ( -- ) end-of-line push-stack ;
226 ERROR: bad-character-class obj ;
227 ERROR: expected-posix-class ;
229 : parse-posix-class ( -- obj )
230 read1 CHAR: { = [ expected-posix-class ] unless
231 "}" read-until [ bad-character-class ] unless
233 { "Lower" [ letter-class ] }
234 { "Upper" [ LETTER-class ] }
235 { "Alpha" [ Letter-class ] }
236 { "ASCII" [ ascii-class ] }
237 { "Digit" [ digit-class ] }
238 { "Alnum" [ alpha-class ] }
239 { "Punct" [ punctuation-class ] }
240 { "Graph" [ java-printable-class ] }
241 { "Print" [ java-printable-class ] }
242 { "Blank" [ non-newline-blank-class ] }
243 { "Cntrl" [ control-character-class ] }
244 { "XDigit" [ hex-digit-class ] }
245 { "Space" [ java-blank-class ] }
246 ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss
247 [ bad-character-class ]
250 : parse-octal ( -- n ) 3 read oct> check-octal ;
251 : parse-short-hex ( -- n ) 2 read hex> check-hex ;
252 : parse-long-hex ( -- n ) 6 read hex> check-hex ;
253 : parse-control-character ( -- n ) read1 ;
255 ERROR: bad-escaped-literals seq ;
257 : parse-til-E ( -- obj )
258 "\\E" read-until [ bad-escaped-literals ] unless ;
260 :: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj )
264 quot call [ <constant> ] V{ } map-as
268 : parse-escaped-literals ( -- obj )
269 [ ] (parse-escaped-literals) ;
271 : lower-case-literals ( -- obj )
272 [ >lower ] (parse-escaped-literals) ;
274 : upper-case-literals ( -- obj )
275 [ >upper ] (parse-escaped-literals) ;
277 : parse-escaped ( -- obj )
280 { CHAR: t [ CHAR: \t <constant> ] }
281 { CHAR: n [ CHAR: \n <constant> ] }
282 { CHAR: r [ CHAR: \r <constant> ] }
283 { CHAR: f [ HEX: c <constant> ] }
284 { CHAR: a [ HEX: 7 <constant> ] }
285 { CHAR: e [ HEX: 1b <constant> ] }
287 { CHAR: w [ c-identifier-class ] }
288 { CHAR: W [ c-identifier-class <negation> ] }
289 { CHAR: s [ java-blank-class ] }
290 { CHAR: S [ java-blank-class <negation> ] }
291 { CHAR: d [ digit-class ] }
292 { CHAR: D [ digit-class <negation> ] }
294 { CHAR: p [ parse-posix-class ] }
295 { CHAR: P [ parse-posix-class <negation> ] }
296 { CHAR: x [ parse-short-hex <constant> ] }
297 { CHAR: u [ parse-long-hex <constant> ] }
298 { CHAR: 0 [ parse-octal <constant> ] }
299 { CHAR: c [ parse-control-character ] }
301 { CHAR: Q [ parse-escaped-literals ] }
303 ! { CHAR: b [ word-boundary-class ] }
304 ! { CHAR: B [ word-boundary-class <negation> ] }
305 ! { CHAR: A [ handle-beginning-of-input ] }
306 ! { CHAR: z [ handle-end-of-input ] }
308 ! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator
311 ! { CHAR: G [ end of previous match ] }
314 ! { CHAR: 1 [ CHAR: 1 <constant> ] }
315 ! { CHAR: 2 [ CHAR: 2 <constant> ] }
316 ! { CHAR: 3 [ CHAR: 3 <constant> ] }
317 ! { CHAR: 4 [ CHAR: 4 <constant> ] }
318 ! { CHAR: 5 [ CHAR: 5 <constant> ] }
319 ! { CHAR: 6 [ CHAR: 6 <constant> ] }
320 ! { CHAR: 7 [ CHAR: 7 <constant> ] }
321 ! { CHAR: 8 [ CHAR: 8 <constant> ] }
322 ! { CHAR: 9 [ CHAR: 9 <constant> ] }
325 ! can't do \l and \u because \u is already a 4-hex
326 { CHAR: L [ lower-case-literals ] }
327 { CHAR: U [ upper-case-literals ] }
332 : handle-escape ( -- ) parse-escaped push-stack ;
334 : handle-dash ( vector -- vector' )
335 H{ { dash CHAR: - } } substitute ;
337 : character-class>alternation ( seq -- alternation )
338 [ dup number? [ <constant> ] when ] map first|alternation ;
340 : handle-caret ( vector -- vector' )
341 dup [ length 2 >= ] [ first caret eq? ] bi and [
342 rest-slice character-class>alternation <negation>
344 character-class>alternation
347 : make-character-class ( -- character-class )
348 [ beginning-of-character-class swap cut-stack ] change-whole-stack
349 handle-dash handle-caret ;
352 stack [ pop3 nip <character-class-range> ] keep push ;
354 : apply-dash? ( -- ? )
355 stack dup length 3 >=
356 [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ;
358 ERROR: empty-negated-character-class ;
359 DEFER: handle-left-bracket
360 : (parse-character-class) ( -- )
361 read1 [ empty-negated-character-class ] unless* {
362 { CHAR: [ [ handle-left-bracket t ] }
363 { CHAR: ] [ make-character-class push-stack f ] }
364 { CHAR: - [ dash push-stack t ] }
365 { CHAR: \ [ parse-escaped push-stack t ] }
366 [ push-stack apply-dash? [ apply-dash ] when t ]
368 [ (parse-character-class) ] when ;
370 : push-constant ( ch -- ) <constant> push-stack ;
372 : parse-character-class-second ( -- )
374 { CHAR: [ [ CHAR: [ push-constant ] }
375 { CHAR: ] [ CHAR: ] push-constant ] }
376 { CHAR: - [ CHAR: - push-constant ] }
380 : parse-character-class-first ( -- )
382 { CHAR: ^ [ caret push-stack parse-character-class-second ] }
383 { CHAR: [ [ CHAR: [ push-constant ] }
384 { CHAR: ] [ CHAR: ] push-constant ] }
385 { CHAR: - [ CHAR: - push-constant ] }
389 : handle-left-bracket ( -- )
390 beginning-of-character-class push-stack
391 parse-character-class-first (parse-character-class) ;
393 : finish-regexp-parse ( stack -- obj )
395 [ first|concatenation ] map first|alternation ;
397 : handle-right-parenthesis ( -- )
398 stack dup [ parentheses-group "members" word-prop member? ] find-last
400 [ [ push ] keep current-regexp get (>>stack) ]
401 [ finish-regexp-parse push-stack ] bi* ;
403 : parse-regexp-token ( token -- ? )
405 { CHAR: ( [ handle-left-parenthesis t ] } ! handle (?..) at beginning?
406 { CHAR: ) [ handle-right-parenthesis f ] }
407 { CHAR: . [ handle-dot t ] }
408 { CHAR: | [ handle-pipe t ] }
409 { CHAR: ? [ handle-question t ] }
410 { CHAR: * [ handle-star t ] }
411 { CHAR: + [ handle-plus t ] }
412 { CHAR: { [ handle-left-brace t ] }
413 { CHAR: [ [ handle-left-bracket t ] }
414 { CHAR: \ [ handle-escape t ] }
416 dup CHAR: $ = peek1 f = and
417 [ drop handle-back-anchor f ]
418 [ push-constant t ] if
422 : (parse-regexp) ( -- )
423 read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ;
425 : parse-regexp-beginning ( -- )
426 peek1 CHAR: ^ = [ drop1 handle-front-anchor ] when ;
428 : parse-regexp ( regexp -- )
432 parse-regexp-beginning (parse-regexp)
435 current-regexp get [ finish-regexp-parse ] change-stack
436 dup stack>> >>parse-tree drop