1 USING: arrays combinators kernel lists math math.parser
2 namespaces parser lexer parser-combinators
3 parser-combinators.simple promises quotations sequences strings
4 math.order assocs prettyprint.backend prettyprint.custom memoize
5 ascii unicode.categories combinators.short-circuit
7 IN: parser-combinators.regexp
13 : char=-quot ( ch -- quot )
15 [ ch>upper [ swap ch>upper = ] ] [ [ = ] ] if
18 : char-between?-quot ( ch1 ch2 -- quot )
20 [ [ ch>upper ] bi@ [ [ ch>upper ] 2dip between? ] ]
24 : <@literal ( parser obj -- action ) [ nip ] curry <@ ;
26 : <@delay ( parser quot -- action ) [ curry ] curry <@ ;
33 : octal-digit? ( n -- ? )
34 CHAR: 0 CHAR: 7 between? ;
36 : decimal-digit? ( n -- ? )
37 CHAR: 0 CHAR: 9 between? ;
39 : hex-digit? ( n -- ? )
41 over CHAR: a CHAR: f between? or
42 swap CHAR: A CHAR: F between? or ;
44 : control-char? ( n -- ? )
45 dup 0 HEX: 1f between?
49 "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" member? ;
51 : c-identifier-char? ( ch -- ? )
52 dup alpha? swap CHAR: _ = or ;
54 : java-blank? ( n -- ? )
57 CHAR: \t CHAR: \n CHAR: \r
61 : java-printable? ( n -- ? )
62 dup alpha? swap punct? or ;
64 : 'ordinary-char' ( -- parser )
65 [ "\\^*+?|(){}[$" member? not ] satisfy
68 : 'octal-digit' ( -- parser ) [ octal-digit? ] satisfy ;
70 : 'octal' ( -- parser )
71 "0" token 'octal-digit' 1 3 from-m-to-n &>
74 : 'hex-digit' ( -- parser ) [ hex-digit? ] satisfy ;
77 "x" token 'hex-digit' 2 exactly-n &>
78 "u" token 'hex-digit' 6 exactly-n &> <|>
81 : satisfy-tokens ( assoc -- parser )
82 [ [ token ] dip <@literal ] { } assoc>map <or-parser> ;
84 : 'simple-escape-char' ( -- parser )
93 } [ char=-quot ] assoc-map satisfy-tokens ;
95 : 'predefined-char-class' ( -- parser )
98 { "D" [ digit? not ] }
99 { "s" [ java-blank? ] }
100 { "S" [ java-blank? not ] }
101 { "w" [ c-identifier-char? ] }
102 { "W" [ c-identifier-char? not ] }
105 : 'posix-character-class' ( -- parser )
107 { "Lower" [ letter? ] }
108 { "Upper" [ LETTER? ] }
109 { "ASCII" [ ascii? ] }
110 { "Alpha" [ Letter? ] }
111 { "Digit" [ digit? ] }
112 { "Alnum" [ alpha? ] }
113 { "Punct" [ punct? ] }
114 { "Graph" [ java-printable? ] }
115 { "Print" [ java-printable? ] }
116 { "Blank" [ " \t" member? ] }
117 { "Cntrl" [ control-char? ] }
118 { "XDigit" [ hex-digit? ] }
119 { "Space" [ java-blank? ] }
120 } satisfy-tokens "p{" "}" surrounded-by ;
122 : 'simple-escape' ( -- parser )
125 "c" token [ LETTER? ] satisfy &> <|>
129 : 'escape' ( -- parser )
132 'predefined-char-class' <|>
133 'posix-character-class' <|>
134 'simple-escape' <|> &> ;
136 : 'any-char' ( -- parser )
137 "." token [ drop t ] <@literal ;
139 : 'char' ( -- parser )
140 'any-char' 'escape' 'ordinary-char' <|> <|> [ satisfy ] <@ ;
144 TUPLE: group-result str ;
146 C: <group-result> group-result
148 : 'non-capturing-group' ( -- parser )
149 "?:" token 'regexp' &> ;
151 : 'positive-lookahead-group' ( -- parser )
152 "?=" token 'regexp' &> [ ensure ] <@ ;
154 : 'negative-lookahead-group' ( -- parser )
155 "?!" token 'regexp' &> [ ensure-not ] <@ ;
157 : 'simple-group' ( -- parser )
158 'regexp' [ [ <group-result> ] <@ ] <@ ;
160 : 'group' ( -- parser )
161 'non-capturing-group'
162 'positive-lookahead-group'
163 'negative-lookahead-group'
164 'simple-group' <|> <|> <|>
165 "(" ")" surrounded-by ;
167 : 'range' ( -- parser )
168 [ CHAR: ] = not ] satisfy "-" token <&
169 [ CHAR: ] = not ] satisfy <&>
170 [ first2 char-between?-quot ] <@ ;
172 : 'character-class-term' ( -- parser )
175 [ "\\]" member? not ] satisfy [ char=-quot ] <@ <|> ;
177 : 'positive-character-class' ( -- parser )
178 "]" token [ CHAR: ] = ] <@literal 'character-class-term' <*> <&:>
179 'character-class-term' <+> <|>
180 [ [ 1|| ] curry ] <@ ;
182 : 'negative-character-class' ( -- parser )
183 "^" token 'positive-character-class' &>
184 [ [ not ] append ] <@ ;
186 : 'character-class' ( -- parser )
187 'negative-character-class' 'positive-character-class' <|>
188 "[" "]" surrounded-by [ satisfy ] <@ ;
190 : 'escaped-seq' ( -- parser )
192 [ ignore-case? get <token-parser> ] <@
193 "\\Q" "\\E" surrounded-by ;
195 : 'break' ( quot -- parser )
196 satisfy ensure epsilon just <|> ;
198 : 'break-escape' ( -- parser )
199 "$" token [ "\r\n" member? ] 'break' <@literal
200 "\\b" token [ blank? ] 'break' <@literal <|>
201 "\\B" token [ blank? not ] 'break' <@literal <|>
202 "\\z" token epsilon just <@literal <|> ;
204 : 'simple' ( -- parser )
208 'character-class' <|>
211 : 'exactly-n' ( -- parser )
212 'integer' [ exactly-n ] <@delay ;
214 : 'at-least-n' ( -- parser )
215 'integer' "," token <& [ at-least-n ] <@delay ;
217 : 'at-most-n' ( -- parser )
218 "," token 'integer' &> [ at-most-n ] <@delay ;
220 : 'from-m-to-n' ( -- parser )
221 'integer' "," token <& 'integer' <&> [ first2 from-m-to-n ] <@delay ;
223 : 'greedy-interval' ( -- parser )
224 'exactly-n' 'at-least-n' <|> 'at-most-n' <|> 'from-m-to-n' <|> ;
226 : 'interval' ( -- parser )
228 'greedy-interval' "?" token <& [ "reluctant {}" print ] <@ <|>
229 'greedy-interval' "+" token <& [ "possessive {}" print ] <@ <|>
230 "{" "}" surrounded-by ;
232 : 'repetition' ( -- parser )
234 "*+" token [ <!*> ] <@literal
235 "++" token [ <!+> ] <@literal <|>
236 "?+" token [ <!?> ] <@literal <|>
238 "*?" token [ <(*)> ] <@literal <|>
239 "+?" token [ <(+)> ] <@literal <|>
240 "??" token [ <(?)> ] <@literal <|>
242 "*" token [ <*> ] <@literal <|>
243 "+" token [ <+> ] <@literal <|>
244 "?" token [ <?> ] <@literal <|> ;
246 : 'dummy' ( -- parser )
247 epsilon [ ] <@literal ;
249 MEMO: 'term' ( -- parser )
251 'repetition' 'interval' 'dummy' <|> <|> <&> [ first2 call ] <@
252 <!+> [ <and-parser> ] <@ ;
254 LAZY: 'regexp' ( -- parser )
255 'term' "|" token nonempty-list-of [ <or-parser> ] <@ ;
256 ! "^" token 'term' "|" token nonempty-list-of [ <or-parser> ] <@
257 ! &> [ "caret" print ] <@ <|>
258 ! 'term' "|" token nonempty-list-of [ <or-parser> ] <@
259 ! "$" token <& [ "dollar" print ] <@ <|>
260 ! "^" token 'term' "|" token nonempty-list-of [ <or-parser> ] <@ &>
261 ! "$" token [ "caret dollar" print ] <@ <& <|> ;
263 TUPLE: regexp source parser ignore-case? ;
265 : <regexp> ( string ignore-case? -- regexp )
268 dup 'regexp' just parse-1
272 : do-ignore-case ( string regexp -- string regexp )
273 dup ignore-case?>> [ [ >upper ] dip ] when ;
275 : matches? ( string regexp -- ? )
276 do-ignore-case parser>> just parse nil? not ;
278 : match-head ( string regexp -- end )
279 do-ignore-case parser>> parse dup nil?
280 [ drop f ] [ car unparsed>> from>> ] if ;
282 ! Literal syntax for regexps
283 : parse-options ( string -- ? )
290 : parse-regexp ( accum end -- accum )
291 lexer get dup skip-blank
292 [ [ index-from dup 1+ swap ] 2keep swapd subseq swap ] change-lexer-column
293 lexer get dup still-parsing-line?
294 [ (parse-token) parse-options ] [ drop f ] if
297 : R! CHAR: ! parse-regexp ; parsing
298 : R" CHAR: " parse-regexp ; parsing
299 : R# CHAR: # parse-regexp ; parsing
300 : R' CHAR: ' parse-regexp ; parsing
301 : R( CHAR: ) parse-regexp ; parsing
302 : R/ CHAR: / parse-regexp ; parsing
303 : R@ CHAR: @ parse-regexp ; parsing
304 : R[ CHAR: ] parse-regexp ; parsing
305 : R` CHAR: ` parse-regexp ; parsing
306 : R{ CHAR: } parse-regexp ; parsing
307 : R| CHAR: | parse-regexp ; parsing
309 : find-regexp-syntax ( string -- prefix suffix )
322 } swap [ subseq? not nip ] curry assoc-find drop ;
327 dup find-regexp-syntax swap % swap % %
328 dup ignore-case?>> [ "i" % ] when