1 ! Copyright (C) 2007 Chris Double.
2 ! See http://factorcode.org/license.txt for BSD license.
3 USING: kernel compiler.units words arrays strings math.parser
4 sequences quotations vectors namespaces make math assocs
5 continuations peg peg.parsers unicode.categories multiline
6 splitting accessors effects sequences.deep peg.search
7 combinators.short-circuit lexer io.streams.string stack-checker
8 io combinators parser ;
: rule ( name word -- parser )
  #! Fetch the entry stored under 'name' in the rule table that
  #! is kept in the "ebnf-parser" property of 'word'. Leaves f
  #! when the table has no entry for 'name'.
  "ebnf-parser" word-prop at* drop ;
! Bundle of callables used while transforming an EBNF grammar:
!   any  - called to build the parser for the any-character element
!   one  - called with a terminal's symbol to build its parser
!   many - NOTE(review): no use of this slot appears nearby; confirm
!          its role against the code that fills it in.
15 TUPLE: tokenizer any one many ;
17 : default-tokenizer ( -- tokenizer )
#! Tokenizer used when no other one has been installed; see
#! 'tokenizer' and 'reset-tokenizer'.
#! The quotation below takes a value and builds any-char wrapped
#! in a 'semantic' test for equality with that value.
21 [ [ = ] curry any-char swap semantic ]
24 : parser-tokenizer ( parser -- tokenizer )
#! Builds a tokenizer tuple from 'parser'. The top of the stack
#! (presumably the parser - TODO confirm) is curried into a
#! quotation that wraps it in a 'semantic' equality test against
#! a given value; that quotation is duplicated, so it fills two
#! of the tuple's slots.
26 [ swap [ = ] curry semantic ] curry dup \ tokenizer boa ;
: rule-tokenizer ( name word -- tokenizer )
  #! Look up rule 'name' in the rule table stored on 'word' and
  #! wrap what is found there in a tokenizer.
  "ebnf-parser" word-prop at parser-tokenizer ;
: tokenizer ( -- word )
  #! Current global tokenizer, falling back to the default one
  #! when the global is unset. Despite the name used in the stack
  #! effect, the value left on the stack is a tokenizer.
  \ tokenizer get-global dup [ drop default-tokenizer ] unless ;
: reset-tokenizer ( -- )
  #! Make the default tokenizer the global one again.
  \ tokenizer default-tokenizer swap set-global ;
#! Reads the next token and looks it up as a word, throwing
#! "Tokenizer not found" when the lookup yields f. The word is
#! then executed and its result is stored as the global tokenizer.
38 scan search [ "Tokenizer not found" throw ] unless*
39 execute \ tokenizer set-global ; parsing
! AST node classes built by the grammar parsers in this file.
! Each class has a matching by-order-of-arguments constructor
! declared with C: and is turned into a parser by a (transform)
! method.
41 TUPLE: ebnf-non-terminal symbol ;
42 TUPLE: ebnf-terminal symbol ;
43 TUPLE: ebnf-foreign word rule ;
44 TUPLE: ebnf-any-character ;
45 TUPLE: ebnf-range pattern ;
46 TUPLE: ebnf-ensure group ;
47 TUPLE: ebnf-ensure-not group ;
48 TUPLE: ebnf-choice options ;
49 TUPLE: ebnf-sequence elements ;
50 TUPLE: ebnf-repeat0 group ;
51 TUPLE: ebnf-repeat1 group ;
52 TUPLE: ebnf-optional group ;
53 TUPLE: ebnf-whitespace group ;
54 TUPLE: ebnf-tokenizer elements ;
55 TUPLE: ebnf-rule symbol elements ;
56 TUPLE: ebnf-action parser code ;
57 TUPLE: ebnf-var parser name ;
58 TUPLE: ebnf-semantic parser code ;
! Constructors for the AST node classes. Each takes the slot
! values in the order the slots are declared in the TUPLE: form.
61 C: <ebnf-non-terminal> ebnf-non-terminal
62 C: <ebnf-terminal> ebnf-terminal
63 C: <ebnf-foreign> ebnf-foreign
64 C: <ebnf-any-character> ebnf-any-character
65 C: <ebnf-range> ebnf-range
66 C: <ebnf-ensure> ebnf-ensure
67 C: <ebnf-ensure-not> ebnf-ensure-not
68 C: <ebnf-choice> ebnf-choice
69 C: <ebnf-sequence> ebnf-sequence
70 C: <ebnf-repeat0> ebnf-repeat0
71 C: <ebnf-repeat1> ebnf-repeat1
72 C: <ebnf-optional> ebnf-optional
73 C: <ebnf-whitespace> ebnf-whitespace
74 C: <ebnf-tokenizer> ebnf-tokenizer
75 C: <ebnf-rule> ebnf-rule
76 C: <ebnf-action> ebnf-action
77 C: <ebnf-var> ebnf-var
78 C: <ebnf-semantic> ebnf-semantic
: filter-hidden ( seq -- seq )
  #! Keep only the elements that contribute to the AST: the
  #! ebnf-ensure and ebnf-ensure-not nodes are dropped, and the
  #! order of the remaining elements is unchanged.
  [ [ ebnf-ensure-not? ] [ ebnf-ensure? ] bi or not ] filter ;
85 : syntax ( string -- parser )
86 #! Builds a parser for 'string' that ignores white space
87 #! and does not put its result in the AST.
: syntax-pack ( begin parser end -- parser )
  #! Wrap 'parser' between the delimiter strings 'begin' and
  #! 'end'. Both delimiters are turned into parsers with
  #! 'syntax' before the three are handed to 'pack'.
  [ syntax ] [ ] [ syntax ] tri* pack ;
95 #! Don't want to use 'replace' in an action since replace doesn't infer.
96 #! Do the compilation of the peg at parse time and call (replace).
#! The alternatives below turn the two-character sequences \t, \n,
#! \r and \\ into the single character each one denotes; anything
#! else falls through to any-char-parser. The choice is repeated
#! zero or more times.
97 PEG: escaper ( string -- ast )
99 "\\t" token [ drop "\t" ] action ,
100 "\\n" token [ drop "\n" ] action ,
101 "\\r" token [ drop "\r" ] action ,
102 "\\\\" token [ drop "\\" ] action ,
103 ] choice* any-char-parser 2array choice repeat0 ;
: replace-escapes ( string -- string )
  #! Run 'string' through the escaper grammar, discard the f
  #! entries of its result and write every remaining entry, in
  #! order, into a fresh string.
  escaper [ ] filter [ [ tree-write ] each ] with-string-writer ;
108 : insert-escapes ( string -- string )
#! Replaces literal tab, newline and carriage-return characters
#! in 'string' with the two-character sequences \t, \n and \r.
#! Only these three characters are rewritten by the rules below.
110 "\t" token [ drop "\\t" ] action ,
111 "\n" token [ drop "\\n" ] action ,
112 "\r" token [ drop "\\r" ] action ,
113 ] choice* replace ;
115 : 'identifier' ( -- parser )
116 #! Return a parser that parses an identifier delimited by
117 #! a quotation character. The quotation can be single
118 #! or double quotes. The AST produced is the identifier
119 #! between the quotes, as a string with its escape
#! sequences replaced by 'replace-escapes'.
121 [ CHAR: " = not ] satisfy repeat1 "\"" "\"" surrounded-by ,
122 [ CHAR: ' = not ] satisfy repeat1 "'" "'" surrounded-by ,
123 ] choice* [ >string replace-escapes ] action ;
125 : 'non-terminal' ( -- parser )
126 #! A non-terminal is the name of another rule. It can
127 #! be any non-blank character except for characters used
128 #! in the EBNF syntax itself.
#! One or more accepted characters are collected into a string
#! and wrapped in an ebnf-non-terminal.
153 ] satisfy repeat1 [ >string <ebnf-non-terminal> ] action ;
: 'terminal' ( -- parser )
  #! A quoted identifier stands for its own literal text; the
  #! AST node is an ebnf-terminal holding that text.
  'identifier' [ ebnf-terminal boa ] action ;
160 : 'foreign-name' ( -- parser )
161 #! Parse a valid foreign parser name. The AST is the run of
#! accepted characters (one or more) as a string.
167 ] satisfy repeat1 [ >string ] action ;
169 : 'foreign' ( -- parser )
170 #! A foreign call is a call to a rule in another ebnf grammar.
#! After "<foreign" comes the name of the foreign word and then an
#! optional rule name; the first two results of the sequence
#! become the 'word' and 'rule' slots of an ebnf-foreign.
172 "<foreign" syntax ,
173 'foreign-name' sp ,
174 'foreign-name' sp optional ,
176 ] seq* [ first2 <ebnf-foreign> ] action ;
: 'any-character' ( -- parser )
  #! Matches the "." symbol, which stands for "any character"
  #! in a grammar, and yields an ebnf-any-character node.
  [ CHAR: . = ] satisfy [ drop ebnf-any-character boa ] action ;
182 : 'range-parser' ( -- parser )
183 #! Match the syntax for declaring character ranges.
#! A range opens with a single "["; a "[" directly followed by
#! another "[" is rejected (presumably so that the "[[" opening an
#! action is not mistaken for a range - confirm). The characters
#! up to the next "]" become the pattern of an ebnf-range.
185 [ "[" syntax , "[" token ensure-not , ] seq* hide ,
186 [ CHAR: ] = not ] satisfy repeat1 ,
188 ] seq* [ first >string <ebnf-range> ] action ;
190 : ('element') ( -- parser )
191 #! An element of a rule. It can be a terminal or a
192 #! non-terminal but must not be followed by a "=".
193 #! The latter indicates that it is the beginning of a
#! Suffix forms of the parser duplicated on the stack:
#!   x*  yields ebnf-repeat0
#!   x+  yields ebnf-repeat1
#!   x?  yields ebnf-optional, unless the "?" begins "?["
204 [ dup , "*" token hide , ] seq* [ first <ebnf-repeat0> ] action ,
205 [ dup , "+" token hide , ] seq* [ first <ebnf-repeat1> ] action ,
206 [ dup , "?[" token ensure-not , "?" token hide , ] seq* [ first <ebnf-optional> ] action ,
210 "=" syntax ensure-not ,
211 "=>" syntax ensure ,
213 ] seq* [ first ] action ;
217 : 'element' ( -- parser )
#! The alternative below parses "element:name", where name is one
#! or more characters matching the a-zA-Z range pattern, and
#! yields an ebnf-var holding the element and the name string.
219 [ ('element') , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 <ebnf-var> ] action ,
225 : grouped ( quot suffix -- parser )
226 #! Parse a group of choices, with a suffix indicating
227 #! the type of group (repeat0, repeat1, etc) and
228 #! a quot that is the action that produces the AST.
#! Two bracket styles are accepted: "( ... )" passes the
#! enclosed choice straight to the quot, while "{ ... }" wraps it
#! in an ebnf-whitespace first.
231 "(" [ 'choice' sp ] delay ")" syntax-pack
233 [ first ] rot compose action ,
234 "{" [ 'choice' sp ] delay "}" syntax-pack
236 [ first <ebnf-whitespace> ] rot compose action ,
239 : 'group' ( -- parser )
240 #! A grouping with no suffix. Used for precedence.
#! The suffix parser consumes nothing: it only checks that none
#! of "*", "+" or "?" follows the group.
242 "*" token sp ensure-not ,
243 "+" token sp ensure-not ,
244 "?" token sp ensure-not ,
245 ] seq* hide grouped ;
: 'repeat0' ( -- parser )
  #! A group followed by "*"; the AST is an ebnf-repeat0.
  [ ebnf-repeat0 boa ] "*" syntax grouped ;
: 'repeat1' ( -- parser )
  #! A group followed by "+"; the AST is an ebnf-repeat1.
  [ ebnf-repeat1 boa ] "+" syntax grouped ;
: 'optional' ( -- parser )
  #! A group followed by "?"; the AST is an ebnf-optional.
  [ ebnf-optional boa ] "?" syntax grouped ;
256 : 'factor-code' ( -- parser )
#! Consumes characters one at a time for as long as the input
#! does not continue with "]]" or "]?", and returns the consumed
#! characters as a string. The terminators are not consumed.
258 "]]" token ensure-not ,
259 "]?" token ensure-not ,
260 [ drop t ] satisfy ,
261 ] seq* [ first ] action repeat0 [ >string ] action ;
263 : 'ensure-not' ( -- parser )
264 #! Parses the '!' syntax to ensure that
265 #! something that matches the following elements does
266 #! not exist in the parse stream.
#! The first result of the sequence becomes the group of an
#! ebnf-ensure-not.
270 ] seq* [ first <ebnf-ensure-not> ] action ;
272 : 'ensure' ( -- parser )
273 #! Parses the '&' syntax to ensure that
274 #! something that matches the following elements does
275 #! exist in the parse stream.
#! The first result of the sequence becomes the group of an
#! ebnf-ensure.
279 ] seq* [ first <ebnf-ensure> ] action ;
281 : ('sequence') ( -- parser )
282 #! A sequence of terminals and non-terminals, including
283 #! groupings of those.
#! The alternative below handles the "x:name" form: the parser
#! on the stack followed by ":" and a name of one or more
#! characters matching a-zA-Z, yielding an ebnf-var.
294 [ dup , ":" syntax , "a-zA-Z" range-pattern repeat1 [ >string ] action , ] seq* [ first2 <ebnf-var> ] action ,
: 'action' ( -- parser )
  #! Factor code enclosed between "[[" and "]]".
  'factor-code' [ "[[" syntax ] dip "]]" syntax pack ;
: 'semantic' ( -- parser )
  #! Factor code enclosed between "?[" and "]?".
  'factor-code' [ "?[" syntax ] dip "]?" syntax pack ;
304 : 'sequence' ( -- parser )
305 #! A sequence of terminals and non-terminals, including
306 #! groupings of those.
#! An item followed by an action block yields an ebnf-action, and
#! one followed by a semantic block yields an ebnf-semantic.
308 [ ('sequence') , 'action' , ] seq* [ first2 <ebnf-action> ] action ,
309 [ ('sequence') , 'semantic' , ] seq* [ first2 <ebnf-semantic> ] action ,
#! One or more items are accepted. A single item is returned as
#! it is; several are wrapped in an ebnf-sequence.
311 ] choice* repeat1 [
312 dup length 1 = [ first ] [ <ebnf-sequence> ] if
315 : 'actioned-sequence' ( -- parser )
#! The alternative below parses "sequence => action" and yields
#! an ebnf-action built from the sequence and the action code.
317 [ 'sequence' , "=>" syntax , 'action' , ] seq* [ first2 <ebnf-action> ] action ,
321 : 'choice' ( -- parser )
#! Alternatives separated by "|". Each alternative is one or more
#! actioned sequences: a single one is kept as it is, several are
#! wrapped in an ebnf-sequence. In the same way a single
#! alternative is kept as it is, and several are wrapped in an
#! ebnf-choice.
322 'actioned-sequence' sp repeat1 [ dup length 1 = [ first ] [ <ebnf-sequence> ] if ] action "|" token sp list-of [
323 dup length 1 = [ first ] [ <ebnf-choice> ] if
326 : 'tokenizer' ( -- parser )
#! Parses a tokenizer declaration inside a grammar. Its body is
#! either the word "default" or a choice expression, and the
#! first result of the sequence becomes an ebnf-tokenizer.
328 "tokenizer" syntax ,
330 ">" token ensure-not ,
331 [ "default" token sp , 'choice' , ] choice* ,
332 ] seq* [ first <ebnf-tokenizer> ] action ;
334 : 'rule' ( -- parser )
#! Parses one rule. The input must not start with "tokenizer";
#! the rule's name is parsed as a non-terminal and reduced to its
#! symbol string. The first two results of the sequence become
#! the symbol and elements of an ebnf-rule.
336 "tokenizer" token ensure-not ,
337 'non-terminal' [ symbol>> ] action ,
339 ">" token ensure-not ,
341 ] seq* [ first2 <ebnf-rule> ] action ;
: 'ebnf' ( -- parser )
  #! A grammar is one or more items, each either a tokenizer
  #! declaration or a rule (tried in that order). The collected
  #! items are handed to <ebnf>.
  'tokenizer' sp 'rule' sp 2array choice repeat1 [ <ebnf> ] action ;
! Converts an EBNF AST node into a parser. There is one method
! per AST node class.
346 GENERIC: (transform) ( ast -- parser )
352 : transform ( ast -- object )
#! A fresh, empty hashtable is created on every call and three
#! references to it are left on the stack for the code that
#! follows.
353 H{ } clone dup dup [
M: ebnf (transform) ( ast -- parser )
  #! Transform every rule in order and answer the parser that
  #! was produced for the last one.
  rules>> [ (transform) ] map dup length 1 - swap nth ;
363 M: ebnf-tokenizer (transform) ( ast -- parser )
#! When the declaration is "default" the default tokenizer is
#! installed as the global tokenizer and any-char is the result.
#! Otherwise a tokenizer built with parser-tokenizer is installed
#! as the global tokenizer.
364 elements>> dup "default" = [
365 drop default-tokenizer \ tokenizer set-global any-char
368 dup parser-tokenizer \ tokenizer set-global
371 M: ebnf-rule (transform) ( ast -- parser )
#! Looks the rule's symbol up as a variable; if a parser is
#! already bound to it, the rule has been defined twice and an
#! error string naming the rule is thrown.
374 swap symbol>> dup get parser? [
375 "Rule '" over append "' defined more than once" append throw
381 M: ebnf-sequence (transform) ( ast -- parser )
382 #! If ignore-ws is set then each element of the sequence
383 #! ignores leading whitespace. This is not inherited by
384 #! subelements of the sequence.
#! ignore-ws is bound to f while the element itself is
#! transformed; the outer value is then consulted to decide
#! whether the resulting parser is wrapped with sp.
386 f ignore-ws [ (transform) ] with-variable
387 ignore-ws get [ sp ] when
#! A result with exactly one element is unwrapped to that element.
388 ] map seq [ dup length 1 = [ first ] when ] action ;
390 M: ebnf-choice (transform) ( ast -- parser )
#! Transform every option in order and combine the resulting
#! parsers with 'choice'.
391 options>> [ (transform) ] map choice ;
393 M: ebnf-any-character (transform) ( ast -- parser )
#! The node carries no data: the parser comes from calling the
#! current tokenizer's 'any' slot.
394 drop tokenizer any>> call ;
396 M: ebnf-range (transform) ( ast -- parser )
#! Hand the node's pattern string to 'range-pattern'.
397 pattern>> range-pattern ;
399 : transform-group ( ast -- parser )
400 #! Convert an AST node that has a 'group' slot into the
#! parser for that group.
401 group>> (transform) ;
M: ebnf-ensure (transform) ( ast -- parser )
  #! Transform the node's group and wrap the result with 'ensure'.
  group>> (transform) ensure ;
M: ebnf-ensure-not (transform) ( ast -- parser )
  #! Transform the node's group and wrap the result with
  #! 'ensure-not'.
  group>> (transform) ensure-not ;
M: ebnf-repeat0 (transform) ( ast -- parser )
  #! Transform the node's group and wrap the result with 'repeat0'.
  group>> (transform) repeat0 ;
M: ebnf-repeat1 (transform) ( ast -- parser )
  #! Transform the node's group and wrap the result with 'repeat1'.
  group>> (transform) repeat1 ;
M: ebnf-optional (transform) ( ast -- parser )
  #! Transform the node's group and wrap the result with
  #! 'optional'.
  group>> (transform) optional ;
M: ebnf-whitespace (transform) ( ast -- parser )
  #! Transform the node's group with ignore-ws bound to t for
  #! the duration of that transformation.
  t ignore-ws [ group>> (transform) ] with-variable ;
! Given the source text of an action or semantic block and the AST
! of the parser it is attached to, produces the code text to
! compile. The methods emit [let* forms for variables named in
! the grammar.
421 GENERIC: build-locals ( code ast -- code )
423 M: ebnf-sequence build-locals ( code ast -- code )
424 #! Note the need to filter out these ebnf items that
425 #! leave nothing in the AST
#! After filtering, a sequence with a single element delegates
#! to that element's build-locals method.
426 elements>> filter-hidden dup length 1 = [
427 first build-locals
#! The check below tests whether the sequence contains no
#! ebnf-var elements at all.
429 dup [ ebnf-var? ] filter empty? [
433 "USING: locals sequences ; [let* | " %
#! Emits text that picks an element out of the sequence by its
#! index with 'over nth'.
437 " [ " % # " over nth ] " %
449 M: ebnf-var build-locals ( code ast -- )
#! Emits the start of a [let* form whose binding is the
#! variable's name, initialised with '[ dup ]'.
#! NOTE(review): the effect written here is ( code ast -- ) while
#! the generic declares ( code ast -- code ); confirm which one
#! is intended.
451 "USING: locals kernel ; [let* | " %
452 name>> % " [ dup ] " %
#! Fallback method for AST nodes that have no more specific
#! build-locals method.
#! NOTE(review): the effect written here is ( code ast -- ) while
#! the generic declares ( code ast -- code ); confirm which one
#! is intended.
458 M: object build-locals ( code ast -- )
! Error carrying a quotation and an effect.
! NOTE(review): presumably raised by check-action-effect for a
! quotation whose effect it does not accept - confirm at the
! throwing site.
461 ERROR: bad-effect quot effect ;
463 : check-action-effect ( quot -- quot )
#! Adapts an action quotation according to the effect under test:
#!   fits ( a -- b ) : the quotation is used unchanged
#!   fits ( -- b )   : [ drop ] is prepended, so the quotation
#!                     discards the value it is given
465 { [ dup (( a -- b )) effect<= ] [ drop ] }
466 { [ dup (( -- b )) effect<= ] [ drop [ drop ] prepose ] }
M: ebnf-action (transform) ( ast -- parser )
  #! The wrapped parser is transformed first. The action's code
  #! has its escapes inserted, is extended by build-locals for
  #! the wrapped parser's AST, parsed into a quotation and checked
  #! by check-action-effect before being attached with 'action'.
  [ parser>> (transform) ] keep
  [ code>> insert-escapes ] [ parser>> ] bi build-locals
  string-lines parse-lines check-action-effect action ;
M: ebnf-semantic (transform) ( ast -- parser )
  #! The wrapped parser is transformed first. The block's code
  #! has its escapes inserted, is extended by build-locals for
  #! the wrapped parser's AST and parsed into a quotation, which
  #! is attached with 'semantic'.
  [ parser>> (transform) ] keep
  [ code>> insert-escapes ] [ parser>> ] bi build-locals
  string-lines parse-lines semantic ;
478 M: ebnf-var (transform) ( ast -- parser )
#! The variable name plays no part here: the result is the
#! transformation of the wrapped parser.
479 parser>> (transform) ;
481 M: ebnf-terminal (transform) ( ast -- parser )
#! Pass the terminal's symbol to the current tokenizer's 'one'
#! slot, which is called to build the parser.
482 symbol>> tokenizer one>> call ;
484 M: ebnf-foreign (transform) ( ast -- parser )
#! Throws "Foreign word '<name>' not found" when the lookup of
#! the foreign word yields f. When the node has no rule name,
#! 'main' is used for the rule lookup instead.
486 [ "Foreign word '" swap word>> append "' not found" append throw ] unless*
487 swap rule>> [ main ] unless* over rule [
493 : parser-not-found ( name -- * )
#! Builds the message "Parser '<name>' not found." The '*' in
#! the stack effect marks this word as one that does not return.
495 "Parser '" % % "' not found." %
498 M: ebnf-non-terminal (transform) ( ast -- parser )
#! Builds code rather than resolving the name now. The value of
#! the 'parser' variable at transformation time is embedded in
#! that code, which looks the name up with 'at' and calls
#! parser-not-found when the lookup yields f.
500 , \ dup , parser get , \ at , [ parser-not-found ] , \ unless* , \ nip ,
503 : transform-ebnf ( string -- object )
#! Parse 'string' with the 'ebnf' grammar parser and run
#! 'transform' on the result.
504 'ebnf' parse transform ;
506 : check-parse-result ( result -- result )
#! Validates the outcome of parsing a grammar. The remaining
#! input is trimmed of blanks and tested for emptiness; two
#! error messages are built here, one reporting the input left
#! to parse and one reporting that the grammar could not be
#! parsed.
508 dup remaining>> [ blank? ] trim empty? [
510 "Unable to fully parse EBNF. Left to parse was: " %
515 "Could not parse EBNF" throw
518 : parse-ebnf ( string -- hashtable )
#! Parse 'string' with the 'ebnf' grammar parser, validate the
#! parse result with check-parse-result, then run 'transform' on
#! its AST.
519 'ebnf' (parse) check-parse-result ast>> transform ;
521 : ebnf>quot ( string -- hashtable quot )
#! Parses the grammar in 'string' and returns the resulting table
#! together with a quotation for running it. With the 'parser'
#! variable bound to the table, the entry stored under 'main' is
#! compiled. The returned quotation runs the compiled result via
#! compiled-parse inside with-scope and leaves the AST of the
#! parse result.
522 parse-ebnf dup dup parser [ main swap at compile ] with-variable
523 [ compiled-parse ] curry [ with-scope ast>> ] curry ;
! Parsing word: reads the text up to "EBNF>", parses it as a
! grammar and adds the entry stored under 'main' to the code being
! parsed. The global tokenizer is reset to the default both before
! and after the grammar is processed.
525 : <EBNF "EBNF>" reset-tokenizer parse-multiline-string parse-ebnf main swap at
526 parsed reset-tokenizer ; parsing
! Parsing word: reads the text up to "EBNF]", turns it into a
! quotation with ebnf>quot (the table is dropped) and adds that
! quotation followed by 'call' to the code being parsed. The
! global tokenizer is reset to the default both before and after
! the grammar is processed.
528 : [EBNF "EBNF]" reset-tokenizer parse-multiline-string ebnf>quot nip
529 parsed \ call parsed reset-tokenizer ; parsing
#! Creates a new word, reads the text up to ";EBNF" and turns it
#! into a table and a quotation with ebnf>quot. The word is
#! defined as that quotation with a one-in, one-out effect, and
#! the table is stored in the word's "ebnf-parser" property. The
#! global tokenizer is reset to the default both before and after.
532 reset-tokenizer CREATE-WORD dup ";EBNF" parse-multiline-string
533 ebnf>quot swapd 1 1 <effect> define-declared "ebnf-parser" set-word-prop
534 reset-tokenizer ; parsing